Repository: wang-xinyu/tensorrtx
Branch: master
Commit: 2990f34a8502
Files: 744
Total size: 5.6 MB

Directory structure:
gitextract_0y61g4fh/

├── .clang-format
├── .cmake-format.yaml
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   └── tensorrtx-issue-template.md
│   ├── stale.yml
│   └── workflows/
│       └── pre-commit.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── alexnet/
│   ├── CMakeLists.txt
│   ├── FindTensorRT.cmake
│   ├── README.md
│   ├── alexnet.cc
│   ├── alexnet.py
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   └── utils.h
├── arcface/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── arcface-mobilefacenet.cpp
│   ├── arcface-r100.cpp
│   ├── arcface-r50.cpp
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   ├── prelu.cu
│   └── prelu.h
├── assets/
│   └── 6.pgm
├── centernet/
│   ├── README.md
│   ├── centernet.py
│   ├── dcnv2Plugin/
│   │   ├── CMakeLists.txt
│   │   ├── dcn_v2_im2col_cuda.cu
│   │   ├── dcn_v2_im2col_cuda.h
│   │   ├── dcnv2Plugin.cpp
│   │   └── dcnv2Plugin.h
│   └── sample/
│       ├── common.py
│       └── test.py
├── contributing.md
├── convnextv2/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── config.yaml
│   ├── gen_wts.py
│   ├── inference.py
│   └── src/
│       ├── LayerNormPlugin.cu
│       ├── LayerNormPlugin.h
│       ├── convnextv2.cpp
│       ├── inference_cpp.cpp
│       └── logging.h
├── crnn/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── crnn.cpp
│   ├── genwts.py
│   └── logging.h
├── csrnet/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── config.h
│   ├── csrnet.cpp
│   ├── gen_wts.py
│   ├── logging.h
│   └── macros.h
├── dbnet/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── clipper/
│   │   ├── CMakeLists.txt
│   │   ├── clipper.cpp
│   │   └── clipper.hpp
│   ├── common.hpp
│   ├── dbnet.cpp
│   ├── logging.h
│   └── utils.h
├── densenet/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── densenet121.cpp
│   ├── densenet121.py
│   └── logging.h
├── detr/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── backbone.hpp
│   ├── calibrator.hpp
│   ├── common.hpp
│   ├── detr.cpp
│   ├── gen_wts.py
│   ├── logging.h
│   └── macros.h
├── docker/
│   ├── README.md
│   ├── tensorrtx-docker-compose.yml
│   └── x86_64.dockerfile
├── efficient_ad/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── efficientAD_det.cpp
│   └── src/
│       ├── config.h
│       ├── cuda_utils.h
│       ├── logging.h
│       ├── macros.h
│       ├── model.cpp
│       ├── model.h
│       ├── postprocess.h
│       └── utils.h
├── efficientnet/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── efficientnet.cpp
│   ├── gen_wts.py
│   ├── logging.h
│   └── utils.hpp
├── ghostnet/
│   ├── README.md
│   ├── ghostnetv1/
│   │   ├── CMakeLists.txt
│   │   ├── gen_wts.py
│   │   ├── ghostnetv1.cpp
│   │   └── logging.h
│   └── ghostnetv2/
│       ├── CMakeLists.txt
│       ├── gen_wts.py
│       ├── ghostnetv2.cpp
│       └── logging.h
├── googlenet/
│   ├── CMakeLists.txt
│   ├── FindTensorRT.cmake
│   ├── README.md
│   ├── gen_wts.py
│   ├── googlenet.cpp
│   ├── logging.h
│   ├── macros.h
│   └── utils.h
├── hrnet/
│   ├── hrnet-image-classification/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── common.hpp
│   │   ├── demo.py
│   │   ├── hrnet.cpp
│   │   └── logging.h
│   └── hrnet-semantic-segmentation/
│       ├── CMakeLists.txt
│       ├── README.md
│       ├── common.hpp
│       ├── gen_wts.py
│       ├── hrnet.cpp
│       ├── hrnet_ocr.cpp
│       ├── hrnet_trt.py
│       └── logging.h
├── ibnnet/
│   ├── CMakeLists.txt
│   ├── InferenceEngine.cpp
│   ├── InferenceEngine.h
│   ├── README.md
│   ├── gen_wts.py
│   ├── holder.h
│   ├── ibnnet.cpp
│   ├── ibnnet.h
│   ├── layers.cpp
│   ├── layers.h
│   ├── logging.h
│   ├── main.cpp
│   ├── utils.cpp
│   └── utils.h
├── inception/
│   ├── inceptionv3/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── inception_v3.cpp
│   │   └── logging.h
│   └── inceptionv4/
│       ├── CMakeLists.txt
│       ├── README.md
│       ├── inception_v4.cpp
│       ├── inception_v4.h
│       ├── layers_api.cpp
│       ├── layers_api.h
│       ├── logging.h
│       ├── main.cpp
│       ├── utils.cpp
│       └── utils.h
├── lenet/
│   ├── CMakeLists.txt
│   ├── FindTensorRT.cmake
│   ├── README.md
│   ├── gen_wts.py
│   ├── lenet.cpp
│   ├── lenet.py
│   ├── lenet_tripy.py
│   ├── logging.h
│   ├── macros.h
│   └── utils.h
├── lprnet/
│   ├── CMakeLists.txt
│   ├── FindTensorRT.cmake
│   ├── README.md
│   ├── gen_wts.py
│   ├── logging.h
│   ├── lprnet.cpp
│   ├── macros.h
│   └── utils.h
├── mlp/
│   ├── CMakeLists.txt
│   ├── FindTensorRT.cmake
│   ├── README.md
│   ├── logging.h
│   ├── macros.h
│   ├── mlp.cpp
│   ├── mlp.py
│   └── utils.h
├── mnasnet/
│   ├── CMakeLists.txt
│   ├── FindTensorRT.cmake
│   ├── README.md
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   ├── mnasnet.cpp
│   └── utils.h
├── mobilenet/
│   ├── mobilenetv2/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── logging.h
│   │   ├── mobilenet_v2.cpp
│   │   └── mobilenet_v2.py
│   └── mobilenetv3/
│       ├── CMakeLists.txt
│       ├── README.md
│       ├── logging.h
│       ├── mobilenet_v3.cpp
│       └── mobilenet_v3.py
├── psenet/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_tf_wts.py
│   ├── layers.cpp
│   ├── layers.h
│   ├── main.cpp
│   ├── psenet.cpp
│   ├── psenet.h
│   ├── utils.cpp
│   └── utils.h
├── rcnn/
│   ├── BatchedNms.cu
│   ├── BatchedNmsPlugin.h
│   ├── CMakeLists.txt
│   ├── MaskRcnnInference.cu
│   ├── MaskRcnnInferencePlugin.h
│   ├── PredictorDecode.cu
│   ├── PredictorDecodePlugin.h
│   ├── README.md
│   ├── RoiAlign.cu
│   ├── RoiAlignPlugin.h
│   ├── RpnDecode.cu
│   ├── RpnDecodePlugin.h
│   ├── RpnNms.cu
│   ├── RpnNmsPlugin.h
│   ├── backbone.hpp
│   ├── calibrator.hpp
│   ├── common.hpp
│   ├── cuda_utils.h
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   └── rcnn.cpp
├── real-esrgan/
│   ├── general-x4v3/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── cmake/
│   │   │   └── FindTensorRT.cmake
│   │   ├── gen_wts.py
│   │   ├── main.cpp
│   │   └── src/
│   │       ├── include/
│   │       │   ├── config/
│   │       │   │   └── config.hpp
│   │       │   ├── cuda_utils.h
│   │       │   ├── logging/
│   │       │   │   └── logging.h
│   │       │   ├── pixel_shuffle/
│   │       │   │   └── pixel_shuffle.hpp
│   │       │   └── preprocess/
│   │       │       └── preprocess.hpp
│   │       └── pixel_shuffle/
│   │           ├── pixel_shuffle.cpp
│   │           └── pixel_shuffle.cu
│   └── x4plus/
│       ├── CMakeLists.txt
│       ├── README.md
│       ├── common.hpp
│       ├── cuda_utils.h
│       ├── gen_wts.py
│       ├── logging.h
│       ├── macros.h
│       ├── postprocess.cu
│       ├── postprocess.hpp
│       ├── preprocess.cu
│       ├── preprocess.hpp
│       ├── real-esrgan.cpp
│       └── utils.h
├── refinedet/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── calibrator.cpp
│   ├── calibrator.h
│   ├── configure.h
│   ├── gen_wts_refinedet.py
│   ├── logging.h
│   ├── refinedet.cpp
│   └── utils.h
├── repvgg/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── logging.h
│   └── repvgg.cpp
├── resnet/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── logging.h
│   ├── resnet18.cpp
│   ├── resnet34.cpp
│   ├── resnet50.cpp
│   ├── resnet50.py
│   ├── resnext50_32x4d.cpp
│   ├── wide_resnet50.py
│   └── wideresnet50.cpp
├── retinaface/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── calibrator.cpp
│   ├── calibrator.h
│   ├── common.hpp
│   ├── decode.cu
│   ├── decode.h
│   ├── logging.h
│   ├── macros.h
│   ├── retina_mnet.cpp
│   ├── retina_r50.cpp
│   └── retinaface_trt.py
├── retinafaceAntiCov/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── decode.cu
│   ├── decode.h
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   └── retinafaceAntiCov.cpp
├── scaled-yolov4/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── common.hpp
│   ├── gen_wts.py
│   ├── logging.h
│   ├── mish.cu
│   ├── mish.h
│   ├── utils.h
│   ├── yololayer.cu
│   ├── yololayer.h
│   └── yolov4_csp.cpp
├── senet/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── logging.h
│   └── se_resnet50.cpp
├── shufflenetv2/
│   ├── CMakeLists.txt
│   ├── FindTensorRT.cmake
│   ├── README.md
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   ├── shufflenetv2.cpp
│   └── utils.h
├── squeezenet/
│   ├── CMakeLists.txt
│   ├── FindTensorRT.cmake
│   ├── README.md
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   ├── squeezenet.cpp
│   └── utils.h
├── superpoint/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── logging.h
│   ├── supernet.cpp
│   ├── utils.cpp
│   └── utils.h
├── swin-transformer/
│   └── semantic-segmentation/
│       ├── CMakeLists.txt
│       ├── README.md
│       ├── UpsampleKernel.cu
│       ├── UpsamplePlugin.cpp
│       ├── UpsamplePlugin.h
│       ├── UpsmapleKernel.h
│       ├── common.hpp
│       ├── fillmask.cu
│       ├── fillmask.h
│       ├── gelu.cu
│       ├── gelu.h
│       ├── gen_wts.py
│       ├── include/
│       │   └── dirent.h
│       ├── layerNorm.cu
│       ├── layerNorm.h
│       ├── logging.h
│       ├── main.cpp
│       ├── myhpp.h
│       ├── trainsform.cpp
│       └── utilsn.h
├── tsm/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── demo.sh
│   ├── gen_wts.py
│   ├── logging.h
│   ├── mmaction2_tsm_r50_config.py
│   ├── test_shift.py
│   ├── tsm_r50.cpp
│   └── tsm_r50.py
├── tutorials/
│   ├── check_fp16_int8_support.md
│   ├── faq.md
│   ├── from_pytorch_to_trt_stepbystep_hrnet.md
│   ├── getting_started.md
│   ├── install.md
│   ├── measure_performance.md
│   ├── migration_guide.md
│   ├── multi_GPU_processing.md
│   └── run_on_windows.md
├── ufld/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── common.hpp
│   ├── gen_wts.py
│   ├── lane_det.cpp
│   ├── logging.h
│   ├── macros.h
│   └── pth2onnx.py
├── unet/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── common.hpp
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   └── unet.cpp
├── vgg/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── logging.h
│   └── vgg11.cpp
├── vit/
│   ├── CMakeLists.txt
│   ├── FindTensorRT.cmake
│   ├── README.md
│   ├── cuda_allocator.cc
│   ├── cuda_allocator.h
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   ├── profiler.cc
│   ├── profiler.h
│   ├── utils.h
│   └── vit.cc
├── yolo11/
│   ├── CMakeLists.txt
│   ├── gen_wts.py
│   ├── include/
│   │   ├── block.h
│   │   ├── calibrator.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.h
│   │   ├── postprocess.h
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── plugin/
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── readme.md
│   ├── src/
│   │   ├── block.cpp
│   │   ├── calibrator.cpp
│   │   ├── model.cpp
│   │   ├── postprocess.cpp
│   │   ├── postprocess.cu
│   │   └── preprocess.cu
│   ├── yolo11_cls.cpp
│   ├── yolo11_cls_trt.py
│   ├── yolo11_det.cpp
│   ├── yolo11_det_trt.py
│   ├── yolo11_obb.cpp
│   ├── yolo11_obb_trt.py
│   ├── yolo11_pose.cpp
│   ├── yolo11_pose_trt.py
│   ├── yolo11_seg.cpp
│   └── yolo11_seg_trt.py
├── yolo11_tripy/
│   ├── .gitignore
│   ├── README.md
│   ├── classify.py
│   ├── compile_classifier.py
│   ├── constants.py
│   ├── model/
│   │   ├── block.py
│   │   └── model.py
│   └── requirements.txt
├── yolo26/
│   ├── .clang-format
│   ├── .gitignore
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── include/
│   │   ├── block.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.h
│   │   ├── postprocess.h
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── plugin/
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── src/
│   │   ├── block.cpp
│   │   ├── model.cpp
│   │   ├── postprocess.cpp
│   │   └── preprocess.cu
│   ├── yolo26_cls.cpp
│   ├── yolo26_det.cpp
│   └── yolo26_obb.cpp
├── yolop/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── common.hpp
│   ├── cuda_utils.h
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   ├── utils.h
│   ├── yololayer.cu
│   ├── yololayer.h
│   ├── yolop.cpp
│   ├── yolop.hpp
│   └── yolop_trt.py
├── yolov10/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── include/
│   │   ├── block.h
│   │   ├── calibrator.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.h
│   │   ├── postprocess.h
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── plugin/
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── src/
│   │   ├── block.cpp
│   │   ├── calibrator.cpp
│   │   ├── model.cpp
│   │   ├── postprocess.cpp
│   │   └── preprocess.cu
│   ├── yolov10_det.cpp
│   └── yolov10_det_trt.py
├── yolov12/
│   ├── CMakeLists.txt
│   ├── gen_wts.py
│   ├── include/
│   │   ├── block.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.h
│   │   ├── postprocess.h
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── plugin/
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── readme.md
│   ├── src/
│   │   ├── block.cpp
│   │   ├── model.cpp
│   │   ├── postprocess.cpp
│   │   ├── postprocess.cu
│   │   └── preprocess.cu
│   └── yolo12_det.cpp
├── yolov12-tubro/
│   ├── CMakeLists.txt
│   ├── gen_wts.py
│   ├── include/
│   │   ├── block.h
│   │   ├── calibrator.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.h
│   │   ├── postprocess.h
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── plugin/
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── readme.md
│   ├── src/
│   │   ├── block.cpp
│   │   ├── calibrator.cpp
│   │   ├── model.cpp
│   │   ├── postprocess.cpp
│   │   ├── postprocess.cu
│   │   └── preprocess.cu
│   ├── yolov12_cls.cpp
│   ├── yolov12_cls_trt.py
│   ├── yolov12_det.cpp
│   ├── yolov12_det_trt.py
│   ├── yolov12_seg.cpp
│   └── yolov12_seg_trt.py
├── yolov13/
│   ├── CMakeLists.txt
│   ├── gen_wts.py
│   ├── include/
│   │   ├── block.h
│   │   ├── calibrator.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.h
│   │   ├── postprocess.h
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── plugin/
│   │   ├── geluKernel.cu
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── readme.md
│   ├── src/
│   │   ├── block.cpp
│   │   ├── calibrator.cpp
│   │   ├── model.cpp
│   │   ├── postprocess.cpp
│   │   ├── postprocess.cu
│   │   └── preprocess.cu
│   ├── yolov13_det.cpp
│   └── yolov13_det_trt.py
├── yolov3/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── calibrator.cpp
│   ├── calibrator.h
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   ├── utils.h
│   ├── yololayer.cu
│   ├── yololayer.h
│   ├── yolov3.cpp
│   └── yolov3_trt.py
├── yolov3-spp/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── Utils.h
│   ├── gen_wts.py
│   ├── logging.h
│   ├── yololayer.cu
│   ├── yololayer.h
│   └── yolov3-spp.cpp
├── yolov3-tiny/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── logging.h
│   ├── macros.h
│   ├── utils.h
│   ├── yololayer.cu
│   ├── yololayer.h
│   └── yolov3-tiny.cpp
├── yolov4/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── logging.h
│   ├── mish.cu
│   ├── mish.h
│   ├── utils.h
│   ├── yololayer.cu
│   ├── yololayer.h
│   └── yolov4.cpp
├── yolov5/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── plugin/
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── src/
│   │   ├── calibrator.cpp
│   │   ├── calibrator.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.cpp
│   │   ├── model.h
│   │   ├── postprocess.cpp
│   │   ├── postprocess.h
│   │   ├── preprocess.cu
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── yolov5_cls.cpp
│   ├── yolov5_cls_trt.py
│   ├── yolov5_det.cpp
│   ├── yolov5_det_cuda_python.py
│   ├── yolov5_det_trt.py
│   ├── yolov5_seg.cpp
│   └── yolov5_seg_trt.py
├── yolov5-lite/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── calibrator.cpp
│   ├── common.hpp
│   ├── gen_wts.py
│   ├── v5lite.cpp
│   ├── yololayer.cu
│   └── yolov5-lite-trt.py
├── yolov7/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── include/
│   │   ├── block.h
│   │   ├── calibrator.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.h
│   │   ├── postprocess.h
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── main.cpp
│   ├── plugin/
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── src/
│   │   ├── block.cpp
│   │   ├── calibrator.cpp
│   │   ├── model.cpp
│   │   ├── postprocess.cpp
│   │   └── preprocess.cu
│   └── yolov7_trt.py
├── yolov8/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── include/
│   │   ├── block.h
│   │   ├── calibrator.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.h
│   │   ├── postprocess.h
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── plugin/
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── src/
│   │   ├── block.cpp
│   │   ├── calibrator.cpp
│   │   ├── model.cpp
│   │   ├── postprocess.cpp
│   │   ├── postprocess.cu
│   │   └── preprocess.cu
│   ├── yolov8_5u_det.cpp
│   ├── yolov8_5u_det_trt.py
│   ├── yolov8_cls.cpp
│   ├── yolov8_cls_trt.py
│   ├── yolov8_det.cpp
│   ├── yolov8_det_trt.py
│   ├── yolov8_obb.cpp
│   ├── yolov8_obb_trt.py
│   ├── yolov8_pose.cpp
│   ├── yolov8_pose_trt.py
│   ├── yolov8_seg.cpp
│   └── yolov8_seg_trt.py
└── yolov9/
    ├── CMakeLists.txt
    ├── README.md
    ├── demo.cpp
    ├── gen_wts.py
    ├── include/
    │   ├── block.h
    │   ├── calibrator.h
    │   ├── config.h
    │   ├── cuda_utils.h
    │   ├── logging.h
    │   ├── macros.h
    │   ├── model.h
    │   ├── postprocess.h
    │   ├── preprocess.h
    │   ├── types.h
    │   └── utils.h
    ├── plugin/
    │   ├── yololayer.cu
    │   └── yololayer.h
    ├── src/
    │   ├── block.cpp
    │   ├── calibrator.cpp
    │   ├── model.cpp
    │   ├── postprocess.cpp
    │   ├── postprocess.cu
    │   └── preprocess.cu
    ├── windows/
    │   └── dirent.h
    └── yolov9_trt.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .clang-format
================================================
# Google C/C++ Code Style settings (with 4-space)
# Referred to https://github.com/kehanXue/google-style-clang-format/blob/master/.clang-format

Language: Cpp
BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: None
AlignOperands: Align
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: Empty
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never  # To avoid conflicts, set this to "Never"; every "if" statement should use braces
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BreakBeforeBraces: Custom
BraceWrapping:
  AfterCaseLabel: false
  AfterClass: false
  AfterStruct: false
  AfterControlStatement: Never
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeColon
BreakInheritanceList: BeforeColon
ColumnLimit: 120
CompactNamespaces: false
ContinuationIndentWidth: 8
Cpp11BracedListStyle: true
DerivePointerAlignment: false  # Make sure the * or & align on the left
EmptyLineBeforeAccessModifier: LogicalBlock
FixNamespaceComments: true
IncludeBlocks: Preserve
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
KeepEmptyLinesAtTheStartOfBlocks: true
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PointerAlignment: Left
ReflowComments: false
# SeparateDefinitionBlocks: Always   # Only supported since clang-format 14
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeSquareBrackets: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInContainerLiterals: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: c++11
TabWidth: 8
UseTab: Never


================================================
FILE: .cmake-format.yaml
================================================
_help_parse: Options affecting listfile parsing
parse:
  _help_additional_commands:
  - Specify structure for custom cmake functions
  additional_commands:
    foo:
      flags:
      - BAR
      - BAZ
      kwargs:
        HEADERS: '*'
        SOURCES: '*'
        DEPENDS: '*'
  _help_override_spec:
  - Override configurations per-command where available
  override_spec: {}
  _help_vartags:
  - Specify variable tags.
  vartags: []
  _help_proptags:
  - Specify property tags.
  proptags: []
_help_format: Options affecting formatting.
format:
  _help_disable:
  - Disable formatting entirely, making cmake-format a no-op
  disable: false
  _help_line_width:
  - How wide to allow formatted cmake files
  line_width: 80
  _help_tab_size:
  - How many spaces to tab for indent
  tab_size: 2
  _help_use_tabchars:
  - If true, lines are indented using tab characters (utf-8
  - 0x09) instead of <tab_size> space characters (utf-8 0x20).
  - In cases where the layout would require a fractional tab
  - character, the behavior of the  fractional indentation is
  - governed by <fractional_tab_policy>
  use_tabchars: false
  _help_fractional_tab_policy:
  - If <use_tabchars> is True, then the value of this variable
  - indicates how fractional indentions are handled during
  - whitespace replacement. If set to 'use-space', fractional
  - indentation is left as spaces (utf-8 0x20). If set to
  - '`round-up` fractional indentation is replaced with a single'
  - tab character (utf-8 0x09) effectively shifting the column
  - to the next tabstop
  fractional_tab_policy: use-space
  _help_max_subgroups_hwrap:
  - If an argument group contains more than this many sub-groups
  - (parg or kwarg groups) then force it to a vertical layout.
  max_subgroups_hwrap: 2
  _help_max_pargs_hwrap:
  - If a positional argument group contains more than this many
  - arguments, then force it to a vertical layout.
  max_pargs_hwrap: 6
  _help_max_rows_cmdline:
  - If a cmdline positional group consumes more than this many
  - lines without nesting, then invalidate the layout (and nest)
  max_rows_cmdline: 2
  _help_separate_ctrl_name_with_space:
  - If true, separate flow control names from their parentheses
  - with a space
  separate_ctrl_name_with_space: false
  _help_separate_fn_name_with_space:
  - If true, separate function names from parentheses with a
  - space
  separate_fn_name_with_space: false
  _help_dangle_parens:
  - If a statement is wrapped to more than one line, then dangle
  - the closing parenthesis on its own line.
  dangle_parens: false
  _help_dangle_align:
  - If the trailing parenthesis must be 'dangled' on its own
  - 'line, then align it to this reference: `prefix`: the start'
  - 'of the statement,  `prefix-indent`: the start of the'
  - 'statement, plus one indentation  level, `child`: align to'
  - the column of the arguments
  dangle_align: prefix
  _help_min_prefix_chars:
  - If the statement spelling length (including space and
  - parenthesis) is smaller than this amount, then force reject
  - nested layouts.
  min_prefix_chars: 4
  _help_max_prefix_chars:
  - If the statement spelling length (including space and
  - parenthesis) is larger than the tab width by more than this
  - amount, then force reject un-nested layouts.
  max_prefix_chars: 10
  _help_max_lines_hwrap:
  - If a candidate layout is wrapped horizontally but it exceeds
  - this many lines, then reject the layout.
  max_lines_hwrap: 2
  _help_line_ending:
  - What style line endings to use in the output.
  line_ending: unix
  _help_command_case:
  - Format command names consistently as 'lower' or 'upper' case
  command_case: canonical
  _help_keyword_case:
  - Format keywords consistently as 'lower' or 'upper' case
  keyword_case: unchanged
  _help_always_wrap:
  - A list of command names which should always be wrapped
  always_wrap: []
  _help_enable_sort:
  - If true, the argument lists which are known to be sortable
  - will be sorted lexicographically
  enable_sort: true
  _help_autosort:
  - If true, the parsers may infer whether or not an argument
  - list is sortable (without annotation).
  autosort: false
  _help_require_valid_layout:
  - By default, if cmake-format cannot successfully fit
  - everything into the desired linewidth it will apply the
  - last, most aggressive attempt that it made. If this flag is
  - True, however, cmake-format will print error, exit with non-
  - zero status code, and write-out nothing
  require_valid_layout: false
  _help_layout_passes:
  - A dictionary mapping layout nodes to a list of wrap
  - decisions. See the documentation for more information.
  layout_passes: {}
_help_markup: Options affecting comment reflow and formatting.
markup:
  _help_bullet_char:
  - What character to use for bulleted lists
  bullet_char: '*'
  _help_enum_char:
  - What character to use as punctuation after numerals in an
  - enumerated list
  enum_char: .
  _help_first_comment_is_literal:
  - If comment markup is enabled, don't reflow the first comment
  - block in each listfile. Use this to preserve formatting of
  - your copyright/license statements.
  first_comment_is_literal: false
  _help_literal_comment_pattern:
  - If comment markup is enabled, don't reflow any comment block
  - which matches this (regex) pattern. Default is `None`
  - (disabled).
  literal_comment_pattern: null
  _help_fence_pattern:
  - Regular expression to match preformat fences in comments
  - default= ``r'^\s*([`~]{3}[`~]*)(.*)$'``
  fence_pattern: ^\s*([`~]{3}[`~]*)(.*)$
  _help_ruler_pattern:
  - Regular expression to match rulers in comments default=
  - '``r''^\s*[^\w\s]{3}.*[^\w\s]{3}$''``'
  ruler_pattern: ^\s*[^\w\s]{3}.*[^\w\s]{3}$
  _help_explicit_trailing_pattern:
  - If a comment line starts with this pattern then it
  - is explicitly a trailing comment for the preceding
  - argument. Default is '#<'
  explicit_trailing_pattern: '#<'
  _help_hashruler_min_length:
  - If a comment line starts with at least this many consecutive
  - hash characters, then don't lstrip() them off. This allows
  - for lazy hash rulers where the first hash char is not
  - separated by space
  hashruler_min_length: 10
  _help_canonicalize_hashrulers:
  - If true, then insert a space between the first hash char and
  - remaining hash chars in a hash ruler, and normalize its
  - length to fill the column
  canonicalize_hashrulers: true
  _help_enable_markup:
  - enable comment markup parsing and reflow
  enable_markup: true
_help_lint: Options affecting the linter
lint:
  _help_disabled_codes:
  - a list of lint codes to disable
  disabled_codes: []
  _help_function_pattern:
  - regular expression pattern describing valid function names
  function_pattern: '[0-9a-z_]+'
  _help_macro_pattern:
  - regular expression pattern describing valid macro names
  macro_pattern: '[0-9A-Z_]+'
  _help_global_var_pattern:
  - regular expression pattern describing valid names for
  - variables with global (cache) scope
  global_var_pattern: '[A-Z][0-9A-Z_]+'
  _help_internal_var_pattern:
  - regular expression pattern describing valid names for
  - variables with global scope (but internal semantic)
  internal_var_pattern: _[A-Z][0-9A-Z_]+
  _help_local_var_pattern:
  - regular expression pattern describing valid names for
  - variables with local scope
  local_var_pattern: '[a-z][a-z0-9_]+'
  _help_private_var_pattern:
  - regular expression pattern describing valid names for
  - private directory variables
  private_var_pattern: _[0-9a-z_]+
  _help_public_var_pattern:
  - regular expression pattern describing valid names for public
  - directory variables
  public_var_pattern: '[A-Z][0-9A-Z_]+'
  _help_argument_var_pattern:
  - regular expression pattern describing valid names for
  - function/macro arguments and loop variables.
  argument_var_pattern: '[a-z][a-z0-9_]+'
  _help_keyword_pattern:
  - regular expression pattern describing valid names for
  - keywords used in functions or macros
  keyword_pattern: '[A-Z][0-9A-Z_]+'
  _help_max_conditionals_custom_parser:
  - In the heuristic for C0201, how many conditionals to match
  - within a loop before considering the loop a parser.
  max_conditionals_custom_parser: 2
  _help_min_statement_spacing:
  - Require at least this many newlines between statements
  min_statement_spacing: 1
  _help_max_statement_spacing:
  - Require no more than this many newlines between statements
  max_statement_spacing: 2
  max_returns: 6
  max_branches: 12
  max_arguments: 5
  max_localvars: 15
  max_statements: 50
_help_encode: Options affecting file encoding
encode:
  _help_emit_byteorder_mark:
  - If true, emit the unicode byte-order mark (BOM) at the start
  - of the file
  emit_byteorder_mark: false
  _help_input_encoding:
  - Specify the encoding of the input file. Defaults to utf-8
  input_encoding: utf-8
  _help_output_encoding:
  - Specify the encoding of the output file. Defaults to utf-8.
  - Note that cmake only claims to support utf-8 so be careful
  - when using anything else
  output_encoding: utf-8
_help_misc: Miscellaneous configurations options.
misc:
  _help_per_command:
  - A dictionary containing any per-command configuration
  - overrides. Currently only `command_case` is supported.
  per_command: {}


================================================
FILE: .github/ISSUE_TEMPLATE/tensorrtx-issue-template.md
================================================
---
name: tensorrtx issue template
about: To understand your issue better
title: ''
labels: ''
assignees: ''

---

## Env

- GPU, e.g. V100, RTX2080, TX2, Xavier NX, Nano, etc.
- OS, e.g. Ubuntu16.04, Win10, etc.
- CUDA version
- TensorRT version

## About this repo

- which branch/tag/commit are you using?
- which model? yolov5, retinaface?

## Your problem

- what is your command? e.g. `sudo ./yolov5 -s`
- what's your output?
- what output do you expect?


================================================
FILE: .github/stale.yml
================================================
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 60
# Number of days of inactivity before a stale issue is closed
daysUntilClose: 7
# Issues with these labels will never be considered stale
exemptLabels:
  - pinned
  - security
# Label to use when marking an issue as stale
staleLabel: wontfix
# Comment to post when marking an issue as stale. Set to `false` to disable
markComment: >
  This issue has been automatically marked as stale because it has not had
  recent activity. It will be closed if no further activity occurs. Thank you
  for your contributions.
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: false


================================================
FILE: .github/workflows/pre-commit.yml
================================================
# Runs the pre-commit hooks on pushes to, and pull requests against, the
# `master` and `trt10` branches. Only the commit range of the triggering event
# is passed to pre-commit (see the two event-specific steps below).
name: pre-commit

on:
  pull_request:
    branches:
      - master
      - trt10

  push:
    branches:
      - master
      - trt10

jobs:
  pre-commit:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v5
        with:
          # grab the history of the PR
          fetch-depth: 0

      # Explicitly fetch both ends of the pushed range. The first fetch is
      # allowed to fail (`|| true`).
      # NOTE(review): presumably because `github.event.before` can be empty or
      # unreachable for some events -- confirm.
      - name: Fetch commits
        run: |
          git fetch origin ${{ github.event.before }} || true
          git fetch origin ${{ github.sha }}

      - uses: actions/setup-python@v4

      # Push events: pass the range from the previous ref (`event.before`) to
      # the pushed commit.
      - uses: pre-commit/action@v3.0.1
        if: github.event_name == 'push'
        with:
          extra_args: >
            --from-ref ${{ github.event.before }}
            --to-ref   ${{ github.sha }}
            --show-diff-on-failure --color=always

      # Pull-request events: pass the range from the PR base commit to the PR
      # head commit.
      - uses: pre-commit/action@v3.0.1
        if: github.event_name == 'pull_request'
        with:
          extra_args: >
            --from-ref ${{ github.event.pull_request.base.sha }}
            --to-ref   ${{ github.event.pull_request.head.sha }}
            --show-diff-on-failure --color=always


================================================
FILE: .gitignore
================================================
models
build
*.wts
*.engine
*.tpymodel
*/*.ppm
*idea*

# Visual Studio Code: ignore the workspace folder except for the shareable
# project-level configuration files re-included below
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

CMakeLists.txt.user
CMakeCache.txt
CMakeFiles
CMakeScripts
Testing
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps
CMakeUserPresets.json


================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit hook configuration: generic repository hygiene checks,
# clang-format for C/C++/CUDA sources, flake8 for Python and cmake-format for
# CMake files.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-merge-conflict
      - id: check-symlinks
      # The two whitespace fixers are restricted to Python files
      - id: end-of-file-fixer
        types: [python]
      - id: trailing-whitespace
        types: [python]
      - id: check-added-large-files
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v18.1.3
    hooks:
      - id: clang-format
        types_or: [c++, c, cuda]
        # Use the style from the .clang-format file found for each source
        args: [-style=file]
  - repo: https://github.com/PyCQA/flake8
    rev: 7.0.0
    hooks:
      - id: flake8
        args: [--max-line-length=120]
  - repo: https://github.com/cheshirekow/cmake-format-precommit
    rev: v0.6.13
    hooks:
      - id: cmake-format
        additional_dependencies: [pyyaml]
        args: [--in-place, -c, .cmake-format.yaml]
        types: [file]
        # Matches *.cmake and CMakeLists.txt, optionally followed by ".in".
        # Dots are escaped so they match a literal "." rather than any
        # character.
        files: (\.cmake|CMakeLists\.txt)(\.in)?$


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2019-2020 Wang Xinyu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# TensorRTx

TensorRTx aims to implement popular deep learning networks with TensorRT network definition API.

Why don't we use a parser (ONNX parser, UFF parser, caffe parser, etc), but use complex APIs to build a network from scratch? I have summarized the advantages in the following aspects.

- **Flexible**, easy to modify the network, add/delete a layer or input/output tensor, replace a layer, merge layers, integrate preprocessing and postprocessing into network, etc.
- **Debuggable**, construct the entire network in an incremental development manner, easy to get middle layer results.
- **Educational**, learn about the network structure during this development, rather than treating everything as a black box.

The basic workflow of TensorRTx is:

1. Get the trained models from pytorch, mxnet or tensorflow, etc. Some pytorch models can be found in my repo [pytorchx](https://github.com/wang-xinyu/pytorchx), the remaining are from popular open-source repos.
2. Export the weights to a plain text file -- [.wts file](./tutorials/getting_started.md#the-wts-content-format).
3. Load weights in TensorRT, define the network, build a TensorRT engine.
4. Load the TensorRT engine and run inference.

## News

- `3 Mar 2026`. [zgjja](https://github.com/zgjja) Add Vision Transformer
- `2 Feb 2026`. [fazligorkembal](https://github.com/fazligorkembal) Yolo26-Det, Yolo26-Obb, Yolo26-Cls
- `15 Jan 2026`. [zgjja](https://github.com/zgjja) Refactor multiple old CV models to support TensorRT SDK through 7~10.
- `8 Jan 2026`. [ydk61](https://github.com/ydk61): YOLOv13
- `10 May 2025`. [pranavm-nvidia](https://github.com/pranavm-nvidia): [YOLO11](./yolo11_tripy) written in [Tripy](https://github.com/NVIDIA/TensorRT-Incubator/tree/main/tripy).
- `2 May 2025`. [fazligorkembal](https://github.com/fazligorkembal): YOLO12
- `12 Apr 2025`. [pranavm-nvidia](https://github.com/pranavm-nvidia): First [Lenet](https://github.com/wang-xinyu/tensorrtx/tree/master/lenet#tripy-new-tensorrt-python-programming-model) example written in [Tripy](https://github.com/NVIDIA/TensorRT-Incubator/tree/main/tripy).
- `11 Apr 2025`. [mpj1234](https://github.com/mpj1234): [YOLO11-obb](https://github.com/wang-xinyu/tensorrtx/tree/master/yolo11)
- `22 Oct 2024`. [lindsayshuo](https://github.com/lindsayshuo): YOLOv8-obb
- `18 Oct 2024`. [zgjja](https://github.com/zgjja): Refactor docker image.
- `11 Oct 2024`. [mpj1234](https://github.com/mpj1234): YOLO11
- `9 Oct 2024`. [Phoenix8215](https://github.com/Phoenix8215): GhostNet V1 and V2.
- `21 Aug 2024`. [Lemonononon](https://github.com/Lemonononon): real-esrgan-general-x4v3
- `29 Jul 2024`. [mpj1234](https://github.com/mpj1234): Check the YOLOv5, YOLOv8 & YOLOv10 in TensorRT 10.x API, branch → [trt10](https://github.com/wang-xinyu/tensorrtx/tree/trt10)
- `29 Jul 2024`. [mpj1234](https://github.com/mpj1234): YOLOv10
- `21 Jun 2024`. [WuxinrongY](https://github.com/WuxinrongY): YOLOv9-T, YOLOv9-S, YOLOv9-M
- `28 Apr 2024`. [lindsayshuo](https://github.com/lindsayshuo): YOLOv8-pose
- `22 Apr 2024`. [B1SH0PP](https://github.com/B1SH0PP): EfficientAd: Accurate Visual Anomaly Detection at Millisecond-Level Latencies.
- `18 Apr 2024`. [lindsayshuo](https://github.com/lindsayshuo): YOLOv8-p2

## Tutorials

- [How to make contribution](./tutorials/contribution.md)
- [Install the dependencies.](./tutorials/install.md)
- [A guide for quickly getting started, taking lenet5 as a demo.](./tutorials/getting_started.md)
- [The .wts file content format](./tutorials/getting_started.md#the-wts-content-format)
- [Frequently Asked Questions (FAQ)](./tutorials/faq.md)
- [Migration Guide](./tutorials/migration_guide.md)
- [How to implement multi-GPU processing, taking YOLOv4 as example](./tutorials/multi_GPU_processing.md)
- [Check if Your GPU Supports FP16/INT8](./tutorials/check_fp16_int8_support.md)
- [How to Compile and Run on Windows](./tutorials/run_on_windows.md)
- [Deploy YOLOv4 with Triton Inference Server](https://github.com/isarsoft/yolov4-triton-tensorrt)
- [From pytorch to trt step by step, hrnet as example(Chinese)](./tutorials/from_pytorch_to_trt_stepbystep_hrnet.md)

## Test Environment

1. (**NOT recommended**) TensorRT 7.x
2. (**Recommended**) TensorRT 8.x
3. (**NOT recommended**) TensorRT 10.x

### Note

1. For historical reasons, some of the models are limited to specific TensorRT versions; please check the README.md or code for the model you want to use.
2. Currently, TensorRT 8.x has the best compatibility and supports the most features.

## How to run

**Note**: this project supports building each network with the `CMakeLists.txt` in its subfolder, or you can build them all together with the `CMakeLists.txt` at the top of this project.

- General procedures before building and running:

```bash
# 1. generate xxx.wts from https://github.com/wang-xinyu/pytorchx/tree/master/lenet
# ...

# 2. put xxx.wts on top of this folder
# ...
```

- (_Option 1_) To build a single subproject in this project, do:

```bash
## enter the subfolder
cd tensorrtx/xxx

## configure & build
cmake -S . -B build
make -C build
```

- (_Option 2_) To build many subprojects, first, in the top `CMakeLists.txt`, **comment out** the projects you don't want to build or that are not supported by your TensorRT version, e.g., you cannot build subprojects in `${TensorRT_8_Targets}` if your TensorRT is `7.x`. Then:

```bash
## enter the top of this project
cd tensorrtx

## configure & build
# you may use "Ninja" rather than "make" to significantly boost the build speed
cmake -G Ninja -S . -B build
ninja -C build
```

**WARNING**: This part is still under development, most subprojects are not adapted yet.

- run the generated executable, e.g.:

```bash
# serialize model to plan file i.e. 'xxx.engine'
build/xxx -s

# deserialize plan file and run inference
build/xxx -d

# (Optional) check if the output is same as pytorchx/lenet
# ...

# (Optional) customize the project
# ...
```

For more details, each subfolder may contain a `README.md` inside, which explains more.

## Models

The following models are implemented.

| Name                                     | Description                                                                                                                                                                                                                                                       |
| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [mlp](./mlp)                             | the very basic model for starters, properly documented                                                                                                                                                                                                            |
| [lenet](./lenet)                         | the simplest, as a "hello world" of this project                                                                                                                                                                                                                  |
| [alexnet](./alexnet)                     | easy to implement, all layers are supported in tensorrt                                                                                                                                                                                                           |
| [googlenet](./googlenet)                 | GoogLeNet (Inception v1)                                                                                                                                                                                                                                          |
| [inception](./inception)                 | Inception v3, v4                                                                                                                                                                                                                                                  |
| [mnasnet](./mnasnet)                     | MNASNet with depth multiplier of 0.5 from the paper                                                                                                                                                                                                               |
| [mobilenet](./mobilenet)                 | MobileNet v2, v3-small, v3-large                                                                                                                                                                                                                                  |
| [resnet](./resnet)                       | resnet-18, resnet-50 and resnext50-32x4d are implemented                                                                                                                                                                                                          |
| [senet](./senet)                         | se-resnet50                                                                                                                                                                                                                                                       |
| [shufflenet](./shufflenetv2)             | ShuffleNet v2 with 0.5x output channels                                                                                                                                                                                                                           |
| [squeezenet](./squeezenet)               | SqueezeNet 1.1 model                                                                                                                                                                                                                                              |
| [vgg](./vgg)                             | VGG 11-layer model                                                                                                                                                                                                                                                |
| [ViT](./vit)                             | vision transformer, using weight and model from huggingface                                                                                                                                                                                                       |
| [yolov3-tiny](./yolov3-tiny)             | weights and pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3)                                                                                                                                                               |
| [yolov3](./yolov3)                       | darknet-53, weights and pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3)                                                                                                                                                   |
| [yolov3-spp](./yolov3-spp)               | darknet-53, weights and pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3)                                                                                                                                                   |
| [yolov4](./yolov4)                       | CSPDarknet53, weights from [AlexeyAB/darknet](https://github.com/AlexeyAB/darknet#pre-trained-models), pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3)                                                                    |
| [yolov5](./yolov5)                       | yolov5 v1.0-v7.0 of [ultralytics/yolov5](https://github.com/ultralytics/yolov5), detection, classification and instance segmentation                                                                                                                              |
| [yolov7](./yolov7)                       | yolov7 v0.1, pytorch implementation from [WongKinYiu/yolov7](https://github.com/WongKinYiu/yolov7)                                                                                                                                                                |
| [yolov8](./yolov8)                       | yolov8, pytorch implementation from [ultralytics](https://github.com/ultralytics/ultralytics)                                                                                                                                                                     |
| [yolov9](./yolov9)                       | The Pytorch implementation is [WongKinYiu/yolov9](https://github.com/WongKinYiu/yolov9).                                                                                                                                                                          |
| [yolov10](./yolov10)                     | The Pytorch implementation is [THU-MIG/yolov10](https://github.com/THU-MIG/yolov10).                                                                                                                                                                              |
| [yolo11](./yolo11)                       | The Pytorch implementation is [ultralytics](https://github.com/ultralytics/ultralytics).                                                                                                                                                                          |
| [yolo12](./yolov12)                      | The Pytorch implementation is [ultralytics](https://github.com/ultralytics/ultralytics).                                                                                                                                                                          |
| [yolop](./yolop)                         | yolop, pytorch implementation from [hustvl/YOLOP](https://github.com/hustvl/YOLOP)                                                                                                                                                                                |
| [retinaface](./retinaface)               | resnet50 and mobilnet0.25, weights from [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface)                                                                                                                                               |
| [arcface](./arcface)                     | LResNet50E-IR, LResNet100E-IR and MobileFaceNet, weights from [deepinsight/insightface](https://github.com/deepinsight/insightface)                                                                                                                               |
| [retinafaceAntiCov](./retinafaceAntiCov) | mobilenet0.25, weights from [deepinsight/insightface](https://github.com/deepinsight/insightface), retinaface anti-COVID-19, detect face and mask attribute                                                                                                       |
| [dbnet](./dbnet)                         | Scene Text Detection, weights from [BaofengZan/DBNet.pytorch](https://github.com/BaofengZan/DBNet.pytorch)                                                                                                                                                        |
| [crnn](./crnn)                           | pytorch implementation from [meijieru/crnn.pytorch](https://github.com/meijieru/crnn.pytorch)                                                                                                                                                                     |
| [ufld](./ufld)                           | pytorch implementation from [Ultra-Fast-Lane-Detection](https://github.com/cfzd/Ultra-Fast-Lane-Detection), ECCV2020                                                                                                                                              |
| [hrnet](./hrnet)                         | hrnet-image-classification and hrnet-semantic-segmentation, pytorch implementation from [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification) and [HRNet-Semantic-Segmentation](https://github.com/HRNet/HRNet-Semantic-Segmentation) |
| [psenet](./psenet)                       | PSENet Text Detection, tensorflow implementation from [liuheng92/tensorflow_PSENet](https://github.com/liuheng92/tensorflow_PSENet)                                                                                                                               |
| [ibnnet](./ibnnet)                       | IBN-Net, pytorch implementation from [XingangPan/IBN-Net](https://github.com/XingangPan/IBN-Net), ECCV2018                                                                                                                                                        |
| [unet](./unet)                           | U-Net, pytorch implementation from [milesial/Pytorch-UNet](https://github.com/milesial/Pytorch-UNet)                                                                                                                                                              |
| [repvgg](./repvgg)                       | RepVGG, pytorch implementation from [DingXiaoH/RepVGG](https://github.com/DingXiaoH/RepVGG)                                                                                                                                                                       |
| [lprnet](./lprnet)                       | LPRNet, pytorch implementation from [xuexingyu24/License_Plate_Detection_Pytorch](https://github.com/xuexingyu24/License_Plate_Detection_Pytorch)                                                                                                                 |
| [refinedet](./refinedet)                 | RefineDet, pytorch implementation from [luuuyi/RefineDet.PyTorch](https://github.com/luuuyi/RefineDet.PyTorch)                                                                                                                                                    |
| [densenet](./densenet)                   | DenseNet-121, from torchvision.models                                                                                                                                                                                                                             |
| [rcnn](./rcnn)                           | FasterRCNN and MaskRCNN, model from [detectron2](https://github.com/facebookresearch/detectron2)                                                                                                                                                                  |
| [tsm](./tsm)                             | TSM: Temporal Shift Module for Efficient Video Understanding, ICCV2019                                                                                                                                                                                            |
| [scaled-yolov4](./scaled-yolov4)         | yolov4-csp, pytorch from [WongKinYiu/ScaledYOLOv4](https://github.com/WongKinYiu/ScaledYOLOv4)                                                                                                                                                                    |
| [centernet](./centernet)                 | CenterNet DLA-34, pytorch from [xingyizhou/CenterNet](https://github.com/xingyizhou/CenterNet)                                                                                                                                                                    |
| [efficientnet](./efficientnet)           | EfficientNet b0-b8 and l2, pytorch from [lukemelas/EfficientNet-PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch)                                                                                                                                       |
| [detr](./detr)                           | DE⫶TR, pytorch from [facebookresearch/detr](https://github.com/facebookresearch/detr)                                                                                                                                                                             |
| [swin-transformer](./swin-transformer)   | Swin Transformer - Semantic Segmentation, only support Swin-T. The Pytorch implementation is [microsoft/Swin-Transformer](https://github.com/microsoft/Swin-Transformer.git)                                                                                      |
| [real-esrgan](./real-esrgan)             | Real-ESRGAN. The Pytorch implementation is [real-esrgan](https://github.com/xinntao/Real-ESRGAN)                                                                                                                                                                  |
| [superpoint](./superpoint)               | SuperPoint. The Pytorch model is from [magicleap/SuperPointPretrainedNetwork](https://github.com/magicleap/SuperPointPretrainedNetwork)                                                                                                                           |
| [csrnet](./csrnet)                       | CSRNet. The Pytorch implementation is [leeyeehoo/CSRNet-pytorch](https://github.com/leeyeehoo/CSRNet-pytorch)                                                                                                                                                     |
| [EfficientAd](./efficient_ad)            | EfficientAd: Accurate Visual Anomaly Detection at Millisecond-Level Latencies. From [anomalib](https://github.com/openvinotoolkit/anomalib)                                                                                                                       |

## Model Zoo

The .wts files can be downloaded from model zoo for quick evaluation. But it is recommended to convert .wts from pytorch/mxnet/tensorflow model, so that you can retrain your own model.

[GoogleDrive](https://drive.google.com/drive/folders/1Ri0IDa5OChtcA3zjqRTW57uG6TnfN4Do?usp=sharing) | [BaiduPan](https://pan.baidu.com/s/19s6hO8esU7-TtZEXN7G3OA) pwd: uvv2

## Tricky Operations

Some tricky operations were encountered in these models. They are already solved, but there might be better solutions.

| Name                      | Description                                                                                           |
| ------------------------- | ----------------------------------------------------------------------------------------------------- |
| BatchNorm                 | Implement by a scale layer, used in resnet, googlenet, mobilenet, etc.                                |
| MaxPool2d(ceil_mode=True) | use a padding layer before maxpool to solve ceil_mode=True, see googlenet.                            |
| average pool with padding | use setAverageCountExcludesPadding() when necessary, see inception.                                   |
| relu6                     | use `Relu6(x) = Relu(x) - Relu(x-6)`, see mobilenet.                                                  |
| torch.chunk()             | implement the 'chunk(2, dim=C)' by tensorrt plugin, see shufflenet.                                   |
| channel shuffle           | use two shuffle layers to implement `channel_shuffle`, see shufflenet.                                |
| adaptive pool             | use fixed input dimension, and use regular average pooling, see shufflenet.                           |
| leaky relu                | I wrote a leaky relu plugin, but PRelu in `NvInferPlugin.h` can be used, see yolov3 in branch `trt4`. |
| yolo layer v1             | yolo layer is implemented as a plugin, see yolov3 in branch `trt4`.                                   |
| yolo layer v2             | three yolo layers implemented in one plugin, see yolov3-spp.                                          |
| upsample                  | replaced by a deconvolution layer, see yolov3.                                                        |
| hsigmoid                  | hard sigmoid is implemented as a plugin, hsigmoid and hswish are used in mobilenetv3                  |
| retinaface output decode  | implement a plugin to decode bbox, confidence and landmarks, see retinaface.                          |
| mish                      | mish activation is implemented as a plugin, mish is used in yolov4                                    |
| prelu                     | mxnet's prelu activation with trainable gamma is implemented as a plugin, used in arcface             |
| HardSwish                 | hard_swish = x \* hard_sigmoid, used in yolov5 v3.0                                                   |
| LSTM                      | Implemented pytorch nn.LSTM() with tensorrt api                                                       |

## Speed Benchmark

| Models                    | Device               | BatchSize | Mode | Input Shape(HxW) | FPS  |
| ------------------------- | -------------------- | :-------: | :--: | :--------------: | :--: |
| YOLOv3-tiny               | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      | 333  |
| YOLOv3(darknet53)         | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      | 39.2 |
| YOLOv3(darknet53)         | Xeon E5-2620/GTX1080 |     1     | INT8 |     608x608      | 71.4 |
| YOLOv3-spp(darknet53)     | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      | 38.5 |
| YOLOv4(CSPDarknet53)      | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      | 35.7 |
| YOLOv4(CSPDarknet53)      | Xeon E5-2620/GTX1080 |     4     | FP32 |     608x608      | 40.9 |
| YOLOv4(CSPDarknet53)      | Xeon E5-2620/GTX1080 |     8     | FP32 |     608x608      | 41.3 |
| YOLOv5-s v3.0             | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      | 142  |
| YOLOv5-s v3.0             | Xeon E5-2620/GTX1080 |     4     | FP32 |     608x608      | 173  |
| YOLOv5-s v3.0             | Xeon E5-2620/GTX1080 |     8     | FP32 |     608x608      | 190  |
| YOLOv5-m v3.0             | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      |  71  |
| YOLOv5-l v3.0             | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      |  43  |
| YOLOv5-x v3.0             | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      |  29  |
| YOLOv5-s v4.0             | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      | 142  |
| YOLOv5-m v4.0             | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      |  71  |
| YOLOv5-l v4.0             | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      |  40  |
| YOLOv5-x v4.0             | Xeon E5-2620/GTX1080 |     1     | FP32 |     608x608      |  27  |
| RetinaFace(resnet50)      | Xeon E5-2620/GTX1080 |     1     | FP32 |     480x640      |  90  |
| RetinaFace(resnet50)      | Xeon E5-2620/GTX1080 |     1     | INT8 |     480x640      | 204  |
| RetinaFace(mobilenet0.25) | Xeon E5-2620/GTX1080 |     1     | FP32 |     480x640      | 417  |
| ArcFace(LResNet50E-IR)    | Xeon E5-2620/GTX1080 |     1     | FP32 |     112x112      | 333  |
| CRNN                      | Xeon E5-2620/GTX1080 |     1     | FP32 |      32x100      | 1000 |

Help wanted: if you have speed results, please open an issue or submit a PR.

## Acknowledgments & Contact

Contributions, questions, and discussions are all welcome. You can contact me using the information below.

E-mail: wangxinyu_es@163.com

WeChat ID: wangxinyu0375 (可加我微信进 tensorrtx 交流群，**备注：tensorrtx**)


================================================
FILE: alexnet/CMakeLists.txt
================================================
# find_package(CUDAToolkit) below relies on the FindCUDAToolkit module, which
# first shipped with CMake 3.17, and the bundled FindTensorRT.cmake declares the
# same minimum. Requesting 3.17 here makes older CMake fail up front with a
# clear message instead of half-way through configuration.
cmake_minimum_required(VERSION 3.17)

project(
  alexnet
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default GPU architectures, used only when the caller did not choose any
# (e.g. via -DCMAKE_CUDA_ARCHITECTURES=...).
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      75
      80
      86
      89
      90
      100
      120)
endif()

# C++17 / CUDA 17 are mandatory (alexnet.cc uses `if constexpr` and structured
# bindings).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

# Define the TensorRT::TensorRT target once; reuse it when another project in
# the same build tree has already created it.
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

add_executable(${PROJECT_NAME} alexnet.cc)

target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}
                                                   ${OpenCV_INCLUDE_DIRS})

target_link_libraries(
  ${PROJECT_NAME} PRIVATE Threads::Threads TensorRT::TensorRT CUDA::cudart
                          ${OpenCV_LIBS})


================================================
FILE: alexnet/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Filters the candidate directories given after the two named arguments and
# stores, in <var_name> in the caller's scope, the list of every candidate that
# exists and contains all entries of the <required_files> list. Aborts the
# configure step with FATAL_ERROR when no candidate qualifies.
function(_guess_path var_name required_files)
  set(_accepted "")

  foreach(_candidate IN LISTS ARGN)
    # Candidates that are not present on this machine are ignored outright.
    if(NOT EXISTS "${_candidate}")
      message(DEBUG "skip non-existing path '${_candidate}'")
      continue()
    endif()

    # Stop at the first missing file; one miss is enough to reject the path.
    set(_complete TRUE)
    foreach(_needed IN LISTS required_files)
      if(NOT EXISTS "${_candidate}/${_needed}")
        set(_complete FALSE)
        message(DEBUG "'${_candidate}' missing '${_needed}'")
        break()
      endif()
    endforeach()

    if(NOT _complete)
      message(DEBUG "reject '${_candidate}'")
      continue()
    endif()

    list(APPEND _accepted "${_candidate}")
    message(DEBUG "accept '${_candidate}'")
  endforeach()

  if(_accepted STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  # Hand the surviving directories back to the caller.
  set(${var_name}
      "${_accepted}"
      PARENT_SCOPE)
endfunction()

# Imported interface target that carries the TensorRT include directories and
# link libraries; consumers link against TensorRT::TensorRT.
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

# User-facing cache entry; stays empty unless given with -DTRT_VERSION=...
set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# The environment variable is quoted everywhere: an unquoted, empty
# $ENV{TRT_VERSION} expands to *no argument at all*, which leaves the if()
# conditions malformed and only correct by accident.
if(NOT TRT_VERSION STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# Extract the leading number as the major version. The input must be quoted:
# with an empty TRT_VERSION the unquoted form drops the argument and
# string(REGEX MATCH) aborts with "needs at least 5 arguments".
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

if(TRT_MAJOR_VERSION STREQUAL "")
  message(
    WARNING
      "TRT_VERSION is not set (neither -DTRT_VERSION nor the TRT_VERSION environment variable); the TensorRT major version is unknown"
  )
endif()

# Select the library names and the include/library directories per platform.
#
# TRT_MAJOR_VERSION is referenced by *name* in the if() conditions below instead
# of being expanded as ${TRT_MAJOR_VERSION}: when the version is unknown (empty)
# the expanded form turns into `if( GREATER_EQUAL 8)`, which is a hard CMake
# error. By name, an empty value simply compares as false.
if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(
      FATAL_ERROR
        "TensorRT_DIR=${TensorRT_DIR} does not exist!"
    )
  endif()

  # From TensorRT 10 on, the Windows libraries carry a "_10" suffix.
  if(TRT_MAJOR_VERSION GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  # Candidate directories depend on the target CPU architecture.
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include"
                                "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  set(_modules nvinfer nvinfer_plugin)
  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Resolve every requested TensorRT module to a library file. A module that is
# not installed is skipped with a warning rather than appended as
# "<var>-NOTFOUND", which would abort generation with "variables are used in
# this project, but they are set to NOTFOUND".
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  if(TensorRT_${lib}_LIBRARY)
    list(APPEND TensorRT_LIBRARIES "${TensorRT_${lib}_LIBRARY}")
  else()
    message(
      WARNING
        "TensorRT library '${lib}' was not found in '${TensorRT_LIBRARY_DIR}', skipping it"
    )
  endif()
endforeach()

# Without a single library the target would be useless; fail while the cause is
# still obvious.
if(NOT TensorRT_LIBRARIES)
  message(
    FATAL_ERROR
      "No TensorRT library could be located (searched in '${TensorRT_LIBRARY_DIR}')"
  )
endif()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

# Drop the helper variables so they do not leak into the including project.
unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)


================================================
FILE: alexnet/README.md
================================================
# alexnet

## Introduction

AlexNet model architecture comes from this paper: [One weird trick for parallelizing convolutional neural networks](https://arxiv.org/abs/1404.5997). To generate `.wts` file, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet). To check the pytorch implementation of AlexNet, refer to [HERE](https://github.com/pytorch/vision/blob/main/torchvision/models/alexnet.py#L17)

AlexNet consists of 3 major parts: features, adaptive average pooling, and classifier:

- features: just several stacked `CRP`(conv-relu-pool) and `CR` layers
- adaptive average pooling: PyTorch decides its internal parameters automatically, but we need to calculate them ourselves with the TensorRT API
- classifier: just several `fc-relu` layers. All layers can be implemented by tensorrt api, including `addConvolution`, `addActivation`, `addPooling`, `addMatrixMultiply`, `addElementWise` etc.

## Use AlexNet from PyTorch

We can use torchvision to load the pretrained alexnet model:

```python
alexnet = torchvision.models.alexnet(pretrained=True)
```

The model structure is:

```bash
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)
```

## Usage

1. use `gen_wts.py` to generate wts file.

```bash
python3 gen_wts.py
```

2. build C++ code

```bash
pushd tensorrtx/alexnet
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

3. serialize wts model to engine file.

```bash
./build/alexnet -s
```

4. run inference

```bash
./build/alexnet -d
```

output looks like:

```txt
...
====
Execution time: 1ms
0.1234, -0.5678, ...
====
prediction result:
Top: 0 idx: 285, logits: 9.9, label: Egyptian cat
Top: 1 idx: 281, logits: 8.304, label: tabby, tabby cat
Top: 2 idx: 282, logits: 6.859, label: tiger cat
```

## FAQ

### How to align the output with Pytorch?

If your output differs from PyTorch, you have to find out which TensorRT API call or which part of your code causes the difference. A simple approach is to check the `.engine` output part by part, e.g., you can mark an earlier layer of AlexNet as the output:

```c++
fc3_1->getOutput(0)->setName(OUTPUT_NAME);
network->markOutput(*pool3->getOutput(0)); // original is: "*fc3_1->getOutput(0)"
```

In this snippet, I use the output of the "features" part of AlexNet and ignore the rest of the model. Don't forget to change the `OUTPUT_SIZE` macro at the top of the file, and finally rebuild the `.engine` file to apply the changes.

You can sum up all the output values in the C++ code and compare the result with the PyTorch output; in PyTorch, you can do this with `torch.sum(x)` while debugging. The ideal deviation between the two values would be in the range $[10^{-2}, 10^{-1}]$; for this example, since the "features" output has $256 * 6 * 6$ elements (batch = 1), the final error would be roughly $10^{-4}$.

Note: This is a quick check. For a more accurate check, you have to save the output tensors to files and compare them value by value, but this is rarely necessary.


================================================
FILE: alexnet/alexnet.cc
================================================
#include <array>
#include <chrono>
#include <cmath>
#include <opencv2/opencv.hpp>
#include <vector>
#include "logging.h"
#include "utils.h"

// Compile-time facts about AlexNet and the files used by this sample.
constexpr int32_t N = 1;  // batch size handed to APIToModel() and doInference()
constexpr int32_t INPUT_H = 224;
constexpr int32_t INPUT_W = 224;
// Per-sample element counts, indexed like NAMES (input first, output second).
// Only two slots are initialised explicitly; the third one is zero.
constexpr std::array<int64_t, 3> SIZES = {3LL * INPUT_H * INPUT_W, 1000};

// Tensor names of the network input and output.
constexpr std::array<const char*, 2> NAMES = {"data", "prob"};
constexpr const char* ENGINE_PATH = "../models/alexnet.engine";
constexpr const char* WTS_PATH = "../models/alexnet.wts";
constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt";
// When true, normalisation is done by a transform layer inside the engine and
// the input tensor is a uint8 NHWC image; otherwise the image is preprocessed
// on the CPU.
static constexpr bool TRT_PREPROCESS = (TRT_VERSION >= 8510);
// Per-channel normalisation constants passed to the preprocessing helpers.
static constexpr std::array<const float, 3> mean = {0.485f, 0.456f, 0.406f};
static constexpr std::array<const float, 3> stdv = {0.229f, 0.224f, 0.225f};

// Short aliases used while building the network.
using WeightMap = std::map<std::string, Weights>;
using M = nvinfer1::MatrixOperation;
using E = nvinfer1::ElementWiseOperation;
using NDCF = nvinfer1::NetworkDefinitionCreationFlag;

static Logger gLogger;

/**
 * @brief Create the engine using TensorRT API and without any parser.
 *
 * Loads the weights from WTS_PATH, assembles the AlexNet layers by hand, builds
 * the engine and frees the host copies of the weights afterwards.
 *
 * @param N max batch size (also the leading dimension of the input tensor)
 * @param runtime used to deserialize the freshly built plan (TensorRT >= 8 path)
 * @param builder builder that owns the network definition
 * @param config builder configuration; the workspace limit is set here
 * @param dt input data type; replaced by kUINT8 when TRT_PREPROCESS is enabled
 * @return ICudaEngine* the built engine, or nullptr when the build failed
 */
ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    WeightMap weightMap = loadWeights(WTS_PATH);

#if TRT_VERSION >= 11200
    auto flag = 1U << static_cast<int>(NDCF::kSTRONGLY_TYPED);
#elif TRT_VERSION >= 10000
    auto flag = 0U;
#else
    auto flag = 1U << static_cast<int>(NDCF::kEXPLICIT_BATCH);
#endif
    auto* network = builder->createNetworkV2(flag);

    // Create input tensor
    ITensor* input{nullptr};
    if constexpr (TRT_PREPROCESS) {
        // raw uint8 NHWC image; the transform layer produces the tensor fed to conv1
        dt = DataType::kUINT8;
        input = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3});
        auto* trans = addTransformLayer(network, *input, true, mean, stdv);
        input = trans->getOutput(0);
    } else {
        input = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W});
    }
    assert(input);

    // CRP (Conv-Relu-Pool)
    auto* conv1 = network->addConvolutionNd(*input, 64, DimsHW{11, 11}, weightMap["features.0.weight"],
                                            weightMap["features.0.bias"]);
    auto* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    auto* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(conv1 && relu1 && pool1);
    conv1->setStrideNd(DimsHW{4, 4});
    conv1->setPaddingNd(DimsHW{2, 2});
    pool1->setStrideNd(DimsHW{2, 2});

    // CRP
    auto* conv2 = network->addConvolutionNd(*pool1->getOutput(0), 192, DimsHW{5, 5}, weightMap["features.3.weight"],
                                            weightMap["features.3.bias"]);
    auto* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU);
    auto* pool2 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(conv2 && pool2 && relu2);
    conv2->setPaddingNd(DimsHW{2, 2});
    pool2->setStrideNd(DimsHW{2, 2});

    // CR
    auto* conv3 = network->addConvolutionNd(*pool2->getOutput(0), 384, DimsHW{3, 3}, weightMap["features.6.weight"],
                                            weightMap["features.6.bias"]);
    auto* relu3 = network->addActivation(*conv3->getOutput(0), ActivationType::kRELU);
    assert(conv3 && relu3);
    conv3->setPaddingNd(DimsHW{1, 1});

    // CR
    auto* conv4 = network->addConvolutionNd(*relu3->getOutput(0), 256, DimsHW{3, 3}, weightMap["features.8.weight"],
                                            weightMap["features.8.bias"]);
    auto* relu4 = network->addActivation(*conv4->getOutput(0), ActivationType::kRELU);
    assert(conv4 && relu4);
    conv4->setPaddingNd(DimsHW{1, 1});

    // CRP
    auto* conv5 = network->addConvolutionNd(*relu4->getOutput(0), 256, DimsHW{3, 3}, weightMap["features.10.weight"],
                                            weightMap["features.10.bias"]);
    auto* relu5 = network->addActivation(*conv5->getOutput(0), ActivationType::kRELU);
    auto* pool3 = network->addPoolingNd(*relu5->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(conv5 && relu5 && pool3);
    conv5->setPaddingNd(DimsHW{1, 1});
    pool3->setStrideNd(DimsHW{2, 2});

    // adaptive average pooling (a 1x1 average window here)
    auto* adaptive_pool = network->addPoolingNd(*pool3->getOutput(0), PoolingType::kAVERAGE, DimsHW{1, 1});
    assert(adaptive_pool);
    IShuffleLayer* shuffle = network->addShuffle(*adaptive_pool->getOutput(0));
    assert(shuffle);
    shuffle->setReshapeDimensions(Dims2{N, -1});  // "-1" means "256 * 6 * 6"

    // all classifier tensors
    int64_t in_feat = 256ll * 6 * 6;
    auto* fc1w = network->addConstant(DimsHW{4096, in_feat}, weightMap["classifier.1.weight"])->getOutput(0);
    auto* fc1b = network->addConstant(DimsHW{1, 4096}, weightMap["classifier.1.bias"])->getOutput(0);
    auto* fc2w = network->addConstant(DimsHW{4096, 4096}, weightMap["classifier.4.weight"])->getOutput(0);
    auto* fc2b = network->addConstant(DimsHW{1, 4096}, weightMap["classifier.4.bias"])->getOutput(0);
    auto* fc3w = network->addConstant(DimsHW{1000, 4096}, weightMap["classifier.6.weight"])->getOutput(0);
    auto* fc3b = network->addConstant(DimsHW{1, 1000}, weightMap["classifier.6.bias"])->getOutput(0);
    assert(fc1w && fc1b && fc2w && fc2b && fc3w && fc3b);

    // all layers in classifier: each fully connected layer is x * W^T followed by "+ b"
    auto* fc1_0 = network->addMatrixMultiply(*shuffle->getOutput(0), M::kNONE, *fc1w, M::kTRANSPOSE);
    auto* fc1_1 = network->addElementWise(*fc1_0->getOutput(0), *fc1b, E::kSUM);
    auto* relu6 = network->addActivation(*fc1_1->getOutput(0), ActivationType::kRELU);
    assert(fc1_0 && fc1_1 && relu6);
    fc1_0->setName("fc1_0");  // set name here, only for debug purpose
    auto* fc2_0 = network->addMatrixMultiply(*relu6->getOutput(0), M::kNONE, *fc2w, M::kTRANSPOSE);
    auto* fc2_1 = network->addElementWise(*fc2_0->getOutput(0), *fc2b, E::kSUM);
    auto* relu7 = network->addActivation(*fc2_1->getOutput(0), ActivationType::kRELU);
    assert(fc2_0 && fc2_1 && relu7);
    fc2_0->setName("fc2_0");
    auto* fc3_0 = network->addMatrixMultiply(*relu7->getOutput(0), M::kNONE, *fc3w, M::kTRANSPOSE);
    auto* fc3_1 = network->addElementWise(*fc3_0->getOutput(0), *fc3b, E::kSUM);
    assert(fc3_0 && fc3_1);
    fc3_0->setName("fc3_0");

    fc3_1->getOutput(0)->setName(NAMES[1]);
    network->markOutput(*fc3_1->getOutput(0));

    // Build engine
#if TRT_VERSION >= 8000
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE);
    ICudaEngine* engine = nullptr;
    auto* host_mem = builder->buildSerializedNetwork(*network, *config);
    if (host_mem != nullptr) {
        engine = runtime->deserializeCudaEngine(host_mem->data(), host_mem->size());
        // the serialized plan is only needed for deserialization; release it so it is not leaked
        delete host_mem;
    } else {
        // a failed build yields no plan; report it instead of dereferencing a null pointer
        std::cerr << "failed to build the serialized network\n";
    }
    delete network;
#else
    builder->setMaxBatchSize(N);
    config->setMaxWorkspaceSize(WORKSPACE_SIZE);
    auto* engine = builder->buildEngineWithConfig(*network, *config);
    network->destroy();
#endif

    std::cout << "build finished\n";
    // the weight buffers were only needed while building
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return engine;
}

/**
 * @brief Build the AlexNet engine and hand back its serialized form.
 *
 * @param N max batch size forwarded to createEngine()
 * @param runtime runtime forwarded to createEngine()
 * @param modelStream receives the serialized engine; set to nullptr when the
 *        builder, its config or the engine could not be created
 */
void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) {
    assert(modelStream != nullptr);
    *modelStream = nullptr;

    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    if (builder == nullptr) {
        std::cerr << "failed to create the TensorRT builder\n";
        return;
    }
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = nullptr;
    if (config != nullptr) {
        engine = createEngine(N, runtime, builder, config, DataType::kFLOAT);
    }

    // Serialize the engine. The check is a real branch rather than an assert so
    // that a failed build is also caught when asserts are compiled out.
    if (engine != nullptr) {
        (*modelStream) = engine->serialize();
    } else {
        std::cerr << "failed to build the engine\n";
    }

    // Close everything down
#if TRT_VERSION >= 8000
    delete engine;
    delete config;
    delete builder;
#else
    if (engine != nullptr) {
        engine->destroy();
    }
    if (config != nullptr) {
        config->destroy();
    }
    builder->destroy();
#endif
}

/**
 * @brief Run one inference on the image at img_path and return the outputs.
 *
 * Allocates one device buffer per I/O tensor, uploads the (pre)processed image,
 * enqueues the engine on a private stream, downloads every non-input tensor as
 * float values and releases the device resources again.
 *
 * @param context execution context of the deserialized engine
 * @param img_path image file read with OpenCV
 * @param batchSize number of samples used to size the buffers
 * @return one float vector per output tensor (tensor index 1 and up)
 */
std::vector<std::vector<float>> doInference(IExecutionContext& context, const std::string& img_path,
                                            std::size_t batchSize) {
    // static so that the host data outlives the asynchronous upload below
    static std::vector<float> flat_img;
    auto img = cv::imread(img_path, cv::IMREAD_COLOR);
    assert(!img.empty() && "failed to read the input image");
    void* input = nullptr;

    // use preprocess from gpu(TensorRT) or cpu(OpenCV)
    if constexpr (TRT_PREPROCESS) {
        // for simplicity, resize image on cpu side
        cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        input = static_cast<void*>(img.data);
    } else {
        flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W);
        input = flat_img.data();
    }
    assert(input);

    const ICudaEngine& engine = context.getEngine();
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    std::vector<void*> buffers;

#if TRT_VERSION >= 8000
    const int32_t nIO = engine.getNbIOTensors();
#else
    const int32_t nIO = engine.getNbBindings();
#endif

    buffers.resize(nIO);
    for (auto i = 0; i < nIO; ++i) {
#if TRT_VERSION >= 8000
        auto* tensor_name = engine.getIOTensorName(i);
        auto s = getSize(engine.getTensorDataType(tensor_name));
        std::size_t size = s * batchSize * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
        context.setTensorAddress(tensor_name, buffers[i]);
#else
        const int32_t idx = engine.getBindingIndex(NAMES[i]);
        auto s = getSize(engine.getBindingDataType(idx));
        assert(idx == i);
        std::size_t size = s * batchSize * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
#endif
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
    }

    // The enqueue call must not live inside assert(): with NDEBUG the whole
    // expression is removed and inference would silently never run.
#if TRT_VERSION >= 8000
    const bool enqueued = context.enqueueV3(stream);
#else
    const bool enqueued = context.enqueueV2(buffers.data(), stream, nullptr);
#endif
    assert(enqueued);
    (void)enqueued;  // avoids an unused-variable warning when asserts are disabled

    // Download straight into the vectors that are returned, so the destination
    // of each asynchronous copy stays alive until the stream is synchronized.
    std::vector<std::vector<float>> prob;
    prob.reserve(nIO > 1 ? static_cast<std::size_t>(nIO - 1) : 0);
    for (int i = 1; i < nIO; ++i) {
        prob.emplace_back(batchSize * SIZES[i], std::nanf(""));
        auto& out = prob.back();
        CHECK(cudaMemcpyAsync(out.data(), buffers[i], out.size() * sizeof(float), cudaMemcpyDeviceToHost, stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    CHECK(cudaStreamDestroy(stream));
    for (auto i = 0; i < nIO; ++i) {
        CHECK(cudaFree(buffers[i]));
    }
    return prob;
}

int main(int argc, char** argv) {
    checkTrtEnv();
    if (argc != 2) {
        std::cerr << "arguments not right!\n";
        std::cerr << "./alexnet -s   // serialize model to plan file\n";
        std::cerr << "./alexnet -d   // deserialize plan file and run inference\n";
        return -1;
    }

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);

    // create a model using the API directly and serialize it to a stream
    char* trtModelStream{nullptr};
    std::streamsize size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(N, runtime, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc);
        if (!p) {
            std::cerr << "could not open plan output file\n";
            return -1;
        }
        if (modelStream->size() > static_cast<std::size_t>(std::numeric_limits<std::streamsize>::max())) {
            std::cerr << "this model is too large to serialize\n";
            return -1;
        }
        const auto* data_ptr = reinterpret_cast<const char*>(modelStream->data());
        auto data_size = static_cast<std::streamsize>(modelStream->size());
        p.write(data_ptr, data_size);

#if TRT_VERSION >= 8000
        delete modelStream;
#else
        modelStream->destroy();
#endif
        return 0;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file(ENGINE_PATH, std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        return -1;
    }

#if TRT_VERSION >= 8000
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
#else
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
#endif
    assert(engine != nullptr);

    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    const std::string img_path = "../assets/cats.jpg";
    for (int32_t i = 0; i < 100; ++i) {
        auto _start = std::chrono::system_clock::now();
        auto prob = doInference(*context, img_path, N);
        auto _end = std::chrono::system_clock::now();
        auto _time = std::chrono::duration_cast<std::chrono::milliseconds>(_end - _start).count();
        std::cout << "Execution time: " << _time << "ms\n";

        for (const auto& vector : prob) {
            int idx = 0;
            for (auto v : vector) {
                std::cout << std::setprecision(4) << v << ", " << std::flush;
                if (++idx > 20) {
                    std::cout << "\n====\n";
                    break;
                }
            }
        }

        if (i == 99) {
            std::cout << "prediction result:\n";
            auto labels = loadImagenetLabelMap(LABELS_PATH);
            int _top = 0;
            for (auto& [idx, logits] : topk(prob[0], 3)) {
                std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits
                          << ", label: " << labels[idx] << "\n";
            }
        }
    }

#if TRT_VERSION >= 8000
    delete context;
    delete engine;
    delete runtime;
#else
    context->destroy();
    engine->destroy();
    runtime->destroy();
#endif
    return 0;
}


================================================
FILE: alexnet/alexnet.py
================================================
import os
import sys
import struct
import argparse

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

# Batch size passed to API_to_model() and used to size the dummy input.
BATCH_SIZE = 1
# Height and width of the network input (the input tensor is 3 x INPUT_H x INPUT_W).
INPUT_H = 224
INPUT_W = 224
# Number of values produced by the last fully connected layer.
OUTPUT_SIZE = 1000
# Names given to the network's input and output tensors.
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"

# Weight file read by load_weights() and engine file written/read by this script.
WEIGHT_PATH = "./alexnet.wts"
ENGINE_PATH = "./alexnet.engine"

# Logger shared by the TensorRT builder and runtime.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)


def load_weights(file):
    """Parse a text ``.wts`` weight file into a ``{name: float32 array}`` dict.

    The first line holds the number of entries. Every following line is
    ``<name> <count> <hex> <hex> ...`` where each hex word is one big-endian
    32-bit float. Each entry is returned as an array of shape ``(count, 1)``.

    Raises ``AssertionError`` when the file is missing or when a declared count
    does not match what the file actually contains.
    """
    print(f"Loading weights: {file}")

    assert os.path.exists(file), 'Unable to load weight file.'

    with open(file, "r") as handle:
        rows = [row.strip() for row in handle]

    expected = int(rows[0])
    assert expected == len(rows) - 1

    tensors = {}
    for row in rows[1:expected + 1]:
        fields = row.split(" ")
        tensor_name = fields[0]
        declared = int(fields[1])
        hex_words = fields[2:]
        assert declared == len(hex_words)
        # struct.unpack yields 1-tuples, hence the trailing axis of length 1
        decoded = [struct.unpack(">f", bytes.fromhex(word)) for word in hex_words]
        tensors[tensor_name] = np.array(decoded, dtype=np.float32)

    return tensors


def create_engine(max_batch_size, builder, config, dt):
    """Assemble AlexNet layer by layer with the TensorRT API and build an engine.

    Args:
        max_batch_size: value assigned to ``builder.max_batch_size``.
        builder: TensorRT builder used to create and build the network.
        config: builder config handed to ``builder.build_engine``.
        dt: data type of the input tensor.

    Returns:
        Whatever ``builder.build_engine`` returns.
    """
    weights = load_weights(WEIGHT_PATH)
    net = builder.create_network()

    tensor = net.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert tensor

    # Feature extractor, one row per convolution:
    # (weight prefix, output maps, kernel edge, stride or None, padding, max-pool afterwards)
    feature_specs = (
        ("features.0", 64, 11, (4, 4), (2, 2), True),
        ("features.3", 192, 5, None, (2, 2), True),
        ("features.6", 384, 3, None, (1, 1), False),
        ("features.8", 256, 3, None, (1, 1), False),
        ("features.10", 256, 3, None, (1, 1), True),
    )
    for prefix, out_maps, edge, stride, padding, pooled in feature_specs:
        conv = net.add_convolution(input=tensor,
                                   num_output_maps=out_maps,
                                   kernel_shape=(edge, edge),
                                   kernel=weights[prefix + ".weight"],
                                   bias=weights[prefix + ".bias"])
        assert conv
        if stride is not None:
            conv.stride = stride
        conv.padding = padding

        relu = net.add_activation(conv.get_output(0), type=trt.ActivationType.RELU)
        assert relu
        tensor = relu.get_output(0)

        if pooled:
            pool = net.add_pooling(input=tensor,
                                   type=trt.PoolingType.MAX,
                                   window_size=trt.DimsHW(3, 3))
            assert pool
            pool.stride_nd = (2, 2)
            tensor = pool.get_output(0)

    # Classifier, one row per fully connected layer:
    # (weight prefix, number of outputs, ReLU afterwards)
    classifier_specs = (
        ("classifier.1", 4096, True),
        ("classifier.4", 4096, True),
        ("classifier.6", 1000, False),
    )
    for prefix, width, activated in classifier_specs:
        fc = net.add_fully_connected(input=tensor,
                                     num_outputs=width,
                                     kernel=weights[prefix + ".weight"],
                                     bias=weights[prefix + ".bias"])
        assert fc
        tensor = fc.get_output(0)

        if activated:
            relu = net.add_activation(tensor, type=trt.ActivationType.RELU)
            assert relu
            tensor = relu.get_output(0)

    # The output of the last fully connected layer is the network output.
    tensor.name = OUTPUT_BLOB_NAME
    net.mark_output(tensor)

    # Build Engine
    # NOTE(review): the workspace limit is set on the builder while the engine is
    # built from `config` -- confirm the limit is honoured by the TensorRT
    # version in use.
    builder.max_batch_size = max_batch_size
    builder.max_workspace_size = 1 << 20
    engine = builder.build_engine(net, config)

    del net
    del weights

    return engine


def API_to_model(max_batch_size):
    """Build the engine with a float32 input and write it to ``ENGINE_PATH``.

    Args:
        max_batch_size: forwarded unchanged to ``create_engine``.
    """
    trt_builder = trt.Builder(TRT_LOGGER)
    builder_config = trt_builder.create_builder_config()
    built_engine = create_engine(max_batch_size, trt_builder, builder_config, trt.float32)
    assert built_engine

    with open(ENGINE_PATH, "wb") as plan_file:
        plan_file.write(built_engine.serialize())

    # Drop the TensorRT objects explicitly, engine first.
    del built_engine, trt_builder, builder_config


class HostDeviceMem:
    """Pairs a host buffer with the device allocation it mirrors."""

    def __init__(self, host_mem, device_mem):
        # stored exactly as given
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        return f"Host:\n{self.host!s}\nDevice:\n{self.device!s}"

    def __repr__(self):
        # the debug representation is the same text as str()
        return str(self)


def allocate_buffers(engine):
    """Allocate a page-locked host buffer and a device buffer for every binding.

    Args:
        engine: engine whose bindings are iterated.

    Returns:
        ``(inputs, outputs, bindings, stream)``: two lists of ``HostDeviceMem``
        (input and output bindings), the device addresses as ints in binding
        order, and a newly created CUDA stream.
    """
    host_inputs, host_outputs, device_ptrs = [], [], []
    stream = cuda.Stream()

    for name in engine:
        n_elems = trt.volume(engine.get_binding_shape(name)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(name))

        # host side first, then a device allocation of the same byte size
        pinned = cuda.pagelocked_empty(n_elems, np_dtype)
        on_device = cuda.mem_alloc(pinned.nbytes)
        device_ptrs.append(int(on_device))

        # file the pair under inputs or outputs
        destination = host_inputs if engine.binding_is_input(name) else host_outputs
        destination.append(HostDeviceMem(pinned, on_device))

    return host_inputs, host_outputs, device_ptrs, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference pass and return the host-side outputs.

    Args:
        context: execution context providing ``execute_async``.
        bindings: device addresses passed to ``execute_async``.
        inputs: objects with ``host``/``device`` attributes copied host -> device.
        outputs: objects with ``host``/``device`` attributes copied device -> host.
        stream: CUDA stream all copies and the execution are enqueued on.
        batch_size: batch size forwarded to ``execute_async``.

    Returns:
        list: the ``host`` buffer of every entry in ``outputs``.
    """
    # Upload every input buffer.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Enqueue the inference itself.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Download every output buffer.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Block until all enqueued work on the stream has finished.
    stream.synchronize()
    host_results = []
    for out in outputs:
        host_results.append(out.host)
    return host_results


if __name__ == "__main__":
    # Command line: exactly one of -s (serialize) or -d (deserialize + infer).
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", action='store_true')
    parser.add_argument("-d", action='store_true')
    args = parser.parse_args()

    # XOR is false when both or neither flag is given.
    if not (args.s ^ args.d):
        print(
            "arguments not right!\n"
            "python alexnet.py -s   # serialize model to plan file\n"
            "python alexnet.py -d   # deserialize plan file and run inference"
        )
        sys.exit()

    if args.s:
        API_to_model(BATCH_SIZE)
    else:
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime

        with open(ENGINE_PATH, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        # Dummy all-ones input holding BATCH_SIZE flattened samples.
        data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32)
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        inputs[0].host = data

        # Run the whole batch: the input holds BATCH_SIZE samples and the tail of
        # the output printed below belongs to the last sample, so the default
        # batch_size=1 would leave it uncomputed whenever BATCH_SIZE > 1.
        trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream,
                                   batch_size=BATCH_SIZE)

        print(f'Output: \n{trt_outputs[0][:10]}\n{trt_outputs[0][-10:]}')


================================================
FILE: alexnet/gen_wts.py
================================================
import struct

import cv2
import numpy as np
import torch
from torchvision.models import alexnet


def read_imagenet_labels() -> dict[int, str]:
    """
    read ImageNet 1000 labels

    Each non-empty line of ``../assets/imagenet1000_clsidx_to_labels.txt`` is
    parsed as ``<class id>: <quoted label>``. A trailing ``,`` or ``}`` and a
    leading ``{`` before the id are tolerated, and the surrounding quotes
    (single or double) are removed. Only the first ``": "`` separates id from
    label, so labels may themselves contain ``": "``. If an id appears more than
    once, the first label is kept.

    Returns:
        dict[int, str]: labels dict

    Raises:
        ValueError: if a non-empty line has no ``": "`` separator or its id is
            not an integer.
    """
    clsid2label = {}
    with open("../assets/imagenet1000_clsidx_to_labels.txt", "r") as f:
        for raw_line in f.readlines():
            line = raw_line.strip()
            if not line:
                # blank lines carry no label
                continue
            key, value = line.split(": ", 1)
            # Remove the dict-literal punctuation after the closing quote instead
            # of slicing a fixed number of characters, so CRLF line endings and a
            # final line without ",\n" do not truncate the label.
            value = value.rstrip(",}").strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
                value = value[1:-1]
            clsid2label.setdefault(int(key.lstrip("{")), value)
    return clsid2label


def preprocess(img: np.array) -> torch.Tensor:
    """
    ImageNet-style preprocessing for a single image

    The image is converted from BGR to RGB, scaled to ``[0, 1]`` as float32,
    resized to 224x224 with bilinear interpolation, normalized per channel with
    the mean/std constants below, and rearranged from ``HWC`` to ``NCHW``.

    Args:
        img (np.array): input image, channel order expected to be BGR

    Returns:
        torch.Tensor: preprocessed image in `NCHW` layout with a batch axis of 1
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    scaled = rgb.astype(np.float32) / 255.0
    resized = cv2.resize(scaled, (224, 224), interpolation=cv2.INTER_LINEAR)

    channel_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    channel_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    normalized = (resized - channel_mean) / channel_std

    # HWC -> CHW, then prepend the batch axis.
    nchw = np.expand_dims(normalized.transpose(2, 0, 1), axis=0)
    return torch.from_numpy(nchw)


if __name__ == "__main__":
    # Sanity check: classify a sample image and print the top-3 predictions.
    image = cv2.imread("../assets/cats.jpg", cv2.IMREAD_COLOR)
    batch = preprocess(image)
    model = alexnet(pretrained=True)
    model.eval()
    output = model(batch)
    labels = read_imagenet_labels()
    top3_indices = torch.topk(output, k=3).indices
    for row in top3_indices:
        for rank, cls_id in enumerate(row, 1):
            print(f"top: {rank:<2}, confidence: {float(output[0, cls_id]):.4f}, label: {labels[int(cls_id)]}")

    # Dump every state-dict tensor as text: a first line with the tensor count,
    # then one line per tensor: "<name> <count> <hex of big-endian float32> ...".
    print("writing alexnet wts")
    state = model.state_dict()
    with open("../models/alexnet.wts", "w") as wts_file:
        wts_file.write("{}\n".format(len(state.keys())))
        for name, tensor in state.items():
            print(f"key: {name}\tvalue: {tensor.shape}")
            flat = tensor.reshape(-1).cpu().numpy()
            wts_file.write("{} {}".format(name, len(flat)))
            wts_file.write("".join(" " + struct.pack(">f", float(value)).hex() for value in flat))
            wts_file.write("\n")


================================================
FILE: alexnet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <cstdint>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include <utility>
#include "NvInferRuntime.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents, prefixed with a timestamp and a
//!  severity prefix, to a target output stream whenever it is synchronized or destroyed.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted log lines
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    //! Move constructor: takes over the target stream, prefix and logging flag.
    //! Text already buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the target stream,
    //! empties the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // Format the timestamp in a local stream so that it ends up on the same
            // stream as the message (not unconditionally on std::cout) and so that the
            // width/fill manipulators do not leak into any shared stream's state.
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            std::ostringstream stamp;
            if (tm_local != nullptr) {
                stamp << "[";
                stamp << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                stamp << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the timestamp, the prefix and the buffer contents into the stream
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables emission for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // destination stream
    std::string mPrefix;    // severity prefix written before each message
    bool mShouldLog;        // whether putOutput() emits anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
   public:
    // Forwards all three arguments to the buffer member; nothing else is set up here.
    LogStreamConsumerBase(std::ostream& stream, std::string prefix, bool shouldLog)
        : mBuffer{stream, std::move(prefix), shouldLog} {}

   protected:
    // Buffer that derived classes hand to std::ostream.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!
    //! \param reportableSeverity threshold; the message is emitted only when severity <= reportableSeverity
    //! \param severity severity of the messages written through this object; it also selects the
    //!  target stream (severityOstream) and the prefix (severityPrefix)
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Move constructor. A fresh buffer is built from the other object's severity and
    //!  logging flag; text already buffered in \p other is not copied over.
    LogStreamConsumer(LogStreamConsumer&& other) noexcept
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer should emit output against a new threshold
    //!  and propagates the result to the underlying buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! \brief Selects std::cout for severities >= kINFO and std::cerr for everything else.
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter prefix ("[F] ", "[E] ", "[W] ", "[I] ", "[V] ").
    //!  An unknown severity trips the assertion and yields an empty prefix when assertions are disabled.
    static std::string severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    bool mShouldLog;     // cached result of (mSeverity <= reportable severity)
    Severity mSeverity;  // severity attached to every message written through this object
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   private:
    // Forward declaration; the definition is in the private section further down.
    struct TestInfo;

   public:
    //!
    //! \brief Creates a logger with the given reportable severity (kWARNING when omitted)
    //!
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult : std::uint8_t {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is written through a temporary LogStreamConsumer built from the current reportable
    //! severity and the message severity, prefixed with "[TRT] " and terminated by a newline.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n';
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // Only Logger can create a TestAtom; name and command line are moved out of `info`.
        TestAtom(bool started, TestInfo info)
            : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {}

        bool mStarted;         // set to true by Logger::reportTestStart()
        std::string mName;     // test name printed in result lines
        std::string mCmdline;  // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom{false, TestInfo{name, cmdline}};
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // The RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return the value returned by reportPass() or reportFail()
    //!
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently stored in this logger
    //!
    [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    // Name and command line of a test; consumed by the TestAtom constructor.
    struct TestInfo {
        std::string name;
        std::string cmdline;
    };
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints one line of the form "&&&& <RESULT> <name> # <cmdline>" to the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << '\n';
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces, in order.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    // Threshold passed to every LogStreamConsumer created by log().
    Severity mReportableSeverity;
};

namespace {

//!
//! \brief builds a LogStreamConsumer for messages of severity kVERBOSE, using the
//!        logger's current reportable severity as the threshold
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kINFO, using the
//!        logger's current reportable severity as the threshold
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kWARNING, using the
//!        logger's current reportable severity as the threshold
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kERROR, using the
//!        logger's current reportable severity as the threshold
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity), using the logger's current reportable severity as the threshold
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: alexnet/macros.h
================================================
#pragma once

#include <NvInfer.h>

// API: symbol visibility decoration.
// With API_EXPORTS defined it marks symbols as exported (dllexport on MSVC, default
// visibility elsewhere); without it, it expands to dllimport on MSVC and to nothing
// on other compilers.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_VERSION folds the TensorRT major/minor/patch/build numbers into one integer
// (major*1000 + minor*100 + patch*10 + build) so versions can be compared in #if.
// NOTE(review): the weights are not wide enough to keep components separate if a
// minor/patch/build number reaches 10 or more -- confirm against the versions in use.
#define TRT_VERSION \
    ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD)

// From TRT_VERSION 8000 on, TRT_NOEXCEPT expands to `noexcept` and TRT_CONST_ENQUEUE
// to `const`; for older versions both expand to nothing.
#if TRT_VERSION >= 8000
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: alexnet/utils.h
================================================
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <numeric>
#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "macros.h"

using namespace nvinfer1;

constexpr const std::size_t WORKSPACE_SIZE = 16 << 20;

// CHECK(status): evaluates `status` exactly once; if the result differs from
// cudaSuccess, prints "Cuda failure: <code>" to std::cerr and aborts the process.
// The do { ... } while (0) wrapper makes the macro behave as a single statement.
#define CHECK(status)                                     \
    do {                                                  \
        auto ret = (status);                              \
        if (ret != cudaSuccess) {                         \
            std::cerr << "Cuda failure: " << ret << "\n"; \
            std::abort();                                 \
        }                                                 \
    } while (0)

static void checkTrtEnv(int device = 0) {
#if TRT_VERSION < 8000
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    CHECK(cudaGetDeviceProperties(&prop, device));
    const int sm = prop.major * 10 + prop.minor;
    if (sm > 86) {
        std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU.";
        std::abort();
    }
#endif
}

/**
 * @brief TensorRT weight files have a simple space delimited format:
 * [type] [size] <data x size in hex>
 * 
 * @param file input weight file path
 * @return std::map<std::string, nvinfer1::Weights> 
 */
static auto loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << "\n";
    std::map<std::string, nvinfer1::Weights> weightMap;

    // Open weights file. Checked at runtime rather than with assert() so that a
    // missing file is still detected when NDEBUG disables assertions.
    std::ifstream input(file);
    if (!input.is_open()) {
        std::cerr << "Unable to load weight file: " << file << "\n";
        std::abort();
    }

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    if (!input || count <= 0) {
        std::cerr << "Invalid weight map file: " << file << "\n";
        std::abort();
    }

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> wt.count;
        if (!input || wt.count < 0) {
            std::cerr << "Corrupt weight entry header in: " << file << "\n";
            std::abort();
        }

        // Load blob: each value is the hex text of a 32-bit word.
        // The array is intentionally not freed here; its pointer is stored in the
        // returned Weights entry.
        auto* val = new uint32_t[wt.count];
        input >> std::hex;
        for (auto x = 0ll; x < wt.count; ++x) {
            input >> val[x];
        }
        if (!input) {
            std::cerr << "Truncated or malformed weight data for '" << name << "' in: " << file << "\n";
            std::abort();
        }
        wt.values = val;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * @brief a preprocess function aligning with ImageNet preprocess in torchvision, only support 3-channel image
 * 
 * @param img opencv image with BGR layout
 * @param bgr2rgb whether to convert BGR to RGB
 * @param mean subtract mean
 * @param std divide std
 * @param n batch size
 * @param h resize height
 * @param w resize width
 * @return std::vector<float> contiguous flatten image data in float32 type
 */
static std::vector<float> preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array<const float, 3>& mean,
                                         const std::array<const float, 3>& std, int n, int h, int w) {
    // Note: `img` is modified in place (color conversion, resize, float conversion, normalization).
    const int channels = img.channels();
    const int sampleSize = channels * h * w;
    if (channels != 3) {
        std::cerr << "this demo only supports 3 channel input image.\n";
        std::abort();
    }

    if (bgr2rgb) {
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    }
    cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
    img.convertTo(img, CV_32FC3, 1.f / 255);
    img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]);

    std::vector<float> chw(static_cast<std::size_t>(n) * channels * h * w, 0.f);

    // Every batch slot receives the same image, written plane by plane (channel-major).
    const int planeSize = h * w;
    for (int batch = 0; batch < n; ++batch) {
        float* sample = chw.data() + batch * sampleSize;
        for (int row = 0; row < h; ++row) {
            for (int col = 0; col < w; ++col) {
                const cv::Vec3f pixel = img.at<cv::Vec3f>(row, col);
                const int offset = row * w + col;
                for (int ch = 0; ch < 3; ++ch) {
                    sample[ch * planeSize + offset] = pixel[ch];
                }
            }
        }
    }
    return chw;
}

/**
 * @brief return the k largest values of v together with their indices, largest first
 *
 * @param v input scores
 * @param k number of entries requested; values larger than v.size() are clamped
 * @return std::vector<std::pair<int, float>> (index, value) pairs in descending value
 * order; empty when k <= 0 or v is empty
 */
static auto topk(const std::vector<float>& v, int k) -> std::vector<std::pair<int, float>> {
    if (k <= 0 || v.empty())
        return {};
    // Never ask for more entries than exist.
    const auto stride = std::min<std::ptrdiff_t>(k, static_cast<std::ptrdiff_t>(v.size()));

    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);

    // The middle iterator must stay inside [begin, end]; using the clamped count
    // instead of the raw k avoids undefined behaviour when k > v.size().
    std::partial_sort(idx.begin(), idx.begin() + stride, idx.end(), [&](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(static_cast<std::size_t>(stride));
    for (std::ptrdiff_t i = 0; i < stride; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}

/**
 * @brief load a class-index -> label map from a text file with one
 * `<index>: '<label>'` (or `<index>: "<label>"`) entry per line
 *
 * Lines without a colon, without a quoted label, or whose index cannot be parsed
 * as an integer are skipped.
 *
 * @param path label file path
 * @return std::map<int, std::string> parsed labels; empty if the file cannot be opened
 */
static std::map<int, std::string> loadImagenetLabelMap(const std::string& path) {
    std::map<int, std::string> labels;
    std::ifstream in(path);
    if (!in.is_open()) {
        return labels;
    }
    std::string line;
    while (std::getline(in, line)) {
        const auto colon = line.find(':');
        if (colon == std::string::npos) {
            continue;
        }
        // Accept either quote style: labels that contain an apostrophe are written
        // with double quotes, and searching only for ' would drop or truncate them.
        const auto open_quote = line.find_first_of("'\"", colon);
        if (open_quote == std::string::npos) {
            continue;
        }
        // The closing quote is the last occurrence of the same quote character.
        const auto close_quote = line.rfind(line[open_quote]);
        if (close_quote == std::string::npos || close_quote <= open_quote) {
            continue;
        }
        int idx = 0;
        try {
            idx = std::stoi(line.substr(0, colon));
        } catch (...) {
            // std::stoi throws on a non-numeric or out-of-range index; treat the
            // line as malformed and skip it like the other malformed cases.
            continue;
        }
        labels[idx] = line.substr(open_quote + 1, close_quote - open_quote - 1);
    }
    return labels;
}

/**
 * @brief append input-transform layers to the network: optional cast to FP32,
 * NHWC -> NCHW transpose, optional channel reversal, and per-channel normalization
 *
 * The normalization computes `x * 1/(std*255) + (-mean/std)` per channel, i.e.
 * `(x/255 - mean) / std`.
 *
 * @param network network the layers are added to
 * @param input input tensor; transposed with permutation {0, 3, 1, 2}
 * @param bgr2rgb when true, the three channels are re-concatenated in reverse order
 * @param mean per-channel mean
 * @param std per-channel standard deviation
 * @return ILayer* the final scale layer
 */
static ILayer* addTransformLayer(INetworkDefinition* network, ITensor& input, bool bgr2rgb,
                                 const std::array<const float, 3>& mean, const std::array<const float, 3>& std) {
    struct ScaleParams {
        std::array<float, 3> shift;
        std::array<float, 3> scale;
    };
    // The Weights structs below only hold raw pointers, so the backing arrays are
    // parked in a function-local static container to outlive this call.
    static std::vector<std::unique_ptr<ScaleParams>> gScaleParams;
    auto params = std::make_unique<ScaleParams>();
    params->shift = {-mean[0] / std[0], -mean[1] / std[1], -mean[2] / std[2]};
    params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)};

    static const Weights empty{DataType::kFLOAT, nullptr, 0ll};
    const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll};
    const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll};

    // Moving the unique_ptr does not move the pointed-to arrays, so the pointers
    // captured in `shift` and `scale` stay valid.
    gScaleParams.emplace_back(std::move(params));

    ITensor* in = &input;
    if (input.getType() != DataType::kFLOAT) {
#if TRT_VERSION >= 8000
        auto* cast = network->addCast(input, DataType::kFLOAT);
        assert(cast);
        cast->setName("Cast to FP32");
        in = cast->getOutput(0);
#else
        auto* identity = network->addIdentity(input);
        assert(identity);
        identity->setName("Convert to FP32");
        identity->setOutputType(0, DataType::kFLOAT);
        in = identity->getOutput(0);
#endif
    }
    // Convert from NHWC to NCHW
    auto* perm = network->addShuffle(*in);
    assert(perm);
    perm->setName("NHWC -> NCHW");
    perm->setFirstTranspose(Permutation{0, 3, 1, 2});

    // Convert from BGR to RGB (optional)
    ITensor* data{nullptr};
    if (bgr2rgb) {
        // Extracts channel `c` of the transposed tensor as a one-channel tensor.
        auto add_slice = [&](int c, const char* name) -> ITensor* {
            auto dims = perm->getOutput(0)->getDimensions();
            Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1};
            Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]};
            auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride);
            // Validate the layer before it is dereferenced.
            assert(_slice && _slice->getNbOutputs() == 1);
            _slice->setName(name);
            return _slice->getOutput(0);
        };
        std::array<ITensor*, 3> channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")};
        auto* cat = network->addConcatenation(channels.data(), 3);
        assert(cat);
        cat->setName("RGB");
        cat->setAxis(1);
        data = cat->getOutput(0);
    } else {
        data = perm->getOutput(0);
    }

    // Normalize
    auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty);
    assert(trans);
    trans->setName("mean & std");
#if TRT_VERSION >= 8000
    trans->setChannelAxis(1);
#endif
    return trans;
}

/**
 * @brief size in bytes of one element of the given TensorRT data type
 *
 * @param dt data type
 * @return size_t element size; the process is aborted for any type not listed below
 */
static size_t getSize(DataType dt) {
    size_t elementBytes = 0;
    switch (dt) {
        case DataType::kFLOAT:
            elementBytes = sizeof(float);
            break;
        case DataType::kHALF:
            elementBytes = sizeof(int16_t);
            break;
        case DataType::kINT32:
            elementBytes = sizeof(int32_t);
            break;
#if TRT_VERSION >= 8510
        case DataType::kUINT8:
#endif
        case DataType::kINT8:
            elementBytes = sizeof(int8_t);
            break;
        default: {
            std::cerr << "Unsupported data type\n";
            std::abort();
        }
    }
    return elementBytes;
}


================================================
FILE: arcface/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.6)

project(arcface)

add_definitions(-std=c++11)

# option() expects (<variable> "<help text>" [value]); without a help string the
# word OFF was taken as the description. The default value is still OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# CUDA header and library locations differ between aarch64 and other platforms.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message("embed_platform on")
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
    message("embed_platform off")
    include_directories(/usr/local/cuda/include)
    link_directories(/usr/local/cuda/lib64)
endif()


set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Shared library built from the PReLU CUDA source; linked into every executable below.
cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/prelu.cu)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# One executable per network variant, all linked against the same libraries.
add_executable(arcface-r50 ${PROJECT_SOURCE_DIR}/arcface-r50.cpp)
target_link_libraries(arcface-r50 nvinfer)
target_link_libraries(arcface-r50 cudart)
target_link_libraries(arcface-r50 myplugins)
target_link_libraries(arcface-r50 ${OpenCV_LIBS})

add_executable(arcface-mobilefacenet ${PROJECT_SOURCE_DIR}/arcface-mobilefacenet.cpp)
target_link_libraries(arcface-mobilefacenet nvinfer)
target_link_libraries(arcface-mobilefacenet cudart)
target_link_libraries(arcface-mobilefacenet myplugins)
target_link_libraries(arcface-mobilefacenet ${OpenCV_LIBS})

add_executable(arcface-r100 ${PROJECT_SOURCE_DIR}/arcface-r100.cpp)
target_link_libraries(arcface-r100 nvinfer)
target_link_libraries(arcface-r100 cudart)
target_link_libraries(arcface-r100 myplugins)
target_link_libraries(arcface-r100 ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: arcface/README.md
================================================
# arcface
### TensorRT 8

The mxnet implementation is from [deepinsight/insightface.](https://github.com/deepinsight/insightface)

**Updated Pretrained Weights:** ArcFace-R100 [Insight Face Google Drive](https://drive.google.com/file/d/1Hc5zUfBATaXUgcU2haUNa7dcaZSw95h2/view)

---

**Previous Pre-trained models:** The pretrained models are from [LResNet50E-IR,ArcFace@ms1m-refine-v1](https://github.com/deepinsight/insightface/wiki/Model-Zoo#32-lresnet50e-irarcfacems1m-refine-v1), [LResNet100E-IR,ArcFace@ms1m-refine-v2](https://github.com/deepinsight/insightface/wiki/Model-Zoo#31-lresnet100e-irarcfacems1m-refine-v2) and [MobileFaceNet,ArcFace@ms1m-refine-v1](https://github.com/deepinsight/insightface/wiki/Model-Zoo#34-mobilefacenetarcfacems1m-refine-v1)

---

The two input images used in this project are joey0.ppm and joey1.ppm; download them from [Google Drive](https://drive.google.com/drive/folders/1ctqpkRCRKyBZRCNwo9Uq4eUoMRLtFq1e). Each input image is a 112x112 cropped and aligned face image, generated by `get_input()` in `insightface/deploy/face_model.py`.

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/83122953-f45f8d80-a106-11ea-84b0-4f6ff91b5924.jpg">
</p>

## Config

- FP16/FP32 can be selected by the macro `USE_FP16` in arcface-r50/r100/mobilefacenet.cpp
- GPU id can be selected by the macro `DEVICE` in arcface-r50/r100/mobilefacenet.cpp

## Run

1. Generate the .wts file from the mxnet implementation of the pretrained model. The following example describes how to generate arcface-r100.wts from the mxnet implementation of LResNet100E-IR,ArcFace@ms1m-refine-v1.
```
git clone https://github.com/deepinsight/insightface
cd insightface
git checkout 3866cd77a6896c934b51ed39e9651b791d78bb57
cd deploy
// copy tensorrtx/arcface/gen_wts.py to here(insightface/deploy)
// download model-r100-ii.zip and unzip here(insightface/deploy)
python gen_wts.py
// a file 'arcface-r100.wts' will be generated.
// the master branch of insightface should work, if not, you can checkout 94ad870abb3203d6f31b049b70dd080dc8f33fca
// arcface-r50.wts/arcface-mobilefacenet.wts can be generated in a similar way from the mxnet implementations of the LResNet50E-IR,ArcFace@ms1m-refine-v1/MobileFaceNet,ArcFace@ms1m-refine-v1 pretrained models.

```
2.Put .wts file into tensorrtx/arcface, build and run

```
cd tensorrtx/arcface
// download joey0.ppm and joey1.ppm, and put here(tensorrtx/arcface)
mkdir build
cd build
cmake ..
make
sudo ./arcface-r100 -s    // serialize model to plan file i.e. 'arcface-r100.engine'
sudo ./arcface-r100 -d    // deserialize plan file and run inference

or

sudo ./arcface-r50 -s   // serialize model to plan file i.e. 'arcface-r50.engine'
sudo ./arcface-r50 -d   // deserialize plan file and run inference


or

sudo ./arcface-mobilefacenet -s   // serialize model to plan file i.e. 'arcface-mobilefacenet.engine'
sudo ./arcface-mobilefacenet -d   // deserialize plan file and run inference
```

3.Check the output log, latency and similarity score.

## More Information

See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: arcface/arcface-mobilefacenet.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"

// Evaluates `status` exactly once; if the result is non-zero, prints it to
// stderr prefixed with "Cuda failure: " and aborts the process.
// The do { ... } while (0) wrapper makes the macro usable as a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// Compile-time configuration.
// When USE_FP16 is defined, createEngine() sets BuilderFlag::kFP16 on the builder config;
// leave the line below commented out to build in FP32.
//#define USE_FP16  // comment out this if want to use FP32
#define DEVICE 0  // GPU id, passed to cudaSetDevice() in main()
#define BATCH_SIZE 1  // currently, only support BATCH=1

using namespace nvinfer1;

// stuff we know about the network and the input/output blobs
// INPUT_H x INPUT_W is the spatial size of the 3-channel input tensor.
static const int INPUT_H = 112;
static const int INPUT_W = 112;
// Number of floats in the output blob per batch item.
static const int OUTPUT_SIZE = 128;
// Tensor names used when building the network and when looking up binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// Logger instance handed to createInferBuilder() / createInferRuntime().
static Logger gLogger;

// Loads a ".wts" weight file into a name -> Weights map.
//
// File layout as parsed below (whitespace delimited):
//   <number of blobs>
//   <name> <element count, decimal> <element 0, hex> <element 1, hex> ...
// Every element is read as a raw 32-bit word and the blob is tagged DataType::kFLOAT.
//
// Each blob's value buffer is allocated with malloc(); ownership passes to the
// caller, who must free() the buffers once they are no longer referenced.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(input && "Truncated or malformed blob header in weight file.");

        // Allocate by element size. The previous sizeof(val) measured the pointer,
        // not the 32-bit element, and over-allocated every blob.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x) {
            input >> std::hex >> val[x];
        }
        // A short or corrupt file would otherwise leave uninitialized weights behind.
        assert(input && "Truncated or malformed blob data in weight file.");

        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Expresses a batch-norm layer as a per-channel IScaleLayer.
//
// Reads "<lname>_gamma", "<lname>_beta", "<lname>_moving_mean" and
// "<lname>_moving_var" from weightMap and derives, per channel:
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The derived buffers are malloc'd and stored back into weightMap under
// "<lname>.scale", "<lname>.shift" and "<lname>.power", so whoever releases the
// buffers held by weightMap releases these as well.
// Returns the created scale layer (asserted non-null).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* bnGamma = static_cast<const float*>(weightMap[lname + "_gamma"].values);
    const float* bnBeta = static_cast<const float*>(weightMap[lname + "_beta"].values);
    const float* bnMean = static_cast<const float*>(weightMap[lname + "_moving_mean"].values);
    const float* bnVar = static_cast<const float*>(weightMap[lname + "_moving_var"].values);
    int channels = weightMap[lname + "_moving_var"].count;

    float* scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float* shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float* powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));

    // One pass fills all three per-channel coefficient arrays.
    for (int c = 0; c < channels; ++c) {
        scaleVals[c] = bnGamma[c] / sqrt(bnVar[c] + eps);
        shiftVals[c] = bnBeta[c] - bnMean[c] * bnGamma[c] / sqrt(bnVar[c] + eps);
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, channels};
    Weights shift{DataType::kFLOAT, shiftVals, channels};
    Weights power{DataType::kFLOAT, powerVals, channels};

    // Keep the buffers reachable through the map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Builds a PReLU activation out of ReLU, scale and element-wise layers:
//   out = relu(x) + (-gamma) * relu(-x)
// where gamma is the per-channel slope read from "<lname>_gamma".
//
// The four helper coefficient buffers are malloc'd here and registered in
// weightMap under "<lname>.prelu_*" keys, mirroring addBatchNorm2d, so that the
// code releasing weightMap's buffers also releases them (previously they leaked).
// Returns the final element-wise sum layer (asserted non-null).
ILayer* addPRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    float *gamma = (float*)weightMap[lname + "_gamma"].values;
    int len = weightMap[lname + "_gamma"].count;

    float *scval_1 = reinterpret_cast<float*>(malloc(sizeof(float) * len));  // -1: negates the input
    float *scval_2 = reinterpret_cast<float*>(malloc(sizeof(float) * len));  // -gamma: slope for the negative part
    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));    // zero shift
    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));     // unit power
    for (int i = 0; i < len; i++) {
        scval_1[i] = -1.0;
        scval_2[i] = -gamma[i];
        shval[i] = 0.0;
        pval[i] = 1.0;
    }
    Weights scale_1{DataType::kFLOAT, scval_1, len};
    Weights scale_2{DataType::kFLOAT, scval_2, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Hand ownership to the map so the buffers are freed together with the other weights.
    weightMap[lname + ".prelu_neg_one"] = scale_1;
    weightMap[lname + ".prelu_neg_gamma"] = scale_2;
    weightMap[lname + ".prelu_shift"] = shift;
    weightMap[lname + ".prelu_power"] = power;

    // Positive part: relu(x)
    auto relu1 = network->addActivation(input, ActivationType::kRELU);
    assert(relu1);
    // Negative part: -gamma * relu(-x)
    IScaleLayer* scale1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale_1, power);
    assert(scale1);
    auto relu2 = network->addActivation(*scale1->getOutput(0), ActivationType::kRELU);
    assert(relu2);
    IScaleLayer* scale2 = network->addScale(*relu2->getOutput(0), ScaleMode::kCHANNEL, shift, scale_2, power);
    assert(scale2);
    IElementWiseLayer* ew1 = network->addElementWise(*relu1->getOutput(0), *scale2->getOutput(0), ElementWiseOperation::kSUM);
    assert(ew1);
    return ew1;
}

// Convolution (no bias) -> batch norm (eps 1e-3) -> PReLU.
//   oup:    number of output feature maps
//   k/p/s:  square kernel size, padding and stride
//   groups: convolution group count
// Weight keys are derived from lname: "<lname>_conv2d_weight",
// "<lname>_batchnorm*" and "<lname>_relu*". Returns the PReLU output layer.
ILayer* conv_bn_relu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int oup, int k = 3, int p = 1, int s = 2, int groups=1) {
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv = network->addConvolutionNd(input, oup, DimsHW{k, k}, weightMap[lname + "_conv2d_weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});
    conv->setNbGroups(groups);

    IScaleLayer* norm = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + "_batchnorm", 1e-3);
    assert(norm);

    ILayer* activation = addPRelu(network, weightMap, *norm->getOutput(0), lname + "_relu");
    assert(activation);
    return activation;
}

// Convolution (no bias) -> batch norm (eps 1e-3), without an activation.
//   oup:    number of output feature maps
//   k/p/s:  square kernel size, padding and stride
//   groups: convolution group count
// Weight keys are "<lname>_conv2d_weight" and "<lname>_batchnorm*".
// Returns the batch-norm (scale) layer.
ILayer* conv_bn(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int oup, int k = 3, int p = 1, int s = 1, int groups=1) {
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv = network->addConvolutionNd(input, oup, DimsHW{k, k}, weightMap[lname + "_conv2d_weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});
    conv->setNbGroups(groups);

    IScaleLayer* norm = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + "_batchnorm", 1e-3);
    assert(norm);
    return norm;
}

// Three-stage block built from the "<lname>_conv_sep", "<lname>_conv_dw" and
// "<lname>_conv_proj" weights:
//   1. 1x1 conv to `groups` channels            -> BN -> PReLU
//   2. 3x3 conv, stride s, `groups` groups      -> BN -> PReLU
//   3. 1x1 conv to `oup` channels               -> BN (no activation)
// All batch norms use eps 1e-3. `inp` is accepted for signature symmetry but is
// not read. Returns the final batch-norm layer.
ILayer* DepthWise(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int inp, int oup, int groups, int s) {
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    // Stage 1: pointwise expansion.
    IConvolutionLayer* sepConv = network->addConvolutionNd(input, groups, DimsHW{1, 1}, weightMap[lname + "_conv_sep_conv2d_weight"], noBias);
    assert(sepConv);
    sepConv->setStrideNd(DimsHW{1, 1});
    sepConv->setPaddingNd(DimsHW{0, 0});
    sepConv->setNbGroups(1);
    IScaleLayer* sepNorm = addBatchNorm2d(network, weightMap, *sepConv->getOutput(0), lname + "_conv_sep_batchnorm", 1e-3);
    assert(sepNorm);
    ILayer* sepAct = addPRelu(network, weightMap, *sepNorm->getOutput(0), lname + "_conv_sep_relu");
    assert(sepAct);

    // Stage 2: grouped 3x3 convolution carrying the block's stride.
    IConvolutionLayer* dwConv = network->addConvolutionNd(*sepAct->getOutput(0), groups, DimsHW{3, 3}, weightMap[lname + "_conv_dw_conv2d_weight"], noBias);
    assert(dwConv);
    dwConv->setStrideNd(DimsHW{s, s});
    dwConv->setPaddingNd(DimsHW{1, 1});
    dwConv->setNbGroups(groups);
    IScaleLayer* dwNorm = addBatchNorm2d(network, weightMap, *dwConv->getOutput(0), lname + "_conv_dw_batchnorm", 1e-3);
    assert(dwNorm);
    ILayer* dwAct = addPRelu(network, weightMap, *dwNorm->getOutput(0), lname + "_conv_dw_relu");
    assert(dwAct);

    // Stage 3: pointwise projection, linear output.
    IConvolutionLayer* projConv = network->addConvolutionNd(*dwAct->getOutput(0), oup, DimsHW{1, 1}, weightMap[lname + "_conv_proj_conv2d_weight"], noBias);
    assert(projConv);
    projConv->setStrideNd(DimsHW{1, 1});
    projConv->setPaddingNd(DimsHW{0, 0});
    projConv->setNbGroups(1);
    IScaleLayer* projNorm = addBatchNorm2d(network, weightMap, *projConv->getOutput(0), lname + "_conv_proj_batchnorm", 1e-3);
    assert(projNorm);
    return projNorm;
}


// Residual wrapper around DepthWise: returns input + DepthWise(input).
// All parameters are forwarded to DepthWise unchanged.
ILayer* DWResidual(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int inp, int oup, int groups, int s) {
    ILayer* branch = DepthWise(network, weightMap, input, lname, inp, oup, groups, s);
    IElementWiseLayer* sum = network->addElementWise(input, *branch->getOutput(0), ElementWiseOperation::kSUM);
    assert(sum);
    return sum;
}


// Creates the engine using only the network-definition API, without any parser.
//
// Builds the MobileFaceNet graph from "../arcface-mobilefacenet.wts", marks the
// final batch-norm output as OUTPUT_BLOB_NAME and builds the engine with a 16MB
// workspace (plus FP16 when USE_FP16 is defined).
//
// Returns the built engine, or nullptr if the build failed. The network and all
// host weight buffers held in the weight map are released before returning.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../arcface-mobilefacenet.wts");

    // Chains `count` stride-1 residual blocks named <prefix>0 .. <prefix>(count-1).
    auto addResidualStage = [&](ITensor* x, const std::string& prefix, int count, int channels, int groups) -> ITensor* {
        for (int i = 0; i < count; ++i) {
            x = DWResidual(network, weightMap, *x, prefix + std::to_string(i), channels, channels, groups, 1)->getOutput(0);
        }
        return x;
    };

    auto conv_1 = conv_bn_relu(network, weightMap, *data, "conv_1", 64, 3, 1, 2);
    auto conv_2_dw = conv_bn_relu(network, weightMap, *conv_1->getOutput(0), "conv_2_dw", 64, 3, 1, 1, 64);

    ITensor* x = conv_2_dw->getOutput(0);
    x = DepthWise(network, weightMap, *x, "dconv_23", 64, 64, 128, 2)->getOutput(0);
    x = addResidualStage(x, "res_3_block", 4, 64, 128);
    x = DepthWise(network, weightMap, *x, "dconv_34", 64, 128, 256, 2)->getOutput(0);
    x = addResidualStage(x, "res_4_block", 6, 128, 256);
    x = DepthWise(network, weightMap, *x, "dconv_45", 128, 128, 512, 2)->getOutput(0);
    x = addResidualStage(x, "res_5_block", 2, 128, 256);

    auto conv_6_sep = conv_bn_relu(network, weightMap, *x, "conv_6sep", 512, 1, 0, 1);
    auto conv_6dw7_7 = conv_bn(network, weightMap, *conv_6_sep->getOutput(0), "conv_6dw7_7", 512, 7, 0, 1, 512);
    IFullyConnectedLayer* fc1 = network->addFullyConnected(*conv_6dw7_7->getOutput(0), 128, weightMap["fc1_weight"], weightMap["pre_fc1_bias"]);
    assert(fc1);
    auto bn1 = addBatchNorm2d(network, weightMap, *fc1->getOutput(0), "fc1", 2e-5);
    assert(bn1);
    bn1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*bn1->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only report success when the builder actually produced an engine.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the engine through createEngine() and serializes it.
//   maxBatchSize: forwarded to createEngine()
//   modelStream:  out-parameter receiving the serialized engine; the caller
//                 releases it with destroy()
// The builder, builder config and engine created here are destroyed before returning.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down (the config was previously never released)
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference.
//   context:   execution context of an engine with exactly two bindings
//              (INPUT_BLOB_NAME and OUTPUT_BLOB_NAME)
//   input:     host buffer of batchSize * 3 * INPUT_H * INPUT_W floats
//   output:    host buffer of batchSize * OUTPUT_SIZE floats, filled on return
//   batchSize: number of items passed to enqueue()
// Device buffers and the CUDA stream are created and released on every call.
// Any CUDA error or a failed enqueue aborts the process.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Guard the buffers[] accesses below against an unknown tensor name.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    const size_t inputBytes = batchSize * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = batchSize * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // Without this check a failed launch would silently return stale output.
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Appends the name of every entry in directory `p_dir_name`, except "." and
// "..", to `file_names`. Only the bare entry name is stored; it is not joined
// with the directory path.
// Returns 0 on success, or -1 if the directory cannot be opened (in which case
// `file_names` is left untouched).
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const std::string entry_name(entry->d_name);
        // Skip the self and parent directory links.
        if (entry_name == "." || entry_name == "..") {
            continue;
        }
        file_names.push_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}

int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("arcface-mobilefacenet.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 2 && std::string(argv[1]) == "-d") {
        std::ifstream file("arcface-mobilefacenet.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./arcface-mobilefacenet -s  // serialize model to plan file" << std::endl;
        std::cerr << "./arcface-mobilefacenet -d  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    cv::Mat img = cv::imread("../joey0.ppm");
    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
    }

    // Run inference
    auto start = std::chrono::system_clock::now();
    doInference(*context, data, prob, BATCH_SIZE);
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    cv::Mat out(128, 1, CV_32FC1, prob);
    cv::Mat out_norm;
    cv::normalize(out, out_norm);

    img = cv::imread("../joey1.ppm");
    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
    }

    // Run inference
    start = std::chrono::system_clock::now();
    doInference(*context, data, prob, BATCH_SIZE);
    end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    cv::Mat out1(1, 128, CV_32FC1, prob);
    cv::Mat out_norm1;
    cv::normalize(out1, out_norm1);

    cv::Mat res = out_norm1 * out_norm;

    std::cout << "similarity score: " << *(float*)res.data << std::endl;

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    //Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    //{
    //    std::cout << p_out_norm[i] << ", ";
    //    if (i % 10 == 0) std::cout << i / 10 << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: arcface/arcface-r100.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"

// Evaluates `status` exactly once; if the result is non-zero, prints it to
// stderr prefixed with "Cuda failure: " and aborts the process.
// The do { ... } while (0) wrapper makes the macro usable as a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// Compile-time configuration.
// When USE_FP16 is defined, createEngine() sets BuilderFlag::kFP16 on the builder config;
// leave the line below commented out to build in FP32.
//#define USE_FP16  // comment out this if want to use FP32
#define DEVICE 0  // GPU id
#define BATCH_SIZE 1  // currently, only support BATCH=1

using namespace nvinfer1;

// stuff we know about the network and the input/output blobs
// INPUT_H x INPUT_W is the spatial size of the 3-channel input tensor.
static const int INPUT_H = 112;
static const int INPUT_W = 112;
// Number of floats in the output blob per batch item.
static const int OUTPUT_SIZE = 512;
// Tensor names used when building the network and when looking up binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// Logger instance handed to createInferBuilder().
static Logger gLogger;

// Loads a ".wts" weight file into a name -> Weights map.
//
// File layout as parsed below (whitespace delimited):
//   <number of blobs>
//   <name> <element count, decimal> <element 0, hex> <element 1, hex> ...
// Every element is read as a raw 32-bit word and the blob is tagged DataType::kFLOAT.
//
// Each blob's value buffer is allocated with malloc(); ownership passes to the
// caller, who must free() the buffers once they are no longer referenced.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(input && "Truncated or malformed blob header in weight file.");

        // Allocate by element size. The previous sizeof(val) measured the pointer,
        // not the 32-bit element, and over-allocated every blob.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x) {
            input >> std::hex >> val[x];
        }
        // A short or corrupt file would otherwise leave uninitialized weights behind.
        assert(input && "Truncated or malformed blob data in weight file.");

        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Expresses a batch-norm layer as a per-channel IScaleLayer.
//
// Reads "<lname>_gamma", "<lname>_beta", "<lname>_moving_mean" and
// "<lname>_moving_var" from weightMap and derives, per channel:
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The derived buffers are malloc'd and stored back into weightMap under
// "<lname>.scale", "<lname>.shift" and "<lname>.power", so whoever releases the
// buffers held by weightMap releases these as well.
// Returns the created scale layer (asserted non-null).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* bnGamma = static_cast<const float*>(weightMap[lname + "_gamma"].values);
    const float* bnBeta = static_cast<const float*>(weightMap[lname + "_beta"].values);
    const float* bnMean = static_cast<const float*>(weightMap[lname + "_moving_mean"].values);
    const float* bnVar = static_cast<const float*>(weightMap[lname + "_moving_var"].values);
    int channels = weightMap[lname + "_moving_var"].count;

    float* scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float* shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float* powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));

    // One pass fills all three per-channel coefficient arrays.
    for (int c = 0; c < channels; ++c) {
        scaleVals[c] = bnGamma[c] / sqrt(bnVar[c] + eps);
        shiftVals[c] = bnBeta[c] - bnMean[c] * bnGamma[c] / sqrt(bnVar[c] + eps);
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, channels};
    Weights shift{DataType::kFLOAT, shiftVals, channels};
    Weights power{DataType::kFLOAT, powerVals, channels};

    // Keep the buffers reachable through the map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Builds a PReLU activation out of ReLU, scale and element-wise layers:
//   out = relu(x) + (-gamma) * relu(-x)
// where gamma is the per-channel slope read from "<lname>_gamma".
//
// The four helper coefficient buffers are malloc'd here and registered in
// weightMap under "<lname>.prelu_*" keys, mirroring addBatchNorm2d, so that the
// code releasing weightMap's buffers also releases them (previously they leaked).
// Returns the final element-wise sum layer (asserted non-null).
ILayer* addPRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    float *gamma = (float*)weightMap[lname + "_gamma"].values;
    int len = weightMap[lname + "_gamma"].count;

    float *scval_1 = reinterpret_cast<float*>(malloc(sizeof(float) * len));  // -1: negates the input
    float *scval_2 = reinterpret_cast<float*>(malloc(sizeof(float) * len));  // -gamma: slope for the negative part
    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));    // zero shift
    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));     // unit power
    for (int i = 0; i < len; i++) {
        scval_1[i] = -1.0;
        scval_2[i] = -gamma[i];
        shval[i] = 0.0;
        pval[i] = 1.0;
    }
    Weights scale_1{DataType::kFLOAT, scval_1, len};
    Weights scale_2{DataType::kFLOAT, scval_2, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Hand ownership to the map so the buffers are freed together with the other weights.
    weightMap[lname + ".prelu_neg_one"] = scale_1;
    weightMap[lname + ".prelu_neg_gamma"] = scale_2;
    weightMap[lname + ".prelu_shift"] = shift;
    weightMap[lname + ".prelu_power"] = power;

    // Positive part: relu(x)
    auto relu1 = network->addActivation(input, ActivationType::kRELU);
    assert(relu1);
    // Negative part: -gamma * relu(-x)
    IScaleLayer* scale1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale_1, power);
    assert(scale1);
    auto relu2 = network->addActivation(*scale1->getOutput(0), ActivationType::kRELU);
    assert(relu2);
    IScaleLayer* scale2 = network->addScale(*relu2->getOutput(0), ScaleMode::kCHANNEL, shift, scale_2, power);
    assert(scale2);
    IElementWiseLayer* ew1 = network->addElementWise(*relu1->getOutput(0), *scale2->getOutput(0), ElementWiseOperation::kSUM);
    assert(ew1);
    return ew1;
}

// One residual unit:
//   main path: BN -> 3x3 conv (pad 1) -> BN -> PReLU -> 3x3 conv (stride s, pad 1) -> BN
//   shortcut:  the input itself when dim_match is true, otherwise
//              1x1 conv (stride s) -> BN
// The two paths are summed element-wise. All batch norms use eps 2e-5 and all
// convolutions are bias-free. Weight keys are prefixed with lname
// ("_bn1", "_conv1_weight", "_bn2", "_relu1", "_conv2_weight", "_bn3",
// "_conv1sc_weight", "_sc"). Returns the element-wise sum layer.
ILayer* resUnit(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int num_filters, int s, bool dim_match, std::string lname) {
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    // Main path.
    IScaleLayer* preNorm = addBatchNorm2d(network, weightMap, input, lname + "_bn1", 2e-5);
    IConvolutionLayer* firstConv = network->addConvolutionNd(*preNorm->getOutput(0), num_filters, DimsHW{3, 3}, weightMap[lname + "_conv1_weight"], noBias);
    assert(firstConv);
    firstConv->setPaddingNd(DimsHW{1, 1});
    IScaleLayer* midNorm = addBatchNorm2d(network, weightMap, *firstConv->getOutput(0), lname + "_bn2", 2e-5);
    ILayer* midAct = addPRelu(network, weightMap, *midNorm->getOutput(0), lname + "_relu1");
    IConvolutionLayer* secondConv = network->addConvolutionNd(*midAct->getOutput(0), num_filters, DimsHW{3, 3}, weightMap[lname + "_conv2_weight"], noBias);
    assert(secondConv);
    secondConv->setStrideNd(DimsHW{s, s});
    secondConv->setPaddingNd(DimsHW{1, 1});
    IScaleLayer* postNorm = addBatchNorm2d(network, weightMap, *secondConv->getOutput(0), lname + "_bn3", 2e-5);

    // Shortcut path: identity unless the shape changes.
    ITensor* shortcut = &input;
    if (!dim_match) {
        IConvolutionLayer* scConv = network->addConvolutionNd(input, num_filters, DimsHW{1, 1}, weightMap[lname + "_conv1sc_weight"], noBias);
        assert(scConv);
        scConv->setStrideNd(DimsHW{s, s});
        IScaleLayer* scNorm = addBatchNorm2d(network, weightMap, *scConv->getOutput(0), lname + "_sc", 2e-5);
        shortcut = scNorm->getOutput(0);
    }

    IElementWiseLayer* sum = network->addElementWise(*shortcut, *postNorm->getOutput(0), ElementWiseOperation::kSUM);
    assert(sum);
    return sum;
}

// Creates the engine using only the network-definition API, without any parser.
//
// Builds the ResNet-100 style graph from "../arcface-r100.wts": a stem
// (conv0 -> bn0 -> relu0), four stages of residual units named
// "stage<S>_unit<U>", then bn1 -> fully connected (512) -> "fc1" batch norm,
// whose output is marked as OUTPUT_BLOB_NAME. The engine is built with a 16MB
// workspace (plus FP16 when USE_FP16 is defined).
//
// Returns the built engine, or nullptr if the build failed. The network and all
// host weight buffers held in the weight map are released before returning.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../arcface-r100.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv0 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["conv0_weight"], emptywts);
    assert(conv0);
    conv0->setPaddingNd(DimsHW{1, 1});
    auto bn0 = addBatchNorm2d(network, weightMap, *conv0->getOutput(0), "bn0", 2e-5);
    auto relu0 = addPRelu(network, weightMap, *bn0->getOutput(0), "relu0");

    // Per-stage filter count and number of residual units.
    const int kNumStages = 4;
    const int stageFilters[kNumStages] = {64, 128, 256, 512};
    const int stageUnits[kNumStages] = {3, 13, 30, 3};

    ITensor* x = relu0->getOutput(0);
    for (int stage = 0; stage < kNumStages; ++stage) {
        for (int unit = 1; unit <= stageUnits[stage]; ++unit) {
            // The first unit of each stage downsamples (stride 2) and therefore
            // needs the projection shortcut; the remaining units keep the shape.
            const bool firstUnit = (unit == 1);
            const std::string lname = "stage" + std::to_string(stage + 1) + "_unit" + std::to_string(unit);
            x = resUnit(network, weightMap, *x, stageFilters[stage], firstUnit ? 2 : 1, !firstUnit, lname)->getOutput(0);
        }
    }

    auto bn1 = addBatchNorm2d(network, weightMap, *x, "bn1", 2e-5);
    IFullyConnectedLayer* fc1 = network->addFullyConnected(*bn1->getOutput(0), 512, weightMap["pre_fc1_weight"], weightMap["pre_fc1_bias"]);
    assert(fc1);
    auto bn2 = addBatchNorm2d(network, weightMap, *fc1->getOutput(0), "fc1", 2e-5);

    bn2->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*bn2->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only report success when the builder actually produced an engine.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the network through the TensorRT API and serializes the resulting engine.
//
// maxBatchSize: forwarded to createEngine().
// modelStream:  out-parameter; receives the serialized engine. The caller owns it
//               and must call destroy() on it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder and its configuration object
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The config was previously leaked; release it
    // before the builder that created it.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference pass.
//
// context:   execution context of a two-binding engine (one input, one output).
// input:     host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats.
// output:    host buffer receiving batchSize * OUTPUT_SIZE floats.
// batchSize: number of samples in `input`.
//
// Device buffers and the CUDA stream are created and released on every call.
// If TensorRT refuses to enqueue the work, an error is printed and `output`
// is left untouched.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // An unknown tensor name yields a negative index; never index buffers[] with it.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    // Compute the byte counts once, in size_t, so the product cannot overflow int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (context.enqueue(batchSize, buffers, stream, nullptr)) {
        CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    } else {
        std::cerr << "TensorRT enqueue failed; output buffer was not updated" << std::endl;
    }
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Appends the name of every entry in directory `p_dir_name` to `file_names`,
// skipping the "." and ".." entries. Names are stored exactly as the directory
// reports them; they are not prefixed with the directory path.
//
// Returns 0 on success, or -1 (leaving `file_names` untouched) when the
// directory cannot be opened.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent *entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.push_back(std::string(entry->d_name));
    }

    closedir(dir_handle);
    return 0;
}

int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(256, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("arcface-r100.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 2 && std::string(argv[1]) == "-d") {
        std::ifstream file("arcface-r100.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./arcface-r100 -s  // serialize model to plan file" << std::endl;
        std::cerr << "./arcface-r100 -d  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    cv::Mat img = cv::imread("../joey0.ppm");
    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
    }

    // Run inference
    auto start = std::chrono::system_clock::now();
    doInference(*context, data, prob, BATCH_SIZE);
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    cv::Mat out(512, 1, CV_32FC1, prob);
    cv::Mat out_norm;
    cv::normalize(out, out_norm);

    img = cv::imread("../joey1.ppm");
    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
    }

    // Run inference
    start = std::chrono::system_clock::now();
    doInference(*context, data, prob, BATCH_SIZE);
    end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    cv::Mat out1(1, 512, CV_32FC1, prob);
    cv::Mat out_norm1;
    cv::normalize(out1, out_norm1);

    cv::Mat res = out_norm1 * out_norm;

    std::cout << "similarity score: " << *(float*)res.data << std::endl;

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    //Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    //{
    //    std::cout << p_out_norm[i] << ", ";
    //    if (i % 10 == 0) std::cout << i / 10 << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}

================================================
FILE: arcface/arcface-r50.cpp
================================================
#include <chrono>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
#include <dirent.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"

// Evaluates `status` exactly once; if the result is non-zero, prints the
// numeric value to stderr and aborts the process.
// The do { ... } while (0) wrapper makes the macro behave as a single
// statement, so `CHECK(x);` is safe inside an unbraced if/else.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

//#define USE_FP16  // uncomment to set the FP16 builder flag; leave commented out for FP32
#define DEVICE 0  // GPU id passed to cudaSetDevice()
#define BATCH_SIZE 1  // currently, only BATCH_SIZE == 1 is supported

using namespace nvinfer1;

// Fixed properties of the network and of its input/output blobs.
// The input tensor is created as 3 x INPUT_H x INPUT_W.
static const int INPUT_H = 112;
static const int INPUT_W = 112;
// Number of floats produced per sample.
static const int OUTPUT_SIZE = 512;
// Tensor names used both when building the network and when looking up bindings.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// Logger instance handed to the TensorRT builder and runtime.
static Logger gLogger;

// Loads a weight file into a name -> Weights map.
//
// File layout, whitespace separated:
//   <number of blobs>
//   <name> <element count (decimal)> <element 0 (hex)> <element 1 (hex)> ...
// Every element is stored as the hexadecimal bit pattern of a 32-bit value and
// every blob is tagged DataType::kFLOAT.
//
// The value buffers are allocated with malloc(); the caller owns them and must
// free() each `values` pointer when the weights are no longer needed.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The element size is sizeof(uint32_t); the previous
        // sizeof(val) measured the pointer and over-allocated on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        // A failed extraction would otherwise leave zeroed weights unnoticed.
        assert(!input.fail() && "Truncated or malformed weight file.");
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds an inference-time batch normalization as a per-channel scale layer:
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The parameters are read from weightMap under lname + "_gamma", "_beta",
// "_moving_mean" and "_moving_var". The three derived buffers are malloc'ed
// and stored back into weightMap under lname + ".scale" / ".shift" / ".power",
// so whoever releases the map's buffers releases these as well.
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float *gamma = static_cast<const float*>(weightMap[lname + "_gamma"].values);
    const float *beta = static_cast<const float*>(weightMap[lname + "_beta"].values);
    const float *mean = static_cast<const float*>(weightMap[lname + "_moving_mean"].values);
    const float *var = static_cast<const float*>(weightMap[lname + "_moving_var"].values);
    int len = weightMap[lname + "_moving_var"].count;

    float *scaleVals = static_cast<float*>(malloc(sizeof(float) * len));
    float *shiftVals = static_cast<float*>(malloc(sizeof(float) * len));
    float *powerVals = static_cast<float*>(malloc(sizeof(float) * len));

    // One pass over the channels fills all three buffers.
    for (int c = 0; c < len; ++c) {
        const auto denom = sqrt(var[c] + eps);
        scaleVals[c] = gamma[c] / denom;
        shiftVals[c] = beta[c] - mean[c] * gamma[c] / denom;
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, len};
    Weights shift{DataType::kFLOAT, shiftVals, len};
    Weights power{DataType::kFLOAT, powerVals, len};

    // Keep the buffers reachable through the map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Adds a parametric ReLU built from standard layers:
//   out = relu(x) + (-gamma) * relu(-x)
// which equals x for x >= 0 and gamma * x for x < 0.
// The per-channel slope is read from weightMap[lname + "_gamma"].
//
// The helper buffers are malloc'ed and registered in weightMap under
// lname + ".prelu_neg_one" / ".prelu_neg_gamma" / ".prelu_shift" /
// ".prelu_power". Previously they were never stored anywhere and therefore
// never freed; registering them mirrors addBatchNorm2d, so code that frees
// every buffer in weightMap frees these too.
ILayer* addPRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    float *gamma = (float*)weightMap[lname + "_gamma"].values;
    int len = weightMap[lname + "_gamma"].count;

    float *negOneVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *negGammaVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        negOneVals[i] = -1.0;
        negGammaVals[i] = -gamma[i];
        shiftVals[i] = 0.0;
        powerVals[i] = 1.0;
    }
    Weights scale_1{DataType::kFLOAT, negOneVals, len};
    Weights scale_2{DataType::kFLOAT, negGammaVals, len};
    Weights shift{DataType::kFLOAT, shiftVals, len};
    Weights power{DataType::kFLOAT, powerVals, len};

    // Hand ownership to the map so the buffers are released with the other weights.
    weightMap[lname + ".prelu_neg_one"] = scale_1;
    weightMap[lname + ".prelu_neg_gamma"] = scale_2;
    weightMap[lname + ".prelu_shift"] = shift;
    weightMap[lname + ".prelu_power"] = power;

    // Positive branch: relu(x)
    auto relu1 = network->addActivation(input, ActivationType::kRELU);
    assert(relu1);
    // Negative branch: -gamma * relu(-x)
    IScaleLayer* scale1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale_1, power);
    assert(scale1);
    auto relu2 = network->addActivation(*scale1->getOutput(0), ActivationType::kRELU);
    assert(relu2);
    IScaleLayer* scale2 = network->addScale(*relu2->getOutput(0), ScaleMode::kCHANNEL, shift, scale_2, power);
    assert(scale2);
    // Sum of both branches
    IElementWiseLayer* ew1 = network->addElementWise(*relu1->getOutput(0), *scale2->getOutput(0), ElementWiseOperation::kSUM);
    assert(ew1);
    return ew1;
}

// Adds one residual unit and returns its final element-wise sum layer.
//
// Main path: BN -> 3x3 conv (pad 1) -> BN -> PReLU -> 3x3 conv (stride s, pad 1) -> BN.
// Shortcut:  `input` itself when dim_match is true; otherwise a 1x1 conv with
//            stride s followed by BN.
// All weights are looked up in weightMap under names prefixed with lname.
ILayer* resUnit(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int num_filters, int s, bool dim_match, std::string lname) {
    const float kEps = 2e-5;
    Weights noBias{DataType::kFLOAT, nullptr, 0};
    const DimsHW kernel3x3{3, 3};
    const DimsHW unitPad{1, 1};
    const DimsHW unitStride{s, s};

    // Main path
    IScaleLayer* normIn = addBatchNorm2d(network, weightMap, input, lname + "_bn1", kEps);
    IConvolutionLayer* convA = network->addConvolutionNd(*normIn->getOutput(0), num_filters, kernel3x3, weightMap[lname + "_conv1_weight"], noBias);
    assert(convA);
    convA->setPaddingNd(unitPad);
    IScaleLayer* normMid = addBatchNorm2d(network, weightMap, *convA->getOutput(0), lname + "_bn2", kEps);
    ILayer* activation = addPRelu(network, weightMap, *normMid->getOutput(0), lname + "_relu1");
    IConvolutionLayer* convB = network->addConvolutionNd(*activation->getOutput(0), num_filters, kernel3x3, weightMap[lname + "_conv2_weight"], noBias);
    assert(convB);
    convB->setStrideNd(unitStride);
    convB->setPaddingNd(unitPad);
    IScaleLayer* normOut = addBatchNorm2d(network, weightMap, *convB->getOutput(0), lname + "_bn3", kEps);

    // Shortcut path: identity unless the shape has to be projected
    ITensor* shortcut = &input;
    if (!dim_match) {
        IConvolutionLayer* projection = network->addConvolutionNd(input, num_filters, DimsHW{1, 1}, weightMap[lname + "_conv1sc_weight"], noBias);
        assert(projection);
        projection->setStrideNd(unitStride);
        IScaleLayer* projectionNorm = addBatchNorm2d(network, weightMap, *projection->getOutput(0), lname + "_sc", kEps);
        shortcut = projectionNorm->getOutput(0);
    }

    IElementWiseLayer* sum = network->addElementWise(*shortcut, *normOut->getOutput(0), ElementWiseOperation::kSUM);
    assert(sum);
    return sum;
}

// Create the engine using only the API and not any parser.
//
// maxBatchSize: forwarded to builder->setMaxBatchSize().
// builder:      builder used to create the network and build the engine.
// config:       builder configuration; workspace size and (optionally) the
//               FP16 flag are set on it here.
// dt:           data type of the input tensor.
//
// Weights are loaded from "../arcface-r50.wts". Returns the built engine, or
// nullptr when the build fails (an error is printed in that case). The network
// definition and all host weight buffers are released before returning.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../arcface-r50.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv0 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["conv0_weight"], emptywts);
    assert(conv0);
    conv0->setPaddingNd(DimsHW{1, 1});
    auto bn0 = addBatchNorm2d(network, weightMap, *conv0->getOutput(0), "bn0", 2e-5);
    auto relu0 = addPRelu(network, weightMap, *bn0->getOutput(0), "relu0");

    // Residual stages as {number of filters, number of units}. The first unit
    // of every stage uses stride 2 with a projected shortcut; the remaining
    // units use stride 1 with an identity shortcut.
    const int stages[4][2] = {{64, 3}, {128, 4}, {256, 14}, {512, 3}};
    ILayer* prev = relu0;
    for (int stage = 0; stage < 4; ++stage) {
        const int filters = stages[stage][0];
        const int units = stages[stage][1];
        for (int unit = 1; unit <= units; ++unit) {
            const bool firstUnit = (unit == 1);
            const std::string lname = "stage" + std::to_string(stage + 1) + "_unit" + std::to_string(unit);
            prev = resUnit(network, weightMap, *prev->getOutput(0), filters, firstUnit ? 2 : 1, !firstUnit, lname);
        }
    }

    auto bn1 = addBatchNorm2d(network, weightMap, *prev->getOutput(0), "bn1", 2e-5);
    IFullyConnectedLayer* fc1 = network->addFullyConnected(*bn1->getOutput(0), 512, weightMap["pre_fc1_weight"], weightMap["pre_fc1_bias"]);
    assert(fc1);
    auto bn2 = addBatchNorm2d(network, weightMap, *fc1->getOutput(0), "fc1", 2e-5);

    bn2->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*bn2->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only report success when an engine was actually produced.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the network through the TensorRT API and serializes the resulting engine.
//
// maxBatchSize: forwarded to createEngine().
// modelStream:  out-parameter; receives the serialized engine. The caller owns it
//               and must call destroy() on it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder and its configuration object
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The config was previously leaked; release it
    // before the builder that created it.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference pass.
//
// context:   execution context of a two-binding engine (one input, one output).
// input:     host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats.
// output:    host buffer receiving batchSize * OUTPUT_SIZE floats.
// batchSize: number of samples in `input`.
//
// Device buffers and the CUDA stream are created and released on every call.
// If TensorRT refuses to enqueue the work, an error is printed and `output`
// is left untouched.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // An unknown tensor name yields a negative index; never index buffers[] with it.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    // Compute the byte counts once, in size_t, so the product cannot overflow int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (context.enqueue(batchSize, buffers, stream, nullptr)) {
        CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    } else {
        std::cerr << "TensorRT enqueue failed; output buffer was not updated" << std::endl;
    }
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Appends the name of every entry in directory `p_dir_name` to `file_names`,
// skipping the "." and ".." entries. Names are stored exactly as the directory
// reports them; they are not prefixed with the directory path.
//
// Returns 0 on success, or -1 (leaving `file_names` untouched) when the
// directory cannot be opened.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent *entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.push_back(std::string(entry->d_name));
    }

    closedir(dir_handle);
    return 0;
}

int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("arcface-r50.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 2 && std::string(argv[1]) == "-d") {
        std::ifstream file("arcface-r50.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./arcface-r50 -s  // serialize model to plan file" << std::endl;
        std::cerr << "./arcface-r50 -d  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    cv::Mat img = cv::imread("../joey0.ppm");
    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
    }

    // Run inference
    auto start = std::chrono::system_clock::now();
    doInference(*context, data, prob, BATCH_SIZE);
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    cv::Mat out(512, 1, CV_32FC1, prob);
    cv::Mat out_norm;
    cv::normalize(out, out_norm);

    img = cv::imread("../joey1.ppm");
    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
    }

    // Run inference
    start = std::chrono::system_clock::now();
    doInference(*context, data, prob, BATCH_SIZE);
    end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    cv::Mat out1(1, 512, CV_32FC1, prob);
    cv::Mat out_norm1;
    cv::normalize(out1, out_norm1);

    cv::Mat res = out_norm1 * out_norm;

    std::cout << "similarity score: " << *(float*)res.data << std::endl;

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    //Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    //{
    //    std::cout << p_out_norm[i] << ", ";
    //    if (i % 10 == 0) std::cout << i / 10 << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: arcface/gen_wts.py
================================================
import struct
import sys
import argparse
import face_model
import cv2
import numpy as np

# Dump every parameter of the loaded face model into a plain-text .wts file:
#   line 1:     total number of parameter blobs
#   each blob:  "<name> <element count> " followed by " <hex>" per element,
#               where <hex> is the big-endian float32 bit pattern of the value.
parser = argparse.ArgumentParser(description='face model test')
# general
parser.add_argument('--image-size', default='112,112', help='')
parser.add_argument('--model', default='model-r100-ii/model,0', help='path to load model.')
parser.add_argument('--ga-model', default='', help='path to load model.')
parser.add_argument('--gpu', default=0, type=int, help='gpu id')
parser.add_argument('--det', default=0, type=int, help='mtcnn option, 1 means using R+O, 0 means detect from begining')
parser.add_argument('--flip', default=0, type=int, help='whether do lr flip aug')
parser.add_argument('--threshold', default=1.24, type=float, help='ver dist threshold')
args = parser.parse_args()

model = face_model.FaceModel(args)

# Fetch the parameters once instead of calling get_params() for every access.
# The result is indexed as two name -> array mappings, written in that order.
all_params = model.model.get_params()
param_groups = (all_params[0], all_params[1])

# The context manager guarantees the file is flushed and closed, even on error.
with open('arcface-r100.wts', 'w') as f:
    f.write('{}\n'.format(sum(len(group.keys()) for group in param_groups)))
    for group in param_groups:
        for k, v in group.items():
            vr = v.reshape(-1).asnumpy()
            f.write('{} {} '.format(k, len(vr)))
            for vv in vr:
                f.write(' ')
                f.write(struct.pack('>f', float(vv)).hex())
            f.write('\n')


================================================
FILE: arcface/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#else
#define TRT_NOEXCEPT
#endif

using Severity = nvinfer1::ILogger::Severity;

//! String buffer that, on every sync, writes its contents to a target stream
//! prefixed with a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix.
//! Nothing is written while logging is disabled via the shouldLog flag.
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the formatted messages
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog whether messages are emitted at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, prefix and logging flag of \p other.
    //! Previously only the stream was taken over, which left mShouldLog
    //! uninitialized and the prefix empty. Characters still buffered in
    //! \p other are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the target stream, then
    //! empties the buffer and flushes the stream. Does nothing when logging
    //! is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so
            // that the two cannot end up on different outputs
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            // zero-pad the date/time fields, remembering the stream's own fill character
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            mOutput.fill(previousFill);
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!  Keeping the buffer in a base class lets it be constructed before the
//!  std::ostream base that receives its address.
//!
class LogStreamConsumerBase
{
protected:
    //! Buffer that formats and forwards the logged text.
    LogStreamConsumerBuffer mBuffer;

public:
    //! Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!
    //! \param reportableSeverity threshold; logging is enabled when severity <= reportableSeverity
    //! \param severity           severity used to pick the prefix and the destination stream
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Move constructor.
    //!  Builds a brand-new buffer from other's severity and logging flag; nothing held by other's buffer
    //!  is transferred, only the two configuration values are copied.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new threshold, and pushes the result to the buffer.
    //! \param reportableSeverity new threshold; logging is enabled when mSeverity <= reportableSeverity
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! \brief Picks the destination stream: std::cout when severity >= Severity::kINFO, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter prefix; asserts on an unknown value.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;    // cached result of (mSeverity <= reportable severity)
    Severity mSeverity; // severity this consumer was created for
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //! \brief Creates a logger that reports messages whose severity is <= the given threshold.
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! A null \p msg is logged as an empty message: constructing a std::string from a null pointer is undefined
    //! behavior, and this function may be declared noexcept.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        const char* text = (msg != nullptr) ? msg : "";
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << text << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Check the precondition first so a duplicate start does not emit a second RUNNING line before aborting.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed. \return EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed. \return EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived. \return EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass. \return the matching process exit code
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the current reporting threshold.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief builds a LogStreamConsumer for messages of severity kVERBOSE, gated by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kVERBOSE};
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kINFO, gated by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kINFO};
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kWARNING, gated by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kWARNING};
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kERROR, gated by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kERROR};
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        gated by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kINTERNAL_ERROR};
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: arcface/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H

================================================
FILE: arcface/prelu.cu
================================================
#include "prelu.h"

#include <cassert>
#include <cmath>
#include <cstring>
#include <iostream>
#include <stdio.h>

namespace nvinfer1
{
    // Builds a plugin from the per-channel slope values.
    // input_size_ starts at 0 (getOutputDimensions() assigns the real value) and the namespace starts
    // as an empty string, so serialize()/clone()/getPluginNamespace() never read indeterminate members.
    PReluPlugin::PReluPlugin(const std::vector<float>& gamma)
        : input_size_(0), gamma_(gamma), mPluginNamespace("")
    {
    }

    // Nothing to release explicitly; the members clean up after themselves.
    PReluPlugin::~PReluPlugin()
    {
    }

    // create the plugin at runtime from a byte stream
    // Expected layout (mirrors serialize()): one int (input_size_) followed by the gamma floats.
    PReluPlugin::PReluPlugin(const void* data, size_t length)
        : input_size_(0), mPluginNamespace("")
    {
        // A buffer shorter than the int header would make `length - sizeof(int)` wrap around
        // and read far past the end of the buffer.
        assert(data != nullptr && length >= sizeof(int));
        if (data == nullptr || length < sizeof(int))
        {
            return;
        }
        const char* p = static_cast<const char*>(data);
        input_size_ = reinterpret_cast<const int*>(p)[0];
        p += sizeof(int);
        const float* first = reinterpret_cast<const float*>(p);
        gamma_.assign(first, first + (length - sizeof(int)) / sizeof(float));
    }

    // Writes the plugin state into `buffer`: one int header (input_size_) followed by the gamma floats.
    // The caller must provide at least getSerializationSize() bytes.
    void PReluPlugin::serialize(void* buffer) const TRT_NOEXCEPT
    {
        int* header = reinterpret_cast<int*>(buffer);
        header[0] = input_size_;
        // the payload starts right behind the int header
        memcpy(header + 1, gamma_.data(), sizeof(float) * gamma_.size());
    }

    // Number of bytes serialize() writes: the gamma floats plus the input_size_ header.
    size_t PReluPlugin::getSerializationSize() const TRT_NOEXCEPT
    {
        const size_t payload_bytes = gamma_.size() * sizeof(float);
        return sizeof(input_size_) + payload_bytes;
    }

    // No setup work is done here; always returns 0.
    int PReluPlugin::initialize() TRT_NOEXCEPT
    { 
        return 0;
    }

    // Output has the same three dimensions as the single input.
    // Side effect: caches the element count of one input (d[0] * d[1] * d[2]) in input_size_.
    Dims PReluPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT
    {
        assert(nbInputDims == 1);
        assert(index == 0);
        const Dims& in = inputs[0];
        input_size_ = in.d[0] * in.d[1] * in.d[2];
        // Output dimensions
        return Dims3(in.d[0], in.d[1], in.d[2]);
    }

    // Set plugin namespace
    // Only the pointer is stored, not a copy of the characters.
    // NOTE(review): the caller's string must therefore outlive this plugin; confirm that holds for every caller.
    void PReluPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT
    {
        mPluginNamespace = pluginNamespace;
    }

    // Returns the pointer last stored by setPluginNamespace().
    const char* PReluPlugin::getPluginNamespace() const TRT_NOEXCEPT
    {
        return mPluginNamespace;
    }

    // Return the DataType of the plugin output at the requested index
    // Always DataType::kFLOAT, regardless of the arguments.
    DataType PReluPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT
    {
        return DataType::kFLOAT;
    }

    // Return true if output tensor is broadcast across a batch.
    // This plugin always answers false.
    bool PReluPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT
    {
        return false;
    }

    // Return true if plugin can use input that is broadcast across batch without replication.
    // This plugin always answers false.
    bool PReluPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT
    {
        return false;
    }

    // Intentionally empty: none of the tensor descriptions are inspected or stored.
    void PReluPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT
    {
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    // Intentionally empty: none of the offered handles are stored.
    void PReluPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT
    {
    }

    // Detach the plugin object from its execution context.
    // Intentionally empty: attachToContext() stores nothing, so there is nothing to drop.
    void PReluPlugin::detachFromContext() TRT_NOEXCEPT {}

    // Plugin type string; identical to the name returned by PReluPluginCreator::getPluginName().
    const char* PReluPlugin::getPluginType() const TRT_NOEXCEPT
    {
        return "PRelu_TRT";
    }

    // Plugin version string; identical to the one returned by PReluPluginCreator::getPluginVersion().
    const char* PReluPlugin::getPluginVersion() const TRT_NOEXCEPT
    {
        return "1";
    }

    // Deletes this object; the instance must have been allocated with `new` and must not be used afterwards.
    void PReluPlugin::destroy() TRT_NOEXCEPT
    {
        delete this;
    }

    // Clone the plugin
    // The copy is heap-allocated and carries the same gamma values, input_size_ and namespace pointer.
    IPluginV2IOExt* PReluPlugin::clone() const TRT_NOEXCEPT
    {
        PReluPlugin* copy = new PReluPlugin(gamma_);
        copy->setPluginNamespace(mPluginNamespace);
        copy->input_size_ = input_size_;
        return copy;
    }

    // Element-wise PReLU: one thread per element.
    //   input/output : buffers of num_elem floats
    //   input_size   : number of elements in one sample
    //   fm_size      : divisor that maps an offset inside a sample to an index into gamma
    //   gamma        : slope applied to negative inputs
    __global__ void prelu_kernel(const float *input, float *output, int num_elem, int input_size, int fm_size, const float* gamma) {
        const int idx = blockDim.x * blockIdx.x + threadIdx.x;
        if (idx < num_elem) {
            const float value = input[idx];
            // non-negative values pass through; negative ones are scaled by the slope selected from gamma
            output[idx] = (value >= 0.0f) ? value : value * gamma[(idx % input_size) / fm_size];
        }
    }

    // Launches prelu_kernel over batchSize * input_size_ elements.
    //
    //   inputs    : inputs[0] is passed to the kernel as the input buffer
    //   output    : buffer passed to the kernel as the output
    //   stream    : CUDA stream the kernel is launched on
    //   batchSize : number of samples of input_size_ elements each
    //
    // The CUDA calls are deliberately NOT wrapped in assert(): with NDEBUG defined an assert's argument is
    // never evaluated, which would skip the allocation, the copy and the free entirely.
    // Failures are reported on stderr (and still abort in debug builds through the final assert).
    void PReluPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
        const int num_elem = input_size_ * batchSize;
        if (num_elem <= 0 || gamma_.empty()) {
            // nothing to launch; an empty gamma would also make the fm_size division below divide by zero
            fprintf(stderr, "PReluPlugin::forwardGpu: nothing to do (num_elem=%d, gamma size=%zu)\n", num_elem, gamma_.size());
            return;
        }

        const int block_size = thread_count_;
        const int grid_size = (num_elem + block_size - 1) / block_size;
        const size_t gamma_bytes = sizeof(float) * gamma_.size();
        const int fm_size = static_cast<int>(input_size_ / gamma_.size());

        void* dev_gamma = nullptr;
        cudaError_t status = cudaMalloc(&dev_gamma, gamma_bytes);
        if (status == cudaSuccess) {
            // blocking copy: the data is on the device before the kernel is queued
            status = cudaMemcpy(dev_gamma, gamma_.data(), gamma_bytes, cudaMemcpyHostToDevice);
        }
        if (status == cudaSuccess) {
            // launch on the caller's stream instead of the default stream
            prelu_kernel<<<grid_size, block_size, 0, stream>>>(inputs[0], output, num_elem, input_size_, fm_size, (const float*)dev_gamma);
            status = cudaGetLastError();
        }
        if (dev_gamma != nullptr) {
            // make sure the kernel has finished reading gamma before the buffer is released
            const cudaError_t sync_status = cudaStreamSynchronize(stream);
            const cudaError_t free_status = cudaFree(dev_gamma);
            if (status == cudaSuccess) {
                status = (sync_status != cudaSuccess) ? sync_status : free_status;
            }
        }

        if (status != cudaSuccess) {
            fprintf(stderr, "PReluPlugin::forwardGpu: CUDA error: %s\n", cudaGetErrorString(status));
        }
        assert(status == cudaSuccess);
    }

    // Runs the plugin for one batch by forwarding to forwardGpu().
    // `workspace` is unused. Only outputs[0] is written.
    // Always returns 0: forwardGpu() returns void, so no failure can be propagated from here.
    int PReluPlugin::enqueue(int batchSize, const void*const * inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT
    {
        //assert(batchSize == 1);
        //GPU
        //CUDA_CHECK(cudaStreamSynchronize(stream));
        forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
        return 0;
    }

    // Definitions of the creator's static members; both are shared by every PReluPluginCreator instance.
    // mFC is filled in by the constructor and points into mPluginAttributes.
    PluginFieldCollection PReluPluginCreator::mFC{};
    std::vector<PluginField> PReluPluginCreator::mPluginAttributes;

    // Publishes the single attribute this plugin understands ("gamma", float32).
    // mPluginAttributes is a static shared by all creator instances, so it is reset first;
    // otherwise every additional creator object would append a duplicate "gamma" entry.
    PReluPluginCreator::PReluPluginCreator()
    {
        mPluginAttributes.clear();
        mPluginAttributes.emplace_back(PluginField("gamma", nullptr, PluginFieldType::kFLOAT32, 1));

        // refresh the view after the vector may have reallocated
        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    // Name of the plugin this creator builds; identical to PReluPlugin::getPluginType().
    const char* PReluPluginCreator::getPluginName() const TRT_NOEXCEPT
    {
            return "PRelu_TRT";
    }

    // Version of the plugin this creator builds; identical to PReluPlugin::getPluginVersion().
    const char* PReluPluginCreator::getPluginVersion() const TRT_NOEXCEPT
    {
            return "1";
    }

    // Returns the static field collection populated by the constructor.
    const PluginFieldCollection* PReluPluginCreator::getFieldNames() TRT_NOEXCEPT
    {
            return &mFC;
    }

    // Creates a PReluPlugin from a field collection.
    //   name : unused
    //   fc   : every field named "gamma" contributes `length` floats read from `data`;
    //          a null collection, unnamed fields and fields without data are skipped
    // Returns a heap-allocated plugin carrying this creator's namespace.
    IPluginV2IOExt* PReluPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT
    {
        std::vector<float> gamma;
        if (fc != nullptr && fc->fields != nullptr) {
            for (int i = 0; i < fc->nbFields; ++i) {
                const PluginField& field = fc->fields[i];
                if (field.name == nullptr || strcmp(field.name, "gamma") != 0) {
                    continue;
                }
                assert(field.type == PluginFieldType::kFLOAT32);
                const auto* w = static_cast<const float*>(field.data);
                if (w == nullptr || field.length <= 0) {
                    continue;
                }
                // append, so repeated "gamma" fields accumulate exactly as before
                gamma.insert(gamma.end(), w, w + field.length);
            }
        }

        PReluPlugin* obj = new PReluPlugin(gamma);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    // Rebuilds a PReluPlugin from the bytes produced by PReluPlugin::serialize().
    //   name         : unused
    //   serialData   : serialized plugin state
    //   serialLength : size of serialData in bytes
    // Returns a heap-allocated plugin carrying this creator's namespace.
    IPluginV2IOExt* PReluPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT
    {
        // This object will be deleted when the network is destroyed, which will
        // call PReluPlugin::destroy()
        PReluPlugin* obj = new PReluPlugin(serialData, serialLength);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

}


================================================
FILE: arcface/prelu.h
================================================
#ifndef _PRELU_PLUGIN_H
#define _PRELU_PLUGIN_H

#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"

namespace nvinfer1
{
    // TensorRT plugin implementing PReLU with a vector of slopes (gamma) on float, linear-format tensors.
    class PReluPlugin: public IPluginV2IOExt
    {
        public:
            // Build from the slope values.
            PReluPlugin(const std::vector<float>& gamma);
            // Rebuild from a buffer written by serialize().
            PReluPlugin(const void* data, size_t length);

            ~PReluPlugin();

            // The plugin produces exactly one output tensor.
            int getNbOutputs() const TRT_NOEXCEPT override
            {
                return 1;
            }

            Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

            int initialize() TRT_NOEXCEPT override;

            // Nothing to tear down.
            void terminate() TRT_NOEXCEPT override {}

            // No scratch memory is requested.
            size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

            int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

            size_t getSerializationSize() const TRT_NOEXCEPT override;

            void serialize(void* buffer) const TRT_NOEXCEPT override;

            // Only linear-format float tensors are accepted, for inputs and outputs alike.
            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const TRT_NOEXCEPT override;

            const char* getPluginVersion() const TRT_NOEXCEPT override;

            void destroy() TRT_NOEXCEPT override;

            IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

            void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

            const char* getPluginNamespace() const TRT_NOEXCEPT override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;

            bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;

            bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

            void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;

            void detachFromContext() TRT_NOEXCEPT override;

            // Number of elements in one input sample. Defaults to 0 so it is never read uninitialized
            // when the plugin is built from gamma alone.
            int input_size_ = 0;
        private:
            void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1);
            // CUDA block size used for the kernel launch.
            int thread_count_ = 256;
            // Slope values applied to negative inputs.
            std::vector<float> gamma_;
            // Non-owning pointer to the namespace string. Defaults to "" so getPluginNamespace()
            // and clone() never see an indeterminate pointer.
            const char* mPluginNamespace = "";
    };

    // Factory registered with TensorRT to create or deserialize PReluPlugin instances.
    class PReluPluginCreator : public IPluginCreator
    {
        public:
            PReluPluginCreator();

            ~PReluPluginCreator() override = default;

            const char* getPluginName() const TRT_NOEXCEPT override;

            const char* getPluginVersion() const TRT_NOEXCEPT override;

            const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

            IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;

            IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;

            // Stores a copy of the namespace. A null pointer is treated as the empty namespace,
            // because assigning a null const char* to std::string is undefined behavior.
            void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override
            {
                mNamespace = (libNamespace != nullptr) ? libNamespace : "";
            }

            // Returns the stored namespace; the pointer stays valid until the next setPluginNamespace() call
            // or until this creator is destroyed.
            const char* getPluginNamespace() const TRT_NOEXCEPT override
            {
                return mNamespace.c_str();
            }

        private:
            std::string mNamespace;
            // Shared by all creator instances; defined in prelu.cu.
            static PluginFieldCollection mFC;
            static std::vector<PluginField> mPluginAttributes;
    };
};
#endif 


================================================
FILE: centernet/README.md
================================================
# CenterNet

This is the trt implementation of detection model [ctdet_coco_dla_2x](https://drive.google.com/open?id=1pl_-ael8wERdUREEnaIfqOV_VF2bEVRT) from [xingyizhou/CenterNet](https://github.com/xingyizhou/CenterNet) official work. 

## How to Run

1. Follow [NVIDIA/TensorRT](https://github.com/NVIDIA/TensorRT) tutorial to build TensorRT7

2. Copy folder `dcnv2Plugin` to `TensorRT/plugin` and edit `InferPlugin.cpp` and `CMakeLists.txt`

3. Rebuild to install custom plugin

4. Use `tensorrt-7.2.3.4-cp36-none-linux_x86_64.whl` in TensorRT OSS to update your python-tensorrt

5. Run `python centernet.py -m ${PTH_PATH} -s` to create trt engine 

## Sample

```
// Download ctdet_coco_dla_2x.pth and transfer it into trt engine first
// Download the test img from https://raw.githubusercontent.com/tensorflow/models/master/research/deeplab/g3doc/img/image2.jpg or choose your own one
cd sample
python test.py ${ENGINE_PATH} ${IMG_PATH}
```
![trt_out](https://user-images.githubusercontent.com/47047345/119128637-7a878900-ba68-11eb-91ff-5dcc10f01b77.jpg)

## TODO

Integrate the post-processing into the TRT engine to make it easier to use.

================================================
FILE: centernet/centernet.py
================================================
import numpy as np

import tensorrt as trt
import torch

from sample import common
import argparse
import time

# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Register the plugins shipped with libnvinfer_plugin, then fetch every registered creator.
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
PLUGIN_CREATORS = trt.get_plugin_registry().plugin_creator_list

# Pick the creator named 'DCNv2_TRT'; if several match, the last one wins.
# NOTE(review): when no creator matches, `dcnCreator` is never bound and any later use raises NameError.
for plugin_creator in PLUGIN_CREATORS:
    if plugin_creator.name == 'DCNv2_TRT':
        dcnCreator = plugin_creator


class ModelData(object):
    # Constants describing the network's input/output bindings.
    INPUT_NAME = "data"           # name of the input tensor
    INPUT_SHAPE = (3, 512, 512)   # presumably (channels, height, width) -- TODO confirm
    OUTPUT_NAME = "prob"          # name of the output tensor
    DTYPE = trt.float16           # TensorRT data type constant


class Centernet_dla34(object):
    """TensorRT network builder for the CenterNet DLA-34 (ctdet) detector.

    The constructor receives the PyTorch ``state_dict`` of the model, recreates
    the layers through the TensorRT network-definition API and stores the built
    engine in ``self.engine``. Weights are looked up in ``weights`` by the
    checkpoint's parameter names (``module.base...``, ``module.dla_up...``,
    ``module.ida_up...``, ``module.hm`` / ``module.wh`` / ``module.reg``).

    Unless stated otherwise, the ``add_*`` helpers take and return TensorRT
    *layers* (objects exposing ``get_output(0)``), not tensors.
    """

    def __init__(self, weights) -> None:
        """Store the weights and immediately build the engine.

        Args:
            weights: mapping from parameter name to an object whose ``numpy()``
                method returns the parameter values (a PyTorch state dict).
        """
        super().__init__()
        self.weights = weights
        # Tree depth and channel count of each of the six backbone stages.
        self.levels = [1, 1, 1, 2, 2, 1]
        self.channels = [16, 32, 64, 128, 256, 512]
        # Ratio between input and output resolution; selects the first stage
        # that takes part in the up-sampling path.
        self.down_ratio = 4
        self.last_level = 5
        self.engine = self.build_engine()

    def add_batchnorm_2d(self, input_tensor, parent):
        """Append an inference-time batch normalization as a per-channel scale layer.

        Args:
            input_tensor: layer whose first output is normalized.
            parent: weight-name prefix providing ``.weight``, ``.bias``,
                ``.running_mean`` and ``.running_var``.

        Returns:
            The scale layer computing ``x * scale + shift``.
        """
        gamma = self.weights[parent + '.weight'].numpy()
        beta = self.weights[parent + '.bias'].numpy()
        mean = self.weights[parent + '.running_mean'].numpy()
        var = self.weights[parent + '.running_var'].numpy()
        eps = 1e-5

        # Fold the running statistics into an affine transform; the standard
        # deviation is computed once and shared by scale and shift.
        std = np.sqrt(var + eps)
        scale = gamma / std
        shift = beta - mean * gamma / std
        power = np.ones_like(scale)

        return self.network.add_scale(input=input_tensor.get_output(0), mode=trt.ScaleMode.CHANNEL, shift=shift, scale=scale, power=power)

    def add_basic_block(self, input_tensor, out_channels, residual=None, stride=1, dilation=1, parent=''):
        """Append a residual block: conv3x3-BN-ReLU-conv3x3-BN, shortcut add, ReLU.

        Args:
            input_tensor: layer feeding the block.
            out_channels: number of output feature maps of both convolutions.
            residual: layer used as shortcut; ``input_tensor`` when ``None``.
            stride: stride of the first convolution.
            dilation: dilation (and padding) of both convolutions.
            parent: weight-name prefix of the block.

        Returns:
            The final ReLU activation layer.
        """
        conv1_w = self.weights[parent + '.conv1.weight'].numpy()
        conv1 = self.network.add_convolution(
            input=input_tensor.get_output(0), num_output_maps=out_channels,
            kernel_shape=(3, 3), kernel=conv1_w)
        conv1.stride = (stride, stride)
        conv1.padding = (dilation, dilation)
        conv1.dilation = (dilation, dilation)

        bn1 = self.add_batchnorm_2d(conv1, parent + '.bn1')
        ac1 = self.network.add_activation(
            input=bn1.get_output(0), type=trt.ActivationType.RELU)

        conv2_w = self.weights[parent + '.conv2.weight'].numpy()
        conv2 = self.network.add_convolution(
            input=ac1.get_output(0), num_output_maps=out_channels,
            kernel_shape=(3, 3), kernel=conv2_w)
        conv2.padding = (dilation, dilation)
        conv2.dilation = (dilation, dilation)

        bn2 = self.add_batchnorm_2d(conv2, parent + '.bn2')

        # Identity shortcut unless the caller supplies a projected one.
        shortcut = input_tensor if residual is None else residual
        out = self.network.add_elementwise(
            shortcut.get_output(0), bn2.get_output(0), trt.ElementWiseOperation.SUM)
        return self.network.add_activation(input=out.get_output(0), type=trt.ActivationType.RELU)

    def add_level(self, input_tensor, out_channels, stride=1, dilation=1, parent=''):
        """Append a plain conv3x3-BN-ReLU stage (used for ``level0`` / ``level1``).

        Weights are read from ``parent + '.0'`` (convolution) and
        ``parent + '.1'`` (batch norm). Returns the ReLU activation layer.
        """
        conv1_w = self.weights[parent + '.0.weight'].numpy()
        conv1 = self.network.add_convolution(
            input=input_tensor.get_output(0), num_output_maps=out_channels,
            kernel_shape=(3, 3), kernel=conv1_w)
        conv1.stride = (stride, stride)
        conv1.padding = (dilation, dilation)
        conv1.dilation = (dilation, dilation)

        bn1 = self.add_batchnorm_2d(conv1, parent + '.1')
        return self.network.add_activation(
            input=bn1.get_output(0), type=trt.ActivationType.RELU)

    def add_root(self, input_tensors: list, out_channels, kernel_size=1, residual=False, parent=''):
        """Append an aggregation node: concat, conv1x1, BN, optional shortcut, ReLU.

        Args:
            input_tensors: layers whose first outputs are concatenated.
            out_channels: number of output feature maps.
            kernel_size: only determines the padding, ``(kernel_size - 1) // 2``;
                the convolution kernel itself is always 1x1.
            residual: when true, ``input_tensors[0]`` is added to the
                batch-norm output before the activation.
            parent: weight-name prefix providing ``.conv`` and ``.bn``.

        Returns:
            The final ReLU activation layer.
        """
        ct = self.network.add_concatenation(
            [x.get_output(0) for x in input_tensors])

        conv_w = self.weights[parent + '.conv.weight'].numpy()
        conv = self.network.add_convolution(
            input=ct.get_output(0), num_output_maps=out_channels,
            kernel_shape=(1, 1), kernel=conv_w)
        pad = (kernel_size - 1) // 2
        conv.padding = (pad, pad)

        out = self.add_batchnorm_2d(conv, parent + '.bn')

        # The shortcut is added to the pre-activation output and a single ReLU
        # follows. Activating before the addition (and once more after it)
        # would change the result for residual roots and only add a redundant
        # layer for non-residual ones.
        if residual:
            out = self.network.add_elementwise(
                input_tensors[0].get_output(0), out.get_output(0), trt.ElementWiseOperation.SUM)

        return self.network.add_activation(input=out.get_output(0), type=trt.ActivationType.RELU)

    def add_tree(self, input_tensor, level, out_channels, residual=None, children=None, stride=1, level_root=False, parent=''):
        """Recursively append a hierarchical aggregation tree.

        Args:
            input_tensor: layer feeding the tree.
            level: remaining tree depth; ``1`` emits two basic blocks and a root.
            out_channels: number of output feature maps.
            residual: accepted for signature compatibility only; the shortcut
                is always recomputed from ``input_tensor`` below.
            children: layers collected so far for the root's concatenation.
                The list is extended in place.
            stride: when greater than 1 the shortcut path is max-pooled and the
                first block/tree is strided.
            level_root: when true the (pooled) input joins the root's inputs.
            parent: weight-name prefix of the tree.

        Returns:
            The layer produced by the innermost root.
        """
        children = [] if children is None else children

        # Down-sampled copy of the input for the shortcut / root inputs.
        if stride > 1:
            bottom = self.network.add_pooling(
                input_tensor.get_output(0), trt.PoolingType.MAX, (stride, stride))
            bottom.stride = (stride, stride)
        else:
            bottom = input_tensor

        # Project the shortcut with conv1x1 + BN when the channel count changes
        # (index 0 of the shape is read as the channel dimension).
        if input_tensor.get_output(0).shape[0] != out_channels:
            project_w = self.weights[parent + '.project.0.weight'].numpy()
            project_conv = self.network.add_convolution(
                input=bottom.get_output(0), num_output_maps=out_channels,
                kernel_shape=(1, 1), kernel=project_w)
            residual = self.add_batchnorm_2d(project_conv, parent + '.project.1')
        else:
            residual = bottom

        if level_root:
            children.append(bottom)

        if level == 1:
            tree1 = self.add_basic_block(
                input_tensor, out_channels, residual, stride, parent=parent + '.tree1')
            tree2 = self.add_basic_block(
                tree1, out_channels, parent=parent + '.tree2')
            return self.add_root([tree2, tree1] + children, out_channels, parent=parent + '.root')

        tree1 = self.add_tree(input_tensor, level - 1, out_channels,
                              residual, stride=stride, parent=parent + '.tree1')
        children.append(tree1)
        return self.add_tree(tree1, level - 1, out_channels, children=children, parent=parent + '.tree2')

    def add_base(self, input_tensor, parent):
        """Append the backbone and return the outputs of its six stages.

        Args:
            input_tensor: the network input *tensor* (not a layer).
            parent: weight-name prefix of the backbone.

        Returns:
            ``[level0, ..., level5]`` layers.
        """
        base_conv1_w = self.weights[parent + '.base_layer.0.weight'].numpy()
        base_conv1 = self.network.add_convolution(
            input=input_tensor, num_output_maps=self.channels[0], kernel_shape=(7, 7), kernel=base_conv1_w)
        base_conv1.padding = (3, 3)

        base_bn1 = self.add_batchnorm_2d(base_conv1, parent + '.base_layer.1')
        base_ac1 = self.network.add_activation(
            input=base_bn1.get_output(0), type=trt.ActivationType.RELU)

        level0 = self.add_level(
            base_ac1, self.channels[0], parent=parent + '.level0')
        level1 = self.add_level(
            level0, self.channels[1], 2, parent=parent + '.level1')

        level2 = self.add_tree(
            level1, self.levels[2], self.channels[2], stride=2, level_root=False, parent=parent + '.level2')
        level3 = self.add_tree(
            level2, self.levels[3], self.channels[3], stride=2, level_root=True, parent=parent + '.level3')
        level4 = self.add_tree(
            level3, self.levels[4], self.channels[4], stride=2, level_root=True, parent=parent + '.level4')
        level5 = self.add_tree(
            level4, self.levels[5], self.channels[5], stride=2, level_root=True, parent=parent + '.level5')

        return [level0, level1, level2, level3, level4, level5]

    def add_deform_conv(self, input_tensor, out_channels, kernel=3, stride=1, padding=1, dilation=1, deformable_group=1, parent=''):
        """Append offset/mask conv -> DCNv2 plugin -> BN -> ReLU.

        Args:
            input_tensor: layer feeding the block.
            out_channels: number of output feature maps of the plugin.
            kernel, stride, padding, dilation, deformable_group: integer
                settings forwarded to the plugin; ``kernel``, ``stride`` and
                ``padding`` also configure the offset/mask convolution.
            parent: weight-name prefix providing ``.conv`` and ``.actf.0``.

        Returns:
            The final ReLU activation layer.
        """
        offset_mask_w = self.weights[parent + '.conv.conv_offset_mask.weight'].numpy()
        offset_mask_b = self.weights[parent + '.conv.conv_offset_mask.bias'].numpy()
        # Three maps per kernel tap and deformable group.
        conv_offset_mask = self.network.add_convolution(
            input=input_tensor.get_output(0),
            num_output_maps=deformable_group * 3 * kernel * kernel,
            kernel_shape=(kernel, kernel),
            kernel=offset_mask_w,
            bias=offset_mask_b)
        conv_offset_mask.stride = (stride, stride)
        conv_offset_mask.padding = (padding, padding)

        def int_field(name, value):
            # Scalar plugin attribute wrapped as a one-element int32 array.
            return trt.PluginField(name, np.array([value], dtype=np.int32), trt.PluginFieldType.INT32)

        # The fields are kept in their own list so the integer arguments of
        # this method are not rebound to PluginField objects.
        fields = [
            int_field("out_channels", out_channels),
            int_field("kernel", kernel),
            int_field("deformable_group", deformable_group),
            int_field("dilation", dilation),
            int_field("padding", padding),
            int_field("stride", stride),
            trt.PluginField(
                "weight", self.weights[parent + '.conv.weight'].numpy(), trt.PluginFieldType.FLOAT32),
            trt.PluginField(
                "bias", self.weights[parent + '.conv.bias'].numpy(), trt.PluginFieldType.FLOAT32),
        ]
        field_collection = trt.PluginFieldCollection(fields)
        DCN = dcnCreator.create_plugin(
            name='DCNv2_TRT', field_collection=field_collection)

        sigmoid_conv_offset_mask = self.network.add_activation(
            input=conv_offset_mask.get_output(0), type=trt.ActivationType.SIGMOID)

        # The plugin receives the features, the raw offset/mask maps and their
        # sigmoid. NOTE(review): presumably it reads the offsets from the raw
        # maps and the mask from the sigmoid output - confirm in dcnv2Plugin.
        dcn = self.network.add_plugin_v2(
            inputs=[input_tensor.get_output(0),
                    conv_offset_mask.get_output(0),
                    sigmoid_conv_offset_mask.get_output(0)],
            plugin=DCN)
        bn = self.add_batchnorm_2d(dcn, parent + '.actf.0')
        return self.network.add_activation(input=bn.get_output(0), type=trt.ActivationType.RELU)

    def add_ida_up(self, input_tensors, out_channels, up_f, startp, parent):
        """Append one iterative aggregation pass over ``input_tensors[startp + 1:]``.

        Each layer is projected with a deformable convolution, up-sampled by a
        grouped transposed convolution, added to its predecessor and refined
        by a second deformable convolution.

        Args:
            input_tensors: list of layers; entries after ``startp`` are
                replaced in place.
            out_channels: number of feature maps of every produced layer.
            up_f: up-sampling factor per position relative to ``startp``
                (a list or a numpy array).
            startp: index of the layer the aggregation starts from.
            parent: weight-name prefix providing ``.proj_k``, ``.up_k`` and
                ``.node_k``.

        Returns:
            ``input_tensors`` (the same, mutated list).
        """
        for i in range(startp + 1, len(input_tensors)):
            step = i - startp
            proj = self.add_deform_conv(
                input_tensors[i], out_channels, parent=parent + '.proj_%d' % step)
            # up_f may hold numpy integers; hand plain ints to TensorRT.
            f = int(up_f[step])
            up_w = self.weights[parent + '.up_%d.weight' % step].numpy()
            up = self.network.add_deconvolution(
                proj.get_output(0), out_channels, (f * 2, f * 2), up_w)
            up.stride = (f, f)
            up.padding = (f // 2, f // 2)
            # One group per channel: every feature map is up-sampled separately.
            up.num_groups = out_channels
            node = self.network.add_elementwise(
                input_tensors[i - 1].get_output(0), up.get_output(0), trt.ElementWiseOperation.SUM)
            input_tensors[i] = self.add_deform_conv(
                node, out_channels, parent=parent + '.node_%d' % step)
        return input_tensors

    def add_dla_up(self, input_tensors, first_level, parent):
        """Append the up-sampling path that merges the backbone stages.

        Args:
            input_tensors: list of backbone stage layers; modified in place by
                the aggregation passes.
            first_level: index of the first stage that takes part.
            parent: weight-name prefix providing ``.ida_k``.

        Returns:
            List holding the last layer before the first pass and, in front
            of it, the last layer after each successive pass.
        """
        channels = self.channels[first_level:]
        scales = [2 ** i for i in range(len(self.channels[first_level:]))]
        scales = np.array(scales, dtype=int)
        out = [input_tensors[-1]]
        for i in range(len(channels) - 1):
            j = -i - 2
            input_tensors = self.add_ida_up(
                input_tensors, channels[j], scales[j:] // scales[j], len(input_tensors) - i - 2, parent + '.ida_%d' % i)
            out.insert(0, input_tensors[-1])
            # Everything behind position j now has the scale and width of j.
            scales[j + 1:] = scales[j]
            channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]
        return out

    def add_head(self, input_tensor, out_channels, head, head_conv=256, final_kernal=1):
        """Append an output head: conv3x3 + ReLU + final convolution.

        Args:
            input_tensor: layer feeding the head.
            out_channels: number of maps produced by the final convolution.
            head: weight-name prefix providing ``.0`` and ``.2``.
            head_conv: number of maps of the intermediate convolution.
            final_kernal: kernel size of the final convolution.

        Returns:
            The final convolution layer (no activation applied).
        """
        conv1_w = self.weights[head + '.0.weight'].numpy()
        conv1_b = self.weights[head + '.0.bias'].numpy()
        conv1 = self.network.add_convolution(
            input_tensor.get_output(0), head_conv, (3, 3), conv1_w, conv1_b)
        conv1.padding = (1, 1)
        ac1 = self.network.add_activation(
            input=conv1.get_output(0), type=trt.ActivationType.RELU)
        conv2_w = self.weights[head + '.2.weight'].numpy()
        conv2_b = self.weights[head + '.2.bias'].numpy()
        conv2 = self.network.add_convolution(
            ac1.get_output(0), out_channels, (final_kernal, final_kernal), conv2_w, conv2_b)
        # Keep the spatial size for kernels larger than 1x1 (no-op for 1x1).
        conv2.padding = (final_kernal // 2, final_kernal // 2)
        return conv2

    def populate_network(self):
        """Add all layers to ``self.network`` and mark the three outputs."""
        # Configure the network layers based on the self.weights provided.
        input_tensor = self.network.add_input(
            name=ModelData.INPUT_NAME, dtype=ModelData.DTYPE, shape=ModelData.INPUT_SHAPE)

        y = self.add_base(input_tensor, 'module.base')

        first_level = int(np.log2(self.down_ratio))
        last_level = self.last_level
        dla_up = self.add_dla_up(y, first_level, 'module.dla_up')
        ida_up = self.add_ida_up(
            dla_up[:last_level - first_level], self.channels[first_level],
            [2 ** i for i in range(last_level - first_level)], 0, 'module.ida_up')

        # All heads read the last aggregated feature map.
        hm = self.add_head(ida_up[-1], 80, 'module.hm')
        wh = self.add_head(ida_up[-1], 2, 'module.wh')
        reg = self.add_head(ida_up[-1], 2, 'module.reg')

        hm.get_output(0).name = 'hm'
        wh.get_output(0).name = 'wh'
        reg.get_output(0).name = 'reg'
        self.network.mark_output(tensor=hm.get_output(0))
        self.network.mark_output(tensor=wh.get_output(0))
        self.network.mark_output(tensor=reg.get_output(0))

    def build_engine(self):
        """Create the network, populate it and return the built engine."""
        # For more information on TRT basics, refer to the introductory samples.
        with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
            self.network = network
            builder.max_workspace_size = common.GiB(1)
            builder.max_batch_size = 1
            # Populate the network using self.weights from the PyTorch model.
            self.populate_network()
            # Build and return an engine.
            return builder.build_cuda_engine(self.network)


def load_random_test_case(pagelocked_buffer):
    """Fill the input buffer with Gaussian noise and return the noise.

    Args:
        pagelocked_buffer: flat array receiving the 1 * 3 * 512 * 512 values.

    Returns:
        The generated float32 array of shape (1, 3, 512, 512).
    """
    # Standard-normal noise stands in for a real image.
    noise = np.random.randn(1, 3, 512, 512)
    noise = noise.astype(np.float32)
    # Flatten and copy into the caller's input buffer.
    np.copyto(pagelocked_buffer, noise.reshape(-1))
    return noise


def main(args):
    """Build the engine from a checkpoint, optionally save it, and run one timed inference.

    Args:
        args: parsed command-line options providing ``model`` (checkpoint
            path) and ``save_engine`` (write ``centernet.engine`` when true).
    """
    # Load the PyTorch checkpoint and take its weights.
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})
    weights = checkpoint['state_dict']

    # Build the TensorRT engine and run it once on random input.
    with Centernet_dla34(weights).engine as engine:
        if args.save_engine:
            with open('centernet.engine', "wb") as engine_file:
                engine_file.write(engine.serialize())

        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        with engine.create_execution_context() as context:
            load_random_test_case(pagelocked_buffer=inputs[0].host)
            # do_inference returns one host array per output; this network has
            # three of them (hm, wh, reg).
            start = time.time()
            hm, wh, reg = common.do_inference(
                context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=1)
            elapsed = time.time() - start
            means = (hm.mean(), wh.mean(), reg.mean())
            print('output:   hm:%f, wh:%f, reg:%f' % means)
            print(elapsed)


if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser(description='CenterNet dla34 ctdet')
    parser.add_argument(
        '--model', '-m',
        type=str,
        default='./ctdet_coco_dla_2x.pth',
        help='path of pytorch .pth',
    )
    parser.add_argument(
        '--save_engine', '-s',
        action='store_true',
        help='if save trt engine',
    )
    args = parser.parse_args()
    main(args)


================================================
FILE: centernet/dcnv2Plugin/CMakeLists.txt
================================================
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Add this directory's sources to the source lists of the enclosing build.

# Collect the C++ sources matching *.cpp, append them to PLUGIN_SOURCES and
# publish the extended list to the parent scope (PARENT_SCOPE), since a plain
# set() only changes the variable in the current scope.
file(GLOB SRCS *.cpp)
set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE)
# Same for the CUDA sources (*.cu), which go into PLUGIN_CU_SOURCES.
file(GLOB CU_SRCS *.cu)
set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS})
set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} PARENT_SCOPE)

================================================
FILE: centernet/dcnv2Plugin/dcn_v2_im2col_cuda.cu
================================================
#include "dcn_v2_im2col_cuda.h"
#include <cstdio>
#include <algorithm>
#include <cstring>

// Grid-stride loop header: declares `int i`, starts it at the thread's global
// index along x and advances it by the total number of threads along x until
// it reaches n.
// Only blockIdx.x / gridDim.x are used, so in a launch with gridDim.y > 1
// every row of blocks iterates over the same, complete index range.
#define CUDA_KERNEL_LOOP(i, n)                          \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
      i < (n);                                          \
      i += blockDim.x * gridDim.x)

// Number of threads per block used by every kernel launch in this file.
const int CUDA_NUM_THREADS = 512;

// Returns the launch grid for n work items processed by CUDA_NUM_THREADS
// threads per block.
//
// The grid is always one-dimensional. CUDA_KERNEL_LOOP derives its start index
// and stride from blockIdx.x / gridDim.x only, so a grid with gridDim.y > 1
// would make every row of blocks walk the whole index range again: redundant
// work for the im2col kernel and a multiplied result for the atomicAdd-based
// col2im kernel. Work beyond the block cap is covered by the grid-stride loop.
dim3 GET_BLOCKS(uint n)
{
    const uint max_blocks = 65535;
    // n == 0 would wrap around in (n - 1); a launch needs at least one block.
    uint blocks = (n == 0) ? 1 : (n - 1) / CUDA_NUM_THREADS + 1;
    if (blocks > max_blocks)
    {
        blocks = max_blocks;
    }
    return dim3(blocks, 1, 1);
}

// Bilinearly interpolates bottom_data at the fractional position (h, w).
// bottom_data is indexed as row * data_width + column. Corners lying above or
// left of the image (index < 0) or past height - 1 / width - 1 contribute 0.
__device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width,
                                      const int height, const int width, float h, float w)
{
  // Integer corners enclosing (h, w).
  const int top = floor(h);
  const int left = floor(w);
  const int bottom = top + 1;
  const int right = left + 1;

  // Fractional distance from the top-left corner and its complement.
  const float frac_h = h - top;
  const float frac_w = w - left;
  const float inv_h = 1 - frac_h;
  const float inv_w = 1 - frac_w;

  // Validity of each corner coordinate, checked on the side it can leave.
  const bool top_ok = top >= 0;
  const bool left_ok = left >= 0;
  const bool bottom_ok = bottom <= height - 1;
  const bool right_ok = right <= width - 1;

  const float v_tl = (top_ok && left_ok) ? bottom_data[top * data_width + left] : 0.0f;
  const float v_tr = (top_ok && right_ok) ? bottom_data[top * data_width + right] : 0.0f;
  const float v_bl = (bottom_ok && left_ok) ? bottom_data[bottom * data_width + left] : 0.0f;
  const float v_br = (bottom_ok && right_ok) ? bottom_data[bottom * data_width + right] : 0.0f;

  const float w_tl = inv_h * inv_w;
  const float w_tr = inv_h * frac_w;
  const float w_bl = frac_h * inv_w;
  const float w_br = frac_h * frac_w;

  return (w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br);
}

// Returns the bilinear weight with which the integer pixel (h, w) contributes
// to a sample taken at the fractional position (argmax_h, argmax_w).
// The result is 0 when the sample lies outside (-1, height) x (-1, width) or
// when (h, w) is not one of the four corners enclosing the sample.
__device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w,
                                          const int h, const int w, const int height, const int width)
{
  const bool outside = argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width;
  if (outside)
  {
    return 0;
  }

  const int row_lo = floor(argmax_h);
  const int col_lo = floor(argmax_w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  // row_lo != row_hi and col_lo != col_hi, so at most one corner matches.
  if (h == row_lo)
  {
    if (w == col_lo)
      return (h + 1 - argmax_h) * (w + 1 - argmax_w);
    if (w == col_hi)
      return (h + 1 - argmax_h) * (argmax_w + 1 - w);
  }
  else if (h == row_hi)
  {
    if (w == col_lo)
      return (argmax_h + 1 - h) * (w + 1 - argmax_w);
    if (w == col_hi)
      return (argmax_h + 1 - h) * (argmax_w + 1 - w);
  }
  return 0;
}

// Returns the derivative of the bilinear sample of im_data at
// (argmax_h, argmax_w) with respect to one coordinate: the row coordinate for
// bp_dir == 0, the column coordinate for bp_dir == 1 (0 for any other value).
// im_data is indexed as row * data_width + column; corners outside the image
// are skipped, and samples outside (-1, height) x (-1, width) yield 0.
__device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w,
                                            const int height, const int width, const float *im_data,
                                            const int data_width, const int bp_dir)
{
  const bool outside = argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width;
  if (outside)
  {
    return 0;
  }

  const int row_lo = floor(argmax_h);
  const int col_lo = floor(argmax_w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  const bool row_lo_ok = row_lo >= 0;
  const bool col_lo_ok = col_lo >= 0;
  const bool row_hi_ok = row_hi <= height - 1;
  const bool col_hi_ok = col_hi <= width - 1;

  float acc = 0;

  switch (bp_dir)
  {
  case 0:
    // d/dh: upper corners enter negatively, lower corners positively.
    if (row_lo_ok && col_lo_ok)
      acc += -1 * (col_lo + 1 - argmax_w) * im_data[row_lo * data_width + col_lo];
    if (row_lo_ok && col_hi_ok)
      acc += -1 * (argmax_w - col_lo) * im_data[row_lo * data_width + col_hi];
    if (row_hi_ok && col_lo_ok)
      acc += (col_lo + 1 - argmax_w) * im_data[row_hi * data_width + col_lo];
    if (row_hi_ok && col_hi_ok)
      acc += (argmax_w - col_lo) * im_data[row_hi * data_width + col_hi];
    break;
  case 1:
    // d/dw: left corners enter negatively, right corners positively.
    if (row_lo_ok && col_lo_ok)
      acc += -1 * (row_lo + 1 - argmax_h) * im_data[row_lo * data_width + col_lo];
    if (row_lo_ok && col_hi_ok)
      acc += (row_lo + 1 - argmax_h) * im_data[row_lo * data_width + col_hi];
    if (row_hi_ok && col_lo_ok)
      acc += -1 * (argmax_h - row_lo) * im_data[row_hi * data_width + col_lo];
    if (row_hi_ok && col_hi_ok)
      acc += (argmax_h - row_lo) * im_data[row_hi * data_width + col_hi];
    break;
  default:
    break;
  }

  return acc;
}

// Modulated deformable im2col.
// Each loop index addresses one (image channel, batch item, output row, output
// column) element. For it, the kernel_h x kernel_w taps are sampled from the
// input at their offset-shifted positions with bilinear interpolation, scaled
// by the tap's mask value and written to the column buffer.
//
// Layouts implied by the index arithmetic below:
//   data_im     : [batch_size, num_channels, height, width]
//   data_offset : [batch_size, deformable_group * 2 * kernel_h * kernel_w, height_col, width_col]
//                 (per tap: the row offset map followed by the column offset map)
//   data_mask   : [batch_size, deformable_group * kernel_h * kernel_w, height_col, width_col]
//   data_col    : [num_channels * kernel_h * kernel_w, batch_size, height_col, width_col]
// n is the number of loop indices to process.
__global__ void modulated_deformable_im2col_gpu_kernel(const int n,
                                                       const float *data_im, const float *data_offset, const float *data_mask,
                                                       const int height, const int width, const int kernel_h, const int kernel_w,
                                                       const int pad_h, const int pad_w,
                                                       const int stride_h, const int stride_w,
                                                       const int dilation_h, const int dilation_w,
                                                       const int channel_per_deformable_group,
                                                       const int batch_size, const int num_channels, const int deformable_group,
                                                       const int height_col, const int width_col,
                                                       float *data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // index index of output matrix
    // Decompose index as [c_im, b_col, h_col, w_col], w_col varying fastest.
    const int w_col = index % width_col;
    const int h_col = (index / width_col) % height_col;
    const int b_col = (index / width_col / height_col) % batch_size;
    const int c_im = (index / width_col / height_col) / batch_size;
    // First column-buffer channel belonging to image channel c_im.
    const int c_col = c_im * kernel_h * kernel_w;

    // compute deformable group index
    const int deformable_group_index = c_im / channel_per_deformable_group;

    // Top-left input position of the undeformed kernel window.
    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
    // Start of the (b_col, c_im) image plane.
    const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
    // Start of the offset maps of this batch item and deformable group.
    const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;

    // Start of the mask maps of this batch item and deformable group.
    const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i)
    {
      for (int j = 0; j < kernel_w; ++j)
      {
        // Tap (i, j): offset map 2 * tap holds the row offset, 2 * tap + 1 the column offset.
        const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
        const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
        const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
        const float offset_h = data_offset_ptr[data_offset_h_ptr];
        const float offset_w = data_offset_ptr[data_offset_w_ptr];
        const float mask = data_mask_ptr[data_mask_hw_ptr];
        float val = static_cast<float>(0);
        // Fractional sampling position in the input image.
        const float h_im = h_in + i * dilation_h + offset_h;
        const float w_im = w_in + j * dilation_w + offset_w;
        //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
        // Positions in (-1, 0) still overlap the first row/column, hence > -1.
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
        {
          //const float map_h = i * dilation_h + offset_h;
          //const float map_w = j * dilation_w + offset_w;
          //const int cur_height = height - h_in;
          //const int cur_width = width - w_in;
          //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
          val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
        }
        *data_col_ptr = val * mask;
        // Advance to the same (b, h, w) element of the next column-buffer channel.
        data_col_ptr += batch_size * height_col * width_col;
        //data_col_ptr += height_col * width_col;
      }
    }
  }
}

// Modulated deformable col2im (gradient with respect to the input image).
// Each loop index addresses one element of data_col, decomposed as
// [channel, kernel row, kernel column, batch item, output row, output column].
// The element, multiplied by its mask value, is distributed with bilinear
// weights onto the input pixels around its sampling position and accumulated
// into grad_im with atomicAdd.
// grad_im is indexed as [batch_size, channels, height, width]; data_offset and
// data_mask are indexed per batch item and deformable group as
// [2 * kernel_h * kernel_w, height_col, width_col] and
// [kernel_h * kernel_w, height_col, width_col].
__global__ void modulated_deformable_col2im_gpu_kernel(const int n,
                                                       const float *data_col, const float *data_offset, const float *data_mask,
                                                       const int channels, const int height, const int width,
                                                       const int kernel_h, const int kernel_w,
                                                       const int pad_h, const int pad_w,
                                                       const int stride_h, const int stride_w,
                                                       const int dilation_h, const int dilation_w,
                                                       const int channel_per_deformable_group,
                                                       const int batch_size, const int deformable_group,
                                                       const int height_col, const int width_col,
                                                       float *grad_im)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // Kernel tap (i, j) and image channel c encoded in the slow part of index.
    const int j = (index / width_col / height_col / batch_size) % kernel_w;
    const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
    // compute the start and end of the output

    const int deformable_group_index = c / channel_per_deformable_group;

    // Output position and batch item encoded in the fast part of index.
    int w_out = index % width_col;
    int h_out = (index / width_col) % height_col;
    int b = (index / width_col / height_col) % batch_size;
    // Top-left input position of the undeformed kernel window.
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
    const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
    const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
    const float offset_h = data_offset_ptr[data_offset_h_ptr];
    const float offset_w = data_offset_ptr[data_offset_w_ptr];
    const float mask = data_mask_ptr[data_mask_hw_ptr];
    // Fractional sampling position of this tap in the input image.
    const float cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const float cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const float cur_top_grad = data_col[index] * mask;
    // The casts truncate toward zero (not floor).
    const int cur_h = (int)cur_inv_h_data;
    const int cur_w = (int)cur_inv_w_data;
    // Scan a 5 x 5 neighbourhood; only in-bounds pixels closer than 1 to the
    // sampling position along both axes receive a contribution.
    for (int dy = -2; dy <= 2; dy++)
    {
      for (int dx = -2; dx <= 2; dx++)
      {
        if (cur_h + dy >= 0 && cur_h + dy < height &&
            cur_w + dx >= 0 && cur_w + dx < width &&
            abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1)
        {
          int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
        }
      }
    }
  }
}

// Modulated deformable col2im for the coordinates (gradient with respect to
// the offsets and the mask).
// Each loop index addresses one element of grad_offset, decomposed as
// [batch item, offset channel, output row, output column]. Even offset
// channels hold row offsets (bp_dir 0), odd ones column offsets (bp_dir 1).
// Threads handling an even offset channel additionally write the mask
// gradient of the corresponding kernel tap.
// Here channel_per_deformable_group counts column-buffer channels (the image
// channel count is obtained below by dividing it by kernel_h and kernel_w).
__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
                                                             const float *data_col, const float *data_im,
                                                             const float *data_offset, const float *data_mask,
                                                             const int channels, const int height, const int width,
                                                             const int kernel_h, const int kernel_w,
                                                             const int pad_h, const int pad_w,
                                                             const int stride_h, const int stride_w,
                                                             const int dilation_h, const int dilation_w,
                                                             const int channel_per_deformable_group,
                                                             const int batch_size, const int offset_channels, const int deformable_group,
                                                             const int height_col, const int width_col,
                                                             float *grad_offset, float *grad_mask)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // val accumulates the offset gradient, mval the mask gradient.
    float val = 0, mval = 0;
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;
    // compute the start and end of the output

    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    // Distance in column-buffer channels between the same tap of consecutive image channels.
    const int col_step = kernel_h * kernel_w;
    // Counts the image channels visited so far within the deformable group.
    int cnt = 0;
    const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
    const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
    const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
    const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;

    // Offset channel relative to the start of its deformable group.
    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    // Visit the column-buffer channel of tap offset_c / 2 for every image
    // channel of the deformable group.
    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
    {
      const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      const int bp_dir = offset_c % 2;

      // Recover the kernel tap and output position from the column position.
      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
      const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
      const float offset_h = data_offset_ptr[data_offset_h_ptr];
      const float offset_w = data_offset_ptr[data_offset_w_ptr];
      const float mask = data_mask_ptr[data_mask_hw_ptr];
      // Fractional sampling position of this tap in the input image.
      float inv_h = h_in + i * dilation_h + offset_h;
      float inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
      {
        // Out-of-range sample: -2 makes dmcn_get_coordinate_weight return 0.
        inv_h = inv_w = -2;
      }
      else
      {
        // Mask gradient: column gradient times the unmasked bilinear sample.
        mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
      }
      const float weight = dmcn_get_coordinate_weight(
          inv_h, inv_w,
          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
      val += weight * data_col_ptr[col_pos] * mask;
      cnt += 1;
    }
    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
    grad_offset[index] = val;
    // Only one of the two threads of a tap (the even offset channel) writes the mask gradient.
    if (offset_c % 2 == 0)
      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
  }
}

/*!
 * \brief Launch modulated_deformable_im2col_gpu_kernel on `stream`.
 *
 * The launch covers channels * batch_size * height_col * width_col work items.
 * The channels are split evenly across `deformable_group` groups
 * (channels / deformable_group, integer division).
 *
 * A launch failure reported by cudaGetLastError() is only printed; nothing is
 * returned to the caller.
 */
void modulated_deformable_im2col_cuda(cudaStream_t stream,
  const float* data_im, const float* data_offset, const float* data_mask,
  const int batch_size, const int channels, const int height_im, const int width_im,
  const int height_col, const int width_col, const int kernel_h, const int kernel_w,
  const int pad_h, const int pad_w, const int stride_h, const int stride_w,
  const int dilation_h, const int dilation_w,
  const int deformable_group, float* data_col) {
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels = channels * batch_size * height_col * width_col;
  modulated_deformable_im2col_gpu_kernel
      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
          0, stream>>>(
      num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
      batch_size, channels, deformable_group, height_col, width_col, data_col);

  // Picks up errors raised by the launch itself (e.g. an invalid configuration).
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
  }

}

/*!
 * \brief Launch modulated_deformable_col2im_gpu_kernel on `stream`.
 *
 * The launch covers
 * channels * kernel_h * kernel_w * batch_size * height_col * width_col work
 * items, and the result is written to `grad_im`.
 *
 * A launch failure reported by cudaGetLastError() is only printed; nothing is
 * returned to the caller.
 */
void modulated_deformable_col2im_cuda(cudaStream_t stream,
  const float* data_col, const float* data_offset, const float* data_mask,
  const int batch_size, const int channels, const int height_im, const int width_im,
  const int height_col, const int width_col, const int kernel_h, const int kernel_w,
  const int pad_h, const int pad_w, const int stride_h, const int stride_w,
  const int dilation_h, const int dilation_w,
  const int deformable_group, float* grad_im){

  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
  // Forward both paddings: the horizontal padding is pad_w. Passing pad_h twice
  // gives wrong results whenever pad_h != pad_w.
  modulated_deformable_col2im_gpu_kernel
      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
          0, stream>>>(
        num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im,
        kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, channel_per_deformable_group,
        batch_size, deformable_group, height_col, width_col, grad_im);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
  }

}

/*!
 * \brief Launch modulated_deformable_col2im_coord_gpu_kernel on `stream`,
 *        which fills `grad_offset` and `grad_mask`.
 *
 * The launch covers
 * batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group
 * work items. A launch failure reported by cudaGetLastError() is only printed.
 */
void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
  const float* data_col, const float* data_im, const float* data_offset, const float* data_mask,
  const int batch_size, const int channels, const int height_im, const int width_im,
  const int height_col, const int width_col, const int kernel_h, const int kernel_w,
  const int pad_h, const int pad_w, const int stride_h, const int stride_w,
  const int dilation_h, const int dilation_w,
  const int deformable_group,
  float* grad_offset, float* grad_mask) {
  // Column-buffer channels (channels * kernel_h * kernel_w) owned by one deformable group.
  const int col_channels_per_group = channels * kernel_h * kernel_w / deformable_group;
  // Number of offset channels handed to the kernel.
  const int offset_channels = 2 * kernel_h * kernel_w * deformable_group;
  const int total_threads = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;

  modulated_deformable_col2im_coord_gpu_kernel
      <<<GET_BLOCKS(total_threads), CUDA_NUM_THREADS, 0, stream>>>(
        total_threads, data_col, data_im, data_offset, data_mask,
        channels, height_im, width_im,
        kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, col_channels_per_group,
        batch_size, offset_channels, deformable_group, height_col, width_col,
        grad_offset, grad_mask);

  const cudaError_t launch_status = cudaGetLastError();
  if (launch_status != cudaSuccess)
  {
    printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(launch_status));
  }
}

================================================
FILE: centernet/dcnv2Plugin/dcn_v2_im2col_cuda.h
================================================
/*!
 ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
 *
 * COPYRIGHT
 *
 * All contributions by the University of California:
 * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
 * All rights reserved.
 *
 * All other contributions:
 * Copyright (c) 2014-2017, the respective contributors
 * All rights reserved.
 *
 * Caffe uses a shared copyright model: each contributor holds copyright over
 * their contributions to Caffe. The project versioning records all such
 * contribution and copyright details. If a contributor wants to further mark
 * their specific copyright on a particular contribution, they should indicate
 * their copyright solely in the commit message of the change when it is
 * committed.
 *
 * LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * CONTRIBUTION AGREEMENT
 *
 * By contributing to the BVLC/caffe repository through pull-request, comment,
 * or otherwise, the contributor releases their content to the
 * license and copyright terms herein.
 *
 ***************** END Caffe Copyright Notice and Disclaimer ********************
 *
 * Copyright (c) 2018 Microsoft
 * Licensed under The MIT License [see LICENSE for details]
 * \file modulated_deformable_im2col.h
 * \brief Function definitions of converting an image to
 * column matrix based on kernel, padding, dilation, and offset.
 * These functions are mainly used in deformable convolution operators.
 * \ref: https://arxiv.org/abs/1811.11168
 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu
 */

/***************** Adapted by Charles Shang *********************/

#ifndef DCN_V2_IM2COL_CUDA
#define DCN_V2_IM2COL_CUDA

// #ifdef __cplusplus
// extern "C"
// {
// #endif

  /*!
   * \brief Launch the modulated deformable im2col kernel on `stream`, writing to `data_col`.
   *
   * (kernel_h, kernel_w), (pad_h, pad_w), (stride_h, stride_w) and (dilation_h, dilation_w)
   * are the vertical/horizontal convolution parameters.
   */
  void modulated_deformable_im2col_cuda(cudaStream_t stream,
                                        const float *data_im, const float *data_offset, const float *data_mask,
                                        const int batch_size, const int channels, const int height_im, const int width_im,
                                        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
                                        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
                                        const int dilation_h, const int dilation_w,
                                        const int deformable_group, float *data_col);

  /*!
   * \brief Launch the modulated deformable col2im kernel on `stream`, writing to `grad_im`.
   *
   * Parameter names match the definition (`kernel_w`, previously misspelled here).
   */
  void modulated_deformable_col2im_cuda(cudaStream_t stream,
                                        const float *data_col, const float *data_offset, const float *data_mask,
                                        const int batch_size, const int channels, const int height_im, const int width_im,
                                        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
                                        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
                                        const int dilation_h, const int dilation_w,
                                        const int deformable_group, float *grad_im);

  /*!
   * \brief Launch the modulated deformable col2im "coord" kernel on `stream`, writing to
   *        `grad_offset` and `grad_mask`.
   *
   * Parameter names match the definition (`kernel_w`, previously misspelled here).
   */
  void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
                                         const float *data_col, const float *data_im, const float *data_offset, const float *data_mask,
                                         const int batch_size, const int channels, const int height_im, const int width_im,
                                         const int height_col, const int width_col, const int kernel_h, const int kernel_w,
                                         const int pad_h, const int pad_w, const int stride_h, const int stride_w,
                                         const int dilation_h, const int dilation_w,
                                         const int deformable_group,
                                         float *grad_offset, float *grad_mask);

// #ifdef __cplusplus
// }
// #endif

#endif

================================================
FILE: centernet/dcnv2Plugin/dcnv2Plugin.cpp
================================================
#include "dcnv2Plugin.h"
#include <iostream>

using namespace nvinfer1;
using nvinfer1::plugin::DeformableConvolutionalLayer;
using nvinfer1::plugin::DCNv2PluginCreator;

namespace
{
// Version string reported by both the plugin and its creator.
const char* DCNv2_PLUGIN_VERSION = "1";
// Type name reported by both the plugin and its creator.
const char* DCNv2_PLUGIN_NAME = "DCNv2_TRT";
} // namespace

// Evaluate a CUDA runtime call once and, if it did not yield cudaSuccess, return the
// cudaError_t from the *enclosing* function. It can therefore only be used inside
// functions whose return type accepts a cudaError_t (e.g. the int-returning initialize()).
#define CHECK_CUDA(call)                                                                                               \
    do                                                                                                                 \
    {                                                                                                                  \
        cudaError_t status = call;                                                                                     \
        if (status != cudaSuccess)                                                                                     \
        {                                                                                                              \
            return status;                                                                                             \
        }                                                                                                              \
    } while (0)

// Storage for the creator's static members: the field collection handed out by
// getFieldNames() and the vector that backs its `fields` pointer. Both are shared by
// every DCNv2PluginCreator instance.
PluginFieldCollection DCNv2PluginCreator::mFC{};
std::vector<PluginField> DCNv2PluginCreator::mPluginAttributes;

// Parameterized constructor.
// Stores the convolution hyper-parameters and uploads weight[0] / bias[0] to device
// memory via copyToDevice() (the values are copied as float).
// The scratch buffers and the cuBLAS handle start out null so that terminate(), which
// tests the buffers against null, is well defined even if initialize() never ran.
DeformableConvolutionalLayer::DeformableConvolutionalLayer(
                         int out_channels,
                         int kernel,
                         int deformable_group,
                         int dilation,
                         int padding,
                         int stride,
                         const Weights* weight, const Weights* bias):
                         out_channels(out_channels),kernel_size(kernel),deformable_group(deformable_group),
                         dilation(dilation),padding(padding),stride(stride),
                         mOne(nullptr),mColumn(nullptr),mCublas(nullptr){
        mWeight = copyToDevice(weight[0].values, weight[0].count);
        mBias = copyToDevice(bias[0].values, bias[0].count);
}

// Deserializing constructor: rebuilds a plugin from the byte stream produced by serialize().
// The read order below must mirror the write order in serialize(): eleven ints of
// shape/hyper-parameters, then (count, float payload) for the weight and for the bias.
// The scratch buffers and the cuBLAS handle start out null so that terminate(), which
// tests the buffers against null, is well defined even if initialize() never ran.
DeformableConvolutionalLayer::DeformableConvolutionalLayer(const void* buffer, size_t length)
    : mOne(nullptr), mColumn(nullptr), mCublas(nullptr)
{
    const char* d = static_cast<const char*>(buffer);
    const char* a = d;
    in_channels = read<int>(d);
    height = read<int>(d);
    width = read<int>(d);
    height_out = read<int>(d);
    width_out = read<int>(d);

    out_channels = read<int>(d);
    kernel_size = read<int>(d);
    deformable_group = read<int>(d);
    dilation = read<int>(d);
    padding = read<int>(d);
    stride = read<int>(d);

    // Each tensor is stored as an int element count followed by that many floats.
    int count = read<int>(d);
    mWeight = deserializeToDevice(d, count);
    count = read<int>(d);
    mBias = deserializeToDevice(d, count);

    // The whole buffer must have been consumed, no more and no less.
    ASSERT(d == a + length);
}

int DeformableConvolutionalLayer::getNbOutputs() const
{
    // Plugin layer has exactly one output (getOutputDimensions only accepts index 0)
    return 1;
}

int DeformableConvolutionalLayer::initialize()
{
    size_t oneSize = height_out * width_out * sizeof(float);
    std::vector<float> one_((int)oneSize, 1.0f);
    CHECK_CUDA(cudaMalloc((void**)&mOne, oneSize));
    CHECK_CUDA(cudaMalloc((void**)&mColumn, in_channels * kernel_size * kernel_size * oneSize));
    CHECK_CUDA(cudaMemcpy(mOne, one_.data(), oneSize, cudaMemcpyHostToDevice));
    return STATUS_SUCCESS; 
}

// Compute the shape of the single output and cache the input/output geometry in the
// plugin (in_channels, height, width, height_out, width_out) as a side effect.
// Only inputs[0] is inspected; its dimensions are read as (channels, height, width).
// Spatial size: (in + 2 * padding - (dilation * (kernel_size - 1) + 1)) / stride + 1.
Dims DeformableConvolutionalLayer::getOutputDimensions(int index, const Dims* inputs, int nbInputs)
{
    ASSERT(index == 0);
    ASSERT(nbInputs == 3);

    const Dims& image = inputs[0];
    in_channels = image.d[0];
    height = image.d[1];
    width = image.d[2];

    // Extent covered by the dilated kernel along one axis.
    const int dilated_extent = dilation * (kernel_size - 1) + 1;
    height_out = (image.d[1] + 2 * padding - dilated_extent) / stride + 1;
    width_out = (image.d[2] + 2 * padding - dilated_extent) / stride + 1;

    return Dims3(out_channels, height_out, width_out);
}

// No workspace is requested: the scratch buffers (mOne, mColumn) are allocated by
// initialize() instead, and maxBatchSize is not consulted.
size_t DeformableConvolutionalLayer::getWorkspaceSize(int maxBatchSize) const
{
    return 0;
}

int DeformableConvolutionalLayer::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream)
{
    const float* input = static_cast<const float *>(inputs[0]);
    const float* offset = static_cast<const float *>(inputs[1]);
    const float* offset_mask = static_cast<const float *>(inputs[2]);
    const float* mask = offset_mask + deformable_group * 2 * kernel_size * kernel_size * height * width;
    float * output = static_cast<float *>(outputs[0]);

    float alpha{1}, beta{0};

    // Do Bias first:
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    // (N x 1) (1 x M)
    int m_ = out_channels;
    int n_ = height_out * width_out;
    int k_ = 1;
    cublasSgemm(mCublas, CUBLAS_OP_T, CUBLAS_OP_N, n_, m_, k_, &alpha,
                mOne, k_,
                static_cast<const float *>(mBias.values), k_, &beta,
                output, n_);

    modulated_deformable_im2col_cuda(stream, input, offset, mask,
                                    1, in_channels, height, width,
                                    height_out, width_out, kernel_size, kernel_size,
                                    padding, padding, stride, stride, dilation, dilation,
                                    deformable_group, mColumn); 

    //(k * m)  x  (m * n)
    // Y = WC
    int m = out_channels;
    int n = height_out * width_out;
    int k = in_channels * kernel_size * kernel_size;
    cublasSgemm(mCublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
                mColumn, n,
                static_cast<const float *>(mWeight.values), k, &alpha,
                output, n);
    
    return 0;
}

// Number of bytes serialize() writes: 13 ints (11 shape/hyper-parameter values plus the
// two element counts) followed by the weight and bias payloads as floats.
size_t DeformableConvolutionalLayer::getSerializationSize() const
{
    const size_t header_bytes = sizeof(int) * 13;
    const size_t payload_bytes = (mWeight.count + mBias.count) * sizeof(float);
    return header_bytes + payload_bytes;
}

// Write the plugin state into `buffer`. The field order here is the wire format and must
// stay in sync with the deserializing constructor and with getSerializationSize().
void DeformableConvolutionalLayer::serialize(void* buffer) const
{
    // d is the moving write cursor, a remembers the start for the final size check.
    char *d = reinterpret_cast<char*>(buffer), *a = d;
    // Geometry cached by getOutputDimensions().
    write(d, in_channels);
    write(d, height);
    write(d, width);
    write(d, height_out);
    write(d, width_out);

    // Convolution hyper-parameters.
    write(d, out_channels);    
    write(d, kernel_size);
    write(d, deformable_group);
    write(d, dilation);
    write(d, padding);
    write(d, stride);

    // Each tensor: element count as int, then the float values copied back from the device.
    write(d, (int) mWeight.count);
    serializeFromDevice(d, mWeight);
    write(d, (int) mBias.count);
    serializeFromDevice(d, mBias);

    // Exactly getSerializationSize() bytes must have been written.
    ASSERT(d == a + getSerializationSize());
}

// The plugin only accepts float tensors in NCHW layout.
bool DeformableConvolutionalLayer::supportsFormat(DataType type, PluginFormat format) const
{
    if (type != DataType::kFLOAT)
    {
        return false;
    }
    return format == PluginFormat::kNCHW;
}

// Allocate device memory for `count` floats, copy them from host memory and wrap the
// device pointer in a kFLOAT Weights descriptor. CUDA failures are handled by CUASSERT.
// NOTE(review): nothing visible in this file frees the returned allocation.
Weights DeformableConvolutionalLayer::copyToDevice(const void* hostData, size_t count)
{
    const size_t num_bytes = count * sizeof(float);
    void* device_ptr = nullptr;
    CUASSERT(cudaMalloc(&device_ptr, num_bytes));
    CUASSERT(cudaMemcpy(device_ptr, hostData, num_bytes, cudaMemcpyHostToDevice));

    Weights uploaded{DataType::kFLOAT, device_ptr, int64_t(count)};
    return uploaded;
}

// Copy the float payload of `deviceWeights` from the device into `hostBuffer` and move the
// caller's cursor past the bytes that were written.
void DeformableConvolutionalLayer::serializeFromDevice(char*& hostBuffer, Weights deviceWeights) const
{
    const size_t payload_bytes = deviceWeights.count * sizeof(float);
    CUASSERT(cudaMemcpy(hostBuffer, deviceWeights.values, payload_bytes, cudaMemcpyDeviceToHost));
    hostBuffer += payload_bytes;
}

// Upload `count` floats starting at `hostBuffer` to the device and move the caller's read
// cursor past them.
Weights DeformableConvolutionalLayer::deserializeToDevice(const char*& hostBuffer, size_t count)
{
    const char* const payload = hostBuffer;
    hostBuffer += count * sizeof(float);
    return copyToDevice(payload, count);
}

// Plugin type name; the same constant is returned by DCNv2PluginCreator::getPluginName().
const char* DeformableConvolutionalLayer::getPluginType() const
{
    return DCNv2_PLUGIN_NAME;
}

// Plugin version string; the same constant is returned by DCNv2PluginCreator::getPluginVersion().
const char* DeformableConvolutionalLayer::getPluginVersion() const
{
    return DCNv2_PLUGIN_VERSION;
}

// Release the scratch buffers allocated by initialize(). Pointers are reset to null after
// being freed, so calling terminate() again is a no-op.
void DeformableConvolutionalLayer::terminate()
{
    auto release = [](float*& buffer) {
        if (buffer)
        {
            cudaFree(buffer);
            buffer = nullptr;
        }
    };

    release(mOne);
    release(mColumn);
}

// Deletes this heap-allocated plugin object; the object must not be used afterwards.
// NOTE(review): the destructor is defaulted, so the device buffers behind mWeight/mBias
// are not freed here - confirm whether that leak is intended.
void DeformableConvolutionalLayer::destroy()
{
    delete this;
}

// Create a heap-allocated copy of this plugin with the same namespace.
// The copy comes from the implicit copy constructor, i.e. it is member-wise: the clone
// holds the same device pointers (mWeight, mBias, mOne, mColumn) as the original.
IPluginV2Ext* DeformableConvolutionalLayer::clone() const
{
    auto* copy = new DeformableConvolutionalLayer(*this);
    copy->setPluginNamespace(mPluginNamespace.c_str());
    return copy;
}

// Store a copy of the given namespace string in the plugin.
void DeformableConvolutionalLayer::setPluginNamespace(const char* pluginNamespace)
{
    mPluginNamespace.assign(pluginNamespace);
}

// Returns the stored namespace. The pointer refers to the plugin's own string, so it is
// only valid until the namespace is changed or the plugin is destroyed.
const char* DeformableConvolutionalLayer::getPluginNamespace() const
{
    return mPluginNamespace.c_str();
}

// Return the DataType of the plugin output at the requested index.
// The arguments are ignored: the answer is kFLOAT for every index and input type.
DataType DeformableConvolutionalLayer::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
{
    // Only DataType::kFLOAT is acceptable by the plugin layer
    return DataType::kFLOAT;
}
// Return true if output tensor is broadcast across a batch.
// This plugin never broadcasts its output, whatever the arguments are.
bool DeformableConvolutionalLayer::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
{
    return false;
}

// Return true if plugin can use input that is broadcast across batch without replication.
// This plugin reports false for every input index.
bool DeformableConvolutionalLayer::canBroadcastInputAcrossBatch(int inputIndex) const
{
    return false;
}

// Configure the layer with input and output data types.
// inputDims: input dimensions for the plugin layer
// nbInputs: number of inputs to the plugin layer
// outputDims: output dimensions from the plugin layer
// nbOutputs: number of outputs from the plugin layer
// inputTypes / outputTypes: data types of the inputs / outputs
// floatFormat: tensor format (NCHW, NHWC, etc.)
// maxBatchSize: maximum batch size for the plugin layer
//
// The implementation stores nothing; it only asserts that the *first* input is kFLOAT and
// that the format is kNCHW. The remaining arguments are not inspected.
void DeformableConvolutionalLayer::configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
    const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
    const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize)
{
    ASSERT(*inputTypes == DataType::kFLOAT && floatFormat == PluginFormat::kNCHW);
}

// Attach the plugin object to an execution context and grant the plugin the access to some context resource.
// Only the cuBLAS handle is kept (it is used by enqueue()); the cuDNN context and the GPU
// allocator are ignored.
void DeformableConvolutionalLayer::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
{
    mCublas = cublasContext;
}

// Detach the plugin object from its execution context.
// Intentionally empty: the stored cuBLAS handle is left as it is.
void DeformableConvolutionalLayer::detachFromContext() {}

// Register the names and types of the attributes understood by createPlugin() and publish
// them through the static field collection returned by getFieldNames().
DCNv2PluginCreator::DCNv2PluginCreator()
{
    // mPluginAttributes is static and therefore shared by every creator instance. Start
    // from an empty list so that constructing another creator does not append a second,
    // duplicate set of fields.
    mPluginAttributes.clear();
    mPluginAttributes.emplace_back(PluginField("out_channels", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("kernel", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("deformable_group", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("dilation", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("padding", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("stride", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("weight", nullptr, PluginFieldType::kFLOAT32, 1));
    mPluginAttributes.emplace_back(PluginField("bias", nullptr, PluginFieldType::kFLOAT32, 1));

    // Point the collection at the vector's storage; set last so the pointer is not
    // invalidated by a later reallocation.
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

// Name of the plugin this creator builds; matches DeformableConvolutionalLayer::getPluginType().
const char* DCNv2PluginCreator::getPluginName() const
{
    return DCNv2_PLUGIN_NAME;
}

// Version of the plugin this creator builds; matches DeformableConvolutionalLayer::getPluginVersion().
const char* DCNv2PluginCreator::getPluginVersion() const
{
    return DCNv2_PLUGIN_VERSION;
}

// Returns the static field collection filled in by the constructor (attribute names and
// types accepted by createPlugin()).
const PluginFieldCollection* DCNv2PluginCreator::getFieldNames()
{
    return &mFC;
}

// Build a DeformableConvolutionalLayer from a plugin field collection.
//
// Recognised fields: the kINT32 scalars "out_channels", "kernel", "deformable_group",
// "dilation", "stride", "padding", and the kFLOAT32 arrays "weight" and "bias" (their
// `length` is the element count). Unknown field names are skipped. `name` is not used.
//
// A scalar that is absent from `fc` keeps the value it is initialised with below, rather
// than being read uninitialised.
// The returned plugin is heap-allocated and carries the creator's namespace.
IPluginV2Ext* DCNv2PluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
{
    std::vector<float> weight;
    std::vector<float> bias;
    // stride defaults to 1 because getOutputDimensions() divides by it.
    int out_channels{0}, kernel{0}, deformable_group{1}, padding{0}, stride{1}, dilation{1};

    // Read one kINT32 scalar attribute.
    auto readInt = [](const PluginField& field) -> int {
        ASSERT(field.type == PluginFieldType::kINT32);
        return *(static_cast<const int*>(field.data));
    };
    // Copy one kFLOAT32 array attribute into host storage.
    auto readFloats = [](const PluginField& field, std::vector<float>& dst) {
        ASSERT(field.type == PluginFieldType::kFLOAT32);
        const auto* values = static_cast<const float*>(field.data);
        dst.assign(values, values + field.length);
    };

    const PluginField* fields = fc->fields;
    for (int i = 0; i < fc->nbFields; ++i)
    {
        const char* attrName = fields[i].name;
        if (!strcmp(attrName, "out_channels"))
        {
            out_channels = readInt(fields[i]);
        }
        else if (!strcmp(attrName, "kernel"))
        {
            kernel = readInt(fields[i]);
        }
        else if (!strcmp(attrName, "deformable_group"))
        {
            deformable_group = readInt(fields[i]);
        }
        else if (!strcmp(attrName, "dilation"))
        {
            dilation = readInt(fields[i]);
        }
        else if (!strcmp(attrName, "stride"))
        {
            stride = readInt(fields[i]);
        }
        else if (!strcmp(attrName, "padding"))
        {
            padding = readInt(fields[i]);
        }
        else if (!strcmp(attrName, "weight"))
        {
            readFloats(fields[i], weight);
        }
        else if (!strcmp(attrName, "bias"))
        {
            readFloats(fields[i], bias);
        }
    }

    // Host-side views; the plugin constructor copies the values to the device, so the
    // vectors only need to outlive the constructor call.
    Weights mWeight{DataType::kFLOAT, weight.data(), (int64_t) weight.size()};
    Weights mBias{DataType::kFLOAT, bias.data(), (int64_t) bias.size()};

    DeformableConvolutionalLayer* obj = new DeformableConvolutionalLayer(out_channels,
                         kernel,
                         deformable_group,
                         dilation,
                         padding,
                         stride,
                         &mWeight, &mBias);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

// Rebuild a plugin from bytes previously produced by DeformableConvolutionalLayer::serialize().
// `name` is not used.
IPluginV2Ext* DCNv2PluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
{
    // The object is heap-allocated here and is released through
    // DeformableConvolutionalLayer::destroy(), which deletes it.
    // NOTE(review): presumably TensorRT calls destroy() when the owning engine goes away - confirm.
    DeformableConvolutionalLayer* obj = new DeformableConvolutionalLayer(serialData, serialLength);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

================================================
FILE: centernet/dcnv2Plugin/dcnv2Plugin.h
================================================
/*
 * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef TRT_DCNV2_PLUGIN_H
#define TRT_DCNV2_PLUGIN_H
#include "kernel.h"
#include "plugin.h"
#include "dcn_v2_im2col_cuda.h"

#include "serialize.hpp"
#include <cudnn.h>
#include <vector>
#include <cublas_v2.h>
#include <cuda.h>
#include <string>
#include <vector>

using namespace nvinfer1::plugin;
namespace nvinfer1
{
namespace plugin
{

class DeformableConvolutionalLayer : public IPluginV2Ext
{
public:
    DeformableConvolutionalLayer(int out_channels,
                         int kernel,
                         int deformable_group,
                         int dilation,
                         int padding,
                         int stride,
                         const Weights* weight, const Weights* bias);

    DeformableConvolutionalLayer(const void* buffer, size_t length);

    ~DeformableConvolutionalLayer() override = default;

    int getNbOutputs() const override;

    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputs) override;

    int initialize() override;

    void terminate() override;

    size_t getWorkspaceSize(int maxBatchSize) const override;

    int enqueue(
        int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override;

    size_t getSerializationSize() const override;

    void serialize(void* buffer) const override;

    bool supportsFormat(DataType type, PluginFormat format) const override;

    const char* getPluginType() const override;

    const char* getPluginVersion() const override;

    void destroy() override;

    IPluginV2Ext* clone() const override;

    void setPluginNamespace(const char* pluginNamespace) override;

    const char* getPluginNamespace() const override;

    DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const override;

    void attachToContext(
        cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;

    void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
        const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
        const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) override;

    void detachFromContext() override;

private:
    Weights copyToDevice(const void* hostData, size_t count);
    void serializeFromDevice(char*& hostBuffer, Weights deviceWeights) const;
    Weights deserializeToDevice(const char*& hostBuffer, size_t count);

    std::string mPluginNamespace;

    int in_channels{};
    int height_out{};
    int width_out{};
    int height{};
    int width{};

    int out_channels{};
    int kernel_size{};
    int deformable_group{};
    int dilation{};
    int padding{};
    int stride{};

    Weights mWeight{};
    Weights mBias{};

    float* mOne;
    float* mColumn;

    cublasHandle_t mCublas;
};

// Creator for DeformableConvolutionalLayer: reports the plugin name/version, advertises
// the accepted attributes, and builds plugins either from a field collection or from
// serialized bytes.
class DCNv2PluginCreator : public BaseCreator
{
public:
    // Fills the static attribute list and field collection.
    DCNv2PluginCreator();

    ~DCNv2PluginCreator() override = default;

    const char* getPluginName() const override;

    const char* getPluginVersion() const override;

    const PluginFieldCollection* getFieldNames() override;

    // Builds a plugin from named attributes (hyper-parameters, weight, bias).
    IPluginV2Ext* createPlugin(const char* name, const PluginFieldCollection* fc) override;

    // Builds a plugin from the output of DeformableConvolutionalLayer::serialize().
    IPluginV2Ext* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

private:
    // Shared by all creator instances; its `fields` pointer refers to mPluginAttributes.
    static PluginFieldCollection mFC;

    // Parameters for DeformableConvolutionalLayer
    static std::vector<PluginField> mPluginAttributes;
};
} // namespace plugin
} // namespace nvinfer1

#endif // TRT_DCNV2_PLUGIN_H


================================================
FILE: centernet/sample/common.py
================================================
#
# Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
#
# NOTICE TO LICENSEE:
#
# This source code and/or documentation ("Licensed Deliverables") are
# subject to NVIDIA intellectual property rights under U.S. and
# international Copyright laws.
#
# These Licensed Deliverables contained herein is PROPRIETARY and
# CONFIDENTIAL to NVIDIA and is being provided under the terms and
# conditions of a form of NVIDIA software license agreement by and
# between NVIDIA and Licensee ("License Agreement") or electronically
# accepted by Licensee.  Notwithstanding any terms or conditions to
# the contrary in the License Agreement, reproduction or disclosure
# of the Licensed Deliverables to any third party without the express
# written consent of NVIDIA is prohibited.
#
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THESE LICENSED DELIVERABLES.
#
# U.S. Government End Users.  These Licensed Deliverables are a
# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
# 1995), consisting of "commercial computer software" and "commercial
# computer software documentation" as such terms are used in 48
# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
# only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
# U.S. Government End Users acquire the Licensed Deliverables with
# only those rights set forth herein.
#
# Any use of the Licensed Deliverables in individual and commercial
# software must include, in the user documentation and internal
# comments to the code, the above Disclaimer and U.S. Government End
# Users Notice.
#

from itertools import chain
import argparse
import os

import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

import tensorrt as trt

try:
    # Sometimes python2 does not understand FileNotFoundError
    # (evaluating the bare name raises NameError where the builtin is missing).
    FileNotFoundError
except NameError:
    # Fall back to IOError so code below can always refer to FileNotFoundError.
    FileNotFoundError = IOError

# Bit mask with a single bit set, at the position given by the integer value of the
# EXPLICIT_BATCH network-creation flag.
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def GiB(val):
    """Return ``val`` gibibytes expressed in bytes, i.e. ``val`` shifted left by 30 bits."""
    # ``*`` binds tighter than ``<<``, so the multiplication by one happens first.
    scaled = val * 1
    return scaled << 30


def add_help(description):
    """Parse the command line with a parser that only provides ``-h/--help``.

    Unknown arguments are tolerated and the parse result is discarded, so the only
    visible effect is argparse's handling of the help flag. Returns None.

    Args:
        description (str): Description shown in the help text.
    """
    help_parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    _known, _unknown = help_parser.parse_known_args()


def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
    '''
    Parses the ``-d/--datadir`` option and resolves the sample's data files.

    Args:
        description (str): Description of the sample (shown in ``--help``).
        subfolder (str): The subfolder containing data relevant to this sample.
        find_files (List[str]): Filenames to find; each is resolved to an absolute path.
        err_msg (str): Extra text appended to the error raised when a file is missing.

    Returns:
        Tuple[List[str], List[str]]: The data directories that will be searched,
        and the located files as returned by ``locate_files``.
    '''
    default_root = os.path.join(os.sep, "usr", "src", "tensorrt", "data")

    # Standard command-line arguments for all samples. With action="append",
    # user-supplied directories are added after the default one.
    cli = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory, and any additional data directories.", action="append", default=[default_root])
    parsed, _unknown = cli.parse_known_args()

    def resolve(data_dir):
        # Warnings are only printed for directories the user asked for explicitly.
        user_supplied = data_dir != default_root
        candidate = os.path.join(data_dir, subfolder)
        if os.path.exists(candidate):
            # The subfolder exists: use it.
            return candidate
        if user_supplied:
            print("WARNING: " + candidate + " does not exist. Trying " + data_dir + " instead.")
        # Fall back to the directory itself and make sure it exists.
        if not os.path.exists(data_dir) and user_supplied:
            print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(data_dir))
        return data_dir

    data_paths = []
    for data_dir in parsed.datadir:
        data_paths.append(resolve(data_dir))
    return data_paths, locate_files(data_paths, find_files, err_msg)

def locate_files(data_paths, filenames, err_msg=""):
    """
    Resolves each requested filename against the given data directories.
    Directories are tried in order; the first one containing a file wins.

    Args:
        data_paths (List[str]): The data directories.
        filenames (List[str]): The names of the files to find.
        err_msg (str): Extra text appended to the error message.

    Returns:
        List[str]: The absolute paths of the files, in the order of ``filenames``.

    Raises:
        FileNotFoundError if a file could not be located.
    """
    resolved = [None] * len(filenames)
    for directory in data_paths:
        for slot, name in enumerate(filenames):
            if resolved[slot]:
                # Already found in an earlier directory.
                continue
            candidate = os.path.abspath(os.path.join(directory, name))
            if os.path.exists(candidate):
                resolved[slot] = candidate

    # Every slot must hold a path that still exists.
    for path, name in zip(resolved, filenames):
        if not path or not os.path.exists(path):
            raise FileNotFoundError("Could not find {:}. Searched in data paths: {:}\n{:}".format(name, data_paths, err_msg))
    return resolved

# Small record pairing a host buffer with its device counterpart
# (a little nicer to use than a 2-tuple).
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Four newline-separated parts: label, host buffer, label, device buffer.
        return "\n".join(["Host:", str(self.host), "Device:", str(self.device)])

    def __repr__(self):
        # repr() intentionally mirrors str().
        return str(self)

# Allocates every buffer an engine needs: one page-locked host array and one
# device allocation per binding, plus a CUDA stream.
def allocate_buffers(engine):
    """Create host/device buffers for all bindings of ``engine``.

    Returns:
        tuple: ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of HostDeviceMem, ``bindings`` lists the device
        addresses as ints in binding order, and ``stream`` is a new CUDA stream.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        # Element count: per-sample volume times the engine's max batch size.
        n_elems = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(n_elems, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        # File the pair under inputs or outputs.
        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# Works for any number of inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one batched inference on ``stream`` and return the host output buffers."""
    # Host -> device copies for every input.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Enqueue the network execution.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Device -> host copies for every output.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Block until everything queued on the stream has finished.
    stream.synchronize()
    # Only the host side of each output is handed back.
    host_results = []
    for out in outputs:
        host_results.append(out.host)
    return host_results

# Works for any number of inputs/outputs, for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    """Run one inference via ``execute_async_v2`` and return the host output buffers."""
    # Host -> device copies for every input.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Enqueue the network execution (no batch size argument in the v2 call).
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Device -> host copies for every output.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Block until everything queued on the stream has finished.
    stream.synchronize()
    # Only the host side of each output is handed back.
    host_results = []
    for out in outputs:
        host_results.append(out.host)
    return host_results


# `retry_call` and `retry` are used to wrap the function we want to try multiple times
def retry_call(func, args=[], kwargs={}, n_retries=3):
    """Call a function, retrying it when it raises.

    Args:
        func: function to call
        args (List): positional args passed to func
        kwargs (Dict): keyword args passed to func
        n_retries (int): maximum number of attempts

    Returns:
        Whatever ``func`` returns on its first successful attempt, or None
        when ``n_retries`` is not positive (``func`` is then never called).

    Raises:
        The exception of the last attempt once all attempts have failed.
    """
    # The default `args`/`kwargs` objects are only unpacked, never mutated,
    # so sharing them between calls is harmless.
    for i_try in range(n_retries):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and SystemExit
            # propagate immediately instead of being swallowed by a retry.
            if i_try == n_retries - 1:
                raise
            print("retry...")

# Usage: @retry(n_retries)
def retry(n_retries=3):
    """Wrap a function to retry it several times. Decorator version of `retry_call`.

    Args:
        n_retries (int): maximum number of attempts

    Usage:
        @retry(n_retries)
        def func(...):
            pass
    """
    import functools

    def wrapper(func):
        # Keep the wrapped function's name and docstring visible to callers.
        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            # Hand the wrapped function's result back to the caller.
            return retry_call(func, args, kwargs, n_retries)
        return _wrapper
    return wrapper


================================================
FILE: centernet/sample/test.py
================================================
import cv2 as cv
import numpy as np

import tensorrt as trt
import common

import torch
import time
from sys import argv

# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Runs at import time, before the engine is deserialized in the __main__ block below.
# NOTE(review): presumably needed so plugin layers inside the engine can be resolved — confirm.
trt.init_libnvinfer_plugins(TRT_LOGGER, '')


def _gather_feat(feat, ind, mask=None):
    dim = feat.size(2)
    ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
    feat = feat.gather(1, ind)
    if mask is not None:
        mask = mask.unsqueeze(2).expand_as(feat)
        feat = feat[mask]
        feat = feat.view(-1, dim)
    return feat


def _transpose_and_gather_feat(feat, ind):
    feat = feat.permute(0, 2, 3, 1).contiguous()
    feat = feat.view(feat.size(0), -1, feat.size(3))
    feat = _gather_feat(feat, ind)
    return feat


def pre_process(image):
    """Pad an image to a square, resize it to 512x512 and normalize it.

    The image is pasted into the top-left corner of a zero-filled square whose
    side is the longer of its height and width. The square is resized to
    512x512, mapped from [0, 255] to [-1, 1] and transposed from HWC to CHW.

    Args:
        image: array of shape (height, width, 3).

    Returns:
        tuple: ``(images, scale)`` — the float32 CHW array and
        ``long_size / 512``, the ratio of the padded side to the network input.
    """
    height, width = image.shape[0], image.shape[1]
    # Only the spatial dimensions decide the padded size (not the channel axis).
    long_size = max(height, width)
    img = np.zeros((long_size, long_size, 3))
    # The slice must use the source image's width; using the padded canvas's
    # width here breaks for images that are taller than they are wide.
    img[:height, :width, :] = image
    img = cv.resize(img, (512, 512))
    inp_image = ((img / 255. - 0.5) / 0.5).astype(np.float32)
    images = inp_image.transpose(2, 0, 1)
    return images, long_size / 512


def _nms(heat, kernel=3):
    pad = (kernel - 1) // 2

    hmax = torch.nn.functional.max_pool2d(
        heat, (kernel, kernel), stride=1, padding=pad)
    keep = (hmax == heat).float()
    return heat * keep


def _topk(scores, K=40):
    batch, cat, height, width = scores.size()

    topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)

    topk_inds = topk_inds % (height * width)
    topk_ys = (topk_inds.true_divide(width)).int().float()
    topk_xs = (topk_inds % width).int().float()

    topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K)
    topk_clses = (topk_ind.true_divide(K)).int()
    topk_inds = _gather_feat(
        topk_inds.view(batch, -1, 1), topk_ind).view(batch, K)
    topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K)
    topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K)

    return topk_score, topk_inds, topk_clses, topk_ys, topk_xs


def ctdet_decode(heat, wh, reg=None, cat_spec_wh=False, K=100):
    """Decode heatmap, size and offset maps into detections.

    Args:
        heat: 4-D class heatmap logits (sigmoid is applied here).
        wh: box width/height map.
        reg: optional centre-offset map; without it centres are shifted by 0.5.
        cat_spec_wh: when True, ``wh`` holds one (w, h) pair per class.
        K: number of detections to keep.

    Returns:
        Tensor of shape ``(batch, K, 6)``: x1, y1, x2, y2, score, class.
    """
    batch, cat, height, width = heat.size()

    # Sigmoid, then heatmap NMS so only local maxima survive.
    heat = _nms(torch.sigmoid(heat))
    scores, inds, clses, ys, xs = _topk(heat, K=K)

    xs = xs.view(batch, K, 1)
    ys = ys.view(batch, K, 1)
    if reg is None:
        xs = xs + 0.5
        ys = ys + 0.5
    else:
        offsets = _transpose_and_gather_feat(reg, inds).view(batch, K, 2)
        xs = xs + offsets[:, :, 0:1]
        ys = ys + offsets[:, :, 1:2]

    wh = _transpose_and_gather_feat(wh, inds)
    if not cat_spec_wh:
        wh = wh.view(batch, K, 2)
    else:
        # Pick the (w, h) pair belonging to each detection's class.
        wh = wh.view(batch, K, cat, 2)
        cls_index = clses.view(batch, K, 1, 1).expand(batch, K, 1, 2).long()
        wh = wh.gather(2, cls_index).view(batch, K, 2)

    clses = clses.view(batch, K, 1).float()
    scores = scores.view(batch, K, 1)

    half_w = wh[..., 0:1] / 2
    half_h = wh[..., 1:2] / 2
    bboxes = torch.cat([xs - half_w, ys - half_h, xs + half_w, ys + half_h], dim=2)
    return torch.cat([bboxes, scores, clses], dim=2)


if __name__ == '__main__':
    # Usage: python test.py <engine_path> <image_path>
    try:
        engine_path = argv[1]
        img_path = argv[2]
    except IndexError:
        print('engine path and image path are needed!')
        raise SystemExit(1)
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(f.read()) as engine:
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        with engine.create_execution_context() as context:
            # Read the image given on the command line (not a fixed file name).
            img = cv.imread(img_path)
            if img is None:
                print('failed to read image: ' + img_path)
                raise SystemExit(1)
            # Keep an untouched copy to draw the detections on.
            dis = img.copy()
            img, s = pre_process(img)
            # Copy to the pagelocked input buffer
            np.copyto(inputs[0].host, img.ravel())
            [hm, wh, reg] = common.do_inference(
                context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=1)

            # The flat outputs are reshaped to the fixed layouts this sample expects:
            # 80 heatmap channels and 2 channels each for size and offset, on a 128x128 grid.
            [dets] = ctdet_decode(torch.from_numpy(hm.reshape(1, 80, 128, 128)), torch.from_numpy(
                wh.reshape(1, 2, 128, 128)), torch.from_numpy(reg.reshape(1, 2, 128, 128)))

            for det in dets:
                # det = [x1, y1, x2, y2, score, class]; keep confident boxes only.
                if det[-2] > 0.5:
                    # Map from the 128-cell grid back to the source image:
                    # 4 = 512 / 128, s = padded image side / 512.
                    det[:4] *= 4*s
                    cv.rectangle(dis, (int(det[0]), int(
                        det[1])), (int(det[2]), int(det[3])), 255, 1)
                    cv.putText(dis, '%d' %
                               int(det[-1]), (int(det[0]), int(det[1])), 1, 1, 255)

            cv.imwrite('trt_out.jpg', dis)


================================================
FILE: contributing.md
================================================
# How to Contribute

1. Fork this repo to your github account

2. Clone your fork

3. Create a feature branch

4. Make changes, including but not limited to adding new models, bug fixes, documentation, tutorials, etc.

5. Run the pre-commit checks and push. We use clang-format for coding-style checking, and the coding style follows the Google C++ style with 4-space indentation.

```bash
pip install pre-commit clang-format

cd tensorrtx
pre-commit install
git add [files-to-commit]
pre-commit run

# fix pre-commit errors, then git add files-to-commit again
git add [files-to-commit]

git commit -m "describe your commit"

git push origin [feature-branch]
```

6. Submit a pull-request on github web UI to master branch of wang-xinyu/tensorrtx.


================================================
FILE: convnextv2/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.10)
project(convnextv2)

# Root of the TensorRT installation. The default matches the path this project
# was written against; override it at configure time when TensorRT lives elsewhere:
#   cmake -B build -DTENSORRT_ROOT=/path/to/TensorRT
set(TENSORRT_ROOT "/usr/local/TensorRT-8.6.1.6" CACHE PATH "TensorRT installation root")
set(TENSORRT_INCLUDE_DIR "${TENSORRT_ROOT}/include")
set(TENSORRT_LIB_DIR "${TENSORRT_ROOT}/targets/x86_64-linux-gnu/lib")

find_package(CUDA REQUIRED)
find_package(OpenCV REQUIRED)

include_directories(${CUDA_INCLUDE_DIRS} /usr/local/cuda/include ${TENSORRT_INCLUDE_DIR})
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/local/cuda/lib64 ${TENSORRT_LIB_DIR})

# TRT
# NO_DEFAULT_PATH restricts the search to the selected TensorRT installation.
find_library(NVINFER nvinfer PATHS ${TENSORRT_LIB_DIR} NO_DEFAULT_PATH)
find_library(NVINFER_PLUGIN nvinfer_plugin PATHS ${TENSORRT_LIB_DIR} NO_DEFAULT_PATH)
# nvparsers is looked up but not linked into any target below.
find_library(NVPARSERS nvparsers PATHS ${TENSORRT_LIB_DIR} NO_DEFAULT_PATH)

# Fail early with a clear hint instead of a *-NOTFOUND error at generate time.
if(NOT NVINFER OR NOT NVINFER_PLUGIN)
    message(FATAL_ERROR "TensorRT libraries not found in ${TENSORRT_LIB_DIR}. "
                        "Set -DTENSORRT_ROOT=<TensorRT installation root>.")
endif()

set(CMAKE_CXX_STANDARD 14)

# Engine builder
cuda_add_executable(convnextv2 src/convnextv2.cpp src/LayerNormPlugin.cu)
target_link_libraries(convnextv2 ${NVINFER} ${NVINFER_PLUGIN} ${CUDA_LIBRARIES} ${OpenCV_LIBS})

# Shared library holding only the LayerNorm plugin
cuda_add_library(layernorm_plugin SHARED src/LayerNormPlugin.cu)
target_link_libraries(layernorm_plugin ${NVINFER} ${NVINFER_PLUGIN} ${CUDA_LIBRARIES})

# Inference executable
cuda_add_executable(inference_cpp src/inference_cpp.cpp src/LayerNormPlugin.cu)
target_link_libraries(inference_cpp ${NVINFER} ${NVINFER_PLUGIN} ${CUDA_LIBRARIES} ${OpenCV_LIBS})


================================================
FILE: convnextv2/README.md
================================================
# ConvNeXtV2 TensorRT

## Environment

- ubuntu20.04
-  cuda11.8
-  cudnn8.9.7
-  TensorRT8.6.1.6
-  OpenCV4.13

## Support

[ConvNeXt-V2](https://github.com/facebookresearch/ConvNeXt-V2.git) provides official pre-trained models such as ImageNet-1K fine-tuned models, ImageNet-22K fine-tuned models, and custom dataset classification models trained using these pre-trained weights.

## Build and Run

``````
# Download dependencies
pip install torch tensorrt pycuda numpy opencv-python

# Generate .wts
cd path-to-tensorrtx/convnextv2
python path-to-gen_wts.py path-to-pt path-to-wts

# Build convnextv2
cmake -B build
make -C build

# Update config.yaml to match your selected model

# Generate .engine
./build/convnextv2 path-to-wts path-to-engine

# Inference(python)
python path-to-inference.py path-to-engine path-to-your-image path-to-your-labels.txt

# Inference(cpp)
./build/inference_cpp path-to-engine path-to-your-image path-to-your-labels.txt
``````

## More Information

An interesting fact is that the suffix of the engine file can be arbitrarily specified; it does not need to be “engine”, and you can even use your own name as the suffix.


================================================
FILE: convnextv2/config.yaml
================================================
# ConvNeXtV2 Configuration

# Model variants reference:
# Atto:  depths: [2, 2, 6, 2], dims: [40, 80, 160, 320]
# Femto: depths: [2, 2, 6, 2], dims: [48, 96, 192, 384]
# Pico:  depths: [2, 2, 6, 2], dims: [64, 128, 256, 512]
# Nano:  depths: [2, 2, 8, 2], dims: [80, 160, 320, 640]
# Tiny:  depths: [3, 3, 9, 3], dims: [96, 192, 384, 768]
# Base:  depths: [3, 3, 27, 3], dims: [128, 256, 512, 1024]
# Large: depths: [3, 3, 27, 3], dims: [192, 384, 768, 1536]
# Huge:  depths: [3, 3, 27, 3], dims: [352, 704, 1408, 2816]

# The values below equal the "Nano" row of the table above.
# NOTE(review): presumably they must match the variant the weights were exported from — confirm.
# One entry per stage.
depths: [2, 2, 8, 2]
# One entry per stage.
dims: [80, 160, 320, 640]
# NOTE(review): presumably the network input height/width in pixels — confirm against the consumer.
input_h: 224
input_w: 224


================================================
FILE: convnextv2/gen_wts.py
================================================
import torch
import struct


def gen_wts(model_path, wts_path):
    """Convert a PyTorch checkpoint into a text ``.wts`` file.

    The checkpoint is loaded on the CPU. If it is a dict with a ``'model'``
    entry, that entry is used as the state dict; otherwise the loaded object
    itself is. The inferred stage widths and depths are printed for reference.

    Output format: the first line is the number of tensors; every following
    line is ``<name> <count>`` followed by each value as the hex of its
    big-endian float32 encoding.

    Args:
        model_path (str): path of the checkpoint to read.
        wts_path (str): path of the ``.wts`` file to write.
    """
    print(f"Loading {model_path}...")
    try:
        checkpoint = torch.load(model_path, map_location='cpu')
    except FileNotFoundError:
        print(f"Error: {model_path} not found.")
        return

    has_model_entry = isinstance(checkpoint, dict) and 'model' in checkpoint
    state_dict = checkpoint['model'] if has_model_entry else checkpoint

    print(f"Exporting to {wts_path}...")

    # Infer architecture.
    # Stage widths: output channels of the first layer of each downsample block
    # (downsample_layers.0.0 is the stem, downsample_layers.N.0 maps dim[N-1] -> dim[N]).
    dims = []
    for stage in range(4):
        weight_key = f'downsample_layers.{stage}.0.weight'
        if weight_key in state_dict:
            dims.append(state_dict[weight_key].shape[0])

    # Stage depths: highest block index seen per stage, plus one.
    depths = [0, 0, 0, 0]
    for key in state_dict.keys():
        if not key.startswith('stages.'):
            continue
        fields = key.split('.')
        if len(fields) < 3:
            continue
        stage_idx, block_idx = int(fields[1]), int(fields[2])
        if stage_idx < 4:
            depths[stage_idx] = max(depths[stage_idx], block_idx + 1)

    print("Inferred Architecture:")
    print(f"  Dims: {dims}")
    print(f"  Depths: {depths}")

    with open(wts_path, 'w') as out:
        out.write(f"{len(state_dict)}\n")
        for name, tensor in state_dict.items():
            flat = tensor.reshape(-1).cpu().numpy()
            pieces = [f"{name} {len(flat)}"]
            pieces.extend(struct.pack('>f', float(value)).hex() for value in flat)
            out.write(" ".join(pieces))
            out.write("\n")

    print("Done.")


if __name__ == "__main__":
    import sys

    # Exactly two positional arguments: the checkpoint to read and the .wts file to write.
    if len(sys.argv) == 3:
        pt_path, wts_path = sys.argv[1], sys.argv[2]
        gen_wts(pt_path, wts_path)
    else:
        print(f"Usage: python {sys.argv[0]} <pt_path> <wts_path>")
        print(f"Example: python {sys.argv[0]} models/test.pt convnextv2.wts")
        sys.exit(1)


================================================
FILE: convnextv2/inference.py
================================================
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401
import numpy as np
import cv2
import ctypes
import os
import sys


def load_imagenet_labels(label_file="imagenet_classes.txt"):
    """Read class labels, one per line, with surrounding whitespace stripped.

    Returns:
        list of str, or None when ``label_file`` does not exist.
    """
    if os.path.exists(label_file):
        with open(label_file, 'r') as handle:
            return [entry.strip() for entry in handle]
    return None


def main(engine_path, img_path, label_file="imagenet_classes.txt"):
    """Classify one image with a serialized TensorRT engine and print the result.

    Steps: load the LayerNorm plugin library, deserialize the engine, preprocess
    the image to the engine's input size, run one asynchronous inference, and
    print the arg-max class of every output binding.

    Every failure (missing plugin library, missing engine file, failed
    deserialization, unreadable image) prints a message and returns None.

    Args:
        engine_path (str): path of the serialized engine.
        img_path (str): path of the image to classify.
        label_file (str): optional file with one class label per line.
    """
    # Load plugin library
    # The path is relative to the current working directory.
    so_file = os.path.abspath("./build/liblayernorm_plugin.so")
    if not os.path.exists(so_file):
        print(f"Plugin library not found: {so_file}")
        return

    # Loaded before the engine is deserialized below.
    # NOTE(review): presumably loading it registers the LayerNorm plugin creator — confirm.
    ctypes.CDLL(so_file)

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    runtime = trt.Runtime(TRT_LOGGER)

    if not os.path.exists(engine_path):
        print(f"Engine file not found: {engine_path}")
        return

    with open(engine_path, "rb") as f:
        serialized_engine = f.read()

    engine = runtime.deserialize_cuda_engine(serialized_engine)
    if not engine:
        print("Failed to deserialize engine.")
        return

    context = engine.create_execution_context()
    # Get Input Shape from Engine
    input_shape = (224, 224)  # Default
    # Only the first input binding is inspected (note the `break`).
    for i in range(engine.num_bindings):
        if engine.binding_is_input(i):
            shape = engine.get_binding_shape(i)
            # shape is usually (N, C, H, W) or (C, H, W)
            if len(shape) == 4:
                input_shape = (shape[2], shape[3])
            elif len(shape) == 3:
                input_shape = (shape[1], shape[2])
            break

    # Prepare input
    img = cv2.imread(img_path)
    if img is None:
        print(f"Failed to load image: {img_path}")
        return
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (input_shape[1], input_shape[0]))  # cv2.resize takes (W, H)
    img = img.astype(np.float32) / 255.0

    # ImageNet Mean/Std
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    # Broadcasts over the trailing channel axis (the image is still HWC here).
    img = (img - mean) / std

    img = img.transpose(2, 0, 1)  # HWC -> CHW
    img = np.expand_dims(img, axis=0)  # CHW -> NCHW
    img = np.ascontiguousarray(img)

    inputs, outputs, bindings, stream = [], [], [], cuda.Stream()

    # Allocate one host and one device buffer per binding, in binding order.
    for i in range(engine.num_bindings):
        dtype = trt.nptype(engine.get_binding_dtype(i))
        shape = engine.get_binding_shape(i)

        # Handle dynamic shape or fixed
        # Check if input or output
        is_input = engine.binding_is_input(i)

        # Since we use explicit batch, shape[0] might be -1 or 1
        # If -1, we set context binding shape
        if shape[0] == -1:
            shape = (1,) + shape[1:]
            context.set_binding_shape(i, shape)

        # Buffer size in bytes.
        size = trt.volume(shape) * np.dtype(dtype).itemsize

        # Host memory
        host_mem = cuda.pagelocked_empty(trt.volume(shape), dtype)
        # Device memory
        device_mem = cuda.mem_alloc(size)

        bindings.append(int(device_mem))

        if is_input:
            inputs.append({'host': host_mem, 'device': device_mem, 'shape': shape})
            # Copy input data to host buffer
            # NOTE(review): every input binding receives the same image, so this
            # assumes the engine has a single image input — confirm.
            np.copyto(host_mem, img.ravel())
        else:
            outputs.append({'host': host_mem, 'device': device_mem, 'shape': shape})

    # Inference
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp['device'], inp['host'], stream)

    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out['host'], out['device'], stream)

    # Synchronize the stream
    stream.synchronize()

    # Process output
    # `labels` is None when the label file does not exist; then only indices are printed.
    labels = load_imagenet_labels(label_file)
    for out in outputs:
        output_data = out['host']
        # The host buffer is flat, so argmax is an index into the flattened output.
        max_idx = np.argmax(output_data)
        max_val = output_data[max_idx]
        if labels:
            print(f"Predicted Class: {max_idx} - {labels[max_idx]} (Score: {max_val})")
        else:
            print(f"Predicted Class: {max_idx} (Score: {max_val})")


if __name__ == "__main__":
    # Two mandatory arguments (engine, image) plus an optional label file.
    if not 3 <= len(sys.argv) <= 4:
        print(f"Usage: python {sys.argv[0]} <engine_path> <image_path> [label_file]")
        print(f"Example: python {sys.argv[0]} convnextv2.engine images/test.jpg")
        print(f"         python {sys.argv[0]} convnextv2.engine images/test.jpg custom_labels.txt")
        sys.exit(1)

    engine_path, img_path = sys.argv[1], sys.argv[2]
    if len(sys.argv) == 4:
        label_file = sys.argv[3]
    else:
        label_file = "imagenet_classes.txt"
    main(engine_path, img_path, label_file)


================================================
FILE: convnextv2/src/LayerNormPlugin.cu
================================================
#include <cuda_fp16.h>
#include <cassert>
#include <cstring>
#include <cub/cub.cuh>
#include <iostream>
#include "LayerNormPlugin.h"

using namespace nvinfer1;

// Name and version strings returned by both the plugin and its creator.
static const char* PLUGIN_NAME = "LayerNorm";
static const char* PLUGIN_VERSION = "1";

// Definitions of the creator's static members; they are filled in by the
// LayerNormPluginCreator constructor.
PluginFieldCollection LayerNormPluginCreator::mFC{};
std::vector<PluginField> LayerNormPluginCreator::mPluginAttributes;

// Helper to check CUDA errors: evaluates `status` once and, when it is
// non-zero, prints the code with the current line number and aborts.
// The do { ... } while (0) wrapper makes the macro behave as one statement.
#define CHECK(status)                                                                     \
    do {                                                                                  \
        auto ret = (status);                                                              \
        if (ret != 0) {                                                                   \
            std::cerr << "Cuda failure: " << ret << " at line " << __LINE__ << std::endl; \
            abort();                                                                      \
        }                                                                                 \
    } while (0)

// Per-type default epsilon (1e-6), provided for float and half only; the
// primary template is declared but not defined.
// NOTE(review): no use of these helpers is visible in this part of the file;
// the kernel receives its epsilon as a runtime argument instead.
template <typename T>
__device__ inline T epsilon();

template <>
__device__ inline float epsilon<float>() {
    return 1e-6f;
}

template <>
__device__ inline half epsilon<half>() {
    return (half)1e-6f;
}

// --- Kernel ---
// Layer normalization over the last dimension: each thread block normalizes one
// row of `hidden_size` elements and then applies the per-column scale (gamma)
// and shift (beta).
//
// Thread t owns the VPT consecutive columns [t * VPT, t * VPT + VPT). The block
// reduction is instantiated for exactly 256 threads, so the kernel must be
// launched with 256 threads per block and covers hidden_size <= 256 * VPT
// (1024 for VPT=4). Columns beyond that limit are neither read nor written.
//
// All arithmetic is done in float regardless of T; the variance is the
// population variance (divided by hidden_size) with `eps` added before rsqrt.
template <typename T, int VPT>
__global__ void layerNormKernel(const T* __restrict__ input, const T* __restrict__ gamma, const T* __restrict__ beta,
                                T* __restrict__ output, int hidden_size, float eps) {
    // blockIdx.x corresponds to one instance (one row of hidden_size elements)

    int row_offset = blockIdx.x * hidden_size;

    // Load data
    // Each thread keeps its VPT values in a local array, converted to float;
    // out-of-range slots are zero-filled.
    float vals[VPT];
#pragma unroll
    for (int i = 0; i < VPT; ++i) {
        int col = threadIdx.x * VPT + i;
        if (col < hidden_size) {
            vals[i] = (float)input[row_offset + col];
        } else {
            vals[i] = 0.0f;
        }
    }

    // Compute mean
    float thread_sum = 0.0f;
#pragma unroll
    for (int i = 0; i < VPT; ++i) {
        if (threadIdx.x * VPT + i < hidden_size)
            thread_sum += vals[i];
    }

    using BlockReduce = cub::BlockReduce<float, 256>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float sum = BlockReduce(temp_storage).Sum(thread_sum);
    // Only thread 0 consumes the reduced value and publishes the mean through
    // shared memory.
    __shared__ float mean;
    if (threadIdx.x == 0)
        mean = sum / hidden_size;
    // Makes `mean` visible to all threads; this barrier also sits between the
    // two reductions that share `temp_storage`.
    __syncthreads();

    // Compute variance
    float thread_sq_diff = 0.0f;
#pragma unroll
    for (int i = 0; i < VPT; ++i) {
        if (threadIdx.x * VPT + i < hidden_size) {
            float diff = vals[i] - mean;
            thread_sq_diff += diff * diff;
        }
    }
    float sq_diff_sum = BlockReduce(temp_storage).Sum(thread_sq_diff);
    // Thread 0 again publishes the result: 1 / sqrt(variance + eps).
    __shared__ float inv_std;
    if (threadIdx.x == 0) {
        inv_std = rsqrtf((sq_diff_sum / hidden_size) + eps);
    }
    __syncthreads();

// Normalize and scale
#pragma unroll
    for (int i = 0; i < VPT; ++i) {
        int col = threadIdx.x * VPT + i;
        if (col < hidden_size) {
            float val = (vals[i] - mean) * inv_std;
            // gamma and beta are indexed by column only, i.e. shared by all rows.
            float g = (float)gamma[col];
            float b = (float)beta[col];
            output[row_offset + col] = (T)(val * g + b);
        }
    }
}

// --- Plugin Implementation ---

// Builds a plugin from explicit parameters. `hidden_size` is overwritten later
// by configurePlugin(), which takes it from the last input dimension.
LayerNormPlugin::LayerNormPlugin(const std::string& name, float epsilon, int hidden_size)
    : mName(name), mEpsilon(epsilon), mHiddenSize(hidden_size) {}

// Rebuilds a plugin from the bytes written by serialize().
// Payload layout: [float epsilon][int hidden_size], with no padding.
LayerNormPlugin::LayerNormPlugin(const std::string& name, const void* data, size_t length) : mName(name) {
    assert(data != nullptr);
    assert(length == sizeof(float) + sizeof(int));
    (void)length;  // only read by the assert above; avoids an unused warning with NDEBUG

    // memcpy instead of dereferencing casted pointers: the serialized buffer
    // carries no alignment guarantee for float/int reads.
    const char* cursor = static_cast<const char*>(data);

    float epsilon = 0.0f;
    std::memcpy(&epsilon, cursor, sizeof(float));
    cursor += sizeof(float);

    int hiddenSize = 0;
    std::memcpy(&hiddenSize, cursor, sizeof(int));

    mEpsilon = epsilon;
    mHiddenSize = hiddenSize;
}

// Nothing to release: no resources are acquired in the constructors above.
LayerNormPlugin::~LayerNormPlugin() {}

// Returns a heap-allocated copy carrying the same name, epsilon, hidden size
// and plugin namespace. The caller receives ownership of the new object.
IPluginV2DynamicExt* LayerNormPlugin::clone() const noexcept {
    auto p = new LayerNormPlugin(mName, mEpsilon, mHiddenSize);
    p->setPluginNamespace(mNamespace.c_str());
    return p;
}

// The plugin produces a single output tensor.
int32_t LayerNormPlugin::getNbOutputs() const noexcept {
    return 1;
}

// The output uses the data type of input 0; `index` and `nbInputs` are ignored.
DataType LayerNormPlugin::getOutputDataType(int32_t index, const DataType* inputTypes,
                                            int32_t nbInputs) const noexcept {
    return inputTypes[0];
}

// The output has exactly the dimensions of input 0.
DimsExprs LayerNormPlugin::getOutputDimensions(int32_t outputIndex, const DimsExprs* inputs, int32_t nbInputs,
                                               IExprBuilder& exprBuilder) noexcept {
    return inputs[0];
}

// Reports whether the tensor at position `pos` has an acceptable type/format.
// Positions: 0 = input, 1 = gamma, 2 = beta, 3 = output. Everything must be
// linear; the input may be float or half, and all other tensors must use the
// input's type.
bool LayerNormPlugin::supportsFormatCombination(int32_t pos, const PluginTensorDesc* inOut, int32_t nbInputs,
                                                int32_t nbOutputs) noexcept {
    switch (pos) {
        case 0: {  // Input
            const bool typeOk = inOut[0].type == DataType::kFLOAT || inOut[0].type == DataType::kHALF;
            return typeOk && inOut[0].format == TensorFormat::kLINEAR;
        }
        case 1:  // Gamma
        case 2:  // Beta
        case 3:  // Output
            return inOut[pos].type == inOut[0].type && inOut[pos].format == TensorFormat::kLINEAR;
        default:
            return false;
    }
}

// Records the size of the normalized axis: the last dimension of input 0.
// No validation of the inputs is performed here.
void LayerNormPlugin::configurePlugin(const DynamicPluginTensorDesc* in, int32_t nbInputs,
                                      const DynamicPluginTensorDesc* out, int32_t nbOutputs) noexcept {
    // Last dimension of the first input becomes the hidden size used by enqueue().
    mHiddenSize = in[0].desc.dims.d[in[0].desc.dims.nbDims - 1];
}

// No scratch memory is requested; the kernel only uses shared memory it
// declares itself.
size_t LayerNormPlugin::getWorkspaceSize(const PluginTensorDesc* inputs, int32_t nbInputs,
                                         const PluginTensorDesc* outputs, int32_t nbOutputs) const noexcept {
    return 0;
}

int32_t LayerNormPlugin::enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc,
                                 const void* const* inputs, void* const* outputs, void* workspace,
                                 cudaStream_t stream) noexcept {

    int total = 1;
    for (int i = 0; i < inputDesc[0].dims.nbDims; ++i)
        total *= inputDesc[0].dims.d[i];
    int rows = total / mHiddenSize;

    if (inputDesc[0].type == DataType::kFLOAT) {
        layerNormKernel<float, 4><<<rows, 256, 0, stream>>>((const float*)inputs[0], (const float*)inputs[1],
                                                            (const float*)inputs[2], (float*)outputs[0], mHiddenSize,
                                                            mEpsilon);
    } else {
        layerNormKernel<half, 4><<<rows, 256, 0, stream>>>((const half*)inputs[0], (const half*)inputs[1],
                                                           (const half*)inputs[2], (half*)outputs[0], mHiddenSize,
                                                           mEpsilon);
    }
    return 0;
}

// Type and version strings; they are the same constants the creator returns.
const char* LayerNormPlugin::getPluginType() const noexcept {
    return PLUGIN_NAME;
}
const char* LayerNormPlugin::getPluginVersion() const noexcept {
    return PLUGIN_VERSION;
}

// The plugin is heap-allocated (see clone() and the creator), so destroying it
// deletes the object itself.
void LayerNormPlugin::destroy() noexcept {
    delete this;
}

// No resources are set up or torn down: initialize() reports success and
// terminate() does nothing.
int32_t LayerNormPlugin::initialize() noexcept {
    return 0;
}
void LayerNormPlugin::terminate() noexcept {}

// Size in bytes of the payload written by serialize(): one float (epsilon)
// followed by one int (hidden size).
size_t LayerNormPlugin::getSerializationSize() const noexcept {
    return sizeof(float) + sizeof(int);
}

void LayerNormPlugin::serialize(void* buffer) const noexcept {
    char* d = static_cast<char*>(buffer);
    *reinterpret_cast<float*>(d) = mEpsilon;
    d += sizeof(float);
    *reinterpret_cast<int*>(d) = mHiddenSize;
    d += sizeof(int);
}

// Stores a copy of the namespace string.
void LayerNormPlugin::setPluginNamespace(const char* libNamespace) noexcept {
    mNamespace = libNamespace;
}
// Returns a pointer into the stored string; it stays valid until the namespace
// is changed or the plugin is destroyed.
const char* LayerNormPlugin::getPluginNamespace() const noexcept {
    return mNamespace.c_str();
}

// --- Creator Implementation ---

// Publishes the plugin's single attribute, "epsilon" (one float32 value).
LayerNormPluginCreator::LayerNormPluginCreator() {
    // mPluginAttributes and mFC are static members shared by every creator
    // instance. Reset the list first so constructing more than one creator
    // does not append duplicate fields.
    mPluginAttributes.clear();
    mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1));
    mFC.nbFields = static_cast<int32_t>(mPluginAttributes.size());
    mFC.fields = mPluginAttributes.data();
}

// Nothing to release.
LayerNormPluginCreator::~LayerNormPluginCreator() {}

// Name and version strings; they are the same constants the plugin returns
// from getPluginType()/getPluginVersion().
const char* LayerNormPluginCreator::getPluginName() const noexcept {
    return PLUGIN_NAME;
}
const char* LayerNormPluginCreator::getPluginVersion() const noexcept {
    return PLUGIN_VERSION;
}

// Returns the static field collection populated by the constructor.
const PluginFieldCollection* LayerNormPluginCreator::getFieldNames() noexcept {
    return &mFC;
}

// Creates a plugin from a field collection. The only field read is "epsilon"
// (a float); when it is absent, or carries no data, the default 1e-6 is used.
// The returned object is heap-allocated and owned by the caller.
IPluginV2* LayerNormPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept {
    float epsilon = 1e-6f;
    // Tolerate a missing collection and fields without a name or data instead
    // of dereferencing null pointers.
    if (fc != nullptr && fc->fields != nullptr) {
        for (int i = 0; i < fc->nbFields; ++i) {
            const PluginField& field = fc->fields[i];
            if (field.name == nullptr || field.data == nullptr) {
                continue;
            }
            if (strcmp(field.name, "epsilon") == 0) {
                epsilon = *static_cast<const float*>(field.data);
            }
        }
    }
    // A null name would be undefined behaviour when converted to std::string.
    return new LayerNormPlugin(name != nullptr ? name : "", epsilon, 0);  // hidden_size will be set in configure
}

IPluginV2* LayerNormPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) noexcept {
    return new LayerNormPlugin(name, serialData, serialLength);
}

// Stores the namespace string for this creator.
// A null pointer is treated as the empty namespace, because assigning a null
// `const char*` to std::string is undefined behaviour.
void LayerNormPluginCreator::setPluginNamespace(const char* libNamespace) noexcept {
    mNamespace = (libNamespace != nullptr) ? libNamespace : "";
}
// Returns the creator's namespace as a NUL-terminated C string. The pointer refers to
// the member string's storage and stays valid only while that string is unchanged.
const char* LayerNormPluginCreator::getPluginNamespace() const noexcept {
    const std::string& ns = mNamespace;
    return ns.c_str();
}

// Registers LayerNormPluginCreator through TensorRT's registration macro.
// NOTE(review): convnextv2.cpp and inference_cpp.cpp also call registerCreator()
// manually, so the creator may be registered twice — confirm this is intended.
REGISTER_TENSORRT_PLUGIN(LayerNormPluginCreator);


================================================
FILE: convnextv2/src/LayerNormPlugin.h
================================================
#ifndef LAYER_NORM_PLUGIN_H
#define LAYER_NORM_PLUGIN_H

#include <NvInfer.h>
#include <string>
#include <vector>

using namespace nvinfer1;

// TensorRT plugin (IPluginV2DynamicExt) for layer normalization.
// Its persistent state is an epsilon and a hidden size; both are written by serialize().
// The method implementations live in LayerNormPlugin.cu.
class LayerNormPlugin : public IPluginV2DynamicExt {
   public:
    // Builds a plugin from explicit parameters.
    LayerNormPlugin(const std::string& name, float epsilon, int hidden_size);
    // Builds a plugin from a serialized buffer of `length` bytes.
    LayerNormPlugin(const std::string& name, const void* data, size_t length);
    // A plugin must always be created with a name and parameters.
    LayerNormPlugin() = delete;
    ~LayerNormPlugin() override;

    // IPluginV2DynamicExt Methods
    IPluginV2DynamicExt* clone() const noexcept override;
    int32_t getNbOutputs() const noexcept override;
    DataType getOutputDataType(int32_t index, const DataType* inputTypes, int32_t nbInputs) const noexcept override;
    DimsExprs getOutputDimensions(int32_t outputIndex, const DimsExprs* inputs, int32_t nbInputs,
                                  IExprBuilder& exprBuilder) noexcept override;
    bool supportsFormatCombination(int32_t pos, const PluginTensorDesc* inOut, int32_t nbInputs,
                                   int32_t nbOutputs) noexcept override;
    void configurePlugin(const DynamicPluginTensorDesc* in, int32_t nbInputs, const DynamicPluginTensorDesc* out,
                         int32_t nbOutputs) noexcept override;
    size_t getWorkspaceSize(const PluginTensorDesc* inputs, int32_t nbInputs, const PluginTensorDesc* outputs,
                            int32_t nbOutputs) const noexcept override;
    // Launches the plugin's work on `stream`.
    int32_t enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs,
                    void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;

    // IPluginV2 Methods
    const char* getPluginType() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    // Deletes this object.
    void destroy() noexcept override;
    int32_t initialize() noexcept override;
    void terminate() noexcept override;
    // Size in bytes of the buffer serialize() fills (one float + one int).
    size_t getSerializationSize() const noexcept override;
    // Writes epsilon followed by hidden size into `buffer`.
    void serialize(void* buffer) const noexcept override;
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;

   private:
    std::string mName;       // Layer name given at construction
    std::string mNamespace;  // Namespace set through setPluginNamespace()
    float mEpsilon;          // Epsilon parameter; serialized first
    int mHiddenSize;  // Number of channels
};

// Factory (IPluginCreator) that builds LayerNormPlugin objects either from a
// PluginFieldCollection or from a serialized buffer.
class LayerNormPluginCreator : public IPluginCreator {
   public:
    // Fills the static field list with the single "epsilon" float32 field.
    LayerNormPluginCreator();
    ~LayerNormPluginCreator() override;

    const char* getPluginName() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    // Returns the static collection describing the supported fields.
    const PluginFieldCollection* getFieldNames() noexcept override;
    // Creates a plugin, reading "epsilon" from `fc` (default 1e-6).
    IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
    // Creates a plugin from bytes produced by LayerNormPlugin::serialize().
    IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;

   private:
    // Static, i.e. shared by all creator instances.
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
    std::string mNamespace;
};

#endif  // LAYER_NORM_PLUGIN_H


================================================
FILE: convnextv2/src/convnextv2.cpp
================================================
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <opencv2/opencv.hpp>
#include <sstream>
#include <string>
#include <vector>
#include "LayerNormPlugin.h"
#include "NvInfer.h"
#include "logging.h"

// Tensor names used for the network input (addInput) and for the tensor marked as
// the network output when the engine is built.
static const char* INPUT_BLOB_NAME = "data";
static const char* OUTPUT_BLOB_NAME = "output";

// Architecture and input description of one ConvNeXt V2 variant.
struct ConvNextConfig {
    int depths[4];  // number of blocks built for each of the 4 stages
    int dims[4];    // channel count used for each of the 4 stages
    int input_h;    // network input height
    int input_w;    // network input width
};

// Parses a decimal integer from `text` (leading whitespace allowed, trailing text
// ignored). On success stores it in `out` and returns true; on failure leaves `out`
// untouched and returns false.
static bool parseConfigInt(const std::string& text, int& out) {
    std::istringstream in(text);
    int parsed = 0;
    if (!(in >> parsed)) {
        return false;
    }
    out = parsed;
    return true;
}

// Simple parser for YAML-like config (key: [v1, v2..] or key: value)
//
// Recognised keys: `depths`, `dims` (lists of up to 4 ints), `input_h`, `input_w`.
// Lines that are empty or start with '#' are skipped; unknown keys are ignored.
// Every value starts from the Nano defaults, and a value that cannot be parsed keeps
// its default (with a warning on stderr) instead of throwing or being overwritten.
// If the file cannot be opened the defaults are returned unchanged.
ConvNextConfig loadConfig(const std::string& configPath) {
    ConvNextConfig cfg;
    // Default to Nano
    const int kNanoDepths[4] = {2, 2, 8, 2};
    const int kNanoDims[4] = {80, 160, 320, 640};
    for (int i = 0; i < 4; ++i) {
        cfg.depths[i] = kNanoDepths[i];
        cfg.dims[i] = kNanoDims[i];
    }
    cfg.input_h = 224;
    cfg.input_w = 224;

    std::ifstream file(configPath);
    if (!file.is_open()) {
        std::cerr << "Warning: Could not open config file " << configPath << ". Using default Nano config."
                  << std::endl;
        return cfg;
    }

    std::string line;
    while (std::getline(file, line)) {
        if (line.empty() || line[0] == '#')
            continue;

        // Split "key: value" at the first colon.
        std::istringstream ss(line);
        std::string key;
        std::getline(ss, key, ':');
        std::string value;
        std::getline(ss, value);

        // Trim key
        const size_t first = key.find_first_not_of(" \t");
        if (first == std::string::npos)
            continue;
        key = key.substr(first, key.find_last_not_of(" \t") - first + 1);

        if (key == "depths" || key == "dims") {
            // format: [v1, v2, v3, v4]
            int* target = (key == "depths") ? cfg.depths : cfg.dims;
            const size_t open = value.find('[');
            const size_t close = value.find(']');
            if (open == std::string::npos || close == std::string::npos || close < open) {
                std::cerr << "Warning: Expected a [v1, v2, v3, v4] list for '" << key << "'. Keeping defaults."
                          << std::endl;
                continue;
            }
            std::istringstream items(value.substr(open + 1, close - open - 1));
            std::string item;
            for (int idx = 0; idx < 4 && std::getline(items, item, ','); ++idx) {
                if (!parseConfigInt(item, target[idx])) {
                    std::cerr << "Warning: Invalid entry '" << item << "' in '" << key << "'. Keeping default "
                              << target[idx] << "." << std::endl;
                }
            }
        } else if (key == "input_h" || key == "input_w") {
            int& target = (key == "input_h") ? cfg.input_h : cfg.input_w;
            if (!parseConfigInt(value, target)) {
                std::cerr << "Warning: Invalid value for '" << key << "'. Keeping default " << target << "."
                          << std::endl;
            }
        }
    }
    std::cout << "Loaded Config - Depths: [" << cfg.depths[0] << "," << cfg.depths[1] << "," << cfg.depths[2] << ","
              << cfg.depths[3] << "]"
              << " Dims: [" << cfg.dims[0] << "," << cfg.dims[1] << "," << cfg.dims[2] << "," << cfg.dims[3] << "]"
              << " Input: " << cfg.input_h << "x" << cfg.input_w << std::endl;
    return cfg;
}

// Global config: filled in main() from loadConfig() and read while building the network.
static ConvNextConfig g_config;
// Macros/Consts replaced by g_config members: the names below expand to fields of
// g_config, so they are runtime values rather than compile-time constants.
#define DEPTHS g_config.depths
#define DIMS g_config.dims
#define INPUT_H g_config.input_h
#define INPUT_W g_config.input_w

using namespace nvinfer1;

// Logger instance handed to the TensorRT builder and runtime.
static Logger gLogger;

// Global variables for paths (defaults; main() overwrites them from argv).
std::string g_wts_path = "convnextv2.wts";
std::string g_engine_path = "convnextv2.engine";

// Weights utils
// Loads a text weight file into a name -> Weights map.
//
// File layout as read here: a decimal entry count, then per entry a name, a decimal
// element count and that many hexadecimal 32-bit words. Every entry is stored as
// DataType::kFLOAT with a buffer allocated by `new uint32_t[]`; ownership of those
// buffers passes to the caller.
//
// Problems are reported on stderr. A missing file or invalid count still trips the
// original asserts in debug builds; in release builds an empty map is returned
// instead of looping on a garbage count. A truncated entry stops the load and the
// entries read so far are returned.
std::map<std::string, Weights> loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;
    std::ifstream input(file);
    if (!input.is_open()) {
        std::cerr << "Unable to load weight file: " << file << std::endl;
    }
    assert(input.is_open() && "Unable to load weight file.");

    int32_t count = 0;
    input >> count;
    if (!input || count <= 0) {
        std::cerr << "Invalid weight map file: " << file << std::endl;
    }
    assert(count > 0 && "Invalid weight map file.");

    // Counting up (instead of `while (count--)`) makes a non-positive count a no-op.
    for (int32_t entry = 0; entry < count; ++entry) {
        uint32_t size = 0;
        std::string name;
        input >> name >> std::dec >> size;
        if (!input) {
            std::cerr << "Weight file truncated at entry " << entry << " of " << count << std::endl;
            break;
        }

        uint32_t* val = new uint32_t[size];
        for (uint32_t x = 0; x < size; ++x) {
            input >> std::hex >> val[x];
        }
        if (!input) {
            std::cerr << "Weight file truncated while reading values of '" << name << "'" << std::endl;
            delete[] val;
            break;
        }

        Weights wt{DataType::kFLOAT, nullptr, 0};
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}

// Adds a per-channel scale layer equivalent to a batch-norm in inference mode:
//   scale = gamma / sqrt(var + eps), shift = beta - mean * gamma / sqrt(var + eps), power = 1.
// Parameters are looked up in `weightMap` under `name` + ".weight", ".bias",
// ".running_mean" and ".running_var". The three arrays allocated here are handed to
// the scale layer and are not freed by this function.
IScaleLayer* addBatchNorm2d(INetworkDefinition* network, ITensor& input, std::map<std::string, Weights>& weightMap,
                            std::string name, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[name + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[name + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[name + ".running_mean"].values);
    const Weights& varWeights = weightMap[name + ".running_var"];
    const float* var = static_cast<const float*>(varWeights.values);
    const int channels = static_cast<int>(varWeights.count);

    float* scaleVals = new float[channels];
    float* shiftVals = new float[channels];
    float* powerVals = new float[channels];

    for (int c = 0; c < channels; ++c) {
        scaleVals[c] = gamma[c] / sqrt(var[c] + eps);
        shiftVals[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        powerVals[c] = 1.0;
    }

    const Weights shift{DataType::kFLOAT, shiftVals, channels};
    const Weights scale{DataType::kFLOAT, scaleVals, channels};
    const Weights power{DataType::kFLOAT, powerVals, channels};

    IScaleLayer* layer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(layer);
    return layer;
}

// Adds a float constant of shape [1, 1, 1, 1] holding `value` and returns its output
// tensor, so it broadcasts against 4-D activations in element-wise layers.
// NOTE(review): the one-element buffer is never freed (the original code leaked it the
// same way); it presumably has to outlive engine building — confirm against the
// TensorRT weight-lifetime rules before adding a delete.
static ITensor* addScalarConstant(INetworkDefinition* network, float value) {
    float* storage = new float[1];
    *storage = value;
    Weights w{DataType::kFLOAT, storage, 1};
    IConstantLayer* layer = network->addConstant(Dims4{1, 1, 1, 1}, w);
    assert(layer);
    return layer->getOutput(0);
}

// Builds one ConvNeXt V2 block on top of `input` (NCHW) and returns its output tensor:
//   depthwise 7x7 conv -> LayerNorm (plugin, applied in NHWC) -> 1x1 conv to 4*dim
//   -> GELU -> GRN -> 1x1 conv back to dim -> residual add with `input`.
//
// network:   network being populated.
// input:     block input, NCHW with `dim` channels.
// dim:       channel count of the block.
// name:      weight-name prefix, e.g. "stages.0.1".
// weightMap: weights keyed by `name` + ".dwconv.*", ".norm.*", ".pwconv1.*",
//            ".grn.gamma", ".grn.beta", ".pwconv2.*".
ITensor* convNextBlock(INetworkDefinition* network, ITensor* input, int dim, std::string name,
                       std::map<std::string, Weights>& weightMap) {
    // 1. DWConv 7x7 (stride 1, padding 3, one group per channel)
    IConvolutionLayer* dwconv = network->addConvolutionNd(*input, dim, DimsHW{7, 7}, weightMap[name + ".dwconv.weight"],
                                                          weightMap[name + ".dwconv.bias"]);
    assert(dwconv);
    dwconv->setStrideNd(DimsHW{1, 1});
    dwconv->setPaddingNd(DimsHW{3, 3});
    dwconv->setNbGroups(dim);
    ITensor* x = dwconv->getOutput(0);

    // 2. Permute NCHW -> NHWC for LayerNorm
    IShuffleLayer* p1 = network->addShuffle(*x);
    assert(p1);
    p1->setSecondTranspose({0, 2, 3, 1});
    x = p1->getOutput(0);

    // 3. LayerNorm (Plugin)
    auto creator = getPluginRegistry()->getPluginCreator("LayerNorm", "1");
    assert(creator && "LayerNorm plugin creator is not registered");
    PluginFieldCollection pfc;
    float eps = 1e-6f;
    PluginField pf("epsilon", &eps, PluginFieldType::kFLOAT32, 1);
    pfc.nbFields = 1;
    pfc.fields = &pf;
    IPluginV2* plugin = creator->createPlugin(name.c_str(), &pfc);
    assert(plugin);

    // Pass gamma/beta (1D of size C) as plugin inputs along with x (N,H,W,C)
    auto w_ln_w = weightMap[name + ".norm.weight"];
    auto w_ln_b = weightMap[name + ".norm.bias"];
    // Weights::count is 64-bit; cast explicitly instead of narrowing inside braces.
    IConstantLayer* c_gamma = network->addConstant(Dims{1, {static_cast<int32_t>(w_ln_w.count)}}, w_ln_w);
    IConstantLayer* c_beta = network->addConstant(Dims{1, {static_cast<int32_t>(w_ln_b.count)}}, w_ln_b);
    assert(c_gamma && c_beta);

    ITensor* inputs[] = {x, c_gamma->getOutput(0), c_beta->getOutput(0)};
    IPluginV2Layer* ln = network->addPluginV2(inputs, 3, *plugin);
    assert(ln);
    x = ln->getOutput(0);

    // 4. Permute NHWC -> NCHW
    IShuffleLayer* p2 = network->addShuffle(*x);
    assert(p2);
    p2->setSecondTranspose({0, 3, 1, 2});
    x = p2->getOutput(0);

    // 5. PWConv1 (1x1), expands to 4*dim channels
    IConvolutionLayer* pw1 = network->addConvolutionNd(*x, 4 * dim, DimsHW{1, 1}, weightMap[name + ".pwconv1.weight"],
                                                       weightMap[name + ".pwconv1.bias"]);
    assert(pw1);
    x = pw1->getOutput(0);

    // 6. GELU
    // Manual GELU implementation: 0.5 * x * (1 + erf(x / sqrt(2)))
    // The division is expressed as a product with 1/sqrt(2).
    ITensor* inv_sqrt2 = addScalarConstant(network, 1.0f / std::sqrt(2.0f));
    IElementWiseLayer* scaled = network->addElementWise(*x, *inv_sqrt2, ElementWiseOperation::kPROD);
    assert(scaled);
    IUnaryLayer* erf = network->addUnary(*scaled->getOutput(0), UnaryOperation::kERF);
    assert(erf);

    ITensor* one = addScalarConstant(network, 1.0f);
    IElementWiseLayer* add_erf = network->addElementWise(*erf->getOutput(0), *one, ElementWiseOperation::kSUM);
    assert(add_erf);

    ITensor* half = addScalarConstant(network, 0.5f);
    IElementWiseLayer* mul_half = network->addElementWise(*x, *half, ElementWiseOperation::kPROD);
    assert(mul_half);

    IElementWiseLayer* gelu =
            network->addElementWise(*mul_half->getOutput(0), *add_erf->getOutput(0), ElementWiseOperation::kPROD);
    assert(gelu);
    x = gelu->getOutput(0);

    // 7. GRN (implemented in NCHW). X shape: [N, 4*dim, H, W], gx -> [N, C, 1, 1]

    // x*x
    IElementWiseLayer* sq = network->addElementWise(*x, *x, ElementWiseOperation::kPROD);
    assert(sq);
    ITensor* x_sq = sq->getOutput(0);

    // Sum over H,W (axes 2 and 3 as a bit mask)
    const uint32_t kSpatialAxes = (1U << 2) | (1U << 3);
    IReduceLayer* red_sum = network->addReduce(*x_sq, ReduceOperation::kSUM, kSpatialAxes, true);
    assert(red_sum);
    ITensor* sum_x = red_sum->getOutput(0);

    // Sqrt -> per-channel L2 norm over H,W
    IUnaryLayer* sqrt_layer = network->addUnary(*sum_x, UnaryOperation::kSQRT);
    assert(sqrt_layer);
    ITensor* gx = sqrt_layer->getOutput(0);  // [N, C, 1, 1]

    // Normalize GRN: nx = gx / (mean(gx, dim=1) + eps)
    // Mean over C (axis 1 as a bit mask)
    const uint32_t kChannelAxis = 1U << 1;
    IReduceLayer* red_mean = network->addReduce(*gx, ReduceOperation::kAVG, kChannelAxis, true);
    assert(red_mean);
    ITensor* mean_gx = red_mean->getOutput(0);  // [N, 1, 1, 1]

    // Add eps
    ITensor* grn_eps = addScalarConstant(network, 1e-6f);
    IElementWiseLayer* add_eps = network->addElementWise(*mean_gx, *grn_eps, ElementWiseOperation::kSUM);
    assert(add_eps);
    ITensor* denom = add_eps->getOutput(0);

    // Div
    IElementWiseLayer* div_grn = network->addElementWise(*gx, *denom, ElementWiseOperation::kDIV);
    assert(div_grn);
    ITensor* nx = div_grn->getOutput(0);  // [N, C, 1, 1]

    // Scale X by nx
    IElementWiseLayer* scale_x = network->addElementWise(*x, *nx, ElementWiseOperation::kPROD);
    assert(scale_x);
    ITensor* x_norm = scale_x->getOutput(0);

    // Apply Gamma/Beta for GRN (channel-wise scale), then add the GELU output back
    Weights w_grn_g = weightMap[name + ".grn.gamma"];
    Weights w_grn_b = weightMap[name + ".grn.beta"];
    Weights w_power{DataType::kFLOAT, nullptr, 0};
    IScaleLayer* grn_scale = network->addScale(*x_norm, ScaleMode::kCHANNEL, w_grn_b, w_grn_g, w_power);
    assert(grn_scale);
    x = grn_scale->getOutput(0);

    // Residual: x = grn_scaled + gelu_output
    ITensor* x_in = gelu->getOutput(0);
    IElementWiseLayer* add_grn = network->addElementWise(*x, *x_in, ElementWiseOperation::kSUM);
    assert(add_grn);
    x = add_grn->getOutput(0);

    // 8. PWConv2 (1x1), projects back to dim channels
    IConvolutionLayer* pw2 = network->addConvolutionNd(*x, dim, DimsHW{1, 1}, weightMap[name + ".pwconv2.weight"],
                                                       weightMap[name + ".pwconv2.bias"]);
    assert(pw2);
    x = pw2->getOutput(0);

    // 9. DropPath (Ignored in inference)

    // 10. Residual
    IElementWiseLayer* res = network->addElementWise(*input, *x, ElementWiseOperation::kSUM);
    assert(res);
    return res->getOutput(0);
}

ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    // Create input
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{maxBatchSize, 3, INPUT_H, INPUT_W});
    assert(data);

    // Load weights from the path provided via command line (g_wts_path)
    std::map<std::string, Weights> weightMap = loadWeights(g_wts_path);

    // Initialize Stem
    // downsample_layers.0: Conv 4x4, s=4 -> LN
    // Conv
    IConvolutionLayer* conv0 =
            network->addConvolutionNd(*data, DIMS[0], DimsHW{4, 4}, weightMap["downsample_layers.0.0.weight"],
                                      weightMap["downsample_layers.0.0.bias"]);
    assert(conv0);
    conv0->setStrideNd(DimsHW{4, 4});

    ITensor* x = conv0->getOutput(0);

    // LN
    // Transpose to NHWC
    IShuffleLayer* p0 = network->addShuffle(*x);
    p0->setSecondTranspose({0, 2, 3, 1});
    x = p0->getOutput(0);

    // Plugin
    auto creator = getPluginRegistry()->getPluginCreator("LayerNorm", "1");
    PluginFieldCollection pfc;
    float eps = 1e-6f;
    PluginField pf("epsilon", &eps, PluginFieldType::kFLOAT32, 1);
    pfc.nbFields = 1;
    pfc.fields = &pf;
    IPluginV2* plugin = creator->createPlugin("stem_ln", &pfc);

    auto w_ln0_w = weightMap["downsample_layers.0.1.weight"];
    auto w_ln0_b = weightMap["downsample_layers.0.1.bias"];
    IConstantLayer* c_g0 = network->addConstant(Dims{1, {w_ln0_w.count}}, w_ln0_w);
    IConstantLayer* c_b0 = network->addConstant(Dims{1, {w_ln0_b.count}}, w_ln0_b);
    ITensor* in0[] = {x, c_g0->getOutput(0), c_b0->getOutput(0)};
    IPluginV2Layer* ln0 = network->addPluginV2(in0, 3, *plugin);
    x = ln0->getOutput(0);

    // Transpose back
    IShuffleLayer* p0_back = network->addShuffle(*x);
    p0_back->setSecondTranspose({0, 3, 1, 2});
    x = p0_back->getOutput(0);

    // Stages
    for (int i = 0; i < 4; i++) {
        // Downsample layer (except first stage which is stem)
        if (i > 0) {
            std::string ds_name = "downsample_layers." + std::to_string(i);
            // LN -> Conv 2x2 s=2
            // LN (NHWC)
            IShuffleLayer* p_ds = network->addShuffle(*x);
            p_ds->setSecondTranspose({0, 2, 3, 1});
            x = p_ds->getOutput(0);

            auto creator = getPluginRegistry()->getPluginCreator("LayerNorm", "1");
            PluginFieldCollection pfc_ds;
            float eps_ds = 1e-6f;
            PluginField pf_ds("epsilon", &eps_ds, PluginFieldType::kFLOAT32, 1);
            pfc_ds.nbFields = 1;
            pfc_ds.fields = &pf_ds;
            IPluginV2* plugin_ds = creator->createPlugin((ds_name + "_ln").c_str(), &pfc_ds);

            auto w_ds_w = weightMap[ds_name + ".0.weight"];
            auto w_ds_b = weightMap[ds_name + ".0.bias"];
            IConstantLayer* c_ds_g = network->addConstant(Dims{1, {w_ds_w.count}}, w_ds_w);
            IConstantLayer* c_ds_b = network->addConstant(Dims{1, {w_ds_b.count}}, w_ds_b);
            ITensor* in_ds[] = {x, c_ds_g->getOutput(0), c_ds_b->getOutput(0)};
            IPluginV2Layer* ln_ds = network->addPluginV2(in_ds, 3, *plugin_ds);
            x = ln_ds->getOutput(0);

            IShuffleLayer* p_ds_back = network->addShuffle(*x);
            p_ds_back->setSecondTranspose({0, 3, 1, 2});
            x = p_ds_back->getOutput(0);

            // Conv 2x2, s=2
            IConvolutionLayer* conv_ds = network->addConvolutionNd(
                    *x, DIMS[i], DimsHW{2, 2}, weightMap[ds_name + ".1.weight"], weightMap[ds_name + ".1.bias"]);
            conv_ds->setStrideNd(DimsHW{2, 2});
            x = conv_ds->getOutput(0);
        }

        // Blocks
        for (int j = 0; j < DEPTHS[i]; j++) {
            std::string block_name = "stages." + std::to_string(i) + "." + std::to_string(j);
            x = convNextBlock(network, x, DIMS[i], block_name, weightMap);
        }
    }

    // Final Norm (Global Avg Pooling -> LayerNorm -> Head)

    // Global Avg Pooling
    IReduceLayer* gap = network->addReduce(*x, ReduceOperation::kAVG, 12, true);  // sum H,W (indices 2,3)
    x = gap->getOutput(0);                                                        // [N, C, 1, 1]

    // Reshape to [N,1,1,C] so LayerNorm plugin sees channels as last dimension
    IShuffleLayer* p_fin = network->addShuffle(*x);
    p_fin->setReshapeDimensions(Dims4{maxBatchSize, 1, 1, DIMS[3]});
    x = p_fin->getOutput(0);

    auto creator_fin = getPluginRegistry()->getPluginCreator("LayerNorm", "1");
    PluginFieldCollection pfc_fin;
    float eps_fin = 1e-6f;
    PluginField pf_fin("epsilon", &eps_fin, PluginFieldType::kFLOAT32, 1);
    pfc_fin.nbFields = 1;
    pfc_fin.fields = &pf_fin;
    IPluginV2* plugin_fin = creator_fin->createPlugin("final_norm", &pfc_fin);

    // norm.weight / norm.bias
    auto w_fn_w = weightMap["norm.weight"];
    auto w_fn_b = weightMap["norm.bias"];
    IConstantLayer* c_fn_g = network->addConstant(Dims{1, {w_fn_w.count}}, w_fn_w);
    IConstantLayer* c_fn_b = network->addConstant(Dims{1, {w_fn_b.count}}, w_fn_b);
    ITensor* in_fn[] = {x, c_fn_g->getOutput(0), c_fn_b->getOutput(0)};
    IPluginV2Layer* ln_fn = network->addPluginV2(in_fn, 3, *plugin_fin);
    x = ln_fn->getOutput(0);

    // Reshape back to [N, C, 1, 1] for 1x1 conv.
    IShuffleLayer* p_fin_b = network->addShuffle(*x);
    p_fin_b->setReshapeDimensions(Dims4{maxBatchSize, DIMS[3], 1, 1});
    x = p_fin_b->getOutput(0);

    Weights head_w = weightMap["head.weight"];
    Weights head_b = weightMap["head.bias"];
    // Check num classes
    int num_classes = head_w.count / DIMS[3];

    IConvolutionLayer* head = network->addConvolutionNd(*x, num_classes, DimsHW{1, 1}, head_w, head_b);
    x = head->getOutput(0);

    x->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*x);

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
// Workspace size configured below depending on TRT version
#if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) >= 86
    // setMemoryPoolLimit
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1U << 30);  // 1GB
#else
    config->setMaxWorkspaceSize(1 << 30);  // 1GB
#endif

    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    delete network;
    return engine;
}

// Builds the engine and stores its serialized form in *modelStream.
//
// maxBatchSize: batch size forwarded to createEngine().
// modelStream:  out-parameter; receives the serialized engine, or nullptr when the
//               builder, the builder config or the engine could not be created
//               (an error is printed in that case). The caller owns the result.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    *modelStream = nullptr;

    IBuilder* builder = createInferBuilder(gLogger);
    if (builder == nullptr) {
        std::cerr << "Error: Failed to create TensorRT builder" << std::endl;
        return;
    }

    IBuilderConfig* config = builder->createBuilderConfig();
    if (config == nullptr) {
        std::cerr << "Error: Failed to create TensorRT builder config" << std::endl;
        builder->destroy();
        return;
    }

    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    if (engine != nullptr) {
        (*modelStream) = engine->serialize();
        engine->destroy();
    } else {
        std::cerr << "Error: Failed to build TensorRT engine" << std::endl;
    }

    config->destroy();
    builder->destroy();
}

void inference(const std::string& engine_file, const std::string& image_file) {
    std::cout << "Running inference..." << std::endl;
    std::ifstream file(engine_file, std::ios::binary);
    if (!file.good()) {
        std::cerr << "Error: Engine file not found" << std::endl;
        return;
    }
    file.seekg(0, file.end);
    size_t size = file.tellg();
    file.seekg(0, file.beg);
    char* trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Load image
    cv::Mat img = cv::imread(image_file);
    if (img.empty()) {
        std::cerr << "Error: Image not found" << std::endl;
        return;
    }
    cv::resize(img, img, cv::Size(INPUT_W, INPUT_H));
    img.convertTo(img, CV_32F);

    // Normalize (Mean [0.485, 0.456, 0.406], Std [0.229, 0.224, 0.225])
    // OpenCV is BGR. Pytorch expects RGB.
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    img /= 255.0;

    float mean[] = {0.485, 0.456, 0.406};
    float std[] = {0.229, 0.224, 0.225};

    // HWC -> NCHW and Normalize
    float* hostData = new float[3 * INPUT_H * INPUT_W];
    for (int h = 0; h < INPUT_H; ++h) {
        for (int w = 0; w < INPUT_W; ++w) {
            for (int c = 0; c < 3; ++c) {
                float val = img.at<cv::Vec3f>(h, w)[c];
                hostData[c * INPUT_H * INPUT_W + h * INPUT_W + w] = (val - mean[c]) / std[c];
            }
        }
    }

    void* deviceData;
    cudaMalloc(&deviceData, 3 * INPUT_H * INPUT_W * sizeof(float));
    cudaMemcpy(deviceData, hostData, 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice);

    // Output buffer
    // Determine output size.
    int outputSize = 1000;  // Default ImageNet
    // Check binding dimensions
    int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
    Dims outDims = engine->getBindingDimensions(outputIndex);
    // outputSize = outDims.d[1];

    float* hostOutput = new float[outputSize];
    void* deviceOutput;
    cudaMalloc(&deviceOutput, outputSize * sizeof(float));

    void* bindings[] = {deviceData, deviceOutput};

    // Execute
    context->executeV2(bindings);

    // Copy back
    cudaMemcpy(hostOutput, deviceOutput, outputSize * sizeof(float), cudaMemcpyDeviceToHost);

    // Softmax and Argmax
    float maxVal = -1e9;
    int maxIdx = -1;
    for (int i = 0; i < outputSize; ++i) {
        if (hostOutput[i] > maxVal) {
            maxVal = hostOutput[i];
            maxIdx = i;
        }
    }
    std::cout << "Predicted Class: " << maxIdx << " (Score: " << maxVal << ")" << std::endl;

    cudaFree(deviceData);
    cudaFree(deviceOutput);
    delete[] hostData;
    delete[] hostOutput;
    delete context;
    delete engine;
    delete runtime;
}

int main(int argc, char** argv) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <wts_path> <engine_path> [config_path]" << std::endl;
        std::cerr << "Example: " << argv[0] << " convnextv2.wts convnextv2.engine config.yaml" << std::endl;
        return -1;
    }

    g_wts_path = argv[1];
    g_engine_path = argv[2];
    std::string config_path = (argc >= 4) ? argv[3] : "config.yaml";
    g_config = loadConfig(config_path);

    // Register Plugin manually if needed
    auto* lnCreator = new LayerNormPluginCreator();
    getPluginRegistry()->registerCreator(*lnCreator, "");

    // Generate engine
    IHostMemory* modelStream{nullptr};
    APIToModel(1, &modelStream);
    assert(modelStream != nullptr);
    std::ofstream p(g_engine_path, std::ios::binary);
    if (!p) {
        std::cerr << "Could not open plan output file" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    modelStream->destroy();
    std::cout << "Engine generated successfully: " << g_engine_path << std::endl;

    return 0;
}


================================================
FILE: convnextv2/src/inference_cpp.cpp
================================================
#include <cuda_runtime_api.h>
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <vector>
#include "LayerNormPlugin.h"
#include "NvInfer.h"
#include "logging.h"

using namespace nvinfer1;

// Logger instance handed to the TensorRT runtime.
static Logger gLogger;

// Reads class labels, one per line, from `label_file`.
//
// Returns the labels in file order, or an empty vector when the file cannot be
// opened. A trailing '\r' is removed from each line so label files with Windows
// (CRLF) line endings yield clean names.
std::vector<std::string> load_imagenet_labels(const std::string& label_file = "imagenet_classes.txt") {
    std::vector<std::string> labels;
    std::ifstream file(label_file);
    if (!file.is_open()) {
        return labels;
    }
    std::string line;
    while (std::getline(file, line)) {
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }
        labels.push_back(line);
    }
    return labels;
}

static const char* INPUT_BLOB_NAME = "data";
static const char* OUTPUT_BLOB_NAME = "prob";

void inference(const std::string& engine_file, const std::string& image_file,
               const std::string& label_file = "imagenet_classes.txt") {
    std::cout << "Running inference..." << std::endl;

    // Register LayerNorm plugin
    static LayerNormPluginCreator pluginCreator;
    getPluginRegistry()->registerCreator(pluginCreator, "");

    std::ifstream file(engine_file, std::ios::binary);
    if (!file.good()) {
        std::cerr << "Error: Engine file not found: " << engine_file << std::endl;
        return;
    }
    file.seekg(0, file.end);
    size_t size = file.tellg();
    file.seekg(0, file.beg);
    char* trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Determine dimensions from engine
    int inputIndex = -1;
    int outputIndex = -1;
    for (int i = 0; i < engine->getNbBindings(); ++i) {
        if (engine->bindingIsInput(i)) {
            inputIndex = i;
        } else {
            outputIndex = i;
        }
    }

    if (inputIndex == -1 || outputIndex == -1) {
        std::cerr << "Error: Could not find input or output bindings in engine." << std::endl;
        return;
    }

    Dims inputDims = engine->getBindingDimensions(inputIndex);
    Dims outputDims = engine->getBindingDimensions(outputIndex);

    // Assuming NCHW format for input
    int input_h = inputDims.d[2];
    int input_w = inputDims.d[3];
    int input_c = inputDims.d[1];  // Usually 3

    // Assuming N x NumClasses or just NumClasses
    int outputSize = 1;
    for (int i = 0; i < outputDims.nbDims; ++i) {
        // Skip batch dimension if it is dynamic (-1) or 1
        if (i == 0 && (outputDims.d[i] == -1 || outputDims.d[i] == 1))
            continue;
        outputSize *= outputDims.d[i];
    }

    std::cout << "Input Dimensions: " << input_c << "x" << input_h << "x" << input_w << std::endl;
    std::cout << "Output Size: " << outputSize << std::endl;

    // Load image
    cv::Mat img = cv::imread(image_file);
    if (img.empty()) {
        std::cerr << "Error: Image not found: " << image_file << std::endl;
        return;
    }
    cv::resize(img, img, cv::Size(input_w, input_h));
    img.convertTo(img, CV_32F);

    // Normalize (Mean [0.485, 0.456, 0.406], Std [0.229, 0.224, 0.225])
    // OpenCV is BGR. Pytorch expects RGB.
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    img /= 255.0;

    float mean[] = {0.485, 0.456, 0.406};
    float std[] = {0.229, 0.224, 0.225};

    // HWC -> NCHW and Normalize
    float* hostData = new float[input_c * input_h * input_w];
    for (int h = 0; h < input_h; ++h) {
        for (int w = 0; w < input_w; ++w) {
            for (int c = 0; c < input_c; ++c) {
                float val = img.at<cv::Vec3f>(h, w)[c];
                hostData[c * input_h * input_w + h * input_w + w] = (val - mean[c]) / std[c];
            }
        }
    }

    void* deviceData;
    cudaMalloc(&deviceData, input_c * input_h * input_w * sizeof(float));
    cudaMemcpy(deviceData, hostData, input_c * input_h * input_w * sizeof(float), cudaMemcpyHostToDevice);

    // Output buffer
    float* hostOutput = new float[outputSize];
    void* deviceOutput;
    cudaMalloc(&deviceOutput, outputSize * sizeof(float));

    void* bindings[] = {deviceData, deviceOutput};
    if (engine->getBindingIndex(INPUT_BLOB_NAME) != 0) {
        bindings[inputIndex] = deviceData;
        bindings[outputIndex] = deviceOutput;
    }

    // Execute
    context->executeV2(bindings);

    // Copy back
    cudaMemcpy(hostOutput, deviceOutput, outputSize * sizeof(float), cudaMemcpyDeviceToHost);

    // Argmax
    float maxVal = -1e9;
    int maxIdx = -1;
    for (int i = 0; i < outputSize; ++i) {
        if (hostOutput[i] > maxVal) {
            maxVal = hostOutput[i];
            maxIdx = i;
        }
    }

    auto labels = load_imagenet_labels(label_file);
    if (!labels.empty() && maxIdx < static_cast<int>(labels.size())) {
        std::cout << "Predicted Class: " << maxIdx << " - " << labels[maxIdx] << " (Score: " << maxVal << ")"
                  << std::endl;
    } else {
        std::cout << "Predicted Class: " << maxIdx << " (Score: " << maxVal << ")" << std::endl;
    }

    cudaFree(deviceData);
    cudaFree(deviceOutput);
    delete[] hostData;
    delete[] hostOutput;
    delete context;
    delete engine;
    delete runtime;
}

// Command-line entry point: checks the argument count, picks the label file
// (third argument, or the default list when it is omitted) and runs inference().
int main(int argc, char** argv) {
    const bool argCountOk = (argc >= 3) && (argc <= 4);
    if (!argCountOk) {
        char* prog = argv[0];
        std::cerr << "Usage: " << prog << " <engine_path> <image_path> [label_file]" << std::endl;
        std::cerr << "Example: " << prog << " convnextv2.engine images/test.jpg" << std::endl;
        std::cerr << "         " << prog << " convnextv2.engine images/test.jpg custom_labels.txt" << std::endl;
        return -1;
    }

    std::string engine_path(argv[1]);
    std::string image_path(argv[2]);

    // The label file is optional; fall back to the default name when absent.
    std::string label_file("imagenet_classes.txt");
    if (argc == 4) {
        label_file = argv[3];
    }

    inference(engine_path, image_path, label_file);
    return 0;
}


================================================
FILE: convnextv2/src/logging.h
================================================
#ifndef LOGGING_H
#define LOGGING_H

#include <NvInfer.h>
#include <iostream>

using namespace nvinfer1;

// Minimal ILogger implementation that prints TensorRT messages to the console.
//
// Messages whose severity value is greater than `reportableSeverity` are dropped
// (the same comparison the original code used). Internal errors, errors and
// warnings go to std::cerr; everything else goes to std::cout. The severity
// prefix and the message text are always written to the SAME stream so that a
// line is never split between stdout and stderr.
class Logger : public ILogger {
   public:
    Logger(Severity severity = Severity::kINFO) : reportableSeverity(severity) {}

    // Called by TensorRT for every log message.
    //   severity - severity of this message
    //   msg      - message text; a null pointer is printed as an empty message
    void log(Severity severity, const char* msg) noexcept override {
        if (severity > reportableSeverity)
            return;

        std::ostream* out = &std::cout;
        const char* prefix = "VERBOSE: ";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                out = &std::cerr;
                prefix = "INTERNAL_ERROR: ";
                break;
            case Severity::kERROR:
                out = &std::cerr;
                prefix = "ERROR: ";
                break;
            case Severity::kWARNING:
                out = &std::cerr;
                prefix = "WARNING: ";
                break;
            case Severity::kINFO:
                prefix = "INFO: ";
                break;
            default:
                break;
        }
        // Streaming a null const char* would put the stream into a failed state.
        *out << prefix << (msg != nullptr ? msg : "") << std::endl;
    }

    // Least severe level that is still printed.
    Severity reportableSeverity;
};

#endif


================================================
FILE: crnn/CMakeLists.txt
================================================
# Build script for the crnn sample: a single executable built from crnn.cpp and
# linked against TensorRT (nvinfer), the CUDA runtime (cudart) and OpenCV.
cmake_minimum_required(VERSION 2.6)

project(crnn)

# C++11 is requested twice: as a raw compiler flag here and via CMAKE_CXX_STANDARD below.
add_definitions(-std=c++11)

# NOTE(review): option() is documented as option(<variable> "<help_text>" [value]);
# here OFF sits in the help-text position - confirm this is what was intended.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is forced to Debug for every configuration of this project.
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# Hard-coded CUDA header/library locations, selected by target architecture.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message("embed_platform on")
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
    message("embed_platform off")
    include_directories(/usr/local/cuda/include)
    link_directories(/usr/local/cuda/lib64)
endif()

# OpenCV is not marked REQUIRED, so configuration continues even if it is not found.
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(crnn ${PROJECT_SOURCE_DIR}/crnn.cpp)
target_link_libraries(crnn nvinfer)
target_link_libraries(crnn cudart)
target_link_libraries(crnn ${OpenCV_LIBS})

# Extra compiler flags (optimization level and pthread support).
add_definitions(-O2 -pthread)


================================================
FILE: crnn/README.md
================================================
# crnn

The PyTorch implementation is [meijieru/crnn.pytorch](https://github.com/meijieru/crnn.pytorch).

## How to Run

```
1. generate crnn.wts from pytorch

git clone https://github.com/wang-xinyu/tensorrtx.git
git clone https://github.com/meijieru/crnn.pytorch.git
// download its weights 'crnn.pth'
// copy tensorrtx/crnn/genwts.py into crnn.pytorch/
// go to crnn.pytorch/
python genwts.py
// a file 'crnn.wts' will be generated.

2. build tensorrtx/crnn and run

// put crnn.wts into tensorrtx/crnn
// go to tensorrtx/crnn
mkdir build
cd build
cmake ..
make
sudo ./crnn -s  // serialize model to plan file i.e. 'crnn.engine'
// copy crnn.pytorch/data/demo.png here
sudo ./crnn -d  // deserialize plan file and run inference

3. check the output as follows:

raw: a-----v--a-i-l-a-bb-l-e---
sim: available

```

## More Information

See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)

## Acknowledgment

Thanks for the donation for this crnn tensorrt implementation from @雍.


================================================
FILE: crnn/crnn.cpp
================================================
#include <iostream>
#include <chrono>
#include <map>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"

// CHECK(status): evaluates `status` exactly once; if the result is non-zero it
// prints "Cuda failure: <value>" to std::cerr and calls abort().
// The do { ... } while (0) wrapper makes the macro behave like a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

#define USE_FP16  // comment out this if want to use FP32
#define DEVICE 0  // GPU id
#define BATCH_SIZE 1

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 32;
static const int INPUT_W = 100;
// main() reads the output as 26 consecutive groups of 37 scores
// (37 is also the length of `alphabet` below).
static const int OUTPUT_SIZE = 26 * 37;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
static Logger gLogger;

// Per-convolution settings, indexed by the layer number passed to convRelu():
// ks = kernel size, ps = padding, ss = stride, nm = number of output channels.
const int ks[] = {3, 3, 3, 3, 3, 3, 2};
const int ps[] = {1, 1, 1, 1, 1, 1, 0};
const int ss[] = {1, 1, 1, 1, 1, 1, 1};
const int nm[] = {64, 128, 256, 256, 512, 512, 512};
// Class index -> character table used by strDecode(); index 0 ('-') is the
// symbol that strDecode() drops in its non-raw mode.
const std::string alphabet = "-0123456789abcdefghijklmnopqrstuvwxyz";

using namespace nvinfer1;

// Turns a sequence of class indices into text using `alphabet`.
//   raw == true  : every index is mapped to its character, nothing is removed.
//   raw == false : index 0 is skipped, and so is any index equal to the one
//                  immediately before it in `preds`.
std::string strDecode(std::vector<int>& preds, bool raw) {
    std::string decoded;
    const size_t total = preds.size();
    for (size_t pos = 0; pos < total; ++pos) {
        const int cls = preds[pos];
        if (!raw) {
            const bool isBlank = (cls == 0);
            const bool repeatsPrevious = (pos > 0) && (preds[pos - 1] == cls);
            if (isBlank || repeatsPrevious) {
                continue;
            }
        }
        decoded.push_back(alphabet[cls]);
    }
    return decoded;
}

// Loads a .wts weight file into a name -> Weights map.
//
// File layout as parsed below: the first token is the number of blobs; each blob
// is "<name> <count>" (count in decimal) followed by <count> 32-bit values
// written in hexadecimal. Every blob is stored as DataType::kFLOAT.
//
// The value buffers are allocated with malloc() and are owned by the caller,
// who must free() them once the network has been built.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The element size is sizeof(uint32_t); the previous
        // sizeof(val) measured the pointer and allocated more than needed.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer equivalent to an inference-time batch norm.
//
// Reads "<lname>.weight", ".bias", ".running_mean" and ".running_var" from
// weightMap and folds them into
//     scale = gamma / sqrt(var + eps)
//     shift = beta - mean * gamma / sqrt(var + eps)
//     power = 1
// The three computed buffers are malloc()'ed and registered in weightMap under
// "<lname>.scale", ".shift" and ".power".
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* gamma = (const float*)weightMap[lname + ".weight"].values;
    const float* beta = (const float*)weightMap[lname + ".bias"].values;
    const float* mean = (const float*)weightMap[lname + ".running_mean"].values;
    const float* var = (const float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    const size_t bytes = sizeof(float) * len;
    float* scval = reinterpret_cast<float*>(malloc(bytes));
    float* shval = reinterpret_cast<float*>(malloc(bytes));
    float* pval = reinterpret_cast<float*>(malloc(bytes));

    // One pass fills all three per-channel arrays.
    for (int ch = 0; ch < len; ++ch) {
        const auto denom = sqrt(var[ch] + eps);
        scval[ch] = gamma[ch] / denom;
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / denom;
        pval[ch] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Keep the buffers reachable through the map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn);
    return bn;
}

// Adds convolution number `i` followed by a ReLU, optionally with a batch-norm
// scale layer in between (use_bn). Kernel size, stride, padding and output
// channel count come from the ks/ss/ps/nm tables at index `i`; the weights are
// looked up as "cnn.conv<i>.weight" / "cnn.conv<i>.bias". Returns the ReLU layer.
ILayer* convRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int i, bool use_bn = false) {
    const std::string idx = std::to_string(i);
    const std::string convKey = "cnn.conv" + idx;
    const Weights& kernel = weightMap[convKey + ".weight"];
    const Weights& bias = weightMap[convKey + ".bias"];

    IConvolutionLayer* conv = network->addConvolutionNd(input, nm[i], DimsHW{ks[i], ks[i]}, kernel, bias);
    assert(conv);
    conv->setStrideNd(DimsHW{ss[i], ss[i]});
    conv->setPaddingNd(DimsHW{ps[i], ps[i]});

    // The activation consumes either the convolution or the batch-norm output.
    ILayer* preActivation = conv;
    if (use_bn) {
        preActivation = addBatchNorm2d(network, weightMap, *conv->getOutput(0), "cnn.batchnorm" + idx, 1e-5);
    }

    auto relu = network->addActivation(*preActivation->getOutput(0), ActivationType::kRELU);
    assert(relu);
    return relu;
}

void splitLstmWeights(std::map<std::string, Weights>& weightMap, std::string lname) {
    int weight_size = weightMap[lname].count;
    for (int i = 0; i < 4; i++) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        wt.count = weight_size / 4;
        float *val = reinterpret_cast<float*>(malloc(sizeof(float) * wt.count));
        memcpy(val, (float*)weightMap[lname].values + wt.count * i, sizeof(float) * wt.count);
        wt.values = val;
        weightMap[lname + std::to_string(i)] = wt;
    }
}

// Adds a single-layer bidirectional LSTM (addRNNv2) with `nHidden` hidden units.
//
// For each of the eight blobs "<lname>.{weight,bias}_{ih,hh}_l0[_reverse]" the
// data is first split into four chunks (splitLstmWeights) and the chunks are
// assigned, in order, to the INPUT, FORGET, CELL and OUTPUT gates. Blobs without
// the "_reverse" suffix are set on layer index 0, the "_reverse" ones on index 1.
// "ih" blobs are passed with isW == true, "hh" blobs with isW == false.
ILayer* addLSTM(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int nHidden, std::string lname) {
    const char* const dirSuffix[2] = {"", "_reverse"};
    const char* const blobKind[4] = {".weight_ih_l0", ".weight_hh_l0", ".bias_ih_l0", ".bias_hh_l0"};
    const RNNGateType gateOrder[4] = {RNNGateType::kINPUT, RNNGateType::kFORGET, RNNGateType::kCELL,
                                      RNNGateType::kOUTPUT};

    // Create the per-gate chunks for every blob, forward direction first.
    for (int dir = 0; dir < 2; ++dir) {
        for (int kind = 0; kind < 4; ++kind) {
            splitLstmWeights(weightMap, lname + blobKind[kind] + dirSuffix[dir]);
        }
    }

    Dims dims = input.getDimensions();
    std::cout << "lstm input shape: " << dims.nbDims << " [" << dims.d[0] << " " << dims.d[1] << " " << dims.d[2] << "]"<< std::endl;
    auto lstm = network->addRNNv2(input, 1, nHidden, dims.d[1], RNNOperation::kLSTM);
    lstm->setDirection(RNNDirection::kBIDIRECTION);

    for (int dir = 0; dir < 2; ++dir) {
        for (int kind = 0; kind < 4; ++kind) {
            const bool isBias = (kind >= 2);      // entries 2 and 3 of blobKind are biases
            const bool isInput = (kind % 2 == 0); // "ih" blobs come before "hh" blobs
            const std::string base = lname + blobKind[kind] + dirSuffix[dir];
            for (int gate = 0; gate < 4; ++gate) {
                const Weights& chunk = weightMap[base + std::to_string(gate)];
                if (isBias) {
                    lstm->setBiasForGate(dir, gateOrder[gate], isInput, chunk);
                } else {
                    lstm->setWeightsForGate(dir, gateOrder[gate], isInput, chunk);
                }
            }
        }
    }
    return lstm;
}

// Create the engine using only the API and not any parser.
//
// Builds the CRNN network (convolution stack -> two bidirectional LSTM blocks,
// each followed by a fully connected layer), loads its weights from
// "../crnn.wts" and builds a CUDA engine from it.
//
//   maxBatchSize - forwarded to IBuilder::setMaxBatchSize
//   builder      - builder used to create the network and the engine
//   config       - builder configuration; workspace size and FP16 flag are set here
//   dt           - data type of the network input tensor
//
// Returns the built engine, or nullptr if the build failed. The network
// definition and all host weight buffers are released before returning.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {1, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../crnn.wts");

    // cnn
    auto x = convRelu(network, weightMap, *data, 0);
    auto p = network->addPoolingNd(*x->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    p->setStrideNd(DimsHW{2, 2});
    x = convRelu(network, weightMap, *p->getOutput(0), 1);
    p = network->addPoolingNd(*x->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    p->setStrideNd(DimsHW{2, 2});
    x = convRelu(network, weightMap, *p->getOutput(0), 2, true);
    x = convRelu(network, weightMap, *x->getOutput(0), 3);
    p = network->addPoolingNd(*x->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    p->setStrideNd(DimsHW{2, 1});
    p->setPaddingNd(DimsHW{0, 1});
    x = convRelu(network, weightMap, *p->getOutput(0), 4, true);
    x = convRelu(network, weightMap, *x->getOutput(0), 5);
    p = network->addPoolingNd(*x->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    p->setStrideNd(DimsHW{2, 1});
    p->setPaddingNd(DimsHW{0, 1});
    x = convRelu(network, weightMap, *p->getOutput(0), 6, true);

    // Reorder the feature map axes before feeding it to the recurrent part.
    auto sfl = network->addShuffle(*x->getOutput(0));
    sfl->setFirstTranspose(Permutation{1, 2, 0});

    // rnn
    auto lstm0 = addLSTM(network, weightMap, *sfl->getOutput(0), 256, "rnn.0.rnn");
    auto sfl0 = network->addShuffle(*lstm0->getOutput(0));
    sfl0->setReshapeDimensions(Dims4{26, 1, 1, 512});
    auto fc0 = network->addFullyConnected(*sfl0->getOutput(0), 256, weightMap["rnn.0.embedding.weight"], weightMap["rnn.0.embedding.bias"]);

    sfl = network->addShuffle(*fc0->getOutput(0));
    sfl->setFirstTranspose(Permutation{2, 3, 0, 1});
    sfl->setReshapeDimensions(Dims3{1, 26, 256});

    auto lstm1 = addLSTM(network, weightMap, *sfl->getOutput(0), 256, "rnn.1.rnn");
    auto sfl1 = network->addShuffle(*lstm1->getOutput(0));
    sfl1->setReshapeDimensions(Dims4{26, 1, 1, 512});
    auto fc1 = network->addFullyConnected(*sfl1->getOutput(0), 37, weightMap["rnn.1.embedding.weight"], weightMap["rnn.1.embedding.bias"]);
    Dims dims = fc1->getOutput(0)->getDimensions();
    std::cout << "fc1 shape " << dims.d[0] << " " << dims.d[1] << " " << dims.d[2] << std::endl;

    fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*fc1->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only claim success when an engine was actually produced.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the CRNN engine and serializes it.
//
//   maxBatchSize - maximum batch size the engine is built for
//   modelStream  - out parameter; receives the serialized engine. The caller
//                  owns the returned IHostMemory and must destroy() it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down (the builder config was previously leaked)
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one inference on `stream` and blocks until the result is on the host.
//
//   context   - execution context used to enqueue the inference
//   stream    - CUDA stream used for both copies and the inference
//   buffers   - device buffers; buffers[0] receives the input, buffers[1] holds the output
//   input     - host input, batchSize * 1 * INPUT_H * INPUT_W floats
//   output    - host output, batchSize * OUTPUT_SIZE floats
//   batchSize - number of samples in the batch
//
// Any CUDA error, or a failed enqueue, aborts the process (same policy as CHECK).
void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 1 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // Without a successful enqueue the output buffer would hold garbage.
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // Asynchronous errors surface here, so the status must be checked as well.
    CHECK(cudaStreamSynchronize(stream));
}

// Entry point of the crnn sample.
//
//   ./crnn -s   build the engine from ../crnn.wts and write it to crnn.engine
//   ./crnn -d   load crnn.engine, run inference on demo.png and print the
//               decoded text ("raw" and de-duplicated "sim" form)
//
// Returns 0 on success and -1 on a usage error or when a required file
// (crnn.engine, demo.png) cannot be read.
int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};
    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("crnn.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 2 && std::string(argv[1]) == "-d") {
        std::ifstream file("crnn.engine", std::ios::binary);
        if (!file.good()) {
            // Without this check a null/empty buffer would be handed to the runtime.
            std::cerr << "could not read crnn.engine, run './crnn -s' first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./crnn -s  // serialize model to plan file" << std::endl;
        std::cerr << "./crnn -d  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 1 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 1 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;
    static float prob[BATCH_SIZE * OUTPUT_SIZE];

    // Load and preprocess the image before any GPU/TensorRT object exists, so
    // that a missing image does not leave those resources unreleased.
    cv::Mat img = cv::imread("demo.png");
    if (img.empty()) {
        std::cerr << "demo.png not found !!!" << std::endl;
        delete[] trtModelStream;
        return -1;
    }
    cv::cvtColor(img, img, cv::COLOR_BGR2GRAY);
    cv::resize(img, img, cv::Size(INPUT_W, INPUT_H));
    // Map pixel values from [0, 255] to [-1, 1].
    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
        data[i] = ((float)img.at<uchar>(i) / 255.0 - 0.5) * 2.0;
    }

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    assert(engine->getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 1 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Run inference
    auto start = std::chrono::system_clock::now();
    doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    // Pick the highest-scoring class among the 37 scores of each of the 26 groups.
    std::vector<int> preds;
    for (int i = 0; i < 26; i++) {
        int maxj = 0;
        for (int j = 1; j < 37; j++) {
            if (prob[37 * i + j] > prob[37 * i + maxj]) maxj = j;
        }
        preds.push_back(maxj);
    }
    std::cout << "raw: " << strDecode(preds, true) << std::endl;
    std::cout << "sim: " << strDecode(preds, false) << std::endl;

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: crnn/genwts.py
================================================
import torch
from torch.autograd import Variable
import utils
import models.crnn as crnn
import struct

# Export the weights of a pretrained CRNN checkpoint to the plain-text
# 'crnn.wts' format: first line = number of tensors, then one line per tensor
# "<name> <element count> <hex float32> <hex float32> ...".
model_path = './data/crnn.pth'

use_cuda = torch.cuda.is_available()

# NOTE(review): constructor arguments are presumably (imgH, nc, nclass, nh) -
# confirm against models/crnn.py.
model = crnn.CRNN(32, 1, 37, 256)
if use_cuda:
    model = model.cuda()
print('loading pretrained model from %s' % model_path)
# map_location lets a checkpoint that was saved from GPU tensors be loaded on a
# machine without CUDA; with CUDA available the default behavior is kept.
model.load_state_dict(torch.load(model_path, map_location=None if use_cuda else 'cpu'))

# Dummy all-ones input, used only to run one forward pass as a sanity check.
image = torch.ones(1, 1, 32, 100)
if use_cuda:
    image = image.cuda()

model.eval()
print(model)
print('image shape ', image.shape)
preds = model(image)

state_dict = model.state_dict()
# 'with' guarantees the file is flushed and closed even if an export step fails.
with open("crnn.wts", 'w') as f:
    f.write("{}\n".format(len(state_dict)))
    for k, v in state_dict.items():
        print('key: ', k)
        print('value: ', v.shape)
        vr = v.reshape(-1).cpu().numpy()
        f.write("{} {}".format(k, len(vr)))
        for vv in vr:
            f.write(" ")
            # big-endian float32, written as 8 hex digits
            f.write(struct.pack(">f", float(vv)).hex())
        f.write("\n")


================================================
FILE: crnn/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes its contents to a target
//!  stream as "[MM/DD/YYYY-HH:MM:SS] <prefix><text>".
//!
//! Nothing is written while logging is disabled (see setShouldLog()).
//! The target stream is held by reference and must outlive this object.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    Stream that receives the formatted output.
    //! \param prefix    Text inserted between the timestamp and the message.
    //! \param shouldLog Whether output is emitted at all.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, prefix and logging flag of \p other.
    //! (Previously the prefix was dropped and the flag was left uninitialized.)
    //! Pending buffer contents of \p other are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the target stream, then
    //! clears the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // Build the timestamp in a local stream: it must go to the same
            // destination as the message, and formatting it locally leaves the
            // fill character of the shared target stream untouched.
            std::ostringstream stamp;
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                stamp << std::setfill('0') << "[";
                stamp << std::setw(2) << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << tm_local->tm_mday << "/";
                stamp << std::setw(4) << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << tm_local->tm_hour << ":";
                stamp << std::setw(2) << tm_local->tm_min << ":";
                stamp << std::setw(2) << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables emission of output.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
//! Holds the buffer as its only member, so a class that lists this type as its
//! first base has a fully constructed buffer before later bases are constructed.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    // Accessible to derived classes, which pass its address to std::ostream.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief std::ostream front-end for log messages of one fixed severity.
//!  Text streamed into this object is collected by the LogStreamConsumerBuffer held in the
//!  LogStreamConsumerBase base and emitted with a severity prefix.
//!  The base class order matters: LogStreamConsumerBase must come before std::ostream so that the
//!  buffer already exists when its address is handed to the std::ostream constructor.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  Output is enabled only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a fresh consumer with the same severity and logging flag as \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer emits output for the given threshold.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! Severities >= kINFO are written to std::cout, all others to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Short bracketed tag for each severity level.
    static std::string severityPrefix(Severity severity)
    {
        std::string tag;
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Construct a logger that reports messages of the given severity or more severe
    //!
    //! \param severity The initial reportable severity (defaults to Severity::kWARNING)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The override is declared noexcept: newer TensorRT releases declare ILogger::log() noexcept, and a
    //! noexcept override is also valid against a base declaration without that specifier.
    //!
    //! \param severity The severity of the message
    //! \param msg The message text; a null pointer is logged as an empty message
    //!
    void log(Severity severity, const char* msg) noexcept override
    {
        try
        {
            // Constructing a std::string from a null pointer is undefined behaviour, so substitute "".
            LogStreamConsumer(mReportableSeverity, severity)
                << "[TRT] " << std::string(msg != nullptr ? msg : "") << std::endl;
        }
        catch (...)
        {
            // Deliberately swallowed: a failure while logging must not escape a noexcept callback.
        }
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set once reportTestStart() has been called for this atom
        std::string mName;    //!< Test name printed in result lines
        std::string mCmdline; //!< Command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Verify the precondition before emitting anything, so a repeated start is not reported as RUNNING.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently used for filtering messages
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>".
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; //!< Threshold passed to every LogStreamConsumer created by this logger
};

namespace
{

//!
//! \brief Build a LogStreamConsumer for messages of severity kVERBOSE, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Build a LogStreamConsumer for messages of severity kINFO, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Build a LogStreamConsumer for messages of severity kWARNING, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Build a LogStreamConsumer for messages of severity kERROR, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Build a LogStreamConsumer for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: csrnet/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.10)

project(csrnet)

add_definitions(-DAPI_EXPORTS)

# option() takes (<variable> "<help text>" [value]); without the help text the
# word OFF would be interpreted as the description instead of the default.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)

# Request plain C++11 through the standard variables rather than pushing a
# compiler flag through add_definitions(), which is meant for -D definitions.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Default to a Debug build (csrnet.cpp relies on assert() for its error
# checks), but let the user override it with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

# cuda
include_directories(/usr/local/cuda/targets/x86_64-linux/include)
link_directories(/usr/local/cuda/targets/x86_64-linux/lib)

# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# opencv (csrnet.cpp cannot be built without it, so fail at configure time)
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

# config.h and logging.h are included with angle brackets from the project root
include_directories(${PROJECT_SOURCE_DIR}/)

add_executable(csrnet csrnet.cpp)
target_link_libraries(csrnet nvinfer cudart ${OpenCV_LIBS})

================================================
FILE: csrnet/README.md
================================================
# csrnet

The PyTorch implementation is [leeyeehoo/CSRNet-pytorch](https://github.com/leeyeehoo/CSRNet-pytorch).

This repo is a TensorRT implementation of CSRNet.

paper : [CSRNet: Dilated Convolutional Neural Networks for Understanding the Highly Congested Scenes](https://arxiv.org/abs/1802.10062)

Dev environment:
- Ubuntu 22.04
- TensorRT 8.6
- OpenCV 4.5.4
- CMake 3.24
- GPU Driver 535.113.01
- CUDA 12.2
- RTX3080


# how to run

```bash
1. generate csrnet wts
git clone https://github.com/leeyeehoo/CSRNet-pytorch.git
git clone https://github.com/wang-xinyu/tensorrtx.git
// copy gen_wts.py to CSRNet-pytorch
// generate wts file
python gen_wts.py
// csrnet wts will be generated in CSRNet-pytorch

2. build csrnet.engine
// mv CSRNet-pytorch/csrnet.wts to tensorrtx/csrnet
mv CSRNet-pytorch/csrnet.wts tensorrtx/csrnet
// build
mkdir build
cd build
cmake ..
make
sudo ./csrnet -s  ./csrnet.wts

Loading weights: ./csrnet.wts
build engine successfully : ./csrnet.engine

// download images https://github.com/wang-xinyu/tensorrtx/assets/46584679/46bc4def-e573-44ae-996d-5d68927c78ff and copy to images
sudo ./csrnet -d  ./images

// output e.g
// enqueueV2 time: 0.0323869s
// detect time:44ms
// people num :22.9101 write_path: ../images/data.jpg
```


# result 

inference people num: 22.9101

<p align="center">
<img src= https://raw.githubusercontent.com/wang-xinyu/tensorrtx/dbf857d25f77bf64113fc99a745ccf4973bdd44e/Density_Plot.jpg>
</p>


================================================
FILE: csrnet/config.h
================================================
#pragma once

// Tensor names used when the network is defined and when bindings are looked
// up at inference time, plus the path of the serialized engine.
static const char *kInputTensorName = "data";
static const char *kOutputTensorName = "prob";
static const char *kEngineFile = "./csrnet.engine";

static const int kBatchSize = 1;

// Spatial limits of the dynamic-shape optimisation profile, in pixels.
static const int MAX_INPUT_SIZE = 1440; // 32x
static const int MIN_INPUT_SIZE = 608;
static const int OPT_INPUT_W = 1152;
static const int OPT_INPUT_H = 640;

// Host buffer capacities (in floats) for the largest supported image:
// three input planes, and one output plane with 1/64 of the input pixels.
static constexpr int kMaxInputImageSize = 3 * MAX_INPUT_SIZE * MAX_INPUT_SIZE;
static constexpr int kMaxOutputProbSize =
    (MAX_INPUT_SIZE * MAX_INPUT_SIZE) / 64;

================================================
FILE: csrnet/csrnet.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <chrono>
#include <config.h>
#include <cstring>
#include <dirent.h>
#include <fstream>
#include <iostream>
#include <logging.h>
#include <map>
#include <numeric>
#include <opencv2/opencv.hpp>
#include <vector>
using namespace nvinfer1;

// Evaluate `status` exactly once; if the result is non-zero, print it to
// stderr and abort the process. Wrapped in do/while(0) so the macro behaves
// like a single statement.
#define CHECK(status)                                                          \
  do {                                                                         \
    const auto cuda_status_ = (status);                                        \
    if (cuda_status_ == 0) {                                                   \
      break;                                                                   \
    }                                                                          \
    std::cerr << "Cuda failure: " << cuda_status_ << std::endl;                \
    abort();                                                                   \
  } while (0)

// File-local logger instance passed to createInferBuilder() and
// createInferRuntime() below.
static Logger gLogger;
// Path of the .wts weight file. main() points it at argv[2] in "-s" mode and
// createEngine() passes it to loadWeights(). Declared pointer-to-const because
// it is initialised from a string literal, which must not be bound to a
// non-const char pointer in C++11.
static const char *kWTSFile = "";
// Parse a plain-text .wts file into a name -> Weights map.
//
// Layout as read here: a decimal blob count, then for every blob its name, a
// decimal element count and that many hexadecimal 32-bit words. Every blob is
// stored as DataType::kFLOAT.
//
// The value buffers are allocated with malloc(); the caller owns them and must
// free() each `values` pointer once TensorRT no longer needs the weights.
//
// Errors (unreadable file, bad count, malformed blob, failed allocation) are
// reported through assert(), matching the rest of this file.
std::map<std::string, Weights> loadWeights(const std::string file) {
  std::cout << "Loading weights: " << file << std::endl;
  std::map<std::string, Weights> weightMap;

  // Open weights file
  std::ifstream input(file);
  assert(input.is_open() && "Unable to load weight file.");

  // Read number of weight blobs
  int32_t count;
  input >> count;
  assert(count > 0 && "Invalid weight map file.");

  while (count--) {
    Weights wt{DataType::kFLOAT, nullptr, 0};
    uint32_t size;

    // Read name and element count of the blob
    std::string name;
    input >> name >> std::dec >> size;
    wt.type = DataType::kFLOAT;

    // Allocate one 32-bit word per element. (The element size is what matters
    // here, not the size of the pointer that will hold the buffer.)
    uint32_t *val =
        reinterpret_cast<uint32_t *>(malloc(sizeof(uint32_t) * size));
    // malloc(0) may legitimately return nullptr, so only a non-empty blob
    // without a buffer is an error.
    assert((val != nullptr || size == 0) &&
           "Out of memory while loading weights.");
    for (uint32_t x = 0, y = size; x < y; ++x) {
      input >> std::hex >> val[x];
    }
    // A truncated or corrupt file leaves the stream in a failed state.
    assert(!input.fail() && "Malformed weight file.");
    wt.values = val;

    wt.count = size;
    weightMap[name] = wt;
  }

  return weightMap;
}
// clang-format off
/*
CSRNet(
 (frontend): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): ReLU(inplace=True)
    (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (20): ReLU(inplace=True)
    (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (22): ReLU(inplace=True)
  )
  (backend): Sequential(
    (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2))
    (1): ReLU(inplace=True)
    (2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2))
    (3): ReLU(inplace=True)
    (4): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2))
    (5): ReLU(inplace=True)
    (6): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2))
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2))
    (11): ReLU(inplace=True)
  )
  (output_layer): Conv2d(64, 1, kernel_size=(1, 1), stride=(1, 1))
)
*/
// clang-format on
// Run one inference on a single image.
//
// context  execution context of the deserialized engine
// input    host buffer holding 3 * input_h * input_w floats (planar layout)
// output   host buffer that receives (input_h >> 3) * (input_w >> 3) floats
// input_h  image height in pixels
// input_w  image width in pixels
//
// Device buffers and the CUDA stream are created and released on every call.
// CUDA errors abort through CHECK(); a TensorRT failure (rejected shape or
// failed enqueue) is reported on stderr and leaves `output` untouched.
void doInference(IExecutionContext &context, float *input, float *output,
                 int input_h, int input_w) {
  const ICudaEngine &engine = context.getEngine();

  // Do the size arithmetic in size_t so large images cannot overflow int.
  const size_t input_size =
      static_cast<size_t>(3) * input_h * input_w * sizeof(float);
  // The network built in createEngine() contains three 2x2 / stride-2 max
  // pools, so each spatial dimension is divided by 8 (rounding down). Using
  // (h * w) >> 6 would over-count when h or w is not a multiple of 8.
  const size_t output_size = static_cast<size_t>(input_h >> 3) *
                             static_cast<size_t>(input_w >> 3) * sizeof(float);

  // Pointers to input and output device buffers to pass to engine.
  // Engine requires exactly IEngine::getNbBindings() number of buffers.
  assert(engine.getNbBindings() == 2);
  void *buffers[2];

  // In order to bind the buffers, we need to know the names of the input and
  // output tensors. Note that indices are guaranteed to be less than
  // IEngine::getNbBindings()
  const int inputIndex = engine.getBindingIndex(kInputTensorName);
  const int outputIndex = engine.getBindingIndex(kOutputTensorName);
  assert(inputIndex >= 0 && outputIndex >= 0);

  // The engine has a dynamic input shape; bail out before allocating anything
  // if the requested shape is rejected.
  if (!context.setBindingDimensions(inputIndex,
                                    Dims4(1, 3, input_h, input_w))) {
    std::cerr << "setBindingDimensions failed for input " << input_h << "x"
              << input_w << std::endl;
    return;
  }

  // Create GPU buffers on device
  CHECK(cudaMalloc(&buffers[inputIndex], input_size));
  CHECK(cudaMalloc(&buffers[outputIndex], output_size));

  // Create stream
  cudaStream_t stream;
  CHECK(cudaStreamCreate(&stream));

  // DMA input batch data to device, infer on the batch asynchronously, and DMA
  // output back to host
  CHECK(cudaMemcpyAsync(buffers[inputIndex], input, input_size,
                        cudaMemcpyHostToDevice, stream));
  auto t1 = std::chrono::high_resolution_clock::now();
  const bool enqueued = context.enqueueV2(buffers, stream, nullptr);
  // Note: this measures only the time spent inside the enqueueV2() call.
  std::cout << "enqueueV2 time: "
            << std::chrono::duration<float>(
                   std::chrono::high_resolution_clock::now() - t1)
                   .count()
            << "s" << std::endl;
  if (enqueued) {
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size,
                          cudaMemcpyDeviceToHost, stream));
  } else {
    std::cerr << "enqueueV2 failed" << std::endl;
  }
  CHECK(cudaStreamSynchronize(stream));

  // Release stream and buffers
  CHECK(cudaStreamDestroy(stream));
  CHECK(cudaFree(buffers[inputIndex]));
  CHECK(cudaFree(buffers[outputIndex]));
}
ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder,
                          IBuilderConfig *config, DataType dt) {

  //   INetworkDefinition *network = builder->createNetworkV2(0U);
  const auto explicitBatch =
      1U << static_cast<uint32_t>(
          NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition *network = builder->createNetworkV2(explicitBatch);
  ITensor *data = network->addInput(kInputTensorName, dt, Dims4{1, 3, -1, -1});
  assert(data);
  std::map<std::string, Weights> weightMap = loadWeights(kWTSFile);

  IConvolutionLayer *conv1 = network->addConvolutionNd(
      *data, 64, DimsHW{3, 3}, weightMap["frontend.0.weight"],
      weightMap["frontend.0.bias"]);
  assert(conv1);
  conv1->setStrideNd(DimsHW{1, 1});
  conv1->setPaddingNd(DimsHW{1, 1});

  IActivationLayer *relu1 =
      network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);

  assert(relu1);

  auto conv2 = network->addConvolutionNd(*relu1->getOutput(0), 64, DimsHW{3, 3},
                                         weightMap["frontend.2.weight"],
                                         weightMap["frontend.2.bias"]);
  assert(conv2);
  conv2->setStrideNd(DimsHW{1, 1});
  conv2->setPaddingNd(DimsHW{1, 1});
  auto relu2 =
      network->addActivation(*conv2->getOutput(0), ActivationType::kRELU);
  assert(relu2);
  auto pool1 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kMAX,
                                     DimsHW{2, 2});
  assert(pool1);
  pool1->setStrideNd(DimsHW{2, 2});
  auto conv3 = network->addConvolutionNd(
      *pool1->getOutput(0), 128, DimsHW{3, 3}, weightMap["frontend.5.weight"],
      weightMap["frontend.5.bias"]);
  assert(conv3);
  conv3->setStrideNd(DimsHW{1, 1});

  conv3->setPaddingNd(DimsHW{1, 1});
  auto relu3 =
      network->addActivation(*conv3->getOutput(0), ActivationType::kRELU);
  assert(relu3);

  auto conv4 = network->addConvolutionNd(
      *relu3->getOutput(0), 128, DimsHW{3, 3}, weightMap["frontend.7.weight"],
      weightMap["frontend.7.bias"]);
  assert(conv4);
  conv4->setStrideNd(DimsHW{1, 1});
  conv4->setPaddingNd(DimsHW{1, 1});
  auto relu4 =
      network->addActivation(*conv4->getOutput(0), ActivationType::kRELU);
  assert(relu4);

  auto pool2 = network->addPoolingNd(*relu4->getOutput(0), PoolingType::kMAX,
                                     DimsHW{2, 2});
  assert(pool2);
  pool2->setStrideNd(DimsHW{2, 2});

  auto conv5 = network->addConvolutionNd(
      *pool2->getOutput(0), 256, DimsHW{3, 3}, weightMap["frontend.10.weight"],
      weightMap["frontend.10.bias"]);
  assert(conv5);
  conv5->setStrideNd(DimsHW{1, 1});
  conv5->setPaddingNd(DimsHW{1, 1});
  auto relu5 =
      network->addActivation(*conv5->getOutput(0), ActivationType::kRELU);
  assert(relu5);

  auto conv6 = network->addConvolutionNd(
      *relu5->getOutput(0), 256, DimsHW{3, 3}, weightMap["frontend.12.weight"],
      weightMap["frontend.12.bias"]);
  assert(conv6);
  conv6->setStrideNd(DimsHW{1, 1});
  conv6->setPaddingNd(DimsHW{1, 1});
  auto relu6 =
      network->addActivation(*conv6->getOutput(0), ActivationType::kRELU);
  assert(relu6);
  auto conv7 = network->addConvolutionNd(
      *relu6->getOutput(0), 256, DimsHW{3, 3}, weightMap["frontend.14.weight"],
      weightMap["frontend.14.bias"]);
  assert(conv7);
  conv7->setStrideNd(DimsHW{1, 1});
  conv7->setPaddingNd(DimsHW{1, 1});
  auto relu7 =
      network->addActivation(*conv7->getOutput(0), ActivationType::kRELU);
  assert(relu7);
  auto pool3 = network->addPoolingNd(*relu7->getOutput(0), PoolingType::kMAX,
                                     DimsHW{2, 2});
  assert(pool3);
  pool3->setStrideNd(DimsHW{2, 2});
  auto conv8 = network->addConvolutionNd(
      *pool3->getOutput(0), 512, DimsHW{3, 3}, weightMap["frontend.17.weight"],
      weightMap["frontend.17.bias"]);
  assert(conv8);
  conv8->setStrideNd(DimsHW{1, 1});
  conv8->setPaddingNd(DimsHW{1, 1});
  auto relu8 =
      network->addActivation(*conv8->getOutput(0), ActivationType::kRELU);
  assert(relu8);
  auto conv9 = network->addConvolutionNd(
      *relu8->getOutput(0), 512, DimsHW{3, 3}, weightMap["frontend.19.weight"],
      weightMap["frontend.19.bias"]);
  assert(conv9);
  conv9->setStrideNd(DimsHW{1, 1});
  conv9->setPaddingNd(DimsHW{1, 1});
  auto relu9 =
      network->addActivation(*conv9->getOutput(0), ActivationType::kRELU);
  assert(relu9);
  auto conv10 = network->addConvolutionNd(
      *relu9->getOutput(0), 512, DimsHW{3, 3}, weightMap["frontend.21.weight"],
      weightMap["frontend.21.bias"]);
  assert(conv10);
  conv10->setStrideNd(DimsHW{1, 1});
  conv10->setPaddingNd(DimsHW{1, 1});
  auto relu10 =
      network->addActivation(*conv10->getOutput(0), ActivationType::kRELU);
  assert(relu10);
  // backend
  auto conv11 = network->addConvolutionNd(
      *relu10->getOutput(0), 512, DimsHW{3, 3}, weightMap["backend.0.weight"],
      weightMap["backend.0.bias"]);
  assert(conv11);
  conv11->setPaddingNd(DimsHW{2, 2});
  conv11->setStrideNd(DimsHW{1, 1});
  conv11->setDilationNd(DimsHW{2, 2});
  auto relu11 =
      network->addActivation(*conv11->getOutput(0), ActivationType::kRELU);

  assert(relu11);
  auto conv12 = network->addConvolutionNd(
      *relu11->getOutput(0), 512, DimsHW{3, 3}, weightMap["backend.2.weight"],
      weightMap["backend.2.bias"]);
  assert(conv12);
  conv12->setPaddingNd(DimsHW{2, 2});
  conv12->setStrideNd(DimsHW{1, 1});
  conv12->setDilationNd(DimsHW{2, 2});
  auto relu12 =
      network->addActivation(*conv12->getOutput(0), ActivationType::kRELU);
  assert(relu12);

  auto conv13 = network->addConvolutionNd(
      *relu12->getOutput(0), 512, DimsHW{3, 3}, weightMap["backend.4.weight"],
      weightMap["backend.4.bias"]);
  assert(conv13);
  conv13->setPaddingNd(DimsHW{2, 2});
  conv13->setStrideNd(DimsHW{1, 1});
  conv13->setDilationNd(DimsHW{2, 2});
  auto relu13 =
      network->addActivation(*conv13->getOutput(0), ActivationType::kRELU);
  assert(relu13);

  auto conv14 = network->addConvolutionNd(
      *relu13->getOutput(0), 256, DimsHW{3, 3}, weightMap["backend.6.weight"],
      weightMap["backend.6.bias"]);
  assert(conv14);
  conv14->setPaddingNd(DimsHW{2, 2});
  conv14->setStrideNd(DimsHW{1, 1});
  conv14->setDilationNd(DimsHW{2, 2});
  auto relu14 =
      network->addActivation(*conv14->getOutput(0), ActivationType::kRELU);
  assert(relu14);
  auto conv15 = network->addConvolutionNd(
      *relu14->getOutput(0), 128, DimsHW{3, 3}, weightMap["backend.8.weight"],
      weightMap["backend.8.bias"]);
  assert(conv15);
  conv15->setPaddingNd(DimsHW{2, 2});
  conv15->setStrideNd(DimsHW{1, 1});
  conv15->setDilationNd(DimsHW{2, 2});
  auto relu15 =
      network->addActivation(*conv15->getOutput(0), ActivationType::kRELU);
  assert(relu15);
  auto conv16 = network->addConvolutionNd(
      *relu15->getOutput(0), 64, DimsHW{3, 3}, weightMap["backend.10.weight"],
      weightMap["backend.10.bias"]);
  assert(conv16);
  conv16->setPaddingNd(DimsHW{2, 2});
  conv16->setStrideNd(DimsHW{1, 1});
  conv16->setDilationNd(DimsHW{2, 2});
  auto relu16 =
      network->addActivation(*conv16->getOutput(0), ActivationType::kRELU);

  assert(relu16);

  auto conv17 = network->addConvolutionNd(
      *relu16->getOutput(0), 1, DimsHW{1, 1}, weightMap["output_layer.weight"],
      weightMap["output_layer.bias"]);
  assert(conv17);

  conv17->setStrideNd(DimsHW{1, 1});
  conv17->getOutput(0)->setName(kOutputTensorName);
  network->markOutput(*conv17->getOutput(0));

  IOptimizationProfile *profile = builder->createOptimizationProfile();
  profile->setDimensions(kInputTensorName, OptProfileSelector::kMIN,
                         Dims4(1, 3, MIN_INPUT_SIZE, MIN_INPUT_SIZE));
  profile->setDimensions(kInputTensorName, OptProfileSelector::kOPT,
                         Dims4(1, 3, OPT_INPUT_H, OPT_INPUT_W));
  profile->setDimensions(kInputTensorName, OptProfileSelector::kMAX,
                         Dims4(1, 3, MAX_INPUT_SIZE, MAX_INPUT_SIZE));
  config->addOptimizationProfile(profile);

  builder->setMaxBatchSize(kBatchSize);
  config->setMaxWorkspaceSize(16 << 20);
#ifdef USE_FP16
  config->setFlag(BuilderFlag::kFP16);
#endif
  ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);

  printf("build engine successfully : %s\n", kEngineFile);
  // Don't need the network any more
  network->destroy();

  // Release host memory
  for (auto &mem : weightMap) {
    free((void *)(mem.second.values));
  }

  return engine;
}
// Build the CSRNet engine and serialize it.
//
// maxBatchSize  forwarded to createEngine()
// modelStream   out-parameter; receives the serialized engine. The caller owns
//               the returned IHostMemory and must destroy() it.
//
// Failures are reported through assert(), matching the rest of this file.
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream) {
  assert(modelStream != nullptr);

  // Create builder
  IBuilder *builder = createInferBuilder(gLogger);
  assert(builder != nullptr);
  IBuilderConfig *config = builder->createBuilderConfig();
  assert(config != nullptr);

  // Create model to populate the network, then set the outputs and create an
  // engine
  ICudaEngine *engine =
      createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
  assert(engine != nullptr);

  // Serialize the engine
  (*modelStream) = engine->serialize();

  // Close everything down
  engine->destroy();
  config->destroy();
  builder->destroy();
}

// Append the name of every entry of directory `p_dir_name`, except "." and
// "..", to `file_names`. Names are bare entry names without the directory
// prefix, in the order readdir() reports them.
//
// Returns 0 on success, or -1 when the directory cannot be opened (in which
// case `file_names` is left untouched).
int read_files_in_dir(const char *p_dir_name,
                      std::vector<std::string> &file_names) {
  DIR *dir_handle = opendir(p_dir_name);
  if (!dir_handle) {
    return -1;
  }

  for (struct dirent *entry = readdir(dir_handle); entry != nullptr;
       entry = readdir(dir_handle)) {
    const std::string entry_name(entry->d_name);
    if (entry_name == "." || entry_name == "..") {
      continue;
    }
    file_names.push_back(entry_name);
  }

  closedir(dir_handle);
  return 0;
}

int main(int argc, char **argv) {

  if (argc != 3) {
    std::cerr << "arguments not right!" << std::endl;
    std::cerr << "./csrnet -s  ./csrnet.wts // serialize model to plan file"
              << std::endl;
    std::cerr
        << "./csrnet -d  ../images  // deserialize plan file and run inference"
        << std::endl;
    return -1;
  }
  char *trtModelStream{nullptr};
  size_t size{0};

  if (std::string(argv[1]) == "-s") {
    IHostMemory *modelStream{nullptr};
    kWTSFile = argv[2];
    APIToModel(kBatchSize, &modelStream);
    assert(modelStream != nullptr);

    std::ofstream p(kEngineFile, std::ios::binary);
    if (!p) {
      std::cerr << "could not open plan output file" << std::endl;
      return -1;
    }
    p.write(reinterpret_cast<const char *>(modelStream->data()),
            modelStream->size());
    modelStream->destroy();
    return 1;
  } else if (std::string(argv[1]) == "-d") {
    std::ifstream file(kEngineFile, std::ios::binary);
    if (file.good()) {
      file.seekg(0, file.end);
      size = file.tellg();
      file.seekg(0, file.beg);
      trtModelStream = new char[size];
      assert(trtModelStream);
      file.read(trtModelStream, size);
      file.close();
    }
  } else {
    return -1;
  }
  IRuntime *runtime = createInferRuntime(gLogger);
  assert(runtime != nullptr);
  ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size);
  assert(engine != nullptr);
  IExecutionContext *context = engine->createExecutionContext();
  assert(context != nullptr);
  delete[] trtModelStream;

  std::vector<std::string> file_names;
  if (read_files_in_dir(argv[2], file_names) < 0) {
    std::cout << "read_files_in_dir failed." << std::endl;
    return -1;
  }

  std::vector<float> mean_value{0.406, 0.456, 0.485}; // BGR
  std::vector<float> std_value{0.225, 0.224, 0.229};

  int fcount = 0;

  float *data = new float[kMaxInputImageSize];
  float *prob = new float[kMaxOutputProbSize];

  for (auto f : file_names) {
    fcount++;
    cv::Mat src_img = cv::imread(std::string(argv[2]) + "/" + f);
    if (src_img.empty())
      continue;

    int i = 0;
    for (int row = 0; row < src_img.rows; ++row) {
      uchar *uc_pixel = src_img.data + row * src_img.step;
      for (int col = 0; col < src_img.cols; ++col) {
        data[i] = (uc_pixel[2] / 255.0 - mean_value[2]) / std_value[2];
        data[i + src_img.rows * src_img.cols] =
            (uc_pixel[1] / 255.0 - mean_value[1]) / std_value[1];
        data[i + 2 * src_img.rows * src_img.cols] =
            (uc_pixel[0] / 255.0 - mean_value[0]) / std_value[0];
        uc_pixel += 3;
        ++i;
      }
    }
    // Run inference
    auto start = std::chrono::system_clock::now();
    doInference(*context, data, prob, src_img.rows, src_img.cols);
    auto end = std::chrono::system_clock::now();
    std::cout << "detect time:"
              << std::chrono::duration_cast<std::chrono::milliseconds>(end -
                                                                       start)
                     .count()
              << "ms" << std::endl;
    float num = std::accumulate(
        prob, prob + ((src_img.rows * src_img.cols) >> 6), 0.0f);

    cv::Mat densityMap(src_img.rows >> 3, src_img.cols >> 3, CV_32FC1,
                       (void *)prob);

    cv::Mat densityMapScaled;
    cv::normalize(densityMap, densityMapScaled, 0, 255, cv::NORM_MINMAX,
                  CV_8UC1);
    cv::Mat densityColorMap;
    cv::applyColorMap(densityMapScaled, densityColorMap, cv::COLORMAP_VIRIDIS);

    cv::resize(densityColorMap, densityColorMap, src_img.size());
    cv::addWeighted(densityColorMap, 0.5, src_img, 0.5, 0, src_img);

    // write to jpg
    cv::putText(src_img, std::string("people num: ") + std::to_string(num),
                cv::Point(10, 50), cv::FONT_HERSHEY_SIMPLEX, 0.5,
                cv::Scalar(255, 255, 255), 1);
    std::string write_path = std::string(argv[2]) + "result_" + f;
    std::cout << "people num :" << num << " write_path: " << write_path
              << std::endl;
    cv::imwrite(write_path, src_img);
  }
  delete[] data;
  delete[] prob;

  return 0;
}

================================================
FILE: csrnet/gen_wts.py
================================================
from torch.nn.modules import module
from model import CSRNet
import torch
import os
import struct


# Output directory: <directory of this script>/output/<script name without
# extension>/ ; it is created if it does not exist yet.
save_path = os.path.join(os.path.dirname(
    __file__), "output", os.path.basename(__file__).split('.')[0])
os.makedirs(save_path, exist_ok=True)
wts_file = os.path.join(save_path, "csrnet.wts")


# load model
model_path = "partBmodel_best.pth.tar"
model = CSRNet()
# map_location="cpu" remaps tensors that were saved from a CUDA device, so the
# conversion also works on a machine without a GPU. The values are moved to
# the CPU before being written anyway (see .cpu() below).
checkpoint = torch.load(model_path, map_location="cpu")
model.load_state_dict(checkpoint['state_dict'])


# save to wts
# File layout written below:
#   line 1:        number of entries in the state dict
#   one line each: "<name> <element count>" followed by every element as the
#                  hex form of its big-endian float32 encoding
state_dict = model.state_dict()
print(f'Writing into {wts_file}')
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(state_dict)))
    for k, v in state_dict.items():
        # flatten to 1-D so the values are written in storage order
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

================================================
FILE: csrnet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include "macros.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream,
//! preceded by a timestamp and a fixed prefix, whenever it is synchronized.
//!
//! Output is produced only while the "should log" flag is set; the flag can be
//! changed with setShouldLog().
//!
class LogStreamConsumerBuffer : public std::stringbuf {
public:
  //!
  //! \param stream    Destination stream for the formatted output.
  //! \param prefix    Text inserted between the timestamp and the message.
  //! \param shouldLog Whether putOutput() emits anything.
  //!
  LogStreamConsumerBuffer(std::ostream &stream, const std::string &prefix,
                          bool shouldLog)
      : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

  //!
  //! \brief Move constructor.
  //!
  //! Carries over the destination stream, the prefix and the logging flag.
  //! All three must be initialized here: leaving mShouldLog unset would make
  //! putOutput() read an indeterminate value. Text still pending in \p other
  //! is not transferred.
  //!
  LogStreamConsumerBuffer(LogStreamConsumerBuffer &&other)
      : mOutput(other.mOutput), mPrefix(other.mPrefix),
        mShouldLog(other.mShouldLog) {}

  ~LogStreamConsumerBuffer() {
    // std::streambuf::pbase() gives a pointer to the beginning of the buffered
    // part of the output sequence, std::streambuf::pptr() a pointer to the
    // current position. If they differ there is unflushed text, so emit it.
    if (pbase() != pptr()) {
      putOutput();
    }
  }

  //!
  //! \brief Synchronizes the stream buffer.
  //!
  //! Inserts the buffer contents into the destination stream, resets the
  //! buffer and flushes the stream (all done by putOutput()).
  //!
  //! \return Always 0 (success).
  //!
  int sync() override {
    putOutput();
    return 0;
  }

  //!
  //! \brief Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the
  //! destination stream, then clears the buffer and flushes the stream.
  //!
  //! Does nothing when logging is disabled (the buffered text is kept).
  //!
  void putOutput() {
    if (mShouldLog) {
      // The timestamp is formatted in a private stream so that the fill
      // character set here does not stay active on the shared destination
      // stream, and so that the timestamp ends up on the same stream as the
      // message it belongs to.
      std::time_t timestamp = std::time(nullptr);
      tm *tm_local = std::localtime(&timestamp);
      std::ostringstream stamp;
      if (tm_local != nullptr) {
        stamp << "[";
        stamp << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon
              << "/";
        stamp << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
        stamp << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year
              << "-";
        stamp << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
        stamp << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
        stamp << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
      }
      // std::stringbuf::str() gets the string contents of the buffer
      mOutput << stamp.str() << mPrefix << str();
      // set the buffer to empty
      str("");
      // flush the stream
      mOutput.flush();
    }
  }

  //! \brief Enables or disables output from putOutput().
  void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

private:
  std::ostream &mOutput; //!< destination stream
  std::string mPrefix;   //!< text written before every message
  bool mShouldLog;       //!< whether putOutput() emits anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before
//! std::ostream in LogStreamConsumer
//!
//! The class only owns the buffer; it has no behavior of its own.
//!
class LogStreamConsumerBase {
public:
  //! Constructs the owned buffer, forwarding all three arguments unchanged to
  //! the LogStreamConsumerBuffer constructor.
  LogStreamConsumerBase(std::ostream &stream, const std::string &prefix,
                        bool shouldLog)
      : mBuffer(stream, prefix, shouldLog) {}

protected:
  // Protected so that a derived class can hand its address to std::ostream.
  LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream that lets callers log with the usual C++ `<<` syntax.
//!
//!  The base classes are listed as LogStreamConsumerBase first and
//!  std::ostream second on purpose: the first base owns the
//!  LogStreamConsumerBuffer, and its address is handed to std::ostream. With
//!  the opposite order std::ostream would receive the address of a buffer that
//!  has not been constructed yet. Please do not change the order of the parent
//!  classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
public:
  //! \brief Creates a stream for messages of level \p severity.
  //!  A message is emitted only when \p severity <= \p reportableSeverity.
  LogStreamConsumer(Severity reportableSeverity, Severity severity)
      : LogStreamConsumerBase(severityOstream(severity),
                              severityPrefix(severity),
                              severity <= reportableSeverity),
        std::ostream(&mBuffer), // attach the stream to the buffer built above
        mShouldLog(severity <= reportableSeverity), mSeverity(severity) {}

  //! \brief Move constructor: builds a fresh buffer from the severity and the
  //! logging flag of \p other and attaches this stream to it.
  LogStreamConsumer(LogStreamConsumer &&other)
      : LogStreamConsumerBase(severityOstream(other.mSeverity),
                              severityPrefix(other.mSeverity),
                              other.mShouldLog),
        std::ostream(&mBuffer), // attach the stream to the buffer built above
        mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {}

  //! \brief Re-evaluates whether this stream logs, given a new threshold, and
  //! passes the result on to the buffer.
  void setReportableSeverity(Severity reportableSeverity) {
    const bool enabled = mSeverity <= reportableSeverity;
    mShouldLog = enabled;
    mBuffer.setShouldLog(enabled);
  }

private:
  //! Severities >= kINFO go to std::cout, all others to std::cerr.
  static std::ostream &severityOstream(Severity severity) {
    if (severity >= Severity::kINFO) {
      return std::cout;
    }
    return std::cerr;
  }

  //! Returns the bracketed tag written in front of a message of \p severity.
  static std::string severityPrefix(Severity severity) {
    const char *tag = "";
    switch (severity) {
    case Severity::kINTERNAL_ERROR:
      tag = "[F] ";
      break;
    case Severity::kERROR:
      tag = "[E] ";
      break;
    case Severity::kWARNING:
      tag = "[W] ";
      break;
    case Severity::kINFO:
      tag = "[I] ";
      break;
    case Severity::kVERBOSE:
      tag = "[V] ";
      break;
    default:
      // unknown severity: abort in debug builds, empty tag otherwise
      assert(0);
      break;
    }
    return tag;
  }

  bool mShouldLog;
  Severity mSeverity;
};

//! \class Logger
//!
//! \brief Console logger shared by the TensorRT tools and samples.
//!
//! \details Two kinds of output are produced:
//!
//! - messages with an associated severity (verbose, info, warning, error, or
//!   internal error/fatal), written through log()
//! - test status lines (running / passed / failed / waived), written through
//!   the static reportTest*() helpers
//!
//! Routing all sample output through this class keeps the verbosity and
//! formatting logic in one place.
//!
//! TODO: For backwards compatibility with existing samples, this class inherits
//! directly from the nvinfer1::ILogger interface, which is problematic since
//! there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample. Once all samples obtain the
//! ILogger through Logger::getTRTLogger(), the inheritance can be replaced by
//! an nvinfer1::ILogger member.

class Logger : public nvinfer1::ILogger {
public:
  //! \param severity Initial reporting threshold (kWARNING by default).
  Logger(Severity severity = Severity::kWARNING)
      : mReportableSeverity(severity) {}

  //!
  //! \enum TestResult
  //! \brief State of a test as printed by the reportTest*() helpers
  //!
  enum class TestResult {
    kRUNNING, //!< The test is running
    kPASSED,  //!< The test passed
    kFAILED,  //!< The test failed
    kWAIVED   //!< The test was waived
  };

  //!
  //! \brief Forward-compatible accessor for the nvinfer1::ILogger associated
  //! with this Logger
  //!
  //! \return This object, viewed through its nvinfer1::ILogger base
  //!
  //! TODO Once all samples register the logger through this method, the
  //! inheritance of Logger from ILogger can be removed
  //!
  nvinfer1::ILogger &getTRTLogger() { return *this; }

  //!
  //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
  //!
  //! Writes "[TRT] <msg>" followed by a newline through a LogStreamConsumer
  //! created for \p severity and the current reporting threshold.
  //!
  //! Note samples should not be calling this function directly; it will
  //! eventually go away once the inheritance from nvinfer1::ILogger is removed
  //!
  void log(Severity severity, const char *msg) TRT_NOEXCEPT override {
    LogStreamConsumer consumer(mReportableSeverity, severity);
    consumer << "[TRT] " << std::string(msg) << std::endl;
  }

  //!
  //! \brief Sets the reporting threshold used for subsequent messages
  //!
  //! \param severity New threshold; it is handed to LogStreamConsumer, which
  //! emits a message only when its severity value is <= the threshold.
  //!
  void setReportableSeverity(Severity severity) {
    mReportableSeverity = severity;
  }

  //!
  //! \brief Opaque handle that holds logging information for a particular test
  //!
  //! Instances are created only by Logger::defineTest() (the constructor is
  //! private) and are then passed to Logger::reportTest{Start,End}().
  //!
  class TestAtom {
  public:
    TestAtom(TestAtom &&) = default;

  private:
    friend class Logger;

    TestAtom(bool started, const std::string &name, const std::string &cmdline)
        : mStarted(started), mName(name), mCmdline(cmdline) {}

    bool mStarted;        //!< set once reportTestStart() has run
    std::string mName;    //!< test name printed in status lines
    std::string mCmdline; //!< command line printed in status lines
  };

  //!
  //! \brief Define a test for logging
  //!
  //! \param[in] name The name of the test.  This should be a string starting
  //!                  with "TensorRT" and containing dot-separated strings
  //!                  containing the characters [A-Za-z0-9_]. For example,
  //!                  "TensorRT.sample_googlenet"
  //! \param[in] cmdline The command line used to reproduce the test
  //!
  //! \return a not-yet-started TestAtom that can be used in
  //! Logger::reportTest{Start,End}().
  //!
  static TestAtom defineTest(const std::string &name,
                             const std::string &cmdline) {
    const bool started = false;
    return TestAtom(started, name, cmdline);
  }

  //!
  //! \brief Overload of defineTest() that takes the command line as an
  //! (argc, argv) pair and joins the arguments with single spaces
  //!
  //! \param[in] name The name of the test
  //! \param[in] argc The number of command-line arguments
  //! \param[in] argv The array of command-line arguments (given as C strings)
  //!
  //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
  static TestAtom defineTest(const std::string &name, int argc,
                             char const *const *argv) {
    return defineTest(name, genCmdlineString(argc, argv));
  }

  //!
  //! \brief Report that a test has started.
  //!
  //! Prints the RUNNING line first, then marks the atom as started.
  //!
  //! \pre reportTestStart() has not been called yet for the given testAtom
  //!
  //! \param[in] testAtom The handle to the test that has started
  //!
  static void reportTestStart(TestAtom &testAtom) {
    reportTestResult(testAtom, TestResult::kRUNNING);
    assert(!testAtom.mStarted);
    testAtom.mStarted = true;
  }

  //!
  //! \brief Report that a test has ended.
  //!
  //! \pre reportTestStart() has been called for the given testAtom
  //!
  //! \param[in] testAtom The handle to the test that has ended
  //! \param[in] result The result of the test. Should be one of
  //!                   TestResult::kPASSED, TestResult::kFAILED,
  //!                   TestResult::kWAIVED
  //!
  static void reportTestEnd(const TestAtom &testAtom, TestResult result) {
    assert(result != TestResult::kRUNNING);
    assert(testAtom.mStarted);
    reportTestResult(testAtom, result);
  }

  //! \brief Reports the test as PASSED. \return EXIT_SUCCESS
  static int reportPass(const TestAtom &testAtom) {
    reportTestEnd(testAtom, TestResult::kPASSED);
    return EXIT_SUCCESS;
  }

  //! \brief Reports the test as FAILED. \return EXIT_FAILURE
  static int reportFail(const TestAtom &testAtom) {
    reportTestEnd(testAtom, TestResult::kFAILED);
    return EXIT_FAILURE;
  }

  //! \brief Reports the test as WAIVED. \return EXIT_SUCCESS
  static int reportWaive(const TestAtom &testAtom) {
    reportTestEnd(testAtom, TestResult::kWAIVED);
    return EXIT_SUCCESS;
  }

  //! \brief Reports PASSED when \p pass is true, FAILED otherwise.
  //! \return the exit code of the chosen report function
  static int reportTest(const TestAtom &testAtom, bool pass) {
    if (pass) {
      return reportPass(testAtom);
    }
    return reportFail(testAtom);
  }

  //! \brief Returns the current reporting threshold.
  Severity getReportableSeverity() const { return mReportableSeverity; }

private:
  //!
  //! \brief returns the bracketed tag used to prefix a log message of the
  //! given severity
  //!
  static const char *severityPrefix(Severity severity) {
    switch (severity) {
    case Severity::kINTERNAL_ERROR:
      return "[F] ";
    case Severity::kERROR:
      return "[E] ";
    case Severity::kWARNING:
      return "[W] ";
    case Severity::kINFO:
      return "[I] ";
    case Severity::kVERBOSE:
      return "[V] ";
    default:
      assert(0);
      return "";
    }
  }

  //!
  //! \brief returns the word printed in a status line for the given result
  //!
  static const char *testResultString(TestResult result) {
    switch (result) {
    case TestResult::kRUNNING:
      return "RUNNING";
    case TestResult::kPASSED:
      return "PASSED";
    case TestResult::kFAILED:
      return "FAILED";
    case TestResult::kWAIVED:
      return "WAIVED";
    default:
      assert(0);
      return "";
    }
  }

  //!
  //! \brief returns std::cout for severities >= kINFO and std::cerr otherwise
  //!
  static std::ostream &severityOstream(Severity severity) {
    if (severity >= Severity::kINFO) {
      return std::cout;
    }
    return std::cerr;
  }

  //!
  //! \brief prints one status line:
  //! "&&&& <RESULT> <test name> # <command line>"
  //!
  static void reportTestResult(const TestAtom &testAtom, TestResult result) {
    std::ostream &out = severityOstream(Severity::kINFO);
    out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
        << testAtom.mCmdline << std::endl;
  }

  //!
  //! \brief joins the argc entries of argv into one string, separated by
  //! single spaces
  //!
  static std::string genCmdlineString(int argc, char const *const *argv) {
    std::stringstream joined;
    for (int idx = 0; idx < argc; ++idx) {
      if (idx != 0) {
        joined << " ";
      }
      joined << argv[idx];
    }
    return joined.str();
  }

  Severity mReportableSeverity;
};

namespace {

// Each helper below returns a LogStreamConsumer for one fixed severity, using
// the logger's current reportable severity as the threshold. Typical use:
//
//     LOG_INFO(logger) << "hello world" << std::endl;

//!
//! \brief returns a LogStreamConsumer for messages of severity kVERBOSE
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger &logger) {
  const Severity threshold = logger.getReportableSeverity();
  return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kINFO
//!
inline LogStreamConsumer LOG_INFO(const Logger &logger) {
  const Severity threshold = logger.getReportableSeverity();
  return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kWARNING
//!
inline LogStreamConsumer LOG_WARN(const Logger &logger) {
  const Severity threshold = logger.getReportableSeverity();
  return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kERROR
//!
inline LogStreamConsumer LOG_ERROR(const Logger &logger) {
  const Severity threshold = logger.getReportableSeverity();
  return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief returns a LogStreamConsumer for messages of severity
//! kINTERNAL_ERROR ("fatal" severity)
//!
inline LogStreamConsumer LOG_FATAL(const Logger &logger) {
  const Severity threshold = logger.getReportableSeverity();
  return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: csrnet/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

// Version-dependent keywords, selected on NV_TENSORRT_MAJOR:
//   TRT_NOEXCEPT      -> `noexcept` when NV_TENSORRT_MAJOR >= 8, else empty
//   TRT_CONST_ENQUEUE -> `const`    when NV_TENSORRT_MAJOR >= 8, else empty
//
// NOTE(review): NV_TENSORRT_MAJOR is not defined in this header; presumably a
// TensorRT header must be included before this one - confirm. An identifier
// that is undefined inside #if evaluates to 0, which silently selects the
// empty definitions.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: dbnet/CMakeLists.txt
================================================
# CMake 4.x refuses to configure a project that requests compatibility with a
# version older than 3.5, so the former minimum of 2.6 no longer works there.
cmake_minimum_required(VERSION 3.10)

project(dbnet)

add_definitions(-std=c++11)

# option() takes (<variable> "<help text>" [value]). Without a help string the
# word OFF was parsed as the help text; the default value (OFF) is unchanged.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)

# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# every source file in this directory is compiled into the executable
aux_source_directory(. DIRSRCS)

# clipper
include_directories(./ ./clipper)
add_subdirectory(clipper)

add_executable(dbnet ${DIRSRCS})
target_link_libraries(dbnet clipper)
target_link_libraries(dbnet nvinfer)
target_link_libraries(dbnet cudart)
target_link_libraries(dbnet ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: dbnet/README.md
================================================
# DBNet

The PyTorch implementation is [DBNet](https://github.com/BaofengZan/DBNet.pytorch).

<p align="center">
<img src="https://user-images.githubusercontent.com/25873202/113959270-1eb8c600-9855-11eb-9c4d-1e6dc8e38a17.jpg">
</p>


## How to Run

* 1. generate `.wts`

  Download the code and model from [DBNet](https://github.com/BaofengZan/DBNet.pytorch) and configure your environment.

  Open `tools/predict.py`, set `--save_wts` to `True`, and run it; `DBNet.wts` will be generated.

  An ONNX model can also be exported by setting `--onnx` to `True`.

* 2. cmake and make

  ```
  mkdir build
  cd build
  cmake ..
  make
  cp /your_wts_path/DBNet.wts .
  sudo ./dbnet -s             // serialize model to plan file i.e. 'DBNet.engine'
  sudo ./dbnet -d  ./test_imgs // deserialize plan file and run inference, all images in test_imgs folder will be processed.
  ```


## For windows

https://github.com/BaofengZan/DBNet-TensorRT


## Todo

- [x] 1. In `common.hpp`, the following two functions can be merged.

     ```c++
     ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname, bool bias = true) 
     ```

     ```c++
     ILayer* convBnLeaky2(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname, bool bias = true)
     ```

- [x] 2. The postprocess method here should be optimized, which is a little different from pytorch side.

- [x] 3. The input image here is resized to `640 x 640` directly, while the pytorch side is using `letterbox` method.


================================================
FILE: dbnet/clipper/CMakeLists.txt
================================================
# CMake 4.x refuses to configure a project that requests compatibility with a
# version older than 3.5, so the former minimum of 2.6 no longer works there.
cmake_minimum_required(VERSION 3.10)

# Build every source file in this directory into the `clipper` library.
aux_source_directory(. DIR_CLIPPER_SRCS)
add_library(clipper ${DIR_CLIPPER_SRCS})

================================================
FILE: dbnet/clipper/clipper.cpp
================================================
/*******************************************************************************
*                                                                              *
* Author    :  Angus Johnson                                                   *
* Version   :  6.4.2                                                           *
* Date      :  27 February 2017                                                *
* Website   :  http://www.angusj.com                                           *
* Copyright :  Angus Johnson 2010-2017                                         *
*                                                                              *
* License:                                                                     *
* Use, modification & distribution is subject to Boost Software License Ver 1. *
* http://www.boost.org/LICENSE_1_0.txt                                         *
*                                                                              *
* Attributions:                                                                *
* The code in this library is an extension of Bala Vatti's clipping algorithm: *
* "A generic solution to polygon clipping"                                     *
* Communications of the ACM, Vol 35, Issue 7 (July 1992) pp 56-63.             *
* http://portal.acm.org/citation.cfm?id=129906                                 *
*                                                                              *
* Computer graphics and geometric modeling: implementation and algorithms      *
* By Max K. Agoston                                                            *
* Springer; 1 edition (January 4, 2005)                                        *
* http://books.google.com/books?q=vatti+clipping+agoston                       *
*                                                                              *
* See also:                                                                    *
* "Polygon Offsetting by Computing Winding Numbers"                            *
* Paper no. DETC2005-85513 pp. 565-575                                         *
* ASME 2005 International Design Engineering Technical Conferences             *
* and Computers and Information in Engineering Conference (IDETC/CIE2005)      *
* September 24-28, 2005 , Long Beach, California, USA                          *
* http://www.me.berkeley.edu/~mcmains/pubs/DAC05OffsetPolygon.pdf              *
*                                                                              *
*******************************************************************************/

/*******************************************************************************
*                                                                              *
* This is a translation of the Delphi Clipper library and the naming style     *
* used has retained a Delphi flavour.                                          *
*                                                                              *
*******************************************************************************/

#include "clipper.hpp"
#include <cmath>
#include <vector>
#include <algorithm>
#include <stdexcept>
#include <cstring>
#include <cstdlib>
#include <ostream>
#include <functional>

namespace ClipperLib {

// Mathematical constants.
static double const pi = 3.141592653589793238;
static double const two_pi = pi *2;
// Default arc tolerance; presumably used by the offsetting code later in this
// file - confirm.
static double const def_arc_tolerance = 0.25;

// The two horizontal travel directions.
enum Direction { dRightToLeft, dLeftToRight };

// Special negative index values (see the trailing comment on each line).
static int const Unassigned = -1;  //edge not currently 'owning' a solution
static int const Skip = -2;        //edge that would otherwise close a path

// Sentinel value; presumably stored as the slope (Dx) of horizontal edges -
// confirm against its uses.
#define HORIZONTAL (-1.0E+40)
// NEAR_ZERO(val) is true when val lies strictly between -TOLERANCE and
// +TOLERANCE.
#define TOLERANCE (1.0e-20)
#define NEAR_ZERO(val) (((val) > -TOLERANCE) && ((val) < TOLERANCE))

// Edge record. Besides its three points and winding bookkeeping, every edge
// carries link pointers for several lists (Next/Prev, NextInLML, the AEL pair
// and the SEL pair).
// NOTE(review): LML/AEL/SEL presumably stand for "local minima list", "active
// edge list" and "sorted edge list" - confirm against the rest of the file.
struct TEdge {
  IntPoint Bot;
  IntPoint Curr; //current (updated for every new scanbeam)
  IntPoint Top;
  double Dx;
  PolyType PolyTyp;
  EdgeSide Side; //side only refers to current side of solution poly
  int WindDelta; //1 or -1 depending on winding direction
  int WindCnt;
  int WindCnt2; //winding count of the opposite polytype
  int OutIdx;
  TEdge *Next;
  TEdge *Prev;
  TEdge *NextInLML;
  TEdge *NextInAEL;
  TEdge *PrevInAEL;
  TEdge *NextInSEL;
  TEdge *PrevInSEL;
};

// Pairs two edges with a point; presumably the point where the two edges
// intersect - confirm against the code that fills it in.
struct IntersectNode {
  TEdge          *Edge1;
  TEdge          *Edge2;
  IntPoint        Pt;
};

// A Y coordinate together with the two edge bounds (left and right) attached
// to it. LocMinSorter orders these records by Y.
struct LocalMinimum {
  cInt          Y;
  TEdge        *LeftBound;
  TEdge        *RightBound;
};

struct OutPt;

//OutRec: contains a path in the clipping solution. Edges in the AEL will
//carry a pointer to an OutRec when they are part of the clipping solution.
//The path itself is held as OutPt records reachable through Pts.
struct OutRec {
  int       Idx;
  bool      IsHole;
  bool      IsOpen;
  OutRec   *FirstLeft;  //see comments in clipper.pas
  PolyNode *PolyNd;
  OutPt    *Pts;
  OutPt    *BottomPt;
};

// One point of an output path, linked to its neighbours in both directions.
// The Area() and PointIsVertex() helpers in this file walk Next until they
// return to the starting node, i.e. they treat the list as circular.
struct OutPt {
  int       Idx;
  IntPoint  Pt;
  OutPt    *Next;
  OutPt    *Prev;
};

// Two output points plus one extra point (OffPt).
// NOTE(review): presumably records a pending join between two output paths -
// confirm against the code that creates Join records.
struct Join {
  OutPt    *OutPt1;
  OutPt    *OutPt2;
  IntPoint  OffPt;
};

// Comparison functor that orders LocalMinimum records by descending Y: the
// first argument sorts before the second when its Y is greater.
struct LocMinSorter
{
  // const-qualified so the functor can also be invoked through a const
  // object or reference, as standard-library comparators may be.
  inline bool operator()(const LocalMinimum& locMin1, const LocalMinimum& locMin2) const
  {
    return locMin2.Y < locMin1.Y;
  }
};

//------------------------------------------------------------------------------
//------------------------------------------------------------------------------

// Rounds val to the nearest integer, halves away from zero: shift by 0.5 in
// the direction of val's sign, then truncate.
inline cInt Round(double val)
{
  const double bias = (val < 0) ? -0.5 : 0.5;
  return static_cast<cInt>(val + bias);
}
//------------------------------------------------------------------------------

// Returns the absolute value of val.
inline cInt Abs(cInt val)
{
  if (val < 0) return -val;
  return val;
}

//------------------------------------------------------------------------------
// PolyTree methods ...
//------------------------------------------------------------------------------

void PolyTree::Clear()
{
    for (PolyNodes::size_type i = 0; i < AllNodes.size(); ++i)
      delete AllNodes[i];
    AllNodes.resize(0); 
    Childs.resize(0);
}
//------------------------------------------------------------------------------

// Returns the first top-level child, or a null pointer when there is none.
PolyNode* PolyTree::GetFirst() const
{
  if (Childs.empty())
      return 0;
  return Childs[0];
}
//------------------------------------------------------------------------------

// Returns the number of nodes in AllNodes, minus one when the first stored
// node is not the first top-level child.
int PolyTree::Total() const
{
  const int nodeCount = (int)AllNodes.size();
  //with negative offsets, ignore the hidden outer polygon ...
  const bool hiddenOuter = nodeCount > 0 && Childs[0] != AllNodes[0];
  return hiddenOuter ? nodeCount - 1 : nodeCount;
}

//------------------------------------------------------------------------------
// PolyNode methods ...
//------------------------------------------------------------------------------

// Creates a node with no parent (Parent = 0), Index 0 and m_IsOpen false.
PolyNode::PolyNode(): Parent(0), Index(0), m_IsOpen(false)
{
}
//------------------------------------------------------------------------------

// Returns the number of direct children of this node.
int PolyNode::ChildCount() const
{
  return static_cast<int>(Childs.size());
}
//------------------------------------------------------------------------------

// Appends child to this node's children and records, on the child, its
// parent and its position in the parent's child list.
void PolyNode::AddChild(PolyNode& child)
{
  const unsigned slot = (unsigned)Childs.size();
  Childs.push_back(&child);
  child.Index = slot;
  child.Parent = this;
}
//------------------------------------------------------------------------------

// Returns the next node in traversal order: the first child when there is
// one, otherwise whatever GetNextSiblingUp() yields.
PolyNode* PolyNode::GetNext() const
{
  if (Childs.empty())
      return GetNextSiblingUp();
  return Childs[0];
}
//------------------------------------------------------------------------------

// Returns the sibling that follows this node. When this node is its parent's
// last child the search continues from the parent, and so on upwards; a null
// pointer is returned once a node without a parent is reached.
PolyNode* PolyNode::GetNextSiblingUp() const
{
  const PolyNode *node = this;
  while (node->Parent) //protects against PolyTree.GetNextSiblingUp()
  {
      const PolyNode *up = node->Parent;
      if (node->Index != up->Childs.size() - 1)
          return up->Childs[node->Index + 1];
      node = up;
  }
  return 0;
}
//------------------------------------------------------------------------------

bool PolyNode::IsHole() const
{ 
  bool result = true;
  PolyNode* node = Parent;
  while (node)
  {
      result = !result;
      node = node->Parent;
  }
  return result;
}  
//------------------------------------------------------------------------------

// Returns the node's stored m_IsOpen flag.
bool PolyNode::IsOpen() const
{ 
  return m_IsOpen;
}  
//------------------------------------------------------------------------------

#ifndef use_int32

//------------------------------------------------------------------------------
// Int128 class (enables safe math on signed 64bit integers)
// eg Int128 val1((long64)9223372036854775807); //ie 2^63 -1
//    Int128 val2((long64)9223372036854775807);
//    Int128 val3 = val1 * val2;
//    val3.AsString => "85070591730234615847396907784232501249" (8.5e+37)
//------------------------------------------------------------------------------

// Signed 128-bit integer stored in two's complement as an unsigned low word
// (lo) and a signed high word (hi). Provides comparison, addition,
// subtraction, negation and conversion to double.
class Int128
{
  public:
    ulong64 lo;
    long64 hi;

    // Sign-extends a 64-bit value: hi becomes -1 (all ones) for negative
    // input and 0 otherwise.
    Int128(long64 _lo = 0)
    {
      lo = (ulong64)_lo;   
      if (_lo < 0)  hi = -1; else hi = 0; 
    }


    Int128(const Int128 &val): lo(val.lo), hi(val.hi){}

    // Builds a value directly from its high and low words.
    Int128(const long64& _hi, const ulong64& _lo): lo(_lo), hi(_hi){}
    
    // Assigns a sign-extended 64-bit value.
    Int128& operator = (const long64 &val)
    {
      lo = (ulong64)val;
      if (val < 0) hi = -1; else hi = 0;
      return *this;
    }

    bool operator == (const Int128 &val) const
      {return (hi == val.hi && lo == val.lo);}

    bool operator != (const Int128 &val) const
      { return !(*this == val);}

    // Ordering compares the signed high words first and falls back to the
    // unsigned low words when they are equal.
    bool operator > (const Int128 &val) const
    {
      if (hi != val.hi)
        return hi > val.hi;
      else
        return lo > val.lo;
    }

    bool operator < (const Int128 &val) const
    {
      if (hi != val.hi)
        return hi < val.hi;
      else
        return lo < val.lo;
    }

    bool operator >= (const Int128 &val) const
      { return !(*this < val);}

    bool operator <= (const Int128 &val) const
      { return !(*this > val);}

    Int128& operator += (const Int128 &rhs)
    {
      hi += rhs.hi;
      lo += rhs.lo;
      //unsigned wrap-around of the low word means a carry into the high word
      if (lo < rhs.lo) hi++;
      return *this;
    }

    Int128 operator + (const Int128 &rhs) const
    {
      Int128 result(*this);
      result+= rhs;
      return result;
    }

    Int128& operator -= (const Int128 &rhs)
    {
      *this += -rhs;
      return *this;
    }

    Int128 operator - (const Int128 &rhs) const
    {
      Int128 result(*this);
      result -= rhs;
      return result;
    }

    Int128 operator-() const //unary negation
    {
      //two's complement: invert all bits and add one; when lo is zero the
      //+1 carries straight into the high word, which gives -hi
      if (lo == 0)
        return Int128(-hi, 0);
      else
        return Int128(~hi, ~lo + 1);
    }

    // Converts to the nearest representable double.
    operator double() const
    {
      const double shift64 = 18446744073709551616.0; //2^64
      if (hi < 0)
      {
        if (lo == 0) return (double)hi * shift64;
        //The magnitude of a negative value is (~hi : ~lo) + 1. lo is
        //non-zero here, so ~lo + 1 cannot wrap. Omitting the +1 would turn
        //e.g. Int128(-1) into -0.0 instead of -1.0.
        else return -((double)(~lo + 1) + (double)(~hi) * shift64);
      }
      else
        return (double)(lo + hi * shift64);
    }

};
//------------------------------------------------------------------------------

// Multiplies two signed 64-bit integers and returns the exact 128-bit
// product. The magnitudes are split into 32-bit halves, multiplied as
// unsigned values, and the sign is applied at the end.
Int128 Int128Mul (long64 lhs, long64 rhs)
{
  bool negate = (lhs < 0) != (rhs < 0);

  //Take the magnitudes in unsigned arithmetic: 0 - (ulong64)x is well defined
  //for every x, whereas -x overflows for the most negative long64.
  ulong64 lhsMag = (lhs < 0) ? (ulong64)0 - (ulong64)lhs : (ulong64)lhs;
  ulong64 int1Hi = lhsMag >> 32;
  ulong64 int1Lo = lhsMag & 0xFFFFFFFF;

  ulong64 rhsMag = (rhs < 0) ? (ulong64)0 - (ulong64)rhs : (ulong64)rhs;
  ulong64 int2Hi = rhsMag >> 32;
  ulong64 int2Lo = rhsMag & 0xFFFFFFFF;

  //nb: see comments in clipper.pas
  ulong64 a = int1Hi * int2Hi;                    //contributes to bits 64..127
  ulong64 b = int1Lo * int2Lo;                    //contributes to bits 0..63
  ulong64 c = int1Hi * int2Lo + int1Lo * int2Hi;  //cross terms, weight 2^32

  Int128 tmp;
  tmp.hi = long64(a + (c >> 32));
  tmp.lo = c << 32;
  tmp.lo += b;
  //unsigned wrap-around of the low word means a carry into the high word
  if (tmp.lo < b) tmp.hi++;
  if (negate) tmp = -tmp;
  return tmp;
}
#endif

//------------------------------------------------------------------------------
// Miscellaneous global functions
//------------------------------------------------------------------------------

bool Orientation(const Path &poly)
{
  //true when the signed area of poly is zero or positive
  const double signedArea = Area(poly);
  return signedArea >= 0;
}
//------------------------------------------------------------------------------

//Signed area of a closed path; paths with fewer than 3 vertices yield 0.
double Area(const Path &poly)
{
  const int count = (int)poly.size();
  if (count < 3) return 0;

  double sum = 0;
  int prev = count - 1; //the first edge runs from the last vertex to vertex 0
  for (int cur = 0; cur < count; prev = cur++)
  {
    const double xSum = (double)poly[prev].X + poly[cur].X;
    const double yDiff = (double)poly[prev].Y - poly[cur].Y;
    sum += xSum * yDiff;
  }
  return -sum * 0.5;
}
//------------------------------------------------------------------------------

double Area(const OutPt *op)
{
  const OutPt *startOp = op;
  if (!op) return 0;
  double a = 0;
  do {
    a +=  (double)(op->Prev->Pt.X + op->Pt.X) * (double)(op->Prev->Pt.Y - op->Pt.Y);
    op = op->Next;
  } while (op != startOp);
  return a * 0.5;
}
//------------------------------------------------------------------------------

double Area(const OutRec &outRec)
{
  return Area(outRec.Pts);
}
//------------------------------------------------------------------------------

bool PointIsVertex(const IntPoint &Pt, OutPt *pp)
{
  OutPt *pp2 = pp;
  do
  {
    if (pp2->Pt == Pt) return true;
    pp2 = pp2->Next;
  }
  while (pp2 != pp);
  return false;
}
//------------------------------------------------------------------------------

//See "The Point in Polygon Problem for Arbitrary Polygons" by Hormann & Agathos
//http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.88.5498&rep=rep1&type=pdf
int PointInPolygon(const IntPoint &pt, const Path &path)
{
  //Classifies pt against the closed polygon 'path' by visiting every edge
  //(ip -> ipNext) and toggling 'result' between 0 and 1 for each edge that
  //is counted as a crossing.
  //returns 0 if false, +1 if true, -1 if pt ON polygon boundary
  int result = 0;
  size_t cnt = path.size();
  if (cnt < 3) return 0; //paths with fewer than 3 vertices always give 0
  IntPoint ip = path[0];
  for(size_t i = 1; i <= cnt; ++i)
  {
    //i == cnt wraps to the first vertex so the closing edge is visited too
    IntPoint ipNext = (i == cnt ? path[0] : path[i]);
    if (ipNext.Y == pt.Y)
    {
        //pt coincides with ipNext, or lies on a horizontal edge at pt.Y
        if ((ipNext.X == pt.X) || (ip.Y == pt.Y && 
          ((ipNext.X > pt.X) == (ip.X < pt.X)))) return -1;
    }
    //only edges with exactly one end point having Y < pt.Y are candidates
    if ((ip.Y < pt.Y) != (ipNext.Y < pt.Y))
    {
      if (ip.X >= pt.X)
      {
        //neither end point is left of pt: counted as a crossing
        if (ipNext.X > pt.X) result = 1 - result;
        else
        {
          //d is the cross product of (ip - pt) and (ipNext - pt);
          //zero means pt is collinear with the edge's end points
          double d = (double)(ip.X - pt.X) * (ipNext.Y - pt.Y) - 
            (double)(ipNext.X - pt.X) * (ip.Y - pt.Y);
          if (!d) return -1;
          if ((d > 0) == (ipNext.Y > ip.Y)) result = 1 - result;
        }
      } else
      {
        //ip is left of pt: the edge can only count when ipNext is right of pt
        if (ipNext.X > pt.X)
        {
          double d = (double)(ip.X - pt.X) * (ipNext.Y - pt.Y) - 
            (double)(ipNext.X - pt.X) * (ip.Y - pt.Y);
          if (!d) return -1;
          if ((d > 0) == (ipNext.Y > ip.Y)) result = 1 - result;
        }
      }
    }
    ip = ipNext;
  } 
  return result;
}
//------------------------------------------------------------------------------

int PointInPolygon (const IntPoint &pt, OutPt *op)
{
  //Same classification as the Path overload, but the polygon is the circular
  //OutPt list starting at op; each edge runs from op->Pt to op->Next->Pt.
  //nb: op is dereferenced without a null check.
  //returns 0 if false, +1 if true, -1 if pt ON polygon boundary
  int result = 0;
  OutPt* startOp = op;
  for(;;)
  {
    if (op->Next->Pt.Y == pt.Y)
    {
        //pt coincides with the edge's end vertex, or lies on a horizontal
        //edge at pt.Y
        if ((op->Next->Pt.X == pt.X) || (op->Pt.Y == pt.Y && 
          ((op->Next->Pt.X > pt.X) == (op->Pt.X < pt.X)))) return -1;
    }
    //only edges with exactly one end point having Y < pt.Y are candidates
    if ((op->Pt.Y < pt.Y) != (op->Next->Pt.Y < pt.Y))
    {
      if (op->Pt.X >= pt.X)
      {
        //neither end point is left of pt: counted as a crossing
        if (op->Next->Pt.X > pt.X) result = 1 - result;
        else
        {
          //d is the cross product of (op->Pt - pt) and (op->Next->Pt - pt);
          //zero means pt is collinear with the edge's end points
          double d = (double)(op->Pt.X - pt.X) * (op->Next->Pt.Y - pt.Y) - 
            (double)(op->Next->Pt.X - pt.X) * (op->Pt.Y - pt.Y);
          if (!d) return -1;
          if ((d > 0) == (op->Next->Pt.Y > op->Pt.Y)) result = 1 - result;
        }
      } else
      {
        if (op->Next->Pt.X > pt.X)
        {
          double d = (double)(op->Pt.X - pt.X) * (op->Next->Pt.Y - pt.Y) - 
            (double)(op->Next->Pt.X - pt.X) * (op->Pt.Y - pt.Y);
          if (!d) return -1;
          if ((d > 0) == (op->Next->Pt.Y > op->Pt.Y)) result = 1 - result;
        }
      }
    } 
    op = op->Next;
    if (startOp == op) break; //every edge of the ring has been visited
  } 
  return result;
}
//------------------------------------------------------------------------------

//Decides containment from the first vertex of polygon 1 that is not on the
//boundary of polygon 2; when every vertex lies on the boundary the result
//is true.
bool Poly2ContainsPoly1(OutPt *OutPt1, OutPt *OutPt2)
{
  OutPt* cur = OutPt1;
  do
  {
    //PointInPolygon: 0 = outside, +1 = inside, -1 = on the boundary
    const int where = PointInPolygon(cur->Pt, OutPt2);
    if (where > 0) return true;
    if (where == 0) return false;
    cur = cur->Next;
  }
  while (cur != OutPt1);
  return true;
}
//----------------------------------------------------------------------

bool SlopesEqual(const TEdge &e1, const TEdge &e2, bool UseFullInt64Range)
{
#ifndef use_int32
  if (UseFullInt64Range)
    return Int128Mul(e1.Top.Y - e1.Bot.Y, e2.Top.X - e2.Bot.X) == 
    Int128Mul(e1.Top.X - e1.Bot.X, e2.Top.Y - e2.Bot.Y);
  else 
#endif
    return (e1.Top.Y - e1.Bot.Y) * (e2.Top.X - e2.Bot.X) == 
    (e1.Top.X - e1.Bot.X) * (e2.Top.Y - e2.Bot.Y);
}
//------------------------------------------------------------------------------

bool SlopesEqual(const IntPoint pt1, const IntPoint pt2,
  const IntPoint pt3, bool UseFullInt64Range)
{
#ifndef use_int32
  if (UseFullInt64Range)
    return Int128Mul(pt1.Y-pt2.Y, pt2.X-pt3.X) == Int128Mul(pt1.X-pt2.X, pt2.Y-pt3.Y);
  else 
#endif
    return (pt1.Y-pt2.Y)*(pt2.X-pt3.X) == (pt1.X-pt2.X)*(pt2.Y-pt3.Y);
}
//------------------------------------------------------------------------------

bool SlopesEqual(const IntPoint pt1, const IntPoint pt2,
  const IntPoint pt3, const IntPoint pt4, bool UseFullInt64Range)
{
#ifndef use_int32
  if (UseFullInt64Range)
    return Int128Mul(pt1.Y-pt2.Y, pt3.X-pt4.X) == Int128Mul(pt1.X-pt2.X, pt3.Y-pt4.Y);
  else 
#endif
    return (pt1.Y-pt2.Y)*(pt3.X-pt4.X) == (pt1.X-pt2.X)*(pt3.Y-pt4.Y);
}
//------------------------------------------------------------------------------

//An edge is horizontal when its Dx holds the HORIZONTAL sentinel.
inline bool IsHorizontal(TEdge &e)
{
  const bool flagged = (e.Dx == HORIZONTAL);
  return flagged;
}
//------------------------------------------------------------------------------

//Change in X per unit change in Y from pt1 to pt2, or the HORIZONTAL
//sentinel when both points share the same Y.
inline double GetDx(const IntPoint pt1, const IntPoint pt2)
{
  if (pt1.Y == pt2.Y) return HORIZONTAL;
  return (double)(pt2.X - pt1.X) / (pt2.Y - pt1.Y);
}
//---------------------------------------------------------------------------

//Stores the edge's X-per-Y slope in e.Dx (HORIZONTAL when Top and Bot share
//the same Y).
inline void SetDx(TEdge &e)
{
  const cInt dy = e.Top.Y - e.Bot.Y;
  if (dy != 0)
    e.Dx = (double)(e.Top.X - e.Bot.X) / dy;
  else
    e.Dx = HORIZONTAL;
}
//---------------------------------------------------------------------------

//Exchanges the Side values of two edges.
inline void SwapSides(TEdge &Edge1, TEdge &Edge2)
{
  std::swap(Edge1.Side, Edge2.Side);
}
//------------------------------------------------------------------------------

//Exchanges the OutIdx values of two edges.
inline void SwapPolyIndexes(TEdge &Edge1, TEdge &Edge2)
{
  std::swap(Edge1.OutIdx, Edge2.OutIdx);
}
//------------------------------------------------------------------------------

//X coordinate of the edge at height currentY. The exact Top.X is returned
//at the top; elsewhere X is extrapolated from Bot using Dx and rounded.
inline cInt TopX(TEdge &edge, const cInt currentY)
{
  if (currentY == edge.Top.Y) return edge.Top.X;
  return edge.Bot.X + Round(edge.Dx *(currentY - edge.Bot.Y));
}
//------------------------------------------------------------------------------

//Computes into ip the (rounded) point where Edge1 and Edge2 intersect,
//using each edge's Bot point and Dx (X change per unit Y, see SetDx).
//The result is then clamped so that ip.Y is not above either edge's Top.Y
//and not below Edge1.Curr.Y.
void IntersectPoint(TEdge &Edge1, TEdge &Edge2, IntPoint &ip)
{
#ifdef use_xyz  
  ip.Z = 0;
#endif

  double b1, b2;
  if (Edge1.Dx == Edge2.Dx)
  {
    //equal slopes: fall back to Edge1's position at its current Y
    ip.Y = Edge1.Curr.Y;
    ip.X = TopX(Edge1, ip.Y);
    return;
  }
  else if (Edge1.Dx == 0)
  {
    //Edge1 has no X change (vertical): X is fixed, solve Edge2 for Y
    ip.X = Edge1.Bot.X;
    if (IsHorizontal(Edge2))
      ip.Y = Edge2.Bot.Y;
    else
    {
      b2 = Edge2.Bot.Y - (Edge2.Bot.X / Edge2.Dx);
      ip.Y = Round(ip.X / Edge2.Dx + b2);
    }
  }
  else if (Edge2.Dx == 0)
  {
    //Edge2 has no X change (vertical): X is fixed, solve Edge1 for Y
    ip.X = Edge2.Bot.X;
    if (IsHorizontal(Edge1))
      ip.Y = Edge1.Bot.Y;
    else
    {
      b1 = Edge1.Bot.Y - (Edge1.Bot.X / Edge1.Dx);
      ip.Y = Round(ip.X / Edge1.Dx + b1);
    }
  } 
  else 
  {
    //general case: each edge is X = Dx * Y + b; equate the two to get Y (q)
    b1 = Edge1.Bot.X - Edge1.Bot.Y * Edge1.Dx;
    b2 = Edge2.Bot.X - Edge2.Bot.Y * Edge2.Dx;
    double q = (b2-b1) / (Edge1.Dx - Edge2.Dx);
    ip.Y = Round(q);
    //derive X from the edge with the smaller |Dx|
    if (std::fabs(Edge1.Dx) < std::fabs(Edge2.Dx))
      ip.X = Round(Edge1.Dx * q + b1);
    else 
      ip.X = Round(Edge2.Dx * q + b2);
  }

  //don't allow 'ip' to have a smaller Y than either edge's Top.Y: clamp to
  //the larger Top.Y and recompute X from the edge with the smaller |Dx|
  if (ip.Y < Edge1.Top.Y || ip.Y < Edge2.Top.Y) 
  {
    if (Edge1.Top.Y > Edge2.Top.Y)
      ip.Y = Edge1.Top.Y;
    else
      ip.Y = Edge2.Top.Y;
    if (std::fabs(Edge1.Dx) < std::fabs(Edge2.Dx))
      ip.X = TopX(Edge1, ip.Y);
    else
      ip.X = TopX(Edge2, ip.Y);
  } 
  //finally, don't allow 'ip' to be BELOW curr.Y (ie bottom of scanbeam) ...
  if (ip.Y > Edge1.Curr.Y)
  {
    ip.Y = Edge1.Curr.Y;
    //use the more vertical edge to derive X ...
    if (std::fabs(Edge1.Dx) > std::fabs(Edge2.Dx))
      ip.X = TopX(Edge2, ip.Y); else
      ip.X = TopX(Edge1, ip.Y);
  }
}
//------------------------------------------------------------------------------

//Reverses the direction of a circular OutPt list by exchanging the Next and
//Prev links of every node. A null list is left untouched.
void ReversePolyPtLinks(OutPt *pp)
{
  if (!pp) return;
  OutPt *cur = pp;
  do {
    OutPt *oldNext = cur->Next;
    cur->Next = cur->Prev;
    cur->Prev = oldNext;
    cur = oldNext; //keep walking in the original forward direction
  } while (cur != pp);
}
//------------------------------------------------------------------------------

//Deletes every node of the circular list pp belongs to; pp is null on return.
void DisposeOutPts(OutPt*& pp)
{
  if (pp == 0) return;
  //break the ring so the walk below terminates on a null Next
  pp->Prev->Next = 0;
  do
  {
    OutPt *doomed = pp;
    pp = doomed->Next;
    delete doomed;
  }
  while (pp);
}
//------------------------------------------------------------------------------

//First-stage edge setup: zero the whole edge, then record its ring
//neighbours, its current point and an unassigned output index.
inline void InitEdge(TEdge* e, TEdge* eNext, TEdge* ePrev, const IntPoint& Pt)
{
  std::memset(e, 0, sizeof(*e));
  e->Curr = Pt;
  e->Prev = ePrev;
  e->Next = eNext;
  e->OutIdx = Unassigned;
}
//------------------------------------------------------------------------------

//Second-stage edge setup: of the edge's own point and its successor's point,
//the one with the larger (or equal) Y becomes Bot and the other Top; then
//the slope and polygon type are stored.
void InitEdge2(TEdge& e, PolyType Pt)
{
  const IntPoint &nextPt = e.Next->Curr;
  const bool currIsBottom = (e.Curr.Y >= nextPt.Y);
  e.Bot = currIsBottom ? e.Curr : nextPt;
  e.Top = currIsBottom ? nextPt : e.Curr;
  SetDx(e);
  e.PolyTyp = Pt;
}
//------------------------------------------------------------------------------

//Unlinks e from its doubly linked ring (memory is not released) and returns
//the edge that followed it.
TEdge* RemoveEdge(TEdge* e)
{
  TEdge* const before = e->Prev;
  TEdge* const after = e->Next;
  before->Next = after;
  after->Prev = before;
  e->Prev = 0; //flag as removed (see ClipperBase.Clear)
  return after;
}
//------------------------------------------------------------------------------

//Exchanges the X (and, with use_xyz, Z) values of a horizontal edge's Top
//and Bot so the edge follows the natural progression of its bound, ie so
//that its Bot aligns with the adjoining lower edge.
inline void ReverseHorizontal(TEdge &e)
{
  const cInt oldTopX = e.Top.X;
  e.Top.X = e.Bot.X;
  e.Bot.X = oldTopX;
#ifdef use_xyz  
  std::swap(e.Top.Z, e.Bot.Z);
#endif
}
//------------------------------------------------------------------------------

//Exchanges the contents of two points.
void SwapPoints(IntPoint &pt1, IntPoint &pt2)
{
  std::swap(pt1, pt2);
}
//------------------------------------------------------------------------------

//Given two collinear segments (pt1a-pt1b and pt2a-pt2b), writes the end
//points of their common part into pt1/pt2 and returns true when that part
//has non-zero length. The comparison runs along X when the first segment
//is wider than it is tall, otherwise along Y.
bool GetOverlapSegment(IntPoint pt1a, IntPoint pt1b, IntPoint pt2a,
  IntPoint pt2b, IntPoint &pt1, IntPoint &pt2)
{
  //precondition: segments are Collinear.
  const bool compareAlongX = Abs(pt1a.X - pt1b.X) > Abs(pt1a.Y - pt1b.Y);
  if (compareAlongX)
  {
    //order both segments by ascending X
    if (pt1a.X > pt1b.X) SwapPoints(pt1a, pt1b);
    if (pt2a.X > pt2b.X) SwapPoints(pt2a, pt2b);
    pt1 = (pt1a.X > pt2a.X) ? pt1a : pt2a;
    pt2 = (pt1b.X < pt2b.X) ? pt1b : pt2b;
    return pt1.X < pt2.X;
  }

  //order both segments by descending Y
  if (pt1a.Y < pt1b.Y) SwapPoints(pt1a, pt1b);
  if (pt2a.Y < pt2b.Y) SwapPoints(pt2a, pt2b);
  pt1 = (pt1a.Y < pt2a.Y) ? pt1a : pt2a;
  pt2 = (pt1b.Y > pt2b.Y) ? pt1b : pt2b;
  return pt1.Y > pt2.Y;
}
//------------------------------------------------------------------------------

bool FirstIsBottomPt(const OutPt* btmPt1, const OutPt* btmPt2)
{
  OutPt *p = btmPt1->Prev;
  while ((p->Pt == btmPt1->Pt) && (p != btmPt1)) p = p->Prev;
  double dx1p = std::fabs(GetDx(btmPt1->Pt, p->Pt));
  p = btmPt1->Next;
  while ((p->Pt == btmPt1->Pt) && (p != btmPt1)) p = p->Next;
  double dx1n = std::fabs(GetDx(btmPt1->Pt, p->Pt));

  p = btmPt2->Prev;
  while ((p->Pt == btmPt2->Pt) && (p != btmPt2)) p = p->Prev;
  double dx2p = std::fabs(GetDx(btmPt2->Pt, p->Pt));
  p = btmPt2->Next;
  while ((p->Pt == btmPt2->Pt) && (p != btmPt2)) p = p->Next;
  double dx2n = std::fabs(GetDx(btmPt2->Pt, p->Pt));

  if (std::max(dx1p, dx1n) == std::max(dx2p, dx2n) &&
    std::min(dx1p, dx1n) == std::min(dx2p, dx2n))
      return Area(btmPt1) > 0; //if otherwise identical use orientation
  else
    return (dx1p >= dx2p && dx1p >= dx2n) || (dx1n >= dx2p && dx1n >= dx2n);
}
//------------------------------------------------------------------------------

//Walks the circular list containing pp and returns the node with the
//largest Y, breaking ties by the smallest X. When another, non-adjacent
//node shares those exact coordinates, FirstIsBottomPt chooses between them.
OutPt* GetBottomPt(OutPt *pp)
{
  OutPt* dups = 0; //a non-adjacent node with the same coordinates as pp, if any
  OutPt* p = pp->Next;
  while (p != pp)
  {
    if (p->Pt.Y > pp->Pt.Y)
    {
      //strictly larger Y: new best candidate, earlier duplicates are void
      pp = p;
      dups = 0;
    }
    else if (p->Pt.Y == pp->Pt.Y && p->Pt.X <= pp->Pt.X)
    {
      if (p->Pt.X < pp->Pt.X)
      {
        //same Y but smaller X: new best candidate
        dups = 0;
        pp = p;
      } else
      {
        //identical coordinates: only remember p when it is not a direct
        //neighbour of the current best
        if (p->Next != pp && p->Prev != pp) dups = p;
      }
    }
    p = p->Next;
  }
  if (dups)
  {
    //there appears to be at least 2 vertices at BottomPt so ...
    //nb: the loop above only exits when p == pp, so p is the best node found
    while (dups != p)
    {
      if (!FirstIsBottomPt(p, dups)) pp = dups;
      dups = dups->Next;
      //advance to the next node that has the candidate's coordinates
      while (dups->Pt != pp->Pt) dups = dups->Next;
    }
  }
  return pp;
}
//------------------------------------------------------------------------------

//True when pt2 lies strictly between pt1 and pt3, judged along X when pt1
//and pt3 differ in X and along Y otherwise. Any two coinciding points give
//false.
bool Pt2IsBetweenPt1AndPt3(const IntPoint pt1,
  const IntPoint pt2, const IntPoint pt3)
{
  const bool anyCoincide = (pt1 == pt3) || (pt1 == pt2) || (pt3 == pt2);
  if (anyCoincide) return false;

  if (pt1.X != pt3.X)
  {
    const bool pastPt1 = pt2.X > pt1.X;
    const bool beforePt3 = pt2.X < pt3.X;
    return pastPt1 == beforePt3;
  }

  const bool pastPt1 = pt2.Y > pt1.Y;
  const bool beforePt3 = pt2.Y < pt3.Y;
  return pastPt1 == beforePt3;
}
//------------------------------------------------------------------------------

//True when the 1-D intervals [seg1a, seg1b] and [seg2a, seg2b] (end points
//given in either order) share more than a single point.
bool HorzSegmentsOverlap(cInt seg1a, cInt seg1b, cInt seg2a, cInt seg2b)
{
  const cInt low1 = std::min(seg1a, seg1b);
  const cInt high1 = std::max(seg1a, seg1b);
  const cInt low2 = std::min(seg2a, seg2b);
  const cInt high2 = std::max(seg2a, seg2b);
  return (low1 < high2) && (low2 < high1);
}

//------------------------------------------------------------------------------
// ClipperBase class methods ...
//------------------------------------------------------------------------------

//Constructs an empty ClipperBase. Every flag and pointer that other
//ClipperBase methods read is given a defined value here, so that AddPath
//(which reads m_PreserveCollinear) never sees an indeterminate value.
ClipperBase::ClipperBase() //constructor
{
  m_CurrentLM = m_MinimaList.begin(); //begin() == end() here
  m_UseFullRange = false;
  m_PreserveCollinear = false;
  m_HasOpenPaths = false;
  m_ActiveEdges = 0;
}
//------------------------------------------------------------------------------

ClipperBase::~ClipperBase() //destructor
{
  //release the stored local minima and edge arrays
  this->Clear();
}
//------------------------------------------------------------------------------

//Checks that Pt's coordinates fit the current range. A point exceeding
//loRange switches useFullRange on; a point exceeding hiRange throws.
void RangeTest(const IntPoint& Pt, bool& useFullRange)
{
  if (!useFullRange)
  {
    const bool exceedsLo =
      Pt.X > loRange|| Pt.Y > loRange || -Pt.X > loRange || -Pt.Y > loRange;
    if (!exceedsLo) return;
    //too large for the low range: promote and re-test against the high range
    useFullRange = true;
  }

  const bool exceedsHi =
    Pt.X > hiRange || Pt.Y > hiRange || -Pt.X > hiRange || -Pt.Y > hiRange;
  if (exceedsHi)
    throw clipperException("Coordinate outside allowed range");
}
//------------------------------------------------------------------------------

//Starting at E, walks the edge ring and returns the next edge E such that
//E and E->Prev share the same Bot point (a local minimum). Runs of
//horizontal edges at the minimum are stepped over as described below.
TEdge* FindNextLocMin(TEdge* E)
{
  for (;;)
  {
    //advance until E and its predecessor share a Bot and E->Curr is not E's Top
    while (E->Bot != E->Prev->Bot || E->Curr == E->Top) E = E->Next;
    //no horizontals involved: E is the answer
    if (!IsHorizontal(*E) && !IsHorizontal(*E->Prev)) break;
    //rewind to the first edge of the horizontal run ...
    while (IsHorizontal(*E->Prev)) E = E->Prev;
    TEdge* E2 = E;
    //... then move forward to the first non-horizontal edge after it
    while (IsHorizontal(*E)) E = E->Next;
    if (E->Top.Y == E->Prev->Bot.Y) continue; //ie just an intermediate horz.
    //pick the end of the run whose neighbouring Bot has the smaller X
    if (E2->Prev->Bot.X < E->Bot.X) E = E2;
    break;
  }
  return E;
}
//------------------------------------------------------------------------------

//Processes one bound (a chain of edges rising from a local minimum) that
//starts at E. The chain is followed via Next when NextIsForward is true and
//via Prev otherwise; each edge's NextInLML is set to its successor in the
//bound, horizontals are reversed where needed so they connect, and the edge
//just beyond the end of the bound is returned.
//When E is a Skip edge the remainder of the bound (if any) is registered as
//a separate local minimum with only a RightBound.
TEdge* ClipperBase::ProcessBound(TEdge* E, bool NextIsForward)
{
  TEdge *Result = E;
  TEdge *Horz = 0;

  if (E->OutIdx == Skip)
  {
    //if edges still remain in the current bound beyond the skip edge then
    //create another LocMin and call ProcessBound once more
    if (NextIsForward)
    {
      while (E->Top.Y == E->Next->Bot.Y) E = E->Next;
      //don't include top horizontals when parsing a bound a second time,
      //they will be contained in the opposite bound ...
      while (E != Result && IsHorizontal(*E)) E = E->Prev;
    }
    else
    {
      while (E->Top.Y == E->Prev->Bot.Y) E = E->Prev;
      while (E != Result && IsHorizontal(*E)) E = E->Next;
    }

    if (E == Result)
    {
      //nothing follows the skip edge in this bound: just step past it
      if (NextIsForward) Result = E->Next;
      else Result = E->Prev;
    }
    else
    {
      //there are more edges in the bound beyond result starting with E
      if (NextIsForward)
        E = Result->Next;
      else
        E = Result->Prev;
      MinimaList::value_type locMin;
      locMin.Y = E->Bot.Y;
      locMin.LeftBound = 0;
      locMin.RightBound = E;
      E->WindDelta = 0;
      Result = ProcessBound(E, NextIsForward);
      m_MinimaList.push_back(locMin);
    }
    return Result;
  }

  TEdge *EStart;

  if (IsHorizontal(*E))
  {
    //We need to be careful with open paths because this may not be a
    //true local minima (ie E may be following a skip edge).
    //Also, consecutive horz. edges may start heading left before going right.
    if (NextIsForward) 
      EStart = E->Prev;
    else 
      EStart = E->Next;
    if (IsHorizontal(*EStart)) //ie an adjoining horizontal skip edge
      {
        if (EStart->Bot.X != E->Bot.X && EStart->Top.X != E->Bot.X)
          ReverseHorizontal(*E);
      }
      else if (EStart->Bot.X != E->Bot.X)
        ReverseHorizontal(*E);
  }
  
  EStart = E;
  if (NextIsForward)
  {
    //find the last edge of the bound: keep going while the next edge starts
    //where this one tops out and is not a Skip edge
    while (Result->Top.Y == Result->Next->Bot.Y && Result->Next->OutIdx != Skip)
      Result = Result->Next;
    if (IsHorizontal(*Result) && Result->Next->OutIdx != Skip)
    {
      //nb: at the top of a bound, horizontals are added to the bound
      //only when the preceding edge attaches to the horizontal's left vertex
      //unless a Skip edge is encountered when that becomes the top divide
      Horz = Result;
      while (IsHorizontal(*Horz->Prev)) Horz = Horz->Prev;
      if (Horz->Prev->Top.X > Result->Next->Top.X) Result = Horz->Prev;
    }
    //link the edges of the bound through NextInLML, reversing any horizontal
    //(other than the first edge) whose Bot.X doesn't meet the previous Top.X
    while (E != Result) 
    {
      E->NextInLML = E->Next;
      if (IsHorizontal(*E) && E != EStart &&
        E->Bot.X != E->Prev->Top.X) ReverseHorizontal(*E);
      E = E->Next;
    }
    if (IsHorizontal(*E) && E != EStart && E->Bot.X != E->Prev->Top.X) 
      ReverseHorizontal(*E);
    Result = Result->Next; //move to the edge just beyond current bound
  } else
  {
    //mirror image of the forward case, following Prev instead of Next
    while (Result->Top.Y == Result->Prev->Bot.Y && Result->Prev->OutIdx != Skip) 
      Result = Result->Prev;
    if (IsHorizontal(*Result) && Result->Prev->OutIdx != Skip)
    {
      Horz = Result;
      while (IsHorizontal(*Horz->Next)) Horz = Horz->Next;
      if (Horz->Next->Top.X == Result->Prev->Top.X ||
          Horz->Next->Top.X > Result->Prev->Top.X) Result = Horz->Next;
    }

    while (E != Result)
    {
      E->NextInLML = E->Prev;
      if (IsHorizontal(*E) && E != EStart && E->Bot.X != E->Next->Top.X) 
        ReverseHorizontal(*E);
      E = E->Prev;
    }
    if (IsHorizontal(*E) && E != EStart && E->Bot.X != E->Next->Top.X) 
      ReverseHorizontal(*E);
    Result = Result->Prev; //move to the edge just beyond current bound
  }

  return Result;
}
//------------------------------------------------------------------------------

//Adds one path to the clipper as a ring of TEdge records and registers its
//local minima.
//  pg      - the path's vertices
//  PolyTyp - stored on every edge of the path (see InitEdge2)
//  Closed  - false for an open path (only permitted when use_lines is
//            defined, and then not with ptClip)
//Returns false when the path is rejected (too few distinct vertices, or a
//closed path that is completely flat); true once it has been stored.
//Throws clipperException for a disallowed open path, and rethrows whatever
//RangeTest throws for out-of-range coordinates.
bool ClipperBase::AddPath(const Path &pg, PolyType PolyTyp, bool Closed)
{
#ifdef use_lines
  if (!Closed && PolyTyp == ptClip)
    throw clipperException("AddPath: Open paths must be subject.");
#else
  if (!Closed)
    throw clipperException("AddPath: Open paths have been disabled.");
#endif

  //trim trailing vertices that duplicate the first vertex (closed paths
  //only) or their predecessor
  int highI = (int)pg.size() -1;
  if (Closed) while (highI > 0 && (pg[highI] == pg[0])) --highI;
  while (highI > 0 && (pg[highI] == pg[highI -1])) --highI;
  if ((Closed && highI < 2) || (!Closed && highI < 1)) return false;

  //create a new edge array ...
  TEdge *edges = new TEdge [highI +1];

  bool IsFlat = true;
  //1. Basic (first) edge initialization ...
  try
  {
    //nb: edges[1] is fully re-initialised by one of the InitEdge calls below
    edges[1].Curr = pg[1];
    RangeTest(pg[0], m_UseFullRange);
    RangeTest(pg[highI], m_UseFullRange);
    InitEdge(&edges[0], &edges[1], &edges[highI], pg[0]);
    InitEdge(&edges[highI], &edges[0], &edges[highI-1], pg[highI]);
    for (int i = highI - 1; i >= 1; --i)
    {
      RangeTest(pg[i], m_UseFullRange);
      InitEdge(&edges[i], &edges[i+1], &edges[i-1], pg[i]);
    }
  }
  catch(...)
  {
    delete [] edges;
    throw; //range test fails
  }
  TEdge *eStart = &edges[0];

  //2. Remove duplicate vertices, and (when closed) collinear edges ...
  TEdge *E = eStart, *eLoopStop = eStart;
  for (;;)
  {
    //nb: allows matching start and end points when not Closed ...
    if (E->Curr == E->Next->Curr && (Closed || E->Next != eStart))
    {
      if (E == E->Next) break;
      if (E == eStart) eStart = E->Next;
      E = RemoveEdge(E);
      eLoopStop = E; //restart the lap: every remaining edge must be re-checked
      continue;
    }
    if (E->Prev == E->Next) 
      break; //only two vertices
    else if (Closed &&
      SlopesEqual(E->Prev->Curr, E->Curr, E->Next->Curr, m_UseFullRange) && 
      (!m_PreserveCollinear ||
      !Pt2IsBetweenPt1AndPt3(E->Prev->Curr, E->Curr, E->Next->Curr)))
    {
      //Collinear edges are allowed for open paths but in closed paths
      //the default is to merge adjacent collinear edges into a single edge.
      //However, if the PreserveCollinear property is enabled, only overlapping
      //collinear edges (ie spikes) will be removed from closed paths.
      if (E == eStart) eStart = E->Next;
      E = RemoveEdge(E);
      E = E->Prev;
      eLoopStop = E;
      continue;
    }
    E = E->Next;
    if ((E == eLoopStop) || (!Closed && E->Next == eStart)) break;
  }

  //reject paths that have collapsed to too few edges
  if ((!Closed && (E == E->Next)) || (Closed && (E->Prev == E->Next)))
  {
    delete [] edges;
    return false;
  }

  if (!Closed)
  { 
    m_HasOpenPaths = true;
    //the edge joining the last vertex back to the first is flagged Skip
    eStart->Prev->OutIdx = Skip;
  }

  //3. Do second stage of edge initialization ...
  E = eStart;
  do
  {
    InitEdge2(*E, PolyTyp);
    E = E->Next;
    if (IsFlat && E->Curr.Y != eStart->Curr.Y) IsFlat = false;
  }
  while (E != eStart);

  //4. Finally, add edge bounds to LocalMinima list ...

  //Totally flat paths must be handled differently when adding them
  //to LocalMinima list to avoid endless loops etc ...
  if (IsFlat) 
  {
    if (Closed) 
    {
      delete [] edges;
      return false;
    }
    //a flat open path becomes a single local minimum with only a RightBound
    E->Prev->OutIdx = Skip;
    MinimaList::value_type locMin;
    locMin.Y = E->Bot.Y;
    locMin.LeftBound = 0;
    locMin.RightBound = E;
    locMin.RightBound->Side = esRight;
    locMin.RightBound->WindDelta = 0;
    for (;;)
    {
      if (E->Bot.X != E->Prev->Top.X) ReverseHorizontal(*E);
      if (E->Next->OutIdx == Skip) break;
      E->NextInLML = E->Next;
      E = E->Next;
    }
    m_MinimaList.push_back(locMin);
    m_edges.push_back(edges);
	  return true;
  }

  //from here on the edge array is owned by m_edges (released in Clear)
  m_edges.push_back(edges);
  bool leftBoundIsForward;
  TEdge* EMin = 0;

  //workaround to avoid an endless loop in the while loop below when
  //open paths have matching start and end points ...
  if (E->Prev->Bot == E->Prev->Top) E = E->Next;

  for (;;)
  {
    E = FindNextLocMin(E);
    if (E == EMin) break; //back at the first local minimum found: all done
    else if (!EMin) EMin = E;

    //E and E.Prev now share a local minima (left aligned if horizontal).
    //Compare their slopes to find which starts which bound ...
    MinimaList::value_type locMin;
    locMin.Y = E->Bot.Y;
    if (E->Dx < E->Prev->Dx) 
    {
      locMin.LeftBound = E->Prev;
      locMin.RightBound = E;
      leftBoundIsForward = false; //Q.nextInLML = Q.prev
    } else
    {
      locMin.LeftBound = E;
      locMin.RightBound = E->Prev;
      leftBoundIsForward = true; //Q.nextInLML = Q.next
    }

    //open paths get a WindDelta of 0; for closed paths the two bounds get
    //opposite deltas of magnitude 1
    if (!Closed) locMin.LeftBound->WindDelta = 0;
    else if (locMin.LeftBound->Next == locMin.RightBound)
      locMin.LeftBound->WindDelta = -1;
    else locMin.LeftBound->WindDelta = 1;
    locMin.RightBound->WindDelta = -locMin.LeftBound->WindDelta;

    E = ProcessBound(locMin.LeftBound, leftBoundIsForward);
    if (E->OutIdx == Skip) E = ProcessBound(E, leftBoundIsForward);

    TEdge* E2 = ProcessBound(locMin.RightBound, !leftBoundIsForward);
    if (E2->OutIdx == Skip) E2 = ProcessBound(E2, !leftBoundIsForward);

    //a bound that starts with a Skip edge is dropped from the local minimum
    if (locMin.LeftBound->OutIdx == Skip)
      locMin.LeftBound = 0;
    else if (locMin.RightBound->OutIdx == Skip)
      locMin.RightBound = 0;
    m_MinimaList.push_back(locMin);
    //continue the search from the end of whichever bound ran forward
    if (!leftBoundIsForward) E = E2;
  }
  return true;
}
//------------------------------------------------------------------------------

bool ClipperBase::AddPaths(const Paths &ppg, PolyType PolyTyp, bool Closed)
{
  bool result = false;
  for (Paths::size_type i = 0; i < ppg.size(); ++i)
    if (AddPath(ppg[i], PolyTyp, Closed)) result = true;
  return result;
}
//------------------------------------------------------------------------------

void ClipperBase::Clear()
{
  DisposeLocalMinimaList();
  for (EdgeList::size_type i = 0; i < m_edges.size(); ++i)
  {
    TEdge* edges = m_edges[i];
    delete [] edges;
  }
  m_edges.clear();
  m_UseFullRange = false;
  m_HasOpenPaths = false;
}
//------------------------------------------------------------------------------

void ClipperBase::Reset()
{
  m_CurrentLM = m_MinimaList.begin();
  if (m_CurrentLM == m_MinimaList.end()) return; //ie nothing to process
  std::sort(m_MinimaList.begin(), m_MinimaList.end(), LocMinSorter());

  m_Scanbeam = ScanbeamList(); //clears/resets priority_queue
  //reset all edges ...
  for (MinimaList::iterator lm = m_MinimaList.begin(); lm != m_MinimaList.end(); ++lm)
  {
    InsertScanbeam(lm->Y);
    TEdge* e = lm->LeftBound;
    if (e)
    {
      e->Curr = e->Bot;
      e->Side = esLeft;
      e->OutIdx = Unassigned;
    }

    e = lm->RightBound;
    if (e)
    {
      e->Curr = e->Bot;
      e->Side = esRight;
      e->OutIdx = Unassigned;
    }
  }
  m_ActiveEdges = 0;
  m_CurrentLM = m_MinimaList.begin();
}
//------------------------------------------------------------------------------

//Empties the local minima list and parks the cursor on the empty list.
void ClipperBase::DisposeLocalMinimaList()
{
  m_MinimaList.clear();
  m_CurrentLM = m_MinimaList.end(); //begin() == end() once the list is empty
}
//------------------------------------------------------------------------------

//When the current local minimum has the given Y, hands it back through
//locMin, advances the cursor and returns true; otherwise returns false and
//leaves locMin untouched.
bool ClipperBase::PopLocalMinima(cInt Y, const LocalMinimum *&locMin)
{
  const bool available =
    (m_CurrentLM != m_MinimaList.end()) && (m_CurrentLM->Y == Y);
  if (!available) return false;
  locMin = &*m_CurrentLM;
  ++m_CurrentLM;
  return true;
}
//------------------------------------------------------------------------------

//Returns the bounding rectangle of every edge reachable from the local
//minima list (all zeros when the list holds no edges).
//nb: open paths can leave one bound of a local minimum null (AddPath and
//ProcessBound both store LeftBound = 0 in that case), so each bound is
//checked before it is dereferenced.
IntRect ClipperBase::GetBounds()
{
  IntRect result;
  result.left = result.top = result.right = result.bottom = 0;
  bool seeded = false; //becomes true once result holds a real coordinate

  for (MinimaList::iterator lm = m_MinimaList.begin(); lm != m_MinimaList.end(); ++lm)
  {
    TEdge* const bounds[2] = { lm->LeftBound, lm->RightBound };
    for (int i = 0; i < 2; ++i)
    {
      TEdge* e = bounds[i];
      if (!e) continue; //bound absent (open path)
      if (!seeded)
      {
        //start the rectangle at the first edge's bottom point
        result.left = result.right = e->Bot.X;
        result.top = result.bottom = e->Bot.Y;
        seeded = true;
      }
      //the first edge of a bound holds the bound's largest Y
      result.bottom = std::max(result.bottom, e->Bot.Y);
      //walk up the bound, widening the rectangle with each edge's Bot.X
      while (e->NextInLML)
      {
        if (e->Bot.X < result.left) result.left = e->Bot.X;
        if (e->Bot.X > result.right) result.right = e->Bot.X;
        e = e->NextInLML;
      }
      //the last edge of the bound contributes both of its end points
      result.left = std::min(result.left, e->Bot.X);
      result.right = std::max(result.right, e->Bot.X);
      result.left = std::min(result.left, e->Top.X);
      result.right = std::max(result.right, e->Top.X);
      result.top = std::min(result.top, e->Top.Y);
    }
  }
  return result;
}
//------------------------------------------------------------------------------

void ClipperBase::InsertScanbeam(const cInt Y)
{
  m_Scanbeam.push(Y);
}
//------------------------------------------------------------------------------

//Removes the top scanbeam value (together with any duplicates of it) and
//returns it through Y; returns false when the queue is empty.
bool ClipperBase::PopScanbeam(cInt &Y)
{
  if (m_Scanbeam.empty()) return false;
  Y = m_Scanbeam.top();
  //the first pop removes Y itself, any further ones remove its duplicates
  do
  {
    m_Scanbeam.pop();
  }
  while (!m_Scanbeam.empty() && m_Scanbeam.top() == Y);
  return true;
}
//------------------------------------------------------------------------------

void ClipperBase::DisposeAllOutRecs(){
  for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i)
    DisposeOutRec(i);
  m_PolyOuts.clear();
}
//------------------------------------------------------------------------------

//Frees the output record at the given index (and its point list, if any)
//and leaves a null pointer in its slot.
void ClipperBase::DisposeOutRec(PolyOutList::size_type index)
{
  OutRec *&slot = m_PolyOuts[index];
  if (slot->Pts) DisposeOutPts(slot->Pts);
  delete slot;
  slot = 0;
}
//------------------------------------------------------------------------------

//Unlinks e from the active edge list and clears its AEL links. An edge
//with no neighbours that is not the list head is treated as already removed.
void ClipperBase::DeleteFromAEL(TEdge *e)
{
  TEdge* const before = e->PrevInAEL;
  TEdge* const after = e->NextInAEL;

  const bool hasNoLinks = !before && !after;
  if (hasNoLinks && (e != m_ActiveEdges)) return; //already deleted

  if (before)
    before->NextInAEL = after;
  else
    m_ActiveEdges = after; //e was the head of the list
  if (after) after->PrevInAEL = before;

  e->PrevInAEL = 0;
  e->NextInAEL = 0;
}
//------------------------------------------------------------------------------

//Allocates a blank output record, appends it to m_PolyOuts and returns it.
//The record's Idx is its position in m_PolyOuts.
OutRec* ClipperBase::CreateOutRec()
{
  OutRec* rec = new OutRec;
  rec->Idx = (int)m_PolyOuts.size(); //index the record will occupy once appended
  rec->IsHole = false;
  rec->IsOpen = false;
  rec->FirstLeft = 0;
  rec->Pts = 0;
  rec->BottomPt = 0;
  rec->PolyNd = 0;
  m_PolyOuts.push_back(rec);
  return rec;
}
//------------------------------------------------------------------------------

//Exchanges the positions of Edge1 and Edge2 in the doubly linked active
//edge list, fixing up their neighbours' links and the list head
//(m_ActiveEdges). Adjacent edges are handled separately from the general
//case because their links refer to each other.
void ClipperBase::SwapPositionsInAEL(TEdge *Edge1, TEdge *Edge2)
{
  //check that one or other edge hasn't already been removed from AEL ...
  if (Edge1->NextInAEL == Edge1->PrevInAEL ||
    Edge2->NextInAEL == Edge2->PrevInAEL) return;

  if (Edge1->NextInAEL == Edge2)
  {
    //Edge1 sits immediately before Edge2
    TEdge* Next = Edge2->NextInAEL;
    if (Next) Next->PrevInAEL = Edge1;
    TEdge* Prev = Edge1->PrevInAEL;
    if (Prev) Prev->NextInAEL = Edge2;
    Edge2->PrevInAEL = Prev;
    Edge2->NextInAEL = Edge1;
    Edge1->PrevInAEL = Edge2;
    Edge1->NextInAEL = Next;
  }
  else if (Edge2->NextInAEL == Edge1)
  {
    //Edge2 sits immediately before Edge1
    TEdge* Next = Edge1->NextInAEL;
    if (Next) Next->PrevInAEL = Edge2;
    TEdge* Prev = Edge2->PrevInAEL;
    if (Prev) Prev->NextInAEL = Edge1;
    Edge1->PrevInAEL = Prev;
    Edge1->NextInAEL = Edge2;
    Edge2->PrevInAEL = Edge1;
    Edge2->NextInAEL = Next;
  }
  else
  {
    //the edges are not adjacent: exchange both pairs of links and repoint
    //each edge's new neighbours at it
    TEdge* Next = Edge1->NextInAEL;
    TEdge* Prev = Edge1->PrevInAEL;
    Edge1->NextInAEL = Edge2->NextInAEL;
    if (Edge1->NextInAEL) Edge1->NextInAEL->PrevInAEL = Edge1;
    Edge1->PrevInAEL = Edge2->PrevInAEL;
    if (Edge1->PrevInAEL) Edge1->PrevInAEL->NextInAEL = Edge1;
    Edge2->NextInAEL = Next;
    if (Edge2->NextInAEL) Edge2->NextInAEL->PrevInAEL = Edge2;
    Edge2->PrevInAEL = Prev;
    if (Edge2->PrevInAEL) Edge2->PrevInAEL->NextInAEL = Edge2;
  }

  //whichever edge now has no predecessor is the new head of the list
  if (!Edge1->PrevInAEL) m_ActiveEdges = Edge1;
  else if (!Edge2->PrevInAEL) m_ActiveEdges = Edge2;
}
//------------------------------------------------------------------------------

//Replaces e in the active edge list with the next edge of its bound
//(e->NextInLML). The successor inherits e's output index, side and winding
//values and takes over e's AEL links; on return e refers to the successor.
//Throws clipperException when e has no NextInLML.
void ClipperBase::UpdateEdgeIntoAEL(TEdge *&e)
{
  if (!e->NextInLML) 
    throw clipperException("UpdateEdgeIntoAEL: invalid call");

  e->NextInLML->OutIdx = e->OutIdx;
  TEdge* AelPrev = e->PrevInAEL;
  TEdge* AelNext = e->NextInAEL;
  //splice the successor into e's place in the list
  if (AelPrev) AelPrev->NextInAEL = e->NextInLML;
  else m_ActiveEdges = e->NextInLML;
  if (AelNext) AelNext->PrevInAEL = e->NextInLML;
  e->NextInLML->Side = e->Side;
  e->NextInLML->WindDelta = e->WindDelta;
  e->NextInLML->WindCnt = e->WindCnt;
  e->NextInLML->WindCnt2 = e->WindCnt2;
  e = e->NextInLML; //from here on e is the successor edge
  e->Curr = e->Bot;
  e->PrevInAEL = AelPrev;
  e->NextInAEL = AelNext;
  //a non-horizontal edge's top is queued as a scanbeam boundary
  if (!IsHorizontal(*e)) InsertScanbeam(e->Top.Y);
}
//------------------------------------------------------------------------------

//True while the local minima cursor has not reached the end of the list.
bool ClipperBase::LocalMinimaPending()
{
  const bool exhausted = (m_CurrentLM == m_MinimaList.end());
  return !exhausted;
}

//------------------------------------------------------------------------------
// TClipper methods ...
//------------------------------------------------------------------------------

//Builds a Clipper whose behaviour flags are decoded from the initOptions
//bit mask (ioReverseSolution, ioStrictlySimple, ioPreserveCollinear).
Clipper::Clipper(int initOptions) : ClipperBase() //constructor
{
  const bool wantReversed = (initOptions & ioReverseSolution) != 0;
  const bool wantStrictlySimple = (initOptions & ioStrictlySimple) != 0;
  const bool wantCollinear = (initOptions & ioPreserveCollinear) != 0;

  m_ReverseOutput = wantReversed;
  m_StrictSimple = wantStrictlySimple;
  m_PreserveCollinear = wantCollinear;

  m_ExecuteLocked = false;
  m_UseFullRange = false;
  m_HasOpenPaths = false;
#ifdef use_xyz  
  m_ZFill = 0;
#endif
}
//------------------------------------------------------------------------------

#ifdef use_xyz  
// Stores the callback that SetZ() invokes to fill in the Z member of a point
// that matches none of the intersecting edges' end points.
void Clipper::ZFillFunction(ZFillCallback zFillFunc)
{
  this->m_ZFill = zFillFunc;
}
//------------------------------------------------------------------------------
#endif

// Convenience overload: applies the same fill rule to both the subject and
// the clip polygons and forwards to the four-argument Paths overload.
bool Clipper::Execute(ClipType clipType, Paths &solution, PolyFillType fillType)
{
  const PolyFillType sharedFill = fillType;
  return Execute(clipType, solution, sharedFill, sharedFill);
}
//------------------------------------------------------------------------------

// Convenience overload: applies the same fill rule to both the subject and
// the clip polygons and forwards to the four-argument PolyTree overload.
bool Clipper::Execute(ClipType clipType, PolyTree &polytree, PolyFillType fillType)
{
  const PolyFillType sharedFill = fillType;
  return Execute(clipType, polytree, sharedFill, sharedFill);
}
//------------------------------------------------------------------------------

// Runs the clipping operation and writes the resulting closed paths into
// 'solution'.
//
// clipType      - boolean operation to perform.
// solution      - receives the result; it is emptied before the run.
// subjFillType  - fill rule applied to subject polygons.
// clipFillType  - fill rule applied to clip polygons.
//
// Returns false when another Execute() is already in progress on this object
// or when ExecuteInternal() reports failure; true otherwise.
// Throws clipperException when open paths have been added (those need the
// PolyTree overload).
//
// Exception safety: if anything throws after the lock flag has been set
// (for instance an allocation failure while the result is being built), the
// output records are disposed and m_ExecuteLocked is cleared before the
// exception is re-thrown. Without this the object would stay locked and every
// later Execute() call would return false.
bool Clipper::Execute(ClipType clipType, Paths &solution,
    PolyFillType subjFillType, PolyFillType clipFillType)
{
  if( m_ExecuteLocked ) return false;
  if (m_HasOpenPaths)
    throw clipperException("Error: PolyTree struct is needed for open path clipping.");
  m_ExecuteLocked = true;
  bool succeeded = false;
  try
  {
    solution.resize(0);
    m_SubjFillType = subjFillType;
    m_ClipFillType = clipFillType;
    m_ClipType = clipType;
    m_UsingPolyTree = false;
    succeeded = ExecuteInternal();
    if (succeeded) BuildResult(solution);
  }
  catch (...)
  {
    //release everything this call acquired, then let the caller see the error
    DisposeAllOutRecs();
    m_ExecuteLocked = false;
    throw;
  }
  DisposeAllOutRecs();
  m_ExecuteLocked = false;
  return succeeded;
}
//------------------------------------------------------------------------------

// Runs the clipping operation and writes the result into 'polytree'.
//
// clipType      - boolean operation to perform.
// polytree      - receives the result via BuildResult2().
// subjFillType  - fill rule applied to subject polygons.
// clipFillType  - fill rule applied to clip polygons.
//
// Returns false when another Execute() is already in progress on this object
// or when ExecuteInternal() reports failure; true otherwise.
//
// Exception safety: if anything throws after the lock flag has been set
// (for instance an allocation failure while the tree is being built), the
// output records are disposed and m_ExecuteLocked is cleared before the
// exception is re-thrown. Without this the object would stay locked and every
// later Execute() call would return false.
bool Clipper::Execute(ClipType clipType, PolyTree& polytree,
    PolyFillType subjFillType, PolyFillType clipFillType)
{
  if( m_ExecuteLocked ) return false;
  m_ExecuteLocked = true;
  bool succeeded = false;
  try
  {
    m_SubjFillType = subjFillType;
    m_ClipFillType = clipFillType;
    m_ClipType = clipType;
    m_UsingPolyTree = true;
    succeeded = ExecuteInternal();
    if (succeeded) BuildResult2(polytree);
  }
  catch (...)
  {
    //release everything this call acquired, then let the caller see the error
    DisposeAllOutRecs();
    m_ExecuteLocked = false;
    throw;
  }
  DisposeAllOutRecs();
  m_ExecuteLocked = false;
  return succeeded;
}
//------------------------------------------------------------------------------

// Re-points outrec.FirstLeft at the nearest record up the FirstLeft chain
// that still has points and whose IsHole flag differs from outrec's
// (or at 0 when no such record exists).
void Clipper::FixHoleLinkage(OutRec &outrec)
{
  OutRec* owner = outrec.FirstLeft;

  //an outermost polygon has nothing to fix ...
  if (!owner) return;
  //... and neither has a record whose owner linkage is already correct
  if (owner->Pts && owner->IsHole != outrec.IsHole) return;

  //climb until a usable owner of the opposite hole state turns up
  for (; owner; owner = owner->FirstLeft)
  {
    if (owner->Pts && owner->IsHole != outrec.IsHole) break;
  }
  outrec.FirstLeft = owner;
}
//------------------------------------------------------------------------------

bool Clipper::ExecuteInternal()
{
  bool succeeded = true;
  try {
    Reset();
    m_Maxima = MaximaList();
    m_SortedEdges = 0;

    succeeded = true;
    cInt botY, topY;
    if (!PopScanbeam(botY)) return false;
    InsertLocalMinimaIntoAEL(botY);
    while (PopScanbeam(topY) || LocalMinimaPending())
    {
      ProcessHorizontals();
	    ClearGhostJoins();
      if (!ProcessIntersections(topY))
      {
        succeeded = false;
        break;
      }
      ProcessEdgesAtTopOfScanbeam(topY);
      botY = topY;
      InsertLocalMinimaIntoAEL(botY);
    }
  }
  catch(...) 
  {
    succeeded = false;
  }

  if (succeeded)
  {
    //fix orientations ...
    for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i)
    {
      OutRec *outRec = m_PolyOuts[i];
      if (!outRec->Pts || outRec->IsOpen) continue;
      if ((outRec->IsHole ^ m_ReverseOutput) == (Area(*outRec) > 0))
        ReversePolyPtLinks(outRec->Pts);
    }

    if (!m_Joins.empty()) JoinCommonEdges();

    //unfortunately FixupOutPolygon() must be done after JoinCommonEdges()
    for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i)
    {
      OutRec *outRec = m_PolyOuts[i];
      if (!outRec->Pts) continue;
      if (outRec->IsOpen)
        FixupOutPolyline(*outRec);
      else
        FixupOutPolygon(*outRec);
    }

    if (m_StrictSimple) DoSimplePolygons();
  }

  ClearJoins();
  ClearGhostJoins();
  return succeeded;
}
//------------------------------------------------------------------------------

// Computes edge.WindCnt (winding count with respect to edges of its own
// polytype) and edge.WindCnt2 (count with respect to the remaining edges that
// precede it in the AEL) from the edges to its left in the AEL.
// Edges with WindDelta == 0 belong to open paths (see AddOutPt, which sets
// OutRec::IsOpen from that test) and get special-cased values.
void Clipper::SetWindingCount(TEdge &edge)
{
  TEdge *e = edge.PrevInAEL;
  //find the edge of the same polytype that immediately precedes 'edge' in AEL
  //(open-path edges, ie WindDelta == 0, are skipped)
  while (e  && ((e->PolyTyp != edge.PolyTyp) || (e->WindDelta == 0))) e = e->PrevInAEL;
  if (!e)
  {
    //no closed edge of the same polytype lies to the left of 'edge' ...
    if (edge.WindDelta == 0)
    {
      //open path: use -1 under the Negative fill rule, otherwise 1
      PolyFillType pft = (edge.PolyTyp == ptSubject ? m_SubjFillType : m_ClipFillType);
      edge.WindCnt = (pft == pftNegative ? -1 : 1);
    }
    else
      edge.WindCnt = edge.WindDelta;
    edge.WindCnt2 = 0;
    e = m_ActiveEdges; //ie get ready to calc WindCnt2
  }   
  else if (edge.WindDelta == 0 && m_ClipType != ctUnion)
  {
    //open path edge in a non-union operation: WindCnt is simply 1 and
    //WindCnt2 starts from that of the preceding same-type edge
    edge.WindCnt = 1;
    edge.WindCnt2 = e->WindCnt2;
    e = e->NextInAEL; //ie get ready to calc WindCnt2
  }
  else if (IsEvenOddFillType(edge))
  {
    //EvenOdd filling ...
    if (edge.WindDelta == 0)
    {
      //are we inside a subj polygon ...
      //(toggle once for every closed same-type edge further left than e)
      bool Inside = true;
      TEdge *e2 = e->PrevInAEL;
      while (e2)
      {
        if (e2->PolyTyp == e->PolyTyp && e2->WindDelta != 0) 
          Inside = !Inside;
        e2 = e2->PrevInAEL;
      }
      edge.WindCnt = (Inside ? 0 : 1);
    }
    else
    {
      edge.WindCnt = edge.WindDelta;
    }
    edge.WindCnt2 = e->WindCnt2;
    e = e->NextInAEL; //ie get ready to calc WindCnt2
  } 
  else
  {
    //nonZero, Positive or Negative filling ...
    if (e->WindCnt * e->WindDelta < 0)
    {
      //prev edge is 'decreasing' WindCount (WC) toward zero
      //so we're outside the previous polygon ...
      if (Abs(e->WindCnt) > 1)
      {
        //outside prev poly but still inside another.
        //when reversing direction of prev poly use the same WC 
        if (e->WindDelta * edge.WindDelta < 0) edge.WindCnt = e->WindCnt;
        //otherwise continue to 'decrease' WC ...
        else edge.WindCnt = e->WindCnt + edge.WindDelta;
      } 
      else
        //now outside all polys of same polytype so set own WC ...
        edge.WindCnt = (edge.WindDelta == 0 ? 1 : edge.WindDelta);
    } else
    {
      //prev edge is 'increasing' WindCount (WC) away from zero
      //so we're inside the previous polygon ...
      if (edge.WindDelta == 0) 
        edge.WindCnt = (e->WindCnt < 0 ? e->WindCnt - 1 : e->WindCnt + 1);
      //if wind direction is reversing prev then use same WC
      else if (e->WindDelta * edge.WindDelta < 0) edge.WindCnt = e->WindCnt;
      //otherwise add to WC ...
      else edge.WindCnt = e->WindCnt + edge.WindDelta;
    }
    edge.WindCnt2 = e->WindCnt2;
    e = e->NextInAEL; //ie get ready to calc WindCnt2
  }

  //update WindCnt2 ...
  //'e' now points at the first AEL edge still to be accounted for; walk
  //rightwards from there until 'edge' itself is reached.
  if (IsEvenOddAltFillType(edge))
  {
    //EvenOdd filling ...
    //(every closed edge passed toggles WindCnt2 between 0 and 1)
    while (e != &edge)
    {
      if (e->WindDelta != 0)
        edge.WindCnt2 = (edge.WindCnt2 == 0 ? 1 : 0);
      e = e->NextInAEL;
    }
  } else
  {
    //nonZero, Positive or Negative filling ...
    //(accumulate the WindDelta of every edge passed)
    while ( e != &edge )
    {
      edge.WindCnt2 += e->WindDelta;
      e = e->NextInAEL;
    }
  }
}
//------------------------------------------------------------------------------

bool Clipper::IsEvenOddFillType(const TEdge& edge) const
{
  if (edge.PolyTyp == ptSubject)
    return m_SubjFillType == pftEvenOdd; else
    return m_ClipFillType == pftEvenOdd;
}
//------------------------------------------------------------------------------

bool Clipper::IsEvenOddAltFillType(const TEdge& edge) const
{
  if (edge.PolyTyp == ptSubject)
    return m_ClipFillType == pftEvenOdd; else
    return m_SubjFillType == pftEvenOdd;
}
//------------------------------------------------------------------------------

bool Clipper::IsContributing(const TEdge& edge) const
{
  PolyFillType pft, pft2;
  if (edge.PolyTyp == ptSubject)
  {
    pft = m_SubjFillType;
    pft2 = m_ClipFillType;
  } else
  {
    pft = m_ClipFillType;
    pft2 = m_SubjFillType;
  }

  switch(pft)
  {
    case pftEvenOdd: 
      //return false if a subj line has been flagged as inside a subj polygon
      if (edge.WindDelta == 0 && edge.WindCnt != 1) return false;
      break;
    case pftNonZero:
      if (Abs(edge.WindCnt) != 1) return false;
      break;
    case pftPositive: 
      if (edge.WindCnt != 1) return false;
      break;
    default: //pftNegative
      if (edge.WindCnt != -1) return false;
  }

  switch(m_ClipType)
  {
    case ctIntersection:
      switch(pft2)
      {
        case pftEvenOdd: 
        case pftNonZero: 
          return (edge.WindCnt2 != 0);
        case pftPositive: 
          return (edge.WindCnt2 > 0);
        default: 
          return (edge.WindCnt2 < 0);
      }
      break;
    case ctUnion:
      switch(pft2)
      {
        case pftEvenOdd: 
        case pftNonZero: 
          return (edge.WindCnt2 == 0);
        case pftPositive: 
          return (edge.WindCnt2 <= 0);
        default: 
          return (edge.WindCnt2 >= 0);
      }
      break;
    case ctDifference:
      if (edge.PolyTyp == ptSubject)
        switch(pft2)
        {
          case pftEvenOdd: 
          case pftNonZero: 
            return (edge.WindCnt2 == 0);
          case pftPositive: 
            return (edge.WindCnt2 <= 0);
          default: 
            return (edge.WindCnt2 >= 0);
        }
      else
        switch(pft2)
        {
          case pftEvenOdd: 
          case pftNonZero: 
            return (edge.WindCnt2 != 0);
          case pftPositive: 
            return (edge.WindCnt2 > 0);
          default: 
            return (edge.WindCnt2 < 0);
        }
      break;
    case ctXor:
      if (edge.WindDelta == 0) //XOr always contributing unless open
        switch(pft2)
        {
          case pftEvenOdd: 
          case pftNonZero: 
            return (edge.WindCnt2 == 0);
          case pftPositive: 
            return (edge.WindCnt2 <= 0);
          default: 
            return (edge.WindCnt2 >= 0);
        }
      else 
        return true;
      break;
    default:
      return true;
  }
}
//------------------------------------------------------------------------------

// Starts (or extends) an output polygon at the point 'Pt' shared by e1 and
// e2, assigns the two edges their left/right sides and, when the edge to the
// left in the AEL is collinear with the new left edge at Pt, records a join.
// Returns the OutPt added for Pt.
OutPt* Clipper::AddLocalMinPoly(TEdge *e1, TEdge *e2, const IntPoint &Pt)
{
  //e1 takes the left side when e2 is horizontal or e1 has the larger Dx
  const bool e1IsLeft = IsHorizontal(*e2) || ( e1->Dx > e2->Dx );
  TEdge* const leftEdge = e1IsLeft ? e1 : e2;
  TEdge* const rightEdge = e1IsLeft ? e2 : e1;

  OutPt* const result = AddOutPt(leftEdge, Pt);
  rightEdge->OutIdx = leftEdge->OutIdx;
  e1->Side = e1IsLeft ? esLeft : esRight;
  e2->Side = e1IsLeft ? esRight : esLeft;

  //the AEL neighbour to the left of the pair (skipping the partner edge)
  TEdge* prevE = leftEdge->PrevInAEL;
  if (prevE == rightEdge) prevE = rightEdge->PrevInAEL;

  if (!prevE || prevE->OutIdx < 0) return result;
  if (!(prevE->Top.Y < Pt.Y) || !(leftEdge->Top.Y < Pt.Y)) return result;

  const cInt xPrev = TopX(*prevE, Pt.Y);
  const cInt xE = TopX(*leftEdge, Pt.Y);
  if (xPrev != xE) return result;
  if (leftEdge->WindDelta == 0 || prevE->WindDelta == 0) return result;
  if (!SlopesEqual(IntPoint(xPrev, Pt.Y), prevE->Top,
        IntPoint(xE, Pt.Y), leftEdge->Top, m_UseFullRange)) return result;

  //both edges are closed, touch at Pt.Y and share a slope: join them later
  OutPt* const outPt = AddOutPt(prevE, Pt);
  AddJoin(result, outPt, leftEdge->Top);
  return result;
}
//------------------------------------------------------------------------------

// Adds 'Pt' to e1's output (and to e2's when e2 is an open-path edge), then
// either retires both edges when they share one output record or merges
// their two records, keeping the one with the lower index.
void Clipper::AddLocalMaxPoly(TEdge *e1, TEdge *e2, const IntPoint &Pt)
{
  AddOutPt( e1, Pt );
  if (e2->WindDelta == 0) AddOutPt(e2, Pt);

  const int idx1 = e1->OutIdx;
  const int idx2 = e2->OutIdx;
  if (idx1 == idx2)
  {
    //both edges already belong to the same record
    e1->OutIdx = Unassigned;
    e2->OutIdx = Unassigned;
    return;
  }

  if (idx1 < idx2)
    AppendPolygon(e1, e2);
  else
    AppendPolygon(e2, e1);
}
//------------------------------------------------------------------------------

// Pushes 'edge' onto the front of the SEL (m_SortedEdges).
// The SEL pointers are reused here to hold a list of horizontal edges, whose
// processing order does not matter.
void Clipper::AddEdgeToSEL(TEdge *edge)
{
  TEdge* const oldHead = m_SortedEdges;
  edge->PrevInSEL = 0;
  edge->NextInSEL = oldHead; //0 when the list was empty
  if (oldHead) oldHead->PrevInSEL = edge;
  m_SortedEdges = edge;
}
//------------------------------------------------------------------------------

// Removes the head of the SEL and hands it back through 'edge'.
// Returns false (leaving 'edge' untouched) when the SEL is empty.
bool Clipper::PopEdgeFromSEL(TEdge *&edge)
{
  TEdge* const head = m_SortedEdges;
  if (head == 0) return false;
  edge = head;
  DeleteFromSEL(head);
  return true;
}
//------------------------------------------------------------------------------

void Clipper::CopyAELToSEL()
{
  TEdge* e = m_ActiveEdges;
  m_SortedEdges = e;
  while ( e )
  {
    e->PrevInSEL = e->PrevInAEL;
    e->NextInSEL = e->NextInAEL;
    e = e->NextInAEL;
  }
}
//------------------------------------------------------------------------------

// Allocates a Join linking the output points op1 and op2 (with off-point
// OffPt) and appends it to m_Joins; ClearJoins() deletes it again.
void Clipper::AddJoin(OutPt *op1, OutPt *op2, const IntPoint OffPt)
{
  Join* const record = new Join;
  record->OffPt = OffPt;
  record->OutPt1 = op1;
  record->OutPt2 = op2;
  m_Joins.push_back(record);
}
//------------------------------------------------------------------------------

void Clipper::ClearJoins()
{
  for (JoinList::size_type i = 0; i < m_Joins.size(); i++)
    delete m_Joins[i];
  m_Joins.resize(0);
}
//------------------------------------------------------------------------------

void Clipper::ClearGhostJoins()
{
  for (JoinList::size_type i = 0; i < m_GhostJoins.size(); i++)
    delete m_GhostJoins[i];
  m_GhostJoins.resize(0);
}
//------------------------------------------------------------------------------

// Allocates a one-sided ("ghost") Join for 'op' (OutPt2 stays 0) and appends
// it to m_GhostJoins; ClearGhostJoins() deletes it again.
void Clipper::AddGhostJoin(OutPt *op, const IntPoint OffPt)
{
  Join* const ghost = new Join;
  ghost->OffPt = OffPt;
  ghost->OutPt1 = op;
  ghost->OutPt2 = 0;
  m_GhostJoins.push_back(ghost);
}
//------------------------------------------------------------------------------

// Moves every local minimum that PopLocalMinima() yields for 'botY' into the
// AEL: inserts its left and/or right bound, sets their winding counts, starts
// output where the bound is contributing, schedules scanbeams for the edge
// tops, queues horizontal right bounds on the SEL, records joins with
// collinear neighbours, and finally intersects the right bound with every
// edge lying between the two bounds in the AEL.
void Clipper::InsertLocalMinimaIntoAEL(const cInt botY)
{
  const LocalMinimum *lm;
  while (PopLocalMinima(botY, lm))
  {
    TEdge* lb = lm->LeftBound;
    TEdge* rb = lm->RightBound;
    
    //Op1: the output point created for this minimum (0 if not contributing)
    OutPt *Op1 = 0;
    if (!lb)
    {
      //only a right bound ...
      //nb: don't insert LB into either AEL or SEL
      InsertEdgeIntoAEL(rb, 0);
      SetWindingCount(*rb);
      if (IsContributing(*rb))
        Op1 = AddOutPt(rb, rb->Bot); 
    } 
    else if (!rb)
    {
      //only a left bound ...
      InsertEdgeIntoAEL(lb, 0);
      SetWindingCount(*lb);
      if (IsContributing(*lb))
        Op1 = AddOutPt(lb, lb->Bot);
      InsertScanbeam(lb->Top.Y);
    }
    else
    {
      //both bounds: rb is inserted after lb and copies lb's winding counts
      InsertEdgeIntoAEL(lb, 0);
      InsertEdgeIntoAEL(rb, lb);
      SetWindingCount( *lb );
      rb->WindCnt = lb->WindCnt;
      rb->WindCnt2 = lb->WindCnt2;
      if (IsContributing(*lb))
        Op1 = AddLocalMinPoly(lb, rb, lb->Bot);      
      InsertScanbeam(lb->Top.Y);
    }

     if (rb)
     {
		 //a horizontal right bound is queued on the SEL and the scanbeam is
		 //scheduled for the top of its successor (if any) instead of its own
		 if (IsHorizontal(*rb))
		 {
			 AddEdgeToSEL(rb);
			 if (rb->NextInLML) 
				 InsertScanbeam(rb->NextInLML->Top.Y);
		 }
		 else InsertScanbeam( rb->Top.Y );
     }

    //everything below needs both bounds
    if (!lb || !rb) continue;

    //if any output polygons share an edge, they'll need joining later ...
    if (Op1 && IsHorizontal(*rb) && 
      m_GhostJoins.size() > 0 && (rb->WindDelta != 0))
    {
      for (JoinList::size_type i = 0; i < m_GhostJoins.size(); ++i)
      {
        Join* jr = m_GhostJoins[i];
        //if the horizontal Rb and a 'ghost' horizontal overlap, then convert
        //the 'ghost' join to a real join ready for later ...
        if (HorzSegmentsOverlap(jr->OutPt1->Pt.X, jr->OffPt.X, rb->Bot.X, rb->Top.X))
          AddJoin(jr->OutPt1, Op1, jr->OffPt);
      }
    }

    //join lb with its left AEL neighbour when both are contributing closed
    //edges, the neighbour currently sits at lb's bottom X, and they share a slope
    if (lb->OutIdx >= 0 && lb->PrevInAEL && 
      lb->PrevInAEL->Curr.X == lb->Bot.X &&
      lb->PrevInAEL->OutIdx >= 0 &&
      SlopesEqual(lb->PrevInAEL->Bot, lb->PrevInAEL->Top, lb->Curr, lb->Top, m_UseFullRange) &&
      (lb->WindDelta != 0) && (lb->PrevInAEL->WindDelta != 0))
    {
        OutPt *Op2 = AddOutPt(lb->PrevInAEL, lb->Bot);
        AddJoin(Op1, Op2, lb->Top);
    }

    //other edges lie between lb and rb in the AEL ...
    if(lb->NextInAEL != rb)
    {

      //same collinearity test for rb and its left AEL neighbour
      if (rb->OutIdx >= 0 && rb->PrevInAEL->OutIdx >= 0 &&
        SlopesEqual(rb->PrevInAEL->Curr, rb->PrevInAEL->Top, rb->Curr, rb->Top, m_UseFullRange) &&
        (rb->WindDelta != 0) && (rb->PrevInAEL->WindDelta != 0))
      {
          OutPt *Op2 = AddOutPt(rb->PrevInAEL, rb->Bot);
          AddJoin(Op1, Op2, rb->Top);
      }

      TEdge* e = lb->NextInAEL;
      if (e)
      {
        while( e != rb )
        {
          //nb: For calculating winding counts etc, IntersectEdges() assumes
          //that param1 will be to the Right of param2 ABOVE the intersection ...
          IntersectEdges(rb , e , lb->Curr); //order important here
          e = e->NextInAEL;
        }
      }
    }
    
  }
}
//------------------------------------------------------------------------------

// Unlinks 'e' from the SEL and clears its SEL pointers. An edge with no SEL
// neighbours that is not the list head is treated as already removed.
void Clipper::DeleteFromSEL(TEdge *e)
{
  TEdge* const before = e->PrevInSEL;
  TEdge* const after = e->NextInSEL;

  const bool alreadyDeleted = !before && !after && (e != m_SortedEdges);
  if (alreadyDeleted) return;

  if (before)
    before->NextInSEL = after;
  else
    m_SortedEdges = after; //e was the head
  if (after)
    after->PrevInSEL = before;

  e->NextInSEL = 0;
  e->PrevInSEL = 0;
}
//------------------------------------------------------------------------------

#ifdef use_xyz
// Fills in pt.Z when it is still 0 and a ZFill callback is installed: the Z
// of the first matching end point of e1/e2 is copied, otherwise the callback
// is invoked with the four end points and pt.
void Clipper::SetZ(IntPoint& pt, TEdge& e1, TEdge& e2)
{
  if (pt.Z != 0 || !m_ZFill) return;

  if (pt == e1.Bot) { pt.Z = e1.Bot.Z; return; }
  if (pt == e1.Top) { pt.Z = e1.Top.Z; return; }
  if (pt == e2.Bot) { pt.Z = e2.Bot.Z; return; }
  if (pt == e2.Top) { pt.Z = e2.Top.Z; return; }

  (*m_ZFill)(e1.Bot, e1.Top, e2.Bot, e2.Top, pt);
}
//------------------------------------------------------------------------------
#endif

// Processes the intersection of the AEL edges e1 and e2 at point Pt: updates
// their winding counts and, depending on which of them currently contribute
// to output (OutIdx >= 0), adds output points, closes a polygon at a local
// maximum, opens one at a local minimum, or swaps the edges' sides and
// output indexes.
// The winding-count update assumes e1 lies to the right of e2 above Pt.
void Clipper::IntersectEdges(TEdge *e1, TEdge *e2, IntPoint &Pt)
{
  //snapshot the contributing state before anything below changes OutIdx
  bool e1Contributing = ( e1->OutIdx >= 0 );
  bool e2Contributing = ( e2->OutIdx >= 0 );

#ifdef use_xyz
        SetZ(Pt, *e1, *e2);
#endif

#ifdef use_lines
  //if either edge is on an OPEN path ...
  if (e1->WindDelta == 0 || e2->WindDelta == 0)
  {
    //ignore intersections where BOTH edges are on open paths ...
	if (e1->WindDelta == 0 && e2->WindDelta == 0) return;

    //if intersecting a subj line with a subj poly ...
    else if (e1->PolyTyp == e2->PolyTyp && 
      e1->WindDelta != e2->WindDelta && m_ClipType == ctUnion)
    {
      if (e1->WindDelta == 0)
      {
        if (e2Contributing)
        {
          AddOutPt(e1, Pt);
          if (e1Contributing) e1->OutIdx = Unassigned;
        }
      }
      else
      {
        if (e1Contributing)
        {
          AddOutPt(e2, Pt);
          if (e2Contributing) e2->OutIdx = Unassigned;
        }
      }
    }
    else if (e1->PolyTyp != e2->PolyTyp)
    {
      //toggle subj open path OutIdx on/off when Abs(clip.WndCnt) == 1 ...
      if ((e1->WindDelta == 0) && abs(e2->WindCnt) == 1 && 
        (m_ClipType != ctUnion || e2->WindCnt2 == 0))
      {
        AddOutPt(e1, Pt);
        if (e1Contributing) e1->OutIdx = Unassigned;
      }
      else if ((e2->WindDelta == 0) && (abs(e1->WindCnt) == 1) && 
        (m_ClipType != ctUnion || e1->WindCnt2 == 0))
      {
        AddOutPt(e2, Pt);
        if (e2Contributing) e2->OutIdx = Unassigned;
      }
    }
    //open-path intersections never reach the closed-path logic below
    return;
  }
#endif

  //update winding counts...
  //assumes that e1 will be to the Right of e2 ABOVE the intersection
  if ( e1->PolyTyp == e2->PolyTyp )
  {
    //same polytype: the crossing changes each edge's own WindCnt
    if ( IsEvenOddFillType( *e1) )
    {
      int oldE1WindCnt = e1->WindCnt;
      e1->WindCnt = e2->WindCnt;
      e2->WindCnt = oldE1WindCnt;
    } else
    {
      //a count that would land on zero is negated instead
      if (e1->WindCnt + e2->WindDelta == 0 ) e1->WindCnt = -e1->WindCnt;
      else e1->WindCnt += e2->WindDelta;
      if ( e2->WindCnt - e1->WindDelta == 0 ) e2->WindCnt = -e2->WindCnt;
      else e2->WindCnt -= e1->WindDelta;
    }
  } else
  {
    //different polytypes: the crossing changes each edge's WindCnt2
    if (!IsEvenOddFillType(*e2)) e1->WindCnt2 += e2->WindDelta;
    else e1->WindCnt2 = ( e1->WindCnt2 == 0 ) ? 1 : 0;
    if (!IsEvenOddFillType(*e1)) e2->WindCnt2 -= e1->WindDelta;
    else e2->WindCnt2 = ( e2->WindCnt2 == 0 ) ? 1 : 0;
  }

  //for each edge: FillType is the rule of its own polytype, FillType2 the
  //rule of the other polytype
  PolyFillType e1FillType, e2FillType, e1FillType2, e2FillType2;
  if (e1->PolyTyp == ptSubject)
  {
    e1FillType = m_SubjFillType;
    e1FillType2 = m_ClipFillType;
  } else
  {
    e1FillType = m_ClipFillType;
    e1FillType2 = m_SubjFillType;
  }
  if (e2->PolyTyp == ptSubject)
  {
    e2FillType = m_SubjFillType;
    e2FillType2 = m_ClipFillType;
  } else
  {
    e2FillType = m_ClipFillType;
    e2FillType2 = m_SubjFillType;
  }

  //normalise the winding counts by fill rule: as-is for Positive, negated
  //for Negative, absolute value otherwise
  cInt e1Wc, e2Wc;
  switch (e1FillType)
  {
    case pftPositive: e1Wc = e1->WindCnt; break;
    case pftNegative: e1Wc = -e1->WindCnt; break;
    default: e1Wc = Abs(e1->WindCnt);
  }
  switch(e2FillType)
  {
    case pftPositive: e2Wc = e2->WindCnt; break;
    case pftNegative: e2Wc = -e2->WindCnt; break;
    default: e2Wc = Abs(e2->WindCnt);
  }

  if ( e1Contributing && e2Contributing )
  {
    //both edges are producing output ...
    if ((e1Wc != 0 && e1Wc != 1) || (e2Wc != 0 && e2Wc != 1) ||
      (e1->PolyTyp != e2->PolyTyp && m_ClipType != ctXor) )
    {
      AddLocalMaxPoly(e1, e2, Pt); 
    }
    else
    {
      AddOutPt(e1, Pt);
      AddOutPt(e2, Pt);
      SwapSides( *e1 , *e2 );
      SwapPolyIndexes( *e1 , *e2 );
    }
  }
  else if ( e1Contributing )
  {
    //only e1 is producing output ...
    if (e2Wc == 0 || e2Wc == 1) 
    {
      AddOutPt(e1, Pt);
      SwapSides(*e1, *e2);
      SwapPolyIndexes(*e1, *e2);
    }
  }
  else if ( e2Contributing )
  {
    //only e2 is producing output ...
    if (e1Wc == 0 || e1Wc == 1) 
    {
      AddOutPt(e2, Pt);
      SwapSides(*e1, *e2);
      SwapPolyIndexes(*e1, *e2);
    }
  } 
  else if ( (e1Wc == 0 || e1Wc == 1) && (e2Wc == 0 || e2Wc == 1))
  {
    //neither edge is currently contributing ...

    //normalise WindCnt2 by the other polytype's fill rule, as above
    cInt e1Wc2, e2Wc2;
    switch (e1FillType2)
    {
      case pftPositive: e1Wc2 = e1->WindCnt2; break;
      case pftNegative : e1Wc2 = -e1->WindCnt2; break;
      default: e1Wc2 = Abs(e1->WindCnt2);
    }
    switch (e2FillType2)
    {
      case pftPositive: e2Wc2 = e2->WindCnt2; break;
      case pftNegative: e2Wc2 = -e2->WindCnt2; break;
      default: e2Wc2 = Abs(e2->WindCnt2);
    }

    if (e1->PolyTyp != e2->PolyTyp)
    {
      AddLocalMinPoly(e1, e2, Pt);
    }
    else if (e1Wc == 1 && e2Wc == 1)
      //same polytype: whether a new polygon starts here depends on the clip type
      switch( m_ClipType ) {
        case ctIntersection:
          if (e1Wc2 > 0 && e2Wc2 > 0)
            AddLocalMinPoly(e1, e2, Pt);
          break;
        case ctUnion:
          if ( e1Wc2 <= 0 && e2Wc2 <= 0 )
            AddLocalMinPoly(e1, e2, Pt);
          break;
        case ctDifference:
          if (((e1->PolyTyp == ptClip) && (e1Wc2 > 0) && (e2Wc2 > 0)) ||
              ((e1->PolyTyp == ptSubject) && (e1Wc2 <= 0) && (e2Wc2 <= 0)))
                AddLocalMinPoly(e1, e2, Pt);
          break;
        case ctXor:
          AddLocalMinPoly(e1, e2, Pt);
      }
    else
      SwapSides( *e1, *e2 );
  }
}
//------------------------------------------------------------------------------

// Sets outrec->FirstLeft and outrec->IsHole from the contributing closed
// edges to the left of 'e' in the AEL. Edges are paired off by OutIdx while
// walking leftwards; if one is left unpaired, its output record becomes
// FirstLeft and IsHole is the opposite of that record's IsHole. Otherwise the
// record is an outer polygon with no FirstLeft.
void Clipper::SetHoleState(TEdge *e, OutRec *outrec)
{
  TEdge* unpaired = 0;
  for (TEdge* cur = e->PrevInAEL; cur; cur = cur->PrevInAEL)
  {
    //only contributing edges of closed paths take part
    if (cur->OutIdx < 0 || cur->WindDelta == 0) continue;

    if (!unpaired)
      unpaired = cur;
    else if (unpaired->OutIdx == cur->OutIdx)
      unpaired = 0;
  }

  if (unpaired)
  {
    outrec->FirstLeft = m_PolyOuts[unpaired->OutIdx];
    outrec->IsHole = !outrec->FirstLeft->IsHole;
  }
  else
  {
    outrec->FirstLeft = 0;
    outrec->IsHole = false;
  }
}
//------------------------------------------------------------------------------

// Chooses between two output records by comparing their bottom points
// (computed and cached in BottomPt on demand): larger Y wins, then smaller X;
// on a tie a single-point record loses, and FirstIsBottomPt() settles what
// remains.
OutRec* GetLowermostRec(OutRec *outRec1, OutRec *outRec2)
{
  //work out which polygon fragment has the correct hole state ...
  if (!outRec1->BottomPt)
    outRec1->BottomPt = GetBottomPt(outRec1->Pts);
  if (!outRec2->BottomPt)
    outRec2->BottomPt = GetBottomPt(outRec2->Pts);

  OutPt* const bottom1 = outRec1->BottomPt;
  OutPt* const bottom2 = outRec2->BottomPt;

  if (bottom1->Pt.Y != bottom2->Pt.Y)
    return (bottom1->Pt.Y > bottom2->Pt.Y) ? outRec1 : outRec2;
  if (bottom1->Pt.X != bottom2->Pt.X)
    return (bottom1->Pt.X < bottom2->Pt.X) ? outRec1 : outRec2;

  //identical bottom coordinates ...
  if (bottom1->Next == bottom1) return outRec2;
  if (bottom2->Next == bottom2) return outRec1;
  return FirstIsBottomPt(bottom1, bottom2) ? outRec1 : outRec2;
}
//------------------------------------------------------------------------------

// Returns true when outRec2 is found on outRec1's FirstLeft chain
// (outRec1 itself is not tested, only its ancestors).
bool OutRec1RightOfOutRec2(OutRec* outRec1, OutRec* outRec2)
{
  for (OutRec* owner = outRec1->FirstLeft; ; owner = owner->FirstLeft)
  {
    if (owner == outRec2) return true;
    if (!owner) return false; //ran off the end of the chain
  }
}
//------------------------------------------------------------------------------

// Resolves 'Idx' to its current output record by following the Idx
// redirections until a record is reached that sits at its own index.
OutRec* Clipper::GetOutRec(int Idx)
{
  OutRec* rec = m_PolyOuts[Idx];
  for (;;)
  {
    OutRec* const target = m_PolyOuts[rec->Idx];
    if (target == rec) return rec;
    rec = target;
  }
}
//------------------------------------------------------------------------------

// Merges the output polygon of e2 into the output polygon of e1.
// The point ring of e2's record is spliced onto e1's ring (reversed first
// when both edges are on the same side), e2's record is emptied and
// redirected to e1's record, both edges stop contributing, and the AEL edge
// that still carries e2's old index is re-pointed at the surviving record.
void Clipper::AppendPolygon(TEdge *e1, TEdge *e2)
{
  //get the start and ends of both output polygons ...
  OutRec *outRec1 = m_PolyOuts[e1->OutIdx];
  OutRec *outRec2 = m_PolyOuts[e2->OutIdx];

  //decide which record's hole state (IsHole / FirstLeft) the merged polygon
  //keeps: the ancestor if one record is on the other's FirstLeft chain,
  //otherwise whichever GetLowermostRec() picks
  OutRec *holeStateRec;
  if (OutRec1RightOfOutRec2(outRec1, outRec2))
    holeStateRec = outRec2;
  else if (OutRec1RightOfOutRec2(outRec2, outRec1))
    holeStateRec = outRec1;
  else 
    holeStateRec = GetLowermostRec(outRec1, outRec2);

  //get the start and ends of both output polygons and
  //join e2 poly onto e1 poly and delete pointers to e2 ...

  //Pts is the left-most point of a ring and Pts->Prev the right-most
  OutPt* p1_lft = outRec1->Pts;
  OutPt* p1_rt = p1_lft->Prev;
  OutPt* p2_lft = outRec2->Pts;
  OutPt* p2_rt = p2_lft->Prev;

  //join e2 poly onto e1 poly and delete pointers to e2 ...
  //(in the sketches below a b c are poly1's points and x y z poly2's)
  if(  e1->Side == esLeft )
  {
    if(  e2->Side == esLeft )
    {
      //z y x a b c
      ReversePolyPtLinks(p2_lft);
      p2_lft->Next = p1_lft;
      p1_lft->Prev = p2_lft;
      p1_rt->Next = p2_rt;
      p2_rt->Prev = p1_rt;
      outRec1->Pts = p2_rt;
    } else
    {
      //x y z a b c
      p2_rt->Next = p1_lft;
      p1_lft->Prev = p2_rt;
      p2_lft->Prev = p1_rt;
      p1_rt->Next = p2_lft;
      outRec1->Pts = p2_lft;
    }
  } else
  {
    if(  e2->Side == esRight )
    {
      //a b c z y x
      ReversePolyPtLinks(p2_lft);
      p1_rt->Next = p2_rt;
      p2_rt->Prev = p1_rt;
      p2_lft->Next = p1_lft;
      p1_lft->Prev = p2_lft;
    } else
    {
      //a b c x y z
      p1_rt->Next = p2_lft;
      p2_lft->Prev = p1_rt;
      p1_lft->Prev = p2_rt;
      p2_rt->Next = p1_lft;
    }
  }

  //the cached bottom point is stale now that the ring has changed
  outRec1->BottomPt = 0;
  if (holeStateRec == outRec2)
  {
    //adopt outRec2's hole state, but never make outRec1 its own FirstLeft
    if (outRec2->FirstLeft != outRec1)
      outRec1->FirstLeft = outRec2->FirstLeft;
    outRec1->IsHole = outRec2->IsHole;
  }
  //outRec2 no longer owns any points; it now just refers to outRec1
  outRec2->Pts = 0;
  outRec2->BottomPt = 0;
  outRec2->FirstLeft = outRec1;

  int OKIdx = e1->OutIdx;
  int ObsoleteIdx = e2->OutIdx;

  e1->OutIdx = Unassigned; //nb: safe because we only get here via AddLocalMaxPoly
  e2->OutIdx = Unassigned;

  //re-point the first AEL edge still carrying the obsolete index; it takes
  //over e1's side as well
  TEdge* e = m_ActiveEdges;
  while( e )
  {
    if( e->OutIdx == ObsoleteIdx )
    {
      e->OutIdx = OKIdx;
      e->Side = e1->Side;
      break;
    }
    e = e->NextInAEL;
  }

  //redirect the emptied record so GetOutRec() resolves it to outRec1
  outRec2->Idx = outRec1->Idx;
}
//------------------------------------------------------------------------------

// Adds 'pt' to the output polygon of edge 'e' and returns its OutPt.
// A non-contributing edge (OutIdx < 0) gets a fresh output record holding a
// single point. Otherwise the point goes to the front of the ring for a
// left-side edge and to the back for a right-side edge, unless that end
// already holds the same point, in which case the existing OutPt is returned.
OutPt* Clipper::AddOutPt(TEdge *e, const IntPoint &pt)
{
  if (e->OutIdx >= 0)
  {
    OutRec* const rec = m_PolyOuts[e->OutIdx];
    //rec->Pts is the 'Left-most' point & rec->Pts->Prev is the 'Right-most'
    OutPt* const head = rec->Pts;
    OutPt* const tail = head->Prev;
    const bool atFront = (e->Side == esLeft);

    //don't duplicate the point already sitting at the insertion end
    if (atFront)
    {
      if (pt == head->Pt) return head;
    }
    else if (pt == tail->Pt) return tail;

    //link the new point between tail and head of the circular list
    OutPt* const inserted = new OutPt;
    inserted->Idx = rec->Idx;
    inserted->Pt = pt;
    inserted->Next = head;
    inserted->Prev = tail;
    tail->Next = inserted;
    head->Prev = inserted;
    if (atFront) rec->Pts = inserted;
    return inserted;
  }

  //first point for this edge: start a new output record
  OutRec* const rec = CreateOutRec();
  rec->IsOpen = (e->WindDelta == 0);
  OutPt* const first = new OutPt;
  rec->Pts = first;
  first->Idx = rec->Idx;
  first->Pt = pt;
  first->Next = first;
  first->Prev = first;
  if (!rec->IsOpen)
    SetHoleState(e, rec);
  e->OutIdx = rec->Idx;
  return first;
}
//------------------------------------------------------------------------------

// Returns the end of e's output ring that 'e' adds to: the first point for
// a left-side edge, the last point (Pts->Prev) otherwise.
OutPt* Clipper::GetLastOutPt(TEdge *e)
{
	OutPt* const head = m_PolyOuts[e->OutIdx]->Pts;
	return (e->Side == esLeft) ? head : head->Prev;
}
//------------------------------------------------------------------------------

void Clipper::ProcessHorizontals()
{
  TEdge* horzEdge;
  while (PopEdgeFromSEL(horzEdge))
    ProcessHorizontal(horzEdge);
}
//------------------------------------------------------------------------------

// True when 'e' is non-null and neither of its ring neighbours (Prev, Next)
// has 'e' as its NextInLML.
inline bool IsMinima(TEdge *e)
{
  if (!e) return false;
  if (e->Prev->NextInLML == e) return false;
  return e->Next->NextInLML != e;
}
//------------------------------------------------------------------------------

// True when 'e' is non-null, tops out at 'Y' and has no NextInLML.
inline bool IsMaxima(TEdge *e, const cInt Y)
{
  if (!e) return false;
  if (e->Top.Y != Y) return false;
  return e->NextInLML == 0;
}
//------------------------------------------------------------------------------

// True when 'e' tops out at 'Y' and has a NextInLML.
// Unlike IsMaxima(), 'e' is dereferenced without a null check.
inline bool IsIntermediate(TEdge *e, const cInt Y)
{
  if (e->Top.Y != Y) return false;
  return e->NextInLML != 0;
}
//------------------------------------------------------------------------------

// Returns the ring neighbour of 'e' (Next is tried before Prev) that has the
// same Top point and no NextInLML, or 0 when neither neighbour qualifies.
TEdge *GetMaximaPair(TEdge *e)
{
  TEdge* const after = e->Next;
  if ((after->Top == e->Top) && !after->NextInLML)
    return after;

  TEdge* const before = e->Prev;
  if ((before->Top == e->Top) && !before->NextInLML)
    return before;

  return 0;
}
//------------------------------------------------------------------------------

// As GetMaximaPair(), but yields 0 when the pair is flagged Skip or is not
// in the AEL (equal NextInAEL and PrevInAEL), unless it is horizontal.
TEdge *GetMaximaPairEx(TEdge *e)
{
  TEdge* const pair = GetMaximaPair(e);
  if (!pair) return 0;
  if (pair->OutIdx == Skip) return 0;
  if (pair->NextInAEL == pair->PrevInAEL && !IsHorizontal(*pair)) return 0;
  return pair;
}
//------------------------------------------------------------------------------

// Exchanges the positions of Edge1 and Edge2 in the doubly linked SEL and
// updates the list head (m_SortedEdges) if one of them ends up first.
// Does nothing when either edge has neither a next nor a previous SEL link.
void Clipper::SwapPositionsInSEL(TEdge *Edge1, TEdge *Edge2)
{
  if(  !( Edge1->NextInSEL ) &&  !( Edge1->PrevInSEL ) ) return;
  if(  !( Edge2->NextInSEL ) &&  !( Edge2->PrevInSEL ) ) return;

  if(  Edge1->NextInSEL == Edge2 )
  {
    //adjacent, Edge1 immediately before Edge2 ...
    TEdge* Next = Edge2->NextInSEL;
    if( Next ) Next->PrevInSEL = Edge1;
    TEdge* Prev = Edge1->PrevInSEL;
    if( Prev ) Prev->NextInSEL = Edge2;
    Edge2->PrevInSEL = Prev;
    Edge2->NextInSEL = Edge1;
    Edge1->PrevInSEL = Edge2;
    Edge1->NextInSEL = Next;
  }
  else if(  Edge2->NextInSEL == Edge1 )
  {
    //adjacent, Edge2 immediately before Edge1 ...
    TEdge* Next = Edge1->NextInSEL;
    if( Next ) Next->PrevInSEL = Edge2;
    TEdge* Prev = Edge2->PrevInSEL;
    if( Prev ) Prev->NextInSEL = Edge1;
    Edge1->PrevInSEL = Prev;
    Edge1->NextInSEL = Edge2;
    Edge2->PrevInSEL = Edge1;
    Edge2->NextInSEL = Next;
  }
  else
  {
    //not adjacent: exchange both edges' links and fix up their neighbours
    TEdge* Next = Edge1->NextInSEL;
    TEdge* Prev = Edge1->PrevInSEL;
    Edge1->NextInSEL = Edge2->NextInSEL;
    if( Edge1->NextInSEL ) Edge1->NextInSEL->PrevInSEL = Edge1;
    Edge1->PrevInSEL = Edge2->PrevInSEL;
    if( Edge1->PrevInSEL ) Edge1->PrevInSEL->NextInSEL = Edge1;
    Edge2->NextInSEL = Next;
    if( Edge2->NextInSEL ) Edge2->NextInSEL->PrevInSEL = Edge2;
    Edge2->PrevInSEL = Prev;
    if( Edge2->PrevInSEL ) Edge2->PrevInSEL->NextInSEL = Edge2;
  }

  //whichever edge now has no predecessor is the new head of the SEL
  if( !Edge1->PrevInSEL ) m_SortedEdges = Edge1;
  else if( !Edge2->PrevInSEL ) m_SortedEdges = Edge2;
}
//------------------------------------------------------------------------------

// Steps from 'e' to its AEL neighbour in the travel direction: NextInAEL
// for dLeftToRight, PrevInAEL otherwise.
TEdge* GetNextInAEL(TEdge *e, Direction dir)
{
  if (dir == dLeftToRight)
    return e->NextInAEL;
  return e->PrevInAEL;
}
//------------------------------------------------------------------------------

// Reports the travel direction of a horizontal edge (from Bot to Top) and
// its X extent: Left receives the smaller and Right the larger of Bot.X and
// Top.X. Equal X values are reported as dRightToLeft.
void GetHorzDirection(TEdge& HorzEdge, Direction& Dir, cInt& Left, cInt& Right)
{
  const bool headsRight = (HorzEdge.Bot.X < HorzEdge.Top.X);
  Left = headsRight ? HorzEdge.Bot.X : HorzEdge.Top.X;
  Right = headsRight ? HorzEdge.Top.X : HorzEdge.Bot.X;
  Dir = headsRight ? dLeftToRight : dRightToLeft;
}
//------------------------------------------------------------------------

/*******************************************************************************
* Notes: Horizontal edges (HEs) at scanline intersections (ie at the Top or    *
* Bottom of a scanbeam) are processed as if layered. The order in which HEs    *
* are processed doesn't matter. HEs intersect with other HE Bot.Xs only [#]    *
* (or they could intersect with Top.Xs only, ie EITHER Bot.Xs OR Top.Xs),      *
* and with other non-horizontal edges [*]. Once these intersections are        *
* processed, intermediate HEs then 'promote' the Edge above (NextInLML) into   *
* the AEL. These 'promoted' edges may in turn intersect [%] with other HEs.    *
*******************************************************************************/

//Processes the horizontal edge 'horzEdge' together with any consecutive
//horizontal edges that follow it through NextInLML.
//For each horizontal in the run, the AEL is walked in the horizontal's
//direction of travel; every edge met within the horizontal's X range is
//intersected with it and the two swap places in the AEL. When the run ends at
//a maxima and its maxima pair is reached, the local maximum is closed and both
//edges are removed from the AEL. Otherwise the edge is either advanced to its
//NextInLML (UpdateEdgeIntoAEL) or, if there is none, removed from the AEL.
//Joins are recorded (AddJoin / AddGhostJoin) against overlapping horizontals
//currently in the SEL.
void Clipper::ProcessHorizontal(TEdge *horzEdge)
{
  Direction dir;
  cInt horzLeft, horzRight;
  //a WindDelta of zero is treated here as marking an 'open' edge
  bool IsOpen = (horzEdge->WindDelta == 0);

  GetHorzDirection(*horzEdge, dir, horzLeft, horzRight);

  //find the last of the consecutive horizontals; only when that last edge has
  //no NextInLML is a maxima pair looked up ...
  TEdge* eLastHorz = horzEdge, *eMaxPair = 0;
  while (eLastHorz->NextInLML && IsHorizontal(*eLastHorz->NextInLML)) 
    eLastHorz = eLastHorz->NextInLML;
  if (!eLastHorz->NextInLML)
    eMaxPair = GetMaximaPair(eLastHorz);

  //nb: maxIt / maxRit are only initialised (and only used below) when
  //m_Maxima is not empty.
  MaximaList::const_iterator maxIt;
  MaximaList::const_reverse_iterator maxRit;
  if (m_Maxima.size() > 0)
  {
      //get the first maxima in range (X) ...
      if (dir == dLeftToRight)
      {
          //skip maxima at or left of the start, and discard everything if the
          //first remaining one is at or beyond the end of the run
          maxIt = m_Maxima.begin();
          while (maxIt != m_Maxima.end() && *maxIt <= horzEdge->Bot.X) maxIt++;
          if (maxIt != m_Maxima.end() && *maxIt >= eLastHorz->Top.X)
              maxIt = m_Maxima.end();
      }
      else
      {
          //mirror image of the above, walking m_Maxima in reverse
          maxRit = m_Maxima.rbegin();
          while (maxRit != m_Maxima.rend() && *maxRit > horzEdge->Bot.X) maxRit++;
          if (maxRit != m_Maxima.rend() && *maxRit <= eLastHorz->Top.X)
              maxRit = m_Maxima.rend();
      }
  }

  //op1 remembers the most recent output point added inside the AEL walk; it
  //stays null if no such point was added.
  OutPt* op1 = 0;

  for (;;) //loop through consec. horizontal edges
  {
		  
    bool IsLastHorz = (horzEdge == eLastHorz);
    TEdge* e = GetNextInAEL(horzEdge, dir);
    while(e)
    {

        //this code block inserts extra coords into horizontal edges (in output
        //polygons) wherever maxima touch these horizontal edges. This helps
        //'simplifying' polygons (ie if the Simplify property is set).
        if (m_Maxima.size() > 0)
        {
            if (dir == dLeftToRight)
            {
                while (maxIt != m_Maxima.end() && *maxIt < e->Curr.X) 
                {
                  if (horzEdge->OutIdx >= 0 && !IsOpen)
                    AddOutPt(horzEdge, IntPoint(*maxIt, horzEdge->Bot.Y));
                  maxIt++;
                }
            }
            else
            {
                while (maxRit != m_Maxima.rend() && *maxRit > e->Curr.X)
                {
                  if (horzEdge->OutIdx >= 0 && !IsOpen)
                    AddOutPt(horzEdge, IntPoint(*maxRit, horzEdge->Bot.Y));
                  maxRit++;
                }
            }
        };

        //stop once 'e' lies beyond the horizontal's extent ...
        if ((dir == dLeftToRight && e->Curr.X > horzRight) ||
			(dir == dRightToLeft && e->Curr.X < horzLeft)) break;

		//Also break if we've got to the end of an intermediate horizontal edge ...
		//nb: Smaller Dx's are to the right of larger Dx's ABOVE the horizontal.
		if (e->Curr.X == horzEdge->Top.X && horzEdge->NextInLML && 
			e->Dx < horzEdge->NextInLML->Dx) break;

    if (horzEdge->OutIdx >= 0 && !IsOpen)  //note: may be done multiple times
		{
#ifdef use_xyz
			if (dir == dLeftToRight) SetZ(e->Curr, *horzEdge, *e);
			else SetZ(e->Curr, *e, *horzEdge);
#endif      
			op1 = AddOutPt(horzEdge, e->Curr);
			//record a join with every contributing horizontal in the SEL whose
			//X range overlaps this one ...
			TEdge* eNextHorz = m_SortedEdges;
			while (eNextHorz)
			{
				if (eNextHorz->OutIdx >= 0 &&
					HorzSegmentsOverlap(horzEdge->Bot.X,
					horzEdge->Top.X, eNextHorz->Bot.X, eNextHorz->Top.X))
				{
                    OutPt* op2 = GetLastOutPt(eNextHorz);
                    AddJoin(op2, op1, eNextHorz->Top);
				}
				eNextHorz = eNextHorz->NextInSEL;
			}
			AddGhostJoin(op1, horzEdge->Bot);
		}
		
		//OK, so far we're still in range of the horizontal Edge  but make sure
        //we're at the last of consec. horizontals when matching with eMaxPair
        if(e == eMaxPair && IsLastHorz)
        {
          if (horzEdge->OutIdx >= 0)
            AddLocalMaxPoly(horzEdge, eMaxPair, horzEdge->Top);
          DeleteFromAEL(horzEdge);
          DeleteFromAEL(eMaxPair);
          return;
        }
        
		//the left-hand edge is always passed first to IntersectEdges ...
		if(dir == dLeftToRight)
        {
          IntPoint Pt = IntPoint(e->Curr.X, horzEdge->Curr.Y);
          IntersectEdges(horzEdge, e, Pt);
        }
        else
        {
          IntPoint Pt = IntPoint(e->Curr.X, horzEdge->Curr.Y);
          IntersectEdges( e, horzEdge, Pt);
        }
        //fetch the following edge BEFORE swapping, since the swap changes
        //e's AEL neighbours
        TEdge* eNext = GetNextInAEL(e, dir);
        SwapPositionsInAEL( horzEdge, e );
        e = eNext;
    } //end while(e)

	//Break out of loop if HorzEdge.NextInLML is not also horizontal ...
	if (!horzEdge->NextInLML || !IsHorizontal(*horzEdge->NextInLML)) break;

	//advance to the next horizontal in the run and recompute its direction
	UpdateEdgeIntoAEL(horzEdge);
    if (horzEdge->OutIdx >= 0) AddOutPt(horzEdge, horzEdge->Bot);
    GetHorzDirection(*horzEdge, dir, horzLeft, horzRight);

  } //end for (;;)

  //no output point was added during the AEL walk, so record joins against
  //overlapping SEL horizontals using the edge's last output point instead ...
  if (horzEdge->OutIdx >= 0 && !op1)
  {
      op1 = GetLastOutPt(horzEdge);
      TEdge* eNextHorz = m_SortedEdges;
      while (eNextHorz)
      {
          if (eNextHorz->OutIdx >= 0 &&
              HorzSegmentsOverlap(horzEdge->Bot.X,
              horzEdge->Top.X, eNextHorz->Bot.X, eNextHorz->Top.X))
          {
              OutPt* op2 = GetLastOutPt(eNextHorz);
              AddJoin(op2, op1, eNextHorz->Top);
          }
          eNextHorz = eNextHorz->NextInSEL;
      }
      AddGhostJoin(op1, horzEdge->Top);
  }

  if (horzEdge->NextInLML)
  {
    if(horzEdge->OutIdx >= 0)
    {
      op1 = AddOutPt( horzEdge, horzEdge->Top);
      UpdateEdgeIntoAEL(horzEdge);
      if (horzEdge->WindDelta == 0) return;
      //nb: HorzEdge is no longer horizontal here
      //if a contributing AEL neighbour currently sits at the new Bot and has
      //the same slope, add a matching point to it and record a join ...
      TEdge* ePrev = horzEdge->PrevInAEL;
      TEdge* eNext = horzEdge->NextInAEL;
      if (ePrev && ePrev->Curr.X == horzEdge->Bot.X &&
        ePrev->Curr.Y == horzEdge->Bot.Y && ePrev->WindDelta != 0 &&
        (ePrev->OutIdx >= 0 && ePrev->Curr.Y > ePrev->Top.Y &&
        SlopesEqual(*horzEdge, *ePrev, m_UseFullRange)))
      {
        OutPt* op2 = AddOutPt(ePrev, horzEdge->Bot);
        AddJoin(op1, op2, horzEdge->Top);
      }
      else if (eNext && eNext->Curr.X == horzEdge->Bot.X &&
        eNext->Curr.Y == horzEdge->Bot.Y && eNext->WindDelta != 0 &&
        eNext->OutIdx >= 0 && eNext->Curr.Y > eNext->Top.Y &&
        SlopesEqual(*horzEdge, *eNext, m_UseFullRange))
      {
        OutPt* op2 = AddOutPt(eNext, horzEdge->Bot);
        AddJoin(op1, op2, horzEdge->Top);
      }
    }
    else
      UpdateEdgeIntoAEL(horzEdge); 
  }
  else
  {
    //end of the bound: emit the final point (if contributing) and retire the edge
    if (horzEdge->OutIdx >= 0) AddOutPt(horzEdge, horzEdge->Top);
    DeleteFromAEL(horzEdge);
  }
}
//------------------------------------------------------------------------------

//Builds and then processes the list of edge intersections for the scanbeam
//whose top is at 'topY'.
//Returns true when there are no active edges, no intersections, or when all
//intersections were processed. Returns false when FixupIntersectionOrder()
//could not arrange the intersections so that each is between adjacent edges.
//Throws clipperException if anything throws while building or processing the
//list.
//On every exit path m_SortedEdges is left null and m_IntersectList is left
//empty, so no IntersectNode is leaked or left behind.
bool Clipper::ProcessIntersections(const cInt topY)
{
  if( !m_ActiveEdges ) return true;
  try {
    BuildIntersectList(topY);
    size_t IlSize = m_IntersectList.size();
    if (IlSize == 0) return true;
    if (IlSize == 1 || FixupIntersectionOrder()) ProcessIntersectList();
    else
    {
      //The intersections could not be ordered. Previously this path returned
      //with the heap-allocated nodes still in m_IntersectList and with
      //m_SortedEdges still pointing into the AEL; clean both up here just as
      //the exception path does.
      m_SortedEdges = 0;
      DisposeIntersectNodes();
      return false;
    }
  }
  catch(...) 
  {
    m_SortedEdges = 0;
    DisposeIntersectNodes();
    throw clipperException("ProcessIntersections error");
  }
  m_SortedEdges = 0;
  return true;
}
//------------------------------------------------------------------------------

void Clipper::DisposeIntersectNodes()
{
  for (size_t i = 0; i < m_IntersectList.size(); ++i )
    delete m_IntersectList[i];
  m_IntersectList.clear();
}
//------------------------------------------------------------------------------

//Fills m_IntersectList with one heap-allocated IntersectNode for every pair of
//active edges that change order between their current positions and 'topY'.
//The AEL is first mirrored into the SEL with each edge's Curr.X set to its X
//at topY; the SEL is then bubble-sorted on Curr.X and every swap performed by
//the sort is recorded as an intersection. The AEL links themselves are not
//modified here. m_SortedEdges is null on return.
void Clipper::BuildIntersectList(const cInt topY)
{
  if ( !m_ActiveEdges ) return;

  //prepare for sorting ...
  //(copy the AEL ordering into the SEL links and project each edge onto topY)
  TEdge* e = m_ActiveEdges;
  m_SortedEdges = e;
  while( e )
  {
    e->PrevInSEL = e->PrevInAEL;
    e->NextInSEL = e->NextInAEL;
    e->Curr.X = TopX( *e, topY );
    e = e->NextInAEL;
  }

  //bubblesort ...
  bool isModified;
  do
  {
    isModified = false;
    e = m_SortedEdges;
    while( e->NextInSEL )
    {
      TEdge *eNext = e->NextInSEL;
      IntPoint Pt;
      if(e->Curr.X > eNext->Curr.X)
      {
        //out of order at topY, so record where the two edges cross
        IntersectPoint(*e, *eNext, Pt);
        //an intersection with a Y below topY is replaced by e's position at topY
        if (Pt.Y < topY) Pt = IntPoint(TopX(*e, topY), topY);
        IntersectNode * newNode = new IntersectNode;
        newNode->Edge1 = e;
        newNode->Edge2 = eNext;
        newNode->Pt = Pt;
        m_IntersectList.push_back(newNode);

        //nb: 'e' is not advanced here; after the swap it is compared with its
        //new NextInSEL on the following iteration
        SwapPositionsInSEL(e, eNext);
        isModified = true;
      }
      else
        e = eNext;
    }
    //'e' is now the last edge in the SEL: detach it so that the next pass
    //covers one edge fewer; stop altogether when it is the only edge left
    if( e->PrevInSEL ) e->PrevInSEL->NextInSEL = 0;
    else break;
  }
  while ( isModified );
  m_SortedEdges = 0; //important
}
//------------------------------------------------------------------------------


void Clipper::ProcessIntersectList()
{
  for (size_t i = 0; i < m_IntersectList.size(); ++i)
  {
    IntersectNode* iNode = m_IntersectList[i];
    {
      IntersectEdges( iNode->Edge1, iNode->Edge2, iNode->Pt);
      SwapPositionsInAEL( iNode->Edge1 , iNode->Edge2 );
    }
    delete iNode;
  }
  m_IntersectList.clear();
}
//------------------------------------------------------------------------------

//Ordering predicate for intersect nodes: node1 sorts before node2 when its
//point has the larger Y value.
bool IntersectListSort(IntersectNode* node1, IntersectNode* node2)
{
  const bool firstHasLargerY = (node1->Pt.Y > node2->Pt.Y);
  return firstHasLargerY;
}
//------------------------------------------------------------------------------

//True when the node's two edges are immediate neighbours in the SEL
//(Edge2 directly follows or directly precedes Edge1).
inline bool EdgesAdjacent(const IntersectNode &inode)
{
  if (inode.Edge1->NextInSEL == inode.Edge2) return true;
  return inode.Edge1->PrevInSEL == inode.Edge2;
}
//------------------------------------------------------------------------------

bool Clipper::FixupIntersectionOrder()
{
  //pre-condition: intersections are sorted Bottom-most first.
  //Now it's crucial that intersections are made only between adjacent edges,
  //so to ensure this the order of intersections may need adjusting ...
  CopyAELToSEL();
  std::sort(m_IntersectList.begin(), m_IntersectList.end(), IntersectListSort);
  size_t cnt = m_IntersectList.size();
  for (size_t i = 0; i < cnt; ++i) 
  {
    if (!EdgesAdjacent(*m_IntersectList[i]))
    {
      size_t j = i + 1;
      while (j < cnt && !EdgesAdjacent(*m_IntersectList[j])) j++;
      if (j == cnt)  return false;
      std::swap(m_IntersectList[i], m_IntersectList[j]);
    }
    SwapPositionsInSEL(m_IntersectList[i]->Edge1, m_IntersectList[i]->Edge2);
  }
  return true;
}
//------------------------------------------------------------------------------

//Handles edge 'e' reaching its local maximum.
//Without a maxima pair (GetMaximaPairEx returns null) the edge simply emits
//its Top (if contributing) and leaves the AEL. Otherwise every edge lying
//between 'e' and its pair in the AEL is intersected with 'e' at e->Top, after
//which both edges are removed from the AEL, closing the output polygon when
//both are contributing. Throws clipperException("DoMaxima error") when the
//pair's output states are inconsistent and no other case applies.
void Clipper::DoMaxima(TEdge *e)
{
  TEdge* const partner = GetMaximaPairEx(e);
  if (partner == 0)
  {
    if (e->OutIdx >= 0)
      AddOutPt(e, e->Top);
    DeleteFromAEL(e);
    return;
  }

  //walk 'e' rightwards through the AEL until it sits next to its partner
  for (TEdge* between = e->NextInAEL;
       between && between != partner;
       between = e->NextInAEL)
  {
    IntersectEdges(e, between, e->Top);
    SwapPositionsInAEL(e, between);
  }

  if (e->OutIdx == Unassigned && partner->OutIdx == Unassigned)
  {
    //neither edge contributes to the output
    DeleteFromAEL(e);
    DeleteFromAEL(partner);
  }
  else if (e->OutIdx >= 0 && partner->OutIdx >= 0)
  {
    //both contribute (e->OutIdx >= 0 is already known in this branch)
    AddLocalMaxPoly(e, partner, e->Top);
    DeleteFromAEL(e);
    DeleteFromAEL(partner);
  }
#ifdef use_lines
  else if (e->WindDelta == 0)
  {
    if (e->OutIdx >= 0)
    {
      AddOutPt(e, e->Top);
      e->OutIdx = Unassigned;
    }
    DeleteFromAEL(e);

    if (partner->OutIdx >= 0)
    {
      AddOutPt(partner, e->Top);
      partner->OutIdx = Unassigned;
    }
    DeleteFromAEL(partner);
  }
#endif
  else
    throw clipperException("DoMaxima error");
}
//------------------------------------------------------------------------------

//Updates every edge in the AEL on reaching the top (topY) of the current
//scanbeam. The work is done in the numbered stages marked below:
//  1. edges at a local maximum are handled by DoMaxima();
//  2. edges whose next segment is horizontal are promoted and queued in the
//     SEL, all other edges just have Curr moved up to topY;
//  3. the queued horizontals are processed;
//  4. the remaining intermediate vertices are promoted, recording joins where
//     a neighbouring edge shares the same point and slope.
void Clipper::ProcessEdgesAtTopOfScanbeam(const cInt topY)
{
  TEdge* e = m_ActiveEdges;
  while( e )
  {
    //1. process maxima, treating them as if they're 'bent' horizontal edges,
    //   but exclude maxima with horizontal edges. nb: e can't be a horizontal.
    bool IsMaximaEdge = IsMaxima(e, topY);

    if(IsMaximaEdge)
    {
      TEdge* eMaxPair = GetMaximaPairEx(e);
      IsMaximaEdge = (!eMaxPair || !IsHorizontal(*eMaxPair));
    }

    if(IsMaximaEdge)
    {
      if (m_StrictSimple) m_Maxima.push_back(e->Top.X);
      //remember the predecessor: DoMaxima() removes 'e' from the AEL, so the
      //walk has to resume from whatever now follows ePrev (or from the head)
      TEdge* ePrev = e->PrevInAEL;
      DoMaxima(e);
      if( !ePrev ) e = m_ActiveEdges;
      else e = ePrev->NextInAEL;
    }
    else
    {
      //2. promote horizontal edges, otherwise update Curr.X and Curr.Y ...
      if (IsIntermediate(e, topY) && IsHorizontal(*e->NextInLML))
      {
        UpdateEdgeIntoAEL(e);
        if (e->OutIdx >= 0)
          AddOutPt(e, e->Bot);
        AddEdgeToSEL(e);
      } 
      else
      {
        e->Curr.X = TopX( *e, topY );
        e->Curr.Y = topY;
#ifdef use_xyz
		e->Curr.Z = topY == e->Top.Y ? e->Top.Z : (topY == e->Bot.Y ? e->Bot.Z : 0);
#endif
	  }

      //When StrictlySimple and 'e' is being touched by another edge, then
      //make sure both edges have a vertex here ...
      if (m_StrictSimple)
      {  
        TEdge* ePrev = e->PrevInAEL;
        if ((e->OutIdx >= 0) && (e->WindDelta != 0) && ePrev && (ePrev->OutIdx >= 0) &&
          (ePrev->Curr.X == e->Curr.X) && (ePrev->WindDelta != 0))
        {
          IntPoint pt = e->Curr;
#ifdef use_xyz
          SetZ(pt, *ePrev, *e);
#endif
          OutPt* op = AddOutPt(ePrev, pt);
          OutPt* op2 = AddOutPt(e, pt);
          AddJoin(op, op2, pt); //StrictlySimple (type-3) join
        }
      }

      e = e->NextInAEL;
    }
  }

  //3. Process horizontals at the Top of the scanbeam ...
  //(m_Maxima is sorted first and is emptied again once they are done)
  m_Maxima.sort();
  ProcessHorizontals();
  m_Maxima.clear();

  //4. Promote intermediate vertices ...
  e = m_ActiveEdges;
  while(e)
  {
    if(IsIntermediate(e, topY))
    {
      //'op' stays null when the edge is not contributing, which also disables
      //both join tests below
      OutPt* op = 0;
      if( e->OutIdx >= 0 ) 
        op = AddOutPt(e, e->Top);
      UpdateEdgeIntoAEL(e);

      //if output polygons share an edge, they'll need joining later ...
      TEdge* ePrev = e->PrevInAEL;
      TEdge* eNext = e->NextInAEL;
      if (ePrev && ePrev->Curr.X == e->Bot.X &&
        ePrev->Curr.Y == e->Bot.Y && op &&
        ePrev->OutIdx >= 0 && ePrev->Curr.Y > ePrev->Top.Y &&
        SlopesEqual(e->Curr, e->Top, ePrev->Curr, ePrev->Top, m_UseFullRange) &&
        (e->WindDelta != 0) && (ePrev->WindDelta != 0))
      {
        OutPt* op2 = AddOutPt(ePrev, e->Bot);
        AddJoin(op, op2, e->Top);
      }
      else if (eNext && eNext->Curr.X == e->Bot.X &&
        eNext->Curr.Y == e->Bot.Y && op &&
        eNext->OutIdx >= 0 && eNext->Curr.Y > eNext->Top.Y &&
        SlopesEqual(e->Curr, e->Top, eNext->Curr, eNext->Top, m_UseFullRange) &&
        (e->WindDelta != 0) && (eNext->WindDelta != 0))
      {
        OutPt* op2 = AddOutPt(eNext, e->Bot);
        AddJoin(op, op2, e->Top);
      }
    }
    e = e->NextInAEL;
  }
}
//------------------------------------------------------------------------------

//Removes consecutive duplicate points from the point ring of 'outrec'.
//If only a single point remains afterwards, the ring is disposed of and
//outrec.Pts is set to null.
void Clipper::FixupOutPolyline(OutRec &outrec)
{
  OutPt *cur = outrec.Pts;
  OutPt *stopAt = cur->Prev;
  while (cur != stopAt)
  {
    cur = cur->Next;
    OutPt *before = cur->Prev;
    if (cur->Pt == before->Pt)
    {
      //'cur' repeats its predecessor: unlink and free it, then continue
      //from the predecessor (which also becomes the stop marker if 'cur'
      //was the stop marker)
      if (cur == stopAt) stopAt = before;
      OutPt *after = cur->Next;
      before->Next = after;
      after->Prev = before;
      delete cur;
      cur = before;
    }
  }

  if (cur->Prev == cur)
  {
    DisposeOutPts(cur);
    outrec.Pts = 0;
  }
}
//------------------------------------------------------------------------------

//Cleans up the point ring of a closed output polygon: duplicate points are
//removed, as is the middle vertex of any three collinear points (unless
//collinear points are being preserved and the vertex really lies between its
//neighbours). If the ring shrinks to fewer than three points it is disposed
//of and outrec.Pts is set to null. outrec.BottomPt is always reset.
void Clipper::FixupOutPolygon(OutRec &outrec)
{
    outrec.BottomPt = 0;
    const bool keepCollinear = m_PreserveCollinear || m_StrictSimple;

    OutPt *cur = outrec.Pts;
    OutPt *firstKept = 0;  //first vertex accepted since the last removal
    while (true)
    {
        //one or two points left - not a polygon any more
        if (cur->Prev == cur || cur->Prev == cur->Next)
        {
            DisposeOutPts(cur);
            outrec.Pts = 0;
            return;
        }

        OutPt *before = cur->Prev;
        OutPt *after = cur->Next;

        //test for duplicate points and collinear edges ...
        bool removable = (cur->Pt == after->Pt) || (cur->Pt == before->Pt);
        if (!removable &&
            SlopesEqual(before->Pt, cur->Pt, after->Pt, m_UseFullRange))
        {
            removable = !keepCollinear ||
                !Pt2IsBetweenPt1AndPt3(before->Pt, cur->Pt, after->Pt);
        }

        if (removable)
        {
            //drop 'cur', step back one vertex and restart the full-circle check
            firstKept = 0;
            before->Next = after;
            after->Prev = before;
            delete cur;
            cur = before;
            continue;
        }

        //a complete lap without any removal means the ring is clean
        if (cur == firstKept) break;
        if (!firstKept) firstKept = cur;
        cur = after;
    }
    outrec.Pts = cur;
}
//------------------------------------------------------------------------------

//Returns the number of points in the circular list starting at 'Pts'
//(following Next links), or 0 when 'Pts' is null.
int PointCount(OutPt *Pts)
{
    if (!Pts) return 0;
    int total = 1;  //counts 'Pts' itself
    for (OutPt* cur = Pts->Next; cur != Pts; cur = cur->Next)
        ++total;
    return total;
}
//------------------------------------------------------------------------------

//Appends one Path to 'polys' for every output record that has points.
//Each path is read starting at Pts->Prev and following Prev links; records
//with fewer than two points are skipped.
void Clipper::BuildResult(Paths &polys)
{
  polys.reserve(m_PolyOuts.size());
  for (PolyOutList::size_type rec = 0; rec < m_PolyOuts.size(); ++rec)
  {
    OutPt* head = m_PolyOuts[rec]->Pts;
    if (!head) continue;

    OutPt* walker = head->Prev;
    const int total = PointCount(walker);
    if (total < 2) continue;

    Path contour;
    contour.reserve(total);
    for (int remaining = total; remaining > 0; --remaining)
    {
      contour.push_back(walker->Pt);
      walker = walker->Prev;
    }
    polys.push_back(contour);
  }
}
//------------------------------------------------------------------------------

//Rebuilds 'polytree' from the output records.
//Pass 1 creates a PolyNode (owned by the polytree) for every record with
//enough points - at least 2 when open, at least 3 when closed - and copies
//the record's points into it, starting at Pts->Prev and following Prev links.
//Pass 2 attaches every node to its parent: open paths and records without a
//usable FirstLeft node become children of the polytree itself, the others
//become children of their FirstLeft's node.
void Clipper::BuildResult2(PolyTree& polytree)
{
    polytree.Clear();
    polytree.AllNodes.reserve(m_PolyOuts.size());

    //add each output polygon/contour to polytree ...
    for (PolyOutList::size_type idx = 0; idx < m_PolyOuts.size(); ++idx)
    {
        OutRec* rec = m_PolyOuts[idx];
        const int total = PointCount(rec->Pts);
        if (rec->IsOpen)
        {
            if (total < 2) continue;
        }
        else if (total < 3) continue;

        FixHoleLinkage(*rec);

        //nb: polytree takes ownership of all the PolyNodes
        PolyNode* node = new PolyNode();
        polytree.AllNodes.push_back(node);
        rec->PolyNd = node;
        node->Parent = 0;
        node->Index = 0;
        node->Contour.reserve(total);

        OutPt* walker = rec->Pts->Prev;
        for (int remaining = total; remaining > 0; --remaining)
        {
            node->Contour.push_back(walker->Pt);
            walker = walker->Prev;
        }
    }

    //fixup PolyNode links etc ...
    polytree.Childs.reserve(m_PolyOuts.size());
    for (PolyOutList::size_type idx = 0; idx < m_PolyOuts.size(); ++idx)
    {
        OutRec* rec = m_PolyOuts[idx];
        if (!rec->PolyNd) continue;

        if (rec->IsOpen)
        {
            rec->PolyNd->m_IsOpen = true;
            polytree.AddChild(*rec->PolyNd);
            continue;
        }

        if (rec->FirstLeft && rec->FirstLeft->PolyNd)
            rec->FirstLeft->PolyNd->AddChild(*rec->PolyNd);
        else
            polytree.AddChild(*rec->PolyNd);
    }
}
//------------------------------------------------------------------------------

//Exchanges the contents (Edge1, Edge2 and Pt) of two intersect nodes, leaving
//the node objects themselves where they are.
void SwapIntersectNodes(IntersectNode &int1, IntersectNode &int2)
{
  std::swap(int1.Edge1, int2.Edge1);
  std::swap(int1.Edge2, int2.Edge2);
  std::swap(int1.Pt, int2.Pt);
}
//------------------------------------------------------------------------------

//Decides whether edge e2 belongs before (to the left of) edge e1.
//With different current X values the smaller X wins. With equal X values the
//edges are compared at the Y of whichever Top has the larger Y value (e1's
//Top on a tie).
inline bool E2InsertsBeforeE1(TEdge &e1, TEdge &e2)
{
  if (e2.Curr.X != e1.Curr.X)
    return e2.Curr.X < e1.Curr.X;

  if (e2.Top.Y > e1.Top.Y)
    return e2.Top.X < TopX(e1, e2.Top.Y);

  return e1.Top.X > TopX(e2, e1.Top.Y);
}
//------------------------------------------------------------------------------

bool GetOverlap(const cInt a1, const cInt a2, const cInt b1, const cInt b2, 
    cInt& Left, cInt& Right)
{
  if (a1 < a2)
  {
    if (b1 < b2) {Left = std::max(a1,b1); Right = std::min(a2,b2);}
    else {Left = std::max(a1,b2); Right = std::min(a2,b1);}
  } 
  else
  {
    if (b1 < b2) {Left = std::max(a2,b1); Right = std::min(a1,b2);}
    else {Left = std::max(a2,b2); Right = std::min(a1,b1);}
  }
  return Left < Right;
}
//------------------------------------------------------------------------------

//Stamps outrec.Idx onto every point in outrec's circular point list.
//nb: outrec.Pts must not be null.
inline void UpdateOutPtIdxs(OutRec& outrec)
{  
  OutPt* const first = outrec.Pts;
  OutPt* cur = first;
  do
  {
    cur->Idx = outrec.Idx;
    cur = cur->Prev;
  }
  while (cur != first);
}
//------------------------------------------------------------------------------

//Inserts 'edge' into the AEL at the position chosen by E2InsertsBeforeE1().
//When 'startEdge' is given, the search for the insertion point begins after
//it; otherwise it begins at the head of the AEL (and the edge may become the
//new head).
void Clipper::InsertEdgeIntoAEL(TEdge *edge, TEdge* startEdge)
{
  //empty list: the edge becomes the only entry
  if (!m_ActiveEdges)
  {
    edge->PrevInAEL = 0;
    edge->NextInAEL = 0;
    m_ActiveEdges = edge;
    return;
  }

  //no start hint and the edge belongs in front of the current head
  if (!startEdge && E2InsertsBeforeE1(*m_ActiveEdges, *edge))
  {
    edge->PrevInAEL = 0;
    edge->NextInAEL = m_ActiveEdges;
    m_ActiveEdges->PrevInAEL = edge;
    m_ActiveEdges = edge;
    return;
  }

  //otherwise advance to the last entry that the edge must follow ...
  TEdge* anchor = startEdge ? startEdge : m_ActiveEdges;
  while (anchor->NextInAEL &&
    !E2InsertsBeforeE1(*anchor->NextInAEL, *edge))
      anchor = anchor->NextInAEL;

  //... and splice the edge in directly after it
  TEdge* follower = anchor->NextInAEL;
  edge->NextInAEL = follower;
  if (follower) follower->PrevInAEL = edge;
  edge->PrevInAEL = anchor;
  anchor->NextInAEL = edge;
}
//----------------------------------------------------------------------

//Allocates a copy of 'outPt' (same Pt and Idx) and links it into the same
//circular list, directly after 'outPt' when InsertAfter is true and directly
//before it otherwise. Returns the new point.
OutPt* DupOutPt(OutPt* outPt, bool InsertAfter)
{
  OutPt* copy = new OutPt;
  copy->Pt = outPt->Pt;
  copy->Idx = outPt->Idx;

  //work out the two neighbours of the new point before touching any links
  OutPt* before = InsertAfter ? outPt : outPt->Prev;
  OutPt* after = InsertAfter ? outPt->Next : outPt;

  copy->Prev = before;
  copy->Next = after;
  before->Next = copy;
  after->Prev = copy;
  return copy;
}
//------------------------------------------------------------------------------

//Joins two horizontal runs of output points at the point 'Pt'.
//  op1, op1b - the two ends of the first horizontal run
//  op2, op2b - the two ends of the second horizontal run
//  Pt        - the point at which the runs are to be joined
//  DiscardLeft - selects the side on which the duplicated points are inserted
//                (see the notes inside the function)
//The incoming op1b / op2b are only used to work out each run's direction;
//they are then overwritten with freshly duplicated points.
//Returns false (changing nothing) when both runs head in the same direction,
//otherwise relinks the two lists and returns true.
bool JoinHorz(OutPt* op1, OutPt* op1b, OutPt* op2, OutPt* op2b,
  const IntPoint Pt, bool DiscardLeft)
{
  Direction Dir1 = (op1->Pt.X > op1b->Pt.X ? dRightToLeft : dLeftToRight);
  Direction Dir2 = (op2->Pt.X > op2b->Pt.X ? dRightToLeft : dLeftToRight);
  if (Dir1 == Dir2) return false;

  //When DiscardLeft, we want Op1b to be on the Left of Op1, otherwise we
  //want Op1b to be on the Right. (And likewise with Op2 and Op2b.)
  //So, to facilitate this while inserting Op1b and Op2b ...
  //when DiscardLeft, make sure we're AT or RIGHT of Pt before adding Op1b,
  //otherwise make sure we're AT or LEFT of Pt. (Likewise with Op2b.)
  if (Dir1 == dLeftToRight) 
  {
    //advance along the run (same Y as Pt, X not decreasing) up to Pt.X
    while (op1->Next->Pt.X <= Pt.X && 
      op1->Next->Pt.X >= op1->Pt.X && op1->Next->Pt.Y == Pt.Y)  
        op1 = op1->Next;
    if (DiscardLeft && (op1->Pt.X != Pt.X)) op1 = op1->Next;
    op1b = DupOutPt(op1, !DiscardLeft);
    //if the duplicate is not exactly at Pt, move it there and duplicate again
    //so that both op1 and op1b end up at Pt
    if (op1b->Pt != Pt) 
    {
      op1 = op1b;
      op1->Pt = Pt;
      op1b = DupOutPt(op1, !DiscardLeft);
    }
  } 
  else
  {
    //mirror image of the above for a right-to-left run
    while (op1->Next->Pt.X >= Pt.X && 
      op1->Next->Pt.X <= op1->Pt.X && op1->Next->Pt.Y == Pt.Y) 
        op1 = op1->Next;
    if (!DiscardLeft && (op1->Pt.X != Pt.X)) op1 = op1->Next;
    op1b = DupOutPt(op1, DiscardLeft);
    if (op1b->Pt != Pt)
    {
      op1 = op1b;
      op1->Pt = Pt;
      op1b = DupOutPt(op1, DiscardLeft);
    }
  }

  //now do exactly the same for the second run ...
  if (Dir2 == dLeftToRight)
  {
    while (op2->Next->Pt.X <= Pt.X && 
      op2->Next->Pt.X >= op2->Pt.X && op2->Next->Pt.Y == Pt.Y)
        op2 = op2->Next;
    if (DiscardLeft && (op2->Pt.X != Pt.X)) op2 = op2->Next;
    op2b = DupOutPt(op2, !DiscardLeft);
    if (op2b->Pt != Pt)
    {
      op2 = op2b;
      op2->Pt = Pt;
      op2b = DupOutPt(op2, !DiscardLeft);
    };
  } else
  {
    while (op2->Next->Pt.X >= Pt.X && 
      op2->Next->Pt.X <= op2->Pt.X && op2->Next->Pt.Y == Pt.Y) 
        op2 = op2->Next;
    if (!DiscardLeft && (op2->Pt.X != Pt.X)) op2 = op2->Next;
    op2b = DupOutPt(op2, DiscardLeft);
    if (op2b->Pt != Pt)
    {
      op2 = op2b;
      op2->Pt = Pt;
      op2b = DupOutPt(op2, DiscardLeft);
    };
  };

  //finally cross-link the two lists at Pt: op1 is linked with op2 and op1b
  //with op2b, the link direction depending on Dir1 and DiscardLeft
  if ((Dir1 == dLeftToRight) == DiscardLeft)
  {
    op1->Prev = op2;
    op2->Next = op1;
    op1b->Next = op2b;
    op2b->Prev = op1b;
  }
  else
  {
    op1->Next = op2;
    op2->Prev = op1;
    op1b->Prev = op2b;
    op2b->Next = op1b;
  }
  return true;
}
//------------------------------------------------------------------------------

//Attempts to perform the join described by 'j' by relinking the output point
//lists at the join location.
//  j        - the join; on success j->OutPt1 and j->OutPt2 are updated
//  outRec1  - the output record that j->OutPt1 belongs to
//  outRec2  - the output record that j->OutPt2 belongs to
//Returns true when the point lists were relinked, false when the join was
//rejected (in which case no links have been changed).
bool Clipper::JoinPoints(Join *j, OutRec* outRec1, OutRec* outRec2)
{
  OutPt *op1 = j->OutPt1, *op1b;
  OutPt *op2 = j->OutPt2, *op2b;

  //There are 3 kinds of joins for output polygons ...
  //1. Horizontal joins where Join.OutPt1 & Join.OutPt2 are vertices anywhere
  //along (horizontal) collinear edges (& Join.OffPt is on the same horizontal).
  //2. Non-horizontal joins where Join.OutPt1 & Join.OutPt2 are at the same
  //location at the Bottom of the overlapping segment (& Join.OffPt is above).
  //3. StrictSimple joins where edges touch but are not collinear and where
  //Join.OutPt1, Join.OutPt2 & Join.OffPt all share the same point.
  bool isHorizontal = (j->OutPt1->Pt.Y == j->OffPt.Y);

  if (isHorizontal  && (j->OffPt == j->OutPt1->Pt) &&
  (j->OffPt == j->OutPt2->Pt))
  {
    //Strictly Simple join ...
    //(only applies when both points belong to the same output record)
    if (outRec1 != outRec2) return false;
    //find the first following point that differs from OffPt, for each side,
    //and compare on which side (in Y) of OffPt those points lie
    op1b = j->OutPt1->Next;
    while (op1b != op1 && (op1b->Pt == j->OffPt)) 
      op1b = op1b->Next;
    bool reverse1 = (op1b->Pt.Y > j->OffPt.Y);
    op2b = j->OutPt2->Next;
    while (op2b != op2 && (op2b->Pt == j->OffPt)) 
      op2b = op2b->Next;
    bool reverse2 = (op2b->Pt.Y > j->OffPt.Y);
    if (reverse1 == reverse2) return false;
    if (reverse1)
    {
      op1b = DupOutPt(op1, false);
      op2b = DupOutPt(op2, true);
      op1->Prev = op2;
      op2->Next = op1;
      op1b->Next = op2b;
      op2b->Prev = op1b;
      j->OutPt1 = op1;
      j->OutPt2 = op1b;
      return true;
    } else
    {
      op1b = DupOutPt(op1, true);
      op2b = DupOutPt(op2, false);
      op1->Next = op2;
      op2->Prev = op1;
      op1b->Prev = op2b;
      op2b->Next = op1b;
      j->OutPt1 = op1;
      j->OutPt2 = op1b;
      return true;
    }
  } 
  else if (isHorizontal)
  {
    //treat horizontal joins differently to non-horizontal joins since with
    //them we're not yet sure where the overlapping is. OutPt1.Pt & OutPt2.Pt
    //may be anywhere along the horizontal edge.
    //Extend op1 backwards and op1b forwards to the ends of the horizontal run
    //containing OutPt1 (without running into op2) ...
    op1b = op1;
    while (op1->Prev->Pt.Y == op1->Pt.Y && op1->Prev != op1b && op1->Prev != op2)
      op1 = op1->Prev;
    while (op1b->Next->Pt.Y == op1b->Pt.Y && op1b->Next != op1 && op1b->Next != op2)
      op1b = op1b->Next;
    if (op1b->Next == op1 || op1b->Next == op2) return false; //a flat 'polygon'

    //... and likewise for the run containing OutPt2
    op2b = op2;
    while (op2->Prev->Pt.Y == op2->Pt.Y && op2->Prev != op2b && op2->Prev != op1b)
      op2 = op2->Prev;
    while (op2b->Next->Pt.Y == op2b->Pt.Y && op2b->Next != op2 && op2b->Next != op1)
      op2b = op2b->Next;
    if (op2b->Next == op2 || op2b->Next == op1) return false; //a flat 'polygon'

    cInt Left, Right;
    //Op1 --> Op1b & Op2 --> Op2b are the extremities of the horizontal edges
    if (!GetOverlap(op1->Pt.X, op1b->Pt.X, op2->Pt.X, op2b->Pt.X, Left, Right))
      return false;

    //DiscardLeftSide: when overlapping edges are joined, a spike will be
    //created which needs to be cleaned up. However, we don't want Op1 or Op2
    //caught up on the discard Side as either may still be needed for other
    //joins ...
    //The join point is the first of op1, op2, op1b, op2b (in that order) that
    //lies within the overlap [Left, Right].
    IntPoint Pt;
    bool DiscardLeftSide;
    if (op1->Pt.X >= Left && op1->Pt.X <= Right) 
    {
      Pt = op1->Pt; DiscardLeftSide = (op1->Pt.X > op1b->Pt.X);
    } 
    else if (op2->Pt.X >= Left&& op2->Pt.X <= Right) 
    {
      Pt = op2->Pt; DiscardLeftSide = (op2->Pt.X > op2b->Pt.X);
    } 
    else if (op1b->Pt.X >= Left && op1b->Pt.X <= Right)
    {
      Pt = op1b->Pt; DiscardLeftSide = op1b->Pt.X > op1->Pt.X;
    } 
    else
    {
      Pt = op2b->Pt; DiscardLeftSide = (op2b->Pt.X > op2->Pt.X);
    }
    j->OutPt1 = op1; j->OutPt2 = op2;
    return JoinHorz(op1, op1b, op2, op2b, Pt, DiscardLeftSide);
  } else
  {
    //nb: For non-horizontal joins ...
    //    1. Jr.OutPt1.Pt.Y == Jr.OutPt2.Pt.Y
    //    2. Jr.OutPt1.Pt.Y > Jr.OffPt.Y

    //make sure the polygons are correctly oriented ...
    //For each side, look for the neighbouring (distinct) point that lies along
    //the line towards OffPt: first via Next, then - if that fails - via Prev
    //(Reverse1 / Reverse2 record that Prev had to be used). The join is
    //rejected if neither neighbour qualifies.
    op1b = op1->Next;
    while ((op1b->Pt == op1->Pt) && (op1b != op1)) op1b = op1b->Next;
    bool Reverse1 = ((op1b->Pt.Y > op1->Pt.Y) ||
      !SlopesEqual(op1->Pt, op1b->Pt, j->OffPt, m_UseFullRange));
    if (Reverse1)
    {
      op1b = op1->Prev;
      while ((op1b->Pt == op1->Pt) && (op1b != op1)) op1b = op1b->Prev;
      if ((op1b->Pt.Y > op1->Pt.Y) ||
        !SlopesEqual(op1->Pt, op1b->Pt, j->OffPt, m_UseFullRange)) return false;
    };
    op2b = op2->Next;
    while ((op2b->Pt == op2->Pt) && (op2b != op2))op2b = op2b->Next;
    bool Reverse2 = ((op2b->Pt.Y > op2->Pt.Y) ||
      !SlopesEqual(op2->Pt, op2b->Pt, j->OffPt, m_UseFullRange));
    if (Reverse2)
    {
      op2b = op2->Prev;
      while ((op2b->Pt == op2->Pt) && (op2b != op2)) op2b = op2b->Prev;
      if ((op2b->Pt.Y > op2->Pt.Y) ||
        !SlopesEqual(op2->Pt, op2b->Pt, j->OffPt, m_UseFullRange)) return false;
    }

    //reject degenerate cases: a side with no distinct neighbour, both sides
    //resolving to the same point, or (within a single output record) both
    //sides oriented the same way
    if ((op1b == op1) || (op2b == op2) || (op1b == op2b) ||
      ((outRec1 == outRec2) && (Reverse1 == Reverse2))) return false;

    if (Reverse1)
    {
      op1b = DupOutPt(op1, false);
      op2b = DupOutPt(op2, true);
      op1->Prev = op2;
      op2->Next = op1;
      op1b->Next = op2b;
      op2b->Prev = op1b;
      j->OutPt1 = op1;
      j->OutPt2 = op1b;
      return true;
    } else
    {
      op1b = DupOutPt(op1, true);
      op2b = DupOutPt(op2, false);
      op1->Next = op2;
      op2->Prev = op1;
      op1b->Prev = op2b;
      op2b->Next = op1b;
      j->OutPt1 = op1;
      j->OutPt2 = op1b;
      return true;
    }
  }
}
//----------------------------------------------------------------------

//Follows the FirstLeft chain, starting with 'FirstLeft' itself, and returns
//the first record that still has points (non-null Pts), or null if the chain
//ends without one.
static OutRec* ParseFirstLeft(OutRec* FirstLeft)
{
  for (OutRec* candidate = FirstLeft; candidate; candidate = candidate->FirstLeft)
  {
    if (candidate->Pts) return candidate;
  }
  return 0;
}
//------------------------------------------------------------------------------

//Re-parents output records from OldOutRec to NewOutRec, but only those that
//NewOutRec actually contains (as tested by Poly2ContainsPoly1).
//A record is a candidate when it has points and its effective FirstLeft
//(see ParseFirstLeft) is OldOutRec.
void Clipper::FixupFirstLefts1(OutRec* OldOutRec, OutRec* NewOutRec)
{ 
  for (PolyOutList::size_type idx = 0; idx < m_PolyOuts.size(); ++idx)
  {
    OutRec* candidate = m_PolyOuts[idx];
    if (!candidate->Pts) continue;
    if (ParseFirstLeft(candidate->FirstLeft) != OldOutRec) continue;

    if (Poly2ContainsPoly1(candidate->Pts, NewOutRec->Pts))
      candidate->FirstLeft = NewOutRec;
  }
}
//----------------------------------------------------------------------

//A polygon has split in two so that InnerOutRec now lies inside OuterOutRec.
//Every other record with points whose effective FirstLeft is OuterOutRec's
//own FirstLeft, InnerOutRec or OuterOutRec is re-examined: it is re-parented
//to InnerOutRec if that contains it, else to OuterOutRec if that contains it,
//else - if it was directly parented by either of the two - back to
//OuterOutRec's FirstLeft.
void Clipper::FixupFirstLefts2(OutRec* InnerOutRec, OutRec* OuterOutRec)
{
  OutRec* const outerParent = OuterOutRec->FirstLeft;
  for (PolyOutList::size_type idx = 0; idx < m_PolyOuts.size(); ++idx)
  {
    OutRec* candidate = m_PolyOuts[idx];

    const bool skip = !candidate->Pts ||
      candidate == OuterOutRec || candidate == InnerOutRec;
    if (skip) continue;

    OutRec* effective = ParseFirstLeft(candidate->FirstLeft);
    const bool related = (effective == outerParent) ||
      (effective == InnerOutRec) || (effective == OuterOutRec);
    if (!related) continue;

    if (Poly2ContainsPoly1(candidate->Pts, InnerOutRec->Pts))
    {
      candidate->FirstLeft = InnerOutRec;
    }
    else if (Poly2ContainsPoly1(candidate->Pts, OuterOutRec->Pts))
    {
      candidate->FirstLeft = OuterOutRec;
    }
    else if (candidate->FirstLeft == InnerOutRec ||
      candidate->FirstLeft == OuterOutRec)
    {
      candidate->FirstLeft = outerParent;
    }
  }
}
//----------------------------------------------------------------------
//Re-parents every output record that has points and whose effective
//FirstLeft (see ParseFirstLeft) is OldOutRec so that its FirstLeft becomes
//NewOutRec. Unlike FixupFirstLefts1, no containment test is made.
void Clipper::FixupFirstLefts3(OutRec* OldOutRec, OutRec* NewOutRec)
{
  for (PolyOutList::size_type idx = 0; idx < m_PolyOuts.size(); ++idx)
  {
    OutRec* candidate = m_PolyOuts[idx];
    if (!candidate->Pts) continue;
    if (ParseFirstLeft(candidate->FirstLeft) == OldOutRec)
      candidate->FirstLeft = NewOutRec;
  }
}
//----------------------------------------------------------------------

//Processes every pending Join in m_Joins. A successful JoinPoints() either
//splits one output polygon into two (both join points belonged to the same
//OutRec) or merges two output polygons into one. In both cases the IsHole
//and FirstLeft fields are updated, and when m_UsingPolyTree is set the
//FirstLeft links of other polygons are repaired via FixupFirstLefts1/2/3.
void Clipper::JoinCommonEdges()
{
  for (JoinList::size_type i = 0; i < m_Joins.size(); i++)
  {
    Join* join = m_Joins[i];

    //look up the output records that own the two join points ...
    OutRec *outRec1 = GetOutRec(join->OutPt1->Idx);
    OutRec *outRec2 = GetOutRec(join->OutPt2->Idx);

    //skip joins that touch an emptied record or an open path ...
    if (!outRec1->Pts || !outRec2->Pts) continue;
    if (outRec1->IsOpen || outRec2->IsOpen) continue;

    //get the polygon fragment with the correct hole state (FirstLeft)
    //before calling JoinPoints() ...
    OutRec *holeStateRec;
    if (outRec1 == outRec2) holeStateRec = outRec1;
    else if (OutRec1RightOfOutRec2(outRec1, outRec2)) holeStateRec = outRec2;
    else if (OutRec1RightOfOutRec2(outRec2, outRec1)) holeStateRec = outRec1;
    else holeStateRec = GetLowermostRec(outRec1, outRec2);

    //nothing more to do when the points could not be joined ...
    if (!JoinPoints(join, outRec1, outRec2)) continue;

    if (outRec1 == outRec2)
    {
      //instead of joining two polygons, we've just created a new one by
      //splitting one polygon into two.
      outRec1->Pts = join->OutPt1;
      outRec1->BottomPt = 0;
      outRec2 = CreateOutRec();
      outRec2->Pts = join->OutPt2;

      //update all OutRec2.Pts Idx's ...
      UpdateOutPtIdxs(*outRec2);

      if (Poly2ContainsPoly1(outRec2->Pts, outRec1->Pts))
      {
        //outRec1 contains outRec2 ...
        outRec2->IsHole = !outRec1->IsHole;
        outRec2->FirstLeft = outRec1;

        if (m_UsingPolyTree) FixupFirstLefts2(outRec2, outRec1);

        //reverse the new inner ring when its hole state (adjusted for
        //m_ReverseOutput) equals the sign test Area(...) > 0 ...
        if ((outRec2->IsHole ^ m_ReverseOutput) == (Area(*outRec2) > 0))
          ReversePolyPtLinks(outRec2->Pts);
            
      } else if (Poly2ContainsPoly1(outRec1->Pts, outRec2->Pts))
      {
        //outRec2 contains outRec1 ...
        outRec2->IsHole = outRec1->IsHole;
        outRec1->IsHole = !outRec2->IsHole;
        outRec2->FirstLeft = outRec1->FirstLeft;
        outRec1->FirstLeft = outRec2;

        if (m_UsingPolyTree) FixupFirstLefts2(outRec1, outRec2);

        //same orientation test as above, applied to the now-inner outRec1 ...
        if ((outRec1->IsHole ^ m_ReverseOutput) == (Area(*outRec1) > 0))
          ReversePolyPtLinks(outRec1->Pts);
      } 
      else
      {
        //the 2 polygons are completely separate ...
        outRec2->IsHole = outRec1->IsHole;
        outRec2->FirstLeft = outRec1->FirstLeft;

        //fixup FirstLeft pointers that may need reassigning to OutRec2
        if (m_UsingPolyTree) FixupFirstLefts1(outRec1, outRec2);
      }
     
    } else
    {
      //joined 2 polygons together ...

      //outRec2 is emptied and redirected to outRec1 (same Idx) ...
      outRec2->Pts = 0;
      outRec2->BottomPt = 0;
      outRec2->Idx = outRec1->Idx;

      //the merged polygon takes the hole state chosen before the join ...
      outRec1->IsHole = holeStateRec->IsHole;
      if (holeStateRec == outRec2) 
        outRec1->FirstLeft = outRec2->FirstLeft;
      outRec2->FirstLeft = outRec1;

      if (m_UsingPolyTree) FixupFirstLefts3(outRec2, outRec1);
    }
  }
}

//------------------------------------------------------------------------------
// ClipperOffset support functions ...
//------------------------------------------------------------------------------

DoublePoint GetUnitNormal(const IntPoint &pt1, const IntPoint &pt2)
{
  //Returns the unit-length vector (dy, -dx) perpendicular to the direction
  //pt1 -> pt2, or (0, 0) when both points coincide.
  const bool samePoint = (pt2.X == pt1.X && pt2.Y == pt1.Y);
  if (samePoint)
    return DoublePoint(0, 0);

  const double dx = (double)(pt2.X - pt1.X);
  const double dy = (double)(pt2.Y - pt1.Y);
  const double invLen = 1.0 / std::sqrt(dx * dx + dy * dy);
  const double ux = dx * invLen;
  const double uy = dy * invLen;
  return DoublePoint(uy, -ux);
}

//------------------------------------------------------------------------------
// ClipperOffset class
//------------------------------------------------------------------------------

ClipperOffset::ClipperOffset(double miterLimit, double arcTolerance)
{
  //Stores the two tuning parameters. m_lowest.X == -1 is the "no lowest
  //closed polygon recorded yet" marker tested by AddPath/FixOrientations.
  MiterLimit = miterLimit;
  ArcTolerance = arcTolerance;
  m_lowest.X = -1;
}
//------------------------------------------------------------------------------

ClipperOffset::~ClipperOffset()
{
  //Releases every PolyNode that AddPath() allocated and still holds.
  this->Clear();
}
//------------------------------------------------------------------------------

void ClipperOffset::Clear()
{
  //Deletes all stored input nodes, empties the child list and resets the
  //lowest-point marker to "none" (-1).
  const int nodeCnt = m_polyNodes.ChildCount();
  for (int n = 0; n != nodeCnt; ++n)
  {
    delete m_polyNodes.Childs[n];
  }
  m_polyNodes.Childs.clear();
  m_lowest.X = -1;
}
//------------------------------------------------------------------------------

void ClipperOffset::AddPath(const Path& path, JoinType joinType, EndType endType)
{
  int highI = (int)path.size() - 1;
  if (highI < 0) return;
  PolyNode* newNode = new PolyNode();
  newNode->m_jointype = joinType;
  newNode->m_endtype = endType;

  //strip duplicate points from path and also get index to the lowest point ...
  if (endType == etClosedLine || endType == etClosedPolygon)
    while (highI > 0 && path[0] == path[highI]) highI--;
  newNode->Contour.reserve(highI + 1);
  newNode->Contour.push_back(path[0]);
  int j = 0, k = 0;
  for (int i = 1; i <= highI; i++)
    if (newNode->Contour[j] != path[i])
    {
      j++;
      newNode->Contour.push_back(path[i]);
      if (path[i].Y > newNode->Contour[k].Y ||
        (path[i].Y == newNode->Contour[k].Y &&
        path[i].X < newNode->Contour[k].X)) k = j;
    }
  if (endType == etClosedPolygon && j < 2)
  {
    delete newNode;
    return;
  }
  m_polyNodes.AddChild(*newNode);

  //if this path's lowest pt is lower than all the others then update m_lowest
  if (endType != etClosedPolygon) return;
  if (m_lowest.X < 0)
    m_lowest = IntPoint(m_polyNodes.ChildCount() - 1, k);
  else
  {
    IntPoint ip = m_polyNodes.Childs[(int)m_lowest.X]->Contour[(int)m_lowest.Y];
    if (newNode->Contour[k].Y > ip.Y ||
      (newNode->Contour[k].Y == ip.Y &&
      newNode->Contour[k].X < ip.X))
      m_lowest = IntPoint(m_polyNodes.ChildCount() - 1, k);
  }
}
//------------------------------------------------------------------------------

void ClipperOffset::AddPaths(const Paths& paths, JoinType joinType, EndType endType)
{
  for (Paths::size_type i = 0; i < paths.size(); ++i)
    AddPath(paths[i], joinType, endType);
}
//------------------------------------------------------------------------------

void ClipperOffset::FixOrientations()
{
  //If the closed polygon holding the lowermost vertex has the wrong
  //orientation, every closed polygon is reversed together with every closed
  //line whose Orientation() is true. Otherwise only closed lines whose
  //Orientation() is false are reversed.
  const bool lowestIsWrong = m_lowest.X >= 0 &&
    !Orientation(m_polyNodes.Childs[(int)m_lowest.X]->Contour);

  const int nodeCnt = m_polyNodes.ChildCount();
  for (int n = 0; n < nodeCnt; ++n)
  {
    PolyNode& node = *m_polyNodes.Childs[n];
    bool mustReverse;
    if (lowestIsWrong)
    {
      mustReverse = node.m_endtype == etClosedPolygon ||
        (node.m_endtype == etClosedLine && Orientation(node.Contour));
    }
    else
    {
      mustReverse = node.m_endtype == etClosedLine &&
        !Orientation(node.Contour);
    }
    if (mustReverse) ReversePath(node.Contour);
  }
}
//------------------------------------------------------------------------------

void ClipperOffset::Execute(Paths& solution, double delta)
{
  //Offsets all added paths by 'delta' and writes the cleaned-up result to
  //'solution' (which is cleared first).
  solution.clear();
  FixOrientations();
  DoOffset(delta);

  //union the raw offset polygons to clean up 'corners' ...
  Clipper cleaner;
  cleaner.AddPaths(m_destPolys, ptSubject, true);
  if (delta > 0)
  {
    cleaner.Execute(ctUnion, solution, pftPositive, pftPositive);
    return;
  }

  //non-positive delta: wrap everything in a rectangle 10 units larger than
  //the bounds, union with negative filling, then drop the first result path
  IntRect bounds = cleaner.GetBounds();
  Path frame;
  frame.reserve(4);
  frame.push_back(IntPoint(bounds.left - 10, bounds.bottom + 10));
  frame.push_back(IntPoint(bounds.right + 10, bounds.bottom + 10));
  frame.push_back(IntPoint(bounds.right + 10, bounds.top - 10));
  frame.push_back(IntPoint(bounds.left - 10, bounds.top - 10));

  cleaner.AddPath(frame, ptSubject, true);
  cleaner.ReverseSolution(true);
  cleaner.Execute(ctUnion, solution, pftNegative, pftNegative);
  if (!solution.empty()) solution.erase(solution.begin());
}
//------------------------------------------------------------------------------

void ClipperOffset::Execute(PolyTree& solution, double delta)
{
  //PolyTree flavour of Execute(): offsets all added paths by 'delta' and
  //stores the cleaned-up result in 'solution' (which is cleared first).
  solution.Clear();
  FixOrientations();
  DoOffset(delta);

  //union the raw offset polygons to clean up 'corners' ...
  Clipper cleaner;
  cleaner.AddPaths(m_destPolys, ptSubject, true);
  if (delta > 0)
  {
    cleaner.Execute(ctUnion, solution, pftPositive, pftPositive);
    return;
  }

  //non-positive delta: wrap everything in a rectangle 10 units larger than
  //the bounds and union with negative filling ...
  IntRect bounds = cleaner.GetBounds();
  Path frame;
  frame.reserve(4);
  frame.push_back(IntPoint(bounds.left - 10, bounds.bottom + 10));
  frame.push_back(IntPoint(bounds.right + 10, bounds.bottom + 10));
  frame.push_back(IntPoint(bounds.right + 10, bounds.top - 10));
  frame.push_back(IntPoint(bounds.left - 10, bounds.top - 10));

  cleaner.AddPath(frame, ptSubject, true);
  cleaner.ReverseSolution(true);
  cleaner.Execute(ctUnion, solution, pftNegative, pftNegative);

  //remove the outer PolyNode rectangle by promoting its children ...
  const bool hasFrameNode =
    solution.ChildCount() == 1 && solution.Childs[0]->ChildCount() > 0;
  if (!hasFrameNode)
  {
    solution.Clear();
    return;
  }

  PolyNode* frameNode = solution.Childs[0];
  solution.Childs.reserve(frameNode->ChildCount());
  //the first child takes over the frame's slot in the top-level list
  solution.Childs[0] = frameNode->Childs[0];
  solution.Childs[0]->Parent = frameNode->Parent;
  for (int n = 1; n < frameNode->ChildCount(); ++n)
  {
    solution.AddChild(*frameNode->Childs[n]);
  }
}
//------------------------------------------------------------------------------

//Builds the raw (uncleaned) offset polygons for every stored input node and
//appends them to m_destPolys. 'delta' is the signed offset distance.
//  - near-zero delta: closed polygons are copied through unchanged.
//  - single-vertex paths become a circle (jtRound) or a square.
//  - closed polygons are offset on one side, closed lines on both sides.
//  - open paths are offset on both sides and capped per their end type.
//Sets m_delta, m_miterLim, m_sin, m_cos and m_StepsPerRad for the helpers
//OffsetPoint/DoSquare/DoMiter/DoRound.
void ClipperOffset::DoOffset(double delta)
{
  m_destPolys.clear();
  m_delta = delta;

  //if Zero offset, just copy any CLOSED polygons to m_destPolys and return ...
  if (NEAR_ZERO(delta)) 
  {
    m_destPolys.reserve(m_polyNodes.ChildCount());
    for (int i = 0; i < m_polyNodes.ChildCount(); i++)
    {
      PolyNode& node = *m_polyNodes.Childs[i];
      if (node.m_endtype == etClosedPolygon)
        m_destPolys.push_back(node.Contour);
    }
    return;
  }

  //see offset_triginometry3.svg in the documentation folder ...
  if (MiterLimit > 2) m_miterLim = 2/(MiterLimit * MiterLimit);
  else m_miterLim = 0.5;

  double y;
  if (ArcTolerance <= 0.0) y = def_arc_tolerance;
  else if (ArcTolerance > std::fabs(delta) * def_arc_tolerance) 
    y = std::fabs(delta) * def_arc_tolerance;
  else y = ArcTolerance;
  //see offset_triginometry2.svg in the documentation folder ...
  //When ArcTolerance <= 0 and |delta| is very small, 1 - y/|delta| can drop
  //below -1, which is outside acos' domain and would turn 'steps' (and then
  //m_sin, m_cos, m_StepsPerRad) into NaN. Clamp to keep the result finite.
  double cosArg = 1 - y / std::fabs(delta);
  if (cosArg < -1.0) cosArg = -1.0;
  double steps = pi / std::acos(cosArg);
  if (steps > std::fabs(delta) * pi) 
    steps = std::fabs(delta) * pi;  //ie excessive precision check
  m_sin = std::sin(two_pi / steps);
  m_cos = std::cos(two_pi / steps);
  m_StepsPerRad = steps / two_pi;
  if (delta < 0.0) m_sin = -m_sin;

  m_destPolys.reserve(m_polyNodes.ChildCount() * 2);
  for (int i = 0; i < m_polyNodes.ChildCount(); i++)
  {
    PolyNode& node = *m_polyNodes.Childs[i];
    m_srcPoly = node.Contour;

    //shrinking (delta <= 0) only makes sense for closed polygons with at
    //least 3 vertices ...
    int len = (int)m_srcPoly.size();
    if (len == 0 || (delta <= 0 && (len < 3 || node.m_endtype != etClosedPolygon)))
        continue;

    m_destPoly.clear();
    if (len == 1)
    {
      //a single point offsets to a circle or to a square ...
      if (node.m_jointype == jtRound)
      {
        double X = 1.0, Y = 0.0;
        for (cInt j = 1; j <= steps; j++)
        {
          m_destPoly.push_back(IntPoint(
            Round(m_srcPoly[0].X + X * delta),
            Round(m_srcPoly[0].Y + Y * delta)));
          //rotate (X, Y) by one step
          double X2 = X;
          X = X * m_cos - m_sin * Y;
          Y = X2 * m_sin + Y * m_cos;
        }
      }
      else
      {
        //visit the corners (-1,-1), (1,-1), (1,1), (-1,1) scaled by delta
        double X = -1.0, Y = -1.0;
        for (int j = 0; j < 4; ++j)
        {
          m_destPoly.push_back(IntPoint(
            Round(m_srcPoly[0].X + X * delta),
            Round(m_srcPoly[0].Y + Y * delta)));
          if (X < 0) X = 1;
          else if (Y < 0) Y = 1;
          else X = -1;
        }
      }
      m_destPolys.push_back(m_destPoly);
      continue;
    }
    //build m_normals ...
    m_normals.clear();
    m_normals.reserve(len);
    for (int j = 0; j < len - 1; ++j)
      m_normals.push_back(GetUnitNormal(m_srcPoly[j], m_srcPoly[j + 1]));
    if (node.m_endtype == etClosedLine || node.m_endtype == etClosedPolygon)
      m_normals.push_back(GetUnitNormal(m_srcPoly[len - 1], m_srcPoly[0]));
    else
      m_normals.push_back(DoublePoint(m_normals[len - 2]));

    if (node.m_endtype == etClosedPolygon)
    {
      int k = len - 1;
      for (int j = 0; j < len; ++j)
        OffsetPoint(j, k, node.m_jointype);
      m_destPolys.push_back(m_destPoly);
    }
    else if (node.m_endtype == etClosedLine)
    {
      //first pass: one side of the closed line ...
      int k = len - 1;
      for (int j = 0; j < len; ++j)
        OffsetPoint(j, k, node.m_jointype);
      m_destPolys.push_back(m_destPoly);
      m_destPoly.clear();
      //re-build m_normals ...
      DoublePoint n = m_normals[len -1];
      for (int j = len - 1; j > 0; j--)
        m_normals[j] = DoublePoint(-m_normals[j - 1].X, -m_normals[j - 1].Y);
      m_normals[0] = DoublePoint(-n.X, -n.Y);
      //second pass: the other side, walking the path backwards ...
      k = 0;
      for (int j = len - 1; j >= 0; j--)
        OffsetPoint(j, k, node.m_jointype);
      m_destPolys.push_back(m_destPoly);
    }
    else
    {
      //open path: forward side first ...
      int k = 0;
      for (int j = 1; j < len - 1; ++j)
        OffsetPoint(j, k, node.m_jointype);

      //cap at the last vertex ...
      IntPoint pt1;
      if (node.m_endtype == etOpenButt)
      {
        int j = len - 1;
        pt1 = IntPoint((cInt)Round(m_srcPoly[j].X + m_normals[j].X *
          delta), (cInt)Round(m_srcPoly[j].Y + m_normals[j].Y * delta));
        m_destPoly.push_back(pt1);
        pt1 = IntPoint((cInt)Round(m_srcPoly[j].X - m_normals[j].X *
          delta), (cInt)Round(m_srcPoly[j].Y - m_normals[j].Y * delta));
        m_destPoly.push_back(pt1);
      }
      else
      {
        int j = len - 1;
        k = len - 2;
        m_sinA = 0;
        m_normals[j] = DoublePoint(-m_normals[j].X, -m_normals[j].Y);
        if (node.m_endtype == etOpenSquare)
          DoSquare(j, k);
        else
          DoRound(j, k);
      }

      //re-build m_normals ...
      for (int j = len - 1; j > 0; j--)
        m_normals[j] = DoublePoint(-m_normals[j - 1].X, -m_normals[j - 1].Y);
      m_normals[0] = DoublePoint(-m_normals[1].X, -m_normals[1].Y);

      //return side, walking the path backwards ...
      k = len - 1;
      for (int j = k - 1; j > 0; --j) OffsetPoint(j, k, node.m_jointype);

      //cap at the first vertex ...
      if (node.m_endtype == etOpenButt)
      {
        pt1 = IntPoint((cInt)Round(m_srcPoly[0].X - m_normals[0].X * delta),
          (cInt)Round(m_srcPoly[0].Y - m_normals[0].Y * delta));
        m_destPoly.push_back(pt1);
        pt1 = IntPoint((cInt)Round(m_srcPoly[0].X + m_normals[0].X * delta),
          (cInt)Round(m_srcPoly[0].Y + m_normals[0].Y * delta));
        m_destPoly.push_back(pt1);
      }
      else
      {
        k = 1;
        m_sinA = 0;
        if (node.m_endtype == etOpenSquare)
          DoSquare(0, 1);
        else
          DoRound(0, 1);
      }
      m_destPolys.push_back(m_destPoly);
    }
  }
}
//------------------------------------------------------------------------------

void ClipperOffset::OffsetPoint(int j, int& k, JoinType jointype)
{
  //Emits the offset vertex/vertices for source vertex j, where k is the index
  //of the previously processed vertex (m_normals[k] is the incoming normal,
  //m_normals[j] the outgoing one). On return k is set to j, except on the
  //early-out for near-straight joins, which leaves k untouched.

  //cross product ...
  m_sinA = (m_normals[k].X * m_normals[j].Y - m_normals[j].X * m_normals[k].Y);
  const bool nearlyParallel = std::fabs(m_sinA * m_delta) < 1.0;
  if (nearlyParallel)
  {
    //dot product ...
    const double dot =
      (m_normals[k].X * m_normals[j].X + m_normals[j].Y * m_normals[k].Y );
    if (dot > 0) // angle => 0 degrees
    {
      m_destPoly.push_back(IntPoint(
        Round(m_srcPoly[j].X + m_normals[k].X * m_delta),
        Round(m_srcPoly[j].Y + m_normals[k].Y * m_delta)));
      return;
    }
    //else angle => 180 degrees
  }
  else
  {
    //keep the sine within [-1, 1]
    if (m_sinA > 1.0) m_sinA = 1.0;
    else if (m_sinA < -1.0) m_sinA = -1.0;
  }

  if (m_sinA * m_delta < 0)
  {
    //sign of m_sinA * m_delta is negative: emit the two offset points with
    //the source vertex in between
    m_destPoly.push_back(IntPoint(
      Round(m_srcPoly[j].X + m_normals[k].X * m_delta),
      Round(m_srcPoly[j].Y + m_normals[k].Y * m_delta)));
    m_destPoly.push_back(m_srcPoly[j]);
    m_destPoly.push_back(IntPoint(
      Round(m_srcPoly[j].X + m_normals[j].X * m_delta),
      Round(m_srcPoly[j].Y + m_normals[j].Y * m_delta)));
  }
  else
  {
    switch (jointype)
    {
      case jtMiter:
        {
          const double r = 1 + (m_normals[j].X * m_normals[k].X +
            m_normals[j].Y * m_normals[k].Y);
          //fall back to a squared join when the miter limit is exceeded
          if (r >= m_miterLim)
            DoMiter(j, k, r);
          else
            DoSquare(j, k);
          break;
        }
      case jtSquare:
        DoSquare(j, k);
        break;
      case jtRound:
        DoRound(j, k);
        break;
    }
  }
  k = j;
}
//------------------------------------------------------------------------------

void ClipperOffset::DoSquare(int j, int k)
{
  double dx = std::tan(std::atan2(m_sinA,
      m_normals[k].X * m_normals[j].X + m_normals[k].Y * m_normals[j].Y) / 4);
  m_destPoly.push_back(IntPoint(
      Round(m_srcPoly[j].X + m_delta * (m_normals[k].X - m_normals[k].Y * dx)),
      Round(m_srcPoly[j].Y + m_delta * (m_normals[k].Y + m_normals[k].X * dx))));
  m_destPoly.push_back(IntPoint(
      Round(m_srcPoly[j].X + m_delta * (m_normals[j].X + m_normals[j].Y * dx)),
      Round(m_srcPoly[j].Y + m_delta * (m_normals[j].Y - m_normals[j].X * dx))));
}
//------------------------------------------------------------------------------

void ClipperOffset::DoMiter(int j, int k, double r)
{
  double q = m_delta / r;
  m_destPoly.push_back(IntPoint(Round(m_srcPoly[j].X + (m_normals[k].X + m_normals[j].X) * q),
      Round(m_srcPoly[j].Y + (m_normals[k].Y + m_normals[j].Y) * q)));
}
//------------------------------------------------------------------------------

void ClipperOffset::DoRound(int j, int k)
{
  //Appends an arc around source vertex j: starts at the offset given by the
  //normal at k, rotates by (m_sin, m_cos) per step, and finishes with the
  //exact offset given by the normal at j.
  const double cosA =
      m_normals[k].X * m_normals[j].X + m_normals[k].Y * m_normals[j].Y;
  const double angle = std::atan2(m_sinA, cosA);
  //at least one intermediate vertex is always emitted
  const int steps = std::max((int)Round(m_StepsPerRad * std::fabs(angle)), 1);

  double vx = m_normals[k].X;
  double vy = m_normals[k].Y;
  for (int remaining = steps; remaining > 0; --remaining)
  {
    m_destPoly.push_back(IntPoint(
        Round(m_srcPoly[j].X + vx * m_delta),
        Round(m_srcPoly[j].Y + vy * m_delta)));
    const double oldX = vx;
    vx = vx * m_cos - m_sin * vy;
    vy = oldX * m_sin + vy * m_cos;
  }

  m_destPoly.push_back(IntPoint(
      Round(m_srcPoly[j].X + m_normals[j].X * m_delta),
      Round(m_srcPoly[j].Y + m_normals[j].Y * m_delta)));
}

//------------------------------------------------------------------------------
// Miscellaneous public functions
//------------------------------------------------------------------------------

//Splits every closed output polygon at vertices that occur twice in its
//ring (same coordinates, not adjacent), so that no polygon touches itself
//at a vertex. Each split creates a new OutRec via CreateOutRec(); because
//m_PolyOuts.size() is re-read on every pass of the outer loop, records
//appended to m_PolyOuts during the scan are scanned as well.
void Clipper::DoSimplePolygons()
{
  PolyOutList::size_type i = 0;
  while (i < m_PolyOuts.size()) 
  {
    OutRec* outrec = m_PolyOuts[i++];
    OutPt* op = outrec->Pts;
    //skip emptied records and open paths ...
    if (!op || outrec->IsOpen) continue;
    do //for each Pt in Polygon until duplicate found do ...
    {
      //compare op against every later vertex in the ring ...
      OutPt* op2 = op->Next;
      while (op2 != outrec->Pts) 
      {
        //same coordinates, but op and op2 are not neighbours in the ring ...
        if ((op->Pt == op2->Pt) && op2->Next != op && op2->Prev != op) 
        {
          //split the polygon into two ...
          //(swap the Prev links of op and op2 so each closes its own ring)
          OutPt* op3 = op->Prev;
          OutPt* op4 = op2->Prev;
          op->Prev = op4;
          op4->Next = op;
          op2->Prev = op3;
          op3->Next = op2;

          //the original record keeps op's ring; a new record takes op2's ...
          outrec->Pts = op;
          OutRec* outrec2 = CreateOutRec();
          outrec2->Pts = op2;
          UpdateOutPtIdxs(*outrec2);
          if (Poly2ContainsPoly1(outrec2->Pts, outrec->Pts))
          {
            //OutRec2 is contained by OutRec1 ...
            outrec2->IsHole = !outrec->IsHole;
            outrec2->FirstLeft = outrec;
            if (m_UsingPolyTree) FixupFirstLefts2(outrec2, outrec);
          }
          else
            if (Poly2ContainsPoly1(outrec->Pts, outrec2->Pts))
          {
            //OutRec1 is contained by OutRec2 ...
            outrec2->IsHole = outrec->IsHole;
            outrec->IsHole = !outrec2->IsHole;
            outrec2->FirstLeft = outrec->FirstLeft;
            outrec->FirstLeft = outrec2;
            if (m_UsingPolyTree) FixupFirstLefts2(outrec, outrec2);
            }
            else
          {
            //the 2 polygons are separate ...
            outrec2->IsHole = outrec->IsHole;
            outrec2->FirstLeft = outrec->FirstLeft;
            if (m_UsingPolyTree) FixupFirstLefts1(outrec, outrec2);
            }
          op2 = op; //ie get ready for the Next iteration
        }
        op2 = op2->Next;
      }
      op = op->Next;
    }
    while (op != outrec->Pts);
  }
}
//------------------------------------------------------------------------------

void ReversePath(Path& p)
{
  //Reverses the vertex order of 'p' in place.
  Path::iterator first = p.begin();
  Path::iterator last = p.end();
  std::reverse(first, last);
}
//------------------------------------------------------------------------------

void ReversePaths(Paths& p)
{
  //Reverses the vertex order of every path in 'p' in place.
  for (Paths::iterator it = p.begin(); it != p.end(); ++it)
  {
    ReversePath(*it);
  }
}
//------------------------------------------------------------------------------

void SimplifyPolygon(const Path &in_poly, Paths &out_polys, PolyFillType fillType)
{
  Clipper c;
  c.StrictlySimple(true);
  c.AddPath(in_poly, ptSubject, true);
  c.Execute(ctUnion, out_polys, fillType, fillType);
}
//------------------------------------------------------------------------------

void SimplifyPolygons(const Paths &in_polys, Paths &out_polys, PolyFillType fillType)
{
  Clipper c;
  c.StrictlySimple(true);
  c.AddPaths(in_polys, ptSubject, true);
  c.Execute(ctUnion, out_polys, fillType, fillType);
}
//------------------------------------------------------------------------------

void SimplifyPolygons(Paths &polys, PolyFillType fillType)
{
  //In-place convenience overload: 'polys' is both the input and the output.
  const Paths &input = polys;
  SimplifyPolygons(input, polys, fillType);
}
//------------------------------------------------------------------------------

inline double DistanceSqrd(const IntPoint& pt1, const IntPoint& pt2)
{
  //Squared Euclidean distance between pt1 and pt2, computed in doubles.
  const double deltaX = ((double)pt1.X - pt2.X);
  const double deltaY = ((double)pt1.Y - pt2.Y);
  return (deltaX*deltaX + deltaY*deltaY);
}
//------------------------------------------------------------------------------

double DistanceFromLineSqrd(
  const IntPoint& pt, const IntPoint& ln1, const IntPoint& ln2)
{
  //Squared perpendicular distance from 'pt' to the line through ln1 and ln2.
  //
  //The equation of a line in general form (Ax + By + C = 0)
  //given 2 points (x1,y1) & (x2,y2) is ...
  //(y1 - y2)x + (x2 - x1)y + (y2 - y1)x1 - (x2 - x1)y1 = 0
  //A = (y1 - y2); B = (x2 - x1); C = (y2 - y1)x1 - (x2 - x1)y1
  //perpendicular distance of point (x3,y3) = (Ax3 + By3 + C)/Sqrt(A^2 + B^2)
  //see http://en.wikipedia.org/wiki/Perpendicular_distance
  const double coefA = double(ln1.Y - ln2.Y);
  const double coefB = double(ln2.X - ln1.X);
  //value of Ax + By on the line itself, then the offset of 'pt' from it
  double offset = coefA * ln1.X  + coefB * ln1.Y;
  offset = coefA * pt.X + coefB * pt.Y - offset;
  return (offset * offset) / (coefA * coefA + coefB * coefB);
}
//---------------------------------------------------------------------------

bool SlopesNearCollinear(const IntPoint& pt1, 
    const IntPoint& pt2, const IntPoint& pt3, double distSqrd)
{
  //this function is more accurate when the point that's geometrically
  //between the other 2 points is the one that's tested for distance.
  //ie makes it more likely to pick up 'spikes' ...
	if (Abs(pt1.X - pt2.X) > Abs(pt1.Y - pt2.Y))
	{
    if ((pt1.X > pt2.X) == (pt1.X < pt3.X))
      return DistanceFromLineSqrd(pt1, pt2, pt3) < distSqrd;
    else if ((pt2.X > pt1.X) == (pt2.X < pt3.X))
      return DistanceFromLineSqrd(pt2, pt1, pt3) < distSqrd;
		else
	    return DistanceFromLineSqrd(pt3, pt1, pt2) < distSqrd;
	}
	else
	{
    if ((pt1.Y > pt2.Y) == (pt1.Y < pt3.Y))
      return DistanceFromLineSqrd(pt1, pt2, pt3) < distSqrd;
    else if ((pt2.Y > pt1.Y) == (pt2.Y < pt3.Y))
      return DistanceFromLineSqrd(pt2, pt1, pt3) < distSqrd;
		else
      return DistanceFromLineSqrd(pt3, pt1, pt2) < distSqrd;
	}
}
//------------------------------------------------------------------------------

bool PointsAreClose(IntPoint pt1, IntPoint pt2, double distSqrd)
{
    //True when the squared distance between pt1 and pt2 is <= distSqrd.
    const double deltaX = (double)pt1.X - pt2.X;
    const double deltaY = (double)pt1.Y - pt2.Y;
    const double separationSqrd = (deltaX * deltaX) + (deltaY * deltaY);
    return separationSqrd <= distSqrd;
}
//------------------------------------------------------------------------------

OutPt* ExcludeOp(OutPt* op)
{
  //Unlinks 'op' from its doubly linked ring and returns its predecessor,
  //whose Idx is reset to 0.
  OutPt* before = op->Prev;
  OutPt* after = op->Next;
  before->Next = after;
  after->Prev = before;
  before->Idx = 0;
  return before;
}
//------------------------------------------------------------------------------

void CleanPolygon(const Path& in_poly, Path& out_poly, double distance)
{
  //distance = proximity in units/pixels below which vertices
  //will be stripped. Default ~= sqrt(2).
  
  size_t size = in_poly.size();
  
  if (size == 0) 
  {
    out_poly.clear();
    return;
  }

  OutPt* outPts = new OutPt[size];
  for (size_t i = 0; i < size; ++i)
  {
    outPts[i].Pt = in_poly[i];
    outPts[i].Next = &outPts[(i + 1) % size];
    outPts[i].Next->Prev = &outPts[i];
    outPts[i].Idx = 0;
  }

  double distSqrd = distance * distance;
  OutPt* op = &outPts[0];
  while (op->Idx == 0 && op->Next != op->Prev) 
  {
    if (PointsAreClose(op->Pt, op->Prev->Pt, distSqrd))
    {
      op = ExcludeOp(op);
      size--;
    } 
    else if (PointsAreClose(op->Prev->Pt, op->Next->Pt, distSqrd))
    {
      ExcludeOp(op->Next);
      op = ExcludeOp(op);
      size -= 2;
    }
    else if (SlopesNearCollinear(op->Prev->Pt, op->Pt, op->Next->Pt, distSqrd))
    {
      op = ExcludeOp(op);
      size--;
    }
    else
    {
      op->Idx = 1;
      op = op->Next;
    }
  }

  if (size < 3) size = 0;
  out_poly.resize(size);
  for (size_t i = 0; i < size; ++i)
  {
    out_poly[i] = op->Pt;
    op = op->Next;
  }
  delete [] outPts;
}
//------------------------------------------------------------------------------

void CleanPolygon(Path& poly, double distance)
{
  //In-place convenience overload: 'poly' is both the input and the output.
  const Path& input = poly;
  CleanPolygon(input, poly, distance);
}
//------------------------------------------------------------------------------

void CleanPolygons(const Paths& in_polys, Paths& out_polys, double distance)
{
  out_polys.resize(in_polys.size());
  for (Paths::size_type i = 0; i < in_polys.size(); ++i)
    CleanPolygon(in_polys[i], out_polys[i], distance);
}
//------------------------------------------------------------------------------

void CleanPolygons(Paths& polys, double distance)
{
  //In-place convenience overload: 'polys' is both the input and the output.
  const Paths& input = polys;
  CleanPolygons(input, polys, distance);
}
//------------------------------------------------------------------------------

//Builds the quadrilaterals whose union forms the Minkowski sum (isSum) or
//difference (!isSum) of 'poly' swept along 'path', and stores them in
//'solution' (cleared first). When isClosed is true an extra band of quads
//joins the last path vertex back to the first. Every quad is oriented so
//that Orientation() is true. An empty 'poly' or 'path' yields an empty
//solution.
void Minkowski(const Path& poly, const Path& path, 
  Paths& solution, bool isSum, bool isClosed)
{
  const size_t polyCnt = poly.size();
  const size_t pathCnt = path.size();

  solution.clear();
  //Guard: with an empty open path 'pathCnt - 1' would wrap around (size_t)
  //and the loop below would take a modulus by zero.
  if (polyCnt == 0 || pathCnt == 0) return;

  //pp[i] is 'poly' translated (sum) or reflected and translated (difference)
  //to path vertex i ...
  Paths pp;
  pp.reserve(pathCnt);
  for (size_t i = 0; i < pathCnt; ++i)
  {
    Path p;
    p.reserve(polyCnt);
    for (size_t j = 0; j < polyCnt; ++j)
    {
      if (isSum)
        p.push_back(IntPoint(path[i].X + poly[j].X, path[i].Y + poly[j].Y));
      else
        p.push_back(IntPoint(path[i].X - poly[j].X, path[i].Y - poly[j].Y));
    }
    pp.push_back(p);
  }

  //connect each pair of consecutive copies with one quad per poly edge ...
  const size_t delta = (isClosed ? 1 : 0);
  const size_t bandCnt = pathCnt - 1 + delta;  //pathCnt >= 1, so no wrap
  solution.reserve((pathCnt + delta) * (polyCnt + 1));
  for (size_t i = 0; i < bandCnt; ++i)
  {
    const size_t iNext = (i + 1) % pathCnt;
    for (size_t j = 0; j < polyCnt; ++j)
    {
      const size_t jNext = (j + 1) % polyCnt;
      Path quad;
      quad.reserve(4);
      quad.push_back(pp[i][j]);
      quad.push_back(pp[iNext][j]);
      quad.push_back(pp[iNext][jNext]);
      quad.push_back(pp[i][jNext]);
      if (!Orientation(quad)) ReversePath(quad);
      solution.push_back(quad);
    }
  }
}
//------------------------------------------------------------------------------

void MinkowskiSum(const Path& pattern, const Path& path, Paths& solution, bool pathIsClosed)
{
  Minkowski(pattern, path, solution, true, pathIsClosed);
  Clipper c;
  c.AddPaths(solution, ptSubject, true);
  c.Execute(ctUnion, solution, pftNonZero, pftNonZero);
}
//------------------------------------------------------------------------------

void TranslatePath(const Path& input, Path& output, const IntPoint delta)
{
  //Writes to 'output' a copy of 'input' with 'delta' added to every vertex.
  //precondition: input != output
  const size_t vertexCnt = input.size();
  output.resize(vertexCnt);
  for (size_t idx = 0; idx != vertexCnt; ++idx)
  {
    output[idx] = IntPoint(input[idx].X + delta.X, input[idx].Y + delta.Y);
  }
}
//------------------------------------------------------------------------------

void MinkowskiSum(const Path& pattern, const Paths& paths, Paths& solution, bool pathIsClosed)
{
  Clipper c;
  for (size_t i = 0; i < paths.size(); ++i)
  {
    Paths tmp;
    Minkowski(pattern, paths[i], tmp, true, pathIsClosed);
    c.AddPaths(tmp, ptSubject, true);
    if (pathIsClosed)
    {
      Path tmp2;
      TranslatePath(paths[i], tmp2, pattern[0]);
      c.AddPath(tmp2, ptClip, true);
    }
  }
    c.Execute(ctUnion, solution, pftNonZero, pftNonZero);
}
//------------------------------------------------------------------------------

void MinkowskiDiff(const Path& poly1, const Path& poly2, Paths& solution)
{
  Minkowski(poly1, poly2, solution, false, true);
  Clipper c;
  c.AddPaths(solution, ptSubject, true);
  c.Execute(ctUnion, solution, pftNonZero, pftNonZero);
}
//------------------------------------------------------------------------------

//Filter used by AddPolyNodeToPaths(): ntAny accepts every node, ntClosed
//accepts only nodes whose IsOpen() is false, and ntOpen makes
//AddPolyNodeToPaths() return without collecting anything.
enum NodeType {ntAny, ntOpen, ntClosed};

void AddPolyNodeToPaths(const PolyNode& polynode, NodeType nodetype, Paths& paths)
{
  //Appends the contour of 'polynode' (when non-empty and accepted by
  //'nodetype') to 'paths', then recurses into all of its children.
  if (nodetype == ntOpen) return;

  const bool accepted = (nodetype != ntClosed) || !polynode.IsOpen();
  if (accepted && !polynode.Contour.empty())
    paths.push_back(polynode.Contour);

  const int childCnt = polynode.ChildCount();
  for (int n = 0; n < childCnt; ++n)
  {
    AddPolyNodeToPaths(*polynode.Childs[n], nodetype, paths);
  }
}
//------------------------------------------------------------------------------

void PolyTreeToPaths(const PolyTree& polytree, Paths& paths)
{
  paths.resize(0); 
  paths.reserve(polytree.Total());
  AddPolyNodeToPaths(polytree, ntAny, paths);
}
//------------------------------------------------------------------------------

void ClosedPathsFromPolyTree(const PolyTree& polytree, Paths& paths)
{
  paths.resize(0); 
  paths.reserve(polytree.Total());
  AddPolyNodeToPaths(polytree, ntClosed, paths);
}
//------------------------------------------------------------------------------

void OpenPathsFromPolyTree(PolyTree& polytree, Paths& paths)
{
  //Replaces the content of 'paths' with the contours of the top-level
  //children of 'polytree' that are open.
  paths.clear();
  paths.reserve(polytree.Total());
  //Open paths are top level only, so ...
  const int childCnt = polytree.ChildCount();
  for (int n = 0; n < childCnt; ++n)
  {
    PolyNode* child = polytree.Childs[n];
    if (!child->IsOpen()) continue;
    paths.push_back(child->Contour);
  }
}
//------------------------------------------------------------------------------

std::ostream& operator <<(std::ostream &s, const IntPoint &p)
{
  //Writes the point as "(X,Y)".
  return s << "(" << p.X << "," << p.Y << ")";
}
//------------------------------------------------------------------------------

std::ostream& operator <<(std::ostream &s, const Path &p)
{
  if (p.empty()) return s;
  Path::size_type last = p.size() -1;
  for (Path::size_type i = 0; i < last; i++)
    s << "(" << p[i].X << "," << p[i].Y << "), ";
  s << "(" << p[last].X << "," << p[last].Y << ")\n";
  return s;
}
//------------------------------------------------------------------------------

std::ostream& operator <<(std::ostream &s, const Paths &p)
{
  for (Paths::size_type i = 0; i < p.size(); i++)
    s << p[i];
  s << "\n";
  return s;
}
//------------------------------------------------------------------------------

} //ClipperLib namespace


================================================
FILE: dbnet/clipper/clipper.hpp
================================================
/*******************************************************************************
*                                                                              *
* Author    :  Angus Johnson                                                   *
* Version   :  6.4.2                                                           *
* Date      :  27 February 2017                                                *
* Website   :  http://www.angusj.com                                           *
* Copyright :  Angus Johnson 2010-2017                                         *
*                                                                              *
* License:                                                                     *
* Use, modification & distribution is subject to Boost Software License Ver 1. *
* http://www.boost.org/LICENSE_1_0.txt                                         *
*                                                                              *
* Attributions:                                                                *
* The code in this library is an extension of Bala Vatti's clipping algorithm: *
* "A generic solution to polygon clipping"                                     *
* Communications of the ACM, Vol 35, Issue 7 (July 1992) pp 56-63.             *
* http://portal.acm.org/citation.cfm?id=129906                                 *
*                                                                              *
* Computer graphics and geometric modeling: implementation and algorithms      *
* By Max K. Agoston                                                            *
* Springer; 1 edition (January 4, 2005)                                        *
* http://books.google.com/books?q=vatti+clipping+agoston                       *
*                                                                              *
* See also:                                                                    *
* "Polygon Offsetting by Computing Winding Numbers"                            *
* Paper no. DETC2005-85513 pp. 565-575                                         *
* ASME 2005 International Design Engineering Technical Conferences             *
* and Computers and Information in Engineering Conference (IDETC/CIE2005)      *
* September 24-28, 2005 , Long Beach, California, USA                          *
* http://www.me.berkeley.edu/~mcmains/pubs/DAC05OffsetPolygon.pdf              *
*                                                                              *
*******************************************************************************/

#ifndef clipper_hpp
#define clipper_hpp

#define CLIPPER_VERSION "6.4.2"

//use_int32: When enabled, 32bit ints are used instead of 64bit ints. This
//improves performance but coordinate values are limited to the range +/- 46340
//#define use_int32

//use_xyz: adds a Z member to IntPoint. Adds a minor cost to performance.
//#define use_xyz

//use_lines: Enables line clipping. Adds a very minor cost to performance.
#define use_lines
  
//use_deprecated: Enables temporary support for the obsolete functions
//#define use_deprecated  

#include <vector>
#include <list>
#include <set>
#include <stdexcept>
#include <cstring>
#include <cstdlib>
#include <ostream>
#include <functional>
#include <queue>

namespace ClipperLib {

//Boolean operation selected for a clipping run: intersection, union,
//difference or exclusive-or.
enum ClipType { ctIntersection, ctUnion, ctDifference, ctXor };
//Role of a path in a clipping operation: subject or clip.
enum PolyType { ptSubject, ptClip };
//By far the most widely used winding rules for polygon filling are
//EvenOdd & NonZero (GDI, GDI+, XLib, OpenGL, Cairo, AGG, Quartz, SVG, Gr32).
//Other rules include Positive, Negative and ABS_GTR_EQ_TWO (only in OpenGL);
//see http://glprogramming.com/red/chapter11.html
enum PolyFillType { pftEvenOdd, pftNonZero, pftPositive, pftNegative };

//cInt is the integer type used for every path coordinate in this header.
//Its width is selected at compile time: int when use_int32 is defined,
//signed long long otherwise.
//loRange / hiRange are range constants of that type; presumably they bound
//the coordinate magnitudes accepted by the implementation (not shown in this
//header) - confirm against clipper.cpp.
#ifdef use_int32
  typedef int cInt;
  static cInt const loRange = 0x7FFF;
  static cInt const hiRange = 0x7FFF;
#else
  typedef signed long long cInt;
  static cInt const loRange = 0x3FFFFFFF;
  static cInt const hiRange = 0x3FFFFFFFFFFFFFFFLL;
  typedef signed long long long64;     //used by Int128 class
  typedef unsigned long long ulong64;

#endif

//Integer point (vertex) whose coordinates have type cInt. When use_xyz is
//defined the point additionally carries a Z member. Every constructor
//argument defaults to 0.
struct IntPoint {
  cInt X;
  cInt Y;
#ifdef use_xyz
  cInt Z;
  IntPoint(cInt x = 0, cInt y = 0, cInt z = 0)
    : X(x), Y(y), Z(z)
  {}
#else
  IntPoint(cInt x = 0, cInt y = 0)
    : X(x), Y(y)
  {}
#endif

  //Two points are equal when both X and Y match (Z, if present, is not
  //part of the comparison).
  friend inline bool operator== (const IntPoint& lhs, const IntPoint& rhs)
  {
    return lhs.X == rhs.X && lhs.Y == rhs.Y;
  }
  friend inline bool operator!= (const IntPoint& lhs, const IntPoint& rhs)
  {
    return !(lhs == rhs);
  }
};
//------------------------------------------------------------------------------

//A Path is an ordered list of vertices; Paths is a list of Path objects.
typedef std::vector< IntPoint > Path;
typedef std::vector< Path > Paths;

//Append helpers: push the right-hand operand onto the container and return
//the container itself so several appends can be chained.
inline Path& operator <<(Path& poly, const IntPoint& p)
{
  poly.push_back(p);
  return poly;
}
inline Paths& operator <<(Paths& polys, const Path& p)
{
  polys.push_back(p);
  return polys;
}

//Stream output operators; only declared here, the definitions live outside
//this header.
std::ostream& operator <<(std::ostream &s, const IntPoint &p);
std::ostream& operator <<(std::ostream &s, const Path &p);
std::ostream& operator <<(std::ostream &s, const Paths &p);

//Point with double-precision coordinates. Constructible from two doubles
//(both default to 0) or, implicitly, from an IntPoint.
struct DoublePoint
{
  double X;
  double Y;
  DoublePoint(double x = 0, double y = 0)
    : X(x), Y(y)
  {}
  DoublePoint(IntPoint ip)
    : X(static_cast<double>(ip.X)), Y(static_cast<double>(ip.Y))
  {}
};
//------------------------------------------------------------------------------

#ifdef use_xyz
//Signature of the user callback that fills in the Z value of a point
//created at an intersection. It receives the bottom/top points of the two
//edges involved and the new point `pt` by reference.
typedef void (*ZFillCallback)(IntPoint& e1bot, IntPoint& e1top, IntPoint& e2bot, IntPoint& e2top, IntPoint& pt);
#endif

//Bit flags (1, 2, 4) that can be OR-ed together; presumably the value
//accepted by Clipper(int initOptions) - confirm in the implementation.
enum InitOptions {ioReverseSolution = 1, ioStrictlySimple = 2, ioPreserveCollinear = 4};
//Join style passed to ClipperOffset::AddPath / AddPaths.
enum JoinType {jtSquare, jtRound, jtMiter};
//End style passed to ClipperOffset::AddPath / AddPaths (closed polygon,
//closed line, or one of three open-path end caps).
enum EndType {etClosedPolygon, etClosedLine, etOpenButt, etOpenSquare, etOpenRound};

class PolyNode;
typedef std::vector< PolyNode* > PolyNodes;

//One node of a contour tree: holds a contour (Path), a pointer to its parent
//and a list of child node pointers. Clipper and ClipperOffset are friends
//and may touch the private bookkeeping members directly.
class PolyNode 
{ 
public:
    PolyNode();
    virtual ~PolyNode(){};
    Path Contour;       //vertices of this node's path
    PolyNodes Childs;   //child nodes (raw pointers)
    PolyNode* Parent;   //parent node
    PolyNode* GetNext() const;  //traversal helper; defined outside this header
    bool IsHole() const;
    bool IsOpen() const;
    int ChildCount() const;
private:
    //PolyNode& operator =(PolyNode& other); 
    unsigned Index; //node index in Parent.Childs
    bool m_IsOpen;
    JoinType m_jointype;  //presumably set/read by ClipperOffset - confirm
    EndType m_endtype;    //presumably set/read by ClipperOffset - confirm
    PolyNode* GetNextSiblingUp() const;
    void AddChild(PolyNode& child);
    friend class Clipper; //to access Index
    friend class ClipperOffset; 
};

//Root of a PolyNode tree. In addition to the inherited node data it keeps a
//flat list of node pointers (AllNodes) that Clipper, as a friend, can access.
//The destructor calls Clear().
class PolyTree: public PolyNode
{ 
public:
    ~PolyTree(){ Clear(); };
    PolyNode* GetFirst() const;
    void Clear();
    int Total() const;
private:
  //PolyTree& operator =(PolyTree& other);
  PolyNodes AllNodes;
    friend class Clipper; //to access AllNodes
};

//Free helper functions. Only the declarations are visible here; the
//definitions (and therefore the exact semantics) live outside this header.

//Geometric queries on a single path.
bool Orientation(const Path &poly);
double Area(const Path &poly);
int PointInPolygon(const IntPoint &pt, const Path &path);

//Polygon simplification. fillType defaults to pftEvenOdd; the single-argument
//overload takes the paths by non-const reference (in/out parameter).
void SimplifyPolygon(const Path &in_poly, Paths &out_polys, PolyFillType fillType = pftEvenOdd);
void SimplifyPolygons(const Paths &in_polys, Paths &out_polys, PolyFillType fillType = pftEvenOdd);
void SimplifyPolygons(Paths &polys, PolyFillType fillType = pftEvenOdd);

//Polygon clean-up. distance defaults to 1.415; overloads without an output
//argument take the path(s) by non-const reference (in/out parameter).
void CleanPolygon(const Path& in_poly, Path& out_poly, double distance = 1.415);
void CleanPolygon(Path& poly, double distance = 1.415);
void CleanPolygons(const Paths& in_polys, Paths& out_polys, double distance = 1.415);
void CleanPolygons(Paths& polys, double distance = 1.415);

//Minkowski sum / difference; results are written to `solution`.
void MinkowskiSum(const Path& pattern, const Path& path, Paths& solution, bool pathIsClosed);
void MinkowskiSum(const Path& pattern, const Paths& paths, Paths& solution, bool pathIsClosed);
void MinkowskiDiff(const Path& poly1, const Path& poly2, Paths& solution);

//Conversions from a PolyTree to flat path lists; results go to `paths`.
void PolyTreeToPaths(const PolyTree& polytree, Paths& paths);
void ClosedPathsFromPolyTree(const PolyTree& polytree, Paths& paths);
void OpenPathsFromPolyTree(PolyTree& polytree, Paths& paths);

//Reversal of a path / of a list of paths, applied to the argument itself.
void ReversePath(Path& p);
void ReversePaths(Paths& p);

//Rectangle with cInt edges; the return type of ClipperBase::GetBounds().
struct IntRect { cInt left; cInt top; cInt right; cInt bottom; };

//enums that are used internally ...
enum EdgeSide { esLeft = 1, esRight = 2};

//forward declarations (for stuff used internally) ...
struct TEdge;
struct IntersectNode;
struct LocalMinimum;
struct OutPt;
struct OutRec;
struct Join;

//Containers of raw pointers to the internal structures declared above.
typedef std::vector < OutRec* > PolyOutList;
typedef std::vector < TEdge* > EdgeList;
typedef std::vector < Join* > JoinList;
typedef std::vector < IntersectNode* > IntersectList;

//------------------------------------------------------------------------------

//ClipperBase is the ancestor to the Clipper class. It should not be
//instantiated directly. This class simply abstracts the conversion of sets of
//polygon coordinates into edge objects that are stored in a LocalMinima list.
class ClipperBase
{
public:
  ClipperBase();
  virtual ~ClipperBase();
  //Add one path / a list of paths with the given role (subject or clip).
  //The bool result presumably reports whether the input was accepted -
  //confirm in the implementation.
  virtual bool AddPath(const Path &pg, PolyType PolyTyp, bool Closed);
  bool AddPaths(const Paths &ppg, PolyType PolyTyp, bool Closed);
  virtual void Clear();
  IntRect GetBounds();
  //Getter / setter pair for the m_PreserveCollinear flag.
  bool PreserveCollinear() {return m_PreserveCollinear;};
  void PreserveCollinear(bool value) {m_PreserveCollinear = value;};
protected:
  //Internal helpers; definitions are outside this header.
  void DisposeLocalMinimaList();
  TEdge* AddBoundsToLML(TEdge *e, bool IsClosed);
  virtual void Reset();
  TEdge* ProcessBound(TEdge* E, bool IsClockwise);
  void InsertScanbeam(const cInt Y);
  bool PopScanbeam(cInt &Y);
  bool LocalMinimaPending();
  bool PopLocalMinima(cInt Y, const LocalMinimum *&locMin);
  OutRec* CreateOutRec();
  void DisposeAllOutRecs();
  void DisposeOutRec(PolyOutList::size_type index);
  void SwapPositionsInAEL(TEdge *edge1, TEdge *edge2);
  void DeleteFromAEL(TEdge *e);
  void UpdateEdgeIntoAEL(TEdge *&e);

  //Local minima are stored by value in a vector; m_CurrentLM is an iterator
  //into that vector.
  typedef std::vector<LocalMinimum> MinimaList;
  MinimaList::iterator m_CurrentLM;
  MinimaList           m_MinimaList;

  bool              m_UseFullRange;
  EdgeList          m_edges;
  bool              m_PreserveCollinear;
  bool              m_HasOpenPaths;
  PolyOutList       m_PolyOuts;
  TEdge           *m_ActiveEdges;

  //Scanbeam Y values are kept in a std::priority_queue (largest value on top).
  typedef std::priority_queue<cInt> ScanbeamList;
  ScanbeamList     m_Scanbeam;
};
//------------------------------------------------------------------------------

//Clipper performs the clipping operation on the paths added through the
//(virtually inherited) ClipperBase interface. The result is returned either
//as a flat Paths list or as a PolyTree, depending on the Execute overload.
class Clipper : public virtual ClipperBase
{
public:
  //initOptions defaults to 0; presumably a combination of InitOptions flags -
  //confirm in the implementation.
  Clipper(int initOptions = 0);
  //Execute overloads: the first of each pair takes one fill type (default
  //pftEvenOdd), the second takes separate fill types for subject and clip
  //paths.
  bool Execute(ClipType clipType,
      Paths &solution,
      PolyFillType fillType = pftEvenOdd);
  bool Execute(ClipType clipType,
      Paths &solution,
      PolyFillType subjFillType,
      PolyFillType clipFillType);
  bool Execute(ClipType clipType,
      PolyTree &polytree,
      PolyFillType fillType = pftEvenOdd);
  bool Execute(ClipType clipType,
      PolyTree &polytree,
      PolyFillType subjFillType,
      PolyFillType clipFillType);
  //Getter / setter pairs for the m_ReverseOutput and m_StrictSimple flags.
  bool ReverseSolution() { return m_ReverseOutput; };
  void ReverseSolution(bool value) {m_ReverseOutput = value;};
  bool StrictlySimple() {return m_StrictSimple;};
  void StrictlySimple(bool value) {m_StrictSimple = value;};
  //set the callback function for z value filling on intersections (otherwise Z is 0)
#ifdef use_xyz
  void ZFillFunction(ZFillCallback zFillFunc);
#endif
protected:
  virtual bool ExecuteInternal();
private:
  //Internal state of a clipping run.
  JoinList         m_Joins;
  JoinList         m_GhostJoins;
  IntersectList    m_IntersectList;
  ClipType         m_ClipType;
  typedef std::list<cInt> MaximaList;
  MaximaList       m_Maxima;
  TEdge           *m_SortedEdges;
  bool             m_ExecuteLocked;
  PolyFillType     m_ClipFillType;
  PolyFillType     m_SubjFillType;
  bool             m_ReverseOutput;
  bool             m_UsingPolyTree; 
  bool             m_StrictSimple;
#ifdef use_xyz
  ZFillCallback   m_ZFill; //custom callback 
#endif
  //Internal algorithm steps; definitions are outside this header.
  void SetWindingCount(TEdge& edge);
  bool IsEvenOddFillType(const TEdge& edge) const;
  bool IsEvenOddAltFillType(const TEdge& edge) const;
  void InsertLocalMinimaIntoAEL(const cInt botY);
  void InsertEdgeIntoAEL(TEdge *edge, TEdge* startEdge);
  void AddEdgeToSEL(TEdge *edge);
  bool PopEdgeFromSEL(TEdge *&edge);
  void CopyAELToSEL();
  void DeleteFromSEL(TEdge *e);
  void SwapPositionsInSEL(TEdge *edge1, TEdge *edge2);
  bool IsContributing(const TEdge& edge) const;
  bool IsTopHorz(const cInt XPos);
  void DoMaxima(TEdge *e);
  void ProcessHorizontals();
  void ProcessHorizontal(TEdge *horzEdge);
  void AddLocalMaxPoly(TEdge *e1, TEdge *e2, const IntPoint &pt);
  OutPt* AddLocalMinPoly(TEdge *e1, TEdge *e2, const IntPoint &pt);
  OutRec* GetOutRec(int idx);
  void AppendPolygon(TEdge *e1, TEdge *e2);
  void IntersectEdges(TEdge *e1, TEdge *e2, IntPoint &pt);
  OutPt* AddOutPt(TEdge *e, const IntPoint &pt);
  OutPt* GetLastOutPt(TEdge *e);
  bool ProcessIntersections(const cInt topY);
  void BuildIntersectList(const cInt topY);
  void ProcessIntersectList();
  void ProcessEdgesAtTopOfScanbeam(const cInt topY);
  void BuildResult(Paths& polys);
  void BuildResult2(PolyTree& polytree);
  void SetHoleState(TEdge *e, OutRec *outrec);
  void DisposeIntersectNodes();
  bool FixupIntersectionOrder();
  void FixupOutPolygon(OutRec &outrec);
  void FixupOutPolyline(OutRec &outrec);
  bool IsHole(TEdge *e);
  bool FindOwnerFromSplitRecs(OutRec &outRec, OutRec *&currOrfl);
  void FixHoleLinkage(OutRec &outrec);
  void AddJoin(OutPt *op1, OutPt *op2, const IntPoint offPt);
  void ClearJoins();
  void ClearGhostJoins();
  void AddGhostJoin(OutPt *op, const IntPoint offPt);
  bool JoinPoints(Join *j, OutRec* outRec1, OutRec* outRec2);
  void JoinCommonEdges();
  void DoSimplePolygons();
  void FixupFirstLefts1(OutRec* OldOutRec, OutRec* NewOutRec);
  void FixupFirstLefts2(OutRec* InnerOutRec, OutRec* OuterOutRec);
  void FixupFirstLefts3(OutRec* OldOutRec, OutRec* NewOutRec);
#ifdef use_xyz
  void SetZ(IntPoint& pt, TEdge& e1, TEdge& e2);
#endif
};
//------------------------------------------------------------------------------

//ClipperOffset offsets paths by a distance `delta`. Paths are registered with
//AddPath / AddPaths together with a join type and an end type; Execute writes
//the result either to a Paths list or to a PolyTree.
class ClipperOffset 
{
public:
  //Defaults: miterLimit = 2.0, roundPrecision = 0.25.
  ClipperOffset(double miterLimit = 2.0, double roundPrecision = 0.25);
  ~ClipperOffset();
  void AddPath(const Path& path, JoinType joinType, EndType endType);
  void AddPaths(const Paths& paths, JoinType joinType, EndType endType);
  void Execute(Paths& solution, double delta);
  void Execute(PolyTree& solution, double delta);
  void Clear();
  //Public tuning parameters; presumably initialised from the constructor
  //arguments - confirm in the implementation.
  double MiterLimit;
  double ArcTolerance;
private:
  //Working buffers and cached values used while offsetting.
  Paths m_destPolys;
  Path m_srcPoly;
  Path m_destPoly;
  std::vector<DoublePoint> m_normals;
  double m_delta, m_sinA, m_sin, m_cos;
  double m_miterLim, m_StepsPerRad;
  IntPoint m_lowest;
  PolyNode m_polyNodes;

  //Internal offsetting steps; definitions are outside this header.
  void FixOrientations();
  void DoOffset(double delta);
  void OffsetPoint(int j, int& k, JoinType jointype);
  void DoSquare(int j, int k);
  void DoMiter(int j, int k, double r);
  void DoRound(int j, int k);
};
//------------------------------------------------------------------------------

//Exception type declared by the library. It stores a copy of the description
//passed to the constructor and hands it back from what().
class clipperException : public std::exception
{
  public:
    clipperException(const char* description)
      : m_descr(description)
    {}

    virtual ~clipperException() throw() {}

    virtual const char* what() const throw()
    {
      return m_descr.c_str();
    }

  private:
    std::string m_descr;
};
//------------------------------------------------------------------------------

} //ClipperLib namespace

#endif //clipper_hpp


================================================
FILE: dbnet/common.hpp
================================================
#ifndef DBNET_COMMON_H_
#define DBNET_COMMON_H_

#include <iostream>
#include <fstream>
#include <map>
#include <sstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include "dirent.h"
#include "NvInfer.h"
#include <chrono>

// Evaluates `status` exactly once. If the result is non-zero it is printed to
// std::cerr after the text "Cuda failure: " and the process is terminated
// with abort(). Wrapped in do { ... } while (0) so it behaves like a single
// statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

using namespace nvinfer1;

// Loads a weight map from a whitespace-delimited text file laid out as:
//   <number of blobs>
//   <name> <element count (decimal)> <element count x 32-bit words (hex)>
//   ... one such record per blob ...
//
// Every blob is returned as a Weights entry of type DataType::kFLOAT whose
// `values` pointer refers to a malloc'ed buffer holding the raw 32-bit
// words. The caller owns these buffers and must free() them.
//
// Failures (file cannot be opened, non-positive blob count, truncated record
// header, failed allocation) are reported through assert(), matching the
// rest of this header.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(input && "Truncated or malformed weight map file.");

        // Allocate one 32-bit word per element. The element size must be
        // used here: sizeof(val) would be the size of the pointer and would
        // over-allocate on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");

        // Load blob
        for (uint32_t x = 0; x < size; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel IScaleLayer that applies the batch-norm parameters
// stored in weightMap under `lname`:
//   scale = weight / sqrt(running_var + eps)
//   shift = bias - running_mean * weight / sqrt(running_var + eps)
//   power = 1
// The number of channels is taken from the running_var entry. The three
// freshly malloc'ed arrays are registered in weightMap as lname + ".scale",
// ".shift" and ".power", which lets whoever releases the map free them too.
// Returns the created layer (asserted non-null).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const Weights& varWts = weightMap[lname + ".running_var"];
    const float* var = static_cast<const float*>(varWts.values);
    const int len = varWts.count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // One pass over the channels fills all three arrays.
    for (int c = 0; c < len; ++c) {
        const auto stddev = sqrt(var[c] + eps);
        scval[c] = gamma[c] / stddev;
        shval[c] = beta[c] - mean[c] * gamma[c] / stddev;
        pval[c] = 1.0;
    }

    Weights scale{ DataType::kFLOAT, scval, len };
    Weights shift{ DataType::kFLOAT, shval, len };
    Weights power{ DataType::kFLOAT, pval, len };

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

// Builds convolution -> batch-norm -> activation and returns the activation
// layer.
//   outch / ksize / s / g : output channels, square kernel size, stride, groups
//   lname                 : weight prefix; ".weight" (and ".bias" when `bias`
//                           is true) are looked up under it
//   bnname                : suffix appended to lname with its last
//                           '.'-separated component removed, giving the
//                           batch-norm weight prefix
// Padding is ksize / 2 and the batch-norm epsilon is 1e-5.
// NOTE(review): despite the function name the activation type is kRELU; the
// setAlpha(0.1) call is kept unchanged - alpha is presumably ignored for
// kRELU, confirm against the TensorRT documentation.
ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname, std::string bnname, bool bias = true) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    // The bias entry is only looked up when a bias is requested.
    const Weights& kernelWts = weightMap[lname + ".weight"];
    const Weights& biasWts = bias ? weightMap[lname + ".bias"] : emptywts;

    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, kernelWts, biasWts);
    assert(conv1);

    const int pad = ksize / 2;
    conv1->setStrideNd(DimsHW{ s, s });
    conv1->setPaddingNd(DimsHW{ pad, pad });
    conv1->setNbGroups(g);

    const std::string bnPrefix = lname.substr(0, lname.find_last_of(".")) + bnname;
    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), bnPrefix, 1e-5);

    auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    lr->setAlpha(0.1);
    return lr;
}


// Builds one residual block and returns its final ReLU layer:
//   main path : 3x3 conv (stride `stride`, pad 1) -> BN -> ReLU
//               -> 3x3 conv (pad 1) -> BN
//   shortcut  : the input itself, or - when inch != outch - a 1x1 conv with
//               stride `stride` followed by BN ("downsample.0" / "downsample.1")
//   output    : ReLU(shortcut + main path)
// `lname` is the weight-name prefix and is concatenated directly with
// "conv1.weight", "bn1", ... All convolutions are bias-free and every
// batch-norm uses eps = 1e-5.
IActivationLayer* basicBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    // Main path, first convolution.
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ 3, 3 }, weightMap[lname + "conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{ stride, stride });
    conv1->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5);
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    // Main path, second convolution.
    IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + "conv2.weight"], emptywts);
    assert(conv2);
    conv2->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);

    // Shortcut: identity unless the channel count changes.
    ITensor* shortcut = &input;
    if (inch != outch) {
        IConvolutionLayer* conv3 = network->addConvolutionNd(input, outch, DimsHW{ 1, 1 }, weightMap[lname + "downsample.0.weight"], emptywts);
        assert(conv3);
        conv3->setStrideNd(DimsHW{ stride, stride });
        IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "downsample.1", 1e-5);
        shortcut = bn3->getOutput(0);
    }

    IElementWiseLayer* ew1 = network->addElementWise(*shortcut, *bn2->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* relu2 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU);
    assert(relu2);
    return relu2;
}

// Appends the name of every entry of directory `p_dir_name` to `file_names`,
// skipping "." and "..". Only the bare entry name is stored (no directory
// prefix). Returns 0 on success and -1 when the directory cannot be opened,
// in which case `file_names` is left untouched.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir = opendir(p_dir_name);
    if (!dir) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* entry_name = entry->d_name;
        const bool is_dot_entry = strcmp(entry_name, ".") == 0 ||
                                  strcmp(entry_name, "..") == 0;
        if (is_dot_entry) {
            continue;
        }
        file_names.emplace_back(entry_name);
    }

    closedir(dir);
    return 0;
}

#endif


================================================
FILE: dbnet/dbnet.cpp
================================================
#include <iostream>
#include <chrono>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"
#include <math.h>
#include "clipper.hpp"

// Build-time / post-processing configuration.
#define USE_FP16  // comment out this if want to use FP32
#define DEVICE 0  // GPU id
// The four values below are not referenced in this part of the file;
// presumably they parameterise box post-processing (EXPANDRATIO looks like
// the `ratio` given to expandBox) - confirm at the call sites.
#define EXPANDRATIO 1.5
#define BOX_MINI_SIZE 5
#define SCORE_THRESHOLD 0.3
#define BOX_THRESHOLD 0.7

// SHORT_INPUT is not referenced in this part of the file; presumably the
// short-side length handed to paddimg - confirm at the call site.
static const int SHORT_INPUT = 640;
// MIN / OPT / MAX values feed the optimisation profile in createEngine:
// min is MIN x MIN, opt is OPT_INPUT_H x OPT_INPUT_W, max is MAX x MAX.
static const int MAX_INPUT_SIZE = 1440; // 32x
static const int MIN_INPUT_SIZE = 608;
static const int OPT_INPUT_W = 1152;
static const int OPT_INPUT_H = 640;
// Names of the network input and output tensors / engine bindings.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "out";
static Logger gLogger;

// Expands the quadrilateral given by the four corners in `temp` and returns
// the minimum-area rotated rectangle of the expanded shape.
//
// The corners are truncated to integer coordinates and offset outwards with
// ClipperOffset (round joins, closed polygon) by
//     distance = |area| * ratio / perimeter.
// The absolute value of the area is used so the box is expanded regardless
// of the winding order of the corners (ClipperLib::Area is assumed to return
// a signed value - see the Clipper documentation).
//
// Degenerate input (zero perimeter) or an offset that produces no path falls
// back to the minimum-area rectangle of the un-expanded integer corners
// instead of dividing by zero / indexing an empty result.
//
// temp  : array of exactly 4 corner points
// ratio : expansion ratio
cv::RotatedRect expandBox(cv::Point2f temp[], float ratio)
{
    ClipperLib::Path path = {
        {ClipperLib::cInt(temp[0].x), ClipperLib::cInt(temp[0].y)},
        {ClipperLib::cInt(temp[1].x), ClipperLib::cInt(temp[1].y)},
        {ClipperLib::cInt(temp[2].x), ClipperLib::cInt(temp[2].y)},
        {ClipperLib::cInt(temp[3].x), ClipperLib::cInt(temp[3].y)}};

    // Perimeter of the (floating point) quadrilateral.
    double length = 0.0;
    for (int i = 0; i < 4; i++) {
        const cv::Point2f& a = temp[i];
        const cv::Point2f& b = temp[(i + 1) % 4];
        length += sqrtf(powf(a.x - b.x, 2) + powf(a.y - b.y, 2));
    }

    std::vector<cv::Point> contour;
    if (length > 0.0) {
        const double area = fabs(ClipperLib::Area(path));
        const double distance = area * ratio / length;

        ClipperLib::ClipperOffset offset;
        offset.AddPath(path, ClipperLib::JoinType::jtRound,
                       ClipperLib::EndType::etClosedPolygon);
        ClipperLib::Paths paths;
        offset.Execute(paths, distance);

        // Only the first resulting path is used, as before.
        if (!paths.empty()) {
            contour.reserve(paths[0].size());
            for (const ClipperLib::IntPoint& pt : paths[0]) {
                contour.emplace_back(static_cast<int>(pt.X), static_cast<int>(pt.Y));
            }
        }
        offset.Clear();
    }

    if (contour.empty()) {
        // Nothing to expand: use the original (integer) corners.
        for (const ClipperLib::IntPoint& pt : path) {
            contour.emplace_back(static_cast<int>(pt.X), static_cast<int>(pt.Y));
        }
    }
    return cv::minAreaRect(contour);
}

// Resizes `In_Out_img` in place so that its shorter side becomes `shortsize`
// (the longer side is scaled by the same factor and truncated to int), then
// rounds both dimensions up to the next multiple of 32.
// Returns the scale factor shortsize / original_short_side, i.e. the factor
// before the rounding to a multiple of 32.
float paddimg(cv::Mat& In_Out_img, int shortsize = 960) {
    int w = In_Out_img.cols;
    int h = In_Out_img.rows;

    const bool widthIsShort = w < h;
    const float scale = static_cast<float>(shortsize) / (widthIsShort ? w : h);
    if (widthIsShort) {
        h = static_cast<int>(scale * h);
        w = shortsize;
    }
    else {
        w = static_cast<int>(scale * w);
        h = shortsize;
    }

    // Values that are already a multiple of 32 are left unchanged.
    auto roundUpTo32 = [](int v) {
        return (v % 32 != 0) ? (v / 32 + 1) * 32 : v;
    };
    h = roundUpTo32(h);
    w = roundUpTo32(w);

    cv::resize(In_Out_img, In_Out_img, cv::Size(w, h));
    return scale;
}

// Create the engine using only the TensorRT network-definition API (no parser).
//
// Builds the network layer by layer (backbone of basicBlock stages, an FPN
// style neck, and a head with a "binarize" and a "thresh" branch), loads the
// weights from "./DBNet.wts", registers one optimisation profile for the
// dynamically sized input and builds the engine.
//
//   maxBatchSize : forwarded to IBuilder::setMaxBatchSize
//   builder      : builder used to create the network, profile and engine
//   config       : builder configuration (profile, workspace size, FP16 flag)
//   dt           : data type of the input tensor
//
// Returns the result of buildEngineWithConfig; it is not checked here, so the
// caller must test it for nullptr. All host weight buffers held in weightMap
// are freed before returning.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    // Create input tensor of shape {1, 3, -1, -1} (dynamic height and width) with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{ 1, 3, -1, -1 });
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("./DBNet.wts");
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    /* ------ Resnet18 backbone------ */
      // Stem: bias-free 7x7 convolution with 64 outputs, stride 2, padding 3.
    IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{ 7, 7 }, weightMap["backbone.conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{ 2, 2 });
    conv1->setPaddingNd(DimsHW{ 3, 3 });

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);
    // 3x3 max pooling, stride 2, padding 1.
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 });
    assert(pool1);
    pool1->setStrideNd(DimsHW{ 2, 2 });
    pool1->setPaddingNd(DimsHW{ 1, 1 });

    // Four stages of two basic blocks each; the first block of stages 2-4
    // uses stride 2 and doubles the channel count.
    IActivationLayer* relu2 = basicBlock(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "backbone.layer1.0.");
    IActivationLayer* relu3 = basicBlock(network, weightMap, *relu2->getOutput(0), 64, 64, 1, "backbone.layer1.1."); // x2

    IActivationLayer* relu4 = basicBlock(network, weightMap, *relu3->getOutput(0), 64, 128, 2, "backbone.layer2.0.");
    IActivationLayer* relu5 = basicBlock(network, weightMap, *relu4->getOutput(0), 128, 128, 1, "backbone.layer2.1."); // x3

    IActivationLayer* relu6 = basicBlock(network, weightMap, *relu5->getOutput(0), 128, 256, 2, "backbone.layer3.0.");
    IActivationLayer* relu7 = basicBlock(network, weightMap, *relu6->getOutput(0), 256, 256, 1, "backbone.layer3.1."); //x4

    IActivationLayer* relu8 = basicBlock(network, weightMap, *relu7->getOutput(0), 256, 512, 2, "backbone.layer4.0.");
    IActivationLayer* relu9 = basicBlock(network, weightMap, *relu8->getOutput(0), 512, 512, 1, "backbone.layer4.1."); //x5

    /* ------- FPN  neck ------- */
    ILayer* p5 = convBnLeaky(network, weightMap, *relu9->getOutput(0), 64, 1, 1, 1, "neck.reduce_conv_c5.conv", ".bn"); // k=1 s = 1  p = k/2=1/2=0
    ILayer* c4_1 = convBnLeaky(network, weightMap, *relu7->getOutput(0), 64, 1, 1, 1, "neck.reduce_conv_c4.conv", ".bn");

    // Upsampling is expressed as a 64-group (one group per channel)
    // deconvolution with all-ones weights. With kernel == stride == 2 every
    // input pixel is replicated into a 2x2 block. The same `deval` buffer is
    // shared by all 2x2 upsamplers below.
    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 64 * 2 * 2));
    for (int i = 0; i < 64 * 2 * 2; i++) {
        deval[i] = 1.0;
    }
    Weights deconvwts1{ DataType::kFLOAT, deval, 64 * 2 * 2 };
    IDeconvolutionLayer* p4_1 = network->addDeconvolutionNd(*p5->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts1, emptywts);
    p4_1->setStrideNd(DimsHW{ 2, 2 });
    p4_1->setNbGroups(64);
    // Registered once so that `deval` is released by the free loop at the end.
    weightMap["deconv1"] = deconvwts1;

    IElementWiseLayer* p4_add = network->addElementWise(*p4_1->getOutput(0), *c4_1->getOutput(0), ElementWiseOperation::kSUM);
    ILayer* p4 = convBnLeaky(network, weightMap, *p4_add->getOutput(0), 64, 3, 1, 1, "neck.smooth_p4.conv", ".bn");  // smooth
    ILayer* c3_1 = convBnLeaky(network, weightMap, *relu5->getOutput(0), 64, 1, 1, 1, "neck.reduce_conv_c3.conv", ".bn");

    Weights deconvwts2{ DataType::kFLOAT, deval, 64 * 2 * 2 };
    IDeconvolutionLayer* p3_1 = network->addDeconvolutionNd(*p4->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts2, emptywts);
    p3_1->setStrideNd(DimsHW{ 2, 2 });
    p3_1->setNbGroups(64);

    IElementWiseLayer* p3_add = network->addElementWise(*p3_1->getOutput(0), *c3_1->getOutput(0), ElementWiseOperation::kSUM);
    ILayer* p3 = convBnLeaky(network, weightMap, *p3_add->getOutput(0), 64, 3, 1, 1, "neck.smooth_p3.conv", ".bn");  // smooth
    ILayer* c2_1 = convBnLeaky(network, weightMap, *relu3->getOutput(0), 64, 1, 1, 1, "neck.reduce_conv_c2.conv", ".bn");

    Weights deconvwts3{ DataType::kFLOAT, deval, 64 * 2 * 2 };
    IDeconvolutionLayer* p2_1 = network->addDeconvolutionNd(*p3->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts3, emptywts);
    p2_1->setStrideNd(DimsHW{ 2, 2 });
    p2_1->setNbGroups(64);

    IElementWiseLayer* p2_add = network->addElementWise(*p2_1->getOutput(0), *c2_1->getOutput(0), ElementWiseOperation::kSUM);
    ILayer* p2 = convBnLeaky(network, weightMap, *p2_add->getOutput(0), 64, 3, 1, 1, "neck.smooth_p2.conv", ".bn");  // smooth

    // Bring p3, p4 and p5 up to the resolution of p2 (x2, x4 and x8).
    Weights deconvwts4{ DataType::kFLOAT, deval, 64 * 2 * 2 };
    IDeconvolutionLayer* p3_up_p2 = network->addDeconvolutionNd(*p3->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts4, emptywts);
    p3_up_p2->setStrideNd(DimsHW{ 2, 2 });
    p3_up_p2->setNbGroups(64);

    // All-ones 8x8 weights, shared by the two upsamplers below.
    float *deval2 = reinterpret_cast<float*>(malloc(sizeof(float) * 64 * 8 * 8));
    for (int i = 0; i < 64 * 8 * 8; i++) {
        deval2[i] = 1.0;
    }
    Weights deconvwts5{ DataType::kFLOAT, deval2, 64 * 8 * 8 };
    // NOTE(review): kernel 8 with stride 4 and padding 2 makes neighbouring
    // kernels overlap, so this is not a plain pixel replication like the
    // other upsamplers (kernel == stride) - confirm this is intended.
    IDeconvolutionLayer* p4_up_p2 = network->addDeconvolutionNd(*p4->getOutput(0), 64, DimsHW{ 8, 8 }, deconvwts5, emptywts);
    p4_up_p2->setPaddingNd(DimsHW{ 2, 2 });
    p4_up_p2->setStrideNd(DimsHW{ 4, 4 });
    p4_up_p2->setNbGroups(64);
    // Registered once so that `deval2` is released by the free loop at the end.
    weightMap["deconv2"] = deconvwts5;

    Weights deconvwts6{ DataType::kFLOAT, deval2, 64 * 8 * 8 };
    IDeconvolutionLayer* p5_up_p2 = network->addDeconvolutionNd(*p5->getOutput(0), 64, DimsHW{ 8, 8 }, deconvwts6, emptywts);
    p5_up_p2->setStrideNd(DimsHW{ 8, 8 });
    p5_up_p2->setNbGroups(64);

    // torch.cat([p2, p3, p4, p5], dim=1)
    ITensor* inputTensors[] = { p2->getOutput(0), p3_up_p2->getOutput(0), p4_up_p2->getOutput(0), p5_up_p2->getOutput(0) };
    IConcatenationLayer* neck_cat = network->addConcatenation(inputTensors, 4);

    ILayer* neck_out = convBnLeaky(network, weightMap, *neck_cat->getOutput(0), 256, 3, 1, 1, "neck.conv.0", ".1");  // smooth
    assert(neck_out);

    /* ------- head: binarize branch ------- */
    // conv+bn+relu -> x2 upsample -> bn -> relu -> x2 upsample -> 3x3 conv -> sigmoid
    ILayer* binarize1 = convBnLeaky(network, weightMap, *neck_out->getOutput(0), 64, 3, 1, 1, "head.binarize.0", ".1");
    Weights deconvwts7{ DataType::kFLOAT, deval, 64 * 2 * 2 };
    IDeconvolutionLayer* binarizeup = network->addDeconvolutionNd(*binarize1->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts7, emptywts);
    binarizeup->setStrideNd(DimsHW{ 2, 2 });
    binarizeup->setNbGroups(64);
    IScaleLayer* binarizebn1 = addBatchNorm2d(network, weightMap, *binarizeup->getOutput(0), "head.binarize.4", 1e-5);
    IActivationLayer* binarizerelu1 = network->addActivation(*binarizebn1->getOutput(0), ActivationType::kRELU);
    assert(binarizerelu1);

    Weights deconvwts8{ DataType::kFLOAT, deval, 64 * 2 * 2 };
    IDeconvolutionLayer* binarizeup2 = network->addDeconvolutionNd(*binarizerelu1->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts8, emptywts);
    binarizeup2->setStrideNd(DimsHW{ 2, 2 });
    binarizeup2->setNbGroups(64);

    IConvolutionLayer* binarize3 = network->addConvolutionNd(*binarizeup2->getOutput(0), 1, DimsHW{ 3, 3 }, weightMap["head.binarize.7.weight"], weightMap["head.binarize.7.bias"]);
    assert(binarize3);
    binarize3->setStrideNd(DimsHW{ 1, 1 });
    binarize3->setPaddingNd(DimsHW{ 1, 1 });
    IActivationLayer* binarize4 = network->addActivation(*binarize3->getOutput(0), ActivationType::kSIGMOID);
    assert(binarize4);

    /* ------- head: threshold branch ------- */
    //threshold_maps = self.thresh(x)
    // Same layout as the binarize branch, with an extra 3x3 convolution after
    // each upsampling step; the first conv is created without bias.
    ILayer* thresh1 = convBnLeaky(network, weightMap, *neck_out->getOutput(0), 64, 3, 1, 1, "head.thresh.0", ".1", false);
    Weights deconvwts9{ DataType::kFLOAT, deval, 64 * 2 * 2 };
    IDeconvolutionLayer* threshup = network->addDeconvolutionNd(*thresh1->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts9, emptywts);
    threshup->setStrideNd(DimsHW{ 2, 2 });
    threshup->setNbGroups(64);
    IConvolutionLayer* thresh2 = network->addConvolutionNd(*threshup->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["head.thresh.3.1.weight"], weightMap["head.thresh.3.1.bias"]);
    assert(thresh2);
    thresh2->setStrideNd(DimsHW{ 1, 1 });
    thresh2->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* threshbn1 = addBatchNorm2d(network, weightMap, *thresh2->getOutput(0), "head.thresh.4", 1e-5);
    IActivationLayer* threshrelu1 = network->addActivation(*threshbn1->getOutput(0), ActivationType::kRELU);
    assert(threshrelu1);

    Weights deconvwts10{ DataType::kFLOAT, deval, 64 * 2 * 2 };
    IDeconvolutionLayer* threshup2 = network->addDeconvolutionNd(*threshrelu1->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts10, emptywts);
    threshup2->setStrideNd(DimsHW{ 2, 2 });
    threshup2->setNbGroups(64);
    IConvolutionLayer* thresh3 = network->addConvolutionNd(*threshup2->getOutput(0), 1, DimsHW{ 3, 3 }, weightMap["head.thresh.6.1.weight"], weightMap["head.thresh.6.1.bias"]);
    assert(thresh3);
    thresh3->setStrideNd(DimsHW{ 1, 1 });
    thresh3->setPaddingNd(DimsHW{ 1, 1 });
    IActivationLayer* thresh4 = network->addActivation(*thresh3->getOutput(0), ActivationType::kSIGMOID);
    assert(thresh4);

    // Network output: the two single-channel sigmoid maps concatenated
    // (binarize map first, threshold map second).
    ITensor* inputTensors2[] = { binarize4->getOutput(0), thresh4->getOutput(0) };
    IConcatenationLayer* head_out = network->addConcatenation(inputTensors2, 2);

    // y = F.interpolate(y, size=(H, W)) 
    head_out->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*head_out->getOutput(0));

    // Optimisation profile for the dynamic input: min / opt / max shapes.
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims4(1, 3, MIN_INPUT_SIZE, MIN_INPUT_SIZE));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims4(1, 3, OPT_INPUT_H, OPT_INPUT_W));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims4(1, 3, MAX_INPUT_SIZE, MAX_INPUT_SIZE));
    config->addOptimizationProfile(profile);

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // NOTE(review): this message is printed unconditionally; `engine` is not
    // checked here.
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return engine;
}

// Builds the engine through the TensorRT API and hands back its serialized form.
//
// maxBatchSize: forwarded unchanged to createEngine().
// modelStream:  out-parameter that receives the serialized engine; the caller
//               is responsible for calling destroy() on it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder and its configuration object
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The config was previously never released.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference for a single 1x3xHxW input.
//
// context: execution context of an engine with exactly two bindings.
// input:   host buffer holding 3 * h_scale * w_scale floats.
// output:  host buffer with room for 2 * h_scale * w_scale floats.
// h_scale: input height passed to setBindingDimensions.
// w_scale: input width passed to setBindingDimensions.
//
// Device buffers and the stream are created and released on every call.
// If the engine rejects the shape or the enqueue fails, an error is printed
// and `output` is left untouched.
void doInference(IExecutionContext& context, float* input, float* output, int h_scale, int w_scale) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    if (!context.setBindingDimensions(inputIndex, Dims4(1, 3, h_scale, w_scale))) {
        std::cerr << "doInference: engine rejected input shape 1x3x" << h_scale << "x" << w_scale << std::endl;
        return;
    }

    // Byte sizes, computed once in size_t so the products cannot overflow int
    const size_t planeElems = static_cast<size_t>(h_scale) * static_cast<size_t>(w_scale);
    const size_t inputBytes = 3 * planeElems * sizeof(float);
    const size_t outputBytes = 2 * planeElems * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (context.enqueueV2(buffers, stream, nullptr)) {
        CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    } else {
        std::cerr << "doInference: enqueueV2 failed, output not updated" << std::endl;
    }
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Extracts the four corners of `rotated_rect` into `rect` in a fixed order and
// reports whether the rectangle is large enough.
//
// Output order (as produced by the code below):
//   rect[0] - of the two smallest-x corners, the one with the smaller y
//   rect[1] - of the two largest-x corners,  the one with the smaller y
//   rect[2] - of the two largest-x corners,  the one with the larger y
//   rect[3] - of the two smallest-x corners, the one with the larger y
// With y growing downwards this reads top-left, top-right, bottom-right,
// bottom-left.
//
// Returns false when either side of `rotated_rect` is shorter than `min_size`;
// `rect` is filled in either way.
bool get_mini_boxes(cv::RotatedRect& rotated_rect, cv::Point2f rect[],
                    int min_size)
{
    cv::Point2f corners[4];
    rotated_rect.points(corners);

    // Exchange sort of the four corners by ascending x
    for (int a = 0; a < 3; ++a) {
        for (int b = a + 1; b < 4; ++b) {
            if (corners[a].x > corners[b].x) {
                const cv::Point2f held = corners[a];
                corners[a] = corners[b];
                corners[b] = held;
            }
        }
    }

    // Left pair: corners[0], corners[1]. Right pair: corners[2], corners[3].
    const bool leftFirstIsUpper = corners[1].y > corners[0].y;
    const bool rightFirstIsUpper = corners[3].y > corners[2].y;

    const int index0 = leftFirstIsUpper ? 0 : 1;
    const int index3 = leftFirstIsUpper ? 1 : 0;
    const int index1 = rightFirstIsUpper ? 2 : 3;
    const int index2 = rightFirstIsUpper ? 3 : 2;

    rect[0] = corners[index0];  // left pair, smaller y
    rect[1] = corners[index1];  // right pair, smaller y
    rect[2] = corners[index2];  // right pair, larger y
    rect[3] = corners[index3];  // left pair, larger y

    // Written as a negated "too small" test so the comparison semantics match
    // the original exactly.
    const bool tooSmall = rotated_rect.size.width < min_size ||
                          rotated_rect.size.height < min_size;
    return !tooSmall;
}

// Mean of the map values above `threshold` inside the axis-aligned bounding
// box of the four points in `rect`.
//
// map:       row-major buffer indexed as map[y * width + x].
// rect:      four corner points; coordinates are truncated to int.
// width:     row length of `map`.
// height:    number of rows of `map`.
// threshold: only values strictly greater than this contribute.
//
// The bounding box is clamped to [0, width-1] x [0, height-1] so corners that
// fall outside the map cannot cause out-of-bounds reads. Returns 0 when no
// value exceeds the threshold (previously this divided 0 by 0).
float get_box_score(float* map, cv::Point2f rect[], int width, int height,
                    float threshold)
{

    int xmin = width - 1;
    int ymin = height - 1;
    int xmax = 0;
    int ymax = 0;

    for (int j = 0; j < 4; j++) {
        if (rect[j].x < xmin) {
            xmin = rect[j].x;
        }
        if (rect[j].y < ymin) {
            ymin = rect[j].y;
        }
        if (rect[j].x > xmax) {
            xmax = rect[j].x;
        }
        if (rect[j].y > ymax) {
            ymax = rect[j].y;
        }
    }

    // Clamp to the valid index range of the map
    if (xmin < 0) {
        xmin = 0;
    }
    if (ymin < 0) {
        ymin = 0;
    }
    if (xmax > width - 1) {
        xmax = width - 1;
    }
    if (ymax > height - 1) {
        ymax = height - 1;
    }

    float sum = 0;
    int num = 0;
    for (int i = ymin; i <= ymax; i++) {
        const float* row = map + static_cast<size_t>(i) * width;
        for (int j = xmin; j <= xmax; j++) {
            if (row[j] > threshold) {
                sum = sum + row[j];
                num++;
            }
        }
    }

    // No pixel above the threshold: report the lowest score instead of NaN
    if (num == 0) {
        return 0.0f;
    }
    return sum / num;
}

int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{ nullptr };
    size_t size{ 0 };

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{ nullptr };
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("DBNet.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }
    else if (argc == 3 && std::string(argv[1]) == "-d") {
        std::ifstream file("DBNet.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    }
    else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./debnet -s  // serialize model to plan file" << std::endl;
        std::cerr << "./debnet -d ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // prepare input data ---------------------------
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    std::vector<std::string> file_names;
    if (read_files_in_dir(argv[2], file_names) < 0) {
        std::cout << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // icdar2015.yaml Hyperparameter
    std::vector<float> mean_value{ 0.406, 0.456, 0.485 };  // BGR
    std::vector<float> std_value{ 0.225, 0.224, 0.229 };

    int fcount = 0;

    for (auto f : file_names) {
        fcount++;
        std::cout << fcount << "  " << f << std::endl;
        cv::Mat pr_img = cv::imread(std::string(argv[2]) + "/" + f);
        cv::Mat src_img = pr_img.clone();
        if (pr_img.empty()) continue;
        float scale = paddimg(pr_img, SHORT_INPUT); // resize the image
        std::cout << "letterbox shape: " << pr_img.cols << ", " << pr_img.rows << std::endl;
        if (pr_img.cols < MIN_INPUT_SIZE || pr_img.rows < MIN_INPUT_SIZE) continue;
        float* data = new float[3 * pr_img.rows * pr_img.cols];

        auto start = std::chrono::system_clock::now();
        int i = 0;
        for (int row = 0; row < pr_img.rows; ++row) {
            uchar* uc_pixel = pr_img.data + row * pr_img.step;
            for (int col = 0; col < pr_img.cols; ++col) {
                data[i] = (uc_pixel[2] / 255.0 - mean_value[2]) / std_value[2];
                data[i + pr_img.rows * pr_img.cols] = (uc_pixel[1] / 255.0 - mean_value[1]) / std_value[1];
                data[i + 2 * pr_img.rows * pr_img.cols] = (uc_pixel[0] / 255.0 - mean_value[0]) / std_value[0];
                uc_pixel += 3;
                ++i;
            }
        }
        auto end = std::chrono::system_clock::now();
        std::cout << "pre time:"<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        float* prob = new float[pr_img.rows *pr_img.cols * 2];
        // Run inference
        start = std::chrono::system_clock::now();
        doInference(*context, data, prob, pr_img.rows, pr_img.cols);
        end = std::chrono::system_clock::now();
        std::cout << "detect time:"<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        // prob shape is 2*640*640, get the first one
        cv::Mat map = cv::Mat::zeros(cv::Size(pr_img.cols, pr_img.rows), CV_8UC1);
        for (int h = 0; h < pr_img.rows; ++h) {
            uchar *ptr = map.ptr(h);
            for (int w = 0; w < pr_img.cols; ++w) {
                ptr[w] = (prob[h * pr_img.cols + w] > 0.3) ? 255 : 0;
            }
        }

        // Extracting minimum circumscribed rectangle
        std::vector<std::vector<cv::Point>> contours;
        std::vector<cv::Vec4i> hierarcy;
        cv::findContours(map, contours, hierarcy, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE);

        std::vector<cv::Rect> boundRect(contours.size());
        std::vector<cv::RotatedRect> box(contours.size());
        cv::Point2f rect[4];
        cv::Point2f order_rect[4];

        for (int i = 0; i < contours.size(); i++) {
            cv::RotatedRect rotated_rect = cv::minAreaRect(cv::Mat(contours[i]));
            if (!get_mini_boxes(rotated_rect, rect, BOX_MINI_SIZE)) {
                std::cout << "box too small" <<  std::endl;
                continue;
            }

            // drop low score boxes
            float score = get_box_score(prob, rect, pr_img.cols, pr_img.rows,
                                        SCORE_THRESHOLD);
            if (score < BOX_THRESHOLD) {
                std::cout << "score too low =  " << score << ", threshold = " << BOX_THRESHOLD <<  std::endl;
                continue;
            }

            // Scaling the predict boxes depend on EXPANDRATIO
            cv::RotatedRect expandbox = expandBox(rect, EXPANDRATIO);
            expandbox.points(rect);
            if (!get_mini_boxes(expandbox, rect, BOX_MINI_SIZE + 2)) {  
                continue;
            }

            // Restore the coordinates to the original image
            for (int k = 0; k < 4; k++) {
                order_rect[k] = rect[k];
                order_rect[k].x = int(order_rect[k].x / pr_img.cols * src_img.cols);
                order_rect[k].y = int(order_rect[k].y / pr_img.rows * src_img.rows);
            }
            
            cv::rectangle(src_img, cv::Point(order_rect[0].x,order_rect[0].y), cv::Point(order_rect[2].x,order_rect[2].y), cv::Scalar(0, 0, 255), 2, 8);
            //std::cout << "After LT =  " << order_rect[0] << ", After RD = " << order_rect[2] <<  std::endl;            
        }

        cv::imwrite("_" + f, src_img);
        std::cout << "write image done." << std::endl;
        //cv::waitKey(0);

        delete prob;
        delete data;
    }

    return 0;
}

================================================
FILE: dbnet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, on sync or destruction, writes its contents to a
//!  target stream preceded by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the formatted output
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() writes nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, prefix and logging flag. Previously only
    //! the stream was taken, leaving mShouldLog uninitialized and the prefix empty.
    //! Buffered text of \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the target stream, then
    //! empties the buffer and flushes. Does nothing when logging is disabled
    //! (the buffered text is kept in that case).
    void putOutput()
    {
        if (mShouldLog)
        {
            // Format the timestamp in a scratch stream so that neither the fill
            // character nor the width of the target stream is modified, and so
            // that the timestamp goes to the same stream as the message.
            std::ostringstream stamp;
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                stamp << std::setfill('0') << "[";
                stamp << std::setw(2) << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << tm_local->tm_mday << "/";
                stamp << std::setw(4) << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << tm_local->tm_hour << ":";
                stamp << std::setw(2) << tm_local->tm_min << ":";
                stamp << std::setw(2) << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // target stream
    std::string mPrefix;   // inserted after the timestamp
    bool mShouldLog;       // gate checked by putOutput()
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
public:
    //! Constructs the owned buffer; the three arguments are passed straight
    //! through to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog) {}

protected:
    //! Buffer made available to derived classes
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Builds a stream that logs at the given \p severity.
    //!  Output is enabled only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // the base-class buffer is already constructed here
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Creates a new consumer with the severity and logging flag of \p other.
    //!  A fresh buffer is constructed; text buffered in \p other is not carried over.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // the base-class buffer is already constructed here
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs against a new threshold
    //!  and pushes the result down to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! std::cout for severities >= kINFO, std::cerr otherwise
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Short bracketed tag for the severity; asserts on an unknown value
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //! \brief Creates a logger whose reportable severity is \p severity (kWARNING by default)
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief State of a test as reported through this class
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Returns this object viewed through its nvinfer1::ILogger interface
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Streams "[TRT] " followed by \p msg into a LogStreamConsumer created for
    //! \p severity and the current reportable severity.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer sink(mReportableSeverity, severity);
        sink << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! Instances are obtained from Logger::defineTest() and passed to
    //! Logger::reportTest{Start,End}(); only Logger can construct or inspect them.
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set by reportTestStart()
        std::string mName;    // printed in every result line
        std::string mCmdline; // printed after " # " in every result line
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        const bool started = false;
        return TestAtom(started, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() that joins an argument array into the command line
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // The RUNNING line is printed before the precondition is asserted
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED and returns EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED and returns EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED and returns EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() or reportFail() depending on \p pass
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        if (pass)
        {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Current reportable severity
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        const char* text = "";
        switch (result)
        {
        case TestResult::kRUNNING: text = "RUNNING"; break;
        case TestResult::kPASSED: text = "PASSED"; break;
        case TestResult::kFAILED: text = "FAILED"; break;
        case TestResult::kWAIVED: text = "WAIVED"; break;
        default: assert(0); break;
        }
        return text;
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief prints "&&&& <RESULT> <name> # <cmdline>" on the stream used for kINFO
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief joins the (argc, argv) values into one string separated by single spaces
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream joined;
        for (int idx = 0; idx < argc; ++idx)
        {
            if (idx != 0)
            {
                joined << " ";
            }
            joined << argv[idx];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace
{

// Each helper below returns a LogStreamConsumer bound to one fixed severity and
// to the logger's current reportable severity. Typical use:
//
//     LOG_INFO(logger) << "hello world" << std::endl;

//!
//! \brief stream for messages of severity kVERBOSE
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief stream for messages of severity kINFO
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief stream for messages of severity kWARNING
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief stream for messages of severity kERROR
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief stream for messages of severity kINTERNAL_ERROR ("fatal" severity)
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: dbnet/utils.h
================================================
#ifndef __TRT_UTILS_H_
#define __TRT_UTILS_H_

#include <algorithm>
#include <cstring>
#include <iostream>
#include <vector>
#include <cudnn.h>

#ifndef CUDA_CHECK

// CUDA_CHECK(callstr): evaluates `callstr` once, stores the result in a
// cudaError_t and, when it differs from cudaSuccess, prints
// "CUDA error <code> at <file>:<line>" to std::cerr (no trailing newline) and
// then hits assert(0).
// NOTE(review): the failure path relies on assert, so when NDEBUG is defined
// execution continues after the message is printed.
// The macro expands to a plain brace block and is only defined here when no
// earlier definition of CUDA_CHECK exists.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

#endif

namespace Tn
{
    class Profiler : public nvinfer1::IProfiler
    {
    public:
        void printLayerTimes(int itrationsTimes)
        {
            float totalTime = 0;
            for (size_t i = 0; i < mProfile.size(); i++)
            {
                printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes);
                totalTime += mProfile[i].second;
            }
            printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes);
        }
    private:
        typedef std::pair<std::string, float> Record;
        std::vector<Record> mProfile;

        virtual void reportLayerTime(const char* layerName, float ms)
        {
            auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
            if (record == mProfile.end())
                mProfile.push_back(std::make_pair(layerName, ms));
            else
                record->second += ms;
        }
    };

    //Logger for TensorRT info/warning/errors
    // Minimal ILogger that writes every reportable message to std::cerr as
    // "<LABEL>: <msg>".
    class Logger : public nvinfer1::ILogger
    {
    public:

        // Defaults the reportable severity to kWARNING
        Logger(): Logger(Severity::kWARNING) {}

        // severity: messages whose enum value is greater than this are dropped
        Logger(Severity severity): reportableSeverity(severity) {}

        void log(Severity severity, const char* msg) override
        {
            // suppress messages with severity enum value greater than the reportable
            if (severity > reportableSeverity) return;

            switch (severity)
            {
                case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
                case Severity::kERROR: std::cerr << "ERROR: "; break;
                case Severity::kWARNING: std::cerr << "WARNING: "; break;
                case Severity::kINFO: std::cerr << "INFO: "; break;
                // kVERBOSE used to fall through to "UNKNOWN: "
                case Severity::kVERBOSE: std::cerr << "VERBOSE: "; break;
                default: std::cerr << "UNKNOWN: "; break;
            }
            std::cerr << msg << std::endl;
        }

        Severity reportableSeverity{Severity::kWARNING};
    };

    // Copies the object representation of `val` to `buffer` and advances
    // `buffer` by sizeof(T).
    //
    // buffer: in/out cursor into a byte buffer with at least sizeof(T) bytes left;
    //         it does not need to be aligned for T.
    // val:    value to store; T is expected to be trivially copyable.
    template<typename T>
    void write(char*& buffer, const T& val)
    {
        // memcpy instead of a reinterpret_cast store: the cursor may sit at any
        // byte offset, and the cast would be an unaligned, aliasing-violating write.
        std::memcpy(buffer, &val, sizeof(T));
        buffer += sizeof(T);
    }

    // Copies sizeof(T) bytes from `buffer` into `val` and advances `buffer`
    // by sizeof(T).
    //
    // buffer: in/out cursor into a byte buffer with at least sizeof(T) bytes left;
    //         it does not need to be aligned for T.
    // val:    receives the value; T is expected to be trivially copyable.
    template<typename T>
    void read(const char*& buffer, T& val)
    {
        // memcpy instead of a reinterpret_cast load: the cursor may sit at any
        // byte offset, and the cast would be an unaligned, aliasing-violating read.
        std::memcpy(&val, buffer, sizeof(T));
        buffer += sizeof(T);
    }
}

#endif

================================================
FILE: densenet/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.10)

# set the project name
project(densenet)

add_definitions(-std=c++11)

# get main project dir to include common files
get_filename_component(MAIN_DIR ../ ABSOLUTE)

# When enabled the static version of the
# CUDA runtime library will be used in CUDA_LIBRARIES.
# option() takes (<variable> "<help text>" [value]); without the help string
# the word OFF was being used as the description.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime library" OFF)

# specify the C++ standard
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Default to a Debug build, but let -DCMAKE_BUILD_TYPE=... override it
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

# include and link cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)

# include and link tensorrt
include_directories(/usr/include/x86_64-linux-gnu)
link_directories(/usr/lib/x86_64-linux-gnu)

# add the executable
add_executable(densenet ${PROJECT_SOURCE_DIR}/densenet121.cpp)

target_link_libraries(densenet nvinfer)
target_link_libraries(densenet cudart)

add_definitions(-O2 -pthread)

================================================
FILE: densenet/README.md
================================================
# Densenet121

The PyTorch implementation is [makaveli10/densenet](https://github.com/makaveli10/torchtrtz/tree/main/densenet). The model comes from torchvision.
The TensorRT implementation is taken from [makaveli10/cpptensorrtz](https://github.com/makaveli10/cpptensorrtz/).

## How to Run

1. Generate `densenet121.wts` from PyTorch

```
git clone https://github.com/wang-xinyu/tensorrtx.git
git clone https://github.com/makaveli10/torchtrtz.git

// go to torchtrtz/densenet
// Enter these two commands to create densenet121.wts
python models.py
python gen_trtwts.py
```

2. build densenet and run

```
// put densenet121.wts into tensorrtx/densenet
// go to tensorrtx/densenet
mkdir build
cd build
cmake ..
make
sudo ./densenet -s  // serialize model to file i.e. 'densenet.engine'
sudo ./densenet -d  // deserialize model and run inference
```

3. Verify output from [torch impl](https://github.com/makaveli10/torchtrtz/blob/main/densenet/README.md)

TensorRT output[:5]:
```
    [-0.587389, -0.329202, -1.83404, -1.89935, -0.928404]
```


================================================
FILE: densenet/densenet121.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>

// Evaluates `status` exactly once. A zero result is success; any other value
// is printed to stderr as "Cuda failure: <value>" and the process is aborted.
// Wrapped in do/while(0) so the macro behaves as a single statement.
#define CHECK(status)                                          \
    do {                                                       \
        auto ret = (status);                                   \
        if (ret == 0) break;                                   \
        std::cerr << "Cuda failure: " << ret << std::endl;     \
        abort();                                               \
    } while (0)

// stuff we know about the network and the input/output blobs
// Input height and width: the network input is declared as 3 x INPUT_H x INPUT_W.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// Number of floats copied back from the output blob per batch item.
static const int OUTPUT_SIZE = 1000;

// Tensor names used when defining the network and when looking up the
// engine's binding indices at inference time.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance handed to createInferBuilder() and createInferRuntime().
static Logger gLogger;

// Load weights from files shared with TensorRT samples.
// The weight file is whitespace delimited text:
//   <number of blobs>
//   <name> <size (decimal)> <size values, each a 32-bit word in hex>
//   ...
//
// Returns a map from blob name to a Weights descriptor (always typed kFLOAT).
// Each descriptor's `values` points at a malloc()ed buffer holding the raw
// 32-bit words; the caller owns those buffers and must free() them.
// Open/parse/allocation failures are reported through assert().
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(!input.fail() && "Malformed blob header in weight file.");

        // Load blob. Note sizeof(*val) (one 32-bit element), not sizeof(val):
        // the latter is the size of the pointer and over-allocates every blob.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        // malloc(0) may legitimately return nullptr, so only a non-empty blob must succeed.
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x)
        {
            input >> std::hex >> val[x];
        }
        // A short or corrupt file would otherwise leave uninitialised weights behind.
        assert(!input.fail() && "Truncated or malformed weight file.");

        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer that stands in for a BatchNorm2d layer.
//
// Reads `<lname>.weight`, `.bias`, `.running_mean` and `.running_var` from
// weightMap and derives, for every channel i:
//   scale[i] = gamma[i] / sqrt(var[i] + eps)
//   shift[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps)
//   power[i] = 1
// The three malloc()ed arrays are stored back into weightMap under
// `<lname>.scale`, `.shift` and `.power` and handed to addScale() in
// kCHANNEL mode. Returns the created layer.
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* const gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* const beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* const mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* const var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    const int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    float* const scval = static_cast<float*>(malloc(sizeof(float) * len));
    float* const shval = static_cast<float*>(malloc(sizeof(float) * len));
    float* const pval = static_cast<float*>(malloc(sizeof(float) * len));

    // One pass fills all three per-channel arrays.
    for (int ch = 0; ch < len; ++ch) {
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Registering the buffers in the map lets whoever frees the map's values release them too.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Adds one dense layer: BN -> ReLU -> 1x1 conv (128 maps) -> BN -> ReLU -> 3x3 conv (32 maps).
// Weights are looked up in weightMap under `<lname>.norm1`, `.conv1.weight`,
// `.norm2` and `.conv2.weight`; both convolutions are bias-free.
// Returns the final 3x3 convolution layer.
IConvolutionLayer* addDenseLayer(INetworkDefinition* network, ITensor* input, std::map<std::string, Weights>& weightMap, std::string lname, float eps)
{
    // Empty weights stand in for the absent convolution bias.
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    // First stage: batch norm, ReLU, 1x1 convolution.
    IScaleLayer* normA = addBatchNorm2d(network, weightMap, *input, lname + ".norm1", eps);
    ITensor* normAOut = normA->getOutput(0);
    IActivationLayer* actA = network->addActivation(*normAOut, ActivationType::kRELU);
    assert(actA);

    ITensor* actAOut = actA->getOutput(0);
    IConvolutionLayer* convA = network->addConvolutionNd(*actAOut, 128, DimsHW{1, 1}, weightMap[lname + ".conv1.weight"], noBias);
    assert(convA);
    convA->setStrideNd(DimsHW{1, 1});

    // Second stage: batch norm, ReLU, 3x3 convolution with padding 1.
    IScaleLayer* normB = addBatchNorm2d(network, weightMap, *convA->getOutput(0), lname + ".norm2", eps);
    ITensor* normBOut = normB->getOutput(0);
    IActivationLayer* actB = network->addActivation(*normBOut, ActivationType::kRELU);
    assert(actB);

    ITensor* actBOut = actB->getOutput(0);
    IConvolutionLayer* convB = network->addConvolutionNd(*actBOut, 32, DimsHW{3, 3}, weightMap[lname + ".conv2.weight"], noBias);
    assert(convB);
    convB->setStrideNd(DimsHW{1, 1});
    convB->setPaddingNd(DimsHW{1, 1});

    return convB;
}


// Adds a transition block: BN -> ReLU -> bias-free 1x1 conv with `outch`
// output maps -> 2x2 average pooling with stride 2 and no padding.
// Weights are looked up under `<lname>.norm` and `<lname>.conv.weight`.
// Returns the pooling layer.
IPoolingLayer* addTransition(INetworkDefinition* network, ITensor& input, std::map<std::string, Weights>& weightMap, int outch, std::string lname, float eps)
{
    // Batch norm followed by ReLU.
    IScaleLayer* norm = addBatchNorm2d(network, weightMap, input, lname + ".norm", eps);
    ITensor* normOut = norm->getOutput(0);
    IActivationLayer* act = network->addActivation(*normOut, ActivationType::kRELU);
    assert(act);

    // 1x1 convolution; empty weights mean "no bias".
    Weights noBias{DataType::kFLOAT, nullptr, 0};
    ITensor* actOut = act->getOutput(0);
    IConvolutionLayer* conv = network->addConvolutionNd(*actOut, outch, DimsHW{1, 1}, weightMap[lname + ".conv.weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{1, 1});

    // 2x2 average pooling, stride 2.
    ITensor* convOut = conv->getOutput(0);
    IPoolingLayer* pool = network->addPoolingNd(*convOut, PoolingType::kAVERAGE, DimsHW{2, 2});
    assert(pool);
    pool->setStrideNd(DimsHW{2, 2});
    pool->setPaddingNd(DimsHW{0, 0});

    return pool;
}


// Adds a dense block of `numDenseLayers` dense layers.
//
// Layer k (1-based, weights under `<lname>.denselayer<k>`) receives the
// concatenation of the block input and the outputs of layers 1..k-1.
// Returns the concatenation of the block input with the outputs of all
// layers. `numDenseLayers` must be at least 1.
IConcatenationLayer* addDenseBlock(INetworkDefinition* network, ITensor* input, std::map<std::string, Weights>& weightMap, int numDenseLayers, std::string lname, float eps)
{
    assert(numDenseLayers > 0);

    // std::vector instead of a variable-length array: VLAs are a compiler
    // extension and not part of standard C++.
    std::vector<ITensor*> inputTensors;
    inputTensors.reserve(numDenseLayers + 1);
    inputTensors.push_back(input);

    IConcatenationLayer* concat{nullptr};
    IConvolutionLayer* c = addDenseLayer(network, input, weightMap, lname + ".denselayer" + std::to_string(1), eps);
    for (int i = 1; i < numDenseLayers; i++)
    {
        // Feed everything produced so far into the next dense layer.
        inputTensors.push_back(c->getOutput(0));
        concat = network->addConcatenation(inputTensors.data(), static_cast<int>(inputTensors.size()));
        assert(concat);
        c = addDenseLayer(network, concat->getOutput(0), weightMap, lname + ".denselayer" + std::to_string(i + 1), eps);
    }

    // Final concatenation includes the last layer's output as well.
    inputTensors.push_back(c->getOutput(0));
    concat = network->addConcatenation(inputTensors.data(), static_cast<int>(inputTensors.size()));
    assert(concat);
    return concat;
}


/**
 * Uses the TensorRT API to create the network engine.
 *
 * Builds the DenseNet-121 graph layer by layer with weights loaded from
 * "../densenet121.wts", configures the builder (max batch size, 256 MiB
 * workspace) and builds the engine. The network definition and the host
 * weight buffers are released before returning.
 *
 * Returns the built engine (may be nullptr if the build failed; the caller
 * checks). The caller owns the returned engine.
**/
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    // Initialize NetworkDefinition
    INetworkDefinition* network = builder -> createNetworkV2(0U);
    assert(network);

    auto data = network -> addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../densenet121.wts");
    // Empty weights stand in for the absent convolution bias.
    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    auto conv0 = network -> addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["features.conv0.weight"], emptywts);
    assert(conv0);
    conv0 -> setStrideNd(DimsHW{2, 2});
    conv0 -> setPaddingNd(DimsHW{3, 3});

    auto norm0 = addBatchNorm2d(network, weightMap, *conv0 -> getOutput(0), "features.norm0", 1e-5);

    auto relu0 = network -> addActivation(*norm0 -> getOutput(0), ActivationType::kRELU);
    assert(relu0);

    auto pool0 = network -> addPoolingNd(*relu0 -> getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool0);
    pool0 -> setStrideNd(DimsHW{2, 2});
    pool0 -> setPaddingNd(DimsHW{1, 1});

    // Four dense blocks (6, 12, 24, 16 layers) separated by three transitions.
    auto dense1 = addDenseBlock(network, pool0 -> getOutput(0), weightMap, 6, "features.denseblock1", 1e-5);
    auto transition1 = addTransition(network, *dense1 -> getOutput(0), weightMap, 128, "features.transition1", 1e-5);

    auto dense2 = addDenseBlock(network, transition1 -> getOutput(0), weightMap, 12, "features.denseblock2", 1e-5);
    auto transition2 = addTransition(network, *dense2 -> getOutput(0), weightMap, 256, "features.transition2", 1e-5);

    auto dense3 = addDenseBlock(network, transition2 -> getOutput(0), weightMap, 24, "features.denseblock3", 1e-5);
    auto transition3 = addTransition(network, *dense3 -> getOutput(0), weightMap, 512, "features.transition3", 1e-5);

    auto dense4 = addDenseBlock(network, transition3 -> getOutput(0), weightMap, 16, "features.denseblock4", 1e-5);

    auto bn5 = addBatchNorm2d(network, weightMap, *dense4 -> getOutput(0), "features.norm5", 1e-5);
    auto relu5 = network -> addActivation(*bn5 -> getOutput(0), ActivationType::kRELU);
    assert(relu5);

    // adaptive average pool => pytorch (F.adaptive_avg_pool2d(input, (1, 1)))
    auto pool5 = network -> addPoolingNd(*relu5 -> getOutput(0), PoolingType::kAVERAGE, DimsHW{7,7});
    assert(pool5);

    auto fc1 = network -> addFullyConnected(*pool5 -> getOutput(0), OUTPUT_SIZE, weightMap["classifier.weight"], weightMap["classifier.bias"]);
    assert(fc1);

    // set output blob name
    fc1 -> getOutput(0) -> setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;

    // mark the output
    network -> markOutput(*fc1 -> getOutput(0));

    // set batchsize and workspace size
    builder -> setMaxBatchSize(maxBatchSize);
    config -> setMaxWorkspaceSize(1 << 28); // 256 MiB

    // build engine
    ICudaEngine* engine = builder -> buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // destroy
    network -> destroy();

    // free host mem (includes the scale/shift/power buffers that
    // addBatchNorm2d registered in the map)
    for(auto& mem: weightMap)
    {
        free((void*)(mem.second.values));
    }

    return engine;
}


// Builds the DenseNet engine and serializes it.
//
// maxBatchSize: forwarded to createEngine().
// modelStream:  receives the serialized engine; the caller owns it and must
//               destroy() it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The config was created by the builder, so it
    // has to be released before the builder that produced it.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

/**
 * Performs inference on the given input and
 * writes the output from device to host memory.
 *
 * context:   execution context of a two-binding engine (one input, one output).
 * input:     host buffer of batchSize * 3 * INPUT_H * INPUT_W floats.
 * output:    host buffer of batchSize * OUTPUT_SIZE floats, filled on return.
 * batchSize: number of items in the batch.
 *
 * Device buffers and the stream are created and released on every call.
 * Any CUDA or enqueue failure aborts the process.
**/
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Guard the array accesses below against an unknown tensor name.
    assert(inputIndex >= 0 && outputIndex >= 0);

    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr))
    {
        // Without this check a failed launch would silently return stale output.
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work surface here, so the result must be checked.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//   ./densenet -s   build the engine and write it to "densenet.engine"
//   ./densenet -d   load "densenet.engine" and run 100 timed inferences on an
//                   all-ones input, then print the 1000 outputs
// Returns 0 on success and -1 on bad arguments or file errors.
int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./densenet -s   // serialize model to plan file" << std::endl;
        std::cerr << "./densenet -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};
    const std::string mode(argv[1]);

    if (mode == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("densenet.engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        // Serialization succeeded: report success (0) to the shell.
        return 0;
    } else if (mode == "-d") {
        std::ifstream file("densenet.engine", std::ios::binary);
        if (!file.good()) {
            std::cerr << "could not open plan file densenet.engine, run ./densenet -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        const std::streamoff length = file.tellg();
        if (length <= 0) {
            std::cerr << "plan file densenet.engine is empty or unreadable" << std::endl;
            return -1;
        }
        size = static_cast<size_t>(length);
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        if (!file) {
            std::cerr << "could not read plan file densenet.engine" << std::endl;
            delete[] trtModelStream;
            return -1;
        }
        file.close();
    } else {
        return -1;
    }


    // Dummy input: every element is 1.0 (no real image is loaded)
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference 100 times and print the wall-clock time of each run
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the raw output values, with a row marker after every tenth index
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        std::cout << prob[i] << ", ";
        if (i % 10 == 0) std::cout << i / 10 << std::endl;
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: densenet/densenet121.py
================================================
import os
import sys
import struct
import argparse

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt


# Batch size used when building the engine and sizing the dummy input.
BATCH_SIZE = 1
# Input height/width; the network input is declared as (3, INPUT_H, INPUT_W).
INPUT_H = 224
INPUT_W = 224
# Number of outputs of the final fully connected layer.
OUTPUT_SIZE = 1000
# Names given to the network input tensor and the marked output tensor.
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"
# Epsilon added to the running variance when folding batch norm into a scale layer.
EPS = 1e-5

# Weight file read by load_weights() and engine file written/read by this script.
WEIGHT_PATH = "./densenet121.wts"
ENGINE_PATH = "./densenet121.engine"

# Logger shared by the TensorRT builder and runtime.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)


def load_weights(file):
    """Parse a text weight file into a dict of float32 arrays.

    The first line holds the number of blobs. Every following line is
    ``<name> <count> <hex word> ...`` where each hex word is one big-endian
    32-bit float.

    :param file: path of the weight file.
    :return: dict mapping blob name to a float32 ``numpy`` array holding one
        row per decoded value.
    :raises AssertionError: if the file is missing, the blob count does not
        match the number of lines, or a line's value count is wrong.
    """
    print(f"Loading weights: {file}")

    assert os.path.exists(file), 'Unable to load weight file.'

    with open(file, "r") as handle:
        rows = [row.strip() for row in handle]

    n_blobs = int(rows[0])
    assert n_blobs == len(rows) - 1

    weight_map = {}
    for row in rows[1:n_blobs + 1]:
        fields = row.split(" ")
        name = fields[0]
        declared = int(fields[1])
        assert declared + 2 == len(fields)
        # hex string -> 4 raw bytes -> big-endian float (unpack yields a 1-tuple)
        decoded = [struct.unpack(">f", bytes.fromhex(word)) for word in fields[2:]]
        weight_map[name] = np.array(decoded, dtype=np.float32)

    return weight_map


def add_batch_norm_2d(network, weight_map, input, layer_name):
    """Add a per-channel scale layer that stands in for a batch-norm layer.

    Looks up ``<layer_name>.weight``, ``.bias``, ``.running_mean`` and
    ``.running_var`` in ``weight_map`` and derives
    ``scale = gamma / sqrt(var + EPS)`` and
    ``shift = -mean / sqrt(var + EPS) * gamma + beta``.

    :return: the layer returned by ``network.add_scale`` in CHANNEL mode.
    """
    gamma, beta, mean, variance = [
        weight_map[layer_name + suffix]
        for suffix in (".weight", ".bias", ".running_mean", ".running_var")
    ]
    std = np.sqrt(variance + EPS)

    bn_scale = gamma / std
    bn_shift = -mean / std * gamma + beta
    return network.add_scale(input=input,
                             mode=trt.ScaleMode.CHANNEL,
                             shift=bn_shift,
                             scale=bn_scale)


def add_dense_layer(network, input, weight_map, lname):
    """Add one dense layer and return its final convolution layer.

    Structure: BN -> ReLU -> bias-free 1x1 conv (128 maps) -> BN -> ReLU ->
    bias-free 3x3 conv (32 maps, padding 1). Weights are read from
    ``weight_map`` under ``<lname>.norm1``, ``.conv1.weight``, ``.norm2`` and
    ``.conv2.weight``.
    """
    relu = trt.ActivationType.RELU

    # Stage 1: batch norm, ReLU, 1x1 convolution.
    norm_a = add_batch_norm_2d(network, weight_map, input, lname + ".norm1")
    act_a = network.add_activation(norm_a.get_output(0), type=relu)
    assert act_a

    conv_a = network.add_convolution(
        input=act_a.get_output(0),
        num_output_maps=128,
        kernel_shape=(1, 1),
        kernel=weight_map[lname + ".conv1.weight"],
        bias=trt.Weights(),
    )
    assert conv_a
    conv_a.stride = (1, 1)

    # Stage 2: batch norm, ReLU, 3x3 convolution.
    norm_b = add_batch_norm_2d(network, weight_map, conv_a.get_output(0), lname + ".norm2")
    act_b = network.add_activation(norm_b.get_output(0), type=relu)
    assert act_b

    conv_b = network.add_convolution(
        input=act_b.get_output(0),
        num_output_maps=32,
        kernel_shape=(3, 3),
        kernel=weight_map[lname + ".conv2.weight"],
        bias=trt.Weights(),
    )
    assert conv_b
    conv_b.stride = (1, 1)
    conv_b.padding = (1, 1)

    return conv_b


def add_transition(network, input, weight_map, outch, lname):
    """Add a transition block and return its pooling layer.

    Structure: BN -> ReLU -> bias-free 1x1 conv with ``outch`` output maps ->
    2x2 average pooling with stride 2 and no padding. Weights are read from
    ``weight_map`` under ``<lname>.norm`` and ``<lname>.conv.weight``.
    """
    norm = add_batch_norm_2d(network, weight_map, input, lname + ".norm")

    act = network.add_activation(norm.get_output(0), type=trt.ActivationType.RELU)
    assert act

    conv = network.add_convolution(
        input=act.get_output(0),
        num_output_maps=outch,
        kernel_shape=(1, 1),
        kernel=weight_map[lname + ".conv.weight"],
        bias=trt.Weights(),
    )
    assert conv
    conv.stride = (1, 1)

    pool = network.add_pooling(
        input=conv.get_output(0),
        type=trt.PoolingType.AVERAGE,
        window_size=trt.DimsHW(2, 2),
    )
    assert pool
    pool.stride_nd = (2, 2)
    pool.padding_nd = (0, 0)

    return pool


def add_dense_block(network, input, weight_map, num_dense_layers, lname):
    """Add a dense block of ``num_dense_layers`` dense layers.

    Dense layer ``k`` (1-based, weights under ``<lname>.denselayer<k>``) is fed
    the concatenation of the block input and the outputs of layers 1..k-1.

    :return: the concatenation layer joining the block input with the outputs
        of all dense layers.
    """
    # Slot 0 is the block input; slot k receives the output of dense layer k.
    input_tensors = [None] * (num_dense_layers + 1)
    input_tensors[0] = input

    layer = add_dense_layer(network, input, weight_map, lname + ".denselayer" + "1")
    for filled in range(1, num_dense_layers):
        input_tensors[filled] = layer.get_output(0)
        so_far = input_tensors[:filled + 1]
        concat = network.add_concatenation(so_far)
        assert concat
        next_name = lname + ".denselayer" + str(filled + 1)
        layer = add_dense_layer(network, concat.get_output(0), weight_map, next_name)

    input_tensors[num_dense_layers] = layer.get_output(0)
    concat = network.add_concatenation(input_tensors)
    assert concat

    return concat


def create_engine(max_batch_size, builder, config, dt):
    """Build the DenseNet-121 network with the TensorRT API and return the engine.

    :param max_batch_size: value assigned to ``builder.max_batch_size``.
    :param builder: TensorRT builder used to create the network and the engine.
    :param config: builder config the engine is built with.
    :param dt: data type of the network input tensor.
    :return: the object returned by ``builder.build_engine`` (the caller
        asserts that it is truthy).
    """
    weight_map = load_weights(WEIGHT_PATH)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    conv0 = network.add_convolution(input=data,
                                    num_output_maps=64,
                                    kernel_shape=(7, 7),
                                    kernel=weight_map["features.conv0.weight"],
                                    bias=trt.Weights())
    assert conv0
    conv0.stride = (2, 2)
    conv0.padding = (3, 3)

    bn0 = add_batch_norm_2d(network, weight_map, conv0.get_output(0), "features.norm0")

    relu0 = network.add_activation(bn0.get_output(0), type=trt.ActivationType.RELU)
    assert relu0

    pool0 = network.add_pooling(input=relu0.get_output(0),
                                type=trt.PoolingType.MAX,
                                window_size=trt.DimsHW(3, 3))
    assert pool0
    pool0.stride_nd = (2, 2)
    pool0.padding_nd = (1, 1)

    # Four dense blocks (6, 12, 24, 16 layers) separated by three transitions.
    dense1 = add_dense_block(network, pool0.get_output(0), weight_map, 6, "features.denseblock1")
    transition1 = add_transition(network, dense1.get_output(0), weight_map, 128, "features.transition1")

    dense2 = add_dense_block(network, transition1.get_output(0), weight_map, 12, "features.denseblock2")
    transition2 = add_transition(network, dense2.get_output(0), weight_map, 256, "features.transition2")

    dense3 = add_dense_block(network, transition2.get_output(0), weight_map, 24, "features.denseblock3")
    transition3 = add_transition(network, dense3.get_output(0), weight_map, 512, "features.transition3")

    dense4 = add_dense_block(network, transition3.get_output(0), weight_map, 16, "features.denseblock4")

    bn5 = add_batch_norm_2d(network, weight_map, dense4.get_output(0), "features.norm5")
    relu5 = network.add_activation(bn5.get_output(0), type=trt.ActivationType.RELU)
    assert relu5

    pool5 = network.add_pooling(relu5.get_output(0), type=trt.PoolingType.AVERAGE, window_size=trt.DimsHW(7, 7))
    assert pool5

    fc1 = network.add_fully_connected(input=pool5.get_output(0),
                                      num_outputs=OUTPUT_SIZE,
                                      kernel=weight_map["classifier.weight"],
                                      bias=weight_map["classifier.bias"])
    assert fc1

    fc1.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(fc1.get_output(0))

    # Build Engine
    builder.max_batch_size = max_batch_size
    # The engine is built from `config`, so the workspace limit has to be set
    # there; 256 MiB matches the C++ implementation of this model.
    config.max_workspace_size = 1 << 28
    engine = builder.build_engine(network, config)

    del network
    del weight_map

    return engine


def API_to_model(max_batch_size):
    """Build the engine for ``max_batch_size`` and write it to ENGINE_PATH.

    Creates a builder and builder config, builds the float32 engine via
    ``create_engine`` and stores its serialized form in binary mode.
    """
    trt_builder = trt.Builder(TRT_LOGGER)
    trt_config = trt_builder.create_builder_config()

    built = create_engine(max_batch_size, trt_builder, trt_config, trt.float32)
    assert built

    with open(ENGINE_PATH, "wb") as plan_file:
        plan_file.write(built.serialize())

    # Drop the TensorRT objects explicitly, engine first.
    del built, trt_builder, trt_config


class HostDeviceMem(object):
    """Pairs a host buffer with the device buffer that mirrors it."""

    def __init__(self, host_mem, device_mem):
        # host: buffer on the CPU side; device: its counterpart on the GPU side
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        parts = ["Host:", str(self.host), "Device:", str(self.device)]
        return "\n".join(parts)

    def __repr__(self):
        # repr deliberately mirrors the human-readable form
        return self.__str__()


def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Each buffer holds ``volume(binding shape) * engine.max_batch_size``
    elements of the binding's dtype.

    :return: ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` lists the
        device buffer addresses as ints in binding order, and ``stream`` is a
        new CUDA stream.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        n_elems = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked host memory plus a device allocation of the same byte size.
        host_mem = cuda.pagelocked_empty(n_elems, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # The engine is driven with raw device addresses.
        bindings.append(int(device_mem))

        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference pass and return the host output buffers.

    Copies every input's host buffer to its device buffer, executes the
    context on ``stream``, copies every output back, then waits for the stream.

    :return: list with the ``host`` buffer of each entry in ``outputs``.
    """
    # Host -> device for all inputs.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)

    # Launch the network on the same stream.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)

    # Device -> host for all outputs.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)

    # Everything above was only queued; block until it has finished.
    stream.synchronize()

    host_results = []
    for out in outputs:
        host_results.append(out.host)
    return host_results


# Command-line entry point:
#   -s  build the engine and serialize it to ENGINE_PATH
#   -d  deserialize ENGINE_PATH and run one inference on an all-ones input
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", action='store_true')
    parser.add_argument("-d", action='store_true')
    args = parser.parse_args()

    # Exactly one of -s / -d must be given.
    if not (args.s ^ args.d):
        print(
            "arguments not right!\n"
            "python densenet121.py -s   # serialize model to plan file\n"
            "python densenet121.py -d   # deserialize plan file and run inference"
        )
        # Non-zero status so callers can tell a usage error from success.
        sys.exit(1)

    if args.s:
        API_to_model(BATCH_SIZE)
    else:
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime

        with open(ENGINE_PATH, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32)
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        # Copy into the existing page-locked buffer instead of rebinding
        # `host`: the asynchronous host-to-device copy needs pinned memory.
        np.copyto(inputs[0].host, data)

        trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

        print(f'Output: \n{trt_outputs[0][:10]}\n{trt_outputs[0][-10:]}')


================================================
FILE: densenet/logging.h
================================================
/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

// String buffer that forwards its collected text to an output stream.
//
// Text accumulates in the std::stringbuf base. Whenever the buffer is synced
// (or destroyed while holding unflushed text) and logging is enabled, a
// "[MM/DD/YYYY-HH:MM:SS] " timestamp is printed to std::cout, then the prefix
// and the collected text are written to the target stream and the buffer is
// emptied.
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    // Takes over the target stream, prefix and logging flag of `other`;
    // the text held by `other` is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // pbase() marks the start of the pending output and pptr() the current
        // write position; they differ exactly when unflushed text remains.
        const bool hasPendingText = pptr() != pbase();
        if (hasPendingText)
        {
            putOutput();
        }
    }

    // Called when the owning stream is flushed: emits the buffered text.
    // Always reports success (0).
    int sync() override
    {
        putOutput();
        return 0;
    }

    // Emits timestamp, prefix and buffered text, then clears the buffer.
    // Does nothing (and keeps the text) while logging is disabled.
    void putOutput()
    {
        if (!mShouldLog)
        {
            return;
        }

        // timestamp goes first, formatted as [MM/DD/YYYY-HH:MM:SS]
        const std::time_t now = std::time(nullptr);
        const tm* local = std::localtime(&now);
        std::cout << "[" << std::setfill('0')
                  << std::setw(2) << 1 + local->tm_mon << "/"
                  << std::setw(2) << local->tm_mday << "/"
                  << std::setw(4) << 1900 + local->tm_year << "-"
                  << std::setw(2) << local->tm_hour << ":"
                  << std::setw(2) << local->tm_min << ":"
                  << std::setw(2) << local->tm_sec << "] ";

        // prefix + everything collected so far
        mOutput << mPrefix << str();
        // start over with an empty buffer
        str("");
        mOutput.flush();
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer, so that the buffer is constructed before the
//!  std::ostream base of LogStreamConsumer that is attached to it
//!
class LogStreamConsumerBase
{
protected:
    LogStreamConsumerBuffer mBuffer;

public:
    //! \brief Forwards all arguments to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Stream-style front end for logging: text inserted with operator<< is collected by the
//!  inherited LogStreamConsumerBuffer and emitted with a severity prefix.
//!  The base-class order (LogStreamConsumerBase first, std::ostream second) matters:
//!  mBuffer lives in LogStreamConsumerBase and must already be constructed when its address is
//!  handed to std::ostream, otherwise the stream would be given an uninitialized buffer.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  Messages are emitted only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the same severity and logging flag as \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer's severity is still reportable.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! \brief std::cout for kINFO and above in enum value, std::cerr for everything below.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Maps a severity to the tag printed in front of each message.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            return "[F] ";
        case Severity::kERROR:
            return "[E] ";
        case Severity::kWARNING:
            return "[W] ";
        case Severity::kINFO:
            return "[I] ";
        case Severity::kVERBOSE:
            return "[V] ";
        default:
            assert(0);
            return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger that reports messages up to the given severity (kWARNING by default)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity The severity of the message
    //! \param msg The message text; a null pointer is logged as an empty message
    //!
    void log(Severity severity, const char* msg) override
    {
        // Building a std::string from a null pointer is undefined behaviour, so substitute an empty message.
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg ? msg : "") << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< set by Logger::reportTestStart()
        std::string mName;    //!< test name printed in result lines
        std::string mCmdline; //!< command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Check the precondition before producing any output so that a violation is not
        // preceded by a misleading "RUNNING" line.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS, so the call can be used directly as a process exit code
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE, so the call can be used directly as a process exit code
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS; a waived test does not count as a failure
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently used to filter messages
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints one line of the form "&&&& <RESULT> <name> # <cmdline>" to the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
            {
                ss << " ";
            }
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; //!< messages that do not pass this threshold are dropped
};

namespace
{

//!
//! \brief Builds a LogStreamConsumer for kVERBOSE messages, filtered by the logger's reportable severity
//!
//! Usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Builds a LogStreamConsumer for kINFO messages, filtered by the logger's reportable severity
//!
//! Usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Builds a LogStreamConsumer for kWARNING messages, filtered by the logger's reportable severity
//!
//! Usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Builds a LogStreamConsumer for kERROR messages, filtered by the logger's reportable severity
//!
//! Usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Builds a LogStreamConsumer for kINTERNAL_ERROR ("fatal") messages, filtered by the logger's
//!        reportable severity
//!
//! Usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H

================================================
FILE: detr/CMakeLists.txt
================================================
# Compatibility with CMake versions older than 3.5 has been removed from recent CMake releases,
# so requesting 2.6 makes configuration fail there. 3.10 is still old enough to keep the
# FindCUDA module used below available.
cmake_minimum_required(VERSION 3.10)

project(detr)

add_definitions(-std=c++11)

# option() expects (<variable> "<help text>" [value]); without a help string the word OFF
# was being taken as the help text rather than as the value.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
# Debug keeps assert() active; the sources use assert() for their runtime checks.
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/data/app/TensorRT-8.4.3.1/include)
link_directories(/data/app/TensorRT-8.4.3.1/lib)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# The sources include OpenCV headers unconditionally, so fail at configure time with a clear
# message instead of at compile time when OpenCV is missing.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(detr ${PROJECT_SOURCE_DIR}/detr.cpp)
target_link_libraries(detr nvinfer)
target_link_libraries(detr cudart)
target_link_libraries(detr ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: detr/README.md
================================================
# DETR

The PyTorch implementation is [facebookresearch/detr](https://github.com/facebookresearch/detr).

For details see [End-to-End Object Detection with Transformers](https://ai.facebook.com/research/publications/end-to-end-object-detection-with-transformers).

## Test Environment

- RTX 2080 Ti / Ubuntu 16.04 / CUDA 10.2 / cuDNN 8.0.4 / TensorRT 7.2.1 / OpenCV 4.2
- RTX 2080 Ti / Windows 10 / CUDA 10.2 / cuDNN 8.0.4 / TensorRT 7.2.1 / OpenCV 4.2 / VS2017

## How to Run

1. generate .wts from pytorch with .pth

```
// git clone https://github.com/facebookresearch/detr.git
// go to facebookresearch/detr
// download https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth
// download https://raw.githubusercontent.com/freedenS/TestImage/main/demo.jpg
// copy tensorrtx/detr/gen_wts.py and demo.jpg into facebookresearch/detr
python gen_wts.py
// a file 'detr.wts' will be generated.
```

2. build tensorrtx/detr and run

```
// put detr.wts into tensorrtx/detr
// go to tensorrtx/detr
// update the parameters in detr.cpp if your model was trained on a custom dataset. The parameters correspond to the config in detr.
mkdir build
cd build
cmake ..
make
sudo ./detr -s [.wts] [.engine] // serialize model to plan file
sudo ./detr -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed
// For example
sudo ./detr -s ../detr.wts detr.engine
sudo ./detr -d detr.engine ../samples
```

3. Check the generated images, for example `_demo.jpg`.

## Backbone

#### R50

```
1.download pretrained model
  https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth
2.export wts
  set first parameter in Backbone in gen_wts.py(line 23) to resnet50
  set path of pretrained model(line 87 in gen_wts.py)
3.set resnet_type in BuildResNet(line 546 in detr.cpp) to R50
```

#### R101

```
1.download pretrained model
  https://dl.fbaipublicfiles.com/detr/detr-r101-2c7b67e5.pth
2.export wts
  set first parameter in Backbone in gen_wts.py(line 23) to resnet101
  set path of pretrained model(line 87 in gen_wts.py)
3.set resnet_type in BuildResNet(line 546 in detr.cpp) to R101
```

## NOTE

- TensorRT uses a fixed input size. If the size of your data differs from the size the engine was built with, you need to resize your data and rescale the results accordingly.
- Image preprocessing in C++ differs slightly from the Python version (OpenCV vs. PIL).

## Quantization

1. quantizationType: fp32, fp16 or int8. See BuildDETRModel (detr.cpp line 613) for details.

2. int8 is used in the same way as in [tensorrtx/yolov5](../yolov5/README.md).


## Latency

Average cost of doInference (in detr.cpp), measured from the second run onwards, with batch=1 in the Ubuntu environment listed above:

|      | fp32    | fp16    | int8   |
| ---- | ------- | ------- | ------ |
| R50  | 19.57ms | 9.424ms | 8.38ms |
| R101 | 30.82ms | 12.4ms  | 9.59ms |


================================================
FILE: detr/backbone.hpp
================================================
#pragma once
#include <map>
#include "common.hpp"

// ResNet variants the backbone builder can be asked for; also used as keys of num_blocks_per_stage.
enum RESNETTYPE {
    R18 = 0,
    R34 = 1,
    R50 = 2,
    R101 = 3,
    R152 = 4
};

// Number of residual blocks in each of the four stages, per ResNet variant.
// BuildResNet() indexes the vector with the stage number (0..3).
const std::map<RESNETTYPE, std::vector<int>> num_blocks_per_stage = {
    {R18, {2, 2, 2, 2}},
    {R34, {3, 4, 6, 3}},
    {R50, {3, 4, 6, 3}},
    {R101, {3, 4, 23, 3}},
    {R152, {3, 8, 36, 3}}
};

// Adds an inference-time BatchNorm2d to the network as a per-channel IScaleLayer.
//
// The statistics stored under "<lname>.weight", ".bias", ".running_mean" and ".running_var"
// are folded into three per-channel arrays:
//   scale[i] = gamma[i] / sqrt(var[i] + eps)
//   shift[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps)
//   power[i] = 1
//
// network   : network the scale layer is appended to
// weightMap : loaded weights; the three generated arrays are also stored in it under
//             "<lname>.scale", "<lname>.shift" and "<lname>.power"
// input     : tensor to normalise
// lname     : key prefix of the BatchNorm parameters in weightMap
// eps       : value added to the variance before the square root
//
// Returns the created scale layer (never null when assertions are enabled).
IScaleLayer* addBatchNorm2d(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
ITensor& input,
const std::string& lname,
float eps = 1e-5
) {
    const Weights gammaWts = weightMap[lname + ".weight"];
    const Weights betaWts = weightMap[lname + ".bias"];
    const Weights meanWts = weightMap[lname + ".running_mean"];
    const Weights varWts = weightMap[lname + ".running_var"];
    const int len = static_cast<int>(varWts.count);

    // operator[] silently creates an empty entry for a key that is not in the map; catch that
    // here instead of dereferencing a null pointer (or building an empty layer) further down.
    assert(len > 0 && "BatchNorm running_var is missing or empty");
    assert(gammaWts.values && betaWts.values && meanWts.values && varWts.values);
    assert(gammaWts.count == varWts.count && betaWts.count == varWts.count && meanWts.count == varWts.count);

    const float *gamma = static_cast<const float*>(gammaWts.values);
    const float *beta = static_cast<const float*>(betaWts.values);
    const float *mean = static_cast<const float*>(meanWts.values);
    const float *var = static_cast<const float*>(varWts.values);

    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    assert(scval && shval && pval);

    // Single pass; the expressions are kept exactly as before so the folded values do not change.
    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
        pval[i] = 1.0;
    }
    Weights scale{ DataType::kFLOAT, scval, len };
    Weights shift{ DataType::kFLOAT, shval, len };
    Weights power{ DataType::kFLOAT, pval, len };

    // Keep the heap buffers reachable through the map (presumably so they can be released
    // together with the loaded weights - confirm against the caller).
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

// Builds the ResNet stem: 7x7 stride-2 convolution (no bias) -> BatchNorm -> ReLU ->
// 3x3 stride-2 max pooling.
//
// network      : network the layers are appended to
// weightMap    : loaded weights; reads "<lname>.conv1.weight" and the "<lname>.bn1.*" entries
// lname        : key prefix of the stem weights
// input        : input tensor
// out_channels : number of output channels of the convolution
// group_num    : number of convolution groups
//
// Returns the max-pooling layer; its output 0 is the stem output.
ILayer* BasicStem(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& input,
int out_channels,
int group_num = 1
) {
    // conv1
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    IConvolutionLayer* conv1 = network->addConvolutionNd(
        input,
        out_channels,
        DimsHW{ 7, 7 },
        weightMap[lname + ".conv1.weight"],
        emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{ 2, 2 });
    conv1->setPaddingNd(DimsHW{ 3, 3 });
    conv1->setNbGroups(group_num);

    auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1");
    assert(bn1);

    auto r1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(r1);

    auto max_pool2d = network->addPoolingNd(*r1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 });
    // Checked like every other layer in this file before it is configured.
    assert(max_pool2d);
    max_pool2d->setStrideNd(DimsHW{ 2, 2 });
    max_pool2d->setPaddingNd(DimsHW{ 1, 1 });
    return max_pool2d;
}

// Builds a two-convolution residual block: conv3x3 -> ReLU -> conv3x3, added to a shortcut
// and followed by ReLU.
//
// Both convolutions read a weight and a bias from weightMap ("<lname>.conv{1,2}.{weight,bias}").
// When in_channels != out_channels the shortcut is a 1x1 convolution
// ("<lname>.shortcut.{weight,bias}") using the block stride; otherwise the input is added as is.
//
// Returns the output tensor of the final ReLU.
ITensor* BasicBlock(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& input,
int in_channels,
int out_channels,
int stride = 1
) {
    const DimsHW kernel3x3{ 3, 3 };
    const DimsHW unit{ 1, 1 };

    // first 3x3 convolution: this one carries the block stride
    auto* first_conv = network->addConvolutionNd(
        input, out_channels, kernel3x3,
        weightMap[lname + ".conv1.weight"],
        weightMap[lname + ".conv1.bias"]);
    assert(first_conv);
    first_conv->setStrideNd(DimsHW{ stride, stride });
    first_conv->setPaddingNd(unit);

    auto* first_relu = network->addActivation(*first_conv->getOutput(0), ActivationType::kRELU);
    assert(first_relu);

    // second 3x3 convolution: always stride 1
    auto* second_conv = network->addConvolutionNd(
        *first_relu->getOutput(0), out_channels, kernel3x3,
        weightMap[lname + ".conv2.weight"],
        weightMap[lname + ".conv2.bias"]);
    assert(second_conv);
    second_conv->setStrideNd(unit);
    second_conv->setPaddingNd(unit);

    // shortcut: identity unless the channel count changes
    ITensor* residual = &input;
    if (in_channels != out_channels) {
        auto* projection = network->addConvolutionNd(
            input, out_channels, unit,
            weightMap[lname + ".shortcut.weight"],
            weightMap[lname + ".shortcut.bias"]);
        assert(projection);
        projection->setStrideNd(DimsHW{ stride, stride });
        residual = projection->getOutput(0);
    }

    // residual sum followed by the closing activation
    auto* sum = network->addElementWise(*second_conv->getOutput(0), *residual, ElementWiseOperation::kSUM);
    assert(sum);

    auto* out_relu = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(out_relu);

    return out_relu->getOutput(0);
}

// Builds a three-convolution bottleneck residual block:
//   conv1x1 -> BN -> ReLU -> conv3x3 (stride, dilation) -> BN -> ReLU -> conv1x1 -> BN,
// added to a shortcut and followed by ReLU. None of the convolutions has a bias.
//
// When in_channels != out_channels the shortcut is a strided 1x1 convolution
// ("<lname>.downsample.0.weight") followed by BatchNorm ("<lname>.downsample.1");
// otherwise the input tensor is added unchanged.
//
// Returns the output tensor of the final ReLU.
ITensor* BottleneckBlock(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& input,
int in_channels,
int bottleneck_channels,
int out_channels,
int stride = 1,
int dilation = 1,
int group_num = 1
) {
    Weights no_bias{ DataType::kFLOAT, nullptr, 0 };
    const DimsHW unit{ 1, 1 };

    // 1x1 reduction
    auto* reduce = network->addConvolutionNd(
        input, bottleneck_channels, unit,
        weightMap[lname + ".conv1.weight"], no_bias);
    assert(reduce);
    reduce->setStrideNd(unit);
    reduce->setNbGroups(group_num);

    auto* reduce_bn = addBatchNorm2d(network, weightMap, *reduce->getOutput(0), lname + ".bn1");
    assert(reduce_bn);

    auto* reduce_relu = network->addActivation(*reduce_bn->getOutput(0), ActivationType::kRELU);
    assert(reduce_relu);

    // 3x3 convolution: carries the stride and the dilation; padding equals the dilation
    auto* spatial = network->addConvolutionNd(
        *reduce_relu->getOutput(0), bottleneck_channels, DimsHW{ 3, 3 },
        weightMap[lname + ".conv2.weight"], no_bias);
    assert(spatial);
    spatial->setStrideNd(DimsHW{ stride, stride });
    spatial->setPaddingNd(DimsHW{ dilation, dilation });
    spatial->setDilationNd(DimsHW{ dilation, dilation });
    spatial->setNbGroups(group_num);

    auto* spatial_bn = addBatchNorm2d(network, weightMap, *spatial->getOutput(0), lname + ".bn2");
    assert(spatial_bn);

    auto* spatial_relu = network->addActivation(*spatial_bn->getOutput(0), ActivationType::kRELU);
    assert(spatial_relu);

    // 1x1 expansion
    auto* expand = network->addConvolutionNd(
        *spatial_relu->getOutput(0), out_channels, unit,
        weightMap[lname + ".conv3.weight"], no_bias);
    assert(expand);
    expand->setStrideNd(unit);
    expand->setNbGroups(group_num);

    auto* expand_bn = addBatchNorm2d(network, weightMap, *expand->getOutput(0), lname + ".bn3");
    assert(expand_bn);

    // shortcut: identity unless the channel count changes
    ITensor* residual = &input;
    if (in_channels != out_channels) {
        auto* projection = network->addConvolutionNd(
            input, out_channels, unit,
            weightMap[lname + ".downsample.0.weight"], no_bias);
        assert(projection);
        projection->setStrideNd(DimsHW{ stride, stride });
        projection->setNbGroups(group_num);

        auto* projection_bn = addBatchNorm2d(
            network, weightMap, *projection->getOutput(0), lname + ".downsample.1");
        assert(projection_bn);
        residual = projection_bn->getOutput(0);
    }

    // residual sum followed by the closing activation
    auto* sum = network->addElementWise(*expand_bn->getOutput(0), *residual, ElementWiseOperation::kSUM);
    assert(sum);

    auto* out_relu = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(out_relu);

    return out_relu->getOutput(0);
}

// Chains `stage` residual blocks (note: `stage` is the block count, not a stage index).
//
// Block i reads its weights from "<lname>.<i>". Only the first block uses first_stride and
// the incoming in_channels; every later block uses stride 1 and out_channels as its input width.
// R18/R34 are built from BasicBlock, every other variant from BottleneckBlock.
//
// Returns the output tensor of the last block, or &input when `stage` <= 0.
ITensor* MakeStage(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& input,
int stage,
RESNETTYPE resnet_type,
int in_channels,
int bottleneck_channels,
int out_channels,
int first_stride = 1,
int dilation = 1
) {
    const bool use_basic_block = (resnet_type == R18 || resnet_type == R34);

    ITensor* current = &input;
    int block_in_channels = in_channels;
    for (int block = 0; block < stage; ++block) {
        const std::string block_name = lname + "." + std::to_string(block);
        const int block_stride = (block == 0) ? first_stride : 1;

        if (use_basic_block) {
            current = BasicBlock(
                network, weightMap, block_name, *current,
                block_in_channels, out_channels, block_stride);
        } else {
            current = BottleneckBlock(
                network, weightMap, block_name, *current,
                block_in_channels, bottleneck_channels, out_channels,
                block_stride, dilation);
        }

        // after the first block the channel count no longer changes inside the stage
        block_in_channels = out_channels;
    }
    return current;
}

// Builds the ResNet backbone: the stem followed by four stages whose weights live under
// "backbone.0.body" and "backbone.0.body.layer1" .. "backbone.0.body.layer4".
//
// The bottleneck and output channel counts double after every stage. Stage 1 always uses
// stride 1; the last stage uses stride 1 instead of 2 when res5_dilation == 2.
//
// Returns the output tensor of the last stage.
ITensor* BuildResNet(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
ITensor& input,
RESNETTYPE resnet_type,
int stem_out_channels,
int bottleneck_channels,
int res2_out_channels,
int res5_dilation = 1
) {
    assert(res5_dilation == 1 || res5_dilation == 2);  // "res5_dilation must be 1 or 2"
    const bool is_basic_variant = (resnet_type == R18 || resnet_type == R34);
    if (is_basic_variant) {
        assert(res2_out_channels == 64);  // "res2_out_channels must be 64 for R18/R34"
        assert(res5_dilation == 1);  // "res5_dilation must be 1 for R18/R34"
    }

    // stem
    ILayer* stem = BasicStem(network, weightMap, "backbone.0.body", input, stem_out_channels);
    ITensor* features = stem->getOutput(0);

    // running channel widths, updated after every stage
    int stage_in = stem_out_channels;
    int stage_mid = bottleneck_channels;
    int stage_out = res2_out_channels;

    const std::vector<int>& block_counts = num_blocks_per_stage.at(resnet_type);
    const int last_stage = 3;
    for (int idx = 0; idx <= last_stage; ++idx) {
        const int stage_dilation = (idx == last_stage) ? res5_dilation : 1;
        const bool keep_resolution = (idx == 0) || (idx == last_stage && stage_dilation == 2);
        const int stage_stride = keep_resolution ? 1 : 2;
        const std::string stage_name = "backbone.0.body.layer" + std::to_string(idx + 1);

        features = MakeStage(
            network, weightMap, stage_name, *features,
            block_counts[idx], resnet_type,
            stage_in, stage_mid, stage_out,
            stage_stride, stage_dilation);

        stage_in = stage_out;
        stage_mid *= 2;
        stage_out *= 2;
    }
    return features;
}


================================================
FILE: detr/calibrator.hpp
================================================
#pragma once

#include "NvInfer.h"
#include <string>
#include <vector>
#include <iostream>
#include <iterator>
#include <fstream>
#include <algorithm>
#include "common.hpp"
#include "macros.h"
 
//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
 public:
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h,
    const char* img_dir, const char* calib_table_name,
    const char* input_blob_name, bool read_cache = true);

    virtual ~Int8EntropyCalibrator2();
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

 private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;
    std::string img_dir_;
    std::vector<std::string> img_files_;
    size_t input_count_;
    std::string calib_table_name_;
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;
    std::vector<char> calib_cache_;
};

// Stores the configuration, allocates the device buffer for one batch of float CHW images
// and collects the names of the files found in img_dir.
//
// A missing or unreadable image directory is reported but not treated as fatal, because a
// previously written calibration cache can make the images unnecessary.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize,
int input_w, int input_h, const char* img_dir,
const char* calib_table_name, const char* input_blob_name,
bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)
    , img_dir_(img_dir)
    // computed in size_t so large batches of large images cannot overflow int
    , input_count_(static_cast<size_t>(3) * input_w * input_h * batchsize)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache)
    , device_input_(nullptr) {
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    if (read_files_in_dir(img_dir, img_files_) != 0) {
        // Previously ignored: without this message calibration silently ran on zero images.
        std::cerr << "Warning: cannot open calibration image directory: " << img_dir << std::endl;
    }
}

// Releases the device buffer allocated in the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size passed to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next batchsize_ images, preprocesses them into planar (CHW) float layout, uploads
// them to the device buffer and publishes that buffer as bindings[0].
//
// Returns false when fewer than batchsize_ images are left or when an image cannot be read;
// true otherwise.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    if (img_idx_ + batchsize_ > static_cast<int>(img_files_.size())) {
        return false;
    }

    // img_files_ holds bare file names, so the directory must end with a separator before the
    // two are concatenated. Directories that already end with one are used unchanged.
    std::string dir_prefix = img_dir_;
    if (!dir_prefix.empty() && dir_prefix.back() != '/' && dir_prefix.back() != '\\') {
        dir_prefix += '/';
    }

    const size_t plane = static_cast<size_t>(input_h_) * input_w_;  // elements in one channel
    std::vector<float> input_imgs_(input_count_, 0);
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        cv::Mat temp = cv::imread(dir_prefix + img_files_[i]);
        if (temp.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        // preprocessImg() takes (img, newh, neww): height first, then width. Passing them the
        // other way round produced an image of the transposed size for non-square inputs,
        // which the loop below then read with the wrong row length.
        preprocessImg(temp, input_h_, input_w_);

        // interleaved HWC pixels -> planar CHW floats
        float* dst = input_imgs_.data() + static_cast<size_t>(i - img_idx_) * 3 * plane;
        for (int c = 0; c < 3; c++) {
            for (int h = 0; h < input_h_; h++) {
                for (int w = 0; w < input_w_; w++) {
                    dst[c * plane + static_cast<size_t>(h) * input_w_ + w] = temp.at<cv::Vec3f>(h, w)[c];
                }
            }
        }
    }
    img_idx_ += batchsize_;

    CUDA_CHECK(cudaMemcpy(device_input_, input_imgs_.data(), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Loads the calibration table file into calib_cache_ and hands out a pointer to it.
//
// length receives the number of bytes read. Returns nullptr (with length 0) when caching is
// disabled, the file cannot be opened, or the file is empty.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream table(calib_table_name_, std::ios::binary);
    table >> std::noskipws;  // keep whitespace bytes: the file is copied verbatim
    const bool usable = read_cache_ && table.good();
    if (usable) {
        std::istream_iterator<char> first(table);
        const std::istream_iterator<char> last;
        calib_cache_.insert(calib_cache_.end(), first, last);
    }

    length = calib_cache_.size();
    if (length == 0) {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes `length` bytes starting at `cache` to the calibration table file, replacing its contents.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    const char* bytes = static_cast<const char*>(cache);
    std::ofstream table(calib_table_name_, std::ios::binary);
    table.write(bytes, length);
}


================================================
FILE: detr/common.hpp
================================================
#pragma once

#include <dirent.h>
#include <cuda_runtime_api.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <vector>
#include <unordered_map>
#include <algorithm>
#include "./logging.h"
#include <NvInfer.h>
#include <opencv2/opencv.hpp>

static Logger gLogger;

using namespace nvinfer1;
// Reads a text ".wts" weight file into weightMap.
//
// File layout as parsed here: a decimal blob count, then for every blob its name, a decimal
// element count and that many hexadecimal 32-bit words. Each blob is stored as a kFLOAT
// Weights entry whose values point to a malloc'ed buffer; nothing in this function frees
// those buffers.
//
// file      : path of the weight file
// weightMap : receives one entry per blob, keyed by blob name
void loadWeights(const std::string file, std::unordered_map<std::string, Weights>& weightMap) {
    std::cout << "Loading weights: " << file << std::endl;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(input && "Invalid weight map file: truncated blob header.");

        // Load blob. The element size is sizeof(uint32_t); the previous sizeof(val) was the
        // size of the pointer and allocated twice the needed memory on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x) {
            input >> std::hex >> val[x];
        }
        assert(input && "Invalid weight map file: truncated blob data.");
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }
}

// Returns the product of the first a.nbDims extents of `a` (1 when nbDims is 0 or negative).
int CalculateSize(Dims a) {
    int volume = 1;
    const int rank = a.nbDims;
    int axis = 0;
    while (axis < rank) {
        volume *= a.d[axis];
        ++axis;
    }
    return volume;
}

// Appends the name of every entry of directory p_dir_name, except "." and "..", to file_names.
// Only the bare entry names are stored; the directory is not prepended.
//
// Returns 0 on success and -1 when the directory cannot be opened (file_names is then untouched).
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const std::string entry_name(entry->d_name);
        if (entry_name == "." || entry_name == "..") {
            continue;
        }
        file_names.push_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}

// Prepares an image in place for the network: BGR -> RGB, resize, conversion to 32-bit float,
// scaling by 1/255 and per-channel normalisation.
//
// img  : image to convert; overwritten with the result (CV_32FC3)
//        NOTE(review): presumably an 8-bit BGR image as returned by cv::imread - confirm.
// newh : target height (rows)
// neww : target width (columns)
//
// Mind the parameter order: height comes BEFORE width here, while cv::Size takes (width, height).
void preprocessImg(cv::Mat& img, int newh, int neww) {
    // convert to rgb
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    cv::resize(img, img, cv::Size(neww, newh));
    img.convertTo(img, CV_32FC3);
    img /= 255;
    // subtract the per-channel mean, then divide by the per-channel standard deviation
    // (channel order is RGB at this point)
    img -= cv::Scalar(0.485, 0.456, 0.406);
    img /= cv::Scalar(0.229, 0.224, 0.225);
}

// CUDA_CHECK(call): evaluates a CUDA runtime call once; when it does not return cudaSuccess the
// numeric error code and the source location are printed to std::cerr and assert(0) is hit.
// The assert is compiled out when NDEBUG is defined, in which case execution simply continues
// after the message.
// The macro expands to a braced block, not a single statement, so avoid using it as the
// unbraced body of an if that has an else branch.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
            assert(0);\
        }\
    }
#endif  // CUDA_CHECK


================================================
FILE: detr/detr.cpp
================================================
#pragma once
#include <iostream>
#include <unordered_map>
#include "./logging.h"
#include "backbone.hpp"
#include "calibrator.hpp"

// NOTE(review): presumably the CUDA device index and the engine batch size; their uses are
// not visible in this part of the file - confirm.
#define DEVICE 0
#define BATCH_SIZE 1

// 1 / math.sqrt(head_dim) https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/nn/functional/activation.h#623
// The literal equals 1 / sqrt(D_MODEL / NHEAD) = 1 / sqrt(32); it is not derived from those
// constants automatically, so it has to be updated by hand if either of them changes.
static const float SCALING = 0.17677669529663687;
// Network input resolution (height, width).
static const int INPUT_H = 800;
static const int INPUT_W = 1066;
static const int NUM_CLASS = 92;  // include background
// NOTE(review): the next four look like scalar constants for scale layers / normalisation
// (scale 1, shift 0, power 2, epsilon) - confirm against their uses further down.
static const float SCALING_ONE = 1.0;
static const float SHIFT_ZERO = 0.0;
static const float POWER_TWO = 2.0;
static const float EPS = 0.00001;
// Transformer hyper-parameters; per the README they must match the configuration the
// PyTorch model was trained with.
static const int D_MODEL = 256;
static const int NHEAD = 8;
static const int DIM_FEEDFORWARD = 2048;
static const int NUM_ENCODE_LAYERS = 6;
static const int NUM_DECODE_LAYERS = 6;
static const int NUM_QUERIES = 100;
// NOTE(review): presumably the minimum score for a detection to be kept - confirm.
static const float SCORE_THRESH = 0.5;

// Names of the network input and output tensors.
const char* INPUT_NODE_NAME = "images";
const std::vector<std::string> OUTPUT_NAMES = { "scores", "boxes"};

// Builds the fixed sine/cosine positional embedding as a constant layer.
//
// The spatial size (h, w) is read from dimensions 1 and 2 of `input`. For every
// pixel (i, j) the embedding holds `num_pos_feats` values derived from the row
// coordinate followed by `num_pos_feats` values derived from the column
// coordinate; within each half, even feature indices use sin and odd ones cos.
// Coordinates are 1-based, normalized by (size + eps) and scaled by 2*pi.
//
// The values are identical to computing every (i, j, k) entry independently, but
// since the row half depends only on (i, k) and the column half only on (j, k),
// two small lookup tables are built once and copied per pixel instead of
// materializing h*w*num_pos_feats temporaries.
//
// The buffer is registered in `weightMap` under "pos" so that it is released
// together with the other weights. Returns the output tensor of the constant
// layer, shaped {h * w, 2 * num_pos_feats, 1, 1}.
// refer to https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py#12
ITensor* PositionEmbeddingSine(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
ITensor& input,
int num_pos_feats = 64,
int temperature = 10000
) {
    auto mask_dim = input.getDimensions();
    int h = mask_dim.d[1], w = mask_dim.d[2];

    // normalized, scaled 1-based coordinates
    float eps = 1e-6, scale = 2.0 * 3.1415926;
    std::vector<float> y_embed(h, 0);
    for (int i = 0; i < h; i++) {
        float coord = i + 1;
        y_embed[i] = coord / (h + eps) * scale;
    }
    std::vector<float> x_embed(w, 0);
    for (int j = 0; j < w; j++) {
        float coord = j + 1;
        x_embed[j] = coord / (w + eps) * scale;
    }

    // dim_t: per-feature divisor; consecutive even/odd indices share the same value
    std::vector<float> dim_t(num_pos_feats, 0);
    for (int i = 0; i < num_pos_feats; i++) {
        dim_t[i] = pow(temperature, (2 * (i / 2) / static_cast<float>(num_pos_feats)));
    }

    // enc_y[i * num_pos_feats + k] and enc_x[j * num_pos_feats + k]
    std::vector<float> enc_y(h * num_pos_feats, 0.0f);
    for (int i = 0; i < h; i++) {
        for (int k = 0; k < num_pos_feats; k++) {
            float value_y = y_embed[i] / dim_t[k];
            enc_y[i * num_pos_feats + k] = (k & 1) ? std::cos(value_y) : std::sin(value_y);
        }
    }
    std::vector<float> enc_x(w * num_pos_feats, 0.0f);
    for (int j = 0; j < w; j++) {
        for (int k = 0; k < num_pos_feats; k++) {
            float value_x = x_embed[j] / dim_t[k];
            enc_x[j * num_pos_feats + k] = (k & 1) ? std::cos(value_x) : std::sin(value_x);
        }
    }

    // pos: for each pixel, the row half followed by the column half
    const int count = h * w * num_pos_feats * 2;
    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * count));
    assert(count == 0 || pval != nullptr);
    float *pNext = pval;
    for (int i = 0; i < h; i++) {
        const float *row_enc = enc_y.data() + i * num_pos_feats;
        for (int j = 0; j < w; j++) {
            const float *col_enc = enc_x.data() + j * num_pos_feats;
            for (int k = 0; k < num_pos_feats; k++) {
                *pNext = row_enc[k];
                ++pNext;
            }
            for (int k = 0; k < num_pos_feats; k++) {
                *pNext = col_enc[k];
                ++pNext;
            }
        }
    }
    Weights pos_embed_weight{ DataType::kFLOAT, pval, count };
    weightMap["pos"] = pos_embed_weight;
    auto pos_embed = network->addConstant(Dims4{ h * w, num_pos_feats * 2, 1, 1 }, pos_embed_weight);
    assert(pos_embed);
    return pos_embed->getOutput(0);
}

// Adds a multi-head attention block to `network` and returns its output tensor.
//
// query / key / value are each passed through their own fully connected layer
// (weights "<lname>.in_proj_{weight,bias}_{q,k,v}"), the projected query is
// multiplied by SCALING, all three are reshaped to {-1, num_heads, head_dim} and
// transposed with permutation {1, 0, 2}. The attention weights are
// softmax(q * k^T) and are applied to v; the result is transposed back,
// reshaped to {tgt_len, -1, 1, 1} and passed through the output projection
// ("<lname>.out_proj.{weight,bias}").
ITensor* MultiHeadAttention(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& query,
ITensor& key,
ITensor& value,
int embed_dim = 256,
int num_heads = 8
) {
    const int tgt_len = query.getDimensions().d[0];
    const int head_dim = embed_dim / num_heads;

    // Input projection for one of q / k / v; `suffix` selects the weight pair.
    auto project = [&](ITensor& in, const char* suffix) -> ITensor* {
        auto fc = network->addFullyConnected(
            in,
            embed_dim,
            weightMap[lname + ".in_proj_weight_" + suffix],
            weightMap[lname + ".in_proj_bias_" + suffix]);
        assert(fc);
        return fc->getOutput(0);
    };

    // Reshape to {-1, num_heads, head_dim}, then swap the first two axes.
    auto split_heads = [&](ITensor& in, const std::string& layer_name) -> ITensor* {
        auto shuffle = network->addShuffle(in);
        assert(shuffle);
        shuffle->setName(layer_name.c_str());
        shuffle->setReshapeDimensions(Dims3{ -1, num_heads, head_dim });
        shuffle->setSecondTranspose(Permutation{ 1, 0, 2 });
        return shuffle->getOutput(0);
    };

    ITensor* q_proj = project(query, "q");
    ITensor* k_proj = project(key, "k");
    ITensor* v_proj = project(value, "v");

    // scale the projected query
    auto scale_const = network->addConstant(Dims4{ 1, 1, 1, 1 }, Weights{ DataType::kFLOAT, &SCALING, 1 });
    assert(scale_const);
    auto q_scaled = network->addElementWise(*q_proj, *scale_const->getOutput(0), ElementWiseOperation::kPROD);
    assert(q_scaled);

    ITensor* q_heads = split_heads(*q_scaled->getOutput(0), lname + ".q_shuffle");
    ITensor* k_heads = split_heads(*k_proj, lname + ".k_shuffle");
    ITensor* v_heads = split_heads(*v_proj, lname + ".v_shuffle");

    // attention logits: q * k^T
#if NV_TENSORRT_MAJOR >= 8
    auto logits = network->addMatrixMultiply(*q_heads, nvinfer1::MatrixOperation::kNONE, *k_heads, nvinfer1::MatrixOperation::kTRANSPOSE);
#else
    auto logits = network->addMatrixMultiply(*q_heads, false, *k_heads, true);
#endif
    assert(logits);

    // src_key_padding_mask are all false, so do nothing here
    // see https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/nn/functional/activation.h#826-#839

    auto attn_weights = network->addSoftMax(*logits->getOutput(0));
    assert(attn_weights);
    attn_weights->setAxes(4);  // axes bitmask

    // weighted sum of the values
#if NV_TENSORRT_MAJOR >= 8
    auto context = network->addMatrixMultiply(*attn_weights->getOutput(0), nvinfer1::MatrixOperation::kNONE, *v_heads, nvinfer1::MatrixOperation::kNONE);
#else
    auto context = network->addMatrixMultiply(*attn_weights->getOutput(0), false, *v_heads, false);
#endif
    assert(context);

    // merge the heads again: undo the transpose, then flatten per target position
    auto merge_heads = network->addShuffle(*context->getOutput(0));
    assert(merge_heads);
    merge_heads->setName((lname + ".attn_shuffle").c_str());
    merge_heads->setFirstTranspose(Permutation{ 1, 0, 2 });
    merge_heads->setReshapeDimensions(Dims4{ tgt_len, -1, 1, 1 });

    auto out_proj = network->addFullyConnected(
        *merge_heads->getOutput(0),
        embed_dim,
        weightMap[lname + ".out_proj.weight"],
        weightMap[lname + ".out_proj.bias"]);
    assert(out_proj);

    return out_proj->getOutput(0);
}

// Adds a layer normalization built from primitive layers and returns its output:
//   y = (x - mean(x)) / sqrt(mean((x - mean(x))^2) + EPS) * weight + bias
// Both means are reduce layers with axes bitmask 2 and keepDims = true.
// "<lname>.weight" / "<lname>.bias" are read from `weightMap`. A buffer of
// `d_model` ones is allocated for the power operand of the final per-channel
// scale layer and stored in `weightMap` under "<lname>.power".
// TODO: maybe a better implementation https://github.com/NVIDIA/TensorRT/blob/master/plugin/common/common.cuh#212
ITensor* LayerNorm(
INetworkDefinition *network,
ITensor& input,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
int d_model = 256
) {
    // x - mean(x)
    auto mu = network->addReduce(input, ReduceOperation::kAVG, 2, true);
    assert(mu);
    auto centered = network->addElementWise(input, *mu->getOutput(0), ElementWiseOperation::kSUB);
    assert(centered);
    ITensor* centered_out = centered->getOutput(0);

    // square via a uniform scale layer: (x * 1 + 0) ^ 2
    Weights unit_scale{ DataType::kFLOAT, &SCALING_ONE, 1 };
    Weights zero_shift{ DataType::kFLOAT, &SHIFT_ZERO, 1 };
    Weights square_power{ DataType::kFLOAT, &POWER_TWO, 1 };
    auto squared = network->addScaleNd(*centered_out, ScaleMode::kUNIFORM, zero_shift, unit_scale, square_power, 0);
    assert(squared);

    // variance = mean of the squared deviations
    auto variance = network->addReduce(*squared->getOutput(0), ReduceOperation::kAVG, 2, true);
    assert(variance);

    // sqrt(variance + EPS)
    auto eps_const = network->addConstant(Dims4{ 1, 1, 1, 1 }, Weights{ DataType::kFLOAT, &EPS, 1 });
    assert(eps_const);
    auto variance_eps = network->addElementWise(
        *variance->getOutput(0), *eps_const->getOutput(0), ElementWiseOperation::kSUM);
    assert(variance_eps);
    auto std_dev = network->addUnary(*variance_eps->getOutput(0), UnaryOperation::kSQRT);
    assert(std_dev);

    auto normalized = network->addElementWise(*centered_out, *std_dev->getOutput(0), ElementWiseOperation::kDIV);
    assert(normalized);

    // per-channel affine: power operand is all ones; owned by weightMap afterwards
    float *ones = reinterpret_cast<float*>(malloc(sizeof(float) * d_model));
    for (int c = 0; c < d_model; ++c) {
        ones[c] = 1.0;
    }
    Weights unit_power{ DataType::kFLOAT, ones, d_model };
    weightMap[lname + ".power"] = unit_power;

    auto scaled = network->addScaleNd(
        *normalized->getOutput(0),
        ScaleMode::kCHANNEL,
        weightMap[lname + ".bias"],
        weightMap[lname + ".weight"],
        unit_power,
        1);
    assert(scaled);
    return scaled->getOutput(0);
}

// Adds one post-norm transformer encoder layer and returns its output tensor.
//
//   q = k = src + pos, v = src
//   x = LayerNorm(src + SelfAttention(q, k, v))         ("<lname>.norm1")
//   y = LayerNorm(x + Linear2(ReLU(Linear1(x))))         ("<lname>.norm2")
//
// `d_model` is forwarded to both the attention block and the layer norms so that
// the layer stays consistent when a width other than the default is requested.
ITensor* TransformerEncoderLayer(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& src,
ITensor& pos,
int d_model = 256,
int nhead = 8,
int dim_feedforward = 2048
) {
    // positional embedding is added to query and key only, not to the value
    auto pos_embed = network->addElementWise(src, pos, ElementWiseOperation::kSUM);
    assert(pos_embed);

    ITensor* src2 = MultiHeadAttention(
        network,
        weightMap,
        lname + ".self_attn",
        *pos_embed->getOutput(0),
        *pos_embed->getOutput(0),
        src,
        d_model,
        nhead);

    auto shortcut1 = network->addElementWise(src, *src2, ElementWiseOperation::kSUM);
    assert(shortcut1);

    ITensor* norm1 = LayerNorm(network, *shortcut1->getOutput(0), weightMap, lname + ".norm1", d_model);

    // feed-forward block
    auto linear1 = network->addFullyConnected(
        *norm1,
        dim_feedforward,
        weightMap[lname + ".linear1.weight"],
        weightMap[lname + ".linear1.bias"]);
    assert(linear1);

    auto relu = network->addActivation(*linear1->getOutput(0), ActivationType::kRELU);
    assert(relu);

    auto linear2 = network->addFullyConnected(
        *relu->getOutput(0),
        d_model,
        weightMap[lname + ".linear2.weight"],
        weightMap[lname + ".linear2.bias"]);
    assert(linear2);

    auto shortcut2 = network->addElementWise(*norm1, *linear2->getOutput(0), ElementWiseOperation::kSUM);
    assert(shortcut2);

    ITensor* norm2 = LayerNorm(network, *shortcut2->getOutput(0), weightMap, lname + ".norm2", d_model);
    return norm2;
}

// Chains `num_layers` encoder layers named "<lname>.layers.<i>" and returns the
// output of the last one (or `src` itself when `num_layers` is not positive).
// The same `pos` tensor is passed to every layer.
//
// `d_model`, `nhead` and `dim_feedforward` are forwarded to every layer. They
// default to the values the layers would otherwise fall back to, so existing
// callers that omit them are unaffected.
ITensor* TransformerEncoder(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& src,
ITensor& pos,
int num_layers = 6,
int d_model = 256,
int nhead = 8,
int dim_feedforward = 2048
) {
    ITensor* out = &src;
    for (int i = 0; i < num_layers; i++) {
        std::string layer_name = lname + ".layers." + std::to_string(i);
        out = TransformerEncoderLayer(
            network,
            weightMap,
            layer_name,
            *out,
            pos,
            d_model,
            nhead,
            dim_feedforward);
    }
    return out;
}

// Adds one post-norm transformer decoder layer and returns its output tensor.
//
//   x = LayerNorm(tgt + SelfAttention(q = k = tgt + query_pos, v = tgt))      ("<lname>.norm1")
//   y = LayerNorm(x + Attention(q = x + query_pos, k = memory + pos,
//                               v = memory))                                   ("<lname>.norm2")
//   z = LayerNorm(y + Linear2(ReLU(Linear1(y))))                               ("<lname>.norm3")
//
// `d_model` and `nhead` are forwarded to both attention blocks and `d_model` to
// every layer norm, so the layer stays consistent when non-default sizes are
// requested.
ITensor* TransformerDecoderLayer(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& tgt,
ITensor& memory,
ITensor& pos,
ITensor& query_pos,
int d_model = 256,
int nhead = 8,
int dim_feedforward = 2048
) {
    // self-attention over the queries
    auto pos_embed = network->addElementWise(tgt, query_pos, ElementWiseOperation::kSUM);
    assert(pos_embed);

    ITensor* tgt2 = MultiHeadAttention(
        network,
        weightMap,
        lname + ".self_attn",
        *pos_embed->getOutput(0),
        *pos_embed->getOutput(0),
        tgt,
        d_model,
        nhead);

    auto shortcut1 = network->addElementWise(tgt, *tgt2, ElementWiseOperation::kSUM);
    assert(shortcut1);
    ITensor* norm1 = LayerNorm(network, *shortcut1->getOutput(0), weightMap, lname + ".norm1", d_model);

    // attention from the queries to the encoder memory
    auto query_embed = network->addElementWise(*norm1, query_pos, ElementWiseOperation::kSUM);
    assert(query_embed);

    auto key_embed = network->addElementWise(memory, pos, ElementWiseOperation::kSUM);
    assert(key_embed);

    ITensor* mha2 = MultiHeadAttention(
        network,
        weightMap,
        lname + ".multihead_attn",
        *query_embed->getOutput(0),
        *key_embed->getOutput(0),
        memory,
        d_model,
        nhead);

    auto shortcut2 = network->addElementWise(*norm1, *mha2, ElementWiseOperation::kSUM);
    assert(shortcut2);

    ITensor* norm2 = LayerNorm(network, *shortcut2->getOutput(0), weightMap, lname + ".norm2", d_model);

    // feed-forward block
    auto linear1 = network->addFullyConnected(
        *norm2,
        dim_feedforward,
        weightMap[lname + ".linear1.weight"],
        weightMap[lname + ".linear1.bias"]);
    assert(linear1);

    auto relu = network->addActivation(*linear1->getOutput(0), ActivationType::kRELU);
    assert(relu);

    auto linear2 = network->addFullyConnected(
        *relu->getOutput(0),
        d_model,
        weightMap[lname + ".linear2.weight"],
        weightMap[lname + ".linear2.bias"]);
    assert(linear2);

    auto shortcut3 = network->addElementWise(*norm2, *linear2->getOutput(0), ElementWiseOperation::kSUM);
    assert(shortcut3);

    ITensor* norm3 = LayerNorm(network, *shortcut3->getOutput(0), weightMap, lname + ".norm3", d_model);

    return norm3;
}

// Chains `num_layers` decoder layers named "<lname>.layers.<i>", each fed with
// the previous layer's output plus the shared `memory`, `pos` and `query_pos`
// tensors, then applies the final layer norm "<lname>.norm" and returns its
// output tensor.
ITensor* TransformerDecoder(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& tgt,
ITensor& memory,
ITensor& pos,
ITensor& query_pos,
int num_layers = 6,
int d_model = 256,
int nhead = 8,
int dim_feedforward = 2048
) {
    const std::string layer_prefix = lname + ".layers.";

    ITensor* hidden = &tgt;
    for (int layer = 0; layer < num_layers; ++layer) {
        hidden = TransformerDecoderLayer(
            network, weightMap, layer_prefix + std::to_string(layer),
            *hidden, memory, pos, query_pos,
            d_model, nhead, dim_feedforward);
    }

    return LayerNorm(network, *hidden, weightMap, lname + ".norm", d_model);
}

// Builds the full transformer: the encoder over `src`, then the decoder over a
// zero-initialized target of shape {num_queries, d_model, 1, 1}, using the
// "query_embed.weight" entry of `weightMap` as the query positional embedding.
// The zero target buffer is stored in `weightMap` under "<lname>.tgt_weight".
// Returns the decoder output tensor.
// NOTE: only the layer count is passed on to the encoder here; d_model, nhead
// and dim_feedforward are forwarded to the decoder only.
ITensor* Transformer(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& src,
ITensor& pos_embed,
int num_queries = 100,
int num_encoder_layers = 6,
int num_decoder_layers = 6,
int d_model = 256,
int nhead = 8,
int dim_feedforward = 2048
) {
    ITensor* encoded = TransformerEncoder(
        network, weightMap, lname + ".encoder", src, pos_embed, num_encoder_layers);

    // decoder input: all zeros
    const int tgt_count = num_queries * d_model;
    float *zeros = reinterpret_cast<float*>(malloc(sizeof(float) * num_queries * d_model));
    for (int idx = 0; idx < tgt_count; ++idx) {
        zeros[idx] = 0.0;
    }
    Weights zero_weights{ DataType::kFLOAT, zeros, tgt_count };
    weightMap[lname + ".tgt_weight"] = zero_weights;

    const Dims4 query_dims{ num_queries, d_model, 1, 1 };
    auto tgt_layer = network->addConstant(query_dims, zero_weights);
    assert(tgt_layer);

    // query positional embedding
    auto query_pos_layer = network->addConstant(query_dims, weightMap["query_embed.weight"]);
    assert(query_pos_layer);

    return TransformerDecoder(
        network,
        weightMap,
        lname + ".decoder",
        *tgt_layer->getOutput(0),
        *encoded,
        pos_embed,
        *query_pos_layer->getOutput(0),
        num_decoder_layers,
        d_model,
        nhead,
        dim_feedforward);
}

// Stacks `num_layers` fully connected layers using weights "<lname>.<i>.weight"
// and "<lname>.<i>.bias". Every layer except the last has `hidden_dim` outputs
// and is followed by a ReLU; the last layer has `output_dim` outputs and no
// activation. Returns the final tensor (or `src` when `num_layers` is not
// positive).
ITensor* MLP(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& src,
int num_layers = 3,
int hidden_dim = 256,
int output_dim = 4
) {
    ITensor* current = &src;
    const int last_index = num_layers - 1;

    for (int idx = 0; idx < num_layers; ++idx) {
        const std::string prefix = lname + "." + std::to_string(idx);
        const bool is_last = (idx == last_index);

        auto fc = network->addFullyConnected(
            *current,
            is_last ? output_dim : hidden_dim,
            weightMap[prefix + ".weight"],
            weightMap[prefix + ".bias"]);
        assert(fc);
        current = fc->getOutput(0);

        if (!is_last) {
            auto relu = network->addActivation(*current, ActivationType::kRELU);
            assert(relu);
            current = relu->getOutput(0);
        }
    }
    return current;
}

// Adds the two prediction heads on top of `src` and returns their output
// tensors as { class scores, boxes }:
//   - class scores: fully connected layer with NUM_CLASS outputs
//     ("class_embed.*") followed by a softmax with axes bitmask 2;
//   - boxes: the "bbox_embed.layers" MLP followed by a sigmoid.
std::vector<ITensor*> Predict(
INetworkDefinition *network,
std::unordered_map<std::string, Weights>& weightMap,
ITensor& src
) {
    // classification head
    auto class_logits = network->addFullyConnected(
        src, NUM_CLASS, weightMap["class_embed.weight"], weightMap["class_embed.bias"]);
    assert(class_logits);
    auto class_scores = network->addSoftMax(*class_logits->getOutput(0));
    assert(class_scores);
    class_scores->setAxes(2);

    // box regression head
    ITensor* box_raw = MLP(network, weightMap, "bbox_embed.layers", src);
    auto box_scores = network->addActivation(*box_raw, ActivationType::kSIGMOID);
    assert(box_scores);

    std::vector<ITensor*> heads;
    heads.push_back(class_scores->getOutput(0));
    heads.push_back(box_scores->getOutput(0));
    return heads;
}

// Builds the ResNet-50 DETR network from the weights in `wtsfile` and compiles
// it into an engine.
//
// Parameters:
//   maxBatchSize - passed to IBuilder::setMaxBatchSize.
//   wtsfile      - path handed to loadWeights().
//   builder      - builder used to create the network and the engine.
//   config       - builder config; workspace size and precision flags are set on it.
//   dt           - data type of the network input tensor.
//   modelType    - "fp32", "fp16" or "int8"; any other value throws a string literal.
//
// Returns the built engine, or nullptr if the build failed (an error is printed
// in that case). The network definition and all host weight buffers in the
// weight map are released before returning.
ICudaEngine* createEngine_r50detr(
unsigned int maxBatchSize,
const std::string& wtsfile,
IBuilder* builder,
IBuilderConfig* config,
DataType dt,
const std::string& modelType = "fp16"
) {
    /*
    description: after fuse bn
    */
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_NODE_NAME
    ITensor* data = network->addInput(INPUT_NODE_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });

    // load weights
    std::unordered_map<std::string, Weights> weightMap;
    loadWeights(wtsfile, weightMap);

    // backbone
    auto features = BuildResNet(network, weightMap, *data, R50, 64, 64, 256);
    ITensor* pos_embed = PositionEmbeddingSine(network, weightMap, *features, 128);

    // 1x1 convolution projecting the backbone features to D_MODEL channels
    auto input_proj = network->addConvolutionNd(
        *features,
        D_MODEL,
        DimsHW{ 1, 1 },
        weightMap["input_proj.weight"],
        weightMap["input_proj.bias"]);
    assert(input_proj);
    input_proj->setStrideNd(DimsHW{ 1, 1 });

    // flatten the spatial dimensions, then swap the first two axes
    auto flatten = network->addShuffle(*input_proj->getOutput(0));
    assert(flatten);
    flatten->setReshapeDimensions(Dims4{ input_proj->getOutput(0)->getDimensions().d[0], -1, 1, 1 });
    flatten->setSecondTranspose(Permutation{ 1, 0, 2, 3 });

    auto out1 = Transformer(
        network,
        weightMap,
        "transformer",
        *flatten->getOutput(0),
        *pos_embed,
        NUM_QUERIES,
        NUM_ENCODE_LAYERS,
        NUM_DECODE_LAYERS,
        D_MODEL,
        NHEAD,
        DIM_FEEDFORWARD);
    std::vector<ITensor*> results = Predict(network, weightMap, *out1);

    // build output
    for (size_t i = 0; i < results.size(); i++) {
        network->markOutput(*results[i]);
        results[i]->setName(OUTPUT_NAMES[i].c_str());
    }

    // build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1ULL << 30);

    if (modelType == "fp32") {
        // default precision, no flag needed
    } else if (modelType == "fp16") {
        config->setFlag(BuilderFlag::kFP16);
    } else if (modelType == "int8") {
        std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
        assert(builder->platformHasFastInt8());
        config->setFlag(BuilderFlag::kINT8);
        Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(BATCH_SIZE, INPUT_W, INPUT_H, "./coco_calib/",
        "int8calib.table", INPUT_NODE_NAME);
        config->setInt8Calibrator(calibrator);
    } else {
        throw("does not support model type");
    }

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only claim success when an engine was actually produced.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // destroy network
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return engine;
}

// Builds the DETR engine from `wtsfile` and stores its serialized form in
// `*modelStream`. The caller owns the returned IHostMemory and must destroy it.
// The builder, builder config and engine created here are all destroyed before
// returning.
void BuildDETRModel(unsigned int maxBatchSize, IHostMemory** modelStream,
const std::string& wtsfile, std::string modelType = "fp32") {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine_r50detr(maxBatchSize,
        wtsfile, builder, config, DataType::kFLOAT, modelType);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down; the config was previously never released.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one inference on `stream`.
//   buffers[0]       - device input buffer; receives all of `input`.
//   buffers[1] / [2] - device score / box buffers; copied back into output[0]
//                      and output[1] (BATCH_SIZE * NUM_QUERIES * NUM_CLASS and
//                      BATCH_SIZE * NUM_QUERIES * 4 floats respectively).
// Blocks until the stream has finished. A failed enqueue is reported on
// std::cerr, and the final synchronization is checked like every other CUDA call.
void doInference(IExecutionContext& context, cudaStream_t& stream, std::vector<void*>& buffers,
std::vector<float>& input, std::vector<float*>& output) {
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input.data(), input.size() * sizeof(float),
    cudaMemcpyHostToDevice, stream));

    if (!context.enqueue(BATCH_SIZE, buffers.data(), stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed." << std::endl;
    }

    CUDA_CHECK(cudaMemcpyAsync(output[0], buffers[1], BATCH_SIZE * NUM_QUERIES * NUM_CLASS * sizeof(float),
    cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(output[1], buffers[2], BATCH_SIZE * NUM_QUERIES * 4 * sizeof(float),
    cudaMemcpyDeviceToHost, stream));

    // Errors from the asynchronous work above surface here.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Parses the command line.
//   -s <wts> <engine>   : fills wtsFile and engineFile (serialize mode)
//   -d <engine> <dir>   : fills engineFile and imgDir (inference mode)
// Returns false when fewer than three arguments follow the program name or the
// mode flag is unknown; the output strings are left untouched in that case.
bool parse_args(int argc, char** argv, std::string& wtsFile, std::string& engineFile, std::string& imgDir) {
    if (argc < 4) {
        return false;
    }

    const std::string mode(argv[1]);
    if (mode == "-s") {
        wtsFile = argv[2];
        engineFile = argv[3];
        return true;
    }
    if (mode == "-d") {
        engineFile = argv[2];
        imgDir = argv[3];
        return true;
    }
    return false;
}

// Entry point.
//   ./detr -s <wts> <engine>   builds the engine and writes the serialized plan.
//   ./detr -d <engine> <dir>   loads the plan, runs inference on every file in
//                              <dir> and writes "_<name>" images with the
//                              detections drawn into the working directory.
// Returns 0 on success and -1 on argument or file errors.
int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);

    std::string wtsFile = "";
    std::string engineFile = "";

    std::string imgDir;
    if (!parse_args(argc, argv, wtsFile, engineFile, imgDir)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./detr -s [.wts] [.engine] // serialize model to plan file" << std::endl;
        std::cerr << "./detr -d [.engine] ../samples // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    if (!wtsFile.empty()) {
        IHostMemory* modelStream{ nullptr };
        BuildDETRModel(BATCH_SIZE, &modelStream, wtsFile, "fp32");
        assert(modelStream != nullptr);
        std::ofstream p(engineFile, std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }

    // deserialize the .engine and run inference
    std::ifstream file(engineFile, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engineFile << " error!" << std::endl;
        return -1;
    }

    std::string trtModelStream;
    size_t modelSize{ 0 };
    file.seekg(0, file.end);
    modelSize = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream.resize(modelSize);
    assert(!trtModelStream.empty());
    // write through the string's own mutable storage instead of casting away const
    file.read(&trtModelStream[0], modelSize);
    file.close();

    // build engine
    std::cout << "build engine" << std::endl;
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream.c_str(), modelSize);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    runtime->destroy();

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // prepare input file
    std::vector<std::string> fileList;
    if (read_files_in_dir(imgDir.c_str(), fileList) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // calculate input size
    int input_size = CalculateSize(context->getBindingDimensions(0));

    // prepare input data
    std::vector<float> data(BATCH_SIZE * input_size, 0);
    void *data_d, *scores_d, *boxes_d;
    CUDA_CHECK(cudaMalloc(&data_d, BATCH_SIZE * input_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&scores_d, BATCH_SIZE * NUM_QUERIES * NUM_CLASS * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&boxes_d, BATCH_SIZE * NUM_QUERIES * 4 * sizeof(float)));

    std::vector<float> scores_h(BATCH_SIZE * NUM_QUERIES * NUM_CLASS);
    std::vector<float> boxes_h(BATCH_SIZE * NUM_QUERIES * 4);

    std::vector<void*> buffers = { data_d, scores_d, boxes_d };
    std::vector<float*> outputs = {scores_h.data(), boxes_h.data()};

    int fcount = 0;
    int fileLen = fileList.size();
    for (int f = 0; f < fileLen; f++) {
        // accumulate files until a full batch (or the last file) is reached
        fcount++;
        if (fcount < BATCH_SIZE && f + 1 != fileLen) continue;

        // fill the host input buffer in planar (channel, row, column) order
        for (int b = 0; b < fcount; b++) {
            cv::Mat img = cv::imread(imgDir + "/" + fileList[f - fcount + 1 + b]);
            if (img.empty()) continue;
            preprocessImg(img, INPUT_H, INPUT_W);
            assert(img.cols * img.rows * 3 == input_size);
            for (int c = 0; c < 3; c++) {
                for (int h = 0; h < img.rows; h++) {
                    for (int w = 0; w < img.cols; w++) {
                        data[b * input_size +
                        c * img.rows * img.cols + h * img.cols + w] = img.at<cv::Vec3f>(h, w)[c];
                    }
                }
            }
        }

        // Run inference
        auto start = std::chrono::system_clock::now();

        doInference(*context, stream, buffers, data, outputs);

        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        for (int b = 0; b < fcount; b++) {
            const std::string& fileName = fileList[f - fcount + 1 + b];
            cv::Mat img = cv::imread(imgDir + "/" + fileName);
            // Entries that are not readable images were skipped on input as well;
            // there is nothing to draw on or to write out for them.
            if (img.empty()) continue;

            // Each image only looks at its own slice of the batched outputs.
            const float* scores = scores_h.data() + b * NUM_QUERIES * NUM_CLASS;
            const float* boxes = boxes_h.data() + b * NUM_QUERIES * 4;
            for (int q = 0; q < NUM_QUERIES; q++) {
                // arg-max over the class scores of this query
                int label = -1;
                float score = -1;
                for (int c = 0; c < NUM_CLASS; c++) {
                    if (score < scores[q * NUM_CLASS + c]) {
                        label = c;
                        score = scores[q * NUM_CLASS + c];
                    }
                }
                // the last class index is the background class and is never drawn
                if (score > SCORE_THRESH && label != NUM_CLASS - 1) {
                    // boxes are (cx, cy, w, h), scaled here by the image size
                    float cx = boxes[q * 4];
                    float cy = boxes[q * 4 + 1];
                    float w = boxes[q * 4 + 2];
                    float h = boxes[q * 4 + 3];
                    float x1 = (cx - w / 2.0) * img.cols;
                    float y1 = (cy - h / 2.0) * img.rows;
                    float x2 = (cx + w / 2.0) * img.cols;
                    float y2 = (cy + h / 2.0) * img.rows;
                    cv::Rect r(x1, y1, x2 - x1, y2 - y1);
                    cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
                    cv::putText(img, std::to_string(label), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
                    cv::Scalar(0xFF, 0xFF, 0xFF), 2);
                }
            }
            cv::imwrite("_" + fileName, img);
        }
        fcount = 0;
    }

    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(data_d));
    CUDA_CHECK(cudaFree(scores_d));
    CUDA_CHECK(cudaFree(boxes_d));
    context->destroy();
    engine->destroy();

    return 0;
}


================================================
FILE: detr/gen_wts.py
================================================
import cv2

import torch
from models.transformer import Transformer
from models.position_encoding import PositionEmbeddingSine
from models.backbone import Backbone, Joiner
from models.detr import DETR
import torchvision.transforms as T
from PIL import Image
import struct

def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to (x0, y0, x1, y1).

    The four components are taken from the last dimension of ``x`` and the
    result is stacked back along a new last dimension.
    """
    center_x, center_y, width, height = x.unbind(-1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = [
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    ]
    return torch.stack(corners, dim=-1)

def build_backbone():
    """Build the ResNet-50 backbone joined with a sine position embedding.

    Returns the ``Joiner`` model with ``num_channels`` copied over from the
    backbone.
    """
    hidden_dim = 256
    # Half of the hidden size goes to each of the two spatial axes.
    position_embedding = PositionEmbeddingSine(hidden_dim // 2, normalize=True)

    # Positional arguments after the name: train_backbone=True,
    # return_interm_layers=False, then a final False flag (meaning not visible
    # here; presumably dilation, confirm against Backbone's signature).
    backbone = Backbone('resnet50', True, False, False)

    joined = Joiner(backbone, position_embedding)
    joined.num_channels = backbone.num_channels
    return joined

def gen_wts(model, filename):
    """Write the model's state dict to ``filename + '.wts'`` in text format.

    The first line is the number of entries that follow. Each entry is one
    line: ``<name> <count> `` followed by ``count`` values, each preceded by a
    space and written as the hex of its big-endian float32 encoding.

    Every tensor whose name contains ``in_proj`` is split into three equal
    parts along its first dimension and written as three entries named
    ``<name>_q``, ``<name>_k`` and ``<name>_v``.

    The entry count in the header is derived from the state dict itself (three
    per ``in_proj`` tensor, one per other tensor) rather than a fixed offset,
    so it stays correct for models with a different number of attention layers.
    """
    def _write_entry(f, name, values):
        # One line per entry; layout must match what the reader expects.
        f.write('{} {} '.format(name, len(values)))
        for vv in values:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

    state = model.state_dict()
    num_entries = sum(3 if 'in_proj' in k else 1 for k in state.keys())

    # The context manager closes the file even if writing fails part-way.
    with open(filename + '.wts', 'w') as f:
        f.write('{}\n'.format(num_entries))
        for k, v in state.items():
            if 'in_proj' in k:
                dim = int(v.size(0) / 3)
                parts = (
                    ('_q', v[:dim]),
                    ('_k', v[dim:2 * dim]),
                    ('_v', v[2 * dim:]),
                )
                for suffix, part in parts:
                    _write_entry(f, k + suffix, part.reshape(-1).cpu().numpy())
            else:
                _write_entry(f, k, v.reshape(-1).cpu().numpy())

def main():
    """Rebuild the DETR ResNet-50 model, load the checkpoint and export 'detr.wts'.

    The checkpoint is mapped to CPU on load and the model is moved to CUDA
    only when a GPU is available: the export copies every tensor to CPU
    anyway, so a GPU is not required to generate the weights file.
    """
    num_classes = 91
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    backbone = build_backbone()

    transformer = Transformer(
        d_model=256,
        dropout=0.1,
        nhead=8,
        dim_feedforward=2048,
        num_encoder_layers=6,
        num_decoder_layers=6,
        normalize_before=False,
        return_intermediate_dec=True,
    )

    model = DETR(
        backbone,
        transformer,
        num_classes=num_classes,
        num_queries=100,
        aux_loss=True,
    )
    # map_location keeps loading independent of the device the checkpoint was saved from.
    checkpoint = torch.load('./detr-r50-e632da11.pth', map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    model.to(device)
    model.eval()

    gen_wts(model, "detr")

    # test
    # with torch.no_grad():
    #     transform = T.Compose([T.Resize(800), T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
    #     im = Image.open('./image/demo.jpg')
    #     img = transform(im).unsqueeze(0)

    #     img = img.to(device)
    #     res = model(img)

    #     logits = res['pred_logits']
    #     pred_boxes = res['pred_boxes']
    #     out_prob = logits.softmax(-1)[0, :, :-1]
    #     keep = out_prob.max(-1).values > 0.5
    #     label = out_prob[keep].argmax(dim=1)
    #     out_bbox = pred_boxes[0, keep]
    #     out_bbox = out_bbox.to(torch.device('cpu'))
    #     out_bbox = box_cxcywh_to_xyxy(out_bbox)
    #     out_bbox = out_bbox * torch.tensor([640, 480, 640, 480])
    #     image = cv2.imread('./image/demo.jpg')
    #     for ob in out_bbox:
    #         x0 = int(ob[0].item())
    #         y0 = int(ob[1].item())
    #         x1 = int(ob[2].item())
    #         y1 = int(ob[3].item())
    #         cv2.rectangle(image, (x0, y0), (x1, y1), (0,0,255), 1)

    #     cv2.imwrite('res.jpg', image)

# Run the export only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

================================================
FILE: detr/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

#include "macros.h"


using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream whenever it is synchronized
//!  (for example by std::endl or std::flush) and when it is destroyed.
//!  Each emitted chunk is preceded by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and the configured prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
 public:
    //!
    //! \param stream The stream that receives the formatted output
    //! \param prefix Text inserted between the timestamp and the buffered message
    //! \param shouldLog When false, buffered text is discarded instead of being written to \p stream
    //!
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog) {}

    //!
    //! \brief Creates a buffer with the same target stream, prefix and logging flag as \p other.
    //!
    //! All three settings must be carried over: leaving mShouldLog uninitialized makes every later
    //! putOutput() read an indeterminate value. Text still pending in \p other is not transferred
    //! (the std::stringbuf base is default-constructed); \p other emits it itself when destroyed.
    //!
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the two differ there is text that has not been emitted yet, so emit it now
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    //!
    //! \brief Synchronizes the stream buffer: emits the buffered text and resets the buffer
    //! \return always 0 (success)
    //!
    int sync() override {
        putOutput();
        return 0;
    }

    //!
    //! \brief Writes "[timestamp] <prefix><buffered text>" to the target stream and empties the buffer.
    //!
    //! When logging is disabled nothing is written, but the buffer is still emptied so that suppressed
    //! text cannot accumulate or resurface after a later setShouldLog(true).
    //!
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp; it goes to the same stream as the message so both stay together
            // when the target is std::cerr or any stream other than std::cout
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                // the fill character is sticky, so remember the stream's current one and restore it afterwards
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // flush the stream
            mOutput.flush();
        }
        // set the buffer to empty (also when the text was suppressed)
        str("");
    }

    //! \brief Enables or disables forwarding of buffered text to the target stream
    void setShouldLog(bool shouldLog) {
        mShouldLog = shouldLog;
    }

 private:
    std::ostream& mOutput;  //!< Stream that receives the formatted messages
    std::string mPrefix;    //!< Prefix written after the timestamp
    bool mShouldLog;        //!< Whether buffered text is written or discarded
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!  Placing the buffer in a base class lets it be constructed before the std::ostream base of
//!  LogStreamConsumer, which is handed the buffer's address.
//!
class LogStreamConsumerBase {
 public:
    //! \brief Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor
    LogStreamConsumerBase(std::ostream& output, const std::string& messagePrefix, bool enabled)
        : mBuffer(output, messagePrefix, enabled) {}

 protected:
    //! The stream buffer; protected so that the derived class can pass its address on
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief std::ostream front end that lets log messages of one fixed severity be written with stream syntax.
//!  The base classes are listed as LogStreamConsumerBase first and std::ostream second on purpose:
//!  LogStreamConsumerBase owns the LogStreamConsumerBuffer, and that buffer has to be constructed before
//!  its address is handed to std::ostream. Swapping the bases would pass the address of an
//!  unconstructed buffer. Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
 public:
    //! \brief Creates a LogStreamConsumer that logs messages with the given severity.
    //!  A message is emitted only when \p severity <= \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity),
                                isReportable(severity, reportableSeverity))
        , std::ostream(&mBuffer)  // attach the already-constructed buffer to the stream
        , mShouldLog(isReportable(severity, reportableSeverity))
        , mSeverity(severity) {}

    //! \brief Builds a fresh consumer with the severity and logging flag of \p other
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer)  // attach the already-constructed buffer to the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer's severity is reportable and updates the buffer accordingly
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = isReportable(mSeverity, reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

 private:
    //! \brief True when a message of \p severity passes the \p reportableSeverity threshold
    static bool isReportable(Severity severity, Severity reportableSeverity) {
        return severity <= reportableSeverity;
    }

    //! \brief std::cout for kINFO and above in enum value, std::cerr otherwise
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Bracketed one-letter tag for the given severity
    static std::string severityPrefix(Severity severity) {
        switch (severity) {
        case Severity::kINTERNAL_ERROR:
            return "[F] ";
        case Severity::kERROR:
            return "[E] ";
        case Severity::kWARNING:
            return "[W] ";
        case Severity::kINFO:
            return "[I] ";
        case Severity::kVERBOSE:
            return "[V] ";
        default:
            assert(0);
            return "";
        }
    }

    bool mShouldLog;     //!< Cached result of the severity comparison
    Severity mSeverity;  //!< Severity of every message written through this consumer
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
 public:
    //!
    //! \param severity Initial reportable severity; defaults to Severity::kWARNING
    //!
    explicit Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] <msg>" through a temporary LogStreamConsumer built from the current reportable
    //! severity and the message severity. A null \p msg is logged as an empty message: constructing a
    //! std::string from a null pointer is undefined behaviour, and this function may be declared noexcept.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        const char* text = (msg != nullptr) ? msg : "";
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << text << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
     public:
        TestAtom(TestAtom&&) = default;

     private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline) {}

        bool mStarted;         //!< Set by Logger::reportTestStart()
        std::string mName;     //!< Test name printed in result lines
        std::string mCmdline;  //!< Command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // check the precondition first so that a violated contract does not print a second RUNNING line
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed; returns EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed; returns EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived; returns EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the severity threshold currently used by log()
    Severity getReportableSeverity() const {
        return mReportableSeverity;
    }

 private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  //!< Threshold passed to every LogStreamConsumer created by log()
};

namespace {

// Each helper below builds a LogStreamConsumer for one fixed severity, using the reportable
// severity currently configured on the given Logger as the threshold.

//!
//! \brief creates a LogStreamConsumer for messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: detr/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

// API: decoration for symbols that cross a shared-library boundary.
//   - API_EXPORTS defined (building the library): export the symbol.
//   - API_EXPORTS not defined (consuming the library): import on MSVC, nothing elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
// must expand to exactly __declspec(dllimport); any trailing token breaks every declaration using API
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// For TensorRT major version >= 8 these expand to `noexcept` / `const`; for older versions they
// expand to nothing, so the same source can override virtual methods of either API generation.
// NV_TENSORRT_MAJOR is presumably provided by the NvInfer.h include above.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: docker/README.md
================================================
# Tutorials

## Introduction

This folder contains the docker and docker-compose file to build the development environment without pain.

## Prerequisites

* OS: Linux or WSL2
* docker
* nvidia-container-toolkit
* (Optional but **recommended**) docker-compose

## Usage

1. (With docker-compose) Configure the `.env` file: set `DATA_DIR` to your mount point, such as your code or data folder. If you do not need this mount, comment out the `volumes` section in the docker compose file.

2. Build image:
```bash
docker compose -f tensorrtx-docker-compose.yml build
```

3. Run a container in the background:
```bash
docker compose -f tensorrtx-docker-compose.yml up -d
```

4. Attach to this container with your IDE and have fun!

## HowTos

### How to build and run with docker?

``` bash
docker build -f docker/x86_64.dockerfile -t <image_name> .
docker run -it --gpus all --privileged --net=host --ipc=host -v <host_dir>:<container_dir> <image_name> /bin/bash
```

### How to build image with other TensorRT version?

Change the `TAG` on top of the `.dockerfile`. Note: all images are officially owned by NVIDIA NGC, which requires a registration before pulling. For this repo, the mainly used `TAG` would be:

| Container Image | Container OS | Driver | CUDA | TensorRT | Torch | Recommended |
| :----: | :----: | :----: | :----: | :----: | :----: | :----: |
| 20.12-py3 | Ubuntu 20.04 | 455 | 11.2 | 7.2.2 | 1.8.0 | ❌ |
| 24.01-py3 | Ubuntu 22.04 | 545 | 12.3 | 8.6.1 | 2.2.0 | ✅ |
| 24.04-py3 | Ubuntu 22.04 | 545 | 12.4 | 8.6.3 | 2.3.0 | ✅ |
| 24.09-py3 | Ubuntu 22.04 | 560 | 12.6 | 10.4.0 | 2.5.0 | ✅ |

For more detail of the support matrix, please check [HERE](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)

### How to customize the opencv in the image?

If prebuilt package from apt cannot meet your requirements, please refer to the demo code in `.dockerfile` to build opencv from source.

### How to solve failures when building image?

For *443 timeout* or any similar network issues, a proxy may be required. To make your host proxy available to the docker build environment, change the `build` node inside the docker-compose file like this:
```YAML
    build:
      dockerfile: x86_64.dockerfile
      args:
        HTTP_PROXY: ${PROXY}
        HTTPS_PROXY: ${PROXY}
        ALL_PROXY: ${PROXY}
        http_proxy: ${PROXY}
        https_proxy: ${PROXY}
        all_proxy: ${PROXY}
```
then add `PROXY="http://xxx:xxx"` in `.env` file

## Note

Support for older versions, such as TensorRT **< 8**, may be deprecated in the future.


================================================
FILE: docker/tensorrtx-docker-compose.yml
================================================
# Compose definition of the development container.
# Expects DATA_DIR (and HOME) to be available for variable substitution, e.g. from the `.env` file.
services:
  tensorrt:
    image: tensortx:1.0.1
    container_name: tensortx
    environment:
      # make every GPU visible inside the container
      - NVIDIA_VISIBLE_DEVICES=all
    build:
      # image is built from the dockerfile in this folder
      dockerfile: x86_64.dockerfile
    cap_add:
      - CAP_SYS_ADMIN
    security_opt:
      - seccomp:unconfined
    privileged: true
    # keep stdin open and allocate a TTY so the container can be attached to interactively
    stdin_open: true
    tty: true
    shm_size: '8gb'
    ulimits:
      # -1 = no limit on locked memory
      memlock:
        soft: -1
        hard: -1
    devices:
      # expose all host devices read/write
      - /dev:/dev:rw
    volumes:
      #### user ####
      # host home directory, mounted read/write
      - ${HOME}:/workspace/localhome:rw
      #### custom ####
      # the named volume `mount` declared at the bottom of this file (bound to ${DATA_DIR})
      - mount:/mnt:rw
    deploy:
      restart_policy:
        # restart at most once, 5s after a failure
        condition: on-failure
        max_attempts: 1
        delay: 5s
      resources:
        reservations:
          devices:
            # reserve all NVIDIA GPUs for this service
            - driver: nvidia
              capabilities: [gpu]
              count: all

volumes:
  # named volume implemented as a bind mount of the host directory ${DATA_DIR}
  mount:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: ${DATA_DIR}


================================================
FILE: docker/x86_64.dockerfile
================================================
# Tag of the NVIDIA NGC TensorRT base image; override with `--build-arg TAG=<tag>`
ARG TAG=24.01-py3

FROM nvcr.io/nvidia/tensorrt:${TAG} AS tensorrtx

# use the key=value form; the space-separated `ENV key value` form is legacy syntax
ENV DEBIAN_FRONTEND=noninteractive

# basic tools
# (apt-get is used throughout: the `apt` front end warns that its CLI is not stable for scripts)
RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends \
sudo wget curl git ca-certificates ninja-build tzdata pkg-config \
gdb libglib2.0-dev libmount-dev locales \
&& rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir yapf isort cmake-format pre-commit

## fix a potential pre-commit error
RUN locale-gen "en_US.UTF-8"

## override older cmake
## (removes the cmake-* directories under /usr/local/share, then installs CMake 3.30.0 into /usr/local)
RUN find /usr/local/share -type d -name "cmake-*" -exec rm -rf {} + \
&& curl -fsSL "https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.sh" \
-o cmake.sh && bash cmake.sh --skip-license --exclude-subdir --prefix=/usr/local && rm cmake.sh

## prebuilt OpenCV from apt
RUN apt-get update && apt-get install -y \
libopencv-dev \
&& rm -rf /var/lib/apt/lists/*

## a template to build opencv and opencv_contrib from source
# RUN git clone -b 4.x https://github.com/opencv/opencv_contrib.git \
# && git clone -b 4.x https://github.com/opencv/opencv.git opencv \
# && cmake -S opencv -B opencv/build -G Ninja \
# -DBUILD_LIST=core,calib3d,imgproc,imgcodecs,highgui \
# -DOPENCV_EXTRA_MODULES_PATH="/workspace/opencv_contrib/modules" \
# -DCMAKE_BUILD_TYPE=RELEASE \
# -DCMAKE_INSTALL_PREFIX=/usr/local \
# -DENABLE_FAST_MATH=ON \
# -DOPENCV_GENERATE_PKGCONFIG=ON \
# -DBUILD_opencv_python2=OFF \
# -DBUILD_opencv_python3=OFF \
# -DBUILD_JAVA=OFF \
# -DBUILD_DOCS=OFF \
# -DBUILD_PERF_TESTS=OFF \
# -DBUILD_TESTS=OFF \
# && ninja -C opencv/build install


================================================
FILE: efficient_ad/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.12)
project(EfficientAD-M)

add_definitions(-w)
add_definitions(-D API_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CUDA_ARCHITECTURES 61 75 86 89)
set(THREADS_PREFER_PTHREAD_FLAG ON)
# The slash-style flag is only meaningful to MSVC; other compilers would treat "/od" as an input
# file, so it is added for MSVC only.
# NOTE(review): the MSVC switch that disables optimization is spelled "/Od" - confirm the intended spelling.
if(MSVC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /od")
endif()

# The locations below default to the original author's Windows installation. They are cache
# variables, so they can be overridden without editing this file, e.g.
#   cmake -DTRT_DIR=/opt/TensorRT -DCUDA_TOOLKIT_DIR=/usr/local/cuda ..

### nvcc
set(CUDA_TOOLKIT_DIR "D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8"
    CACHE PATH "CUDA toolkit root directory")
# Pin the compiler only when the user has not chosen one and the default nvcc actually exists;
# otherwise let enable_language(CUDA) locate it.
if(NOT DEFINED CMAKE_CUDA_COMPILER AND EXISTS "${CUDA_TOOLKIT_DIR}/bin/nvcc.exe")
    set(CMAKE_CUDA_COMPILER "${CUDA_TOOLKIT_DIR}/bin/nvcc.exe")
endif()
enable_language(CUDA)
### cuda
include_directories("${CUDA_TOOLKIT_DIR}/include")
link_directories("${CUDA_TOOLKIT_DIR}/lib/x64")
### tensorrt
set(TRT_DIR "D:/Program Files/NVIDIA GPU Computing Toolkit/TensorRT-8.5.3.1/"
    CACHE PATH "TensorRT root directory")
include_directories(${TRT_DIR}/include)
link_directories(${TRT_DIR}/lib)
### opencv
set(OpenCV_DIR "E:/OpenCV/OpenCV_4.6.0/opencv/build" CACHE PATH "Directory containing OpenCVConfig.cmake")
# REQUIRED: stop at configure time with a clear message instead of failing later at compile/link time
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
### dirent
set(DIRENT_INCLUDE_DIR "E:/SDK/dirent-1.24/include" CACHE PATH "Include directory of the dirent port")
include_directories("${DIRENT_INCLUDE_DIR}")

include_directories(${PROJECT_SOURCE_DIR}/src/)
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)

add_executable(efficientAD_det "./efficientAD_det.cpp" ${SRCS})
target_link_libraries(efficientAD_det nvinfer
                                      cudart
                                      nvinfer_plugin
                                      ${OpenCV_LIBS}
                                      )


================================================
FILE: efficient_ad/README.md
================================================
# EfficientAd

EfficientAd: Accurate Visual Anomaly Detection at Millisecond-Level Latencies.

The Pytorch implementation is [openvinotoolkit/anomalib](https://github.com/openvinotoolkit/anomalib).

<p align="center">
<img src="https://github.com/wang-xinyu/tensorrtx/assets/15235574/061c90a7-fe59-48e0-a8d0-6bddc4296cf1">
</p>

# Test Environment

RTX3080 / Windows10 22H2 / cuda11.8 / cudnn8.9.7 / TensorRT8.5.3 / OpenCV4.6

# How to Run

1. training to generate weight files (`efficientAD_[category].pt`)

   ```
   // Please refer to Anomalib's tutorial for details:
   // https://github.com/openvinotoolkit/anomalib?tab=readme-ov-file#-training
   ```

2. generate `.wts` from pytorch with `.pt`

   ```
   cd ./datas/models/
   // copy your `.pt` file to the current directory.
   python gen_wts.py
   // a file `efficientAD_[category].wts` will be generated.
   ```

3. build and run

   ```
   mkdir build
   cd build
   cmake ..
   make
   sudo ./efficientAD_det -s [.wts] [.engine] // serialize model to plan file
   sudo ./efficientAD_det -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed
   ```

# Latency

Average cost of `infer()` (in `efficientAD_det.cpp`), measured from the second run onward with batch=1 in the Windows environment above:

|               | FP32 |
| :-----------: | :--: |
| EfficientAD-M | 12ms |


================================================
FILE: efficient_ad/efficientAD_det.cpp
================================================
#include <cuda_runtime.h>

#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <opencv2/opencv.hpp>

#include "config.h"
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "utils.h"

using namespace nvinfer1;

// Logger instance passed to the TensorRT builder and runtime created in this file.
static Logger gLogger;
// const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Number of floats in one input image (3 * 256 * 256) and in one output map (1 * 256 * 256).
const static int kInputSize = 3 * 256 * 256;
const static int kOutputSize = 1 * 256 * 256;

// Parses the command line.
//   <prog> -s <wts> <engine>      -> fills `wts` and `engine`
//   <prog> -d <engine> <img_dir>  -> fills `engine` and `img_dir`
// Returns false (leaving all outputs untouched) for any other argument count or flag.
// `gd` and `gw` are accepted but never read or written here.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw,
                std::string& img_dir) {
    // both modes take exactly: program name, flag, and two paths
    if (argc != 4) {
        return false;
    }

    const std::string mode(argv[1]);
    const bool serialize_mode = (mode == "-s");
    const bool deserialize_mode = (mode == "-d");
    if (!serialize_mode && !deserialize_mode) {
        return false;
    }

    if (serialize_mode) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
    } else {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
    }
    return true;
}

// Allocates the device input/output buffers and the host output buffer used for inference.
// The engine is only inspected through assertions: it must expose exactly two bindings, with
// kInputTensorName at index 0 and kOutputTensorName at index 1.
// The host buffer is created with new[]; nothing in this function releases any of the buffers.
void prepare_infer_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer,
                           float** cpu_output_buffer) {
    // assert(engine->getNbIOTensors() == 2);
    assert(engine->getNbBindings() == 2);

    // Look up the binding slots by tensor name and verify the expected layout.
    const int input_slot = engine->getBindingIndex(kInputTensorName);
    const int output_slot = engine->getBindingIndex(kOutputTensorName);
    assert(input_slot == 0);
    assert(output_slot == 1);

    // Device-side buffers
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(gpu_input_buffer),
                          kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(gpu_output_buffer),
                          kBatchSize * 1 * kOutputSize * sizeof(float)));  // 3 or 1 ??

    // Host-side output buffer
    float* host_output = new float[kBatchSize * kOutputSize];
    *cpu_output_buffer = host_output;
}

// In-place preprocessing of `img`: BGR -> RGB, resize to neww x newh, convert to 32-bit float,
// scale by 1/255, then subtract the per-channel mean and divide by the per-channel std
// (the ImageNet normalization constants).
void preprocessImg(cv::Mat& img, int newh, int neww) {
    const cv::Size target_size(neww, newh);
    const cv::Scalar channel_mean(0.485, 0.456, 0.406);
    const cv::Scalar channel_std(0.229, 0.224, 0.225);

    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    cv::resize(img, img, target_size);
    img.convertTo(img, CV_32FC3);

    // ImageNet normalize
    img /= 255.0f;
    img -= channel_mean;
    img /= channel_std;
}

// Runs one inference pass on `stream` and blocks until the results are on the host.
//   gpu_buffers[0] / gpu_buffers[1] : device input / output buffers
//   cpu_input_data                  : host input; all of its elements are uploaded
//   cpu_output_data                 : host output; receives batchsize * kOutputSize floats
// A failed enqueue is reported on std::cerr and a failed stream synchronization goes through
// CUDA_CHECK like the other CUDA calls here, so neither failure goes unnoticed.
void infer(IExecutionContext& context, cudaStream_t& stream, std::vector<void*>& gpu_buffers,
           std::vector<float>& cpu_input_data, std::vector<float>& cpu_output_data, int batchsize) {
    // copy input data from host (CPU) to device (GPU)
    CUDA_CHECK(cudaMemcpyAsync(gpu_buffers[0], cpu_input_data.data(), cpu_input_data.size() * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    // execute inference using context provided by engine; enqueue() reports failure via its return value
    if (!context.enqueue(batchsize, gpu_buffers.data(), stream, nullptr)) {
        std::cerr << "IExecutionContext::enqueue failed" << std::endl;
    }
    // copy output back from device (GPU) to host (CPU)
    CUDA_CHECK(cudaMemcpyAsync(cpu_output_data.data(), gpu_buffers[1], batchsize * kOutputSize * sizeof(float),
                               cudaMemcpyDeviceToHost, stream));
    // wait for all queued work; errors from the asynchronous operations above can surface here
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name,
                      std::string& engine_name) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = nullptr;
    engine = build_efficientAD_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
    assert(engine != nullptr);

    // Serialize the engine
    IHostMemory* serialized_engine = engine->serialize();
    assert(serialized_engine != nullptr);

    // Save engine to file
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "Could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    // Close everything down
    engine->destroy();
    config->destroy();
    serialized_engine->destroy();
    builder->destroy();
}

// Reads the serialized plan in `engine_name` and creates the runtime, the engine and an
// execution context from it, returning them through the three output pointers.
// Failures are handled through assert only.
// NOTE(review): with assertions compiled out, an unreadable file is not stopped by the check
// below and execution continues with whatever tellg() returned - confirm this is acceptable.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream plan_file(engine_name, std::ios::binary);
    if (!plan_file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }

    // Determine the file length by seeking to the end, then rewind
    plan_file.seekg(0, plan_file.end);
    size_t plan_size = plan_file.tellg();
    plan_file.seekg(0, plan_file.beg);

    // Load the whole plan into a temporary host buffer
    char* plan_bytes = new char[plan_size];
    assert(plan_bytes);
    plan_file.read(plan_bytes, plan_size);
    plan_file.close();

    // runtime -> engine -> execution context
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(plan_bytes, plan_size);
    assert(*engine != nullptr);
    *context = (*engine)->createExecutionContext();
    assert(*context);

    // the temporary host copy of the plan is released once the engine has been created
    delete[] plan_bytes;
}

int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);

    std::string wts_name = "";
    std::string engine_name = "";
    float gd = 1.0f, gw = 1.0f;
    std::string img_dir;

    if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./efficientad_det -s [.wts] [.engine]  // serialize model to plan file" << std::endl;
        std::cerr
                << "./efficientad_det -d [.engine] [../../datas/images/...]  // deserialize plan file and run inference"
                << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(kBatchSize, gd, gw, wts_name, engine_name);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);

    // create CUDA stream for simultaneous CUDA operations
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // prepare cpu and gpu buffers
    void *gpu_input_buffer, *gpu_output_buffer;
    CUDA_CHECK(cudaMalloc(&gpu_input_buffer, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&gpu_output_buffer, kBatchSize * 1 * kOutputSize * sizeof(float)));  // 3 or 1 ??
    std::vector<void*> gpu_buffers = {gpu_input_buffer, gpu_output_buffer};
    std::vector<float> cpu_input_data(kBatchSize * kInputSize, 0);
    std::vector<float> cpu_output_data(kBatchSize * kOutputSize, 0);

    // read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    std::vector<cv::Mat> originImg_batch;
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;

        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            originImg_batch.push_back(img.clone());
            preprocessImg(img, kInputW, kInputH);
            assert(img.cols * img.rows * 3 == 3 * 256 * 256);
            for (int c = 0; c < 3; c++) {
                for (int h = 0; h < img.rows; h++) {
                    for (int w = 0; w < img.cols; w++) {
                        cpu_input_data[c * img.rows * img.cols + h * img.cols + w] = img.at<cv::Vec3f>(h, w)[c];
                    }
                }
            }
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }

        // Run inference
        auto start = std::chrono::system_clock::now();
        // infer(*context, stream, (void**)gpu_buffers, cpu_input_data, cpu_output_buffer, kBatchSize);
        infer(*context, stream, gpu_buffers, cpu_input_data, cpu_output_data,
              kBatchSize);  // change to save into vec `cpu_output_data`
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;

        // postProcess
        cv::Mat img_1(256, 256, CV_8UC1);
        for (int row = 0; row < 256; row++) {
            for (int col = 0; col < 256; col++) {
                float value = cpu_output_data[row * 256 + col];
                if (value < 0)  // clip(0,1)
                    value = 0;
                else if (value > 1)
                    value = 1;
                img_1.at<uchar>(row, col) = static_cast<uchar>(value * 255);
            }
        }

        cv::Mat HeatMap, colorMap;
        // genHeatMap(img_batch[0], img_1, HeatMap);
        cv::applyColorMap(img_1, colorMap, cv::COLORMAP_JET);
        cv::resize(originImg_batch[i], originImg_batch[i], cv::Size(256, 256));
        cv::cvtColor(originImg_batch[i], originImg_batch[i], cv::COLOR_RGB2BGR);
        cv::addWeighted(originImg_batch[i], 0.5, colorMap, 0.5, 0, HeatMap);

        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_output" + img_name_batch[j], img_1);
            cv::imwrite("_heatmap" + img_name_batch[j], HeatMap);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(gpu_buffers[0]));
    CUDA_CHECK(cudaFree(gpu_buffers[1]));

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    return 0;
}


================================================
FILE: efficient_ad/src/config.h
================================================
#pragma once

/* --------------------------------------------------------
 * These configs are related to the TensorRT model. If any of them is changed,
 * please re-compile and re-serialize the TensorRT model.
 * --------------------------------------------------------*/

// Precision selector macro. For INT8, you need to prepare a calibration dataset.
// NOTE(review): the original comment ended with a dangling "please refer to" and named
// no document - add the intended reference here.
#define USE_FP32  // set USE_INT8 or USE_FP16 or USE_FP32

// These are used to define input/output tensor names,
// you can set them to whatever you want.
const static char* kInputTensorName = "data";
const static char* kOutputTensorName = "prob";

// Batch size used when building the engine and when running inference.
constexpr static int kBatchSize = 1;

// input width and height must be divisible by 32
constexpr static int kInputH = 256;
constexpr static int kInputW = 256;

/* --------------------------------------------------------
 * These configs are NOT related to the TensorRT model. If any of them is changed,
 * please re-compile, but there is no need to re-serialize the TensorRT model.
 * --------------------------------------------------------*/

// default GPU_id
const static int kGpuId = 0;

// If your image size is larger than 4096 * 3112, please increase this value
const static int kMaxInputImageSize = 4096 * 3112;


================================================
FILE: efficient_ad/src/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

#include <cassert>
#include <iostream>

#ifndef CUDA_CHECK
// Evaluates the CUDA runtime call `callstr` exactly once. If it does not return
// cudaSuccess, prints the numeric error code, its description and the call site
// (file:line) to stderr, then trips assert(0).
// The brace-block form is kept so existing call sites with or without a trailing
// semicolon keep compiling.
#define CUDA_CHECK(callstr)                                                                     \
    {                                                                                           \
        cudaError_t error_code = (callstr);                                                     \
        if (error_code != cudaSuccess) {                                                        \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                  \
            assert(0);                                                                          \
        }                                                                                       \
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: efficient_ad/src/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream, preceded by a
//!        "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix, whenever it is synchronized
//!        (and once more on destruction if unflushed characters remain).
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    destination stream for the formatted output
    //! \param prefix    text inserted between the timestamp and the buffered message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Carries over the destination stream, the prefix and the logging flag of \p other.
    //! Characters already buffered in \p other are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and the buffered text to the destination stream, then empties
    //! the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp, written to the same stream as the message itself
            // note: std::localtime returns a pointer to shared static storage
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            // the fill character is sticky on a stream: switch to '0' for the zero-padded fields
            // and restore the caller's fill character afterwards
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            mOutput.fill(previousFill);
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer. Being a base class,
//!        it is constructed before the std::ostream base that receives the buffer's address.
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& target, const std::string& tag, bool enabled)
        : mBuffer(target, tag, enabled) {}

   protected:
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream that lets callers log a message of a fixed severity with C++ stream syntax.
//!  The base classes are listed as LogStreamConsumerBase first and std::ostream second on purpose:
//!  LogStreamConsumerBase owns the LogStreamConsumerBuffer, and the address of that buffer is handed
//!  to std::ostream, so the buffer has to be initialized before std::ostream is.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  A message is emitted only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity),
                                isReportable(reportableSeverity, severity)),
          std::ostream(&mBuffer),  // links the stream buffer with the stream
          mShouldLog(isReportable(reportableSeverity, severity)),
          mSeverity(severity) {}

    //! Builds a fresh buffer configured from the severity and logging flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // links the stream buffer with the stream
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = isReportable(reportableSeverity, mSeverity);
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! True when a message of \p severity passes the \p reportableSeverity threshold.
    static bool isReportable(Severity reportableSeverity, Severity severity) {
        return severity <= reportableSeverity;
    }

    //! kINFO and above (by enum value) go to std::cout, everything else to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Short textual tag for each severity; unknown values trip an assert and yield "".
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \param severity Initial reportable severity; defaults to Severity::kWARNING.
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! Each call builds a temporary LogStreamConsumer for the given severity and streams
    //! "[TRT] " followed by the message and std::endl into it.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // Only Logger (a friend) can construct a TestAtom from its parts.
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    //! Note that the RUNNING line is printed before the precondition is asserted.
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as PASSED and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as FAILED and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as WAIVED and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() or reportFail() depending on \p pass.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the currently configured reportable severity.
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    //! Not referenced by any other member of this class.
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>" on the stream chosen for kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

// Factory helpers, one per severity. Each returns a LogStreamConsumer built from the
// logger's current reportable severity and the helper's fixed message severity.
namespace {

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: efficient_ad/src/macros.h
================================================
// Portability macros: symbol export/import decoration (API) and TensorRT-version
// dependent qualifiers (TRT_NOEXCEPT, TRT_CONST_ENQUEUE).
#ifndef __MACROS_H
#define __MACROS_H

#include <NvInfer.h>

// API: export decoration when API_EXPORTS is defined, import decoration otherwise.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
// Non-MSVC compilers need no decoration on the importing side.
#define API
#endif
#endif  // API_EXPORTS

// For TensorRT major version 8 and above these expand to `noexcept` and `const`;
// for older versions they expand to nothing.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: efficient_ad/src/model.cpp
================================================
#include "model.h"

#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <opencv2/opencv.hpp>
#include <string>
#include <vector>

#include "config.h"

using namespace nvinfer1;

// Loads a TensorRT .wts weight file into a name -> Weights map.
//
// The file is whitespace delimited: first the number of blobs, then for every blob
//   [name] [size] <size values, each a 32-bit word written in hex>
//
// Every blob is stored as DataType::kFLOAT. The value buffers are allocated with
// malloc() and are not released here; the returned Weights only point at them.
// Malformed input trips an assert, like the other checks in this file.
static std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // A counted loop (rather than `while (count--)`) cannot run away on a negative count.
    for (int32_t blob = 0; blob < count; ++blob) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and size of blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(input && "Malformed weight blob header.");

        // Load blob: one 32-bit word per value
        uint32_t* val = nullptr;
        if (size > 0) {
            val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
            assert(val != nullptr && "Out of memory while loading weights.");
            for (uint32_t x = 0; x < size; ++x) {
                input >> std::hex >> val[x];
            }
            assert(input && "Truncated weight blob.");
        }

        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Debug helper: prints, for every layer of `network`, its name, the data type of its
// first input and first output tensor, and the dimensions of all inputs and outputs.
// Layers without inputs/outputs and unset tensors are reported instead of dereferenced.
void printNetworkLayers(INetworkDefinition* network) {
    int numLayers = network->getNbLayers();

    auto dataTypeToString = [](DataType type) {
        switch (type) {
            case DataType::kFLOAT:
                return "kFLOAT";
            case DataType::kHALF:
                return "kHALF";
            case DataType::kINT8:
                return "kINT8";
            case DataType::kINT32:
                return "kINT32";
            case DataType::kBOOL:
                return "kBOOL";
            default:
                return "Unknown";
        }
    };

    // Prints "(d0, d1, ...)" for a tensor, or "(unset)" when the tensor pointer is null.
    auto printShape = [](ITensor* tensor) {
        if (tensor == nullptr) {
            std::cout << "(unset)";
            return;
        }
        Dims dims = tensor->getDimensions();
        std::cout << "(";
        for (int k = 0; k < dims.nbDims; ++k) {
            std::cout << dims.d[k];
            if (k < dims.nbDims - 1) {
                std::cout << ", ";
            }
        }
        std::cout << ")";
    };

    for (int i = 0; i < numLayers; ++i) {
        ILayer* layer = network->getLayer(i);
        const int inTensorNum = layer->getNbInputs();
        const int outTensorNum = layer->getNbOutputs();

        // A layer may have no inputs at all, so index 0 is only touched when it exists.
        ITensor* firstIn = inTensorNum > 0 ? layer->getInput(0) : nullptr;
        ITensor* firstOut = outTensorNum > 0 ? layer->getOutput(0) : nullptr;

        std::cout << "--- Layer" << i << " = " << layer->getName() << std::endl;
        std::cout << "input & output tensor type: " << (firstIn ? dataTypeToString(firstIn->getType()) : "n/a")
                  << "\t" << (firstOut ? dataTypeToString(firstOut->getType()) : "n/a") << std::endl;

        // input
        for (int j = 0; j < inTensorNum; ++j) {
            std::cout << "input shape[" << j << "]: ";
            printShape(layer->getInput(j));
            std::cout << "\t";
        }
        std::cout << std::endl;

        // output
        for (int j = 0; j < outTensorNum; ++j) {
            std::cout << "output shape: ";
            printShape(layer->getOutput(j));
        }
        std::cout << "\n" << std::endl;
    }
}

// Adds a per-channel scale layer on `input` with shift = {-0.485, -0.456, -0.406}
// and scale = {1/0.229, 1/0.224, 1/0.225}. Returns the new layer.
//
// The weight arrays have static storage duration: the Weights handed to addScale()
// only carry pointers, and those must still be valid when the engine is built, long
// after this function has returned.
//
// NOTE(review): a scale layer computes (x * scale + shift), so this yields
// x / std - mean rather than (x - mean) / std. Left numerically unchanged here -
// confirm against the reference preprocessing before altering it.
static IScaleLayer* NormalizeInput(INetworkDefinition* network, ITensor& input) {
    static const float meanValues[3] = {-0.485f, -0.456f, -0.406f};
    static const float stdValues[3] = {1.0f / 0.229f, 1.0f / 0.224f, 1.0f / 0.225f};
    Weights meanWeights{DataType::kFLOAT, meanValues, 3};
    Weights stdWeights{DataType::kFLOAT, stdValues, 3};

    IScaleLayer* NormaLayer = network->addScale(input, ScaleMode::kCHANNEL, meanWeights, stdWeights, Weights{});
    assert(NormaLayer != nullptr);

    return NormaLayer;
}

// Normalizes `input` per channel as (x - mean) / std using the "mean_std.mean" and
// "mean_std.std" entries of `weightMap`. Implemented as two scale layers: the first
// applies the shift -mean, the second applies the scale 1/std. Returns the second layer.
//
// The shift/scale buffers are malloc'ed and intentionally not freed here, because the
// network keeps pointing at them after this function returns.
static IScaleLayer* NormalizeTeacherMap(INetworkDefinition* network, std::map<std::string, Weights>& weightMap,
                                        ITensor& input) {
    // operator[] silently inserts an empty entry for a missing key, so verify the
    // weights are really there before dereferencing them.
    const Weights& meanWts = weightMap["mean_std.mean"];
    const Weights& stdWts = weightMap["mean_std.std"];
    assert(meanWts.values != nullptr && meanWts.count > 0 && "mean_std.mean missing from weight map");
    assert(stdWts.values != nullptr && stdWts.count >= meanWts.count && "mean_std.std missing or too short");

    const float* mean = static_cast<const float*>(meanWts.values);
    const float* stdDev = static_cast<const float*>(stdWts.values);
    const int len = static_cast<int>(meanWts.count);

    // 1.scale
    float* scaleVal = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    assert(scaleVal != nullptr);
    for (int i = 0; i < len; i++) {
        scaleVal[i] = 1.0 / stdDev[i];
    }
    Weights scale{DataType::kFLOAT, scaleVal, len};

    // 2.shift
    float* shiftVal = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    assert(shiftVal != nullptr);
    for (int i = 0; i < len; i++) {
        shiftVal[i] = -mean[i];
    }
    Weights shift{DataType::kFLOAT, shiftVal, len};

    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, Weights{}, Weights{});
    assert(scale_1);
    IScaleLayer* scale_2 = network->addScale(*scale_1->getOutput(0), ScaleMode::kCHANNEL, Weights{}, scale, Weights{});
    assert(scale_2);

    return scale_2;
}

// Rescales an anomaly map as ((x - qa) / (qb - qa)) * 0.1, where qa and qb are the
// "quantiles.qa_<name>" and "quantiles.qb_<name>" entries of `weightMap`.
// Built from three uniform scale layers (shift, divide, multiply by 0.1); returns the last one.
//
// The weight buffers handed to the network are malloc'ed and intentionally not freed
// here, because the network keeps pointing at them after this function returns.
//
// NOTE(review): the layers use ScaleMode::kUNIFORM while the first two weights carry
// `len` values; this presumably relies on the quantiles being single scalars - confirm.
static ILayer* NormalizeFinalMap(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                                 std::string name) {
    // operator[] silently inserts an empty entry for a missing key, so verify the
    // quantiles are really there before dereferencing them.
    const Weights& qaWts = weightMap["quantiles.qa_" + name];
    const Weights& qbWts = weightMap["quantiles.qb_" + name];
    assert(qaWts.values != nullptr && qaWts.count > 0 && "quantile qa missing from weight map");
    assert(qbWts.values != nullptr && qbWts.count >= qaWts.count && "quantile qb missing or too short");

    const float* qa = static_cast<const float*>(qaWts.values);
    const float* qb = static_cast<const float*>(qbWts.values);
    const int len = static_cast<int>(qaWts.count);

    // fmap_st - qa_st
    float* shiftVal_1 = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    assert(shiftVal_1 != nullptr);
    for (int i = 0; i < len; i++) {
        shiftVal_1[i] = -qa[i];
    }
    Weights qa_shiftWeight_1{DataType::kFLOAT, shiftVal_1, len};
    IScaleLayer* mapNorm_subLayer_1 =
            network->addScale(input, ScaleMode::kUNIFORM, qa_shiftWeight_1, Weights{}, Weights{});
    assert(mapNorm_subLayer_1);

    // (fmap_st - qa_st) / (qb_st - qa_st)
    // The range qb - qa is only needed to form its reciprocal, so no buffer is kept for it.
    float* scaleVal_1 = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    assert(scaleVal_1 != nullptr);
    for (int i = 0; i < len; i++) {
        const float range = qb[i] - qa[i];
        scaleVal_1[i] = 1.0f / range;
    }
    Weights scaleWeight_1{DataType::kFLOAT, scaleVal_1, len};
    IScaleLayer* mapNorm_divLayer_1 = network->addScale(*mapNorm_subLayer_1->getOutput(0), ScaleMode::kUNIFORM,
                                                        Weights{}, scaleWeight_1, Weights{});
    assert(mapNorm_divLayer_1);

    // ((fmap_st - qa_st) / (qb_st - qa_st)) * 0.1
    // The weight has a count of 1, so a single value is all that needs to be allocated.
    float* scaleVal_2 = reinterpret_cast<float*>(malloc(sizeof(float)));
    assert(scaleVal_2 != nullptr);
    scaleVal_2[0] = 0.1f;
    Weights scaleWeight_2{DataType::kFLOAT, scaleVal_2, 1};
    IScaleLayer* mapNorm_Layer = network->addScale(*mapNorm_divLayer_1->getOutput(0), ScaleMode::kUNIFORM, Weights{},
                                                   scaleWeight_2, Weights{});
    assert(mapNorm_Layer);

    return mapNorm_Layer;
}

/**
 * Appends a 2-D convolution, optionally followed by a ReLU activation, to `network`.
 *
 * Kernel and bias are looked up in `weightMap` under `lname + ".weight"` and `lname + ".bias"`.
 * Note that `std::map::operator[]` inserts an empty entry when a key is missing.
 *
 * @param network   network the layers are added to
 * @param weightMap weights keyed by layer name
 * @param input     tensor fed to the convolution
 * @param outch     number of output feature maps
 * @param ksize     square kernel size
 * @param s         stride applied in both spatial dimensions
 * @param p         padding applied in both spatial dimensions
 * @param g         number of convolution groups
 * @param lname     weight-key prefix, also used as the convolution layer name
 * @param withRelu  when false the convolution layer itself is returned
 * @return the ReLU layer, or the convolution layer if `withRelu` is false
 */
static ILayer* convRelu(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                        int outch, int ksize, int s, int p, int g, std::string lname, bool withRelu) {
    IConvolutionLayer* conv1 = network->addConvolutionNd(
            input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".weight"],
            weightMap[lname + ".bias"]);  // if without bias weights, the results won't match with torch version
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{p, p});
    conv1->setNbGroups(g);
    conv1->setName(lname.c_str());

    if (!withRelu)
        return conv1;

    auto relu = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    assert(relu);

    return relu;
}

/**
 * Adds a resize layer on `input`.
 *
 * `upsampleScale` is handed to IResizeLayer::setOutputDimensions, i.e. it is used as the
 * output dimensions of the layer; `resizeMode` selects the interpolation mode.
 *
 * @return the newly created resize layer
 */
static IResizeLayer* interpolate(INetworkDefinition* network, ITensor& input, Dims upsampleScale,
                                 ResizeMode resizeMode) {
    IResizeLayer* resized = network->addResize(input);
    assert(resized);

    resized->setOutputDimensions(upsampleScale);
    resized->setResizeMode(resizeMode);
    return resized;
}

/**
 * Resizes `input` to `dim` x `dim` with linear interpolation (first dimension unchanged),
 * then applies a convolution followed by ReLU.
 *
 * Convolution weights are read from `weightMap` under `lname + ".weight"` / `lname + ".bias"`,
 * and the convolution layer is named `lname + ".conv"`.
 *
 * @param outch number of output feature maps
 * @param ksize square kernel size
 * @param s     stride in both spatial dimensions
 * @param p     padding in both spatial dimensions
 * @param g     number of convolution groups
 * @param dim   spatial size produced by the resize step
 * @return the ReLU layer that ends the block
 */
static ILayer* interpConvRelu(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                              int outch, int ksize, int s, int p, int g, std::string lname, int dim) {
    // Resize stage: keep the leading dimension, force both spatial dimensions to `dim`.
    IResizeLayer* upsample = network->addResize(input);
    assert(upsample != nullptr);
    const auto leadingDim = input.getDimensions().d[0];
    upsample->setOutputDimensions(Dims3{leadingDim, dim, dim});
    upsample->setResizeMode(ResizeMode::kLINEAR);

    // Convolution stage.
    const std::string weightKey = lname + ".weight";
    const std::string biasKey = lname + ".bias";
    IConvolutionLayer* conv = network->addConvolutionNd(*upsample->getOutput(0), outch, DimsHW{ksize, ksize},
                                                        weightMap[weightKey], weightMap[biasKey]);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});
    conv->setNbGroups(g);
    const std::string convName = lname + ".conv";
    conv->setName(convName.c_str());

    // Activation stage.
    auto activation = network->addActivation(*conv->getOutput(0), ActivationType::kRELU);
    assert(activation);
    return activation;
}

/**
 * Adds an average-pooling layer with a square window to `network`.
 *
 * @param kernelSize window size in both spatial dimensions
 * @param stride     stride in both spatial dimensions
 * @param padding    padding in both spatial dimensions
 * @return the newly created pooling layer
 */
static IPoolingLayer* avgPool2d(INetworkDefinition* network, ITensor& input, int kernelSize, int stride, int padding) {
    const DimsHW window{kernelSize, kernelSize};
    IPoolingLayer* pooled = network->addPooling(input, PoolingType::kAVERAGE, window);
    assert(pooled);

    pooled->setStride(DimsHW{stride, stride});
    pooled->setPadding(DimsHW{padding, padding});
    return pooled;
}

/**
 * Splits `input` into two halves along its first dimension and appends both output
 * tensors to `layer_vec` (first half, then second half).
 *
 * Each half has size d[0] / 2 (integer division), so with an odd first dimension the
 * last index is not covered by either slice.
 */
static void slice(INetworkDefinition* network, ITensor& input, std::vector<ITensor*>& layer_vec) {
    const Dims dims = input.getDimensions();
    const auto half = dims.d[0] / 2;

    ISliceLayer* firstHalf =
            network->addSlice(input, Dims3{0, 0, 0}, Dims3{half, dims.d[1], dims.d[2]}, Dims3{1, 1, 1});
    assert(firstHalf);

    ISliceLayer* secondHalf =
            network->addSlice(input, Dims3{half, 0, 0}, Dims3{half, dims.d[1], dims.d[2]}, Dims3{1, 1, 1});
    assert(secondHalf);

    layer_vec.push_back(firstHalf->getOutput(0));
    layer_vec.push_back(secondHalf->getOutput(0));
}

/**
 * Averages two maps: computes 0.5 * input1 + 0.5 * input2.
 *
 * Each input is scaled by 0.5 with a uniform scale layer and the two results are summed
 * element-wise.
 *
 * @return the element-wise SUM layer producing the merged map
 */
static IElementWiseLayer* mergeMap(INetworkDefinition* network, ITensor& input1, ITensor& input2) {
    // The scale factor lives in static storage: the network only stores the pointer held by
    // `Weights`, so the value has to outlive this call, and a heap buffer allocated here
    // (as before) was never released.
    static const float kHalf = 0.5f;
    Weights scaleWeight{DataType::kFLOAT, &kHalf, 1};

    IScaleLayer* mergeMapLayer1 = network->addScale(input1, ScaleMode::kUNIFORM, Weights{}, scaleWeight, Weights{});
    assert(mergeMapLayer1);

    IScaleLayer* mergeMapLayer2 = network->addScale(input2, ScaleMode::kUNIFORM, Weights{}, scaleWeight, Weights{});
    assert(mergeMapLayer2);

    IElementWiseLayer* mergedMapLayer = network->addElementWise(
            *mergeMapLayer1->getOutput(0), *mergeMapLayer2->getOutput(0), ElementWiseOperation::kSUM);
    assert(mergedMapLayer);

    return mergedMapLayer;
}

/**
 * Builds the EfficientAD network (autoencoder, teacher PDN, student PDN and the
 * post-processing that merges the two anomaly maps) and compiles it into an engine.
 *
 * @param maxBatchSize forwarded to IBuilder::setMaxBatchSize
 * @param builder      builder used to create the network and the engine
 * @param config       builder configuration; workspace size and precision flags are set here
 * @param dt           data type of the input tensor
 * @param gd           not referenced by this function
 * @param gw           not referenced by this function
 * @param wts_name     path handed to loadWeights()
 * @return the engine returned by IBuilder::buildEngineWithConfig (not checked for null here)
 *
 * The weight buffers held in the weight map are freed before returning.
 */
ICudaEngine* build_efficientAD_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt,
                                      float& gd, float& gw, std::string& wts_name) {
    /* create network object */
    INetworkDefinition* network = builder->createNetworkV2(0U);

    /* create input tensor {3, kInputH, kInputW} */
    ITensor* InputData = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW});
    assert(InputData);

    /* create weight map */
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    /* AE */
    // auto BN1 = NormalizeInput(network, *InputData);
    // encoder
    auto enconv1 = convRelu(network, weightMap, *InputData, 32, 4, 2, 1, 1, "ae.encoder.enconv1", true);
    auto enconv2 = convRelu(network, weightMap, *enconv1->getOutput(0), 32, 4, 2, 1, 1, "ae.encoder.enconv2", true);
    auto enconv3 = convRelu(network, weightMap, *enconv2->getOutput(0), 64, 4, 2, 1, 1, "ae.encoder.enconv3", true);
    auto enconv4 = convRelu(network, weightMap, *enconv3->getOutput(0), 64, 4, 2, 1, 1, "ae.encoder.enconv4", true);
    auto enconv5 = convRelu(network, weightMap, *enconv4->getOutput(0), 64, 4, 2, 1, 1, "ae.encoder.enconv5", true);
    auto enconv6 = convRelu(network, weightMap, *enconv5->getOutput(0), 64, 8, 1, 0, 1, "ae.encoder.enconv6", false);
    // decoder
    auto deconv1 = interpConvRelu(network, weightMap, *enconv6->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv1", 3);
    auto deconv2 = interpConvRelu(network, weightMap, *deconv1->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv2", 8);
    auto deconv3 = interpConvRelu(network, weightMap, *deconv2->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv3", 15);
    auto deconv4 = interpConvRelu(network, weightMap, *deconv3->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv4", 32);
    auto deconv5 = interpConvRelu(network, weightMap, *deconv4->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv5", 63);
    auto deconv6 =
            interpConvRelu(network, weightMap, *deconv5->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv6", 127);
    auto deconv7 = interpConvRelu(network, weightMap, *deconv6->getOutput(0), 64, 3, 1, 1, 1, "ae.decoder.deconv7", 56);
    auto deconv8 = convRelu(network, weightMap, *deconv7->getOutput(0), 384, 3, 1, 1, 1, "ae.decoder.deconv8", false);

    /* PDN_medium_teacher */
    // no BN added after the convolutional layer
    auto teacher1 = convRelu(network, weightMap, *InputData, 256, 4, 1, 0, 1, "teacher.conv1", true);
    auto avgPool1 = avgPool2d(network, *teacher1->getOutput(0), 2, 2, 0);
    auto teacher2 = convRelu(network, weightMap, *avgPool1->getOutput(0), 512, 4, 1, 0, 1, "teacher.conv2", true);
    auto avgPool2 = avgPool2d(network, *teacher2->getOutput(0), 2, 2, 0);
    auto teacher3 = convRelu(network, weightMap, *avgPool2->getOutput(0), 512, 1, 1, 0, 1, "teacher.conv3", true);
    auto teacher4 = convRelu(network, weightMap, *teacher3->getOutput(0), 512, 3, 1, 0, 1, "teacher.conv4", true);
    auto teacher5 = convRelu(network, weightMap, *teacher4->getOutput(0), 384, 4, 1, 0, 1, "teacher.conv5", true);
    auto teacher6 = convRelu(network, weightMap, *teacher5->getOutput(0), 384, 1, 1, 0, 1, "teacher.conv6", false);

    /* PDN_medium_student */
    auto student1 = convRelu(network, weightMap, *InputData, 256, 4, 1, 0, 1, "student.conv1", true);
    auto avgPool3 = avgPool2d(network, *student1->getOutput(0), 2, 2, 0);
    auto student2 = convRelu(network, weightMap, *avgPool3->getOutput(0), 512, 4, 1, 0, 1, "student.conv2", true);
    auto avgPool4 = avgPool2d(network, *student2->getOutput(0), 2, 2, 0);
    auto student3 = convRelu(network, weightMap, *avgPool4->getOutput(0), 512, 1, 1, 0, 1, "student.conv3", true);
    auto student4 = convRelu(network, weightMap, *student3->getOutput(0), 512, 3, 1, 0, 1, "student.conv4", true);
    auto student5 = convRelu(network, weightMap, *student4->getOutput(0), 768, 4, 1, 0, 1, "student.conv5", true);
    auto student6 = convRelu(network, weightMap, *student5->getOutput(0), 768, 1, 1, 0, 1, "student.conv6", false);

    /* postCalculate */
    auto normal_teacher_output = NormalizeTeacherMap(network, weightMap, *teacher6->getOutput(0));
    // The 768-channel student output is split in two: the first half is compared with the
    // teacher, the second half with the autoencoder.
    std::vector<ITensor*> layer_vec{};
    slice(network, *student6->getOutput(0), layer_vec);
    ITensor* y_st = layer_vec[0];
    ITensor* y_stae = layer_vec[1];

    // distance_st
    IElementWiseLayer* sub_st =
            network->addElementWise(*normal_teacher_output->getOutput(0), *y_st, ElementWiseOperation::kSUB);
    assert(sub_st);
    IElementWiseLayer* distance_st =
            network->addElementWise(*sub_st->getOutput(0), *sub_st->getOutput(0), ElementWiseOperation::kPROD);
    assert(distance_st);

    // distance_stae
    IElementWiseLayer* sub_stae = network->addElementWise(*deconv8->getOutput(0), *y_stae, ElementWiseOperation::kSUB);
    assert(sub_stae);
    IElementWiseLayer* distance_stae =
            network->addElementWise(*sub_stae->getOutput(0), *sub_stae->getOutput(0), ElementWiseOperation::kPROD);
    assert(distance_stae);

    IReduceLayer* map_st = network->addReduce(*distance_st->getOutput(0), ReduceOperation::kAVG, 1, true);
    assert(map_st);
    IReduceLayer* map_stae = network->addReduce(*distance_stae->getOutput(0), ReduceOperation::kAVG, 1, true);
    assert(map_stae);

    IPaddingLayer* padMap_st = network->addPadding(*map_st->getOutput(0), DimsHW{4, 4}, DimsHW{4, 4});
    assert(padMap_st);
    IPaddingLayer* padMap_stae = network->addPadding(*map_stae->getOutput(0), DimsHW{4, 4}, DimsHW{4, 4});
    assert(padMap_stae);

    IResizeLayer* interpMap_st =
            interpolate(network, *padMap_st->getOutput(0),
                        Dims3{padMap_st->getOutput(0)->getDimensions().d[0], 256, 256}, ResizeMode::kLINEAR);
    assert(interpMap_st);
    IResizeLayer* interpMap_stae =
            interpolate(network, *padMap_stae->getOutput(0),
                        Dims3{padMap_stae->getOutput(0)->getDimensions().d[0], 256, 256}, ResizeMode::kLINEAR);
    assert(interpMap_stae);

    ILayer* normalizedMap_st = NormalizeFinalMap(network, weightMap, *interpMap_st->getOutput(0), "st");
    assert(normalizedMap_st);
    ILayer* normalizedMap_stae = NormalizeFinalMap(network, weightMap, *interpMap_stae->getOutput(0), "ae");
    assert(normalizedMap_stae);

    // Merge the student-teacher map with the student-autoencoder map. Passing the
    // student-teacher map twice would drop the autoencoder branch from the output.
    IElementWiseLayer* mergedMapLayer =
            mergeMap(network, *normalizedMap_st->getOutput(0), *normalizedMap_stae->getOutput(0));
    assert(mergedMapLayer);
    printNetworkLayers(network);

    /* output */
    mergedMapLayer->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*mergedMapLayer->getOutput(0));

    /* Engine config */
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    // NOTE(review): the calibrator is allocated here and never deleted in this function.
    Int8EntropyCalibrator2* calibrator =
            new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return engine;
}


================================================
FILE: efficient_ad/src/model.h
================================================
#pragma once

#include <NvInfer.h>

#include <string>

// Builds the EfficientAD TensorRT network and compiles it into an engine (defined in model.cpp).
//   maxBatchSize    - forwarded to IBuilder::setMaxBatchSize.
//   builder, config - used to create the network definition and to build the engine.
//   dt              - data type of the network input tensor.
//   gd, gw          - NOTE(review): not referenced by the definition in model.cpp.
//   wts_name        - path of the weight file handed to loadWeights().
// Returns the engine produced by IBuilder::buildEngineWithConfig().
nvinfer1::ICudaEngine* build_efficientAD_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                                nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd,
                                                float& gw, std::string& wts_name);


================================================
FILE: efficient_ad/src/postprocess.h
================================================
#pragma once

#include <opencv2/opencv.hpp>

// Renders an anomaly map as a JET-coloured heat map blended 50/50 with the original image.
//   originImg      - image the heat map is overlaid on.
//   anomalyGrayMap - input map that is colour-mapped (read only here).
//   HeatMap        - output: 0.5 * originImg + 0.5 * colour map.
// cv::addWeighted requires both inputs to have the same size and channel count, so
// originImg presumably has to be a 3-channel image of the same size as the map.
// Declared inline because the definition lives in a header.
inline void genHeatMap(cv::Mat originImg, cv::Mat& anomalyGrayMap, cv::Mat& HeatMap) {
    cv::Mat colorMap;
    // applyColorMap(src, dst, colormap): the anomaly map is the source, colorMap the destination.
    cv::applyColorMap(anomalyGrayMap, colorMap, cv::COLORMAP_JET);
    cv::addWeighted(originImg, 0.5, colorMap, 0.5, 0, HeatMap);
}


================================================
FILE: efficient_ad/src/utils.h
================================================
#pragma once

#include <dirent.h>
#include <cstring>
#include <fstream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

// Appends the name of every entry in directory `p_dir_name` to `file_names`, skipping the
// "." and ".." entries. Only the bare entry names are stored; the directory path is not
// prepended. Existing elements of `file_names` are kept.
// Returns 0 on success, or -1 if the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const char* entry_name = entry->d_name;
        const bool is_self = (strcmp(entry_name, ".") == 0);
        const bool is_parent = (strcmp(entry_name, "..") == 0);
        if (is_self || is_parent) {
            continue;
        }
        file_names.push_back(std::string(entry_name));
    }

    closedir(dir_handle);
    return 0;
}


================================================
FILE: efficientnet/CMakeLists.txt
================================================
# Build script for the standalone `efficientnet` executable (TensorRT + CUDA runtime).
cmake_minimum_required(VERSION 2.6)

project(efficientnet)

# Passes -std=c++11 straight to the compiler (CMAKE_CXX_STANDARD is also set below).
add_definitions(-std=c++11)

# NOTE(review): option() is documented as option(<variable> "<help_text>" [value]); with only
# two arguments "OFF" is taken as the help text and the option keeps its default value (OFF).
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is hard-coded to Debug.
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# Single-source executable linked against TensorRT (nvinfer) and the CUDA runtime (cudart).
add_executable(efficientnet  ${PROJECT_SOURCE_DIR}/efficientnet.cpp)
target_link_libraries(efficientnet nvinfer)
target_link_libraries(efficientnet cudart)

# Extra compiler flags: optimisation level and pthread support.
add_definitions(-O2 -pthread)


================================================
FILE: efficientnet/README.md
================================================
# EfficientNet

A TensorRT implementation of EfficientNet.
For the Pytorch implementation, you can refer to [EfficientNet-PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch)

## How to run

1. install `efficientnet_pytorch`
```
pip install efficientnet_pytorch
```

2. generate `.wts` file
```
python gen_wts.py
```

3. build

```
mkdir build
cd build
cmake ..
make
```
4. serialize model to engine
```
./efficientnet -s [.wts] [.engine] [b0 b1 b2 b3 ... b7]  // serialize model to engine file
```
such as
```
./efficientnet -s ../efficientnet-b3.wts efficientnet-b3.engine b3
```
5. deserialize and do infer
```
./efficientnet -d [.engine] [b0 b1 b2 b3 ... b7]   // deserialize engine file and run inference
```
such as 
```
./efficientnet -d efficientnet-b3.engine b3
```
6. check that the output matches the PyTorch result


For more models, please refer to [tensorrtx](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: efficientnet/efficientnet.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "utils.hpp"

#define USE_FP32 //USE_FP16
#define INPUT_NAME "data"
#define OUTPUT_NAME "prob"
#define MAX_BATCH_SIZE 8

using namespace nvinfer1;
static Logger gLogger;

// Base per-stage block configuration shared by all backbones; createEngine() rescales the
// filter counts and repeat counts of each entry with roundFilters()/roundRepeats().
// NOTE(review): the field order is defined by BlockArgs in utils.hpp; presumably
// {num_repeat, kernel_size, stride, expand_ratio, input_filters, output_filters, se_ratio, id_skip}
// as in EfficientNet-PyTorch -- confirm against the struct definition.
static std::vector<BlockArgs>
	block_args_list = {
		BlockArgs{1, 3, 1, 1, 32, 16, 0.25, true},
		BlockArgs{2, 3, 2, 6, 16, 24, 0.25, true},
		BlockArgs{2, 5, 2, 6, 24, 40, 0.25, true},
		BlockArgs{3, 3, 2, 6, 40, 80, 0.25, true},
		BlockArgs{3, 5, 1, 6, 80, 112, 0.25, true},
		BlockArgs{4, 5, 2, 6, 112, 192, 0.25, true},
		BlockArgs{1, 3, 1, 6, 192, 320, 0.25, true}};

// Per-backbone global parameters, keyed by the backbone name given on the command line.
// Besides b0..b7 (the names listed in the usage text) the table also has "b8" and "l2" entries.
static std::map<std::string, GlobalParams>
	global_params_map = {
		// input_h,input_w,num_classes,batch_norm_epsilon,
		// width_coefficient,depth_coefficient,depth_divisor, min_depth
		{"b0", GlobalParams{224, 224, 1000, 0.001, 1.0, 1.0, 8, -1}},
		{"b1", GlobalParams{240, 240, 1000, 0.001, 1.0, 1.1, 8, -1}},
		{"b2", GlobalParams{260, 260, 1000, 0.001, 1.1, 1.2, 8, -1}},
		{"b3", GlobalParams{300, 300, 1000, 0.001, 1.2, 1.4, 8, -1}},
		{"b4", GlobalParams{380, 380, 1000, 0.001, 1.4, 1.8, 8, -1}},
		{"b5", GlobalParams{456, 456, 1000, 0.001, 1.6, 2.2, 8, -1}},
		{"b6", GlobalParams{528, 528, 1000, 0.001, 1.8, 2.6, 8, -1}},
		{"b7", GlobalParams{600, 600, 1000, 0.001, 2.0, 3.1, 8, -1}},
		{"b8", GlobalParams{672, 672, 1000, 0.001, 2.2, 3.6, 8, -1}},
		{"l2", GlobalParams{800, 800, 1000, 0.001, 4.3, 5.3, 8, -1}},
};

// Builds the EfficientNet network described by `block_args_list` / `global_params` and
// compiles it into a TensorRT engine.
//   maxBatchSize    - forwarded to IBuilder::setMaxBatchSize.
//   builder, config - used to create the network and build the engine.
//   dt              - data type of the input tensor.
//   path_wts        - weight file handed to loadWeights().
//   block_args_list - base per-stage block settings; filters and repeats are rescaled here.
//   global_params   - backbone-specific input size, class count and scaling settings.
// Returns the built engine (asserted non-null). Weight buffers are freed before returning.
ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, std::string path_wts, std::vector<BlockArgs> block_args_list, GlobalParams global_params)
{
	float bn_eps = global_params.batch_norm_epsilon;
	DimsHW image_size = DimsHW{global_params.input_h, global_params.input_w};

	std::map<std::string, Weights> weightMap = loadWeights(path_wts);
	INetworkDefinition *network = builder->createNetworkV2(0U);
	ITensor *data = network->addInput(INPUT_NAME, dt, Dims3{3, global_params.input_h, global_params.input_w});
	assert(data);

	// Stem: conv -> batch norm -> swish.
	int out_channels = roundFilters(32, global_params);
	auto conv_stem = addSamePaddingConv2d(network, weightMap, *data, out_channels, 3, 2, 1, 1, image_size, "_conv_stem");
	auto bn0 = addBatchNorm2d(network, weightMap, *conv_stem->getOutput(0), "_bn0", bn_eps);
	auto swish0 = addSwish(network, *bn0->getOutput(0));
	ITensor *x = swish0->getOutput(0);
	image_size = calculateOutputImageSize(image_size, 2);

	// Stages: block_id counts every emitted block and is used as the weight-key suffix.
	int block_id = 0;
	for (size_t i = 0; i < block_args_list.size(); i++)
	{
		BlockArgs block_args = block_args_list[i];

		block_args.input_filters = roundFilters(block_args.input_filters, global_params);
		block_args.output_filters = roundFilters(block_args.output_filters, global_params);
		block_args.num_repeat = roundRepeats(block_args.num_repeat, global_params);
		x = MBConvBlock(network, weightMap, *x, "_blocks." + std::to_string(block_id), block_args, global_params, image_size);

		assert(x);
		block_id++;
		image_size = calculateOutputImageSize(image_size, block_args.stride);
		// Repeated blocks of a stage keep the channel count and use stride 1.
		if (block_args.num_repeat > 1)
		{
			block_args.input_filters = block_args.output_filters;
			block_args.stride = 1;
		}
		for (int r = 0; r < block_args.num_repeat - 1; r++)
		{
			x = MBConvBlock(network, weightMap, *x, "_blocks." + std::to_string(block_id), block_args, global_params, image_size);
			block_id++;
		}
	}

	// Head: conv -> batch norm -> swish -> average pool over the remaining image -> FC.
	out_channels = roundFilters(1280, global_params);
	auto conv_head = addSamePaddingConv2d(network, weightMap, *x, out_channels, 1, 1, 1, 1, image_size, "_conv_head", false);
	auto bn1 = addBatchNorm2d(network, weightMap, *conv_head->getOutput(0), "_bn1", bn_eps);
	auto swish1 = addSwish(network, *bn1->getOutput(0));
	auto avg_pool = network->addPoolingNd(*swish1->getOutput(0), PoolingType::kAVERAGE, image_size);

	IFullyConnectedLayer *final = network->addFullyConnected(*avg_pool->getOutput(0), global_params.num_classes, weightMap["_fc.weight"], weightMap["_fc.bias"]);
	assert(final);

	final->getOutput(0)->setName(OUTPUT_NAME);
	network->markOutput(*final->getOutput(0));

	// Build engine
	builder->setMaxBatchSize(maxBatchSize);
	config->setMaxWorkspaceSize(1 << 20);
#ifdef USE_FP16
	config->setFlag(BuilderFlag::kFP16);
#endif
	std::cout << "build engine ..." << std::endl;

	ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
	assert(engine != nullptr);

	std::cout << "build finished" << std::endl;
	// Don't need the network any more
	network->destroy();
	// Release host memory
	for (auto &mem : weightMap)
	{
		free((void *)(mem.second.values));
	}

	return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream, std::string wtsPath, std::vector<BlockArgs> block_args_list, GlobalParams global_params)
{
	// Create builder
	IBuilder *builder = createInferBuilder(gLogger);
	IBuilderConfig *config = builder->createBuilderConfig();

	// Create model to populate the network, then set the outputs and create an engine
	ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath, block_args_list, global_params);
	assert(engine != nullptr);

	// Serialize the engine
	(*modelStream) = engine->serialize();

	// Close everything down
	engine->destroy();
	builder->destroy();
	config->destroy();
}
// Runs one inference: copies `input` to the device, enqueues the engine for `batchSize`
// samples and copies the result into `output`.
// Device buffers and the CUDA stream are created and released on every call.
//   input  - host buffer of batchSize * 3 * input_h * input_w floats.
//   output - host buffer of batchSize * num_classes floats.
void doInference(IExecutionContext &context, float *input, float *output, int batchSize, GlobalParams global_params)
{
	const ICudaEngine &engine = context.getEngine();

	// Exactly one input and one output binding are expected.
	assert(engine.getNbBindings() == 2);
	void *bindings[2];

	// Binding slots are looked up by tensor name.
	const int inIdx = engine.getBindingIndex(INPUT_NAME);
	const int outIdx = engine.getBindingIndex(OUTPUT_NAME);

	const size_t inBytes = batchSize * 3 * global_params.input_h * global_params.input_w * sizeof(float);
	const size_t outBytes = batchSize * global_params.num_classes * sizeof(float);

	// Device buffers
	CHECK(cudaMalloc(&bindings[inIdx], inBytes));
	CHECK(cudaMalloc(&bindings[outIdx], outBytes));

	cudaStream_t stream;
	CHECK(cudaStreamCreate(&stream));

	// Host -> device, run, device -> host; all queued on the same stream.
	CHECK(cudaMemcpyAsync(bindings[inIdx], input, inBytes, cudaMemcpyHostToDevice, stream));
	context.enqueue(batchSize, bindings, stream, nullptr);
	CHECK(cudaMemcpyAsync(output, bindings[outIdx], outBytes, cudaMemcpyDeviceToHost, stream));
	cudaStreamSynchronize(stream);

	// Cleanup
	cudaStreamDestroy(stream);
	CHECK(cudaFree(bindings[inIdx]));
	CHECK(cudaFree(bindings[outIdx]));
}

// Parses the command line.
//   -s <wts> <engine> <backbone>   (argc == 5) fills wts, engine and backbone
//   -d <engine> <backbone>         (argc == 4) fills engine and backbone; wts is left untouched
// Returns true when one of the two forms matched, false otherwise (outputs are then unchanged).
bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, std::string &backbone)
{
	// Both forms need at least 4 arguments. Checking this first also keeps argv[1] from
	// being read when the program is started without arguments (argv[1] is then null).
	if (argc < 4 || argv == nullptr || argv[1] == nullptr)
	{
		return false;
	}

	const std::string mode(argv[1]);
	if (mode == "-s" && argc == 5)
	{
		wts = std::string(argv[2]);
		engine = std::string(argv[3]);
		backbone = std::string(argv[4]);
	}
	else if (mode == "-d" && argc == 4)
	{
		engine = std::string(argv[2]);
		backbone = std::string(argv[3]);
	}
	else
	{
		return false;
	}
	return true;
}

int main(int argc, char **argv)
{
	std::string wtsPath = "";
	std::string engine_name = "";
	std::string backbone = "";
	if (!parse_args(argc, argv, wtsPath, engine_name, backbone))
	{
		std::cerr << "arguments not right!" << std::endl;
		std::cerr << "./efficientnet -s [.wts] [.engine] [b0 b1 b2 b3 ... b7]  // serialize model to engine file" << std::endl;
		std::cerr << "./efficientnet -d [.engine] [b0 b1 b2 b3 ... b7]   // deserialize engine file and run inference" << std::endl;
		return -1;
	}
	GlobalParams global_params = global_params_map[backbone];
	// create a model using the API directly and serialize it to a stream
	if (!wtsPath.empty())
	{
		IHostMemory *modelStream{nullptr};
		APIToModel(MAX_BATCH_SIZE, &modelStream, wtsPath, block_args_list, global_params);
		assert(modelStream != nullptr);

		std::ofstream p(engine_name, std::ios::binary);
		if (!p)
		{
			std::cerr << "could not open plan output file" << std::endl;
			return -1;
		}
		p.write(reinterpret_cast<const char *>(modelStream->data()), modelStream->size());
		modelStream->destroy();
		return 1;
	}

	char *trtModelStream{nullptr};
	size_t size{0};

	std::ifstream file(engine_name, std::ios::binary);
	if (file.good())
	{
		file.seekg(0, file.end);
		size = file.tellg();
		file.seekg(0, file.beg);
		trtModelStream = new char[size];
		assert(trtModelStream);
		file.read(trtModelStream, size);
		file.close();
	}
	else
	{
		std::cerr << "could not open plan file" << std::endl;
		return -1;
	}

	// dummy input
	float *data = new float[3 * global_params.input_h * global_params.input_w];
	for (int i = 0; i < 3 * global_params.input_h * global_params.input_w; i++)
		data[i] = 0.1;

	IRuntime *runtime = createInferRuntime(gLogger);
	assert(runtime != nullptr);
	ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
	assert(engine != nullptr);
	IExecutionContext *context = engine->createExecutionContext();
	assert(context != nullptr);
	delete[] trtModelStream;

	// Run inference
	float *prob = new float[global_params.num_classes];
	for (int i = 0; i < 100; i++)
	{
		auto start = std::chrono::system_clock::now();
		doInference(*context, data, prob, 1, global_params);
		auto end = std::chrono::system_clock::now();
		std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
	}
	for (unsigned int i = 0; i < 20; i++)
	{
		std::cout << prob[i] << ", ";
	}
	std::cout << std::endl;
	// Destroy the engine
	context->destroy();
	engine->destroy();
	runtime->destroy();
	delete data;
	delete prob;

	return 0;
}


================================================
FILE: efficientnet/gen_wts.py
================================================
import torch
import struct
from efficientnet_pytorch import EfficientNet
# Export the pretrained EfficientNet-B3 weights to the plain-text `.wts` format:
#   line 1:          number of tensors
#   following lines: "<name> <count> " followed by " <hex>" per value, where <hex> is the
#                    big-endian IEEE-754 float32 encoding of the value.
model = EfficientNet.from_pretrained('efficientnet-b3')

model.eval()
# Build the state dict once instead of once for the count and once for the loop.
state_dict = model.state_dict()
# `with` closes (and flushes) the file even if serialization fails part-way through.
with open('efficientnet-b3.wts', 'w') as f:
    f.write('{}\n'.format(len(state_dict.keys())))
    for k, v in state_dict.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')


================================================
FILE: efficientnet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents, prefixed, to an output stream on sync.
//!
//! Text written through an std::ostream attached to this buffer is accumulated until the
//! stream is flushed (or the buffer is destroyed); it is then written to the target stream
//! preceded by the configured prefix, provided logging is enabled for this buffer.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the prefixed message
    //! \param prefix    text inserted before every flushed message
    //! \param shouldLog when false, flushed messages are discarded
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Copies the target stream, prefix and logging flag from \p other. All three must be
    //! carried over: otherwise the new buffer would log with an empty prefix and read an
    //! uninitialized mShouldLog. Pending buffered text of \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes a timestamp, the prefix and the buffered text, then clears the buffer.
    //! Does nothing (and keeps the buffered text) when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            // NOTE(review): the timestamp always goes to std::cout, even when mOutput is a
            // different stream (e.g. std::cerr); kept unchanged to preserve the log format.
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables forwarding of flushed messages.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // stream receiving the prefixed messages
    std::string mPrefix;   // text written before each message
    bool mShouldLog;       // when false, putOutput() is a no-op
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!  Deriving from this class first guarantees the buffer is constructed before the
//!  std::ostream base that is given its address.
//!
class LogStreamConsumerBase
{
protected:
    //! Buffer shared with the std::ostream part of the derived class.
    LogStreamConsumerBuffer mBuffer;

public:
    //! Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& target, const std::string& linePrefix, bool enabled)
        : mBuffer(target, linePrefix, enabled)
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //! \brief Creates a logger that reports messages of the given severity threshold (default: kWARNING).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! A null \p msg is logged as an empty message: constructing a std::string from a null pointer
    //! is undefined behaviour, so it is guarded against here.
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity)
            << "[TRT] " << std::string(msg != nullptr ? msg : "") << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher
    //!                 (i.e. whose Severity value compares <= this one, see LogStreamConsumer).
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Validate the precondition before emitting anything, so a double start is
        // caught without first printing a misleading RUNNING line.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as PASSED and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as FAILED and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as WAIVED and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports PASSED or FAILED depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the severity threshold currently used to filter messages.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <test name> # <command line>" on the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces, without any quoting or escaping.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief Builds a LogStreamConsumer for messages of severity kVERBOSE, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kVERBOSE);
}

//!
//! \brief Builds a LogStreamConsumer for messages of severity kINFO, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINFO);
}

//!
//! \brief Builds a LogStreamConsumer for messages of severity kWARNING, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kWARNING);
}

//!
//! \brief Builds a LogStreamConsumer for messages of severity kERROR, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kERROR);
}

//!
//! \brief Builds a LogStreamConsumer for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: efficientnet/utils.hpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <math.h>
#include <string>
#include <algorithm>
using namespace nvinfer1;

// Evaluates `status` exactly once; if the result is non-zero, prints
// "Cuda failure: <value>" to std::cerr and terminates the process via abort().
// Wrapped in do { ... } while (0) so that `CHECK(expr);` behaves as a single statement.
#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
        {                                                      \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)

// Load weights from a whitespace-delimited text file.
// The file starts with the number of blobs, followed by one record per blob:
//   [name] [size] <size 32-bit words in hex>
// Each word is stored verbatim and exposed as a DataType::kFLOAT value.
//
// Returns a map from blob name to Weights. The value buffers are allocated with
// malloc() and are not freed here; the caller owns them.
// Failures (file cannot be opened, malformed records, allocation failure) are
// reported through assert(), matching the rest of this header.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // A counted loop (rather than `while (count--)`) cannot spin on a negative count
    // when asserts are compiled out.
    for (int32_t blob = 0; blob < count; ++blob)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(input && "Malformed weight blob header.");

        // Load blob. The element size is sizeof(uint32_t); sizeof(val) would be the
        // size of the pointer and over-allocate the buffer.
        uint32_t *val = reinterpret_cast<uint32_t *>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x)
        {
            input >> std::hex >> val[x];
        }
        assert(input && "Malformed or truncated weight blob data.");
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Configuration of one MBConv block, read by MBConvBlock().
struct BlockArgs
{
    // Not read anywhere in this header; presumably the repeat count of the block within its stage — confirm against caller.
    int num_repeat;
    // Kernel size of the depthwise convolution.
    int kernel_size;
    // Stride of the depthwise convolution; the identity skip is only added when this is 1.
    int stride;
    // Channel expansion factor; the expansion conv is skipped when this is (approximately) 1.
    float expand_ratio;
    // Number of channels entering the block.
    int input_filters;
    // Number of channels produced by the projection convolution.
    int output_filters;
    // Squeeze-and-excitation ratio; the SE branch is built only when 0 < se_ratio <= 1.
    float se_ratio;
    // Enables the residual add (subject to stride == 1 and input_filters == output_filters).
    bool id_skip;
};

// Network-wide settings shared by the helper functions in this header.
struct GlobalParams
{
    // Not read in this header; presumably the network input height — confirm against caller.
    int input_h;
    // Not read in this header; presumably the network input width — confirm against caller.
    int input_w;
    // Not read in this header; presumably the classifier output size — confirm against caller.
    int num_classes;
    // Epsilon passed to addBatchNorm2d() by MBConvBlock().
    float batch_norm_epsilon;
    // Width multiplier applied to filter counts in roundFilters().
    float width_coefficient;
    // Depth multiplier applied to repeat counts in roundRepeats().
    float depth_coefficient;
    // Filter counts are rounded to a multiple of this value in roundFilters().
    int depth_divisor;
    // Lower bound for rounded filter counts; a negative value makes roundFilters() use depth_divisor instead.
    int min_depth;
};

// Scale a filter count by the width coefficient and round it to a multiple of
// depth_divisor, following the reference EfficientNet formula:
//
//   scaled      = filters * width_coefficient            (kept as a real number)
//   new_filters = max(min_depth, int(scaled + divisor / 2) / divisor * divisor)
//   if new_filters < 0.9 * scaled: new_filters += divisor
//
// A negative min_depth means "use depth_divisor as the minimum".
// The scaled value is deliberately NOT truncated to an integer before the
// rounding and the 10% check; truncating first can under-count channels in
// edge cases compared to the reference implementation.
int roundFilters(int filters, GlobalParams global_params)
{
    const float multiplier = global_params.width_coefficient;
    const int divisor = global_params.depth_divisor;
    int min_depth = global_params.min_depth;
    if (min_depth < 0)
    {
        min_depth = divisor;
    }

    const double scaled = double(filters) * multiplier;
    // Round to the nearest multiple of divisor (half of divisor added as a real number).
    int new_filters = std::max(min_depth, int(scaled + divisor / 2.0) / divisor * divisor);
    if (new_filters < 0.9 * scaled) // prevent rounding down by more than 10%
        new_filters += divisor;
    return new_filters;
}

// Spatial size after a strided operation: each dimension becomes ceil(dim / stride).
DimsHW calculateOutputImageSize(DimsHW image_size, int stride)
{
    const float step = float(stride);
    const int out_h = int(ceil(float(image_size.h()) / step));
    const int out_w = int(ceil(float(image_size.w()) / step));
    return DimsHW{out_h, out_w};
}

// Scale a block repeat count by the depth coefficient, rounding up:
// ceil(depth_coefficient * repeats), as in the official TensorFlow implementation.
int roundRepeats(int repeats, GlobalParams global_params)
{
    const float depth_scale = global_params.depth_coefficient;
    return int(ceil(depth_scale * repeats));
}

// Adds an inference-time BatchNorm as a per-channel scale layer:
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The parameters are looked up in weightMap under lname + ".weight", ".bias",
// ".running_mean" and ".running_var". The computed buffers are malloc()'ed and
// stored back into weightMap under lname + ".scale", ".shift" and ".power".
IScaleLayer *addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, std::string lname, float eps)
{
    float *gamma = (float *)weightMap[lname + ".weight"].values;
    float *beta = (float *)weightMap[lname + ".bias"].values;
    float *mean = (float *)weightMap[lname + ".running_mean"].values;
    Weights &var_wt = weightMap[lname + ".running_var"];
    float *var = (float *)var_wt.values;
    int len = var_wt.count;

    // One buffer per scale-layer parameter, all of length len.
    const size_t bytes = sizeof(float) * len;
    float *scval = reinterpret_cast<float *>(malloc(bytes));
    float *shval = reinterpret_cast<float *>(malloc(bytes));
    float *pval = reinterpret_cast<float *>(malloc(bytes));

    for (int c = 0; c < len; ++c)
    {
        scval[c] = gamma[c] / sqrt(var[c] + eps);
        shval[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        pval[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Keep the buffers reachable through the weight map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer *bn_layer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    return bn_layer;
}

// Adds a square-kernel 2D convolution with TensorFlow-style "SAME" padding.
//
// The output size per dimension is ceil(input / stride); the total padding needed
// to reach it is split so that the smaller half goes before (top/left) and the
// remainder after (bottom/right).
//
//   outch        number of output channels
//   kernel_size  kernel height and width
//   stride       stride in both dimensions
//   dilation     dilation in both dimensions
//   groups       number of convolution groups
//   image_size   spatial size of `input`, used to compute the padding
//   lname        weights are read from weightMap[lname + ".weight"] and, when
//                `bias` is true, weightMap[lname + ".bias"]
//
// Returns the new convolution layer. Creation failure is reported through
// assert(), consistent with addBatchNorm2d().
IConvolutionLayer *addSamePaddingConv2d(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, int outch, int kernel_size, int stride, int dilation, int groups, DimsHW image_size, std::string lname, bool bias = true)
{
    const int ih = image_size.h();
    const int iw = image_size.w();
    const int oh = ceil(float(ih) / float(stride));
    const int ow = ceil(float(iw) / float(stride));

    // Total padding per dimension, clamped at zero.
    const int pad_h = std::max((oh - 1) * stride + (kernel_size - 1) * dilation + 1 - ih, 0);
    const int pad_w = std::max((ow - 1) * stride + (kernel_size - 1) * dilation + 1 - iw, 0);

    // An odd total puts the extra pixel on the bottom/right side.
    const int pad_top = pad_h / 2;
    const int pad_bottom = pad_h - pad_top;
    const int pad_left = pad_w / 2;
    const int pad_right = pad_w - pad_left;

    Weights bias_wt{DataType::kFLOAT, nullptr, 0};
    if (bias)
    {
        bias_wt = weightMap[lname + ".bias"];
    }
    IConvolutionLayer *conv = network->addConvolutionNd(input, outch, DimsHW{kernel_size, kernel_size}, weightMap[lname + ".weight"], bias_wt);
    // Fail loudly instead of dereferencing a null layer below.
    assert(conv);
    conv->setPrePadding(DimsHW{pad_top, pad_left});
    conv->setPostPadding(DimsHW{pad_bottom, pad_right});
    conv->setStrideNd(DimsHW{stride, stride});
    conv->setDilationNd(DimsHW{dilation, dilation});
    conv->setNbGroups(groups);
    return conv;
}

// Adds the swish activation, swish(x) = x * sigmoid(x), built from a sigmoid
// activation layer followed by an element-wise product with the input.
// Returns the element-wise layer.
ILayer *addSwish(INetworkDefinition *network, ITensor &input)
{
    auto *gate = network->addActivation(input, ActivationType::kSIGMOID);
    return network->addElementWise(input, *gate->getOutput(0), ElementWiseOperation::kPROD);
}

// Builds one MBConv (mobile inverted bottleneck) block and returns its output tensor.
//
// Stages, in order:
//   1. expansion:   1x1 conv -> BN -> swish   (skipped when expand_ratio is ~1)
//   2. depthwise:   k x k conv with stride s, one group per channel -> BN -> swish
//   3. squeeze-and-excitation (only when 0 < se_ratio <= 1):
//        average pool over the whole feature map -> 1x1 reduce -> swish
//        -> 1x1 expand -> sigmoid, multiplied onto the depthwise output
//   4. projection:  1x1 conv -> BN
//   5. identity skip: added when id_skip is set, stride == 1 and
//        input_filters == output_filters
//
// Layer weights are looked up in weightMap using `lname` plus the suffixes
// "._expand_conv", "._bn0", "._depthwise_conv", "._bn1", "._se_reduce",
// "._se_expand", "._project_conv" and "._bn2".
// `image_size` is the spatial size of `input`; it is passed by value and updated
// locally after the strided depthwise convolution.
ITensor *MBConvBlock(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, std::string lname, BlockArgs block_args, GlobalParams global_params, DimsHW image_size)
{
    const bool has_se = block_args.se_ratio > 0 && block_args.se_ratio <= 1;
    const bool id_skip = block_args.id_skip;
    const float bn_eps = global_params.batch_norm_epsilon;
    const int input_filters = block_args.input_filters;
    const int output_filters = block_args.output_filters;
    // Channel count after the expansion phase.
    const int oup = int(block_args.input_filters * block_args.expand_ratio);
    ITensor *x = &input;

    // expand_ratio != 1
    if (fabs(block_args.expand_ratio - 1) > 1e-5)
    {
        auto expand_conv = addSamePaddingConv2d(network, weightMap, input, oup, 1, 1, 1, 1, image_size, lname + "._expand_conv");
        auto bn0 = addBatchNorm2d(network, weightMap, *expand_conv->getOutput(0), lname + "._bn0", bn_eps);
        auto swish0 = addSwish(network, *bn0->getOutput(0));
        x = swish0->getOutput(0);
    }

    // Depthwise convolution: groups == channels, no bias.
    const int k = block_args.kernel_size;
    const int s = block_args.stride;
    auto depthwise_conv = addSamePaddingConv2d(network, weightMap, *x, oup, k, s, 1, oup, image_size, lname + "._depthwise_conv", false);
    auto bn1 = addBatchNorm2d(network, weightMap, *depthwise_conv->getOutput(0), lname + "._bn1", bn_eps);
    auto swish1 = addSwish(network, *bn1->getOutput(0));
    x = swish1->getOutput(0);
    image_size = calculateOutputImageSize(image_size, s);

    if (has_se)
    {
        // Pooling window equal to the feature-map size.
        auto avg_pool = network->addPoolingNd(*x, PoolingType::kAVERAGE, image_size);
        // The squeezed width is derived from the block's input filters, not the expanded width.
        int num_squeezed_channels = std::max(1, int(input_filters * block_args.se_ratio));
        auto se_reduce = addSamePaddingConv2d(network, weightMap, *avg_pool->getOutput(0), num_squeezed_channels, 1, 1, 1, 1, DimsHW{1, 1}, lname + "._se_reduce");

        auto swish2 = addSwish(network, *se_reduce->getOutput(0));
        auto se_expand = addSamePaddingConv2d(network, weightMap, *swish2->getOutput(0), oup, 1, 1, 1, 1, DimsHW{1, 1}, lname + "._se_expand");

        auto *sigmoid = network->addActivation(*se_expand->getOutput(0), ActivationType::kSIGMOID);
        auto *ew = network->addElementWise(*x, *sigmoid->getOutput(0), ElementWiseOperation::kPROD);
        x = ew->getOutput(0);
    }

    // Linear projection back to the block's output width (no activation).
    auto project_conv = addSamePaddingConv2d(network, weightMap, *x, output_filters, 1, 1, 1, 1, image_size, lname + "._project_conv");

    auto bn2 = addBatchNorm2d(network, weightMap, *project_conv->getOutput(0), lname + "._bn2", bn_eps);
    x = bn2->getOutput(0);

    if (id_skip && block_args.stride == 1 && input_filters == output_filters)
    {
        auto *ew = network->addElementWise(input, *x, ElementWiseOperation::kSUM);
        x = ew->getOutput(0);
    }
    return x;
}


================================================
FILE: ghostnet/README.md
================================================
# GhostNet

GhostNetv1 architecture is from the paper "GhostNet: More Features from Cheap Operations" [(https://arxiv.org/abs/1911.11907)](https://arxiv.org/abs/1911.11907).
GhostNetv2 architecture is from the paper "GhostNetV2: Enhance Cheap Operation with Long-Range Attention" [(https://arxiv.org/abs/2211.12905)](https://arxiv.org/abs/2211.12905).

For the PyTorch implementations, you can refer to [huawei-noah/ghostnet](https://github.com/huawei-noah/ghostnet).

Both versions use the following techniques in their TensorRT implementations:

- **BatchNorm** layer is implemented by TensorRT's **Scale** layer.
- **Ghost Modules** are used to generate more features from cheap operations, as described in the paper.
- **Global Average Pooling** is implemented with TensorRT's `IReduceLayer` instead of `IPoolingLayer`. `IReduceLayer` performs reduction operations (such as sum, average, max) over specified dimensions without being constrained by the kernel size limitations of pooling layers.

## Project Structure

```plaintext
ghostnet
│
├── ghostnetv1
│   ├── CMakeLists.txt
│   ├── gen_wts.py
│   ├── ghostnetv1.cpp
│   └── logging.h
│
├── ghostnetv2
│   ├── CMakeLists.txt
│   ├── gen_wts.py
│   ├── ghostnetv2.cpp
│   └── logging.h
│
└── README.md
```

## Steps to use GhostNet in TensorRT

### 1. Generate `.wts` files for both GhostNetv1 and GhostNetv2

```bash
# For ghostnetv1 (the script writes its output to ghostnetv1.weights in the current directory)
python ghostnetv1/gen_wts.py

# For ghostnetv2
python ghostnetv2/gen_wts.py
```

### 2. Build the project

```bash
cd tensorrtx/ghostnet
mkdir build
cd build
cmake ..
make
```

### 3. Serialize the models to engine files

Use the following commands to serialize the PyTorch models into TensorRT engine files (`ghostnetv1.engine` and `ghostnetv2.engine`):

```bash
# For ghostnetv1
sudo ./ghostnetv1 -s

# For ghostnetv2
sudo ./ghostnetv2 -s
```

### 4. Run inference using the engine files

Once the engine files are generated, you can run inference with the following commands:

```bash
# For ghostnetv1
sudo ./ghostnetv1 -d

# For ghostnetv2
sudo ./ghostnetv2 -d
```

### 5. Verify output

Compare the output with the PyTorch implementation from [huawei-noah/ghostnet](https://github.com/huawei-noah/ghostnet) to ensure that the TensorRT results are consistent with the PyTorch model.


================================================
FILE: ghostnet/ghostnetv1/CMakeLists.txt
================================================
# Build script for the ghostnetv1 executable (single source file, linked against nvinfer and cudart).
# NOTE(review): recent CMake releases have dropped compatibility with very old
# minimum versions such as 2.6 — confirm against the CMake version in use.
cmake_minimum_required(VERSION 2.6)

project(ghostnetv1)

# Adds -std=c++11 to the compiler command line (in addition to CMAKE_CXX_STANDARD below).
add_definitions(-std=c++11)

# NOTE(review): option() takes (<variable> "<help text>" [value]); as written, "OFF"
# sits in the help-text position — confirm this is the intended spelling.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is forced to Debug here, overriding any value given on the command line.
set(CMAKE_BUILD_TYPE Debug)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# The executable is built from ghostnetv1.cpp only.
add_executable(ghostnetv1 ${PROJECT_SOURCE_DIR}/ghostnetv1.cpp)
target_link_libraries(ghostnetv1 nvinfer)
target_link_libraries(ghostnetv1 cudart)

# Adds -O2 and -pthread to the compiler command line.
add_definitions(-O2 -pthread)


================================================
FILE: ghostnet/ghostnetv1/gen_wts.py
================================================
"""
Creates a GhostNet Model as defined in:
GhostNet: More Features from Cheap Operations By Kai Han, Yunhe Wang, Qi Tian, Jianyuan Guo, Chunjing Xu, Chang Xu.
https://arxiv.org/abs/1911.11907
Modified from https://github.com/d-li14/mobilenetv3.pytorch and https://github.com/rwightman/pytorch-image-models
"""
import torch
import torch.nn as nn
import torch.onnx
import struct
import torch
import torch.nn.functional as F
import math


def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


def hard_sigmoid(x, inplace: bool = False):
    """Piecewise-linear sigmoid: relu6(x + 3) / 6.

    With ``inplace=True`` the computation overwrites ``x`` and returns it;
    otherwise a new tensor is returned and ``x`` is left untouched.
    """
    if not inplace:
        return F.relu6(x + 3.) / 6.
    return x.add_(3.).clamp_(0., 6.).div_(6.)


class SqueezeExcite(nn.Module):
    """Squeeze-and-excitation: rescales the channels of ``x`` by a learned gate.

    The gate is computed from the globally average-pooled input through a
    1x1 reduce conv, an activation, and a 1x1 expand conv, then passed
    through ``gate_fn``. The reduced width is
    ``_make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor)``.
    """

    def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None,
                 act_layer=nn.ReLU, gate_fn=hard_sigmoid, divisor=4, **_):
        super().__init__()
        self.gate_fn = gate_fn
        base_chs = reduced_base_chs or in_chs
        squeezed_chs = _make_divisible(base_chs * se_ratio, divisor)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv_reduce = nn.Conv2d(in_chs, squeezed_chs, 1, bias=True)
        self.act1 = act_layer(inplace=True)
        self.conv_expand = nn.Conv2d(squeezed_chs, in_chs, 1, bias=True)

    def forward(self, x):
        gate = self.avg_pool(x)
        gate = self.conv_expand(self.act1(self.conv_reduce(gate)))
        return x * self.gate_fn(gate)


class ConvBnAct(nn.Module):
    """Bias-free convolution (padding ``kernel_size // 2``) followed by BatchNorm and an activation."""

    def __init__(self, in_chs, out_chs, kernel_size,
                 stride=1, act_layer=nn.ReLU):
        super().__init__()
        half_kernel = kernel_size // 2
        self.conv = nn.Conv2d(in_chs, out_chs, kernel_size,
                              stride=stride, padding=half_kernel, bias=False)
        self.bn1 = nn.BatchNorm2d(out_chs)
        self.act1 = act_layer(inplace=True)

    def forward(self, x):
        return self.act1(self.bn1(self.conv(x)))


class GhostModule(nn.Module):
    """Ghost module: a primary convolution plus a cheap depthwise "ghost" branch.

    ``ceil(oup / ratio)`` channels come from the primary conv; the cheap
    operation derives ``init_channels * (ratio - 1)`` more from them. Both are
    concatenated along the channel axis and cut down to exactly ``oup`` channels.
    """

    def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True):
        super().__init__()
        self.oup = oup
        init_channels = math.ceil(oup / ratio)
        new_channels = init_channels * (ratio - 1)

        def make_act():
            # An empty Sequential acts as the identity when no ReLU is requested.
            return nn.ReLU(inplace=True) if relu else nn.Sequential()

        self.primary_conv = nn.Sequential(
            nn.Conv2d(inp, init_channels, kernel_size, stride=stride,
                      padding=kernel_size // 2, bias=False),
            nn.BatchNorm2d(init_channels),
            make_act(),
        )

        self.cheap_operation = nn.Sequential(
            nn.Conv2d(init_channels, new_channels, dw_size, stride=1,
                      padding=dw_size // 2, groups=init_channels, bias=False),
            nn.BatchNorm2d(new_channels),
            make_act(),
        )

    def forward(self, x):
        primary = self.primary_conv(x)
        ghost = self.cheap_operation(primary)
        stacked = torch.cat([primary, ghost], dim=1)
        return stacked[:, :self.oup, :, :]


class GhostBottleneck(nn.Module):
    """Ghost bottleneck with optional depthwise downsampling and optional SE.

    Layout: ghost module (expand) -> [strided depthwise conv + BN when stride > 1]
    -> [SqueezeExcite when se_ratio > 0] -> ghost module (project, no ReLU),
    added to a shortcut branch. The shortcut is the identity when the channel
    count and resolution are unchanged, otherwise a depthwise conv + BN followed
    by a 1x1 conv + BN.
    """

    def __init__(self, in_chs, mid_chs, out_chs, dw_kernel_size=3,
                 stride=1, act_layer=nn.ReLU, se_ratio=0.):
        super().__init__()
        use_se = se_ratio is not None and se_ratio > 0.
        dw_padding = (dw_kernel_size - 1) // 2
        self.stride = stride

        # Point-wise expansion
        self.ghost1 = GhostModule(in_chs, mid_chs, relu=True)

        # Depth-wise convolution (only present when downsampling)
        if self.stride > 1:
            self.conv_dw = nn.Conv2d(mid_chs, mid_chs, dw_kernel_size, stride=stride,
                                     padding=dw_padding, groups=mid_chs, bias=False)
            self.bn_dw = nn.BatchNorm2d(mid_chs)

        # Squeeze-and-excitation
        self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio) if use_se else None

        # Point-wise linear projection
        self.ghost2 = GhostModule(mid_chs, out_chs, relu=False)

        # Shortcut branch
        keeps_shape = in_chs == out_chs and self.stride == 1
        if keeps_shape:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_chs, in_chs, dw_kernel_size, stride=stride,
                          padding=dw_padding, groups=in_chs, bias=False),
                nn.BatchNorm2d(in_chs),
                nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_chs),
            )

    def forward(self, x):
        shortcut_input = x

        # 1st ghost module
        out = self.ghost1(x)

        # Depth-wise convolution
        if self.stride > 1:
            out = self.bn_dw(self.conv_dw(out))

        # Squeeze-and-excitation
        if self.se is not None:
            out = self.se(out)

        # 2nd ghost module
        out = self.ghost2(out)

        # In-place residual add, as in the original implementation
        out += self.shortcut(shortcut_input)
        return out


class GhostNet(nn.Module):
    """GhostNet classifier.

    ``cfgs`` is a list of stages; each stage is a list of rows
    ``[kernel, expansion size, output channels, se_ratio, stride]``, one per
    GhostBottleneck. ``width`` scales all channel counts (rounded to a multiple
    of 4), and ``dropout`` is the drop probability applied before the classifier.
    """

    def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2):
        super().__init__()
        # setting of inverted residual blocks
        self.cfgs = cfgs
        self.dropout = dropout

        # Stem: 3x3 stride-2 convolution
        stem_chs = _make_divisible(16 * width, 4)
        self.conv_stem = nn.Conv2d(3, stem_chs, 3, 2, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(stem_chs)
        self.act1 = nn.ReLU(inplace=True)
        in_chs = stem_chs

        # Ghost bottleneck stages
        stages = []
        for cfg in self.cfgs:
            bottlenecks = []
            for k, exp_size, c, se_ratio, s in cfg:
                out_chs = _make_divisible(c * width, 4)
                mid_chs = _make_divisible(exp_size * width, 4)
                bottlenecks.append(
                    GhostBottleneck(in_chs, mid_chs, out_chs, k, s, se_ratio=se_ratio))
                in_chs = out_chs
            stages.append(nn.Sequential(*bottlenecks))

        # Final 1x1 ConvBnAct; its width comes from the expansion size of the last config row
        last_chs = _make_divisible(exp_size * width, 4)
        stages.append(nn.Sequential(ConvBnAct(in_chs, last_chs, 1)))
        in_chs = last_chs

        self.blocks = nn.Sequential(*stages)

        # Head: global pool -> 1x1 conv -> ReLU -> linear classifier
        head_chs = 1280
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.conv_head = nn.Conv2d(in_chs, head_chs, 1, 1, 0, bias=True)
        self.act2 = nn.ReLU(inplace=True)
        self.classifier = nn.Linear(head_chs, num_classes)

    def forward(self, x):
        x = self.act1(self.bn1(self.conv_stem(x)))
        x = self.blocks(x)
        x = self.act2(self.conv_head(self.global_pool(x)))
        x = x.view(x.size(0), -1)
        if self.dropout > 0.:
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.classifier(x)


def ghostnet(**kwargs):
    """
    Build a GhostNet with the standard stage configuration.

    Keyword arguments are forwarded unchanged to ``GhostNet``.
    """
    # Each row: kernel size (k), expansion size (t), output channels (c),
    # squeeze-excite ratio (SE), stride (s)
    stage1 = [[[3, 16, 16, 0, 1]]]
    stage2 = [[[3, 48, 24, 0, 2]],
              [[3, 72, 24, 0, 1]]]
    stage3 = [[[5, 72, 40, 0.25, 2]],
              [[5, 120, 40, 0.25, 1]]]
    stage4 = [[[3, 240, 80, 0, 2]],
              [[3, 200, 80, 0, 1],
               [3, 184, 80, 0, 1],
               [3, 184, 80, 0, 1],
               [3, 480, 112, 0.25, 1],
               [3, 672, 112, 0.25, 1]]]
    stage5 = [[[5, 672, 160, 0.25, 2]],
              [[5, 960, 160, 0, 1],
               [5, 960, 160, 0.25, 1],
               [5, 960, 160, 0, 1],
               [5, 960, 160, 0.25, 1]]]
    cfgs = stage1 + stage2 + stage3 + stage4 + stage5
    return GhostNet(cfgs, **kwargs)


def setup_seed(seed):
    """Seed the torch CPU and CUDA generators and turn on cuDNN deterministic mode."""
    torch.backends.cudnn.deterministic = True
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)


# Function to export weights in the specified format
def export_weight(model, path="ghostnetv1.weights"):
    """
    Dump ``model.state_dict()`` to a text weight file.

    File layout:
      * first line: number of entries in the state dict
      * one line per entry: ``<name> <count> <hex> <hex> ...`` where every
        value is converted to float and written as the hex of its big-endian
        32-bit IEEE-754 encoding.

    Args:
        model: object exposing ``state_dict()`` whose values are torch tensors.
        path: destination file; defaults to the name used by the original
            script so existing callers are unaffected.
    """
    # Build the state dict once instead of once for the header and once for the loop.
    state = model.state_dict()

    # The context manager guarantees the file is closed even if a tensor
    # conversion raises half-way through the export.
    with open(path, 'w') as f:
        f.write("{}\n".format(len(state)))

        # Convert weights to hexadecimal format
        for k, v in state.items():
            print('exporting ... {}: {}'.format(k, v.shape))

            # Reshape the weights to 1D
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {}".format(k, len(vr)))
            f.write("".join(" " + struct.pack(">f", float(vv)).hex() for vv in vr))
            f.write("\n")


# Function to evaluate the model (optional)
def eval_model(input, model):
    """Call ``model`` on ``input`` and print a banner, the input and the result."""
    prediction = model(input)
    for item in ("------from inference------", input, prediction):
        print(item)


if __name__ == "__main__":
    # Fix the seed so the randomly initialised weights are the same on every run.
    setup_seed(1)

    net = ghostnet(num_classes=1000, width=1.0, dropout=0.2)
    net.eval()

    # Constant dummy batch filled with 10.0.
    # NOTE(review): the shape here is (32, 3, 320, 256) while ghostnetv1.cpp builds its
    # input as Dims4{batchSize, 3, INPUT_H = 256, INPUT_W = 320} - confirm which
    # height/width order is intended before comparing the two outputs.
    dummy_batch = torch.full((32, 3, 320, 256), 10.0)

    # Write the weight file first, then print the reference output.
    export_weight(net)
    eval_model(dummy_batch, net)


================================================
FILE: ghostnet/ghostnetv1/ghostnetv1.cpp
================================================
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"

using namespace std;

// Evaluates `status` exactly once. A non-zero result is reported on std::cerr as
// "Cuda failure: <value>" and the process is terminated with abort().
// The do { ... } while (0) wrapper makes the macro behave as a single statement.
#define CHECK(status)                                          \
    do {                                                       \
        auto ret = (status);                                   \
        if (ret != 0) {                                        \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)

// stuff we know about the network and the input/output blobs
// INPUT_H / INPUT_W are the spatial dimensions used for the network input in createEngine()
// and for sizing the host/device input buffers.
// NOTE(review): the weight-export script feeds a (32, 3, 320, 256) tensor, i.e. H=320, W=256,
// which is the transpose of the values below - confirm which order is intended.
static const int INPUT_H = 256;
static const int INPUT_W = 320;
// Number of floats produced per batch item (used to size the output buffers).
static const int OUTPUT_SIZE = 1000;
// Batch dimension baked into the explicit-batch network input.
static const int batchSize = 32;

// Tensor names used to mark the network input/output and to look up the engine bindings.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
using namespace nvinfer1;

// Logger instance handed to the TensorRT builder and runtime factories.
static Logger gLogger;

// Load weights from files shared with TensorRT samples.
// TensorRT weight files have a simple space delimited format:
// [type] [size] <data x size in hex>
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    if (!input.is_open()) {
        std::cerr << "Unable to load weight file." << std::endl;
        exit(EXIT_FAILURE);
    }

    // Read number of weight blobs
    int32_t count;
    input >> count;
    if (count <= 0) {
        std::cerr << "Invalid weight map file." << std::endl;
        exit(EXIT_FAILURE);
    }

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Rounds `v` to the nearest multiple of `divisor`, never going below `min_value`
// (which defaults to `divisor` when passed as -1). If rounding lowered the value
// by more than 10%, one extra `divisor` is added.
int _make_divisible(int v, int divisor, int min_value = -1) {
    if (min_value == -1) {
        min_value = divisor;
    }

    int new_v = std::max(min_value, (v + divisor / 2) / divisor * divisor);

    // Compare in floating point: truncating 0.9 * v to int first would turn e.g.
    // 4 < 4.5 into 4 < 4 and skip the correction (v = 5, divisor = 4 must give 8).
    if (new_v < 0.9 * v) {
        new_v += divisor;
    }

    return new_v;
}

// Adds a hard-sigmoid activation on `input` and returns the new layer.
//
// TensorRT evaluates kHARD_SIGMOID as max(0, min(1, alpha * x + beta)); alpha and beta
// are layer parameters that have to be supplied explicitly. alpha = 1/6 and beta = 0.5
// give relu6(x + 3) / 6.
// NOTE(review): relu6(x + 3) / 6 is the gate of the reference GhostNet implementation;
// the Python definition is not visible from this file - confirm it matches.
ILayer* hardSigmoid(INetworkDefinition* network, ITensor& input) {

    IActivationLayer* scale_layer = network->addActivation(input, ActivationType::kHARD_SIGMOID);
    assert(scale_layer);
    scale_layer->setAlpha(1.0f / 6.0f);
    scale_layer->setBeta(0.5f);

    return scale_layer;
}

// Adds a per-channel scale layer that applies the affine transform of a batch-norm
// whose statistics are stored under `lname`:
//     scale = gamma / sqrt(var + eps)
//     shift = beta - mean * gamma / sqrt(var + eps)
//     power = 1
//
// Reads `<lname>.weight`, `.bias`, `.running_mean` and `.running_var` from `weightMap`.
// The three derived buffers are malloc()'d and stored back into `weightMap` under
// `<lname>.scale`, `.shift` and `.power` so that whoever frees the map frees them too.
//
// Exits with EXIT_FAILURE if one of the four tensors is missing, if their element
// counts disagree, or if an allocation fails; a missing key would otherwise be
// silently inserted by operator[] and dereferenced as a null pointer.
IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                            std::string lname, float eps) {
    static const char* const kSuffixes[] = {".weight", ".bias", ".running_mean", ".running_var"};
    const float* params[4] = {nullptr, nullptr, nullptr, nullptr};
    int len = 0;

    for (int k = 0; k < 4; ++k) {
        auto it = weightMap.find(lname + kSuffixes[k]);
        if (it == weightMap.end() || it->second.values == nullptr) {
            std::cerr << "Missing batch-norm weights: " << lname << kSuffixes[k] << std::endl;
            exit(EXIT_FAILURE);
        }
        const int n = static_cast<int>(it->second.count);
        if (k == 0) {
            len = n;
        } else if (n != len) {
            std::cerr << "Batch-norm weight size mismatch: " << lname << kSuffixes[k] << std::endl;
            exit(EXIT_FAILURE);
        }
        params[k] = reinterpret_cast<const float*>(it->second.values);
    }
    const float* gamma = params[0];
    const float* beta = params[1];
    const float* mean = params[2];
    const float* var = params[3];

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    if (len > 0 && (scval == nullptr || shval == nullptr || pval == nullptr)) {
        std::cerr << "Out of memory while folding batch-norm " << lname << std::endl;
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < len; i++) {
        // The square root is evaluated once per channel; the expressions themselves are
        // unchanged so the folded values stay bit-identical.
        const float denom = std::sqrt(var[i] + eps);
        scval[i] = gamma[i] / denom;
        shval[i] = beta[i] - mean[i] * gamma[i] / denom;
        pval[i] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

// Builds the network stem: a bias-free 3x3 convolution with stride 2 and padding 1
// (kernel read from `<lname>.weight`), followed by the batch-norm stored under "bn1"
// and a ReLU. Returns the ReLU layer.
IActivationLayer* convBnReluStem(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                                 int outch, std::string lname) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* stemConv =
            network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + ".weight"], noBias);
    assert(stemConv);
    stemConv->setStrideNd(DimsHW{2, 2});
    stemConv->setPaddingNd(DimsHW{1, 1});

    ITensor* convOut = stemConv->getOutput(0);
    IScaleLayer* stemBn = addBatchNorm2d(network, weightMap, *convOut, "bn1", 1e-5);

    ITensor* bnOut = stemBn->getOutput(0);
    IActivationLayer* stemRelu = network->addActivation(*bnOut, ActivationType::kRELU);
    assert(stemRelu);

    return stemRelu;
}

// Adds a bias-free 1x1 convolution (stride 1, kernel from `<lname>.conv.weight`),
// the batch-norm stored under `<lname>.bn1`, and an activation of type `actType`
// (ReLU unless the caller says otherwise). Returns the activation layer.
ILayer* convBnAct(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                  int out_channels, std::string lname, ActivationType actType = ActivationType::kRELU) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};
    const std::string kernelKey = lname + ".conv.weight";

    IConvolutionLayer* pointwise =
            network->addConvolutionNd(input, out_channels, DimsHW{1, 1}, weightMap[kernelKey], noBias);
    assert(pointwise);
    pointwise->setStrideNd(DimsHW{1, 1});

    IScaleLayer* norm = addBatchNorm2d(network, weightMap, *pointwise->getOutput(0), lname + ".bn1", 1e-5);

    IActivationLayer* activation = network->addActivation(*norm->getOutput(0), actType);
    assert(activation);

    return activation;
}

// Squeeze-and-excite gate.
//
// The input is averaged over axes 2 and 3 (dimensions kept), squeezed to
// _make_divisible(int(in_chs * se_ratio), 4) channels by a biased 1x1 convolution,
// passed through ReLU, expanded back to `in_chs` channels by a second biased 1x1
// convolution and gated with hardSigmoid(). The returned layer is the element-wise
// product of `input` and that gate. `eps` is accepted but not used.
ILayer* squeezeExcite(INetworkDefinition* network, ITensor& input, std::map<std::string, Weights>& weightMap,
                      int in_chs, float se_ratio = 0.25, std::string lname = "", float eps = 1e-5) {
    // Bit mask selecting axes 2 and 3 for the reduction.
    const uint32_t spatialAxes = (1U << 2) | (1U << 3);

    IReduceLayer* pooled = network->addReduce(input, ReduceOperation::kAVG, spatialAxes, true);
    assert(pooled);

    // Squeeze
    const int squeezedChannels = _make_divisible(static_cast<int>(in_chs * se_ratio), 4);
    IConvolutionLayer* squeeze =
            network->addConvolutionNd(*pooled->getOutput(0), squeezedChannels, DimsHW{1, 1},
                                      weightMap[lname + ".conv_reduce.weight"], weightMap[lname + ".conv_reduce.bias"]);
    assert(squeeze);

    IActivationLayer* squeezeRelu = network->addActivation(*squeeze->getOutput(0), ActivationType::kRELU);
    assert(squeezeRelu);

    // Excite
    IConvolutionLayer* excite =
            network->addConvolutionNd(*squeezeRelu->getOutput(0), in_chs, DimsHW{1, 1},
                                      weightMap[lname + ".conv_expand.weight"], weightMap[lname + ".conv_expand.bias"]);
    assert(excite);
    std::cout << "SE conv_expand -> " << printTensorShape(excite->getOutput(0)) << std::endl;

    // Gate
    ILayer* gate = hardSigmoid(network, *excite->getOutput(0));
    std::cout << "hard_sigmoid conv_expand -> " << printTensorShape(gate->getOutput(0)) << std::endl;

    // Re-weight the input with the gate
    IElementWiseLayer* gated = network->addElementWise(input, *gate->getOutput(0), ElementWiseOperation::kPROD);
    assert(gated);

    return gated;
}

// Ghost module: a primary convolution produces ceil(oup / ratio) channels, a cheap
// depthwise convolution derives (ratio - 1) times as many extra channels from them,
// both are concatenated and the first `oup` channels are kept.
//
//   x1  = [ReLU](BN(primary_conv(input)))
//   x2  = [ReLU](BN(cheap_conv(x1)))
//   out = concat(x1, x2)[:, :oup]
//
// The ReLUs are only added when `relu` is true. Weights are read from
// `<lname>.primary_conv.{0,1}` and `<lname>.cheap_operation.{0,1}`. `inp` is unused.
// Returns the slice layer.
//
// NOTE(review): feeding the cheap convolution with the activated x1 follows the
// reference GhostModule (x2 = cheap_operation(x1)); the Python module is not visible
// from this file - confirm against it.
ILayer* ghostModule(INetworkDefinition* network, ITensor& input, std::map<std::string, Weights>& weightMap, int inp,
                    int oup, int kernel_size = 1, int ratio = 2, int dw_size = 3, int stride = 1, bool relu = true,
                    std::string lname = "") {
    // Integer ceiling: `oup / ratio` on two ints truncates before std::ceil can round up.
    const int init_channels = (oup + ratio - 1) / ratio;
    const int new_channels = init_channels * (ratio - 1);

    // Primary convolution
    IConvolutionLayer* primary_conv = network->addConvolutionNd(input, init_channels, DimsHW{kernel_size, kernel_size},
                                                                weightMap[lname + ".primary_conv.0.weight"], Weights{});
    assert(primary_conv);
    primary_conv->setStrideNd(DimsHW{stride, stride});
    primary_conv->setPaddingNd(DimsHW{kernel_size / 2, kernel_size / 2});
    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *primary_conv->getOutput(0), lname + ".primary_conv.1", 1e-5);

    ITensor* x1 = bn1->getOutput(0);
    if (relu) {
        IActivationLayer* relu1 = network->addActivation(*x1, ActivationType::kRELU);
        assert(relu1);
        x1 = relu1->getOutput(0);
    }

    // Cheap operation (Depthwise Convolution) on the finished primary output
    IConvolutionLayer* cheap_conv = network->addConvolutionNd(*x1, new_channels, DimsHW{dw_size, dw_size},
                                                              weightMap[lname + ".cheap_operation.0.weight"], Weights{});
    assert(cheap_conv);
    cheap_conv->setStrideNd(DimsHW{1, 1});
    cheap_conv->setPaddingNd(DimsHW{dw_size / 2, dw_size / 2});
    cheap_conv->setNbGroups(init_channels);
    IScaleLayer* bn2 =
            addBatchNorm2d(network, weightMap, *cheap_conv->getOutput(0), lname + ".cheap_operation.1", 1e-5);

    ITensor* x2 = bn2->getOutput(0);
    if (relu) {
        // Only create the activation when it is used, so no dangling layer is left in the network.
        IActivationLayer* relu2 = network->addActivation(*x2, ActivationType::kRELU);
        assert(relu2);
        x2 = relu2->getOutput(0);
    }

    ITensor* inputs[] = {x1, x2};
    IConcatenationLayer* concat = network->addConcatenation(inputs, 2);
    assert(concat);
    std::cout << printTensorShape(concat->getOutput(0)) << std::endl;

    // Slice the output to keep only the first `oup` channels
    Dims catDims = concat->getOutput(0)->getDimensions();
    Dims start{4, {0, 0, 0, 0}};                                    // Start at the origin of every axis
    Dims size{4, {catDims.d[0], oup, catDims.d[2], catDims.d[3]}};  // Full extent except `oup` on axis 1
    Dims stride_{4, {1, 1, 1, 1}};                                  // Stride is 1 for all dimensions

    ISliceLayer* slice = network->addSlice(*concat->getOutput(0), start, size, stride_);
    assert(slice);
    cout << "slice" << printTensorShape(slice->getOutput(0)) << endl;

    return slice;
}

// Ghost bottleneck.
//
// Main path: ghostModule (with ReLU) -> optional strided depthwise conv + BN when
// stride > 1 -> optional squeezeExcite when se_ratio > 0 -> ghostModule (no ReLU).
// Shortcut: an identity layer when in_chs == out_chs and stride == 1, otherwise a
// depthwise conv + BN followed by a 1x1 conv + BN.
// Returns the element-wise sum of the main path and the shortcut.
ILayer* ghostBottleneck(INetworkDefinition* network, ITensor& input, std::map<std::string, Weights>& weightMap,
                        int in_chs, int mid_chs, int out_chs, int dw_kernel_size = 3, int stride = 1,
                        float se_ratio = 0.0f, std::string lname = "") {
    const DimsHW dwKernel{dw_kernel_size, dw_kernel_size};
    const DimsHW dwStride{stride, stride};
    const DimsHW dwPadding{(dw_kernel_size - 1) / 2, (dw_kernel_size - 1) / 2};

    // Expansion
    ILayer* mainPath = ghostModule(network, input, weightMap, in_chs, mid_chs, 1, 2, 3, 1, true, lname + ".ghost1");

    // Spatial reduction, only for strided blocks
    if (stride > 1) {
        IConvolutionLayer* conv_dw = network->addConvolutionNd(*mainPath->getOutput(0), mid_chs, dwKernel,
                                                               weightMap[lname + ".conv_dw.weight"], Weights{});
        conv_dw->setStrideNd(dwStride);
        conv_dw->setPaddingNd(dwPadding);
        conv_dw->setNbGroups(mid_chs);  // one group per channel
        mainPath = addBatchNorm2d(network, weightMap, *conv_dw->getOutput(0), lname + ".bn_dw", 1e-5);
    }

    // Channel gating, only when requested
    if (se_ratio > 0.0f) {
        mainPath = squeezeExcite(network, *mainPath->getOutput(0), weightMap, mid_chs, se_ratio, lname + ".se");
    }

    // Projection
    ILayer* ghost2 = ghostModule(network, *mainPath->getOutput(0), weightMap, mid_chs, out_chs, 1, 2, 3, 1, false,
                                 lname + ".ghost2");

    // Shortcut branch
    ILayer* shortcut = nullptr;
    const bool shapePreserved = (in_chs == out_chs) && (stride == 1);
    if (shapePreserved) {
        shortcut = network->addIdentity(input);
    } else {
        IConvolutionLayer* sc_dw =
                network->addConvolutionNd(input, in_chs, dwKernel, weightMap[lname + ".shortcut.0.weight"], Weights{});
        sc_dw->setStrideNd(dwStride);
        sc_dw->setPaddingNd(dwPadding);
        sc_dw->setNbGroups(in_chs);  // one group per channel
        IScaleLayer* sc_dw_bn = addBatchNorm2d(network, weightMap, *sc_dw->getOutput(0), lname + ".shortcut.1", 1e-5);

        IConvolutionLayer* sc_pw = network->addConvolutionNd(*sc_dw_bn->getOutput(0), out_chs, DimsHW{1, 1},
                                                             weightMap[lname + ".shortcut.2.weight"], Weights{});
        shortcut = addBatchNorm2d(network, weightMap, *sc_pw->getOutput(0), lname + ".shortcut.3", 1e-5);
    }

    return network->addElementWise(*ghost2->getOutput(0), *shortcut->getOutput(0), ElementWiseOperation::kSUM);
}

// Builds the GhostNet v1 network with the TensorRT layer API and compiles it.
//
// The network is created in explicit-batch mode with a fixed input of
// {batchSize, 3, INPUT_H, INPUT_W} named INPUT_BLOB_NAME, weights are loaded from
// "../ghostnetv1.weights", and the classifier output is marked as OUTPUT_BLOB_NAME.
// After the build the network definition is destroyed and every host buffer held by
// the weight map is freed. Returns the engine produced by the builder (may be null
// if the build failed; the caller checks).
ICudaEngine* createEngine(IBuilder* builder, IBuilderConfig* config, DataType dt) {

    INetworkDefinition* network =
            builder->createNetworkV2(1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    assert(network);

    // Create input tensor of shape {batchSize, 3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{batchSize, 3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../ghostnetv1.weights");

    // Conv Stem
    IActivationLayer* conv_stem = convBnReluStem(network, weightMap, *data, 16, "conv_stem");

    // One row per bottleneck, in execution order.
    struct BottleneckCfg {
        int in_chs;
        int mid_chs;
        int out_chs;
        int kernel;
        int stride;
        float se_ratio;
        const char* name;
    };
    static const BottleneckCfg kBottlenecks[] = {
            {16, 16, 16, 3, 1, 0.0f, "blocks.0.0"},      {16, 48, 24, 3, 2, 0.0f, "blocks.1.0"},
            {24, 72, 24, 3, 1, 0.0f, "blocks.2.0"},      {24, 72, 40, 5, 2, 0.25f, "blocks.3.0"},
            {40, 120, 40, 5, 1, 0.25f, "blocks.4.0"},    {40, 240, 80, 3, 2, 0.0f, "blocks.5.0"},
            {80, 200, 80, 3, 1, 0.0f, "blocks.6.0"},     {80, 184, 80, 3, 1, 0.0f, "blocks.6.1"},
            {80, 184, 80, 3, 1, 0.0f, "blocks.6.2"},     {80, 480, 112, 3, 1, 0.25f, "blocks.6.3"},
            {112, 672, 112, 3, 1, 0.25f, "blocks.6.4"},  {112, 672, 160, 5, 2, 0.25f, "blocks.7.0"},
            {160, 960, 160, 5, 1, 0.0f, "blocks.8.0"},   {160, 960, 160, 5, 1, 0.25f, "blocks.8.1"},
            {160, 960, 160, 5, 1, 0.0f, "blocks.8.2"},   {160, 960, 160, 5, 1, 0.25f, "blocks.8.3"},
    };

    ILayer* current_layer = conv_stem;
    for (const BottleneckCfg& cfg : kBottlenecks) {
        current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, cfg.in_chs, cfg.mid_chs,
                                        cfg.out_chs, cfg.kernel, cfg.stride, cfg.se_ratio, cfg.name);
        assert(current_layer);
    }

    // Apply ConvBnAct
    current_layer = convBnAct(network, weightMap, *current_layer->getOutput(0), 960, "blocks.9.0");
    // Global Average Pooling over axes 2 and 3
    IReduceLayer* global_pool =
            network->addReduce(*current_layer->getOutput(0), ReduceOperation::kAVG, 1 << 2 | 1 << 3, true);
    assert(global_pool);

    // Conv Head
    IConvolutionLayer* conv_head = network->addConvolutionNd(
            *global_pool->getOutput(0), 1280, DimsHW{1, 1}, weightMap["conv_head.weight"], weightMap["conv_head.bias"]);
    assert(conv_head);
    IActivationLayer* act2 = network->addActivation(*conv_head->getOutput(0), ActivationType::kRELU);
    assert(act2);

    // Fully Connected Layer (Classifier); OUTPUT_SIZE keeps the layer in step with the
    // output buffers allocated in doInference() and main().
    IFullyConnectedLayer* classifier = network->addFullyConnected(
            *act2->getOutput(0), OUTPUT_SIZE, weightMap["classifier.weight"], weightMap["classifier.bias"]);
    assert(classifier);
    classifier->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*classifier->getOutput(0));

    // Build engine
    config->setMaxWorkspaceSize(1 << 24);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return engine;
}

// Builds the engine through createEngine() with DataType::kFLOAT and stores its
// serialized form in *modelStream. The builder, its config and the engine are
// destroyed before returning; the serialized blob stays with the caller.
void APIToModel(IHostMemory** modelStream) {
    IBuilder* trtBuilder = createInferBuilder(gLogger);
    IBuilderConfig* buildConfig = trtBuilder->createBuilderConfig();

    // Populate the network, mark its output and compile it.
    ICudaEngine* builtEngine = createEngine(trtBuilder, buildConfig, DataType::kFLOAT);
    assert(builtEngine != nullptr);

    // Hand the serialized plan to the caller.
    *modelStream = builtEngine->serialize();

    // Tear down in the same order as before: engine, config, builder.
    builtEngine->destroy();
    buildConfig->destroy();
    trtBuilder->destroy();
}

// Runs one synchronous inference.
//
// `input` must hold batchSize * 3 * INPUT_H * INPUT_W floats and `output` must have
// room for batchSize * OUTPUT_SIZE floats (both host memory). Device buffers and a
// stream are created for the call and released before returning.
//
// The process is aborted if the engine does not expose exactly the two expected
// bindings, if any CUDA call fails, or if the enqueue is rejected - otherwise the
// caller would print whatever happened to be in `output`.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // The indices are used to address a two-element array, so validate them first.
    if (engine.getNbBindings() != 2 || inputIndex < 0 || inputIndex > 1 || outputIndex < 0 || outputIndex > 1 ||
        inputIndex == outputIndex) {
        std::cerr << "Engine bindings do not match the expected input/output blobs" << std::endl;
        abort();
    }

    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Pointers to input and output device buffers to pass to engine.
    void* buffers[2] = {nullptr, nullptr};

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//   ./ghostnetv1 -s   build the engine and write it to "ghostnetv1.engine"
//   ./ghostnetv1 -d   load "ghostnetv1.engine", run one batch filled with 10.0 and print the output
// Returns 0 on success and -1 on a usage error or when the plan file cannot be
// written or read.
int main(int argc, char** argv) {
    const auto printUsage = []() {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./ghostnetv1 -s   // serialize model to plan file" << std::endl;
        std::cerr << "./ghostnetv1 -d   // deserialize plan file and run inference" << std::endl;
    };

    if (argc != 2) {
        printUsage();
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char* trtModelStream{nullptr};
    size_t size{0};
    const std::string mode(argv[1]);

    if (mode == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(&modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("ghostnetv1.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (mode == "-d") {
        std::ifstream file("ghostnetv1.engine", std::ios::binary);
        if (!file.good()) {
            // Without this check a missing plan would reach deserializeCudaEngine() as (nullptr, 0).
            std::cerr << "could not open plan file ghostnetv1.engine, run ./ghostnetv1 -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        if (!file) {
            std::cerr << "could not read plan file ghostnetv1.engine" << std::endl;
            delete[] trtModelStream;
            return -1;
        }
        file.close();
    } else {
        // An unknown flag used to exit silently; tell the user what is accepted.
        printUsage();
        return -1;
    }

    // Constant test input, one value per element of the batch
    const int inputCount = batchSize * 3 * INPUT_H * INPUT_W;
    float* data = new float[inputCount];
    for (int i = 0; i < inputCount; i++)
        data[i] = 10.0;

    float* prob = new float[batchSize * OUTPUT_SIZE];

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    doInference(*context, data, prob, batchSize);

    std::cout << "\nOutput:\n\n";
    for (int i = 0; i < batchSize; i++) {
        std::cout << "Batch " << i << ":\n";
        for (int j = 0; j < OUTPUT_SIZE; j++) {
            std::cout << prob[i * OUTPUT_SIZE + j] << ", ";
            if (j % 10 == 0)
                std::cout << j / 10 << std::endl;
        }
        std::cout << "\n";
    }

    context->destroy();
    engine->destroy();
    runtime->destroy();
    delete[] data;
    delete[] prob;

    return 0;
}


================================================
FILE: ghostnet/ghostnetv1/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents, prefixed with a timestamp and a
//!  fixed prefix, to an output stream whenever it is synchronized or destroyed.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    Stream that receives the formatted messages.
    //! \param prefix    Text inserted between the timestamp and the message.
    //! \param shouldLog When false, putOutput() emits nothing.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Copies the target stream, the prefix and the logging flag from \p other.
    //! The prefix and the flag were previously left uninitialized, so a moved-into
    //! buffer logged without its prefix and with an indeterminate enable flag.
    //! Buffered characters are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync() {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer>" to the target stream, empties
    //! the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // Format the timestamp in a scratch stream so that the fill/width manipulators
            // do not leak into the target stream, and so that the timestamp lands on the
            // same stream as the message instead of always going to std::cout.
            std::ostringstream stamp;
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                stamp << "[";
                stamp << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                stamp << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // destination of the formatted messages
    std::string mPrefix;    // inserted after the timestamp
    bool mShouldLog;        // gate checked by putOutput()
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer so that it is constructed before the
//!  std::ostream base of LogStreamConsumer receives its address.
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments unchanged to the contained buffer.
    LogStreamConsumerBase(std::ostream& sink, const std::string& tag, bool enabled) : mBuffer(sink, tag, enabled) {}

   protected:
    LogStreamConsumerBuffer mBuffer;  //!< Buffer later attached to the std::ostream base of the derived class.
};

//!
//! \class LogStreamConsumer
//! \brief Stream-style front end for logging a message at a given severity.
//!  The base-class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  the buffer owned by LogStreamConsumerBase has to exist before its address is handed to
//!  std::ostream, otherwise std::ostream would be given an uninitialized buffer.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  Output is enabled when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the already-constructed buffer
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a fresh consumer with the severity and enable flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the already-constructed buffer
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates the enable flag against a new threshold and pushes it to the buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! Severities below kINFO go to std::cerr, kINFO and above go to std::cout.
    static std::ostream& severityOstream(Severity severity) {
        if (severity < Severity::kINFO) {
            return std::cerr;
        }
        return std::cout;
    }

    //! Short tag placed in front of every message of the given severity.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //!
    //! \brief Construct a logger that emits messages of the given severity or higher
    //!
    //! \param severity The initial reportable severity (defaults to kWARNING)
    //!
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity The severity of the message
    //! \param msg The message text; a null pointer is logged as an empty message
    //!
    void log(Severity severity, const char* msg) noexcept override {
        // Building a std::string from a null pointer is undefined behaviour, so substitute an empty message.
        const char* text = (msg != nullptr) ? msg : "";
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(text) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;         //!< Set once reportTestStart() has been called for this atom
        std::string mName;     //!< Test name printed in result lines
        std::string mCmdline;  //!< Command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // Validate the precondition before emitting anything, so a misuse does not print a bogus RUNNING line.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS, so the call can be used directly as a process exit code
    //!
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE, so the call can be used directly as a process exit code
    //!
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS (a waived test is not treated as a failure)
    //!
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Get the severity threshold currently used to filter messages
    //!
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits a line of the form "&&&& <RESULT> <name> # <cmdline>" on the stream selected for kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  //!< Messages below this severity are suppressed
};

namespace {

//!
//! \brief Build a LogStreamConsumer that logs at severity kVERBOSE, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Build a LogStreamConsumer that logs at severity kINFO, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Build a LogStreamConsumer that logs at severity kWARNING, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Build a LogStreamConsumer that logs at severity kERROR, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Build a LogStreamConsumer that logs at severity kINTERNAL_ERROR ("fatal" severity), filtered by the
//!        logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: ghostnet/ghostnetv2/CMakeLists.txt
================================================
# Compatibility with CMake < 3.5 has been removed from current CMake releases, and the
# target_* commands / imported Threads target used below need a 3.x baseline anyway.
cmake_minimum_required(VERSION 3.5)

project(ghostnetv2)

# option() takes (<variable> "<help text>" [value]); without the help text the literal
# OFF would be parsed as the description instead of the default value.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)

# Request strict C++11 (-std=c++11) through CMake instead of passing the flag via add_definitions().
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Default to a Debug build, but let -DCMAKE_BUILD_TYPE=<type> on the command line win.
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Debug)
endif()

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# Resolve the platform thread library; prefer -pthread so it reaches both compile and link lines.
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

add_executable(ghostnetv2 ${PROJECT_SOURCE_DIR}/ghostnetv2.cpp)
# Compiler options belong in target_compile_options(); add_definitions() is for preprocessor definitions.
target_compile_options(ghostnetv2 PRIVATE -O2)
target_link_libraries(ghostnetv2 nvinfer)
target_link_libraries(ghostnetv2 cudart)
target_link_libraries(ghostnetv2 Threads::Threads)


================================================
FILE: ghostnet/ghostnetv2/gen_wts.py
================================================
import torch
import torch.nn as nn
import torch.onnx
import struct

import torch
import torch.nn.functional as F
import math

from timm.models.registry import register_model


def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


def hard_sigmoid(x, inplace: bool = False):
    """Piecewise-linear sigmoid: ``relu6(x + 3) / 6``.

    Args:
        x: input tensor.
        inplace: when True, ``x`` itself is overwritten and returned;
            otherwise a new tensor is returned and ``x`` is left untouched.
    """
    if not inplace:
        return F.relu6(x + 3.) / 6.
    # In-place variant: shift, clamp to [0, 6], then rescale, all on x's own storage.
    x.add_(3.)
    x.clamp_(0., 6.)
    x.div_(6.)
    return x


class SqueezeExcite(nn.Module):
    """Squeeze-and-excitation gate: pool -> 1x1 reduce -> activation -> 1x1 expand -> gate.

    Args:
        in_chs: number of channels of the tensor being gated.
        se_ratio: fraction of the base channel count used for the reduced width.
        reduced_base_chs: base channel count for the reduction; ``in_chs`` is used when falsy.
        act_layer: activation class placed between the two 1x1 convolutions.
        gate_fn: callable applied to the expanded tensor to produce the gate.
        divisor: the reduced width is made divisible by this value.
    """

    def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None,
                 act_layer=nn.ReLU, gate_fn=hard_sigmoid, divisor=4, **_):
        super().__init__()
        self.gate_fn = gate_fn
        base_chs = reduced_base_chs or in_chs
        reduced_chs = _make_divisible(base_chs * se_ratio, divisor)
        # Submodules are created in the same order as before so state_dict keys keep their order.
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
        self.act1 = act_layer(inplace=True)
        self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)

    def forward(self, x):
        """Return ``x`` scaled channel-wise by the gate computed from its pooled statistics."""
        squeezed = self.conv_reduce(self.avg_pool(x))
        excited = self.conv_expand(self.act1(squeezed))
        return x * self.gate_fn(excited)


class ConvBnAct(nn.Module):
    """Bias-free convolution followed by batch norm and an activation.

    Args:
        in_chs: input channels.
        out_chs: output channels.
        kernel_size: square kernel size; padding is ``kernel_size // 2``.
        stride: convolution stride.
        act_layer: activation class, constructed with ``inplace=True``.
    """

    def __init__(self, in_chs, out_chs, kernel_size,
                 stride=1, act_layer=nn.ReLU):
        super().__init__()
        half_kernel = kernel_size // 2
        self.conv = nn.Conv2d(in_chs, out_chs, kernel_size, stride=stride, padding=half_kernel, bias=False)
        self.bn1 = nn.BatchNorm2d(out_chs)
        self.act1 = act_layer(inplace=True)

    def forward(self, x):
        """Apply conv -> bn -> activation."""
        return self.act1(self.bn1(self.conv(x)))


class GhostModuleV2(nn.Module):
    """Ghost module with an optional attention branch.

    ``mode='original'`` builds the primary convolution plus the cheap depth-wise
    operation; ``mode='attn'`` additionally builds ``short_conv``, whose sigmoid
    output gates the ghost features. Any other ``mode`` builds no layers and
    ``forward`` returns None.

    Args:
        inp: input channels.
        oup: output channels (the concatenated features are sliced to this width).
        kernel_size: kernel size of the primary (and first short-branch) convolution.
        ratio: ``ceil(oup / ratio)`` channels come from the primary convolution.
        dw_size: kernel size of the cheap depth-wise convolution.
        stride: stride of the primary convolution.
        relu: whether a ReLU follows each batch norm of the two main branches.
        mode: ``'original'`` or ``'attn'``.
        args: not used by this class.
    """

    def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True, mode=None, args=None):
        super().__init__()
        self.mode = mode
        self.gate_fn = nn.Sigmoid()

        if self.mode not in ('original', 'attn'):
            return

        def make_act():
            # A fresh module per call: ReLU when requested, otherwise an empty pass-through.
            return nn.ReLU(inplace=True) if relu else nn.Sequential()

        self.oup = oup
        init_channels = math.ceil(oup / ratio)
        new_channels = init_channels * (ratio - 1)
        pad = kernel_size // 2

        # Both modes share the primary convolution and the cheap depth-wise operation.
        self.primary_conv = nn.Sequential(
            nn.Conv2d(inp, init_channels, kernel_size, stride, pad, bias=False),
            nn.BatchNorm2d(init_channels),
            make_act(),
        )
        self.cheap_operation = nn.Sequential(
            nn.Conv2d(init_channels, new_channels, dw_size, 1, dw_size // 2, groups=init_channels, bias=False),
            nn.BatchNorm2d(new_channels),
            make_act(),
        )

        if self.mode in ('attn',):
            # Attention branch: 1x1-style conv followed by horizontal and vertical depth-wise convs.
            self.short_conv = nn.Sequential(
                nn.Conv2d(inp, oup, kernel_size, stride, pad, bias=False),
                nn.BatchNorm2d(oup),
                nn.Conv2d(oup, oup, kernel_size=(1, 5), stride=1, padding=(0, 2), groups=oup, bias=False),
                nn.BatchNorm2d(oup),
                nn.Conv2d(oup, oup, kernel_size=(5, 1), stride=1, padding=(2, 0), groups=oup, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        """Compute the ghost features, gated by the attention branch in ``'attn'`` mode."""
        if self.mode in ('original',):
            primary = self.primary_conv(x)
            ghost = torch.cat([primary, self.cheap_operation(primary)], dim=1)
            return ghost[:, :self.oup, :, :]
        if self.mode in ('attn',):
            # The attention map is computed at half resolution and upsampled back.
            attn = self.short_conv(F.avg_pool2d(x, kernel_size=2, stride=2))
            primary = self.primary_conv(x)
            ghost = torch.cat([primary, self.cheap_operation(primary)], dim=1)
            gate = F.interpolate(self.gate_fn(attn),
                                 size=(ghost.shape[-2], ghost.shape[-1]), mode='nearest')
            return ghost[:, :self.oup, :, :] * gate
        return None


class GhostBottleneckV2(nn.Module):
    """Ghost bottleneck: expand (ghost1) -> optional strided depth-wise conv -> optional SE
    -> project (ghost2), added to a shortcut of the input.

    Args:
        in_chs: input channels.
        mid_chs: expanded (hidden) channels.
        out_chs: output channels.
        dw_kernel_size: kernel size of the depth-wise convolutions.
        stride: stride of the block; the depth-wise conv is only built when it is > 1.
        act_layer: accepted for signature compatibility; not used in this class.
        se_ratio: squeeze-excite ratio; SE is built only when it is > 0.
        layer_id: index of the block in the network; blocks 0 and 1 use the
            ``'original'`` ghost module, later blocks use ``'attn'``.
        args: forwarded to the ghost modules.
    """

    def __init__(self, in_chs, mid_chs, out_chs, dw_kernel_size=3,
                 stride=1, act_layer=nn.ReLU, se_ratio=0., layer_id=None, args=None):
        super().__init__()
        use_se = se_ratio is not None and se_ratio > 0.
        self.stride = stride
        dw_pad = (dw_kernel_size - 1) // 2

        # Point-wise expansion: the first two blocks skip the attention branch.
        expand_mode = 'original' if layer_id <= 1 else 'attn'
        self.ghost1 = GhostModuleV2(in_chs, mid_chs, relu=True, mode=expand_mode, args=args)

        # Depth-wise convolution, only needed when the block downsamples.
        if self.stride > 1:
            self.conv_dw = nn.Conv2d(mid_chs, mid_chs, dw_kernel_size, stride=stride,
                                     padding=dw_pad, groups=mid_chs, bias=False)
            self.bn_dw = nn.BatchNorm2d(mid_chs)

        # Squeeze-and-excitation
        self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio) if use_se else None

        # Point-wise projection (no ReLU)
        self.ghost2 = GhostModuleV2(mid_chs, out_chs, relu=False, mode='original', args=args)

        # Shortcut: identity when shapes match, otherwise depth-wise + point-wise convs.
        identity_ok = in_chs == out_chs and self.stride == 1
        if identity_ok:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_chs, in_chs, dw_kernel_size, stride=stride,
                          padding=dw_pad, groups=in_chs, bias=False),
                nn.BatchNorm2d(in_chs),
                nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_chs),
            )

    def forward(self, x):
        """Run the bottleneck and add the shortcut of the original input."""
        residual = x
        x = self.ghost1(x)
        if self.stride > 1:
            x = self.bn_dw(self.conv_dw(x))
        if self.se is not None:
            x = self.se(x)
        x = self.ghost2(x)
        x += self.shortcut(residual)
        return x


class GhostNetV2(nn.Module):
    """GhostNetV2 classifier: stem conv -> stages of bottleneck blocks -> 1x1 ConvBnAct
    -> global pool -> 1x1 conv head -> linear classifier.

    Args:
        cfgs: list of stages; each stage is a list of
            ``[kernel, expansion size, out channels, se_ratio, stride]`` rows.
        num_classes: size of the classifier output.
        width: multiplier applied to every configured channel count.
        dropout: dropout probability applied before the classifier (skipped when <= 0).
        block: bottleneck class used for every configured row.
        args: forwarded to each block.
    """

    def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2, block=GhostBottleneckV2, args=None):
        super().__init__()
        self.cfgs = cfgs
        self.dropout = dropout

        # Stem
        stem_chs = _make_divisible(16 * width, 4)
        self.conv_stem = nn.Conv2d(3, stem_chs, 3, 2, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(stem_chs)
        self.act1 = nn.ReLU(inplace=True)

        # Bottleneck stages; layer_id counts blocks across all stages.
        stages = []
        prev_chs = stem_chs
        layer_id = 0
        for stage_cfg in self.cfgs:
            stage_blocks = []
            for k, exp_size, c, se_ratio, s in stage_cfg:
                out_chs = _make_divisible(c * width, 4)
                mid_chs = _make_divisible(exp_size * width, 4)
                stage_blocks.append(block(prev_chs, mid_chs, out_chs, k, s,
                                          se_ratio=se_ratio, layer_id=layer_id, args=args))
                prev_chs = out_chs
                layer_id += 1
            stages.append(nn.Sequential(*stage_blocks))

        # Final 1x1 expansion uses the expansion size of the last configured row.
        tail_chs = _make_divisible(exp_size * width, 4)
        stages.append(nn.Sequential(ConvBnAct(prev_chs, tail_chs, 1)))
        self.blocks = nn.Sequential(*stages)

        # Head
        head_chs = 1280
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.conv_head = nn.Conv2d(tail_chs, head_chs, 1, 1, 0, bias=True)
        self.act2 = nn.ReLU(inplace=True)
        self.classifier = nn.Linear(head_chs, num_classes)

    def forward(self, x):
        """Return the classifier output for a batch of images."""
        x = self.act1(self.bn1(self.conv_stem(x)))
        x = self.blocks(x)
        x = self.act2(self.conv_head(self.global_pool(x)))
        x = x.view(x.size(0), -1)
        if self.dropout > 0.:
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.classifier(x)


@register_model
def ghostnetv2(**kwargs):
    """Build a GhostNetV2 with the stage layout defined below.

    Required keyword arguments: ``num_classes``, ``width``, ``dropout`` and ``args``
    (a missing one raises KeyError).
    """
    num_classes = kwargs['num_classes']
    width = kwargs['width']
    dropout = kwargs['dropout']
    args = kwargs['args']

    # Each row: kernel size, expansion size, output channels, SE ratio, stride.
    stage_layout = [
        [[3,  16,  16, 0, 1]],
        [[3,  48,  24, 0, 2]],
        [[3,  72,  24, 0, 1]],
        [[5,  72,  40, 0.25, 2]],
        [[5, 120,  40, 0.25, 1]],
        [[3, 240,  80, 0, 2]],
        [[3, 200,  80, 0, 1],
         [3, 184,  80, 0, 1],
         [3, 184,  80, 0, 1],
         [3, 480, 112, 0.25, 1],
         [3, 672, 112, 0.25, 1]],
        [[5, 672, 160, 0.25, 2]],
        [[5, 960, 160, 0, 1],
         [5, 960, 160, 0.25, 1],
         [5, 960, 160, 0, 1],
         [5, 960, 160, 0.25, 1]]
    ]
    return GhostNetV2(stage_layout, num_classes=num_classes, width=width, dropout=dropout, args=args)


def setup_seed(seed):
    """Seed torch's CPU and CUDA generators with ``seed`` and set the cuDNN deterministic flag."""
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True


# Function to export weights in the specified format
def export_weight(model):
    """Write the model's state dict to ``ghostnetv2.weights`` in the current directory.

    File layout (text):
        line 1:      number of tensors
        other lines: ``<name> <element count> <hex> <hex> ...`` where each ``<hex>`` is
                     the big-endian IEEE-754 float32 encoding of one element.

    Args:
        model: object exposing ``state_dict()`` that maps names to tensors.
    """
    # Build the state dict once; it is used for both the header and the body.
    state = model.state_dict()

    # The context manager guarantees the file is flushed and closed even if a tensor fails to export.
    with open("ghostnetv2.weights", 'w') as f:
        f.write("{}\n".format(len(state)))

        for k, v in state.items():
            print('exporting ... {}: {}'.format(k, v.shape))

            # Flatten to 1D on the CPU so elements can be written sequentially.
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {}".format(k, len(vr)))
            for vv in vr:
                f.write(" ")
                f.write(struct.pack(">f", float(vv)).hex())
            f.write("\n")


# Function to evaluate the model (optional)
def eval_model(input, model):
    """Call ``model`` on ``input`` and print a banner, the input and the output, one per line."""
    output = model(input)
    print("------from inference------", input, output, sep="\n")


if __name__ == "__main__":
    # Fix the seeds so the randomly initialised weights are reproducible between runs.
    setup_seed(1)

    # Build the network in inference mode.
    net = ghostnetv2(width=1.0, num_classes=1000, dropout=0.2, args=None)
    net.eval()

    # Constant dummy batch laid out as (batch, channels, 320, 256).
    # NOTE(review): ghostnetv2.cpp declares INPUT_H = 256 and INPUT_W = 320, i.e. the
    # opposite H/W order - confirm which layout is intended before comparing outputs.
    dummy_batch = torch.full((32, 3, 320, 256), 10.0)

    # Dump the weights, then print a reference forward pass.
    export_weight(net)
    eval_model(dummy_batch, net)


================================================
FILE: ghostnet/ghostnetv2/ghostnetv2.cpp
================================================
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"

using namespace std;

// Evaluates `status` exactly once; if the result is non-zero, prints
// "Cuda failure: <value>" to std::cerr and terminates the process with abort().
// The do { ... } while (0) wrapper makes the macro behave as a single statement.
// Note: the macro declares a local named `ret`, so `status` must not refer to a
// caller variable of that name.
#define CHECK(status)                                          \
    do {                                                       \
        auto ret = (status);                                   \
        if (ret != 0) {                                        \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)

// Input/output parameters of the network.
// NOTE(review): presumably the height/width of the input image, the number of output
// classes and the batch size used to build/run the engine - their uses are outside
// this part of the file, confirm there.
// NOTE(review): gen_wts.py builds its dummy input as (32, 3, 320, 256), which is the
// opposite H/W order from INPUT_H/INPUT_W below - confirm which one is intended.
static const int INPUT_H = 256;
static const int INPUT_W = 320;
static const int OUTPUT_SIZE = 1000;
static const int batchSize = 32;

// Names of the input and output tensors (presumably used when marking/binding them - confirm).
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
using namespace nvinfer1;

// File-wide logger; default-constructed, so it reports messages of severity kWARNING and above.
static Logger gLogger;

// Load weight file
// Load a text weight file into a name -> Weights map.
//
// Expected layout (as written by gen_wts.py): the first token is the number of
// tensors, then for each tensor "<name> <count> <hex> <hex> ...", where every
// <hex> is the 32-bit pattern of one float value.
//
// The buffers referenced by the returned Weights are allocated with malloc() and
// are not released by this function. Any I/O, format or allocation problem prints
// a message to std::cerr and terminates with exit(EXIT_FAILURE).
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open the weight file
    std::ifstream input(file);
    if (!input.is_open()) {
        std::cerr << "Unable to load weight file." << std::endl;
        exit(EXIT_FAILURE);
    }

    // Read the number of weights
    int32_t count = 0;
    input >> count;
    if (!input || count <= 0) {
        std::cerr << "Invalid weight map file." << std::endl;
        exit(EXIT_FAILURE);
    }

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read the name and size; a failed extraction means the file is truncated or malformed.
        std::string name;
        input >> name >> std::dec >> size;
        if (!input) {
            std::cerr << "Invalid weight map file." << std::endl;
            exit(EXIT_FAILURE);
        }
        wt.type = DataType::kFLOAT;

        // Load weight data (raw 32-bit patterns, later reinterpreted as floats)
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        if (val == nullptr && size != 0) {
            std::cerr << "Unable to allocate memory for weights." << std::endl;
            exit(EXIT_FAILURE);
        }
        for (uint32_t x = 0; x < size; ++x) {
            input >> std::hex >> val[x];
        }
        // Without this check a short line would silently leave uninitialised values in the buffer.
        if (!input) {
            std::cerr << "Invalid weight map file." << std::endl;
            exit(EXIT_FAILURE);
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Round v to the nearest multiple of divisor, never going below min_value.
//
// Mirrors _make_divisible() in gen_wts.py for integer v:
//   - min_value == -1 means "use divisor as the minimum";
//   - if the rounded value is more than 10% smaller than v, one extra divisor is added.
int _make_divisible(int v, int divisor, int min_value = -1) {
    // If min_value is not specified, set it to divisor
    if (min_value == -1) {
        min_value = divisor;
    }

    // Calculate new channel size to be divisible by divisor
    int new_v = std::max(min_value, (v + divisor / 2) / divisor * divisor);

    // Ensure rounding down does not reduce by more than 10%.
    // The comparison is done in floating point: truncating 0.9 * v to int first would
    // lower the threshold (e.g. v = 5 -> 4 instead of 4.5) and miss the correction.
    if (new_v < 0.9 * v) {
        new_v += divisor;
    }

    return new_v;
}

// Add a hard-sigmoid activation equivalent to hard_sigmoid() in gen_wts.py:
//   relu6(x + 3) / 6  ==  clamp(x / 6 + 0.5, 0, 1)
// TensorRT's kHARD_SIGMOID computes max(0, min(1, alpha * x + beta)), so alpha and
// beta must be set explicitly to match the PyTorch definition.
ILayer* hardSigmoid(INetworkDefinition* network, ITensor& input) {
    IActivationLayer* hsig = network->addActivation(input, ActivationType::kHARD_SIGMOID);
    assert(hsig);
    hsig->setAlpha(1.0f / 6.0f);
    hsig->setBeta(0.5f);

    // Return the activation layer; callers read its output with getOutput(0)
    return hsig;
}

// Fold an inference-time batch norm into a per-channel scale layer:
//   y = x * gamma / sqrt(var + eps) + (beta - mean * gamma / sqrt(var + eps))
// The computed scale/shift/power arrays are malloc'ed and also stored in weightMap
// under "<lname>.scale", "<lname>.shift" and "<lname>.power".
IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                            std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    const int channels = weightMap[lname + ".running_var"].count;

    float* scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float* shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float* powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));

    // One pass fills all three per-channel arrays.
    for (int c = 0; c < channels; ++c) {
        const auto stddev = sqrt(var[c] + eps);
        scaleVals[c] = gamma[c] / stddev;
        shiftVals[c] = beta[c] - mean[c] * gamma[c] / stddev;
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, channels};
    Weights shift{DataType::kFLOAT, shiftVals, channels};
    Weights power{DataType::kFLOAT, powerVals, channels};

    // Keep the buffers reachable through the map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Stem of the network: 3x3 convolution (stride 2, padding 1, no bias) -> batch norm -> ReLU.
// The convolution weights are read from "<lname>.weight"; the batch norm always uses the "bn1" entries.
IActivationLayer* convBnReluStem(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                                 int outch, std::string lname) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};

    // Convolution
    IConvolutionLayer* stemConv =
            network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + ".weight"], noBias);
    assert(stemConv);
    stemConv->setStrideNd(DimsHW{2, 2});
    stemConv->setPaddingNd(DimsHW{1, 1});

    // Batch normalization
    IScaleLayer* stemBn = addBatchNorm2d(network, weightMap, *stemConv->getOutput(0), "bn1", 1e-5);

    // ReLU
    IActivationLayer* stemRelu = network->addActivation(*stemBn->getOutput(0), ActivationType::kRELU);
    assert(stemRelu);
    return stemRelu;
}

// 1x1 convolution (stride 1, no bias) -> batch norm -> activation.
// Weights are read from "<lname>.conv.weight" and the "<lname>.bn1" batch-norm entries;
// actType selects the activation and defaults to ReLU.
ILayer* convBnAct(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                  int out_channels, std::string lname, ActivationType actType = ActivationType::kRELU) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};

    // Point-wise convolution
    IConvolutionLayer* pointwise =
            network->addConvolutionNd(input, out_channels, DimsHW{1, 1}, weightMap[lname + ".conv.weight"], noBias);
    assert(pointwise);
    pointwise->setStrideNd(DimsHW{1, 1});

    // Batch normalization
    IScaleLayer* norm = addBatchNorm2d(network, weightMap, *pointwise->getOutput(0), lname + ".bn1", 1e-5);

    // Activation
    IActivationLayer* activation = network->addActivation(*norm->getOutput(0), actType);
    assert(activation);
    return activation;
}

// Squeeze-and-excitation block:
//   average over axes 2 and 3 -> 1x1 reduce conv -> ReLU -> 1x1 expand conv -> hard sigmoid,
// then multiply the original input element-wise by the resulting gate.
// Weights are read from "<lname>.conv_reduce.*" and "<lname>.conv_expand.*".
// The eps parameter is accepted but not used here.
ILayer* squeezeExcite(INetworkDefinition* network, ITensor& input, std::map<std::string, Weights>& weightMap,
                      int in_chs, float se_ratio = 0.25, std::string lname = "", float eps = 1e-5) {
    // Squeeze: reduce axes 2 and 3 (bit mask), keeping the reduced dimensions.
    const uint32_t spatialAxes = (1U << 2) | (1U << 3);
    IReduceLayer* squeeze = network->addReduce(input, ReduceOperation::kAVG, spatialAxes, true);
    assert(squeeze);

    // Reduce: 1x1 convolution down to the bottleneck width.
    const int reduced_chs = _make_divisible(static_cast<int>(in_chs * se_ratio), 4);
    const std::string reduceKey = lname + ".conv_reduce";
    IConvolutionLayer* reduceConv = network->addConvolutionNd(
            *squeeze->getOutput(0), reduced_chs, DimsHW{1, 1}, weightMap[reduceKey + ".weight"],
            weightMap[reduceKey + ".bias"]);
    assert(reduceConv);

    IActivationLayer* reduceRelu = network->addActivation(*reduceConv->getOutput(0), ActivationType::kRELU);
    assert(reduceRelu);

    // Expand: 1x1 convolution back to the input width.
    const std::string expandKey = lname + ".conv_expand";
    IConvolutionLayer* expandConv = network->addConvolutionNd(
            *reduceRelu->getOutput(0), in_chs, DimsHW{1, 1}, weightMap[expandKey + ".weight"],
            weightMap[expandKey + ".bias"]);
    assert(expandConv);

    // Gate and apply.
    ILayer* gate = hardSigmoid(network, *expandConv->getOutput(0));
    IElementWiseLayer* gated = network->addElementWise(input, *gate->getOutput(0), ElementWiseOperation::kPROD);
    assert(gated);

    return gated;
}

// Adds one GhostNetV2 ghost module to `network` and returns its last layer.
//
// Layers added, in order:
//   1. primary conv (kernel_size x kernel_size, `stride`) -> batch norm -> optional ReLU
//   2. "cheap" grouped conv (dw_size x dw_size, groups = init_channels) -> batch norm -> optional ReLU
//   3. channel concatenation of (1) and (2), then a slice keeping the first `oup` channels
//   4. only when mode == "attn": a gate branch computed from `input`
//      (2x2 average pool -> conv/BN -> 1x5 grouped conv/BN -> 5x1 grouped conv/BN -> sigmoid ->
//      nearest-neighbour resize to the slice's dimensions) multiplied element-wise with the slice.
//
// Parameters:
//   network     network definition the layers are appended to
//   input       input tensor (the slice/resize code indexes dimension 1 as channels of a 4-D tensor)
//   weightMap   weights looked up under lname + ".primary_conv.*", ".cheap_operation.*", ".short_conv.*"
//   inp         not used by this function; kept for signature compatibility
//   oup         number of output channels
//   kernel_size kernel size of the primary conv (and of the first gate-branch conv)
//   ratio       ghost ratio; the primary conv produces ceil(oup / ratio) channels
//   dw_size     kernel size of the cheap grouped conv
//   stride      stride of the primary conv
//   relu        whether a ReLU follows each of the two batch norms
//   lname       weight-name prefix of this module
//   mode        "original" (no gate) or "attn" (with gate)
//
// Returns the slice layer ("original"), the element-wise product layer ("attn"), or nullptr
// after printing an error for any other mode.
ILayer* ghostModuleV2(INetworkDefinition* network, ITensor& input, std::map<std::string, Weights>& weightMap, int inp,
                      int oup, int kernel_size = 1, int ratio = 2, int dw_size = 3, int stride = 1, bool relu = true,
                      std::string lname = "", std::string mode = "original") {
    // Reject an unknown mode before touching the network so a failed call does not leave
    // half a module's worth of dangling layers behind.
    const bool use_attention = (mode == "attn");
    if (!use_attention && mode != "original") {
        std::cerr << "Invalid mode: " << mode << " in ghostModuleV2" << std::endl;
        return nullptr;
    }

    // Integer ceiling division. The previous std::ceil(oup / ratio) divided two ints first,
    // so the ceil never rounded anything up and an odd `oup` lost a channel.
    // Assumes oup > 0 and ratio > 0.
    int init_channels = (oup + ratio - 1) / ratio;
    int new_channels = init_channels * (ratio - 1);

    // Primary convolution
    IConvolutionLayer* primary_conv = network->addConvolutionNd(input, init_channels, DimsHW{kernel_size, kernel_size},
                                                                weightMap[lname + ".primary_conv.0.weight"], Weights{});
    primary_conv->setStrideNd(DimsHW{stride, stride});
    primary_conv->setPaddingNd(DimsHW{kernel_size / 2, kernel_size / 2});

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *primary_conv->getOutput(0), lname + ".primary_conv.1", 1e-5);

    ITensor* act1_output = bn1->getOutput(0);
    if (relu) {
        IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
        act1_output = relu1->getOutput(0);
    }

    // Cheap operation: grouped conv with one group per primary channel
    IConvolutionLayer* cheap_conv =
            network->addConvolutionNd(*act1_output, new_channels, DimsHW{dw_size, dw_size},
                                      weightMap[lname + ".cheap_operation.0.weight"], Weights{});
    cheap_conv->setStrideNd(DimsHW{1, 1});
    cheap_conv->setPaddingNd(DimsHW{dw_size / 2, dw_size / 2});
    cheap_conv->setNbGroups(init_channels);

    IScaleLayer* bn2 =
            addBatchNorm2d(network, weightMap, *cheap_conv->getOutput(0), lname + ".cheap_operation.1", 1e-5);

    ITensor* act2_output = bn2->getOutput(0);
    if (relu) {
        IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
        act2_output = relu2->getOutput(0);
    }

    // Concatenate primary and cheap features
    ITensor* concat_inputs[] = {act1_output, act2_output};
    IConcatenationLayer* concat = network->addConcatenation(concat_inputs, 2);

    // Slice to oup channels: the concat holds init_channels * ratio channels, which can exceed oup
    Dims start{4, {0, 0, 0, 0}};
    Dims size = concat->getOutput(0)->getDimensions();
    size.d[1] = oup;
    Dims stride_{4, {1, 1, 1, 1}};

    ISliceLayer* slice = network->addSlice(*concat->getOutput(0), start, size, stride_);

    if (!use_attention) {
        return slice;
    }

    ITensor* out = slice->getOutput(0);

    // Attention gate, computed from the module input at half resolution
    IPoolingLayer* avg_pool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{2, 2});
    avg_pool->setStrideNd(DimsHW{2, 2});

    ITensor* avg_pooled = avg_pool->getOutput(0);

    // Short convolution branch
    IConvolutionLayer* short_conv1 =
            network->addConvolutionNd(*avg_pooled, oup, DimsHW{kernel_size, kernel_size},
                                      weightMap[lname + ".short_conv.0.weight"], Weights{});
    short_conv1->setStrideNd(DimsHW{1, 1});
    short_conv1->setPaddingNd(DimsHW{kernel_size / 2, kernel_size / 2});
    IScaleLayer* short_bn1 =
            addBatchNorm2d(network, weightMap, *short_conv1->getOutput(0), lname + ".short_conv.1", 1e-5);

    // Conv with kernel size (1,5), one group per channel
    IConvolutionLayer* short_conv2 = network->addConvolutionNd(
            *short_bn1->getOutput(0), oup, DimsHW{1, 5}, weightMap[lname + ".short_conv.2.weight"], Weights{});
    short_conv2->setStrideNd(DimsHW{1, 1});
    short_conv2->setPaddingNd(DimsHW{0, 2});
    short_conv2->setNbGroups(oup);
    IScaleLayer* short_bn2 =
            addBatchNorm2d(network, weightMap, *short_conv2->getOutput(0), lname + ".short_conv.3", 1e-5);

    // Conv with kernel size (5,1), one group per channel
    IConvolutionLayer* short_conv3 = network->addConvolutionNd(
            *short_bn2->getOutput(0), oup, DimsHW{5, 1}, weightMap[lname + ".short_conv.4.weight"], Weights{});
    short_conv3->setStrideNd(DimsHW{1, 1});
    short_conv3->setPaddingNd(DimsHW{2, 0});
    short_conv3->setNbGroups(oup);
    IScaleLayer* short_bn3 =
            addBatchNorm2d(network, weightMap, *short_conv3->getOutput(0), lname + ".short_conv.5", 1e-5);

    ITensor* res = short_bn3->getOutput(0);

    // Sigmoid activation
    IActivationLayer* gate = network->addActivation(*res, ActivationType::kSIGMOID);

    // Upsample the gate to the same size as out
    IResizeLayer* gate_upsampled = network->addResize(*gate->getOutput(0));
    gate_upsampled->setResizeMode(ResizeMode::kNEAREST);
    Dims out_dims = out->getDimensions();
    gate_upsampled->setOutputDimensions(out_dims);

    // Element-wise multiplication of features and gate
    IElementWiseLayer* scaled_out =
            network->addElementWise(*out, *gate_upsampled->getOutput(0), ElementWiseOperation::kPROD);

    return scaled_out;
}

// Adds one GhostNetV2 bottleneck to `network` and returns the layer that sums the main path
// with the shortcut.
//
// Main path: ghost module (lname + ".ghost1") -> depthwise conv + BN when stride > 1 ->
// squeeze-excite when se_ratio > 0 -> ghost module (lname + ".ghost2", no ReLU, always "original").
// Shortcut: identity when in_chs == out_chs and stride == 1, otherwise depthwise conv + BN
// followed by a 1x1 conv + BN.
// The first ghost module runs in "original" mode for layer_id <= 1 and in "attn" mode otherwise.
ILayer* ghostBottleneck(INetworkDefinition* network, ITensor& input, std::map<std::string, Weights>& weightMap,
                        int in_chs, int mid_chs, int out_chs, int dw_kernel_size = 3, int stride = 1,
                        float se_ratio = 0.0f, std::string lname = "", int layer_id = 0) {
    const int dw_pad = (dw_kernel_size - 1) / 2;
    const bool identity_skip = (in_chs == out_chs) && (stride == 1);
    const std::string first_mode = (layer_id > 1) ? "attn" : "original";

    // `body` always points at the most recent layer of the main path.
    ILayer* body =
            ghostModuleV2(network, input, weightMap, in_chs, mid_chs, 1, 2, 3, 1, true, lname + ".ghost1", first_mode);

    // Strided depthwise convolution, only for down-sampling blocks.
    if (stride > 1) {
        IConvolutionLayer* dw =
                network->addConvolutionNd(*body->getOutput(0), mid_chs, DimsHW{dw_kernel_size, dw_kernel_size},
                                          weightMap[lname + ".conv_dw.weight"], Weights{});
        dw->setStrideNd(DimsHW{stride, stride});
        dw->setPaddingNd(DimsHW{dw_pad, dw_pad});
        dw->setNbGroups(mid_chs);
        body = addBatchNorm2d(network, weightMap, *dw->getOutput(0), lname + ".bn_dw", 1e-5);
    }

    // Optional squeeze-and-excite.
    if (se_ratio > 0.0f) {
        body = squeezeExcite(network, *body->getOutput(0), weightMap, mid_chs, se_ratio, lname + ".se");
    }

    // Second ghost module: no ReLU, never uses the attention branch.
    body = ghostModuleV2(network, *body->getOutput(0), weightMap, mid_chs, out_chs, 1, 2, 3, 1, false,
                         lname + ".ghost2", "original");

    // Shortcut branch.
    ILayer* skip = nullptr;
    if (!identity_skip) {
        IConvolutionLayer* skip_dw =
                network->addConvolutionNd(input, in_chs, DimsHW{dw_kernel_size, dw_kernel_size},
                                          weightMap[lname + ".shortcut.0.weight"], Weights{});
        skip_dw->setStrideNd(DimsHW{stride, stride});
        skip_dw->setPaddingNd(DimsHW{dw_pad, dw_pad});
        skip_dw->setNbGroups(in_chs);
        IScaleLayer* skip_dw_bn =
                addBatchNorm2d(network, weightMap, *skip_dw->getOutput(0), lname + ".shortcut.1", 1e-5);

        IConvolutionLayer* skip_pw = network->addConvolutionNd(*skip_dw_bn->getOutput(0), out_chs, DimsHW{1, 1},
                                                               weightMap[lname + ".shortcut.2.weight"], Weights{});
        skip = addBatchNorm2d(network, weightMap, *skip_pw->getOutput(0), lname + ".shortcut.3", 1e-5);
    } else {
        skip = network->addIdentity(input);
    }

    return network->addElementWise(*body->getOutput(0), *skip->getOutput(0), ElementWiseOperation::kSUM);
}

// Builds the GhostNetV2 network layer by layer and compiles it into a TensorRT engine.
//
// The network is: conv stem -> 16 ghost bottlenecks -> ConvBnAct -> global average pool over
// the two trailing axes -> 1x1 conv head + ReLU -> fully connected classifier (1000 outputs).
// Weights are read from "../ghostnetv2.weights" and released again before returning.
// Returns whatever buildEngineWithConfig() returns; the caller owns the engine.
ICudaEngine* createEngine(IBuilder* builder, IBuilderConfig* config, DataType dt) {
    // One row per bottleneck, in network order. The row index is passed on as layer_id.
    struct BottleneckSpec {
        int in_chs;
        int mid_chs;
        int out_chs;
        int dw_kernel_size;
        int stride;
        float se_ratio;
        const char* lname;
    };
    static const BottleneckSpec kBottlenecks[] = {
            {16, 16, 16, 3, 1, 0.0f, "blocks.0.0"},      {16, 48, 24, 3, 2, 0.0f, "blocks.1.0"},
            {24, 72, 24, 3, 1, 0.0f, "blocks.2.0"},      {24, 72, 40, 5, 2, 0.25f, "blocks.3.0"},
            {40, 120, 40, 5, 1, 0.25f, "blocks.4.0"},    {40, 240, 80, 3, 2, 0.0f, "blocks.5.0"},
            {80, 200, 80, 3, 1, 0.0f, "blocks.6.0"},     {80, 184, 80, 3, 1, 0.0f, "blocks.6.1"},
            {80, 184, 80, 3, 1, 0.0f, "blocks.6.2"},     {80, 480, 112, 3, 1, 0.25f, "blocks.6.3"},
            {112, 672, 112, 3, 1, 0.25f, "blocks.6.4"},  {112, 672, 160, 5, 2, 0.25f, "blocks.7.0"},
            {160, 960, 160, 5, 1, 0.0f, "blocks.8.0"},   {160, 960, 160, 5, 1, 0.25f, "blocks.8.1"},
            {160, 960, 160, 5, 1, 0.0f, "blocks.8.2"},   {160, 960, 160, 5, 1, 0.25f, "blocks.8.3"},
    };
    const int bottleneck_count = static_cast<int>(sizeof(kBottlenecks) / sizeof(kBottlenecks[0]));

    // Explicit-batch network definition.
    const uint32_t creation_flags = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(creation_flags);

    // Network input.
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{batchSize, 3, INPUT_H, INPUT_W});
    assert(data);

    // Weights for every layer, keyed by parameter name.
    std::map<std::string, Weights> weightMap = loadWeights("../ghostnetv2.weights");

    // Stem, then the bottleneck stack.
    ILayer* tail = convBnReluStem(network, weightMap, *data, 16, "conv_stem");
    for (int layer_id = 0; layer_id < bottleneck_count; ++layer_id) {
        const BottleneckSpec& spec = kBottlenecks[layer_id];
        tail = ghostBottleneck(network, *tail->getOutput(0), weightMap, spec.in_chs, spec.mid_chs, spec.out_chs,
                               spec.dw_kernel_size, spec.stride, spec.se_ratio, spec.lname, layer_id);
    }

    // Final ConvBnAct.
    tail = convBnAct(network, weightMap, *tail->getOutput(0), 960, "blocks.9.0");

    // Global average pooling: reduce axes 2 and 3, keeping the reduced dimensions.
    const uint32_t pooled_axes = (1U << 2) | (1U << 3);
    IReduceLayer* global_pool = network->addReduce(*tail->getOutput(0), ReduceOperation::kAVG, pooled_axes, true);
    assert(global_pool);

    // 1x1 conv head followed by ReLU.
    IConvolutionLayer* conv_head = network->addConvolutionNd(
            *global_pool->getOutput(0), 1280, DimsHW{1, 1}, weightMap["conv_head.weight"], weightMap["conv_head.bias"]);
    IActivationLayer* head_relu = network->addActivation(*conv_head->getOutput(0), ActivationType::kRELU);

    // Classifier; its output is the network output.
    IFullyConnectedLayer* classifier = network->addFullyConnected(
            *head_relu->getOutput(0), 1000, weightMap["classifier.weight"], weightMap["classifier.bias"]);
    ITensor* logits = classifier->getOutput(0);
    logits->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*logits);

    // Compile the engine with a 16 MiB workspace limit.
    config->setMaxWorkspaceSize(1 << 24);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // The network definition is no longer needed once the engine is built.
    network->destroy();

    // Release the host copies of the weights.
    for (std::map<std::string, Weights>::iterator it = weightMap.begin(); it != weightMap.end(); ++it) {
        free(const_cast<void*>(it->second.values));
    }

    return engine;
}

// Builds the engine through the TensorRT API and stores its serialized form in *modelStream.
// The builder, the builder config and the engine are all destroyed before returning;
// the caller owns the returned host memory.
void APIToModel(IHostMemory** modelStream) {
    IBuilder* trt_builder = createInferBuilder(gLogger);
    IBuilderConfig* build_config = trt_builder->createBuilderConfig();

    // Construct the network and compile it in FP32.
    ICudaEngine* built_engine = createEngine(trt_builder, build_config, DataType::kFLOAT);
    assert(built_engine != nullptr);

    // Hand the serialized plan back to the caller.
    *modelStream = built_engine->serialize();

    // Tear down in the order engine, config, builder.
    built_engine->destroy();
    build_config->destroy();
    trt_builder->destroy();
}

// Runs one blocking inference pass.
//
// Copies `input` (batchSize * 3 * INPUT_H * INPUT_W floats, host memory) to the device, enqueues
// the execution context on a private CUDA stream, copies batchSize * OUTPUT_SIZE floats back into
// `output`, and waits for the stream to finish. Device buffers and the stream are created and
// released inside this call.
//
// If the enqueue itself fails, an error is printed to std::cerr, `output` is left untouched and
// the resources are still released.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // `buffers` has exactly two slots; an unknown binding name or an engine with extra bindings
    // would otherwise index outside of it.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);
    assert(inputIndex != outputIndex);

    // Compute the byte counts once, in size_t, so the multiplication cannot overflow int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Input and output buffers
    void* buffers[2] = {nullptr, nullptr};

    // Create buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Copy input data to device, execute inference, and copy output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    const bool enqueued = context.enqueueV2(buffers, stream, nullptr);
    if (enqueued) {
        CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    } else {
        // Do not copy back results that were never produced.
        std::cerr << "enqueueV2 failed; inference output is not valid" << std::endl;
    }
    // Check the synchronisation too: asynchronous errors from the copies/kernels surface here.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv) {
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./ghostnetv2 -s   // serialize model to plan file" << std::endl;
        std::cerr << "./ghostnetv2 -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create model and serialize
    char* trtModelStream{nullptr};
    size_t size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(&modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("ghostnetv2.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file("ghostnetv2.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        return -1;
    }

    // Allocate input and output data
    float* data = new float[batchSize * 3 * INPUT_H * INPUT_W];
    for (int i = 0; i < batchSize * 3 * INPUT_H * INPUT_W; i++)
        data[i] = 10.0;

    float* prob = new float[batchSize * OUTPUT_SIZE];

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Execute inference
    doInference(*context, data, prob, batchSize);

    // Print output results
    std::cout << "\nOutput:\n\n";
    for (int i = 0; i < batchSize; i++) {
        std::cout << "Batch " << i << ":\n";
        for (unsigned int j = 0; j < OUTPUT_SIZE; j++) {
            std::cout << prob[i * OUTPUT_SIZE + j] << ", ";
            if (j % 10 == 0)
                std::cout << j / 10 << std::endl;
        }
        std::cout << "\n";
    }

    // Release resources
    context->destroy();
    engine->destroy();
    runtime->destroy();
    delete[] data;
    delete[] prob;

    return 0;
}


================================================
FILE: ghostnet/ghostnetv2/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"

// Shorthand for the TensorRT logger severity enum, used throughout this header.
using Severity = nvinfer1::ILogger::Severity;

//! Stream buffer that accumulates log text and, on sync() or destruction, forwards it to a target
//! stream as "[MM/DD/YYYY-HH:MM:SS] <prefix><text>". Nothing is written while logging is disabled.
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted output (held by reference)
    //! \param prefix    text inserted between the timestamp and the buffered message
    //! \param shouldLog whether putOutput() writes anything at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Takes over the target stream, prefix and logging flag of \p other.
    //! Previously only the stream was carried over, which left the prefix empty and the logging
    //! flag uninitialized. Text already buffered in \p other stays there and is flushed by it.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the target stream, clears the buffer and
    //! flushes the stream. Does nothing (and keeps the buffer) while logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp; it goes to the same stream as the message so the two cannot be
            // split across stdout and stderr
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                // fill() is sticky stream state: set it once and restore it afterwards so the
                // caller's stream formatting is not altered
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // target stream, not owned
    std::string mPrefix;    // inserted before every flushed message
    bool mShouldLog;        // false: putOutput() is a no-op
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!  Keeping the buffer in a base class lets it be constructed before the std::ostream base
//!  of LogStreamConsumer, which receives the buffer's address.
//!
class LogStreamConsumerBase {
   public:
    //! \brief Forwards all three arguments to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog} {}

   protected:
    //! Buffer shared with the derived stream class.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief std::ostream front end for log messages of one fixed severity.
//!  The base-class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  the buffer living in LogStreamConsumerBase has to be constructed before its address is
//!  handed to std::ostream, otherwise the stream would be given an uninitialized buffer.
//!  Do not reorder the base classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a stream for messages of level \p severity.
    //!  Output is enabled when \p severity compares less than or equal to \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity),
                                isReportable(severity, reportableSeverity)),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(isReportable(severity, reportableSeverity)),
          mSeverity(severity) {}

    //! \brief Builds a fresh consumer with the same severity and logging flag as \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this stream's severity passes the new threshold.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = isReportable(mSeverity, reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! True when a message of \p severity passes the \p reportableSeverity threshold.
    static bool isReportable(Severity severity, Severity reportableSeverity) {
        return severity <= reportableSeverity;
    }

    //! kINFO and above (by enum value) go to std::cout, everything else to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Bracketed one-letter tag for \p severity; asserts on an unknown value.
    static std::string severityPrefix(Severity severity) {
        if (severity == Severity::kINTERNAL_ERROR) {
            return "[F] ";
        }
        if (severity == Severity::kERROR) {
            return "[E] ";
        }
        if (severity == Severity::kWARNING) {
            return "[W] ";
        }
        if (severity == Severity::kINFO) {
            return "[I] ";
        }
        if (severity == Severity::kVERBOSE) {
            return "[V] ";
        }
        assert(0);
        return "";
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Creates a logger with the given reporting threshold (kWARNING when omitted).
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief State a test can be reported in
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible accessor for the nvinfer1::ILogger associated with this Logger
    //! \return This object, viewed as an nvinfer1::ILogger
    //!
    //! TODO Once every sample registers its logger through this method, the inheritance of
    //! Logger from ILogger can be removed
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] " followed by \p msg through a LogStreamConsumer of the given severity.
    //! Samples should not call this directly; it is meant to go away together with the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) noexcept override {
        LogStreamConsumer sink(mReportableSeverity, severity);
        sink << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Sets the verbosity threshold
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle carrying the logging information of one test
    //!
    //! A sample obtains a TestAtom from Logger::defineTest() and passes it to
    //! Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // Only Logger may create atoms.
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;         // set once reportTestStart() has run
        std::string mName;     // test name as printed in result lines
        std::string mCmdline;  // command line printed after the name
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom for use with Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        const bool started = false;
        return TestAtom(started, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() that takes the command line as (argc, argv)
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports a pass or a failure depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        if (pass) {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Current verbosity threshold.
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns the bracketed one-letter tag used to prefix a message of the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        if (severity == Severity::kINTERNAL_ERROR) {
            return "[F] ";
        }
        if (severity == Severity::kERROR) {
            return "[E] ";
        }
        if (severity == Severity::kWARNING) {
            return "[W] ";
        }
        if (severity == Severity::kINFO) {
            return "[I] ";
        }
        if (severity == Severity::kVERBOSE) {
            return "[V] ";
        }
        assert(0);
        return "";
    }

    //!
    //! \brief returns the word printed in a test result line for the given result
    //!
    static const char* testResultString(TestResult result) {
        if (result == TestResult::kRUNNING) {
            return "RUNNING";
        }
        if (result == TestResult::kPASSED) {
            return "PASSED";
        }
        if (result == TestResult::kFAILED) {
            return "FAILED";
        }
        if (result == TestResult::kWAIVED) {
            return "WAIVED";
        }
        assert(0);
        return "";
    }

    //!
    //! \brief returns the output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief prints one "&&&& <RESULT> <name> # <cmdline>" line on the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief joins the (argc, argv) values into one space-separated command line string
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream joined;
        if (argc > 0) {
            joined << argv[0];
        }
        for (int idx = 1; idx < argc; ++idx) {
            joined << " " << argv[idx];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief returns a LogStreamConsumer for messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: googlenet/CMakeLists.txt
================================================
# The declared minimum must cover everything this project uses:
# - find_package(CUDAToolkit) needs CMake >= 3.17
# - FindTensorRT.cmake creates an ALIAS of a non-global imported target, which
#   needs CMake >= 3.18
cmake_minimum_required(VERSION 3.18)

project(
  googlenet
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default GPU architectures. This only applies when nothing has defined the
# variable before this point (e.g. -DCMAKE_CUDA_ARCHITECTURES=..., the CUDAARCHS
# environment variable, or the CUDA language initialisation done by project()).
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      60
      70
      72
      75
      80
      86
      89)
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

# Define the TensorRT::TensorRT target only once; the helper script lives next
# to this file, so it is addressed relative to this list file.
if(NOT TARGET TensorRT::TensorRT)
  include("${CMAKE_CURRENT_LIST_DIR}/FindTensorRT.cmake")
endif()

add_executable(${PROJECT_NAME} googlenet.cpp)

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_LIST_DIR}
                                                  ${OpenCV_INCLUDE_DIRS})

target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart
                                             TensorRT::TensorRT ${OpenCV_LIBS})


================================================
FILE: googlenet/FindTensorRT.cmake
================================================
# This script aliases a non-GLOBAL imported target (TensorRT::TensorRT), which
# CMake only supports from version 3.18 on.
cmake_minimum_required(VERSION 3.18)

# _guess_path(<var_name> <required_files> [<candidate>...])
#
# Filters the candidate directories passed after the two named arguments and
# keeps those that exist and contain every entry of the `required_files` list.
# All accepted directories are returned, in candidate order, through
# <var_name> in the caller's scope. Configuration stops with a fatal error
# when no candidate is accepted.
function(_guess_path var_name required_files)
  set(_accepted "")

  foreach(_candidate IN LISTS ARGN)
    if(NOT EXISTS "${_candidate}")
      message(DEBUG "skip non-existing path '${_candidate}'")
      continue()
    endif()

    # A candidate is complete until the first required file turns out missing.
    set(_complete TRUE)
    foreach(_needed IN LISTS required_files)
      if(EXISTS "${_candidate}/${_needed}")
        continue()
      endif()
      set(_complete FALSE)
      message(DEBUG "'${_candidate}' missing '${_needed}'")
      break()
    endforeach()

    if(NOT _complete)
      message(DEBUG "reject '${_candidate}'")
      continue()
    endif()

    list(APPEND _accepted "${_candidate}")
    message(DEBUG "accept '${_candidate}'")
  endforeach()

  list(LENGTH _accepted _accepted_count)
  if(_accepted_count EQUAL 0)
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_accepted}"
      PARENT_SCOPE)
endfunction()

# add library
# Imported interface target carrying the TensorRT link libraries and include
# directories; consumers link against the TensorRT::TensorRT alias.
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# Operands are quoted so that an unset or empty value still produces a
# well-formed condition instead of silently dropping an argument.
if(NOT "${TRT_VERSION}" STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

# The environment variable, when set, takes precedence over the cache entry.
if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# The first run of digits is taken as the major version. The input is quoted so
# an empty TRT_VERSION yields an empty major version rather than a malformed
# string() call.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(
      FATAL_ERROR
        "TensorRT_DIR=${TensorRT_DIR} does not exist!"
    )
  endif()

  # TensorRT 10 and later ship import libraries carrying a "_10" suffix.
  if("${TRT_MAJOR_VERSION}" GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include"
                                "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  set(_modules nvinfer nvinfer_plugin)
  # The optional modules are probed for TensorRT >= 8 and also when the version
  # is unknown; modules that are not installed are skipped further below.
  if("${TRT_MAJOR_VERSION}" STREQUAL "" OR "${TRT_MAJOR_VERSION}" GREATER_EQUAL
                                          8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Start from an empty list so repeated inclusion cannot accumulate duplicates.
set(TensorRT_LIBRARIES "")
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  if(TensorRT_${lib}_LIBRARY)
    list(APPEND TensorRT_LIBRARIES "${TensorRT_${lib}_LIBRARY}")
  elseif("${lib}" MATCHES "^nvinfer(_plugin)?(_[0-9]+)?$")
    # The core runtime and plugin libraries are mandatory.
    message(
      FATAL_ERROR
        "TensorRT library '${lib}' not found (searched '${TensorRT_LIBRARY_DIR}')"
    )
  else()
    # Optional modules are not shipped with every TensorRT release; never put
    # a "-NOTFOUND" entry on the link line.
    message(STATUS "TensorRT module '${lib}' not found, skipping")
  endif()
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)


================================================
FILE: googlenet/README.md
================================================
# Googlenet

## Introduction

GoogLeNet (Inception v1) model architecture from [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842). For model details, refer to the implementation in [torchvision](https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py#L29). For generating the `.wts` file, refer to [pytorchx/googlenet](https://github.com/wang-xinyu/pytorchx/tree/master/googlenet) or use the `gen_wts.py` script described below.

## Usage

1. use `gen_wts.py` to generate wts file.

```bash
python3 gen_wts.py
```

2. build C++ code

```bash
pushd tensorrtx/googlenet
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

3. serialize wts model to engine file.

```bash
./build/googlenet -s
```

4. run inference

```bash
./build/googlenet -d
```

output looks like:

```bash
...
====
Execution time: 637us
-1.823, -0.9841, 0.6483, 0.7607, -0.4659, -1.407, -2.807, -1.175, -0.4034, -1.881, -1.267, -1.654, 0.7542, -1.777, -0.7118, -2.134, -1.542, 0.1852, -3.036, -0.5396, -0.1669,
====
prediction result:
Top: 0 idx: 285, logits: 9.9, label: Egyptian cat
Top: 1 idx: 281, logits: 8.304, label: tabby, tabby cat
Top: 2 idx: 282, logits: 6.859, label: tiger cat
```


================================================
FILE: googlenet/gen_wts.py
================================================
import struct

import cv2
import numpy as np
import torch
from torchvision.models.googlenet import googlenet


def read_imagenet_labels() -> dict[int, str]:
    """
    read ImageNet 1000 labels

    Each non-empty line of the label file is expected to look like
    ``<class id>: '<label>',``. The parser also tolerates an opening/closing
    brace around the whole mapping, double-quoted labels, a missing trailing
    comma or newline on the last entry, and labels that contain ``": "``.

    Returns:
        dict[int, str]: labels dict

    Raises:
        ValueError: if a non-empty line has no ``": "`` separator or its key
            is not an integer
    """
    clsid2label: dict[int, str] = {}
    with open("../assets/imagenet1000_clsidx_to_labels.txt", "r", encoding="utf-8") as f:
        for line in f:
            entry = line.strip()
            # skip blank lines and braces standing on their own line
            if entry in ("", "{", "}"):
                continue
            # split on the first separator only: the label itself may contain ": "
            key, value = entry.split(": ", 1)
            # drop the trailing comma / closing brace, then one pair of quotes
            value = value.rstrip(",}").strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
                value = value[1:-1]
            # setdefault keeps the first label if a class id is repeated
            clsid2label.setdefault(int(key.lstrip("{")), value)
    return clsid2label


def preprocess(img: np.ndarray) -> torch.Tensor:
    """
    a preprocess method align with ImageNet dataset

    The image is converted from BGR to RGB, scaled to [0, 1] as float32,
    resized to 224x224 with bilinear interpolation and normalized per channel
    with the mean/std values below.

    Args:
        img (np.ndarray): input image in BGR channel order, `HWC` layout

    Returns:
        torch.Tensor: preprocessed image in `NCHW` layout with a batch size of 1
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    scaled = rgb.astype(np.float32) / 255.0
    resized = cv2.resize(scaled, (224, 224), interpolation=cv2.INTER_LINEAR)
    channel_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    channel_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    # broadcasting applies the per-channel statistics along the last (channel) axis
    normalized = (resized - channel_mean) / channel_std
    # HWC -> CHW, then add the leading batch axis
    nchw = normalized.transpose(2, 0, 1)[None, ...]
    return torch.from_numpy(nchw)


def main():
    """
    Run the pretrained torchvision GoogLeNet once on the sample image, print
    its top-3 predictions, then export every `state_dict` entry to
    `../models/googlenet.wts`.

    The `.wts` file is text: the first line holds the number of entries; each
    following line is `<name> <count>` followed by the values encoded as
    big-endian float32 hex strings.

    Raises:
        FileNotFoundError: if the sample image cannot be read
    """
    import os

    labels = read_imagenet_labels()

    img_path = "../assets/cats.jpg"
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising
        raise FileNotFoundError(f"could not read image '{img_path}'")
    img = preprocess(img)

    model = googlenet(pretrained=True)
    with torch.inference_mode():
        model = model.eval()
        output = model(img)
        for i, batch in enumerate(torch.topk(output, k=3).indices):
            for j, idx in enumerate(batch):
                print(f"\tBatch: {i}, Top: {j}, logits: {output[i][idx]:.4f}, label: {labels[int(idx)]}")
        print(f"{'=' * 32}")

    # build the state dict once and reuse it for the header and the body
    state_dict = model.state_dict()
    wts_path = "../models/googlenet.wts"
    os.makedirs(os.path.dirname(wts_path), exist_ok=True)
    # the context manager closes the file; no explicit close() is needed
    with open(wts_path, "w") as f:
        f.write("{}\n".format(len(state_dict)))
        for k, v in state_dict.items():
            vr = v.reshape(-1).cpu().numpy()
            print(k, v.shape)
            # same layout as before: "<name> <count> " followed by " <hex>" per value
            hex_values = "".join(" " + struct.pack(">f", float(vv)).hex() for vv in vr)
            f.write("{} {} {}\n".format(k, len(vr), hex_values))


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


================================================
FILE: googlenet/googlenet.cpp
================================================
#include <NvInfer.h>
#include <cassert>
#include <chrono>
#include <cmath>
#include <opencv2/opencv.hpp>
#include <vector>
#include "logging.h"
#include "utils.h"

// Maps a tensor name to its weights.
using WeightMap = std::map<std::string, Weights>;
// Short aliases for TensorRT enums used while building the network.
using M = nvinfer1::MatrixOperation;
using E = nvinfer1::ElementWiseOperation;
using NDCF = nvinfer1::NetworkDefinitionCreationFlag;

// Logger instance handed to the TensorRT builder and runtime in this file.
static Logger gLogger;

// stuff we know about googlenet
// Batch size passed to preprocess_img.
static constexpr const std::size_t N = 1;
static constexpr const int32_t INPUT_H = 224;
static constexpr const int32_t INPUT_W = 224;
// Per-sample element counts of the I/O tensors: index 0 is the input, index 1 the output.
static constexpr const std::array<int32_t, 2> SIZES = {3 * INPUT_H * INPUT_W, 1000};
// Names of the I/O tensors, in the same order as SIZES.
static constexpr const std::array<const char*, 2> NAMES = {"data", "prob"};
// When true, the network takes a uint8 NHWC image and runs a transform layer inside the
// engine; otherwise the image is preprocessed on the host before inference.
static constexpr const bool TRT_PREPROCESS = TRT_VERSION >= 8510 ? true : false;
// All paths are relative to the current working directory.
static constexpr const char* WTS_PATH = "../models/googlenet.wts";
static constexpr const char* ENGINE_PATH = "../models/googlenet.engine";
static constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt";
// Per-channel normalization constants passed to the preprocessing helpers.
static constexpr const std::array<const float, 3> mean = {0.485f, 0.456f, 0.406f};
static constexpr const std::array<const float, 3> stdv = {0.229f, 0.224f, 0.225f};

/**
 * @brief Fold a BatchNorm2d into a single per-channel scale layer
 *
 * Reads `<lname>.weight`, `<lname>.bias`, `<lname>.running_mean` and `<lname>.running_var`
 * from the weight map and derives
 *   scale = gamma / sqrt(var + eps)
 *   shift = beta - mean * scale
 * The derived buffers are malloc'ed and stored back into the map under `<lname>.scale`,
 * `<lname>.shift` and `<lname>.power`.
 *
 * @param network network definition from TensorRT API
 * @param m weight map, extended with the derived entries
 * @param input input tensor
 * @param lname layer name from weight map
 * @param eps epsilon added to the variance
 * @return ILayer* the scale layer, named `lname`
 */
auto addBatchNorm2d(INetworkDefinition* network, WeightMap& m, ITensor& input, const std::string& lname,
                    float eps = 1e-3) -> ILayer* {
    static Weights none{DataType::kFLOAT, nullptr, 0ll};

    const auto* gamma = static_cast<const float*>(m[lname + ".weight"].values);
    const auto* beta = static_cast<const float*>(m[lname + ".bias"].values);
    const auto* runningMean = static_cast<const float*>(m[lname + ".running_mean"].values);
    const auto* runningVar = static_cast<const float*>(m[lname + ".running_var"].values);
    const int64_t channels = m[lname + ".running_var"].count;

    auto* scaleBuf = static_cast<float*>(malloc(sizeof(float) * channels));
    auto* shiftBuf = static_cast<float*>(malloc(sizeof(float) * channels));
    // the shift of a channel depends on the scale just computed for that channel
    for (int64_t c = 0; c < channels; ++c) {
        scaleBuf[c] = gamma[c] / sqrt(runningVar[c] + eps);
        shiftBuf[c] = beta[c] - (runningMean[c] * scaleBuf[c]);
    }
    Weights scale{DataType::kFLOAT, scaleBuf, channels};
    Weights shift{DataType::kFLOAT, shiftBuf, channels};

    // keep the derived weights in the map next to the loaded ones
    m[lname + ".scale"] = scale;
    m[lname + ".shift"] = shift;
    m[lname + ".power"] = none;

    auto* scaleLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, none);
    assert(scaleLayer);
    scaleLayer->setName(lname.c_str());
    return scaleLayer;
}

/**
 * @brief A basic conv2d+bn+relu layer from googlenet
 *
 * The convolution has no bias and reads its kernel from `<lname>.conv.weight`; the batch norm
 * reads its parameters from `<lname>.bn.*`.
 *
 * @param network network definition from TensorRT API
 * @param weightMap weight map
 * @param input input tensor
 * @param lname layer name from weight map
 * @param outch output channels
 * @param k kernel size for convolution
 * @param s stride size for convolution
 * @param p padding size for convolution
 * @return ILayer* the final ReLU layer
 */
ILayer* basicConv2d(INetworkDefinition* network, WeightMap& weightMap, ITensor& input, const std::string& lname,
                    int32_t outch, int k, int s = 1, int p = 0) {
    static const Weights none{DataType::kFLOAT, nullptr, 0ll};

    // Every layer is checked right after creation, before its output is dereferenced.
    auto* conv = network->addConvolutionNd(input, outch, DimsHW{k, k}, weightMap[lname + ".conv.weight"], none);
    assert(conv);
    conv->setName(lname.c_str());
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});

    auto* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn");
    assert(bn);
    bn->setName((lname + ".bn").c_str());

    auto* relu = network->addActivation(*bn->getOutput(0), ActivationType::kRELU);
    assert(relu);
    relu->setName((lname + ".relu").c_str());
    return relu;
}

/**
 * @brief Inception module from googlenet implementation in torchvision, see:
 * https://github.com/pytorch/vision/blob/v0.24.1/torchvision/models/googlenet.py#L184
 *
 * Four parallel branches read the same input and are concatenated:
 *   branch1: 1x1 conv
 *   branch2: 1x1 conv -> 3x3 conv
 *   branch3: 1x1 conv -> 3x3 conv
 *   branch4: 3x3 max pool (stride 1, padding 1) -> 1x1 conv
 *
 * @param network network definition from TensorRT API
 * @param weightMap weight map
 * @param input input tensor
 * @param lname layer name prefix from weight map (e.g. "inception3a.")
 * @param ch1x1 output channels of branch1
 * @param ch3x3red output channels of the 1x1 conv in branch2
 * @param ch3x3 output channels of the 3x3 conv in branch2
 * @param ch5x5red output channels of the 1x1 conv in branch3
 * @param ch5x5 output channels of the second conv in branch3
 * @param pool_proj output channels of the 1x1 conv in branch4
 * @return IConcatenationLayer* concatenation of the four branch outputs
 */
IConcatenationLayer* inception(INetworkDefinition* network, WeightMap& weightMap, ITensor& input,
                               const std::string& lname, int ch1x1, int ch3x3red, int ch3x3, int ch5x5red, int ch5x5,
                               int pool_proj) {
    // "cbr" means "Conv-Batchnorm-Relu"
    // Every layer is checked right after creation, before its output is dereferenced.
    auto* cbr1 = basicConv2d(network, weightMap, input, lname + "branch1", ch1x1, 1);
    assert(cbr1);

    auto* cbr2 = basicConv2d(network, weightMap, input, lname + "branch2.0", ch3x3red, 1);
    assert(cbr2);
    auto* cbr3 = basicConv2d(network, weightMap, *cbr2->getOutput(0), lname + "branch2.1", ch3x3, 3, 1, 1);
    assert(cbr3);

    auto* cbr4 = basicConv2d(network, weightMap, input, lname + "branch3.0", ch5x5red, 1);
    assert(cbr4);
    // NOTE(review): a 3x3 kernel is used here despite the "5x5" parameter names; presumably
    // this mirrors the referenced torchvision implementation - confirm before changing.
    auto* cbr5 = basicConv2d(network, weightMap, *cbr4->getOutput(0), lname + "branch3.1", ch5x5, 3, 1, 1);
    assert(cbr5);

    auto* pool1 = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStrideNd(DimsHW{1, 1});
    pool1->setPaddingNd(DimsHW{1, 1});
    pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);
    auto* cbr6 = basicConv2d(network, weightMap, *pool1->getOutput(0), lname + "branch4.1", pool_proj, 1);
    assert(cbr6);

    std::array<ITensor*, 4> inputTensors = {cbr1->getOutput(0), cbr3->getOutput(0), cbr5->getOutput(0),
                                            cbr6->getOutput(0)};
    IConcatenationLayer* cat1 =
            network->addConcatenation(inputTensors.data(), static_cast<int32_t>(inputTensors.size()));
    assert(cat1);
    return cat1;
}

// Create the engine using only the API and not any parser.
//
// Loads the weights from WTS_PATH, builds the GoogLeNet graph layer by layer, builds the
// engine and finally frees every host buffer held by the weight map.
//
// N       batch size used for the input tensor (and for setMaxBatchSize on old TensorRT)
// runtime runtime used to deserialize the built plan (TensorRT >= 8 path)
// builder builder that creates the network and the engine
// config  builder config; the workspace limit is set here
// dt      data type of the input tensor (overridden with kUINT8 when TRT_PREPROCESS is on)
//
// Returns the engine, or nullptr when building or deserializing fails.
ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    WeightMap weightMap = loadWeights(WTS_PATH);

#if TRT_VERSION >= 11200
    auto flag = 1U << static_cast<int>(NDCF::kSTRONGLY_TYPED);
#elif TRT_VERSION >= 10000
    auto flag = 0U;
#else
    auto flag = 1U << static_cast<int>(NDCF::kEXPLICIT_BATCH);
#endif
    auto* network = builder->createNetworkV2(flag);
    assert(network);

    ITensor* input{nullptr};
    if constexpr (TRT_PREPROCESS) {
        // the engine takes the raw uint8 NHWC image and normalizes it itself
        dt = DataType::kUINT8;
        input = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3});
        assert(input);
        auto* trans = addTransformLayer(network, *input, true, mean, stdv);
        assert(trans);
        input = trans->getOutput(0);
    } else {
        input = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W});
    }
    assert(input);

    auto* relu1 = basicConv2d(network, weightMap, *input, "conv1", 64, 7, 2, 3);
    auto* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);
    pool1->setName("pool1");

    auto* relu2 = basicConv2d(network, weightMap, *pool1->getOutput(0), "conv2", 64, 1);
    auto* relu3 = basicConv2d(network, weightMap, *relu2->getOutput(0), "conv3", 192, 3, 1, 1);
    auto* pool2 = network->addPoolingNd(*relu3->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool2);
    pool2->setStrideNd(DimsHW{2, 2});
    pool2->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);
    pool2->setName("pool2");

    auto* cat1 = inception(network, weightMap, *pool2->getOutput(0), "inception3a.", 64, 96, 128, 16, 32, 32);
    auto* cat2 = inception(network, weightMap, *cat1->getOutput(0), "inception3b.", 128, 128, 192, 32, 96, 64);
    auto* pool3 = network->addPoolingNd(*cat2->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool3);
    pool3->setStrideNd(DimsHW{2, 2});
    pool3->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);
    pool3->setName("pool3");

    auto* cat3 = inception(network, weightMap, *pool3->getOutput(0), "inception4a.", 192, 96, 208, 16, 48, 64);
    cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception4b.", 160, 112, 224, 24, 64, 64);
    cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception4c.", 128, 128, 256, 24, 64, 64);
    cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception4d.", 112, 144, 288, 32, 64, 64);
    cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception4e.", 256, 160, 320, 32, 128, 128);

    IPoolingLayer* pool4 = network->addPoolingNd(*cat3->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool4);
    pool4->setStrideNd(DimsHW{2, 2});
    pool4->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);
    pool4->setName("pool4");

    cat3 = inception(network, weightMap, *pool4->getOutput(0), "inception5a.", 256, 160, 320, 32, 128, 128);
    cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception5b.", 384, 192, 384, 48, 128, 128);

    // this is a AdaptiveAvgPool2d in pytorch implementation
    IPoolingLayer* pool5 = network->addPoolingNd(*cat3->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(pool5);
    auto* shuffle = network->addShuffle(*pool5->getOutput(0));
    assert(shuffle);
    shuffle->setName("shuffle");
    shuffle->setReshapeDimensions(Dims2{1, -1});  // "-1" means "1024"

    // fully connected layer expressed as x * W^T + b
    auto* fcw = network->addConstant(Dims2{1000, 1024}, weightMap["fc.weight"])->getOutput(0);
    auto* fcb = network->addConstant(Dims2{1, 1000}, weightMap["fc.bias"])->getOutput(0);
    auto* fc0 = network->addMatrixMultiply(*shuffle->getOutput(0), M::kNONE, *fcw, M::kTRANSPOSE);
    assert(fc0);
    auto* fc1 = network->addElementWise(*fc0->getOutput(0), *fcb, E::kSUM);
    assert(fc1);

    fc1->getOutput(0)->setName(NAMES[1]);
    network->markOutput(*fc1->getOutput(0));
    // Build engine
#if TRT_VERSION >= 8000
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE);
    ICudaEngine* engine = nullptr;
    IHostMemory* plan = builder->buildSerializedNetwork(*network, *config);
    if (plan != nullptr) {
        engine = runtime->deserializeCudaEngine(plan->data(), plan->size());
        // the serialized plan is only needed for deserialization; release it here
        delete plan;
    } else {
        std::cerr << "failed to build serialized network\n";
    }
    delete network;
#else
    builder->setMaxBatchSize(N);
    config->setMaxWorkspaceSize(WORKSPACE_SIZE);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    network->destroy();
#endif

    std::cout << "build finished\n";
    // Release host memory
    for (auto& entry : weightMap) {
        free((void*)entry.second.values);
    }

    return engine;
}

void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(N, runtime, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

#if TRT_VERSION >= 8000
    delete engine;
    delete config;
    delete builder;
#else
    engine->destroy();
    config->destroy();
    builder->destroy();
#endif
}

/**
 * @brief Run one synchronous inference and return the output tensors
 *
 * Creates a stream, allocates one device buffer per I/O tensor, uploads `input` into tensor 0,
 * enqueues the execution, downloads every remaining tensor as float and releases all device
 * resources again.
 *
 * @param context execution context of a deserialized engine
 * @param input host pointer to the input data; its size is derived from SIZES[0], the
 *              input tensor's data type and batchSize
 * @param batchSize number of samples in `input`
 * @return one float vector per output tensor, each holding batchSize * SIZES[i] values
 */
std::vector<std::vector<float>> doInference(IExecutionContext& context, void* input, int64_t batchSize) {
    const auto& engine = context.getEngine();
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

#if TRT_VERSION >= 8000
    const int32_t nIO = engine.getNbIOTensors();
#else
    const int32_t nIO = engine.getNbBindings();
#endif
    // SIZES is indexed by tensor position below, so it must cover every I/O tensor
    assert(nIO > 0 && static_cast<std::size_t>(nIO) <= SIZES.size());

    std::vector<void*> buffers(static_cast<std::size_t>(nIO));
    for (int32_t i = 0; i < nIO; ++i) {
        std::size_t size = 0;
#if TRT_VERSION >= 8000
        auto* tensor_name = engine.getIOTensorName(i);
        auto s = getSize(engine.getTensorDataType(tensor_name));
        size = s * batchSize * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
        context.setTensorAddress(tensor_name, buffers[i]);
#else
        const int32_t idx = engine.getBindingIndex(NAMES[i]);
        auto s = getSize(engine.getBindingDataType(idx));
        assert(idx == i);
        size = s * batchSize * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
#endif
    }

    // The enqueue call must not live inside assert(): with NDEBUG the whole expression would
    // be compiled out and no inference would run.
#if TRT_VERSION >= 8000
    const bool enqueued = context.enqueueV3(stream);
#else
    const bool enqueued = context.enqueueV2(buffers.data(), stream, nullptr);
#endif
    if (!enqueued) {
        std::cerr << "failed to enqueue inference\n";
    }
    assert(enqueued);

    // Each destination vector is placed into `prob` before the asynchronous copy is issued, so
    // the copy target stays alive and in place until the stream is synchronized. reserve()
    // guarantees that later emplace_back calls do not relocate earlier vectors.
    std::vector<std::vector<float>> prob;
    prob.reserve(static_cast<std::size_t>(nIO - 1));
    for (int i = 1; i < nIO; ++i) {
        prob.emplace_back(static_cast<std::size_t>(batchSize * SIZES[i]), std::nanf(""));
        auto& host = prob.back();
        CHECK(cudaMemcpyAsync(host.data(), buffers[i], host.size() * sizeof(float), cudaMemcpyDeviceToHost,
                              stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    for (auto& buffer : buffers) {
        CHECK(cudaFree(buffer));
    }
    CHECK(cudaStreamDestroy(stream));
    return prob;
}

int main(int argc, char** argv) {
    checkTrtEnv();
    if (argc != 2) {
        std::cerr << "arguments not right!\n";
        std::cerr << "./googlenet -s   // serialize model to plan file\n";
        std::cerr << "./googlenet -d   // deserialize plan file and run inference\n";
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    char* trtModelStream{nullptr};
    std::streamsize size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, runtime, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc);
        if (!p) {
            std::cerr << "could not open plan output file\n";
            return -1;
        }
        if (modelStream->size() > static_cast<std::size_t>(std::numeric_limits<std::streamsize>::max())) {
            std::cerr << "this model is too large to serialize\n";
            return -1;
        }
        const auto* data_ptr = reinterpret_cast<const char*>(modelStream->data());
        auto data_size = static_cast<std::streamsize>(modelStream->size());
        p.write(data_ptr, data_size);
#if TRT_VERSION >= 8000
        delete modelStream;
#else
        modelStream->destroy();
#endif
        return 0;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file(ENGINE_PATH, std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        return 1;
    }

#if TRT_VERSION >= 8000
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
#else
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
#endif
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    const std::string img_path = "../assets/cats.jpg";
    void* input = nullptr;
    std::vector<float> flat_img;
    cv::Mat img = cv::imread(img_path, cv::IMREAD_COLOR);

    if constexpr (TRT_PREPROCESS) {
        // for simplicity, resize image on cpu side
        cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        input = static_cast<void*>(img.data);
    } else {
        flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W);
        input = flat_img.data();
    }
    assert(input);

    for (int32_t i = 0; i < 100; ++i) {
        auto _start = std::chrono::system_clock::now();
        auto prob = doInference(*context, input, 1);
        auto _end = std::chrono::system_clock::now();
        auto _time = std::chrono::duration_cast<std::chrono::microseconds>(_end - _start).count();
        std::cout << "Execution time: " << _time << "us\n";

        for (const auto& vector : prob) {
            int idx = 0;
            for (auto v : vector) {
                std::cout << std::setprecision(4) << v << ", " << std::flush;
                if (++idx > 20) {
                    std::cout << "\n====\n";
                    break;
                }
            }
        }

        if (i == 99) {
            std::cout << "prediction result:\n";
            auto labels = loadImagenetLabelMap(LABELS_PATH);
            int _top = 0;
            for (auto& [idx, logits] : topk(prob[0], 3)) {
                std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits
                          << ", label: " << labels[idx] << "\n";
            }
        }
    }
    delete[] trtModelStream;

#if TRT_VERSION >= 8000
    delete context;
    delete engine;
    delete runtime;
#else
    context->destroy();
    engine->destroy();
    runtime->destroy();
#endif

    return 0;
}


================================================
FILE: googlenet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <cstdint>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include <utility>
#include "NvInferRuntime.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its content to an output stream, prefixed with a
//!  timestamp and a fixed prefix, whenever it is synchronized or destroyed
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream stream that receives the formatted messages
    //! \param prefix text inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    //! Takes over the output stream, prefix and log flag of \p other; the buffered characters
    //! themselves are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer>" to the output stream, empties the buffer
    //! and flushes the stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp
            // The timestamp goes to the same stream as the message, so both stay together
            // even when the message is routed to a stream other than std::cout.
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                // the fill character is sticky on the stream: restore it afterwards
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!  Keeping the buffer in a base class lets it be constructed before the std::ostream
//!  base of LogStreamConsumer receives its address.
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all arguments to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, std::string prefix, bool shouldLog)
        : mBuffer{stream, std::move(prefix), shouldLog} {}

   protected:
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!  A message is logged when `severity <= reportableSeverity`; the target stream and the prefix
    //!  are both derived from `severity` (see severityOstream() / severityPrefix() below).
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Move constructor.
    //!  Builds a fresh buffer from the other object's severity and should-log flag and attaches this
    //!  stream to it; nothing is read from the other object's buffer here.
    LogStreamConsumer(LogStreamConsumer&& other) noexcept
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer should log, given a new reportable severity,
    //!  and forwards the decision to the underlying buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! \brief Selects the output stream: std::cout for severities comparing >= kINFO, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter message prefix.
    //!  An unknown severity trips the assert (when asserts are enabled) and yields an empty prefix.
    static std::string severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    bool mShouldLog;     //!< Cached result of `mSeverity <= reportableSeverity`
    Severity mSeverity;  //!< Severity every message written through this consumer carries
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   private:
    struct TestInfo;

   public:
    //!
    //! \brief Constructs a Logger
    //!
    //! \param severity Initial reportable severity; defaults to Severity::kWARNING
    //!
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult : std::uint8_t {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity Severity of the message
    //! \param msg NUL-terminated message text; a null pointer is logged as an empty message
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        // Stream the C string directly: no std::string temporary inside a noexcept function, and a null
        // pointer is never passed to operator<<.
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << (msg != nullptr ? msg : "") << '\n';
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        //! Only Logger may create atoms; the name and command line are moved out of `info`.
        TestAtom(bool started, TestInfo info)
            : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {}

        bool mStarted;         //!< Set by Logger::reportTestStart()
        std::string mName;     //!< Test name printed in result lines
        std::string mCmdline;  //!< Command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom{false, TestInfo{name, cmdline}};
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // Check the precondition first so that a double start never prints a second RUNNING line
        // before the assertion fires.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS, so the value can be returned from main()
    //!
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE, so the value can be returned from main()
    //!
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS (a waived test is not treated as a failure)
    //!
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity currently used to filter messages
    //!
    [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief Name / command-line pair used to construct a TestAtom
    //!
    struct TestInfo {
        std::string name;
        std::string cmdline;
    };
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>" on the stream used for kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << '\n';
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  //!< Threshold passed to every LogStreamConsumer created by log()
};

namespace {

//!
//! \brief Builds a LogStreamConsumer for kVERBOSE messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kVERBOSE);
}

//!
//! \brief Builds a LogStreamConsumer for kINFO messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINFO);
}

//!
//! \brief Builds a LogStreamConsumer for kWARNING messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kWARNING);
}

//!
//! \brief Builds a LogStreamConsumer for kERROR messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kERROR);
}

//!
//! \brief Builds a LogStreamConsumer for kINTERNAL_ERROR ("fatal") messages, filtered by the logger's
//!        reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: googlenet/macros.h
================================================
#pragma once
#include <NvInfer.h>

// API: symbol export/import decoration.
//  - API_EXPORTS defined:     dllexport on MSVC, default visibility elsewhere.
//  - API_EXPORTS not defined: dllimport on MSVC, empty elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_VERSION folds the four NV_TENSORRT_* components into one integer with weights 1000/100/10/1,
// e.g. 8.5.1 build 0 -> 8510. The fields are not range-limited, so a component >= 10 carries into the
// next position; comparisons below only rely on the value being ordered across releases.
#define TRT_VERSION \
    ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD)

// Hard minimum: refuse to compile against anything older than 7.2.2.
#if TRT_VERSION < 7220
#error "TensorRT >= 7.2.2 is required for this demo."
#endif

// From version 8000 on, TRT_NOEXCEPT expands to `noexcept` and TRT_CONST_ENQUEUE to `const`;
// for older versions both expand to nothing.
#if TRT_VERSION >= 8000
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: googlenet/utils.h
================================================
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <numeric>
#include <opencv2/opencv.hpp>
#include <stdexcept>
#include <string>
#include <vector>
#include "macros.h"

// NOTE(review): a using-directive in a header leaks nvinfer1 into every translation unit that includes it.
using namespace nvinfer1;

// 16 << 20 bytes = 16 MiB. Presumably the TensorRT builder workspace limit - its use is not visible
// in this header; confirm against the callers.
constexpr const std::size_t WORKSPACE_SIZE = 16 << 20;

// Evaluates `status` exactly once; if the result differs from cudaSuccess, prints the numeric code to
// std::cerr and aborts the process. Wrapped in do/while(0) so it behaves as a single statement.
#define CHECK(status)                                     \
    do {                                                  \
        auto ret = (status);                              \
        if (ret != cudaSuccess) {                         \
            std::cerr << "Cuda failure: " << ret << "\n"; \
            std::abort();                                 \
        }                                                 \
    } while (0)

static void checkTrtEnv(int device = 0) {
#if TRT_VERSION < 8000
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    CHECK(cudaGetDeviceProperties(&prop, device));
    const int sm = prop.major * 10 + prop.minor;
    if (sm > 86) {
        std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU.";
        std::abort();
    }
#endif
}

/**
 * @brief Load a TensorRT weight map from a text weight file.
 *
 * Layout as parsed here: a decimal blob count, followed for every blob by
 * `<name> <element count (decimal)> <element count x 32-bit words in hex>`, all whitespace separated.
 * Every blob is stored as DataType::kFLOAT.
 *
 * The value buffers are allocated with `new uint32_t[]` and are NOT released by this function; the
 * returned Weights only hold raw pointers to them.
 *
 * The file is validated at run time (not with assert), so a missing, malformed or truncated file aborts
 * with a diagnostic even when NDEBUG is defined.
 *
 * @param file input weight file path
 * @return std::map<std::string, nvinfer1::Weights> blob name -> weights
 */
static auto loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << "\n";
    std::map<std::string, nvinfer1::Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    if (!input.is_open()) {
        std::cerr << "Unable to load weight file.\n";
        std::abort();
    }

    // Read number of weight blobs. Initialised because a failed extraction may leave it untouched.
    int32_t count = 0;
    input >> count;
    if (!input || count <= 0) {
        std::cerr << "Invalid weight map file.\n";
        std::abort();
    }

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> wt.count;
        if (!input || wt.count < 0) {
            std::cerr << "Invalid weight map file.\n";
            std::abort();
        }

        // Load blob: raw 32-bit patterns written in hexadecimal
        auto* val = new uint32_t[wt.count];
        input >> std::hex;
        for (auto x = 0ll; x < wt.count; ++x) {
            input >> val[x];
        }
        // A failed extraction here means the blob is shorter than its declared size.
        if (!input) {
            std::cerr << "Invalid weight map file.\n";
            std::abort();
        }
        wt.values = val;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * @brief ImageNet-style preprocessing (optional BGR->RGB, resize, scale by 1/255, normalise) that packs
 * the result as a planar float buffer. Only 3-channel images are accepted; anything else aborts.
 *
 * The input image is modified in place: after the call `img` holds the resized, normalised CV_32FC3 data.
 *
 * @param img opencv image with BGR layout
 * @param bgr2rgb whether to convert BGR to RGB
 * @param mean per-channel value subtracted after scaling to [0, 1]
 * @param std per-channel divisor applied after the mean subtraction
 * @param n batch size; the same image is replicated into every batch slot
 * @param h resize height
 * @param w resize width
 * @return std::vector<float> contiguous float32 data of n * 3 * h * w elements, plane by plane per batch slot
 */
static std::vector<float> preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array<const float, 3>& mean,
                                         const std::array<const float, 3>& std, int n, int h, int w) {
    const auto channels = img.channels();
    const auto imageSize = channels * h * w;
    if (channels != 3) {
        std::cerr << "this demo only supports 3 channel input image.\n";
        std::abort();
    }

    if (bgr2rgb) {
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    }
    cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
    img.convertTo(img, CV_32FC3, 1.f / 255);
    img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]);

    std::vector<float> chw(static_cast<std::size_t>(n) * channels * h * w, 0.f);
    const auto planeSize = h * w;

    // Interleaved pixels -> planar layout, repeated for each batch slot.
    for (int batch = 0; batch < n; ++batch) {
        const auto batchOffset = batch * imageSize;
        for (int row = 0; row < h; ++row) {
            for (int col = 0; col < w; ++col) {
                const cv::Vec3f pixel = img.at<cv::Vec3f>(row, col);
                const auto pixelOffset = row * w + col;
                for (int ch = 0; ch < 3; ++ch) {
                    chw[batchOffset + ch * planeSize + pixelOffset] = pixel[ch];
                }
            }
        }
    }
    return chw;
}

/**
 * @brief Return the k largest elements of v together with their indices, largest first.
 *
 * k is clamped to v.size(), so asking for more elements than exist returns all of them in descending
 * order. The relative order of equal values is unspecified.
 *
 * @param v input scores
 * @param k number of entries requested; k <= 0 yields an empty result
 * @return std::vector<std::pair<int, float>> (index, value) pairs, at most min(k, v.size()) entries
 */
static auto topk(const std::vector<float>& v, int k) -> std::vector<std::pair<int, float>> {
    if (k <= 0 || v.empty())
        return {};
    const auto count = std::min<std::ptrdiff_t>(k, static_cast<std::ptrdiff_t>(v.size()));

    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);

    // The middle iterator must lie inside [begin, end]: use the clamped count, never the raw k.
    std::partial_sort(idx.begin(), idx.begin() + count, idx.end(), [&v](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(static_cast<std::size_t>(count));
    for (std::ptrdiff_t i = 0; i < count; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}

/**
 * @brief Parse a text file of `<index>: '<label>'` lines into an index -> label map.
 *
 * A line contributes an entry when it has a ':' followed by a pair of single quotes and the text before
 * the ':' is an integer, optionally preceded by whitespace and/or an opening '{' (as in a Python dict
 * literal). Lines that do not match - including those with a non-numeric or out-of-range index - are
 * skipped.
 *
 * @param path label file path
 * @return std::map<int, std::string> parsed labels; empty when the file cannot be opened
 */
static std::map<int, std::string> loadImagenetLabelMap(const std::string& path) {
    std::map<int, std::string> labels;
    std::ifstream in(path);
    if (!in.is_open()) {
        return labels;
    }
    std::string line;
    while (std::getline(in, line)) {
        const auto colon = line.find(':');
        if (colon == std::string::npos) {
            continue;
        }
        const auto first_quote = line.find('\'', colon);
        if (first_quote == std::string::npos) {
            continue;
        }
        const auto second_quote = line.find('\'', first_quote + 1);
        if (second_quote == std::string::npos) {
            continue;
        }

        // The key starts after any leading blanks and an optional dict-opening brace.
        const auto key_begin = line.find_first_not_of(" \t{");
        if (key_begin == std::string::npos || key_begin >= colon) {
            continue;
        }
        int idx = 0;
        try {
            idx = std::stoi(line.substr(key_begin, colon - key_begin));
        } catch (const std::invalid_argument&) {
            continue;  // key is not a number: treat like any other malformed line
        } catch (const std::out_of_range&) {
            continue;  // key does not fit in int
        }
        labels[idx] = line.substr(first_quote + 1, second_quote - first_quote - 1);
    }
    return labels;
}

/**
 * @brief Append a preprocessing sub-graph to the network: optional conversion to FP32, an
 * NHWC -> NCHW transpose, optional channel reversal (BGR -> RGB) and a per-channel scale layer.
 *
 * The scale layer is given shift = -mean / std and scale = 1 / (std * 255) with an empty power, i.e.
 * (assuming IScaleLayer computes x * scale + shift - see the TensorRT docs) out = (x / 255 - mean) / std.
 *
 * The shift/scale arrays are kept in a function-local static container for the lifetime of the program,
 * because the Weights handed to TensorRT only hold raw pointers.
 *
 * @param network network to extend
 * @param input input tensor; transposed with permutation {0, 3, 1, 2}, so it is expected to be NHWC
 * @param bgr2rgb whether to reverse the channel order (channel 2, 1, 0 -> 0, 1, 2)
 * @param mean per-channel mean, applied in the channel order AFTER the optional reversal
 * @param std per-channel standard deviation, same channel order as mean
 * @return ILayer* the final scale layer
 */
static ILayer* addTransformLayer(INetworkDefinition* network, ITensor& input, bool bgr2rgb,
                                 const std::array<const float, 3>& mean, const std::array<const float, 3>& std) {
    struct ScaleParams {
        std::array<float, 3> shift;
        std::array<float, 3> scale;
    };
    // Owns every parameter block ever created here; never cleared.
    static std::vector<std::unique_ptr<ScaleParams>> gScaleParams;
    auto params = std::make_unique<ScaleParams>();
    params->shift = {-mean[0] / std[0], -mean[1] / std[1], -mean[2] / std[2]};
    params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)};

    static const Weights empty{DataType::kFLOAT, nullptr, 0ll};
    const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll};
    const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll};

    // Moving the unique_ptr does not move the pointee, so the pointers captured above stay valid.
    gScaleParams.emplace_back(std::move(params));

    ITensor* in = &input;
    if (input.getType() != DataType::kFLOAT) {
#if TRT_VERSION >= 8000
        auto* cast = network->addCast(input, DataType::kFLOAT);
        assert(cast);
        cast->setName("Cast to FP32");
        in = cast->getOutput(0);
#else
        auto* identity = network->addIdentity(input);
        assert(identity);
        identity->setName("Convert to FP32");
        identity->setOutputType(0, DataType::kFLOAT);
        in = identity->getOutput(0);
#endif
    }
    // Convert from NHWC to NCHW
    auto* perm = network->addShuffle(*in);
    assert(perm);
    perm->setName("NHWC -> NCHW");
    perm->setFirstTranspose(Permutation{0, 3, 1, 2});

    // Convert from BGR to RGB (optional)
    ITensor* data{nullptr};
    if (bgr2rgb) {
        // Extract a single channel `c` of the transposed tensor as its own tensor.
        auto add_slice = [&](int c, const char* name) -> ITensor* {
            auto dims = perm->getOutput(0)->getDimensions();
            Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1};
            Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]};
            auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride);
            // Validate the layer before touching it.
            assert(_slice && _slice->getNbOutputs() == 1);
            _slice->setName(name);
            return _slice->getOutput(0);
        };
        std::array<ITensor*, 3> channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")};
        auto* cat = network->addConcatenation(channels.data(), 3);
        assert(cat);
        cat->setName("RGB");
        cat->setAxis(1);
        data = cat->getOutput(0);
    } else {
        data = perm->getOutput(0);
    }

    // Normalize
    auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty);
    assert(trans);
    trans->setName("mean & std");
#if TRT_VERSION >= 8000
    trans->setChannelAxis(1);
#endif
    return trans;
}

/**
 * @brief Size in bytes of one element of the given TensorRT data type.
 *
 * Handles kINT8 (and kUINT8 when TRT_VERSION >= 8510), kFLOAT, kHALF and kINT32; any other type prints
 * a message and aborts.
 *
 * @param dt data type to query
 * @return size_t element size in bytes
 */
static size_t getSize(DataType dt) {
    size_t elementBytes = 0;
    switch (dt) {
#if TRT_VERSION >= 8510
        case DataType::kUINT8:
#endif
        case DataType::kINT8:
            elementBytes = sizeof(int8_t);
            break;
        case DataType::kFLOAT:
            elementBytes = sizeof(float);
            break;
        case DataType::kHALF:
            // 16-bit float: same width as int16_t
            elementBytes = sizeof(int16_t);
            break;
        case DataType::kINT32:
            elementBytes = sizeof(int32_t);
            break;
        default:
            std::cerr << "Unsupported data type\n";
            std::abort();
    }
    return elementBytes;
}


================================================
FILE: hrnet/hrnet-image-classification/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.6)

project(hrnet)

# Compile as C++11.
add_definitions(-std=c++11)

# option() takes <variable> "<help text>" [initial value]; the help text must come before the value.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
# NOTE(review): the sources validate inputs with assert(), which a build defining NDEBUG would strip.
set(CMAKE_BUILD_TYPE Debug)

# CUDA toolkit; include and library directories are hard-coded to the default Linux install prefix.
find_package(CUDA REQUIRED)
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)

# The target links ${OpenCV_LIBS}, so fail at configure time if OpenCV is missing.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(hrnet ${PROJECT_SOURCE_DIR}/hrnet.cpp)
target_link_libraries(hrnet nvinfer)
target_link_libraries(hrnet cudart)
target_link_libraries(hrnet ${OpenCV_LIBS})

# add_definitions() applies to targets in this directory regardless of declaration order.
add_definitions(-O2 -pthread)


================================================
FILE: hrnet/hrnet-image-classification/README.md
================================================
# HRNet

The PyTorch implementation is [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification). The model implemented here is **HRNet-W18-C-Small-v2**.


## How to Run

* 1. generate .wts

  Download the code and the pretrained model from [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification) and set up its environment.

  Put `demo.py` in the `YOUR_ROOT_DIR\HRNet-Image-Classification\tools` folder, set `savewts` to `True` in `main()`, and run it; the .wts file will be generated.

* 2. cmake and make

  ```
  mkdir build
  cd build
  cmake ..
  make
  sudo ./hrnet -s             // serialize model to plan file i.e. 'hrnet.engine'
  sudo ./hrnet -d  ../samples // deserialize plan file and run inference, the images in samples will be processed.
  ```

## Result

The test img:

![](https://user-images.githubusercontent.com/20653176/93732833-ac103200-fc05-11ea-88ff-6f59f316a377.JPEG)

PyTorch Result:

![image-20200921115119593](https://user-images.githubusercontent.com/20653176/93731787-225e6580-fc01-11ea-9578-393079cd1873.png)

TRT Result:

![image-20200921114959069](https://user-images.githubusercontent.com/20653176/93731788-238f9280-fc01-11ea-954f-2debc20e102a.png)


================================================
FILE: hrnet/hrnet-image-classification/common.hpp
================================================
#pragma once

#include <fstream>
#include <map>
#include <sstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "cuda_runtime_api.h"

// NOTE(review): a using-directive in a header leaks nvinfer1 into every translation unit that includes it.
using namespace nvinfer1;

// Evaluates `status` exactly once; a non-zero result is printed to std::cerr as "Cuda failure: <code>"
// and the process is aborted. Wrapped in do/while(0) so it behaves as a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// Appends the name of every entry found in directory `p_dir_name` to `file_names`, skipping the
// "." and ".." entries. Only the bare entry names are stored (no directory prefix), in the order
// readdir() reports them.
// Returns 0 on success, or -1 (leaving `file_names` untouched) when the directory cannot be opened.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent *entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const std::string entry_name(entry->d_name);
        if (entry_name == "." || entry_name == "..") {
            continue;
        }
        file_names.push_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}

// Loads a TensorRT weight map from a text weight file.
//
// Layout as parsed here: a decimal blob count, then for every blob
//   <name> <element count (decimal)> <element count x 32-bit words in hex>
// separated by whitespace. Every blob is stored as DataType::kFLOAT.
//
// The value buffers are allocated with malloc() and are not released here; the returned Weights only
// hold raw pointers to them.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs. Zero-initialised so a failed read cannot leave garbage behind.
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count-- > 0)
    {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. The buffer is sized by the element type (sizeof(*val)), not by the pointer type.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds an inference-time batch normalisation as a per-channel scale layer.
//
// Reads `<lname>.weight`, `.bias`, `.running_mean` and `.running_var` from weightMap and derives
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The derived arrays are malloc'ed, registered in weightMap under `<lname>.scale`, `.shift` and `.power`,
// and not freed here. The channel count is taken from `<lname>.running_var`.
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float *gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float *beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float *mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float *var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    const int len = static_cast<int>(weightMap[lname + ".running_var"].count);

    const size_t bytes = sizeof(float) * len;
    float *scval = reinterpret_cast<float*>(malloc(bytes));
    float *shval = reinterpret_cast<float*>(malloc(bytes));
    float *pval = reinterpret_cast<float*>(malloc(bytes));

    // One pass fills all three per-channel arrays.
    for (int ch = 0; ch < len; ++ch) {
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    Weights scale{ DataType::kFLOAT, scval, len };
    Weights shift{ DataType::kFLOAT, shval, len };
    Weights power{ DataType::kFLOAT, pval, len };

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn);
    return bn;
}

// Convolution -> batch norm (eps = 1e-4) -> activation.
// Despite the name, the activation added here is a plain ReLU (ActivationType::kRELU).
//
//   outch / ksize / s / p : output channels, square kernel size, stride and padding of the convolution
//   convname              : weight-map prefix of the convolution (`.weight`, and `.bias` when bias is true)
//   bnname                : weight-map prefix passed to addBatchNorm2d
//   bias                  : when false the convolution gets an empty bias
ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, std::string convname, std::string bnname, bool bias = false) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    const Weights& biasWts = bias ? weightMap[convname + ".bias"] : emptywts;

    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[convname + ".weight"], biasWts);
    assert(conv);
    conv->setStrideNd(DimsHW{ s, s });
    conv->setPaddingNd(DimsHW{ p, p });

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), bnname, 1e-4);
    return network->addActivation(*bn->getOutput(0), ActivationType::kRELU);
}

// Three-convolution residual block: 1x1 conv -> BN -> ReLU -> 3x3 conv -> BN -> ReLU -> 1x1 conv -> BN,
// summed with a shortcut and followed by ReLU.
//
//   inch   : channel count produced by the first two convolutions
//   outch  : channel count produced by the last convolution (and by the shortcut projection)
//   stride : applied to conv1, conv2 and the shortcut projection; conv3 keeps stride 1
//   lname  : weight-map prefix (`.conv1`, `.bn1`, ..., `.downsample.0`, `.downsample.1`)
//
// When inch != outch the shortcut is a 1x1 convolution + BN (`.downsample.*`); otherwise the input is
// added directly.
// NOTE(review): the main path applies `stride` twice but the shortcut only once, so the element-wise sum
// presumably only has matching shapes for stride == 1 - confirm against the callers.
IActivationLayer* ResBlock2Conv(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, inch, DimsHW{ 1, 1 }, weightMap[lname + ".conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{ stride, stride });
    conv1->setPaddingNd(DimsHW{ 0, 0 });

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5);
    assert(bn1);
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), inch, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], emptywts);
    assert(conv2);
    conv2->setStrideNd(DimsHW{ stride, stride });
    conv2->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5);
    assert(bn2);
    IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(relu2);

    IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch, DimsHW{ 1, 1 }, weightMap[lname + ".conv3.weight"], emptywts);
    assert(conv3);
    // Configure conv3 itself. It runs with stride 1 and no padding (the TensorRT defaults), which is
    // what the network has always been built with; this only makes that explicit on the right layer.
    conv3->setStrideNd(DimsHW{ 1, 1 });
    conv3->setPaddingNd(DimsHW{ 0, 0 });

    IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + ".bn3", 1e-5);
    assert(bn3);

    IElementWiseLayer* ew1;
    if (inch != outch) {
        // Projection shortcut so both operands of the sum have `outch` channels.
        IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch, DimsHW{ 1, 1 }, weightMap[lname + ".downsample.0.weight"], emptywts);
        assert(conv4);
        conv4->setStrideNd(DimsHW{ stride, stride });
        conv4->setPaddingNd(DimsHW{ 0, 0 });
        IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + ".downsample.1", 1e-5);
        assert(bn4);
        ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
    }
    else {
        ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM);
    }
    assert(ew1);
    IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU);
    assert(relu3);
    return relu3;
}

// Three-convolution residual block with an identity shortcut:
// 1x1 conv -> BN -> ReLU -> 3x3 conv -> BN -> ReLU -> 1x1 conv -> BN, added to `input`, then ReLU.
//
//   inch   : channel count produced by the last convolution (must match `input` for the sum)
//   outch  : channel count produced by the first two convolutions
//   stride : applied to conv1 and conv2; conv3 keeps stride 1
//   lname  : weight-map prefix (`.conv1`, `.bn1`, `.conv2`, `.bn2`, `.conv3`, `.bn3`)
//
// NOTE(review): with an identity shortcut the sum presumably only has matching shapes for stride == 1 -
// confirm against the callers.
IActivationLayer* ResBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    // e.g. in 256 out 64
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ 1, 1 }, weightMap[lname + ".conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{ stride, stride });
    conv1->setPaddingNd(DimsHW{ 0, 0 });

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5);
    assert(bn1);
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], emptywts);
    assert(conv2);
    conv2->setStrideNd(DimsHW{ stride, stride });
    conv2->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5);
    assert(bn2);
    IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(relu2);

    IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), inch, DimsHW{ 1, 1 }, weightMap[lname + ".conv3.weight"], emptywts);
    assert(conv3);
    // Configure conv3 itself. It runs with stride 1 and no padding (the TensorRT defaults), which is
    // what the network has always been built with; this only makes that explicit on the right layer.
    conv3->setStrideNd(DimsHW{ 1, 1 });
    conv3->setPaddingNd(DimsHW{ 0, 0 });

    IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + ".bn3", 1e-5);
    assert(bn3);

    IElementWiseLayer* ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM);
    assert(ew1);
    IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU);
    assert(relu3);
    return relu3;
}

// Two-convolution residual block with an identity shortcut:
//   3x3 conv -> BN -> ReLU -> 3x3 conv -> BN, summed with `input`, then ReLU.
// Both convolutions are bias-free, use stride 1 and padding 1, and produce
// `outch` output maps. Weights are looked up in weightMap under
// lname + ".conv{1,2}.weight"; lname + ".bn{1,2}" is passed to addBatchNorm2d.
// Returns the final ReLU layer.
IActivationLayer* liteResBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, std::string lname) {
    Weights noBias{ DataType::kFLOAT, nullptr, 0 };

    // First stage: conv + batch norm + ReLU.
    IConvolutionLayer* firstConv = network->addConvolutionNd(
        input, outch, DimsHW{ 3, 3 }, weightMap[lname + ".conv1.weight"], noBias);
    assert(firstConv);
    firstConv->setStrideNd(DimsHW{ 1, 1 });
    firstConv->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* firstNorm = addBatchNorm2d(
        network, weightMap, *firstConv->getOutput(0), lname + ".bn1", 1e-5);
    IActivationLayer* firstAct = network->addActivation(
        *firstNorm->getOutput(0), ActivationType::kRELU);
    assert(firstAct);

    // Second stage: conv + batch norm (activation is applied after the sum).
    IConvolutionLayer* secondConv = network->addConvolutionNd(
        *firstAct->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], noBias);
    assert(secondConv);
    secondConv->setStrideNd(DimsHW{ 1, 1 });
    secondConv->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* secondNorm = addBatchNorm2d(
        network, weightMap, *secondConv->getOutput(0), lname + ".bn2", 1e-5);

    // Shortcut: block input + residual branch, followed by the closing ReLU.
    IElementWiseLayer* shortcutSum = network->addElementWise(
        input, *secondNorm->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* blockOut = network->addActivation(
        *shortcutSum->getOutput(0), ActivationType::kRELU);
    assert(blockOut);
    return blockOut;
}

// Upsamples a CHW tensor by an integer factor using two matrix multiplications
// with constant 0/1 selector matrices: out = pre * input * post.
//   pre  is (stride*h) x h and repeats every input row `stride` times,
//   post is w x (stride*w) and repeats every input column `stride` times,
// so each input element is replicated into a stride x stride patch. Both constants
// have a leading dimension of 1, so they are shared by every channel of `input`.
//
// network       : network definition the new layers are appended to.
// input         : tensor to upsample; must have exactly 3 dimensions (c, h, w).
// inputChannels : not used by the implementation; kept for call-site compatibility.
// stride        : upsampling factor, must be positive.
//
// Returns the second matrix-multiply layer, whose output is the upsampled tensor.
//
// The original implementation sized both constants from h and w interchangeably and
// therefore asserted h == w. The matrices are now sized per axis, which yields the
// same constants for square inputs and also supports non-square feature maps.
//
// NOTE: the two weight buffers allocated here are deliberately not freed. The
// constant layers keep referring to them and this function has no owner to hand
// them to. NOTE(review): release them after the engine is built if that matters.
ILayer* netAddUpsample(INetworkDefinition* network, ITensor* input, int inputChannels, int stride){
    (void)inputChannels;  // see header comment: intentionally unused
    nvinfer1::Dims inpDims = input->getDimensions();
    assert(inpDims.nbDims == 3); // chw
    assert(stride > 0);
    const int h = inpDims.d[1];
    const int w = inpDims.d[2];
    // add pre multiply matrix as a constant
    /*
    kSPATIAL Elements correspond to different spatial data.

    kCHANNEL Elements correspond to different channels.
    */
    nvinfer1::Dims preDims{ 3,
                           {1, stride * h, h},
                           {nvinfer1::DimensionType::kCHANNEL,
                            nvinfer1::DimensionType::kSPATIAL,
                            nvinfer1::DimensionType::kSPATIAL} };
    const int preSize = stride * h * h;
    float* preWt = new float[preSize]();  // value-initialised: all zeros
    /* (stride*h x h), shown for stride == 2
    [ [1, 0, ..., 0],
      [1, 0, ..., 0],
      [0, 1, ..., 0],
      [0, 1, ..., 0],
      ...,
      ...,
      [0, 0, ..., 1],
      [0, 0, ..., 1] ]
    */
    for (int row = 0; row < stride * h; ++row)
    {
        // Output row `row` copies input row `row / stride`.
        preWt[row * h + row / stride] = 1.0f;
    }
    nvinfer1::Weights preMul{ nvinfer1::DataType::kFLOAT, preWt, preSize };
    nvinfer1::IConstantLayer* preM = network->addConstant(preDims, preMul);
    assert(preM != nullptr);
    //std::string preLayerName = "preMul_" + std::to_string(layerIdx);
    //preM->setName(preLayerName.c_str());
    // add post multiply matrix as a constant
    nvinfer1::Dims postDims{ 3,
                            {1, w, stride * w},
                            {nvinfer1::DimensionType::kCHANNEL,
                             nvinfer1::DimensionType::kSPATIAL,
                             nvinfer1::DimensionType::kSPATIAL} };
    const int postSize = stride * w * w;
    float* postWt = new float[postSize]();  // value-initialised: all zeros
    /* (w x stride*w), shown for stride == 2
    [ [1, 1, 0, 0, ..., 0, 0],
      [0, 0, 1, 1, ..., 0, 0],
      ...,
      ...,
      [0, 0, 0, 0, ..., 1, 1] ]
    */
    for (int row = 0; row < w; ++row)
    {
        // Input column `row` feeds output columns [row*stride, (row+1)*stride).
        for (int rep = 0; rep < stride; ++rep)
        {
            postWt[row * stride * w + row * stride + rep] = 1.0f;
        }
    }
    nvinfer1::Weights postMul{ nvinfer1::DataType::kFLOAT, postWt, postSize };
    nvinfer1::IConstantLayer* post_m = network->addConstant(postDims, postMul);
    assert(post_m != nullptr);
    // add matrix multiply layers for upsampling
    nvinfer1::IMatrixMultiplyLayer* mm1
        = network->addMatrixMultiply(*preM->getOutput(0),
            nvinfer1::MatrixOperation::kNONE, *input,
            nvinfer1::MatrixOperation::kNONE);
    assert(mm1 != nullptr);
    nvinfer1::IMatrixMultiplyLayer* mm2
        = network->addMatrixMultiply(*mm1->getOutput(0),
            nvinfer1::MatrixOperation::kNONE,
            *post_m->getOutput(0),
            nvinfer1::MatrixOperation::kNONE);
    assert(mm2 != nullptr);
    return mm2;
}


================================================
FILE: hrnet/hrnet-image-classification/demo.py
================================================
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# Modified by Ke Sun (sunk@mail.ustc.edu.cn)
# ------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import sys
import shutil
import pprint

import torch
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.datasets as datasets
import torchvision.transforms as transforms

import _init_paths
import models
from config import config
from config import update_config
from core.function import validate
from utils.modelsummary import get_model_summary
from utils.utils import create_logger
from core.evaluate import accuracy

import cv2
import numpy as np
from PIL import Image
import struct

def parse_args():
    """Parse the command-line options and merge them into the global config.

    All options are string-valued: ``--cfg``, ``--modelDir``, ``--logDir``,
    ``--dataDir``, ``--testModel`` and ``--testImg``. After parsing,
    ``update_config(config, args)`` is called with the parsed namespace.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    # (flag, help text, default value), listed in the order they are registered.
    option_table = (
        ('--cfg',
         'experiment configure file name',
         r"E:\LearningCodes\GithubRepo\HRNet-Image-Classification\experiments\cls_hrnet_w18_small_v2_sgd_lr5e-2_wd1e-4_bs32_x100.yaml"),
        ('--modelDir', 'model directory', ''),
        ('--logDir', 'log directory', ''),
        ('--dataDir', 'data directory', ''),
        ('--testModel',
         'testModel',
         r'E:\LearningCodes\GithubRepo\HRNet-Image-Classification\hrnet_w18_small_model_v2.pth'),
        ('--testImg',
         'imgs',
         r'E:\Datasets\tiny-imagenet-200\tiny-imagenet-200\val\images\val_41.JPEG'),
    )

    cli = argparse.ArgumentParser(description='Train keypoints network')
    for flag, help_text, default_value in option_table:
        cli.add_argument(flag, help=help_text, type=str, default=default_value)

    parsed = cli.parse_args()
    update_config(config, parsed)
    return parsed

def main():
    """Classify one image with the HRNet model, or dump the weights to a .wts file.

    Steps:
      1. Parse the command line, create the logger and apply the cuDNN settings
         taken from ``config``.
      2. Build the model named by ``config.MODEL.NAME`` and load the checkpoint
         given by ``--testModel``.
      3. If the local ``savewts`` switch is True, write every ``state_dict`` entry
         to ``HRNetClassify.wts`` (first line: entry count; then one line per
         tensor: name, element count, big-endian float32 values as hex) and exit.
      4. Otherwise read ``--testImg``, resize it to ``config.MODEL.IMAGE_SIZE``,
         convert BGR to RGB, normalise with the mean/std below, run the model and
         print the top-1 class index.

    Raises:
        OSError: if the test image cannot be read.
    """
    savewts = False
    args = parse_args()

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'demo')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # eval() evaluates a string expression and returns its value; here it resolves
    # models.<MODEL.NAME>.get_cls_net.
    # NOTE(review): this executes text taken from the config file - only use
    # trusted configs.
    model = eval('models.'+config.MODEL.NAME+'.get_cls_net')(
        config)

    # The model is never moved to a GPU in this script, so map the checkpoint to the
    # CPU; this also lets GPU-saved checkpoints load on CPU-only machines.
    model.load_state_dict(torch.load(args.testModel, map_location='cpu'))

    if savewts:
        state = model.state_dict()
        # The context manager guarantees the file is flushed and closed before exit.
        with open('HRNetClassify.wts', 'w') as f:
            f.write('{}\n'.format(len(state.keys())))
            for k, v in state.items():
                vr = v.reshape(-1).cpu().numpy()
                f.write('{} {} '.format(k, len(vr)))
                for vv in vr:
                    f.write(' ')
                    # big-endian float32, hex encoded
                    f.write(struct.pack('>f', float(vv)).hex())
                f.write('\n')
        sys.exit(0)

    # load img
    image = cv2.imread(args.testImg) #BGR 0-255 hwc
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError('Failed to read image: {}'.format(args.testImg))
    #im = Image.open(args.testImg)
    #print(im.getpixel((0,0)))  ## 0-255
    #resize
    # config.MODEL.IMAGE_SIZE[0]
    resized_img = cv2.resize(image, (config.MODEL.IMAGE_SIZE[0], config.MODEL.IMAGE_SIZE[1]))
    resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB) #RGB
    # normalize: scale to [0, 1], then subtract the per-channel mean and divide by std
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    inp_image = ((resized_img/255. - mean) / std).astype(np.float32)
    inp_image = inp_image.transpose(2, 0, 1) # chw
    inp_image = torch.from_numpy(inp_image).unsqueeze(0) # to_tensor, adds batch dim
    model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(inp_image)
    #print(output)

    _, pred = output.topk(1)
    pred = pred.t()
    print(pred)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

================================================
FILE: hrnet/hrnet-image-classification/hrnet.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "common.hpp"
#include "logging.h"

// File-wide logger instance (Logger comes from logging.h).
static Logger gLogger;
#define DEVICE 0  // GPU id
// NOTE(review): presumably the batch size used for engine build/inference - the
// usage is outside this excerpt, confirm.
#define BATCH_SIZE 1

// Name given to the network input tensor in createEngine().
const char* INPUT_BLOB_NAME = "image";
// NOTE(review): presumably the name of the network output tensor - confirm.
const char* OUTPUT_BLOB_NAME = "output";
// Height and width of the network input; the input tensor is {3, INPUT_H, INPUT_W}.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// NOTE(review): presumably the number of output values (class scores) - confirm.
static const int OUTPUT_SIZE = 1000;

// Create the engine using only the API and not any parser.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("E:\\LearningCodes\\GithubRepo\\HRNet-Image-Classification\\tools\\HRNetClassify.wts");
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    auto id_993 = convBnLeaky(network, weightMap, *data, 64, 3, 2, 1, "conv1", "bn1");  //conv1.weight 
    auto id_996 = convBnLeaky(network, weightMap, *id_993->getOutput(0), 64, 3, 2, 1, "conv2", "bn2");  //conv1.weight                                                                                 //Res
    // IActivationLayer* ResBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    auto id_1008 = ResBlock2Conv(network, weightMap, *id_996->getOutput(0), 64, 256, 1, "layer1.0");
    auto id_1018 = ResBlock(network, weightMap, *id_1008->getOutput(0), 256, 64, 1, "layer1.1");

    // transition1-1
    auto id_1021 = convBnLeaky(network, weightMap, *id_1018->getOutput(0), 18, 3, 1, 1, "transition1.0.0", "transition1.0.1");
    auto id_1031 = liteResBlock(network, weightMap, *id_1021->getOutput(0), 18, "stage2.0.branches.0.0");
    auto id_1038 = liteResBlock(network, weightMap, *id_1031->getOutput(0), 18, "stage2.0.branches.0.1");
    //Ҳ֧
    auto id_1024 = convBnLeaky(network, weightMap, *id_1018->getOutput(0), 36, 3, 2, 1, "transition1.1.0.0", "transition1.1.0.1");
    auto id_1045 = liteResBlock(network, weightMap, *id_1024->getOutput(0), 36, "stage2.0.branches.1.0");
    auto id_1052 = liteResBlock(network, weightMap, *id_1045->getOutput(0), 36, "stage2.0.branches.1.1");

    // conv+bn+upsample
    IConvolutionLayer* id_1053 = network->addConvolutionNd(*id_1052->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage2.0.fuse_layers.0.1.0.weight"], emptywts);
    assert(id_1053);
    id_1053->setStrideNd(DimsHW{ 1, 1 });
    id_1053->setPaddingNd(DimsHW{ 0, 0 });

    IScaleLayer* id_1054 = addBatchNorm2d(network, weightMap, *id_1053->getOutput(0), "stage2.0.fuse_layers.0.1.1", 1e-5);
    ILayer* id_1083 = netAddUpsample(network, id_1054->getOutput(0), 18, 2);
    IElementWiseLayer* id_1084 = network->addElementWise(*id_1083->getOutput(0), *id_1038->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1085 = network->addActivation(*id_1084->getOutput(0), ActivationType::kRELU);

    // transition1-2
    IConvolutionLayer* id_1086 = network->addConvolutionNd(*id_1038->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage2.0.fuse_layers.1.0.0.0.weight"], emptywts);
    assert(id_1086);
    id_1086->setStrideNd(DimsHW{ 2, 2 });
    id_1086->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* id_1087 = addBatchNorm2d(network, weightMap, *id_1086->getOutput(0), "stage2.0.fuse_layers.1.0.0.1", 1e-5);
    IElementWiseLayer* id_1088 = network->addElementWise(*id_1087->getOutput(0), *id_1052->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1089 = network->addActivation(*id_1088->getOutput(0), ActivationType::kRELU);

    ///////////////////////////////////
    // transition2-1  stage_3
    auto id_1099 = liteResBlock(network, weightMap, *id_1085->getOutput(0), 18, "stage3.0.branches.0.0");
    auto id_1106 = liteResBlock(network, weightMap, *id_1099->getOutput(0), 18, "stage3.0.branches.0.1");
    // transition2-2  stage_3
    auto id_1113 = liteResBlock(network, weightMap, *id_1089->getOutput(0), 36, "stage3.0.branches.1.0");
    auto id_1120 = liteResBlock(network, weightMap, *id_1113->getOutput(0), 36, "stage3.0.branches.1.1");
    // transition2-3  stage_3
    auto id_1092 = convBnLeaky(network, weightMap, *id_1089->getOutput(0), 72, 3, 2, 1, "transition2.2.0.0", "transition2.2.0.1");
    auto id_1127 = liteResBlock(network, weightMap, *id_1092->getOutput(0), 72, "stage3.0.branches.2.0");
    auto id_1134 = liteResBlock(network, weightMap, *id_1127->getOutput(0), 72, "stage3.0.branches.2.1");

    /////// ֱģ ܼ
    //conv bn up
    IConvolutionLayer* id_1135 = network->addConvolutionNd(*id_1120->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.0.fuse_layers.0.1.0.weight"], emptywts);
    assert(id_1135);
    id_1135->setStrideNd(DimsHW{ 1, 1 });
    id_1135->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1136 = addBatchNorm2d(network, weightMap, *id_1135->getOutput(0), "stage3.0.fuse_layers.0.1.1", 1e-5);
    ILayer* id_1165 = netAddUpsample(network, id_1136->getOutput(0), 18, 2);
    IElementWiseLayer* id_1166 = network->addElementWise(*id_1165->getOutput(0), *id_1106->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1167 = network->addConvolutionNd(*id_1134->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.0.fuse_layers.0.2.0.weight"], emptywts);
    assert(id_1167);
    id_1167->setStrideNd(DimsHW{ 1, 1 });
    id_1167->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1168 = addBatchNorm2d(network, weightMap, *id_1167->getOutput(0), "stage3.0.fuse_layers.0.2.1", 1e-5);
    ILayer* id_1197 = netAddUpsample(network, id_1168->getOutput(0), 18, 4);
    IElementWiseLayer* id_1198 = network->addElementWise(*id_1166->getOutput(0), *id_1197->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1199 = network->addActivation(*id_1198->getOutput(0), ActivationType::kRELU);

    //2
    IConvolutionLayer* id_1200 = network->addConvolutionNd(*id_1106->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage3.0.fuse_layers.1.0.0.0.weight"], emptywts);
    assert(id_1200);
    id_1200->setStrideNd(DimsHW{ 2, 2 });
    id_1200->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* id_1201 = addBatchNorm2d(network, weightMap, *id_1200->getOutput(0), "stage3.0.fuse_layers.1.0.0.1", 1e-5);
    IElementWiseLayer* id_1202 = network->addElementWise(*id_1201->getOutput(0), *id_1120->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1203 = network->addConvolutionNd(*id_1134->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage3.0.fuse_layers.1.2.0.weight"], emptywts);
    assert(id_1203);
    id_1203->setStrideNd(DimsHW{ 1, 1 });
    id_1203->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1204 = addBatchNorm2d(network, weightMap, *id_1203->getOutput(0), "stage3.0.fuse_layers.1.2.1", 1e-5);
    ILayer* id_1233 = netAddUpsample(network, id_1204->getOutput(0), 36, 2);
    IElementWiseLayer* id_1234 = network->addElementWise(*id_1202->getOutput(0), *id_1233->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1235 = network->addActivation(*id_1234->getOutput(0), ActivationType::kRELU);

    // 3
    IConvolutionLayer* id_1236 = network->addConvolutionNd(*id_1106->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage3.0.fuse_layers.2.0.0.0.weight"], emptywts);
    assert(id_1236);
    id_1236->setStrideNd(DimsHW{ 2, 2 });
    id_1236->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1237 = addBatchNorm2d(network, weightMap, *id_1236->getOutput(0), "stage3.0.fuse_layers.2.0.0.1", 1e-5);
    IActivationLayer* id_1238 = network->addActivation(*id_1237->getOutput(0), ActivationType::kRELU);

    IConvolutionLayer* id_1239 = network->addConvolutionNd(*id_1238->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.0.fuse_layers.2.0.1.0.weight"], emptywts);
    assert(id_1239);
    id_1239->setStrideNd(DimsHW{ 2, 2 });
    id_1239->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1240 = addBatchNorm2d(network, weightMap, *id_1239->getOutput(0), "stage3.0.fuse_layers.2.0.1.1", 1e-5);

    IConvolutionLayer* id_1241 = network->addConvolutionNd(*id_1120->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.0.fuse_layers.2.1.0.0.weight"], emptywts);
    assert(id_1241);
    id_1241->setStrideNd(DimsHW{ 2, 2 });
    id_1241->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1242 = addBatchNorm2d(network, weightMap, *id_1241->getOutput(0), "stage3.0.fuse_layers.2.1.0.1", 1e-5);

    IElementWiseLayer* id_1243 = network->addElementWise(*id_1240->getOutput(0), *id_1242->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* id_1244 = network->addElementWise(*id_1243->getOutput(0), *id_1134->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1245 = network->addActivation(*id_1244->getOutput(0), ActivationType::kRELU);

    auto id_1252 = liteResBlock(network, weightMap, *id_1199->getOutput(0), 18, "stage3.1.branches.0.0");
    auto id_1259 = liteResBlock(network, weightMap, *id_1252->getOutput(0), 18, "stage3.1.branches.0.1");
    auto id_1266 = liteResBlock(network, weightMap, *id_1235->getOutput(0), 36, "stage3.1.branches.1.0");
    auto id_1273 = liteResBlock(network, weightMap, *id_1266->getOutput(0), 36, "stage3.1.branches.1.1");
    auto id_1280 = liteResBlock(network, weightMap, *id_1245->getOutput(0), 72, "stage3.1.branches.2.0");
    auto id_1287 = liteResBlock(network, weightMap, *id_1280->getOutput(0), 72, "stage3.1.branches.2.1");

    /////// ֱģ ܼ 
    //1: 1259+up(1273)+up(1287)
    IConvolutionLayer* id_1288 = network->addConvolutionNd(*id_1273->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.1.fuse_layers.0.1.0.weight"], emptywts);
    assert(id_1288);
    id_1288->setStrideNd(DimsHW{ 1, 1 });
    id_1288->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1289 = addBatchNorm2d(network, weightMap, *id_1288->getOutput(0), "stage3.1.fuse_layers.0.1.1", 1e-5);
    ILayer* id_1318 = netAddUpsample(network, id_1289->getOutput(0), 18, 2);
    IElementWiseLayer* id_1319 = network->addElementWise(*id_1259->getOutput(0), *id_1318->getOutput(0), ElementWiseOperation::kSUM);
    //1-2 up(1287)  conv bn up
    IConvolutionLayer* id_1320 = network->addConvolutionNd(*id_1134->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.1.fuse_layers.0.2.0.weight"], emptywts);
    assert(id_1320);
    id_1320->setStrideNd(DimsHW{ 1, 1 });
    id_1320->setPaddingNd(DimsHW{ 0, 0 });

    IScaleLayer* id_1321 = addBatchNorm2d(network, weightMap, *id_1320->getOutput(0), "stage3.1.fuse_layers.0.2.1", 1e-5);
    ILayer* id_1350 = netAddUpsample(network, id_1321->getOutput(0), 18, 4);
    IElementWiseLayer* id_1351 = network->addElementWise(*id_1319->getOutput(0), *id_1350->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1352 = network->addActivation(*id_1351->getOutput(0), ActivationType::kRELU);


    //2: conv(1259)+1273 + up(1287)
    IConvolutionLayer* id_1353 = network->addConvolutionNd(*id_1259->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage3.1.fuse_layers.1.0.0.0.weight"], emptywts);
    assert(id_1353);
    id_1353->setStrideNd(DimsHW{ 2, 2 });
    id_1353->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1354 = addBatchNorm2d(network, weightMap, *id_1353->getOutput(0), "stage3.1.fuse_layers.1.0.0.1", 1e-5);
    IElementWiseLayer* id_1355 = network->addElementWise(*id_1354->getOutput(0), *id_1273->getOutput(0), ElementWiseOperation::kSUM);


    IConvolutionLayer* id_1356 = network->addConvolutionNd(*id_1287->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage3.1.fuse_layers.1.2.0.weight"], emptywts);
    assert(id_1356);
    id_1356->setStrideNd(DimsHW{ 1, 1 });
    id_1356->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1357 = addBatchNorm2d(network, weightMap, *id_1356->getOutput(0), "stage3.1.fuse_layers.1.2.1", 1e-5);
    ILayer* id_1386 = netAddUpsample(network, id_1357->getOutput(0), 36, 2);
    IElementWiseLayer* id_1387 = network->addElementWise(*id_1355->getOutput(0), *id_1386->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1388 = network->addActivation(*id_1387->getOutput(0), ActivationType::kRELU);

    //3 conv(1259)+conv(1273)+1287
    IConvolutionLayer* id_1389 = network->addConvolutionNd(*id_1259->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage3.1.fuse_layers.2.0.0.0.weight"], emptywts);
    assert(id_1389);
    id_1389->setStrideNd(DimsHW{ 2, 2 });
    id_1389->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1390 = addBatchNorm2d(network, weightMap, *id_1389->getOutput(0), "stage3.1.fuse_layers.2.0.0.1", 1e-5);
    IActivationLayer* id_1391 = network->addActivation(*id_1390->getOutput(0), ActivationType::kRELU);

    IConvolutionLayer* id_1392 = network->addConvolutionNd(*id_1391->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.1.fuse_layers.2.0.1.0.weight"], emptywts);
    assert(id_1392);
    id_1392->setStrideNd(DimsHW{ 2, 2 });
    id_1392->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1393 = addBatchNorm2d(network, weightMap, *id_1392->getOutput(0), "stage3.1.fuse_layers.2.0.1.1", 1e-5);

    IConvolutionLayer* id_1394 = network->addConvolutionNd(*id_1273->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.1.fuse_layers.2.1.0.0.weight"], emptywts);
    assert(id_1394);
    id_1394->setStrideNd(DimsHW{ 2, 2 });
    id_1394->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1395 = addBatchNorm2d(network, weightMap, *id_1394->getOutput(0), "stage3.1.fuse_layers.2.1.0.1", 1e-5);

    IElementWiseLayer* id_1396 = network->addElementWise(*id_1393->getOutput(0), *id_1395->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* id_1397 = network->addElementWise(*id_1396->getOutput(0), *id_1287->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1398 = network->addActivation(*id_1397->getOutput(0), ActivationType::kRELU);

    auto id_1405 = liteResBlock(network, weightMap, *id_1352->getOutput(0), 18, "stage3.2.branches.0.0");
    auto id_1412 = liteResBlock(network, weightMap, *id_1405->getOutput(0), 18, "stage3.2.branches.0.1");
    auto id_1419 = liteResBlock(network, weightMap, *id_1388->getOutput(0), 36, "stage3.2.branches.1.0");
    auto id_1426 = liteResBlock(network, weightMap, *id_1419->getOutput(0), 36, "stage3.2.branches.1.1");
    auto id_1433 = liteResBlock(network, weightMap, *id_1398->getOutput(0), 72, "stage3.2.branches.2.0");
    auto id_1440 = liteResBlock(network, weightMap, *id_1433->getOutput(0), 72, "stage3.2.branches.2.1");


    // 1412 + up(1426)+up(1440) 
    IConvolutionLayer* id_1441 = network->addConvolutionNd(*id_1426->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.2.fuse_layers.0.1.0.weight"], emptywts);
    assert(id_1441);
    id_1441->setStrideNd(DimsHW{ 1, 1 });
    id_1441->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1442 = addBatchNorm2d(network, weightMap, *id_1441->getOutput(0), "stage3.2.fuse_layers.0.1.1", 1e-5);
    ILayer* id_1471 = netAddUpsample(network, id_1442->getOutput(0), 18, 2);
    IElementWiseLayer* id_1472 = network->addElementWise(*id_1412->getOutput(0), *id_1471->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1473 = network->addConvolutionNd(*id_1440->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.2.fuse_layers.0.2.0.weight"], emptywts);
    assert(id_1473);
    id_1473->setStrideNd(DimsHW{ 1, 1 });
    id_1473->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1474 = addBatchNorm2d(network, weightMap, *id_1473->getOutput(0), "stage3.2.fuse_layers.0.2.1", 1e-5);
    ILayer* id_1503 = netAddUpsample(network, id_1474->getOutput(0), 18, 4);

    IElementWiseLayer* id_1504 = network->addElementWise(*id_1472->getOutput(0), *id_1503->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1505 = network->addActivation(*id_1504->getOutput(0), ActivationType::kRELU);

    // conv(1412)+1426+up(1440)
    IConvolutionLayer* id_1506 = network->addConvolutionNd(*id_1412->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage3.2.fuse_layers.1.0.0.0.weight"], emptywts);
    assert(id_1506);
    id_1506->setStrideNd(DimsHW{ 2, 2 });
    id_1506->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1507 = addBatchNorm2d(network, weightMap, *id_1506->getOutput(0), "stage3.2.fuse_layers.1.0.0.1", 1e-5);
    IElementWiseLayer* id_1508 = network->addElementWise(*id_1507->getOutput(0), *id_1426->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1509 = network->addConvolutionNd(*id_1440->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage3.2.fuse_layers.1.2.0.weight"], emptywts);
    assert(id_1509);
    id_1509->setStrideNd(DimsHW{ 1, 1 });
    id_1509->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1510 = addBatchNorm2d(network, weightMap, *id_1509->getOutput(0), "stage3.2.fuse_layers.1.2.1", 1e-5);
    ILayer* id_1539 = netAddUpsample(network, id_1510->getOutput(0), 36, 2);
    IElementWiseLayer* id_1540 = network->addElementWise(*id_1508->getOutput(0), *id_1539->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1541 = network->addActivation(*id_1540->getOutput(0), ActivationType::kRELU);

    // conv(1412)+conv(1426)+1440
    IConvolutionLayer* id_1542 = network->addConvolutionNd(*id_1412->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage3.2.fuse_layers.2.0.0.0.weight"], emptywts);
    assert(id_1542);
    id_1542->setStrideNd(DimsHW{ 2, 2 });
    id_1542->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1543 = addBatchNorm2d(network, weightMap, *id_1542->getOutput(0), "stage3.2.fuse_layers.2.0.0.1", 1e-5);
    IActivationLayer* id_1544 = network->addActivation(*id_1543->getOutput(0), ActivationType::kRELU);

    IConvolutionLayer* id_1545 = network->addConvolutionNd(*id_1544->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.2.fuse_layers.2.0.1.0.weight"], emptywts);
    assert(id_1545);
    id_1545->setStrideNd(DimsHW{ 2, 2 });
    id_1545->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1546 = addBatchNorm2d(network, weightMap, *id_1545->getOutput(0), "stage3.2.fuse_layers.2.0.1.1", 1e-5);

    IConvolutionLayer* id_1547 = network->addConvolutionNd(*id_1426->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.2.fuse_layers.2.1.0.0.weight"], emptywts);
    assert(id_1547);
    id_1547->setStrideNd(DimsHW{ 2, 2 });
    id_1547->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1548 = addBatchNorm2d(network, weightMap, *id_1547->getOutput(0), "stage3.2.fuse_layers.2.1.0.1", 1e-5);

    IElementWiseLayer* id_1549 = network->addElementWise(*id_1546->getOutput(0), *id_1548->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* id_1550 = network->addElementWise(*id_1549->getOutput(0), *id_1440->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1551 = network->addActivation(*id_1550->getOutput(0), ActivationType::kRELU);

    auto id_1561 = liteResBlock(network, weightMap, *id_1505->getOutput(0), 18, "stage4.0.branches.0.0");
    auto id_1568 = liteResBlock(network, weightMap, *id_1561->getOutput(0), 18, "stage4.0.branches.0.1");
    auto id_1575 = liteResBlock(network, weightMap, *id_1541->getOutput(0), 36, "stage4.0.branches.1.0");
    auto id_1582 = liteResBlock(network, weightMap, *id_1575->getOutput(0), 36, "stage4.0.branches.1.1");
    auto id_1589 = liteResBlock(network, weightMap, *id_1551->getOutput(0), 72, "stage4.0.branches.2.0");
    auto id_1596 = liteResBlock(network, weightMap, *id_1589->getOutput(0), 72, "stage4.0.branches.2.1");

    // transition
    auto id_1554 = convBnLeaky(network, weightMap, *id_1551->getOutput(0), 144, 3, 2, 1, "transition3.3.0.0", "transition3.3.0.1");
    auto id_1603 = liteResBlock(network, weightMap, *id_1554->getOutput(0), 144, "stage4.0.branches.3.0");
    auto id_1610 = liteResBlock(network, weightMap, *id_1603->getOutput(0), 144, "stage4.0.branches.3.1");

    // 1568+up(1582)+up(1596)+up(1610)
    IConvolutionLayer* id_1611 = network->addConvolutionNd(*id_1582->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.0.1.0.weight"], emptywts);
    assert(id_1611);
    id_1611->setStrideNd(DimsHW{ 1, 1 });
    id_1611->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1612 = addBatchNorm2d(network, weightMap, *id_1611->getOutput(0), "stage4.0.fuse_layers.0.1.1", 1e-5);
    ILayer* id_1641 = netAddUpsample(network, id_1612->getOutput(0), 18, 2);
    IElementWiseLayer* id_1642 = network->addElementWise(*id_1641->getOutput(0), *id_1568->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1643 = network->addConvolutionNd(*id_1596->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.0.2.0.weight"], emptywts);
    assert(id_1643);
    id_1643->setStrideNd(DimsHW{ 1, 1 });
    id_1643->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1644 = addBatchNorm2d(network, weightMap, *id_1643->getOutput(0), "stage4.0.fuse_layers.0.2.1", 1e-5);
    ILayer* id_1673 = netAddUpsample(network, id_1644->getOutput(0), 18, 4);
    IElementWiseLayer* id_1674 = network->addElementWise(*id_1642->getOutput(0), *id_1673->getOutput(0), ElementWiseOperation::kSUM);

    //3
    IConvolutionLayer* id_1675 = network->addConvolutionNd(*id_1610->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.0.3.0.weight"], emptywts);
    assert(id_1675);
    id_1675->setStrideNd(DimsHW{ 1, 1 });
    id_1675->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1676 = addBatchNorm2d(network, weightMap, *id_1675->getOutput(0), "stage4.0.fuse_layers.0.3.1", 1e-5);
    ILayer* id_1705 = netAddUpsample(network, id_1676->getOutput(0), 18, 8);
    IElementWiseLayer* id_1706 = network->addElementWise(*id_1705->getOutput(0), *id_1674->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1707 = network->addActivation(*id_1706->getOutput(0), ActivationType::kRELU);

    // conv(1568)+1582+up(1596)+up(1610)
    IConvolutionLayer* id_1708 = network->addConvolutionNd(*id_1568->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.1.0.0.0.weight"], emptywts);
    assert(id_1708);
    id_1708->setStrideNd(DimsHW{ 2, 2 });
    id_1708->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1709 = addBatchNorm2d(network, weightMap, *id_1708->getOutput(0), "stage4.0.fuse_layers.1.0.0.1", 1e-5);
    IElementWiseLayer* id_1710 = network->addElementWise(*id_1709->getOutput(0), *id_1582->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1711 = network->addConvolutionNd(*id_1596->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.1.2.0.weight"], emptywts);
    assert(id_1711);
    id_1711->setStrideNd(DimsHW{ 1, 1 });
    id_1711->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1712 = addBatchNorm2d(network, weightMap, *id_1711->getOutput(0), "stage4.0.fuse_layers.1.2.1", 1e-5);
    ILayer* id_1741 = netAddUpsample(network, id_1712->getOutput(0), 36, 2);
    IElementWiseLayer* id_1742 = network->addElementWise(*id_1741->getOutput(0), *id_1710->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1743 = network->addConvolutionNd(*id_1610->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.1.3.0.weight"], emptywts);
    assert(id_1743);
    id_1743->setStrideNd(DimsHW{ 1, 1 });
    id_1743->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1744 = addBatchNorm2d(network, weightMap, *id_1743->getOutput(0), "stage4.0.fuse_layers.1.3.1", 1e-5);
    ILayer* id_1773 = netAddUpsample(network, id_1744->getOutput(0), 36, 4);
    IElementWiseLayer* id_1774 = network->addElementWise(*id_1773->getOutput(0), *id_1742->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1775 = network->addActivation(*id_1774->getOutput(0), ActivationType::kRELU);

    // conv(1568)+conv(1582)+1596+up(1610)
    IConvolutionLayer* id_1776 = network->addConvolutionNd(*id_1568->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.2.0.0.0.weight"], emptywts);
    assert(id_1776);
    id_1776->setStrideNd(DimsHW{ 2, 2 });
    id_1776->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1777 = addBatchNorm2d(network, weightMap, *id_1776->getOutput(0), "stage4.0.fuse_layers.2.0.0.1", 1e-5);
    IActivationLayer* id_1778 = network->addActivation(*id_1777->getOutput(0), ActivationType::kRELU);

    IConvolutionLayer* id_1779 = network->addConvolutionNd(*id_1778->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.2.0.1.0.weight"], emptywts);
    assert(id_1779);
    id_1779->setStrideNd(DimsHW{ 2, 2 });
    id_1779->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1780 = addBatchNorm2d(network, weightMap, *id_1779->getOutput(0), "stage4.0.fuse_layers.2.0.1.1", 1e-5);

    IConvolutionLayer* id_1781 = network->addConvolutionNd(*id_1582->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.2.1.0.0.weight"], emptywts);
    assert(id_1781);
    id_1781->setStrideNd(DimsHW{ 2, 2 });
    id_1781->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1782 = addBatchNorm2d(network, weightMap, *id_1781->getOutput(0), "stage4.0.fuse_layers.2.1.0.1", 1e-5);

    IElementWiseLayer* id_1783 = network->addElementWise(*id_1780->getOutput(0), *id_1782->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* id_1784 = network->addElementWise(*id_1783->getOutput(0), *id_1596->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1785 = network->addConvolutionNd(*id_1610->getOutput(0), 72, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.2.3.0.weight"], emptywts);
    assert(id_1785);
    id_1785->setStrideNd(DimsHW{ 1, 1 });
    id_1785->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1786 = addBatchNorm2d(network, weightMap, *id_1785->getOutput(0), "stage4.0.fuse_layers.2.3.1", 1e-5);
    ILayer* id_1815 = netAddUpsample(network, id_1786->getOutput(0), 72, 2);

    IElementWiseLayer* id_1816 = network->addElementWise(*id_1784->getOutput(0), *id_1815->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1817 = network->addActivation(*id_1816->getOutput(0), ActivationType::kRELU);

    // conv(1568)+conv(1582)+conv(1596)+(1610)
    // 1568(cbr)1820(cbr)1823(cb)1825
    IConvolutionLayer* id_1818 = network->addConvolutionNd(*id_1568->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.0.0.0.weight"], emptywts);
    assert(id_1818);
    id_1818->setStrideNd(DimsHW{ 2, 2 });
    id_1818->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1819 = addBatchNorm2d(network, weightMap, *id_1818->getOutput(0), "stage4.0.fuse_layers.3.0.0.1", 1e-5);
    IActivationLayer* id_1820 = network->addActivation(*id_1819->getOutput(0), ActivationType::kRELU);
    IConvolutionLayer* id_1821 = network->addConvolutionNd(*id_1820->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.0.1.0.weight"], emptywts);
    assert(id_1821);
    id_1821->setStrideNd(DimsHW{ 2, 2 });
    id_1821->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1822 = addBatchNorm2d(network, weightMap, *id_1821->getOutput(0), "stage4.0.fuse_layers.3.0.1.1", 1e-5);
    IActivationLayer* id_1823 = network->addActivation(*id_1822->getOutput(0), ActivationType::kRELU);
    IConvolutionLayer* id_1824 = network->addConvolutionNd(*id_1823->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.0.2.0.weight"], emptywts);
    assert(id_1824);
    id_1824->setStrideNd(DimsHW{ 2, 2 });
    id_1824->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1825 = addBatchNorm2d(network, weightMap, *id_1824->getOutput(0), "stage4.0.fuse_layers.3.0.2.1", 1e-5);

    // 1582(cbr)1828(cb)1830
    IConvolutionLayer* id_1826 = network->addConvolutionNd(*id_1582->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.1.0.0.weight"], emptywts);
    assert(id_1826);
    id_1826->setStrideNd(DimsHW{ 2, 2 });
    id_1826->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1827 = addBatchNorm2d(network, weightMap, *id_1826->getOutput(0), "stage4.0.fuse_layers.3.1.0.1", 1e-5);
    IActivationLayer* id_1828 = network->addActivation(*id_1827->getOutput(0), ActivationType::kRELU);
    IConvolutionLayer* id_1829 = network->addConvolutionNd(*id_1828->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.1.1.0.weight"], emptywts);
    assert(id_1829);
    id_1829->setStrideNd(DimsHW{ 2, 2 });
    id_1829->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1830 = addBatchNorm2d(network, weightMap, *id_1829->getOutput(0), "stage4.0.fuse_layers.3.1.1.1", 1e-5);

    IElementWiseLayer* id_1831 = network->addElementWise(*id_1830->getOutput(0), *id_1825->getOutput(0), ElementWiseOperation::kSUM);

    // 1596(cb)1832
    IConvolutionLayer* id_1832 = network->addConvolutionNd(*id_1596->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.2.0.0.weight"], emptywts);
    assert(id_1832);
    id_1832->setStrideNd(DimsHW{ 2, 2 });
    id_1832->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1833 = addBatchNorm2d(network, weightMap, *id_1832->getOutput(0), "stage4.0.fuse_layers.3.2.0.1", 1e-5);

    IElementWiseLayer* id_1834 = network->addElementWise(*id_1833->getOutput(0), *id_1831->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* id_1835 = network->addElementWise(*id_1834->getOutput(0), *id_1610->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1836 = network->addActivation(*id_1835->getOutput(0), ActivationType::kRELU);

    auto id_1843 = liteResBlock(network, weightMap, *id_1707->getOutput(0), 18, "stage4.1.branches.0.0");
    auto id_1850 = liteResBlock(network, weightMap, *id_1843->getOutput(0), 18, "stage4.1.branches.0.1");
    auto id_1857 = liteResBlock(network, weightMap, *id_1775->getOutput(0), 36, "stage4.1.branches.1.0");
    auto id_1864 = liteResBlock(network, weightMap, *id_1857->getOutput(0), 36, "stage4.1.branches.1.1");
    auto id_1871 = liteResBlock(network, weightMap, *id_1817->getOutput(0), 72, "stage4.1.branches.2.0");
    auto id_1878 = liteResBlock(network, weightMap, *id_1871->getOutput(0), 72, "stage4.1.branches.2.1");
    auto id_1885 = liteResBlock(network, weightMap, *id_1836->getOutput(0), 144, "stage4.1.branches.3.0");
    auto id_1892 = liteResBlock(network, weightMap, *id_1885->getOutput(0), 144, "stage4.1.branches.3.1");

    // 1850+up1864+up1878+up1892
    IConvolutionLayer* id_1893 = network->addConvolutionNd(*id_1864->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.0.1.0.weight"], emptywts);
    assert(id_1893);
    id_1893->setStrideNd(DimsHW{ 1, 1 });
    id_1893->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1894 = addBatchNorm2d(network, weightMap, *id_1893->getOutput(0), "stage4.1.fuse_layers.0.1.1", 1e-5);
    ILayer* id_1923 = netAddUpsample(network, id_1894->getOutput(0), 18, 2);
    IElementWiseLayer* id_1924 = network->addElementWise(*id_1850->getOutput(0), *id_1923->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1925 = network->addConvolutionNd(*id_1878->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.0.2.0.weight"], emptywts);
    assert(id_1925);
    id_1925->setStrideNd(DimsHW{ 1, 1 });
    id_1925->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1926 = addBatchNorm2d(network, weightMap, *id_1925->getOutput(0), "stage4.1.fuse_layers.0.2.1", 1e-5);
    ILayer* id_1955 = netAddUpsample(network, id_1926->getOutput(0), 18, 4);
    IElementWiseLayer* id_1956 = network->addElementWise(*id_1924->getOutput(0), *id_1955->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1957 = network->addConvolutionNd(*id_1892->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.0.3.0.weight"], emptywts);
    assert(id_1957);
    id_1957->setStrideNd(DimsHW{ 1, 1 });
    id_1957->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1958 = addBatchNorm2d(network, weightMap, *id_1957->getOutput(0), "stage4.1.fuse_layers.0.3.1", 1e-5);
    ILayer* id_1987 = netAddUpsample(network, id_1958->getOutput(0), 18, 8);
    IElementWiseLayer* id_1988 = network->addElementWise(*id_1956->getOutput(0), *id_1987->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_1989 = network->addActivation(*id_1988->getOutput(0), ActivationType::kRELU);

    // conv1850+1864+up1878+up1892
    IConvolutionLayer* id_1990 = network->addConvolutionNd(*id_1850->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.1.0.0.0.weight"], emptywts);
    assert(id_1990);
    id_1990->setStrideNd(DimsHW{ 2, 2 });
    id_1990->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_1991 = addBatchNorm2d(network, weightMap, *id_1990->getOutput(0), "stage4.1.fuse_layers.1.0.0.1", 1e-5);
    IElementWiseLayer* id_1992 = network->addElementWise(*id_1991->getOutput(0), *id_1864->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_1993 = network->addConvolutionNd(*id_1878->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.1.2.0.weight"], emptywts);
    assert(id_1993);
    id_1993->setStrideNd(DimsHW{ 1, 1 });
    id_1993->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_1994 = addBatchNorm2d(network, weightMap, *id_1993->getOutput(0), "stage4.1.fuse_layers.1.2.1", 1e-5);
    ILayer* id_2023 = netAddUpsample(network, id_1994->getOutput(0), 36, 2);
    IElementWiseLayer* id_2024 = network->addElementWise(*id_1992->getOutput(0), *id_2023->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_2025 = network->addConvolutionNd(*id_1892->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.1.3.0.weight"], emptywts);
    assert(id_2025);
    id_2025->setStrideNd(DimsHW{ 1, 1 });
    id_2025->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_2026 = addBatchNorm2d(network, weightMap, *id_2025->getOutput(0), "stage4.1.fuse_layers.1.3.1", 1e-5);
    ILayer* id_2055 = netAddUpsample(network, id_2026->getOutput(0), 36, 4);
    IElementWiseLayer* id_2056 = network->addElementWise(*id_2024->getOutput(0), *id_2055->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_2057 = network->addActivation(*id_2056->getOutput(0), ActivationType::kRELU);

    //conv1850 + conv 1864 + 1878 + up1892
    IConvolutionLayer* id_2058 = network->addConvolutionNd(*id_1850->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.2.0.0.0.weight"], emptywts);
    assert(id_2058);
    id_2058->setStrideNd(DimsHW{ 2, 2 });
    id_2058->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_2059 = addBatchNorm2d(network, weightMap, *id_2058->getOutput(0), "stage4.1.fuse_layers.2.0.0.1", 1e-5);
    IActivationLayer* id_2060 = network->addActivation(*id_2059->getOutput(0), ActivationType::kRELU);

    IConvolutionLayer* id_2061 = network->addConvolutionNd(*id_2060->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.2.0.1.0.weight"], emptywts);
    assert(id_2061);
    id_2061->setStrideNd(DimsHW{ 2, 2 });
    id_2061->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_2062 = addBatchNorm2d(network, weightMap, *id_2061->getOutput(0), "stage4.1.fuse_layers.2.0.1.1", 1e-5);

    IConvolutionLayer* id_2063 = network->addConvolutionNd(*id_1864->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.2.1.0.0.weight"], emptywts);
    assert(id_2063);
    id_2063->setStrideNd(DimsHW{ 2, 2 });
    id_2063->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_2064 = addBatchNorm2d(network, weightMap, *id_2063->getOutput(0), "stage4.1.fuse_layers.2.1.0.1", 1e-5);

    IElementWiseLayer* id_2065 = network->addElementWise(*id_2062->getOutput(0), *id_2064->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* id_2066 = network->addElementWise(*id_1878->getOutput(0), *id_2065->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_2067 = network->addConvolutionNd(*id_1892->getOutput(0), 72, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.2.3.0.weight"], emptywts);
    assert(id_2067);
    id_2067->setStrideNd(DimsHW{ 1, 1 });
    id_2067->setPaddingNd(DimsHW{ 0, 0 });
    IScaleLayer* id_2068 = addBatchNorm2d(network, weightMap, *id_2067->getOutput(0), "stage4.1.fuse_layers.2.3.1", 1e-5);
    ILayer* id_2097 = netAddUpsample(network, id_2068->getOutput(0), 72, 2);

    IElementWiseLayer* id_2098 = network->addElementWise(*id_2097->getOutput(0), *id_2066->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_2099 = network->addActivation(*id_2098->getOutput(0), ActivationType::kRELU);

    // conv1850+conv1864+conv1878+1892
    IConvolutionLayer* id_2100 = network->addConvolutionNd(*id_1850->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.0.0.0.weight"], emptywts);
    assert(id_2100);
    id_2100->setStrideNd(DimsHW{ 2, 2 });
    id_2100->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_2101 = addBatchNorm2d(network, weightMap, *id_2100->getOutput(0), "stage4.1.fuse_layers.3.0.0.1", 1e-5);
    IActivationLayer* id_2102 = network->addActivation(*id_2101->getOutput(0), ActivationType::kRELU);
    IConvolutionLayer* id_2103 = network->addConvolutionNd(*id_2102->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.0.1.0.weight"], emptywts);
    assert(id_2103);
    id_2103->setStrideNd(DimsHW{ 2, 2 });
    id_2103->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_2104 = addBatchNorm2d(network, weightMap, *id_2103->getOutput(0), "stage4.1.fuse_layers.3.0.1.1", 1e-5);
    IActivationLayer* id_2105 = network->addActivation(*id_2104->getOutput(0), ActivationType::kRELU);
    IConvolutionLayer* id_2106 = network->addConvolutionNd(*id_2105->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.0.2.0.weight"], emptywts);
    assert(id_2106);
    id_2106->setStrideNd(DimsHW{ 2, 2 });
    id_2106->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_2107 = addBatchNorm2d(network, weightMap, *id_2106->getOutput(0), "stage4.1.fuse_layers.3.0.2.1", 1e-5);

    // 
    IConvolutionLayer* id_2108 = network->addConvolutionNd(*id_1864->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.1.0.0.weight"], emptywts);
    assert(id_2108);
    id_2108->setStrideNd(DimsHW{ 2, 2 });
    id_2108->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_2109 = addBatchNorm2d(network, weightMap, *id_2108->getOutput(0), "stage4.1.fuse_layers.3.1.0.1", 1e-5);
    IActivationLayer* id_2110 = network->addActivation(*id_2109->getOutput(0), ActivationType::kRELU);
    IConvolutionLayer* id_2111 = network->addConvolutionNd(*id_2110->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.1.1.0.weight"], emptywts);
    assert(id_2111);
    id_2111->setStrideNd(DimsHW{ 2, 2 });
    id_2111->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_2112 = addBatchNorm2d(network, weightMap, *id_2111->getOutput(0), "stage4.1.fuse_layers.3.1.1.1", 1e-5);

    IElementWiseLayer* id_2113 = network->addElementWise(*id_2107->getOutput(0), *id_2112->getOutput(0), ElementWiseOperation::kSUM);

    IConvolutionLayer* id_2114 = network->addConvolutionNd(*id_1878->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.2.0.0.weight"], emptywts);
    assert(id_2114);
    id_2114->setStrideNd(DimsHW{ 2, 2 });
    id_2114->setPaddingNd(DimsHW{ 1, 1 });
    IScaleLayer* id_2115 = addBatchNorm2d(network, weightMap, *id_2114->getOutput(0), "stage4.1.fuse_layers.3.2.0.1", 1e-5);

    IElementWiseLayer* id_2116 = network->addElementWise(*id_2113->getOutput(0), *id_2115->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* id_2117 = network->addElementWise(*id_2116->getOutput(0), *id_1892->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* id_2118 = network->addActivation(*id_2117->getOutput(0), ActivationType::kRELU);

    //res
    auto id_2174 = ResBlock2Conv(network, weightMap, *id_2118->getOutput(0), 256, 1024, 1, "incre_modules.3.0");
    auto id_2158 = ResBlock2Conv(network, weightMap, *id_2099->getOutput(0), 128, 512, 1, "incre_modules.2.0");
    auto id_2142 = ResBlock2Conv(network, weightMap, *id_2057->getOutput(0), 64, 256, 1, "incre_modules.1.0");
    auto id_2130 = ResBlock2Conv(network, weightMap, *id_1989->getOutput(0), 32, 128, 1, "incre_modules.0.0");

    auto id_2145 = convBnLeaky(network, weightMap, *id_2130->getOutput(0), 256, 3, 2, 1, "downsamp_modules.0.0", "downsamp_modules.0.1", true);
    IElementWiseLayer* id_2146 = network->addElementWise(*id_2145->getOutput(0), *id_2142->getOutput(0), ElementWiseOperation::kSUM);
    auto id_2161 = convBnLeaky(network, weightMap, *id_2146->getOutput(0), 512, 3, 2, 1, "downsamp_modules.1.0", "downsamp_modules.1.1", true);
    IElementWiseLayer* id_2162 = network->addElementWise(*id_2161->getOutput(0), *id_2158->getOutput(0), ElementWiseOperation::kSUM);
    auto id_2177 = convBnLeaky(network, weightMap, *id_2162->getOutput(0), 1024, 3, 2, 1, "downsamp_modules.2.0", "downsamp_modules.2.1", true);
    IElementWiseLayer* id_2178 = network->addElementWise(*id_2177->getOutput(0), *id_2174->getOutput(0), ElementWiseOperation::kSUM);

    auto id_2181 = convBnLeaky(network, weightMap, *id_2178->getOutput(0), 2048, 1, 1, 0, "final_layer.0", "final_layer.1", true);
    //   y = F.avg_pool2d(y, kernel_size=y.size()[2:]).view(y.size(0), -1)
    auto pool = network->addPoolingNd(*id_2181->getOutput(0), PoolingType::kAVERAGE, DimsHW{ 7, 7 });
    pool->setPaddingNd(DimsHW{ 0, 0 });
    pool->setStrideNd(DimsHW{ 1, 1 });
    // self.classifier = nn.Linear(2048, 1000)
    IFullyConnectedLayer* out = network->addFullyConnected(*pool->getOutput(0), 1000, weightMap["classifier.weight"], weightMap["classifier.bias"]);
    assert(out);
    out->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*out->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize((1 << 30));  // 1G
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
    }

    return engine;
}

// Builds the network through the TensorRT definition API and returns the
// serialized engine.
//
// maxBatchSize  forwarded unchanged to createEngine().
// modelStream   out-parameter; receives the IHostMemory produced by
//               ICudaEngine::serialize(). Ownership passes to the caller,
//               which must call destroy() on it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    assert(modelStream != nullptr);

    // Create builder and its configuration object
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The config object is owned by this function and
    // was previously never released.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one batch through the engine behind `context`.
//
// context    execution context to enqueue on.
// input      host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats.
// output     host buffer receiving batchSize * OUTPUT_SIZE floats.
// batchSize  batch size passed to enqueue() and used to size both transfers.
//
// Device buffers and the CUDA stream are created and released on every call.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Transfer sizes, computed once and shared by the allocation and the copies.
    const size_t inputBytes = batchSize * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = batchSize * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // Report instead of silently returning whatever is in the output buffer.
        std::cerr << "doInference: enqueue failed, output is not valid" << std::endl;
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Asynchronous failures surface here, so this return code must be checked too.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Program entry point.
//
//   <prog> -s              build the engine and write it to "hrnet.engine"
//   <prog> -d <image_dir>  load "hrnet.engine" and classify every image
//                          found in <image_dir>
//
// Returns 0 on success and -1 on bad arguments or file errors.
int main(int argc, char** argv) {

    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{ nullptr };
    size_t size{ 0 };
    std::string engine_name = "hrnet.engine";
    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{ nullptr };
        APIToModel(BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p(engine_name, std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }
    else if (argc == 3 && std::string(argv[1]) == "-d") {
        std::ifstream file(engine_name, std::ios::binary);
        if (!file.good()) {
            // Without this check a null buffer would be handed to the deserializer.
            std::cerr << "could not open engine file " << engine_name << ", run with -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }
    else {
        const char* prog = (argc > 0 && argv[0] != nullptr) ? argv[0] : "hrnet";
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << prog << " -s  // serialize model to plan file" << std::endl;
        std::cerr << prog << " -d ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    std::vector<std::string> file_names;
    if (read_files_in_dir(argv[2], file_names) < 0) {
        std::cout << "read_files_in_dir failed." << std::endl;
        delete[] trtModelStream;
        return -1;
    }
    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    /*
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    inp_image = ((resized_img/255. - mean) / std).astype(np.float32)
    */
    const int planeFloats = INPUT_H * INPUT_W;   // one colour plane
    const int imageFloats = 3 * planeFloats;     // one CHW image
    const int numClasses = 1000;                 // width of the classifier layer added in createEngine
    int fcount = 0;
    for (int f = 0; f < (int)file_names.size(); f++) {
        fcount++;
        // Keep collecting until the batch is full or the file list is exhausted.
        if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue;
        for (int b = 0; b < fcount; b++) {
            float* slot = data + b * imageFloats;
            const std::string& name = file_names[f - fcount + 1 + b];
            cv::Mat img = cv::imread(std::string(argv[2]) + "/" + name); // BGR
            if (img.empty()) {
                // Do not let pixels from an earlier batch leak into this slot.
                std::cerr << "could not read image " << name << ", using an all-zero input" << std::endl;
                for (int k = 0; k < imageFloats; ++k) slot[k] = 0.0f;
                continue;
            }
            cv::Mat pr_img;
            cv::resize(img, pr_img, cv::Size(INPUT_W, INPUT_H));
            // Interleaved BGR bytes -> planar RGB floats, normalised with the mean/std above.
            int i = 0;
            for (int row = 0; row < INPUT_H; ++row) {
                uchar* uc_pixel = pr_img.data + row * pr_img.step;
                for (int col = 0; col < INPUT_W; ++col) {
                    slot[i] = ((float)uc_pixel[2] / 255.0 - 0.485) / 0.229; // R-0.485
                    slot[i + planeFloats] = ((float)uc_pixel[1] / 255.0 - 0.456) / 0.224;
                    slot[i + 2 * planeFloats] = ((float)uc_pixel[0] / 255.0 - 0.406) / 0.225;
                    uc_pixel += 3;
                    ++i;
                }
            }
        }
        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, BATCH_SIZE);
        auto end = std::chrono::system_clock::now();
        std::cout << "infer time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        // Arg-max per image; seeding from the first score keeps this correct
        // even when every score is negative.
        for (int b = 0; b < fcount; b++) {
            const float* scores = prob + b * OUTPUT_SIZE;
            int index = 0;
            for (int j = 1; j < numClasses; ++j)
            {
                if (scores[j] > scores[index])
                {
                    index = j;
                }
            }
            std::cout << "out index: " << index << std::endl;
        }
        // Start a fresh batch; without this reset later iterations index past
        // the end of `data` and `prob`.
        fcount = 0;
    }

    // Release TensorRT objects in reverse order of creation.
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}

================================================
FILE: hrnet/hrnet-image-classification/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes its accumulated text to an output stream,
//!  preceded by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix.
//!
//!  Nothing is written, and the buffered text is left untouched, while logging is disabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    destination of the formatted output; held by reference
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog whether putOutput() emits anything
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the destination, prefix and logging flag of \p other.
    //! All three must be transferred: leaving mShouldLog uninitialized makes
    //! every later read of it undefined, and an empty prefix drops the severity tag.
    //! Text still pending in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the destination stream,
    //! then empties the buffer and flushes the stream. No-op while logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so
            // the two are never split between stdout and stderr
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                // fill() is sticky, so remember the caller's setting and put it back afterwards
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output from subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // destination stream, not owned
    std::string mPrefix;   // written before every message
    bool mShouldLog;       // gate checked by putOutput()
};

//!
//! \class LogStreamConsumerBase
//! \brief Owns the LogStreamConsumerBuffer so that, as the first base of LogStreamConsumer,
//!  the buffer is constructed before the std::ostream base that is given its address.
//!
class LogStreamConsumerBase
{
protected:
    //! Buffer exposed to the derived class.
    LogStreamConsumerBuffer mBuffer;

public:
    //! Forwards all three arguments to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief std::ostream front end for a LogStreamConsumerBuffer, letting callers log with `<<`.
//!  LogStreamConsumerBase must stay the first base class: it holds the buffer, and bases are
//!  constructed in declaration order, so the buffer exists by the time its address is handed
//!  to std::ostream. Swapping the bases would pass the address of an unconstructed buffer.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  Output is enabled when \p severity is at or below \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity),
                                isReportable(severity, reportableSeverity))
        , std::ostream(&mBuffer) // attach the buffer to this stream
        , mShouldLog(isReportable(severity, reportableSeverity))
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the severity and logging flag of \p other;
    //!  the destination stream and prefix are re-derived from that severity.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the buffer to this stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs and pushes the result to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = isReportable(mSeverity, reportableSeverity);
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! True when a message of level \p severity passes the \p reportableSeverity threshold.
    static bool isReportable(Severity severity, Severity reportableSeverity)
    {
        return severity <= reportableSeverity;
    }

    //! std::cerr for severities below kINFO, std::cout otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity < Severity::kINFO)
        {
            return std::cerr;
        }
        return std::cout;
    }

    //! Bracketed one-letter tag for \p severity; asserts on an unknown value.
    static std::string severityPrefix(Severity severity)
    {
        if (severity == Severity::kINTERNAL_ERROR)
        {
            return "[F] ";
        }
        if (severity == Severity::kERROR)
        {
            return "[E] ";
        }
        if (severity == Severity::kWARNING)
        {
            return "[W] ";
        }
        if (severity == Severity::kINFO)
        {
            return "[I] ";
        }
        if (severity == Severity::kVERBOSE)
        {
            return "[V] ";
        }
        assert(0);
        return "";
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a Logger that reports messages up to the given severity
    //!
    //! \param severity The initial reportable severity (defaults to Severity::kWARNING)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity The severity of the message
    //! \param msg The message text; a null pointer is logged as an empty message
    //!
    void log(Severity severity, const char* msg) override
    {
        // Constructing std::string from a null pointer is undefined behaviour, so substitute "".
        LogStreamConsumer(mReportableSeverity, severity)
            << "[TRT] " << std::string(msg != nullptr ? msg : "") << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< set once reportTestStart() has been called for this atom
        std::string mName;    //!< test name printed in result lines
        std::string mCmdline; //!< command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Verify the precondition before emitting anything, mirroring reportTestEnd().
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently used to filter messages
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; //!< threshold used to filter messages passed to log()
};

namespace
{

//!
//! \brief Builds a LogStreamConsumer that logs at kVERBOSE severity, filtered by the
//!        reportable severity of \p logger
//!
//! Typical use:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Builds a LogStreamConsumer that logs at kINFO severity, filtered by the
//!        reportable severity of \p logger
//!
//! Typical use:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Builds a LogStreamConsumer that logs at kWARNING severity, filtered by the
//!        reportable severity of \p logger
//!
//! Typical use:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Builds a LogStreamConsumer that logs at kERROR severity, filtered by the
//!        reportable severity of \p logger
//!
//! Typical use:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Builds a LogStreamConsumer that logs at kINTERNAL_ERROR ("fatal") severity,
//!        filtered by the reportable severity of \p logger
//!
//! Typical use:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: hrnet/hrnet-semantic-segmentation/CMakeLists.txt
================================================
# Build script for the two HRNet semantic-segmentation samples: hrnet and hrnet_ocr.
cmake_minimum_required(VERSION 2.6)

project(hrnetseg)

add_definitions(-std=c++11)

# option() expects (<variable> "<help text>" [value]); previously OFF was passed as the help text.
# The resulting default value (OFF) is unchanged.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# HRNet semantic segmentation sample
add_executable(hrnet ${PROJECT_SOURCE_DIR}/hrnet.cpp)
target_link_libraries(hrnet nvinfer)
target_link_libraries(hrnet cudart)
target_link_libraries(hrnet ${OpenCV_LIBS})


# HRNet + OCR semantic segmentation sample
add_executable(hrnet_ocr ${PROJECT_SOURCE_DIR}/hrnet_ocr.cpp)
target_link_libraries(hrnet_ocr nvinfer)
target_link_libraries(hrnet_ocr cudart)
target_link_libraries(hrnet_ocr ${OpenCV_LIBS})


add_definitions(-O2 -pthread)


================================================
FILE: hrnet/hrnet-semantic-segmentation/README.md
================================================
# HRNet-Semantic-Segmentation

This repo implements [HRNet-Semantic-Segmentation-v1.1](https://github.com/HRNet/HRNet-Semantic-Segmentation/tree/pytorch-v1.1) and [HRNet-Semantic-Segmentation-OCR](https://github.com/HRNet/HRNet-Semantic-Segmentation/tree/HRNet-OCR).


## How to Run
### For HRNet-Semantic-Segmentation-v1.1
1. Generate the .wts file. This example uses the config `experiments/cityscapes/seg_hrnet_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml` and the pretrained weight `hrnet_w48_cityscapes_cls19_1024x2048_trainset.pth`. First change `PRETRAINED` in `experiments/cityscapes/seg_hrnet_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml` to `""`.
```
cp gen_wts.py $HRNET-Semantic-Segmentation-PROJECT-ROOT/tools
cd $HRNET-Semantic-Segmentation-PROJECT-ROOT
python tools/gen_wts.py --cfg experiments/cityscapes/seg_hrnet_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml --ckpt_path hrnet_w48_cityscapes_cls19_1024x2048_trainset.pth --save_path hrnet_w48.wts
cp hrnet_w48.wts $HRNET-TENSORRT-ROOT
cd $HRNET-TENSORRT-ROOT
```
2. cmake and make

  ```
  mkdir build
  cd build
  cmake ..
  make
  ```
  first serialize model to plan file
  ```
  ./hrnet -s [.wts] [.engine] [small or 18 or 32 or 48] # small for W18-Small-v2, 18 for W18, etc.
  ```
  such as
  ```
  ./hrnet -s ../hrnet_w48.wts ./hrnet_w48.engine 48
  ```
  then deserialize plan file and run inference
  ```
  ./hrnet -d  [.engine] [image dir]
  ```
  such as 
  ```
  ./hrnet -d  ./hrnet_w48.engine ../samples
  ```
### For HRNet-Semantic-Segmentation-OCR

1. Generate the .wts file. This example uses the config `experiments/cityscapes/seg_hrnet_ocr_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml` and the pretrained weight `hrnet_ocr_cs_8162_torch11.pth`. First change `PRETRAINED` in `experiments/cityscapes/seg_hrnet_ocr_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml` to `""`.
```
cp gen_wts.py $HRNET-OCR-PROJECT-ROOT/tools
cd $HRNET-OCR-PROJECT-ROOT
python tools/gen_wts.py --cfg experiments/cityscapes/seg_hrnet_ocr_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml --ckpt_path hrnet_ocr_cs_8162_torch11.pth --save_path hrnet_ocr_w48.wts
cp hrnet_ocr_w48.wts $HRNET-OCR-TENSORRT-ROOT
cd $HRNET-OCR-TENSORRT-ROOT
```
2. cmake and make

  ```
  mkdir build
  cd build
  cmake ..
  make
  ```
  first serialize model to plan file
  ```
  ./hrnet_ocr -s [.wts] [.engine] [18 or 32 or 48]
  ```
  such as
  ```
  ./hrnet_ocr -s ../hrnet_ocr_w48.wts ./hrnet_ocr_w48.engine 48
  ```
  then deserialize plan file and run inference
  ```
  ./hrnet_ocr -d  [.engine] [image dir]
  ```
  such as 
  ```
  ./hrnet_ocr -d  ./hrnet_ocr_w48.engine ../samples
  ```
## Result

TRT Result:

![trtcity](https://user-images.githubusercontent.com/20653176/103136469-a68e2080-46fb-11eb-9f05-06bad81c74b9.png)

pytorch result:

![image-20201225171224159](https://user-images.githubusercontent.com/20653176/103131619-6cf9ed00-46dc-11eb-9369-4374abb65744.png)

## Note

* Some of the source code was changed for simplicity, but the original model can still be used.

  All "upsample" ops in the source code are changed to `mode='bilinear', align_corners=True`.

* Image preprocessing and postprocessing operations are built into the TensorRT engine.

* Zero-copy technology (CPU/GPU memory copy) is used.


================================================
FILE: hrnet/hrnet-semantic-segmentation/common.hpp
================================================
#pragma once

#include <fstream>
#include <map>
#include <sstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "cuda_runtime_api.h"

// Make the nvinfer1 names (ITensor, Weights, INetworkDefinition, ...) usable unqualified
// throughout this header and every file that includes it.
using namespace nvinfer1;

// Evaluates `status` exactly once. If the result is non-zero, prints it to std::cerr as a
// "Cuda failure" and aborts the process. The do { ... } while (0) wrapper makes the macro
// expand to a single statement, so it is safe inside unbraced if/else bodies.
#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
        {                                                      \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)

// Appends the name of every entry in directory `p_dir_name` to `file_names`, skipping the
// "." and ".." entries. Only bare entry names are stored, not paths prefixed with the
// directory. Returns 0 on success and -1 when the directory cannot be opened.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent *entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.push_back(std::string(entry->d_name));
    }

    closedir(dir_handle);
    return 0;
}


// TensorRT weight files (parsed by loadWeights) have a simple space delimited format:
// [name] [size] <data x size in hex>
// Writes "<head> : d0 d1 ... " followed by a newline to std::cout, where d0.. are the
// dimensions reported by `input_tensor`.
void debug_print(ITensor *input_tensor, std::string head)
{
    std::cout << head << " : ";

    const Dims shape = input_tensor->getDimensions();
    int axis = 0;
    while (axis < shape.nbDims)
    {
        std::cout << shape.d[axis] << " ";
        ++axis;
    }
    std::cout << std::endl;
}
// Loads a .wts weight file into a name -> Weights map.
//
// File layout (whitespace separated): first the number of blobs, then for each blob its
// name, its element count in decimal, and that many 32-bit values written in hex.
// Every blob is stored as DataType::kFLOAT. The value buffers are allocated with malloc
// and are not freed here; the returned map carries the raw pointers.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. Size the buffer by the element type: sizeof(val) would be the size of
        // the pointer and over-allocate on 64-bit targets.
        uint32_t *val = reinterpret_cast<uint32_t *>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        // A failed extraction would otherwise leave part of the buffer uninitialized.
        assert(input && "Truncated or malformed weight file.");
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Builds a 1x256 8-bit lookup table whose entry j is j * (256 / len), using integer
// division, with results above 255 clamped to 255.
cv::Mat createLTU(int len)
{
    cv::Mat table(1, 256, CV_8U);
    uchar *entries = table.data;
    for (int idx = 0; idx < 256; ++idx)
    {
        const int scaled = idx * (256 / len);
        if (scaled > 255)
        {
            entries[idx] = uchar(255);
        }
        else
        {
            entries[idx] = (uchar)(scaled);
        }
    }
    return table;
}
// Appends input normalization layers to `network` and returns the resulting tensor.
//
// Steps, in order:
//   1. if `div255` is set, divide `input` by a {3,1,1} constant filled with 255
//   2. subtract the {3,1,1} constant backed by `mean`
//   3. if `std` is non-null, divide by the {3,1,1} constant backed by `std`
// `mean` and `std` are referenced, not copied, so each must point at 3 floats that stay
// valid for as long as the network uses them. The 255 buffer is malloc'ed and not freed here.
ITensor *MeanStd(INetworkDefinition *network, ITensor *input, float *mean, float *std, bool div255)
{
    ITensor *current = input;
    if (div255)
    {
        float *divisor = reinterpret_cast<float *>(malloc(sizeof(float) * 3));
        divisor[0] = 255.0f;
        divisor[1] = 255.0f;
        divisor[2] = 255.0f;
        Weights divWts{DataType::kFLOAT, divisor, 3};
        IConstantLayer *divConst = network->addConstant(Dims3{3, 1, 1}, divWts);
        current = network->addElementWise(*current, *divConst->getOutput(0), ElementWiseOperation::kDIV)->getOutput(0);
    }

    Weights meanWts{DataType::kFLOAT, mean, 3};
    IConstantLayer *meanConst = network->addConstant(Dims3{3, 1, 1}, meanWts);
    IElementWiseLayer *centered = network->addElementWise(*current, *meanConst->getOutput(0), ElementWiseOperation::kSUB);

    // Without a std buffer the mean-subtracted tensor is the final result.
    if (std == nullptr)
    {
        return centered->getOutput(0);
    }

    Weights stdWts{DataType::kFLOAT, std, 3};
    IConstantLayer *stdConst = network->addConstant(Dims3{3, 1, 1}, stdWts);
    IElementWiseLayer *scaled = network->addElementWise(*centered->getOutput(0), *stdConst->getOutput(0), ElementWiseOperation::kDIV);
    return scaled->getOutput(0);
}

// Adds a per-channel scale layer that applies inference-time batch normalization.
//
// Reads `<lname>.weight`, `<lname>.bias`, `<lname>.running_mean` and `<lname>.running_var`
// from `weightMap` and computes, per channel:
//   scale = weight / sqrt(var + eps)
//   shift = bias - mean * weight / sqrt(var + eps)
//   power = 1
// The three malloc'ed buffers are also stored in `weightMap` under `<lname>.scale`,
// `<lname>.shift` and `<lname>.power`. Returns the created IScaleLayer.
IScaleLayer *addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, std::string lname, float eps)
{
    const float *bnWeight = static_cast<const float *>(weightMap[lname + ".weight"].values);
    const float *bnBias = static_cast<const float *>(weightMap[lname + ".bias"].values);
    const float *runMean = static_cast<const float *>(weightMap[lname + ".running_mean"].values);
    const float *runVar = static_cast<const float *>(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;

    float *scaleVals = reinterpret_cast<float *>(malloc(sizeof(float) * len));
    float *shiftVals = reinterpret_cast<float *>(malloc(sizeof(float) * len));
    float *powerVals = reinterpret_cast<float *>(malloc(sizeof(float) * len));

    // One pass fills all three per-channel buffers.
    for (int c = 0; c < len; ++c)
    {
        scaleVals[c] = bnWeight[c] / sqrt(runVar[c] + eps);
        shiftVals[c] = bnBias[c] - runMean[c] * bnWeight[c] / sqrt(runVar[c] + eps);
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, len};
    Weights shift{DataType::kFLOAT, shiftVals, len};
    Weights power{DataType::kFLOAT, powerVals, len};

    // Keep the buffers reachable through the weight map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer *bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Adds convolution -> batch norm (eps 1e-5) -> optional ReLU to `network`.
//
// The convolution uses a ksize x ksize kernel, stride `s`, padding `p`, `outch` output
// channels and the weights `<convname>.weight`; `<convname>.bias` is used only when `bias`
// is true. Batch norm parameters are looked up under `bnname`. Output shapes of the conv
// and batch norm layers are printed via debug_print.
// Returns the ReLU layer when `relu` is true, otherwise the batch norm layer.
ILayer *convBnRelu(INetworkDefinition *network,
                   std::map<std::string, Weights> &weightMap,
                   ITensor &input, int outch, int ksize, int s, int p,
                   std::string convname, std::string bnname,
                   bool relu = true,
                   bool bias = false)
{
    Weights noBias{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer *conv = bias
        ? network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], weightMap[convname + ".bias"])
        : network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});
    debug_print(conv->getOutput(0), convname);

    IScaleLayer *bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), bnname, 1e-5);
    debug_print(bn->getOutput(0), bnname);

    if (!relu)
    {
        return bn;
    }
    return network->addActivation(*bn->getOutput(0), ActivationType::kRELU);
}

// Adds a three-convolution residual block with an optional projection shortcut.
//
// Main path: 1x1 conv (inch) -> BN -> ReLU -> 3x3 conv (inch, pad 1) -> BN -> ReLU
//            -> 1x1 conv (outch) -> BN.
// Shortcut:  `input` itself, or when inch != outch a 1x1 conv (outch) -> BN using the
//            `<lname>.downsample.0` / `<lname>.downsample.1` weights.
// Every convolution is bias-free and uses `stride`. The two paths are summed and passed
// through a final ReLU, which is returned.
IActivationLayer *ResBlock2Conv(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, int inch, int outch, int stride, std::string lname)
{
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    // Bias-free conv with the block stride, shape print, then batch norm (eps 1e-5).
    auto convBn = [&](ITensor &src, int channels, int kernel, int pad,
                      const std::string &weightKey, const std::string &bnName,
                      const std::string &tag) -> IScaleLayer * {
        IConvolutionLayer *conv = network->addConvolutionNd(src, channels, DimsHW{kernel, kernel}, weightMap[weightKey], noBias);
        assert(conv);
        conv->setStrideNd(DimsHW{stride, stride});
        conv->setPaddingNd(DimsHW{pad, pad});
        debug_print(conv->getOutput(0), tag);
        return addBatchNorm2d(network, weightMap, *conv->getOutput(0), bnName, 1e-5);
    };

    IScaleLayer *bn1 = convBn(input, inch, 1, 0, lname + ".conv1.weight", lname + ".bn1", lname + "_1");
    IActivationLayer *act1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(act1);

    IScaleLayer *bn2 = convBn(*act1->getOutput(0), inch, 3, 1, lname + ".conv2.weight", lname + ".bn2", lname + "_2");
    IActivationLayer *act2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(act2);

    IScaleLayer *bn3 = convBn(*act2->getOutput(0), outch, 1, 0, lname + ".conv3.weight", lname + ".bn3", lname + "_3");

    // Project the shortcut only when the channel count changes.
    ITensor *shortcut = &input;
    if (inch != outch)
    {
        IScaleLayer *bn4 = convBn(input, outch, 1, 0, lname + ".downsample.0.weight", lname + ".downsample.1", lname + "_4");
        shortcut = bn4->getOutput(0);
    }

    IElementWiseLayer *sum = network->addElementWise(*shortcut, *bn3->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer *act3 = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(act3);
    return act3;
}

// Adds a three-convolution residual block with an identity shortcut.
//
// Main path: 1x1 conv (outch) -> BN -> ReLU -> 3x3 conv (outch, pad 1) -> BN -> ReLU
//            -> 1x1 conv (inch) -> BN.
// Every convolution is bias-free and uses `stride`. `input` is added to the main path
// output and the sum goes through a final ReLU, which is returned.
IActivationLayer *ResBlock(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, int inch, int outch, int stride, std::string lname)
{
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    // Stage `idx`: bias-free conv `<lname>.conv<idx>` with the block stride, shape print
    // tagged `<lname>_<idx>`, then batch norm `<lname>.bn<idx>` (eps 1e-5).
    auto convBn = [&](ITensor &src, int channels, int kernel, int pad, const char *idx) -> IScaleLayer * {
        IConvolutionLayer *conv = network->addConvolutionNd(src, channels, DimsHW{kernel, kernel}, weightMap[lname + ".conv" + idx + ".weight"], noBias);
        assert(conv);
        conv->setStrideNd(DimsHW{stride, stride});
        conv->setPaddingNd(DimsHW{pad, pad});
        debug_print(conv->getOutput(0), lname + "_" + idx);
        return addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn" + idx, 1e-5);
    };

    IScaleLayer *bn1 = convBn(input, outch, 1, 0, "1");
    IActivationLayer *act1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(act1);

    IScaleLayer *bn2 = convBn(*act1->getOutput(0), outch, 3, 1, "2");
    IActivationLayer *act2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(act2);

    IScaleLayer *bn3 = convBn(*act2->getOutput(0), inch, 1, 0, "3");

    IElementWiseLayer *sum = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer *act3 = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(act3);
    return act3;
}

// Adds a two-convolution residual block with an identity shortcut.
//
// Main path: 3x3 conv (outch, stride 1, pad 1) -> BN -> ReLU -> 3x3 conv (outch, stride 1,
//            pad 1) -> BN, all bias-free.
// `input` is added to the main path output, the sum's shape is printed with tag
// `<lname>_add`, and the final ReLU layer is returned.
IActivationLayer *liteResBlock(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, int outch, std::string lname)
{
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    // Stage `idx`: bias-free 3x3 conv `<lname>.conv<idx>`, shape print tagged
    // `<lname>_<idx>`, then batch norm `<lname>.bn<idx>` (eps 1e-5).
    auto convBn = [&](ITensor &src, const char *idx) -> IScaleLayer * {
        IConvolutionLayer *conv = network->addConvolutionNd(src, outch, DimsHW{3, 3}, weightMap[lname + ".conv" + idx + ".weight"], noBias);
        assert(conv);
        conv->setStrideNd(DimsHW{1, 1});
        conv->setPaddingNd(DimsHW{1, 1});
        debug_print(conv->getOutput(0), lname + "_" + idx);
        return addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn" + idx, 1e-5);
    };

    IScaleLayer *bn1 = convBn(input, "1");
    IActivationLayer *act1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(act1);

    IScaleLayer *bn2 = convBn(*act1->getOutput(0), "2");

    IElementWiseLayer *sum = network->addElementWise(input, *bn2->getOutput(0), ElementWiseOperation::kSUM);
    debug_print(sum->getOutput(0), lname + "_add");
    IActivationLayer *actOut = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(actOut);
    return actOut;
}

// Adds convolution -> batch norm (eps 1e-5) -> ReLU to `network` and returns the ReLU layer.
//
// The convolution uses a ksize x ksize kernel, stride `s`, padding `p`, `outch` output
// channels and the weights `<convname>.weight`; `<convname>.bias` is used only when `bias`
// is true. The conv output shape is printed with tag `convname`, the ReLU output shape
// with tag `<convname>_add`.
// NOTE(review): `addinput` is never read, so no element-wise add happens despite the
// function name - confirm whether that is intended.
ILayer *convBnAddRelu(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, ITensor &addinput, int outch, int ksize, int s, int p, std::string convname, std::string bnname, bool bias = false)
{
    Weights noBias{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer *conv = bias
        ? network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], weightMap[convname + ".bias"])
        : network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});
    debug_print(conv->getOutput(0), convname);

    IScaleLayer *bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), bnname, 1e-5);
    auto act = network->addActivation(*bn->getOutput(0), ActivationType::kRELU);
    debug_print(act->getOutput(0), convname + "_add");
    return act;
}

// Adds a resize layer on `input` configured for linear interpolation, an explicit output
// shape of `outdims`, and align-corners enabled. Returns the resize layer.
ILayer *netAddUpsampleBi(INetworkDefinition *network, ITensor *input, Dims outdims)
{
    IResizeLayer *resize = network->addResize(*input);
    // Linear mode with align-corners on, as the "Bi + True" setup of the original.
    resize->setResizeMode(ResizeMode::kLINEAR);
    resize->setOutputDimensions(outdims);
    resize->setAlignCorners(true);
    return resize;
}

// Adds convolution -> batch norm (eps 1e-5) -> optional bilinear upsample -> element-wise
// sum with `addinput`, and returns the sum layer.
//
// The convolution uses a ksize x ksize kernel, stride `s`, padding `p`, `outch` output
// channels and the weights `<convname>.weight`; `<convname>.bias` is used only when `bias`
// is true. When `upsample` is true the batch norm output is resized to the dimensions of
// `addinput` before the sum. The conv output shape is printed with tag `<convname>_1` and
// the sum's shape with tag `<convname>_add` in both modes.
IElementWiseLayer *convBnUpAdd(INetworkDefinition *network,
                               std::map<std::string, Weights> &weightMap,
                               ITensor &input, ITensor &addinput,
                               int outch, int ksize, int s, int p,
                               std::string convname,
                               std::string bnname, bool upsample, bool bias = false)
{
    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer *conv1;
    if (!bias)
    {
        conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], emptywts);
    }
    else
    {
        conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], weightMap[convname + ".bias"]);
    }
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{p, p});
    debug_print(conv1->getOutput(0), convname + "_1");
    IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), bnname, 1e-5);

    // Tensor that gets summed with addinput: the BN output, resized first if requested.
    ITensor *branch = bn1->getOutput(0);
    if (upsample)
    {
        ILayer *up = netAddUpsampleBi(network, branch, addinput.getDimensions());
        branch = up->getOutput(0);
    }

    IElementWiseLayer *add = network->addElementWise(*branch, addinput, ElementWiseOperation::kSUM);
    // Previously the upsample path re-printed the conv output here instead of the sum.
    debug_print(add->getOutput(0), convname + "_add");
    return add;
}


================================================
FILE: hrnet/hrnet-semantic-segmentation/gen_wts.py
================================================
import argparse
import struct

import _init_paths
import models
import torch
from config import config, update_config


def parse_args():
    """Parse the command line and push the result into the shared ``config``.

    Recognised options:
      --cfg        experiment configuration file (optional for argparse)
      --ckpt_path  checkpoint to read (required)
      --save_path  destination ``.wts`` file (required)
      opts         every remaining token, collected verbatim

    After parsing, ``update_config(config, args)`` is called with the parsed
    namespace; what it changes on ``config`` is defined in the project's
    ``config`` module and is not visible here.

    Returns:
        The ``argparse.Namespace`` produced by the parser.
    """
    # NOTE(review): the description text talks about training a keypoints
    # network although this script only exports weights; left untouched.
    cli = argparse.ArgumentParser(description="Train keypoints network")

    cli.add_argument("--cfg", type=str, help="experiment configure file name")

    # The two path options are both mandatory strings.
    for flag, text in (("--ckpt_path", "checkpoint path"), ("--save_path", ".wts path")):
        cli.add_argument(flag, type=str, required=True, help=text)

    # Everything left over on the command line ends up in ``opts``.
    cli.add_argument(
        "opts",
        nargs=argparse.REMAINDER,
        default=None,
        help="Modify config options using the command-line",
    )

    parsed = cli.parse_args()
    update_config(config, parsed)
    return parsed


def main():
    """Load a checkpoint into the configured model and dump it as a ``.wts`` file.

    Steps:
      1. Parse the command line (which also updates the global ``config``).
      2. Build the model via ``models.<config.MODEL.NAME>.get_seg_model(config)``.
      3. Load the checkpoint on CPU, drop the first 6 characters of every key
         and keep only the entries whose shortened key exists in the model.
      4. Write a text file: first line is the number of state-dict entries,
         then one line per entry of the form
         ``<name> <count>  <hex> <hex> ...`` where each value is the
         big-endian float32 encoding of the element, hex-encoded.
    """
    args = parse_args()

    # SECURITY NOTE(review): eval() executes a string built from
    # config.MODEL.NAME. That is only safe while the config file and the
    # command-line overrides come from a trusted source.
    model = eval("models." + config.MODEL.NAME + ".get_seg_model")(config)

    print("=> loading model from {}".format(args.ckpt_path))
    pretrained_dict = torch.load(args.ckpt_path, map_location="cpu")
    model_dict = model.state_dict()
    # k[6:] strips a fixed 6-character prefix from every checkpoint key
    # (presumably "model." -- confirm against the checkpoint layout).
    pretrained_dict = {
        k[6:]: v for k, v in pretrained_dict.items() if k[6:] in model_dict
    }
    for k in pretrained_dict:
        print("=> loading {} from pretrained model".format(k))
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)

    print("=> saving {} ".format(args.save_path))
    # Snapshot the state dict once instead of rebuilding it for the count and
    # again for the iteration.
    state = model.state_dict()
    # The context manager guarantees the file is closed even if a tensor
    # conversion or a write raises part-way through.
    with open(args.save_path, "w") as f:
        f.write("{}\n".format(len(state)))
        for k, v in state.items():
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {} ".format(k, len(vr)))
            # Each value: leading space + hex of its big-endian float32 bytes.
            f.write("".join(" " + struct.pack(">f", float(vv)).hex() for vv in vr))
            f.write("\n")


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


================================================
FILE: hrnet/hrnet-semantic-segmentation/hrnet.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "common.hpp"
#include "logging.h"

// File-local logger instance (Logger is declared outside this file, presumably in logging.h).
static Logger gLogger;
// NOTE(review): precision-selection macro; the code that tests it is not visible in this part of the file.
#define USE_FP32
#define DEVICE 0 // GPU id
// NOTE(review): presumably the batch size used when building/running the engine -- confirm at the use sites.
#define BATCH_SIZE 1

// Name under which createEngine registers the network input tensor.
const char *INPUT_BLOB_NAME = "data";
// NOTE(review): presumably the name given to the network output tensor -- its use is outside the visible code.
const char *OUTPUT_BLOB_NAME = "output";
// Input height and width; createEngine declares the input as {INPUT_H, INPUT_W, 3}.
static const int INPUT_H = 512;
static const int INPUT_W = 1024;
// NOTE(review): number of segmentation classes, presumably dataset-specific -- confirm against the trained model.
static const int NUM_CLASSES = 19;
// One element per input pixel (INPUT_H * INPUT_W).
static const int OUTPUT_SIZE = INPUT_H * INPUT_W;

// Create the engine using only the API and not any parser.
ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, std::string wtsPath, int width)
{
    INetworkDefinition *network = builder->createNetworkV2(0U);
    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{INPUT_H, INPUT_W, 3});
    assert(data);

    // hwc to chw
    auto ps = network->addShuffle(*data);
    ps->setFirstTranspose(nvinfer1::Permutation{2, 0, 1});
    float mean[3] = {0.485, 0.456, 0.406};
    float std[3] = {0.229, 0.224, 0.225};
    ITensor *preinput = MeanStd(network, ps->getOutput(0), mean, std, true);

    std::map<std::string, Weights> weightMap = loadWeights(wtsPath);
    auto relu_2 = convBnRelu(network, weightMap, *preinput, 64, 3, 2, 1, "conv1", "bn1");
    auto relu_5 = convBnRelu(network, weightMap, *relu_2->getOutput(0), 64, 3, 2, 1, "conv2", "bn2");
    auto relu_17 = ResBlock2Conv(network, weightMap, *relu_5->getOutput(0), 64, 256, 1, "layer1.0");
    auto relu_27 = ResBlock(network, weightMap, *relu_17->getOutput(0), 256, 64, 1, "layer1.1");
    auto relu_37 = ResBlock(network, weightMap, *relu_27->getOutput(0), 256, 64, 1, "layer1.2");
    auto relu_47 = ResBlock(network, weightMap, *relu_37->getOutput(0), 256, 64, 1, "layer1.3");

    auto relu_50 = convBnRelu(network, weightMap, *relu_47->getOutput(0), width, 3, 1, 1, "transition1.0.0", "transition1.0.1");
    auto relu_60 = liteResBlock(network, weightMap, *relu_50->getOutput(0), width, "stage2.0.branches.0.0");
    auto relu_67 = liteResBlock(network, weightMap, *relu_60->getOutput(0), width, "stage2.0.branches.0.1");
    auto relu_74 = liteResBlock(network, weightMap, *relu_67->getOutput(0), width, "stage2.0.branches.0.2");
    auto relu_81 = liteResBlock(network, weightMap, *relu_74->getOutput(0), width, "stage2.0.branches.0.3");

    auto relu_53 = convBnRelu(network, weightMap, *relu_47->getOutput(0), width * 2, 3, 2, 1, "transition1.1.0.0", "transition1.1.0.1");
    auto relu_88 = liteResBlock(network, weightMap, *relu_53->getOutput(0), width * 2, "stage2.0.branches.1.0");
    auto relu_95 = liteResBlock(network, weightMap, *relu_88->getOutput(0), width * 2, "stage2.0.branches.1.1");
    auto relu_102 = liteResBlock(network, weightMap, *relu_95->getOutput(0), width * 2, "stage2.0.branches.1.2");
    auto relu_109 = liteResBlock(network, weightMap, *relu_102->getOutput(0), width * 2, "stage2.0.branches.1.3");

    auto add_131 = convBnUpAdd(network, weightMap, *relu_109->getOutput(0), *relu_81->getOutput(0), width, 1, 1, 0, "stage2.0.fuse_layers.0.1.0", "stage2.0.fuse_layers.0.1.1", true);
    auto relu_132 = network->addActivation(*add_131->getOutput(0), ActivationType::kRELU);

    auto add_135 = convBnUpAdd(network, weightMap, *relu_81->getOutput(0), *relu_109->getOutput(0), width * 2, 3, 2, 1, "stage2.0.fuse_layers.1.0.0.0", "stage2.0.fuse_layers.1.0.0.1", false);
    auto relu_136 = network->addActivation(*add_135->getOutput(0), ActivationType::kRELU);

    auto relu_146 = liteResBlock(network, weightMap, *relu_132->getOutput(0), width, "stage3.0.branches.0.0");
    auto relu_153 = liteResBlock(network, weightMap, *relu_146->getOutput(0), width, "stage3.0.branches.0.1");
    auto relu_160 = liteResBlock(network, weightMap, *relu_153->getOutput(0), width, "stage3.0.branches.0.2");
    auto relu_167 = liteResBlock(network, weightMap, *relu_160->getOutput(0), width, "stage3.0.branches.0.3");

    auto relu_174 = liteResBlock(network, weightMap, *relu_136->getOutput(0), width * 2, "stage3.0.branches.1.0");
    auto relu_181 = liteResBlock(network, weightMap, *relu_174->getOutput(0), width * 2, "stage3.0.branches.1.1");
    auto relu_188 = liteResBlock(network, weightMap, *relu_181->getOutput(0), width * 2, "stage3.0.branches.1.2");
    auto relu_195 = liteResBlock(network, weightMap, *relu_188->getOutput(0), width * 2, "stage3.0.branches.1.3");

    auto relu_139 = convBnRelu(network, weightMap, *relu_136->getOutput(0), width * 4, 3, 2, 1, "transition2.2.0.0", "transition2.2.0.1");
    auto relu_202 = liteResBlock(network, weightMap, *relu_139->getOutput(0), width * 4, "stage3.0.branches.2.0");
    auto relu_209 = liteResBlock(network, weightMap, *relu_202->getOutput(0), width * 4, "stage3.0.branches.2.1");
    auto relu_216 = liteResBlock(network, weightMap, *relu_209->getOutput(0), width * 4, "stage3.0.branches.2.2");
    auto relu_223 = liteResBlock(network, weightMap, *relu_216->getOutput(0), width * 4, "stage3.0.branches.2.3");

    auto add_245 = convBnUpAdd(network, weightMap, *relu_195->getOutput(0), *relu_167->getOutput(0), width, 1, 1, 0, "stage3.0.fuse_layers.0.1.0", "stage3.0.fuse_layers.0.1.1", true);
    auto add_267 = convBnUpAdd(network, weightMap, *relu_223->getOutput(0), *add_245->getOutput(0), width, 1, 1, 0, "stage3.0.fuse_layers.0.2.0", "stage3.0.fuse_layers.0.2.1", true);
    auto relu_268 = network->addActivation(*add_267->getOutput(0), ActivationType::kRELU);

    auto add_271 = convBnUpAdd(network, weightMap, *relu_167->getOutput(0), *relu_195->getOutput(0), width * 2, 3, 2, 1, "stage3.0.fuse_layers.1.0.0.0", "stage3.0.fuse_layers.1.0.0.1", false);
    auto add_293 = convBnUpAdd(network, weightMap, *relu_223->getOutput(0), *add_271->getOutput(0), width * 2, 1, 1, 0, "stage3.0.fuse_layers.1.2.0", "stage3.0.fuse_layers.1.2.1", true);
    auto relu_294 = network->addActivation(*add_293->getOutput(0), ActivationType::kRELU);

    auto relu_297 = convBnRelu(network, weightMap, *relu_167->getOutput(0), width, 3, 2, 1, "stage3.0.fuse_layers.2.0.0.0", "stage3.0.fuse_layers.2.0.0.1");
    auto bn_299 = convBnRelu(network, weightMap, *relu_297->getOutput(0), width * 4, 3, 2, 1, "stage3.0.fuse_layers.2.0.1.0", "stage3.0.fuse_layers.2.0.1.1", false);
    auto add_302 = convBnUpAdd(network, weightMap, *relu_195->getOutput(0), *bn_299->getOutput(0), width * 4, 3, 2, 1, "stage3.0.fuse_layers.2.1.0.0", "stage3.0.fuse_layers.2.1.0.1", false);
    auto add_303 = network->addElementWise(*add_302->getOutput(0), *relu_223->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_304 = network->addActivation(*add_303->getOutput(0), ActivationType::kRELU);

    auto relu_311 = liteResBlock(network, weightMap, *relu_268->getOutput(0), width, "stage3.1.branches.0.0");
    auto relu_318 = liteResBlock(network, weightMap, *relu_311->getOutput(0), width, "stage3.1.branches.0.1");
    auto relu_325 = liteResBlock(network, weightMap, *relu_318->getOutput(0), width, "stage3.1.branches.0.2");
    auto relu_332 = liteResBlock(network, weightMap, *relu_325->getOutput(0), width, "stage3.1.branches.0.3");

    auto relu_339 = liteResBlock(network, weightMap, *relu_294->getOutput(0), width * 2, "stage3.1.branches.1.0");
    auto relu_346 = liteResBlock(network, weightMap, *relu_339->getOutput(0), width * 2, "stage3.1.branches.1.1");
    auto relu_353 = liteResBlock(network, weightMap, *relu_346->getOutput(0), width * 2, "stage3.1.branches.1.2");
    auto relu_360 = liteResBlock(network, weightMap, *relu_353->getOutput(0), width * 2, "stage3.1.branches.1.3");

    auto relu_367 = liteResBlock(network, weightMap, *relu_304->getOutput(0), width * 4, "stage3.1.branches.2.0");
    auto relu_374 = liteResBlock(network, weightMap, *relu_367->getOutput(0), width * 4, "stage3.1.branches.2.1");
    auto relu_381 = liteResBlock(network, weightMap, *relu_374->getOutput(0), width * 4, "stage3.1.branches.2.2");
    auto relu_388 = liteResBlock(network, weightMap, *relu_381->getOutput(0), width * 4, "stage3.1.branches.2.3");

    auto add_410 = convBnUpAdd(network, weightMap, *relu_360->getOutput(0), *relu_332->getOutput(0), width, 1, 1, 0, "stage3.1.fuse_layers.0.1.0", "stage3.1.fuse_layers.0.1.1", true);
    auto add_432 = convBnUpAdd(network, weightMap, *relu_388->getOutput(0), *add_410->getOutput(0), width, 1, 1, 0, "stage3.1.fuse_layers.0.2.0", "stage3.1.fuse_layers.0.2.1", true);
    auto relu_433 = network->addActivation(*add_432->getOutput(0), ActivationType::kRELU);

    auto add_436 = convBnUpAdd(network, weightMap, *relu_332->getOutput(0), *relu_360->getOutput(0), width * 2, 3, 2, 1, "stage3.1.fuse_layers.1.0.0.0", "stage3.1.fuse_layers.1.0.0.1", false);
    auto add_458 = convBnUpAdd(network, weightMap, *relu_388->getOutput(0), *add_436->getOutput(0), width * 2, 1, 1, 0, "stage3.1.fuse_layers.1.2.0", "stage3.1.fuse_layers.1.2.1", true);
    auto relu_459 = network->addActivation(*add_458->getOutput(0), ActivationType::kRELU);

    auto relu_462 = convBnRelu(network, weightMap, *relu_332->getOutput(0), width, 3, 2, 1, "stage3.1.fuse_layers.2.0.0.0", "stage3.1.fuse_layers.2.0.0.1");
    auto bn_464 = convBnRelu(network, weightMap, *relu_462->getOutput(0), width * 4, 3, 2, 1, "stage3.1.fuse_layers.2.0.1.0", "stage3.1.fuse_layers.2.0.1.1", false);
    auto add_467 = convBnUpAdd(network, weightMap, *relu_360->getOutput(0), *bn_464->getOutput(0), width * 4, 3, 2, 1, "stage3.1.fuse_layers.2.1.0.0", "stage3.1.fuse_layers.2.1.0.1", false);
    auto add_468 = network->addElementWise(*add_467->getOutput(0), *relu_388->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_469 = network->addActivation(*add_468->getOutput(0), ActivationType::kRELU);

    auto relu_476 = liteResBlock(network, weightMap, *relu_433->getOutput(0), width, "stage3.2.branches.0.0");
    auto relu_483 = liteResBlock(network, weightMap, *relu_476->getOutput(0), width, "stage3.2.branches.0.1");
    auto relu_490 = liteResBlock(network, weightMap, *relu_483->getOutput(0), width, "stage3.2.branches.0.2");
    auto relu_497 = liteResBlock(network, weightMap, *relu_490->getOutput(0), width, "stage3.2.branches.0.3");

    auto relu_504 = liteResBlock(network, weightMap, *relu_459->getOutput(0), width * 2, "stage3.2.branches.1.0");
    auto relu_511 = liteResBlock(network, weightMap, *relu_504->getOutput(0), width * 2, "stage3.2.branches.1.1");
    auto relu_518 = liteResBlock(network, weightMap, *relu_511->getOutput(0), width * 2, "stage3.2.branches.1.2");
    auto relu_525 = liteResBlock(network, weightMap, *relu_518->getOutput(0), width * 2, "stage3.2.branches.1.3");

    auto relu_532 = liteResBlock(network, weightMap, *relu_469->getOutput(0), width * 4, "stage3.2.branches.2.0");
    auto relu_539 = liteResBlock(network, weightMap, *relu_532->getOutput(0), width * 4, "stage3.2.branches.2.1");
    auto relu_546 = liteResBlock(network, weightMap, *relu_539->getOutput(0), width * 4, "stage3.2.branches.2.2");
    auto relu_553 = liteResBlock(network, weightMap, *relu_546->getOutput(0), width * 4, "stage3.2.branches.2.3");

    auto add_575 = convBnUpAdd(network, weightMap, *relu_525->getOutput(0), *relu_497->getOutput(0), width, 1, 1, 0, "stage3.2.fuse_layers.0.1.0", "stage3.2.fuse_layers.0.1.1", true);
    auto add_597 = convBnUpAdd(network, weightMap, *relu_553->getOutput(0), *add_575->getOutput(0), width, 1, 1, 0, "stage3.2.fuse_layers.0.2.0", "stage3.2.fuse_layers.0.2.1", true);

    auto relu_598 = network->addActivation(*add_597->getOutput(0), ActivationType::kRELU);

    auto add_601 = convBnUpAdd(network, weightMap, *relu_497->getOutput(0), *relu_525->getOutput(0), width * 2, 3, 2, 1, "stage3.2.fuse_layers.1.0.0.0", "stage3.2.fuse_layers.1.0.0.1", false);
    auto add_623 = convBnUpAdd(network, weightMap, *relu_553->getOutput(0), *add_601->getOutput(0), width * 2, 1, 1, 0, "stage3.2.fuse_layers.1.2.0", "stage3.2.fuse_layers.1.2.1", true);
    auto relu_624 = network->addActivation(*add_623->getOutput(0), ActivationType::kRELU);

    auto relu_627 = convBnRelu(network, weightMap, *relu_497->getOutput(0), width, 3, 2, 1, "stage3.2.fuse_layers.2.0.0.0", "stage3.2.fuse_layers.2.0.0.1");
    auto bn_629 = convBnRelu(network, weightMap, *relu_627->getOutput(0), width * 4, 3, 2, 1, "stage3.2.fuse_layers.2.0.1.0", "stage3.2.fuse_layers.2.0.1.1", false);
    auto add_632 = convBnUpAdd(network, weightMap, *relu_525->getOutput(0), *bn_629->getOutput(0), width * 4, 3, 2, 1, "stage3.2.fuse_layers.2.1.0.0", "stage3.2.fuse_layers.2.1.0.1", false);
    auto add_633 = network->addElementWise(*relu_553->getOutput(0), *add_632->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_634 = network->addActivation(*add_633->getOutput(0), ActivationType::kRELU);

    auto relu_641 = liteResBlock(network, weightMap, *relu_598->getOutput(0), width, "stage3.3.branches.0.0");
    auto relu_648 = liteResBlock(network, weightMap, *relu_641->getOutput(0), width, "stage3.3.branches.0.1");
    auto relu_655 = liteResBlock(network, weightMap, *relu_648->getOutput(0), width, "stage3.3.branches.0.2");
    auto relu_662 = liteResBlock(network, weightMap, *relu_655->getOutput(0), width, "stage3.3.branches.0.3");

    auto relu_669 = liteResBlock(network, weightMap, *relu_624->getOutput(0), width * 2, "stage3.3.branches.1.0");
    auto relu_676 = liteResBlock(network, weightMap, *relu_669->getOutput(0), width * 2, "stage3.3.branches.1.1");
    auto relu_683 = liteResBlock(network, weightMap, *relu_676->getOutput(0), width * 2, "stage3.3.branches.1.2");
    auto relu_690 = liteResBlock(network, weightMap, *relu_683->getOutput(0), width * 2, "stage3.3.branches.1.3");

    auto relu_697 = liteResBlock(network, weightMap, *relu_634->getOutput(0), width * 4, "stage3.3.branches.2.0");
    auto relu_704 = liteResBlock(network, weightMap, *relu_697->getOutput(0), width * 4, "stage3.3.branches.2.1");
    auto relu_711 = liteResBlock(network, weightMap, *relu_704->getOutput(0), width * 4, "stage3.3.branches.2.2");
    auto relu_718 = liteResBlock(network, weightMap, *relu_711->getOutput(0), width * 4, "stage3.3.branches.2.3");

    auto add_740 = convBnUpAdd(network, weightMap, *relu_690->getOutput(0), *relu_662->getOutput(0), width, 1, 1, 0, "stage3.3.fuse_layers.0.1.0", "stage3.3.fuse_layers.0.1.1", true);
    auto add_762 = convBnUpAdd(network, weightMap, *relu_718->getOutput(0), *add_740->getOutput(0), width, 1, 1, 0, "stage3.3.fuse_layers.0.2.0", "stage3.3.fuse_layers.0.2.1", true);
    auto relu_763 = network->addActivation(*add_762->getOutput(0), ActivationType::kRELU);

    auto add_766 = convBnUpAdd(network, weightMap, *relu_662->getOutput(0), *relu_690->getOutput(0), width * 2, 3, 2, 1, "stage3.3.fuse_layers.1.0.0.0", "stage3.3.fuse_layers.1.0.0.1", false);
    auto add_788 = convBnUpAdd(network, weightMap, *relu_718->getOutput(0), *add_766->getOutput(0), width * 2, 1, 1, 0, "stage3.3.fuse_layers.1.2.0", "stage3.3.fuse_layers.1.2.1", true);
    auto relu_789 = network->addActivation(*add_788->getOutput(0), ActivationType::kRELU);

    auto relu_792 = convBnRelu(network, weightMap, *relu_662->getOutput(0), width, 3, 2, 1, "stage3.3.fuse_layers.2.0.0.0", "stage3.3.fuse_layers.2.0.0.1");
    auto bn_794 = convBnRelu(network, weightMap, *relu_792->getOutput(0), width * 4, 3, 2, 1, "stage3.3.fuse_layers.2.0.1.0", "stage3.3.fuse_layers.2.0.1.1", false);
    auto add_797 = convBnUpAdd(network, weightMap, *relu_690->getOutput(0), *bn_794->getOutput(0), width * 4, 3, 2, 1, "stage3.3.fuse_layers.2.1.0.0", "stage3.3.fuse_layers.2.1.0.1", false);
    auto add_798 = network->addElementWise(*relu_718->getOutput(0), *add_797->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_799 = network->addActivation(*add_798->getOutput(0), ActivationType::kRELU);

    auto relu_809 = liteResBlock(network, weightMap, *relu_763->getOutput(0), width, "stage4.0.branches.0.0");
    auto relu_816 = liteResBlock(network, weightMap, *relu_809->getOutput(0), width, "stage4.0.branches.0.1");
    auto relu_823 = liteResBlock(network, weightMap, *relu_816->getOutput(0), width, "stage4.0.branches.0.2");
    auto relu_830 = liteResBlock(network, weightMap, *relu_823->getOutput(0), width, "stage4.0.branches.0.3");

    auto relu_837 = liteResBlock(network, weightMap, *relu_789->getOutput(0), width * 2, "stage4.0.branches.1.0");
    auto relu_844 = liteResBlock(network, weightMap, *relu_837->getOutput(0), width * 2, "stage4.0.branches.1.1");
    auto relu_851 = liteResBlock(network, weightMap, *relu_844->getOutput(0), width * 2, "stage4.0.branches.1.2");
    auto relu_858 = liteResBlock(network, weightMap, *relu_851->getOutput(0), width * 2, "stage4.0.branches.1.3");

    auto relu_865 = liteResBlock(network, weightMap, *relu_799->getOutput(0), width * 4, "stage4.0.branches.2.0");
    auto relu_872 = liteResBlock(network, weightMap, *relu_865->getOutput(0), width * 4, "stage4.0.branches.2.1");
    auto relu_879 = liteResBlock(network, weightMap, *relu_872->getOutput(0), width * 4, "stage4.0.branches.2.2");
    auto relu_886 = liteResBlock(network, weightMap, *relu_879->getOutput(0), width * 4, "stage4.0.branches.2.3"); //========

    auto relu_802 = convBnRelu(network, weightMap, *relu_799->getOutput(0), width * 8, 3, 2, 1, "transition3.3.0.0", "transition3.3.0.1");
    auto relu_893 = liteResBlock(network, weightMap, *relu_802->getOutput(0), width * 8, "stage4.0.branches.3.0");
    auto relu_900 = liteResBlock(network, weightMap, *relu_893->getOutput(0), width * 8, "stage4.0.branches.3.1");
    auto relu_907 = liteResBlock(network, weightMap, *relu_900->getOutput(0), width * 8, "stage4.0.branches.3.2");
    auto relu_914 = liteResBlock(network, weightMap, *relu_907->getOutput(0), width * 8, "stage4.0.branches.3.3");

    auto add_936 = convBnUpAdd(network, weightMap, *relu_858->getOutput(0), *relu_830->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.1.0", "stage4.0.fuse_layers.0.1.1", true);
    auto add_958 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_936->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.2.0", "stage4.0.fuse_layers.0.2.1", true);
    auto add_980 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_958->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.3.0", "stage4.0.fuse_layers.0.3.1", true);
    auto relu_981 = network->addActivation(*add_980->getOutput(0), ActivationType::kRELU);

    auto add_984 = convBnUpAdd(network, weightMap, *relu_830->getOutput(0), *relu_858->getOutput(0), width * 2, 3, 2, 1, "stage4.0.fuse_layers.1.0.0.0", "stage4.0.fuse_layers.1.0.0.1", false);
    auto add_1006 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_984->getOutput(0), width * 2, 1, 1, 0, "stage4.0.fuse_layers.1.2.0", "stage4.0.fuse_layers.1.2.1", true);
    auto add_1028 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_1006->getOutput(0), width * 2, 1, 1, 0, "stage4.0.fuse_layers.1.3.0", "stage4.0.fuse_layers.1.3.1", true);
    auto relu_1029 = network->addActivation(*add_1028->getOutput(0), ActivationType::kRELU);

    auto relu_1032 = convBnRelu(network, weightMap, *relu_830->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.2.0.0.0", "stage4.0.fuse_layers.2.0.0.1");
    auto bn_1034 = convBnRelu(network, weightMap, *relu_1032->getOutput(0), width * 4, 3, 2, 1, "stage4.0.fuse_layers.2.0.1.0", "stage4.0.fuse_layers.2.0.1.1", false);

    auto add_1037 = convBnUpAdd(network, weightMap, *relu_858->getOutput(0), *bn_1034->getOutput(0), width * 4, 3, 2, 1,
                                "stage4.0.fuse_layers.2.1.0.0", "stage4.0.fuse_layers.2.1.0.1", false);
    auto add_1038 = network->addElementWise(*relu_886->getOutput(0), *add_1037->getOutput(0), ElementWiseOperation::kSUM);
    auto add_1060 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_1038->getOutput(0), width * 4, 1, 1, 0,
                                "stage4.0.fuse_layers.2.3.0", "stage4.0.fuse_layers.2.3.1", true);
    auto relu_1061 = network->addActivation(*add_1060->getOutput(0), ActivationType::kRELU);

    auto relu_1064 = convBnRelu(network, weightMap, *relu_830->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.3.0.0.0", "stage4.0.fuse_layers.3.0.0.1");
    auto relu_1067 = convBnRelu(network, weightMap, *relu_1064->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.3.0.1.0", "stage4.0.fuse_layers.3.0.1.1");
    auto bn_1069 = convBnRelu(network, weightMap, *relu_1067->getOutput(0), width * 8, 3, 2, 1, "stage4.0.fuse_layers.3.0.2.0", "stage4.0.fuse_layers.3.0.2.1", false);
    auto relu_1072 = convBnRelu(network, weightMap, *relu_858->getOutput(0), width * 2, 3, 2, 1, "stage4.0.fuse_layers.3.1.0.0", "stage4.0.fuse_layers.3.1.0.1");
    auto add_1075 = convBnUpAdd(network, weightMap, *relu_1072->getOutput(0), *bn_1069->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.0.fuse_layers.3.1.1.0", "stage4.0.fuse_layers.3.1.1.1", false);
    auto add_1078 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_1075->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.0.fuse_layers.3.2.0.0", "stage4.0.fuse_layers.3.2.0.1", false);
    auto add_1079 = network->addElementWise(*relu_914->getOutput(0), *add_1078->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_1080 = network->addActivation(*add_1079->getOutput(0), ActivationType::kRELU);

    auto relu_1087 = liteResBlock(network, weightMap, *relu_981->getOutput(0), width, "stage4.1.branches.0.0");
    auto relu_1094 = liteResBlock(network, weightMap, *relu_1087->getOutput(0), width, "stage4.1.branches.0.1");
    auto relu_1101 = liteResBlock(network, weightMap, *relu_1094->getOutput(0), width, "stage4.1.branches.0.2");
    auto relu_1108 = liteResBlock(network, weightMap, *relu_1101->getOutput(0), width, "stage4.1.branches.0.3");

    auto relu_1115 = liteResBlock(network, weightMap, *relu_1029->getOutput(0), width * 2, "stage4.1.branches.1.0");
    auto relu_1122 = liteResBlock(network, weightMap, *relu_1115->getOutput(0), width * 2, "stage4.1.branches.1.1");
    auto relu_1129 = liteResBlock(network, weightMap, *relu_1122->getOutput(0), width * 2, "stage4.1.branches.1.2");
    auto relu_1136 = liteResBlock(network, weightMap, *relu_1129->getOutput(0), width * 2, "stage4.1.branches.1.3");

    auto relu_1143 = liteResBlock(network, weightMap, *relu_1061->getOutput(0), width * 4, "stage4.1.branches.2.0");
    auto relu_1150 = liteResBlock(network, weightMap, *relu_1143->getOutput(0), width * 4, "stage4.1.branches.2.1");
    auto relu_1157 = liteResBlock(network, weightMap, *relu_1150->getOutput(0), width * 4, "stage4.1.branches.2.2");
    auto relu_1164 = liteResBlock(network, weightMap, *relu_1157->getOutput(0), width * 4, "stage4.1.branches.2.3");

    auto relu_1171 = liteResBlock(network, weightMap, *relu_1080->getOutput(0), width * 8, "stage4.1.branches.3.0");
    auto relu_1178 = liteResBlock(network, weightMap, *relu_1171->getOutput(0), width * 8, "stage4.1.branches.3.1");
    auto relu_1185 = liteResBlock(network, weightMap, *relu_1178->getOutput(0), width * 8, "stage4.1.branches.3.2");
    auto relu_1192 = liteResBlock(network, weightMap, *relu_1185->getOutput(0), width * 8, "stage4.1.branches.3.3");

    auto add_1214 = convBnUpAdd(network, weightMap, *relu_1136->getOutput(0), *relu_1108->getOutput(0), width, 1, 1, 0,
                                "stage4.1.fuse_layers.0.1.0", "stage4.1.fuse_layers.0.1.1", true);
    auto add_1236 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1214->getOutput(0), width, 1, 1, 0,
                                "stage4.1.fuse_layers.0.2.0", "stage4.1.fuse_layers.0.2.1", true);
    auto add_1258 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1236->getOutput(0), width, 1, 1, 0,
                                "stage4.1.fuse_layers.0.3.0", "stage4.1.fuse_layers.0.3.1", true);
    auto relu_1259 = network->addActivation(*add_1258->getOutput(0), ActivationType::kRELU);

    auto add_1262 = convBnUpAdd(network, weightMap, *relu_1108->getOutput(0), *relu_1136->getOutput(0), width * 2, 3, 2, 1,
                                "stage4.1.fuse_layers.1.0.0.0", "stage4.1.fuse_layers.1.0.0.1", false);
    auto add_1284 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1262->getOutput(0), width * 2, 1, 1, 0,
                                "stage4.1.fuse_layers.1.2.0", "stage4.1.fuse_layers.1.2.1", true);
    auto add_1306 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1284->getOutput(0), width * 2, 1, 1, 0,
                                "stage4.1.fuse_layers.1.3.0", "stage4.1.fuse_layers.1.3.1", true);
    auto relu_1307 = network->addActivation(*add_1306->getOutput(0), ActivationType::kRELU);

    auto relu_1310 = convBnRelu(network, weightMap, *relu_1108->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.2.0.0.0", "stage4.1.fuse_layers.2.0.0.1");
    auto bn_1312 = convBnRelu(network, weightMap, *relu_1310->getOutput(0), width * 4, 3, 2, 1, "stage4.1.fuse_layers.2.0.1.0", "stage4.1.fuse_layers.2.0.1.1", false);
    auto add_1315 = convBnUpAdd(network, weightMap, *relu_1136->getOutput(0), *bn_1312->getOutput(0), width * 4, 3, 2, 1,
                                "stage4.1.fuse_layers.2.1.0.0", "stage4.1.fuse_layers.2.1.0.1", false);
    auto add_1316 = network->addElementWise(*relu_1164->getOutput(0), *add_1315->getOutput(0), ElementWiseOperation::kSUM);
    auto add_1338 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1316->getOutput(0), width * 4, 1, 1, 0,
                                "stage4.1.fuse_layers.2.3.0", "stage4.1.fuse_layers.2.3.1", true);
    auto relu_1339 = network->addActivation(*add_1338->getOutput(0), ActivationType::kRELU);

    auto relu_1342 = convBnRelu(network, weightMap, *relu_1108->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.3.0.0.0", "stage4.1.fuse_layers.3.0.0.1");
    auto relu_1345 = convBnRelu(network, weightMap, *relu_1342->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.3.0.1.0", "stage4.1.fuse_layers.3.0.1.1");
    auto bn_1347 = convBnRelu(network, weightMap, *relu_1345->getOutput(0), width * 8, 3, 2, 1, "stage4.1.fuse_layers.3.0.2.0", "stage4.1.fuse_layers.3.0.2.1", false);
    auto relu_1350 = convBnRelu(network, weightMap, *relu_1136->getOutput(0), width * 2, 3, 2, 1, "stage4.1.fuse_layers.3.1.0.0", "stage4.1.fuse_layers.3.1.0.1");
    auto add_1353 = convBnUpAdd(network, weightMap, *relu_1350->getOutput(0), *bn_1347->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.1.fuse_layers.3.1.1.0", "stage4.1.fuse_layers.3.1.1.1", false);
    auto add_1356 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1353->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.1.fuse_layers.3.2.0.0", "stage4.1.fuse_layers.3.2.0.1", false);
    auto add_1357 = network->addElementWise(*relu_1192->getOutput(0), *add_1356->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_1358 = network->addActivation(*add_1357->getOutput(0), ActivationType::kRELU);

    auto relu_1365 = liteResBlock(network, weightMap, *relu_1259->getOutput(0), width, "stage4.2.branches.0.0");
    auto relu_1372 = liteResBlock(network, weightMap, *relu_1365->getOutput(0), width, "stage4.2.branches.0.1");
    auto relu_1379 = liteResBlock(network, weightMap, *relu_1372->getOutput(0), width, "stage4.2.branches.0.2");
    auto relu_1386 = liteResBlock(network, weightMap, *relu_1379->getOutput(0), width, "stage4.2.branches.0.3");

    auto relu_1393 = liteResBlock(network, weightMap, *relu_1307->getOutput(0), width * 2, "stage4.2.branches.1.0");
    auto relu_1400 = liteResBlock(network, weightMap, *relu_1393->getOutput(0), width * 2, "stage4.2.branches.1.1");
    auto relu_1407 = liteResBlock(network, weightMap, *relu_1400->getOutput(0), width * 2, "stage4.2.branches.1.2");
    auto relu_1414 = liteResBlock(network, weightMap, *relu_1407->getOutput(0), width * 2, "stage4.2.branches.1.3");

    auto relu_1421 = liteResBlock(network, weightMap, *relu_1339->getOutput(0), width * 4, "stage4.2.branches.2.0");
    auto relu_1428 = liteResBlock(network, weightMap, *relu_1421->getOutput(0), width * 4, "stage4.2.branches.2.1");
    auto relu_1435 = liteResBlock(network, weightMap, *relu_1428->getOutput(0), width * 4, "stage4.2.branches.2.2");
    auto relu_1442 = liteResBlock(network, weightMap, *relu_1435->getOutput(0), width * 4, "stage4.2.branches.2.3");

    auto relu_1449 = liteResBlock(network, weightMap, *relu_1358->getOutput(0), width * 8, "stage4.2.branches.3.0");
    auto relu_1456 = liteResBlock(network, weightMap, *relu_1449->getOutput(0), width * 8, "stage4.2.branches.3.1");
    auto relu_1463 = liteResBlock(network, weightMap, *relu_1456->getOutput(0), width * 8, "stage4.2.branches.3.2");
    auto relu_1470 = liteResBlock(network, weightMap, *relu_1463->getOutput(0), width * 8, "stage4.2.branches.3.3");

    auto add_1492 = convBnUpAdd(network, weightMap, *relu_1414->getOutput(0), *relu_1386->getOutput(0), width, 1, 1, 0,
                                "stage4.2.fuse_layers.0.1.0", "stage4.2.fuse_layers.0.1.1", true);
    auto add_1514 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1492->getOutput(0), width, 1, 1, 0,
                                "stage4.2.fuse_layers.0.2.0", "stage4.2.fuse_layers.0.2.1", true);

    auto add_1536 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1514->getOutput(0), width, 1, 1, 0,
                                "stage4.2.fuse_layers.0.3.0", "stage4.2.fuse_layers.0.3.1", true);
    auto relu_1537 = network->addActivation(*add_1536->getOutput(0), ActivationType::kRELU);

    auto add_1540 = convBnUpAdd(network, weightMap, *relu_1386->getOutput(0), *relu_1414->getOutput(0),
                                width * 2, 3, 2, 1, "stage4.2.fuse_layers.1.0.0.0", "stage4.2.fuse_layers.1.0.0.1", false);
    auto add_1562 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1540->getOutput(0),
                                width * 2, 1, 1, 0, "stage4.2.fuse_layers.1.2.0", "stage4.2.fuse_layers.1.2.1", true);
    auto add_1584 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1562->getOutput(0),
                                width * 2, 1, 1, 0, "stage4.2.fuse_layers.1.3.0", "stage4.2.fuse_layers.1.3.1", true);
    auto relu_1585 = network->addActivation(*add_1584->getOutput(0), ActivationType::kRELU);

    auto relu_1588 = convBnRelu(network, weightMap, *relu_1386->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.2.0.0.0", "stage4.2.fuse_layers.2.0.0.1");
    auto bn_1590 = convBnRelu(network, weightMap, *relu_1588->getOutput(0), width * 4, 3, 2, 1, "stage4.2.fuse_layers.2.0.1.0", "stage4.2.fuse_layers.2.0.1.1", false);
    auto add_1593 = convBnUpAdd(network, weightMap, *relu_1414->getOutput(0), *bn_1590->getOutput(0), width * 4, 3, 2, 1,
                                "stage4.2.fuse_layers.2.1.0.0", "stage4.2.fuse_layers.2.1.0.1", false);
    auto add_1594 = network->addElementWise(*relu_1442->getOutput(0), *add_1593->getOutput(0), ElementWiseOperation::kSUM);
    auto add_1616 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1594->getOutput(0), width * 4, 1, 1, 0,
                                "stage4.2.fuse_layers.2.3.0", "stage4.2.fuse_layers.2.3.1", true);
    auto relu_1617 = network->addActivation(*add_1616->getOutput(0), ActivationType::kRELU);

    auto relu_1620 = convBnRelu(network, weightMap, *relu_1386->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.3.0.0.0", "stage4.2.fuse_layers.3.0.0.1");
    auto relu_1623 = convBnRelu(network, weightMap, *relu_1620->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.3.0.1.0", "stage4.2.fuse_layers.3.0.1.1");
    auto bn_1625 = convBnRelu(network, weightMap, *relu_1623->getOutput(0), width * 8, 3, 2, 1, "stage4.2.fuse_layers.3.0.2.0", "stage4.2.fuse_layers.3.0.2.1", false);
    auto relu_1628 = convBnRelu(network, weightMap, *relu_1414->getOutput(0), width * 2, 3, 2, 1, "stage4.2.fuse_layers.3.1.0.0", "stage4.2.fuse_layers.3.1.0.1");
    auto add_1631 = convBnUpAdd(network, weightMap, *relu_1628->getOutput(0), *bn_1625->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.2.fuse_layers.3.1.1.0", "stage4.2.fuse_layers.3.1.1.1", false);
    auto add_1634 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1631->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.2.fuse_layers.3.2.0.0", "stage4.2.fuse_layers.3.2.0.1", false);
    auto add_1635 = network->addElementWise(*relu_1470->getOutput(0), *add_1634->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_1636 = network->addActivation(*add_1635->getOutput(0), ActivationType::kRELU);

    nvinfer1::Dims dim = relu_1537->getOutput(0)->getDimensions();
    dim.d[0] = relu_1585->getOutput(0)->getDimensions().d[0];
    auto resize_1655 = netAddUpsampleBi(network, relu_1585->getOutput(0), dim);
    dim.d[0] = relu_1617->getOutput(0)->getDimensions().d[0];
    auto resize_1668 = netAddUpsampleBi(network, relu_1617->getOutput(0), dim);
    dim.d[0] = relu_1636->getOutput(0)->getDimensions().d[0];
    auto resize_1681 = netAddUpsampleBi(network, relu_1636->getOutput(0), dim);

    ITensor *concatTensors[] = {relu_1537->getOutput(0), resize_1655->getOutput(0), resize_1668->getOutput(0), resize_1681->getOutput(0)};
    auto concat_1682 = network->addConcatenation(concatTensors, 4);
    concat_1682->setAxis(0);
    auto relu_1685 = convBnRelu(network, weightMap, *concat_1682->getOutput(0), width * 15, 1, 1, 0, "last_layer.0", "last_layer.1", true, true);
    auto conv_1686 = network->addConvolutionNd(*relu_1685->getOutput(0), NUM_CLASSES, DimsHW{1, 1}, weightMap["last_layer.3.weight"], weightMap["last_layer.3.bias"]);
    conv_1686->setStrideNd(DimsHW{1, 1});
    conv_1686->setPaddingNd(DimsHW{0, 0});
    debug_print(conv_1686->getOutput(0), "conv_1686");
    dim.d[0] = NUM_CLASSES;
    dim.d[1] = INPUT_H;
    dim.d[2] = INPUT_W;
    auto feature_map = netAddUpsampleBi(network, conv_1686->getOutput(0), dim);
    debug_print(feature_map->getOutput(0), "feature_map");
    auto topk = network->addTopK(*feature_map->getOutput(0), TopKOperation::kMAX, 1, 0X01);
    debug_print(topk->getOutput(0), "topk");
    std::cout << "set name out" << std::endl;
    // topk->getOutput(1) 1 is index
    topk->getOutput(1)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*topk->getOutput(1));

    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize((1 << 30)); // 1G
#ifdef USE_FP16
    std::cout << "use fp16" << std::endl;
    config->setFlag(BuilderFlag::kFP16);
#endif
    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build success!" << std::endl;
    network->destroy();
    for (auto &mem : weightMap)
    {
        free((void *)(mem.second.values));
    }
    return engine;
}
// Build the network with the TensorRT API and hand back the serialized engine.
//
// maxBatchSize : forwarded to createEngine().
// modelStream  : out-parameter; receives the result of engine->serialize().
//                The caller owns it and must call destroy() on it.
// wtsPath      : path of the weight file, forwarded to createEngine().
// width        : channel width of the model variant, forwarded to createEngine().
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream, std::string wtsPath, int width)
{
    IBuilder *builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig *config = builder->createBuilderConfig();
    assert(config != nullptr);

    ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath, width);
    assert(engine != nullptr);
    (*modelStream) = engine->serialize();

    // Release everything created here; the builder config was previously leaked.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Parse the command line into either serialize or deserialize settings.
//
//   -s <wts> <engine> <width>   fills wts, engine and width
//   -d <engine> <img_dir>       fills engine and img_dir
//
// Returns true when the arguments match one of the two forms above, false
// otherwise. On failure none of the output parameters are modified.
bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, int &width, std::string &img_dir)
{
    // argv[1] may only be read when an argument follows the program name;
    // constructing std::string from a null pointer is undefined behaviour.
    if (argc < 2 || argv[1] == nullptr)
    {
        return false;
    }

    const std::string mode(argv[1]);
    if (mode == "-s" && argc == 5)
    {
        int parsed_width = 0;
        try
        {
            parsed_width = std::stoi(argv[4]);
        }
        catch (...)
        {
            // std::stoi throws when the width is not a number or is out of
            // range; report it as a usage error instead of terminating.
            return false;
        }
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        width = parsed_width;
        return true;
    }
    if (mode == "-d" && argc == 4)
    {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        return true;
    }
    return false;
}
// Enqueue one inference on `stream` and block until it has finished.
//
// context   : execution context used to enqueue the work.
// stream    : CUDA stream the work is enqueued on.
// buffers   : binding pointers passed straight through to enqueue().
// batchSize : batch size passed straight through to enqueue().
//
// A failed enqueue is reported on stderr; the function still synchronizes so
// the caller observes the same ordering as before.
void doInference(IExecutionContext &context, cudaStream_t &stream, void **buffers, int batchSize)
{
    // enqueue() reports failure through its boolean result, which used to be dropped.
    const bool enqueued = context.enqueue(batchSize, buffers, stream, nullptr);
    if (!enqueued)
    {
        std::cerr << "inference enqueue failed" << std::endl;
    }
    cudaStreamSynchronize(stream);
    cudaDeviceSynchronize();
}

// Entry point.
//
//   ./hrnet -s [.wts] [.engine] [width]   build the engine and write the plan file
//   ./hrnet -d [.engine] [image dir]      load the plan file and segment every image
//
// Returns 0 on success and -1 on a usage error or when a required file or
// directory cannot be read.
int main(int argc, char **argv)
{
    cudaSetDevice(DEVICE);
    std::string wtsPath = "";
    std::string engine_name = "";
    int width = 0; // only filled in (and only read) in "-s" mode
    std::string img_dir;
    // parse args
    if (!parse_args(argc, argv, wtsPath, engine_name, width, img_dir))
    {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./hrnet -s [.wts] [.engine] [18 or 32 or 48]  // serialize model to plan file" << std::endl;
        std::cerr << "./hrnet -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }
    // create a model using the API directly and serialize it to a stream
    if (!wtsPath.empty())
    {
        IHostMemory *modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream, wtsPath, width);
        assert(modelStream != nullptr);
        std::ofstream p(engine_name, std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char *>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }

    // deserialize the .engine and run inference
    char *trtModelStream{nullptr};
    size_t size{0};
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good())
    {
        // Without a plan file there is nothing to deserialize; stop here
        // instead of handing a null buffer to the runtime.
        std::cerr << "could not open plan file" << std::endl;
        return -1;
    }
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0)
    {
        std::cout << "read_files_in_dir failed." << std::endl;
        delete[] trtModelStream;
        return -1;
    }
    // prepare input data ---------------------------
    cudaSetDeviceFlags(cudaDeviceMapHost);
    float *data;
    int *prob; // using int. output is index
    CHECK(cudaHostAlloc((void **)&data, BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float), cudaHostAllocMapped));
    CHECK(cudaHostAlloc((void **)&prob, BATCH_SIZE * OUTPUT_SIZE * sizeof(int), cudaHostAllocMapped));

    IRuntime *runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext *context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    void *buffers[2] = {nullptr, nullptr};
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // The device-side aliases of the mapped host buffers do not change between
    // images, so resolve them once, before the loop.
    CHECK(cudaHostGetDevicePointer((void **)&buffers[inputIndex], (void *)data, 0));  // buffers[inputIndex]-->data
    CHECK(cudaHostGetDevicePointer((void **)&buffers[outputIndex], (void *)prob, 0)); // buffers[outputIndex] --> prob

    for (int f = 0; f < (int)file_names.size(); f++)
    {
        std::cout << file_names[f] << std::endl;
        cv::Mat img_BGR = cv::imread(img_dir + "/" + file_names[f], 1); // BGR
        // Skip unreadable files before any OpenCV call that requires a non-empty image.
        if (img_BGR.empty())
            continue;
        cv::Mat img;
        cv::cvtColor(img_BGR, img, cv::COLOR_BGR2RGB);
        cv::Mat pr_img;
        cv::resize(img, pr_img, cv::Size(INPUT_W, INPUT_H));
        img = pr_img.clone(); // for img show
        pr_img.convertTo(pr_img, CV_32FC3);
        if (!pr_img.isContinuous())
        {
            pr_img = pr_img.clone();
        }
        // pr_img holds exactly one image, so copy one image's worth of floats.
        std::memcpy(data, pr_img.data, 3 * INPUT_W * INPUT_H * sizeof(float));

        // Run inference
        auto start = std::chrono::high_resolution_clock::now();
        doInference(*context, stream, buffers, BATCH_SIZE);
        auto end = std::chrono::high_resolution_clock::now();
        std::cout << "infer time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        // Copy the per-pixel class indices into an 8-bit single-channel image.
        cv::Mat outimg(INPUT_H, INPUT_W, CV_8UC1);
        for (int row = 0; row < INPUT_H; ++row)
        {
            uchar *uc_pixel = outimg.data + row * outimg.step;
            for (int col = 0; col < INPUT_W; ++col)
            {
                uc_pixel[col] = (uchar)prob[row * INPUT_W + col];
            }
        }
        cv::Mat im_color;
        cv::cvtColor(outimg, im_color, cv::COLOR_GRAY2RGB);
        cv::Mat lut = createLTU(NUM_CLASSES);
        cv::LUT(im_color, lut, im_color);
        // false color
        cv::cvtColor(im_color, im_color, cv::COLOR_RGB2GRAY);
        cv::applyColorMap(im_color, im_color, cv::COLORMAP_HOT);
        // cv::imshow("False Color Map", im_color);
        cv::imwrite(std::to_string(f) + "_false_color_map.png", im_color);
        //fusion
        cv::Mat fusionImg;
        cv::addWeighted(img, 1, im_color, 0.8, 1, fusionImg);
        // cv::imshow("Fusion Img", fusionImg);
        // cv::waitKey(0);
        cv::imwrite(std::to_string(f) + "_fusion_img.png", fusionImg);
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    // Free the host pointers returned by cudaHostAlloc, not their device aliases.
    CHECK(cudaFreeHost(data));
    CHECK(cudaFreeHost(prob));
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}


================================================
FILE: hrnet/hrnet-semantic-segmentation/hrnet_ocr.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "common.hpp"
#include "logging.h"

// File-scope configuration for the HRNet-OCR sample.

// Logger instance; the Logger type comes from one of the project headers included above.
static Logger gLogger;
// NOTE(review): presumably selects FP32 precision; the code that tests this macro is not visible here — confirm.
#define USE_FP32
#define DEVICE 0     // GPU id
#define BATCH_SIZE 1 // batch size (NOTE(review): usage not visible in this part of the file)

// Name given to the network input tensor in createEngine().
const char *INPUT_BLOB_NAME = "data";
// NOTE(review): presumably the name of the network output tensor — usage not visible here.
const char *OUTPUT_BLOB_NAME = "output";
// Input image height and width; createEngine() declares the input as {INPUT_H, INPUT_W, 3}.
static const int INPUT_H = 512;
static const int INPUT_W = 1024;
// NOTE(review): presumably the number of segmentation classes — usage not visible here.
static const int NUM_CLASSES = 19;
// One output element per input pixel.
static const int OUTPUT_SIZE = INPUT_H * INPUT_W;

// Create the engine using only the API and not any parser.
ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, std::string wtsPath, int width)
{
    INetworkDefinition *network = builder->createNetworkV2(0U);
    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{INPUT_H, INPUT_W, 3});
    assert(data);

    // hwc to chw
    auto ps = network->addShuffle(*data);
    ps->setFirstTranspose(nvinfer1::Permutation{2, 0, 1});
    float mean[3] = {0.485, 0.456, 0.406};
    float std[3] = {0.229, 0.224, 0.225};
    ITensor *preinput = MeanStd(network, ps->getOutput(0), mean, std, true);

    std::map<std::string, Weights> weightMap = loadWeights(wtsPath);
    auto relu_2 = convBnRelu(network, weightMap, *preinput, 64, 3, 2, 1, "conv1", "bn1");
    auto relu_5 = convBnRelu(network, weightMap, *relu_2->getOutput(0), 64, 3, 2, 1, "conv2", "bn2");
    auto relu_17 = ResBlock2Conv(network, weightMap, *relu_5->getOutput(0), 64, 256, 1, "layer1.0");
    auto relu_27 = ResBlock(network, weightMap, *relu_17->getOutput(0), 256, 64, 1, "layer1.1");
    auto relu_37 = ResBlock(network, weightMap, *relu_27->getOutput(0), 256, 64, 1, "layer1.2");
    auto relu_47 = ResBlock(network, weightMap, *relu_37->getOutput(0), 256, 64, 1, "layer1.3");

    auto relu_50 = convBnRelu(network, weightMap, *relu_47->getOutput(0), width, 3, 1, 1, "transition1.0.0", "transition1.0.1");
    auto relu_60 = liteResBlock(network, weightMap, *relu_50->getOutput(0), width, "stage2.0.branches.0.0");
    auto relu_67 = liteResBlock(network, weightMap, *relu_60->getOutput(0), width, "stage2.0.branches.0.1");
    auto relu_74 = liteResBlock(network, weightMap, *relu_67->getOutput(0), width, "stage2.0.branches.0.2");
    auto relu_81 = liteResBlock(network, weightMap, *relu_74->getOutput(0), width, "stage2.0.branches.0.3");

    auto relu_53 = convBnRelu(network, weightMap, *relu_47->getOutput(0), width * 2, 3, 2, 1, "transition1.1.0.0", "transition1.1.0.1");
    auto relu_88 = liteResBlock(network, weightMap, *relu_53->getOutput(0), width * 2, "stage2.0.branches.1.0");
    auto relu_95 = liteResBlock(network, weightMap, *relu_88->getOutput(0), width * 2, "stage2.0.branches.1.1");
    auto relu_102 = liteResBlock(network, weightMap, *relu_95->getOutput(0), width * 2, "stage2.0.branches.1.2");
    auto relu_109 = liteResBlock(network, weightMap, *relu_102->getOutput(0), width * 2, "stage2.0.branches.1.3");

    auto add_131 = convBnUpAdd(network, weightMap, *relu_109->getOutput(0), *relu_81->getOutput(0), width, 1, 1, 0, "stage2.0.fuse_layers.0.1.0", "stage2.0.fuse_layers.0.1.1", true);
    auto relu_132 = network->addActivation(*add_131->getOutput(0), ActivationType::kRELU);

    auto add_135 = convBnUpAdd(network, weightMap, *relu_81->getOutput(0), *relu_109->getOutput(0), width * 2, 3, 2, 1, "stage2.0.fuse_layers.1.0.0.0", "stage2.0.fuse_layers.1.0.0.1", false);
    auto relu_136 = network->addActivation(*add_135->getOutput(0), ActivationType::kRELU);

    auto relu_146 = liteResBlock(network, weightMap, *relu_132->getOutput(0), width, "stage3.0.branches.0.0");
    auto relu_153 = liteResBlock(network, weightMap, *relu_146->getOutput(0), width, "stage3.0.branches.0.1");
    auto relu_160 = liteResBlock(network, weightMap, *relu_153->getOutput(0), width, "stage3.0.branches.0.2");
    auto relu_167 = liteResBlock(network, weightMap, *relu_160->getOutput(0), width, "stage3.0.branches.0.3");

    auto relu_174 = liteResBlock(network, weightMap, *relu_136->getOutput(0), width * 2, "stage3.0.branches.1.0");
    auto relu_181 = liteResBlock(network, weightMap, *relu_174->getOutput(0), width * 2, "stage3.0.branches.1.1");
    auto relu_188 = liteResBlock(network, weightMap, *relu_181->getOutput(0), width * 2, "stage3.0.branches.1.2");
    auto relu_195 = liteResBlock(network, weightMap, *relu_188->getOutput(0), width * 2, "stage3.0.branches.1.3");

    auto relu_139 = convBnRelu(network, weightMap, *relu_136->getOutput(0), width * 4, 3, 2, 1, "transition2.2.0.0", "transition2.2.0.1");
    auto relu_202 = liteResBlock(network, weightMap, *relu_139->getOutput(0), width * 4, "stage3.0.branches.2.0");
    auto relu_209 = liteResBlock(network, weightMap, *relu_202->getOutput(0), width * 4, "stage3.0.branches.2.1");
    auto relu_216 = liteResBlock(network, weightMap, *relu_209->getOutput(0), width * 4, "stage3.0.branches.2.2");
    auto relu_223 = liteResBlock(network, weightMap, *relu_216->getOutput(0), width * 4, "stage3.0.branches.2.3");

    auto add_245 = convBnUpAdd(network, weightMap, *relu_195->getOutput(0), *relu_167->getOutput(0), width, 1, 1, 0, "stage3.0.fuse_layers.0.1.0", "stage3.0.fuse_layers.0.1.1", true);
    auto add_267 = convBnUpAdd(network, weightMap, *relu_223->getOutput(0), *add_245->getOutput(0), width, 1, 1, 0, "stage3.0.fuse_layers.0.2.0", "stage3.0.fuse_layers.0.2.1", true);
    auto relu_268 = network->addActivation(*add_267->getOutput(0), ActivationType::kRELU);

    auto add_271 = convBnUpAdd(network, weightMap, *relu_167->getOutput(0), *relu_195->getOutput(0), width * 2, 3, 2, 1, "stage3.0.fuse_layers.1.0.0.0", "stage3.0.fuse_layers.1.0.0.1", false);
    auto add_293 = convBnUpAdd(network, weightMap, *relu_223->getOutput(0), *add_271->getOutput(0), width * 2, 1, 1, 0, "stage3.0.fuse_layers.1.2.0", "stage3.0.fuse_layers.1.2.1", true);
    auto relu_294 = network->addActivation(*add_293->getOutput(0), ActivationType::kRELU);

    auto relu_297 = convBnRelu(network, weightMap, *relu_167->getOutput(0), width, 3, 2, 1, "stage3.0.fuse_layers.2.0.0.0", "stage3.0.fuse_layers.2.0.0.1");
    auto bn_299 = convBnRelu(network, weightMap, *relu_297->getOutput(0), width * 4, 3, 2, 1, "stage3.0.fuse_layers.2.0.1.0", "stage3.0.fuse_layers.2.0.1.1", false);
    auto add_302 = convBnUpAdd(network, weightMap, *relu_195->getOutput(0), *bn_299->getOutput(0), width * 4, 3, 2, 1, "stage3.0.fuse_layers.2.1.0.0", "stage3.0.fuse_layers.2.1.0.1", false);
    auto add_303 = network->addElementWise(*add_302->getOutput(0), *relu_223->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_304 = network->addActivation(*add_303->getOutput(0), ActivationType::kRELU);

    auto relu_311 = liteResBlock(network, weightMap, *relu_268->getOutput(0), width, "stage3.1.branches.0.0");
    auto relu_318 = liteResBlock(network, weightMap, *relu_311->getOutput(0), width, "stage3.1.branches.0.1");
    auto relu_325 = liteResBlock(network, weightMap, *relu_318->getOutput(0), width, "stage3.1.branches.0.2");
    auto relu_332 = liteResBlock(network, weightMap, *relu_325->getOutput(0), width, "stage3.1.branches.0.3");

    auto relu_339 = liteResBlock(network, weightMap, *relu_294->getOutput(0), width * 2, "stage3.1.branches.1.0");
    auto relu_346 = liteResBlock(network, weightMap, *relu_339->getOutput(0), width * 2, "stage3.1.branches.1.1");
    auto relu_353 = liteResBlock(network, weightMap, *relu_346->getOutput(0), width * 2, "stage3.1.branches.1.2");
    auto relu_360 = liteResBlock(network, weightMap, *relu_353->getOutput(0), width * 2, "stage3.1.branches.1.3");

    auto relu_367 = liteResBlock(network, weightMap, *relu_304->getOutput(0), width * 4, "stage3.1.branches.2.0");
    auto relu_374 = liteResBlock(network, weightMap, *relu_367->getOutput(0), width * 4, "stage3.1.branches.2.1");
    auto relu_381 = liteResBlock(network, weightMap, *relu_374->getOutput(0), width * 4, "stage3.1.branches.2.2");
    auto relu_388 = liteResBlock(network, weightMap, *relu_381->getOutput(0), width * 4, "stage3.1.branches.2.3");

    auto add_410 = convBnUpAdd(network, weightMap, *relu_360->getOutput(0), *relu_332->getOutput(0), width, 1, 1, 0, "stage3.1.fuse_layers.0.1.0", "stage3.1.fuse_layers.0.1.1", true);
    auto add_432 = convBnUpAdd(network, weightMap, *relu_388->getOutput(0), *add_410->getOutput(0), width, 1, 1, 0, "stage3.1.fuse_layers.0.2.0", "stage3.1.fuse_layers.0.2.1", true);
    auto relu_433 = network->addActivation(*add_432->getOutput(0), ActivationType::kRELU);

    auto add_436 = convBnUpAdd(network, weightMap, *relu_332->getOutput(0), *relu_360->getOutput(0), width * 2, 3, 2, 1, "stage3.1.fuse_layers.1.0.0.0", "stage3.1.fuse_layers.1.0.0.1", false);
    auto add_458 = convBnUpAdd(network, weightMap, *relu_388->getOutput(0), *add_436->getOutput(0), width * 2, 1, 1, 0, "stage3.1.fuse_layers.1.2.0", "stage3.1.fuse_layers.1.2.1", true);
    auto relu_459 = network->addActivation(*add_458->getOutput(0), ActivationType::kRELU);

    auto relu_462 = convBnRelu(network, weightMap, *relu_332->getOutput(0), width, 3, 2, 1, "stage3.1.fuse_layers.2.0.0.0", "stage3.1.fuse_layers.2.0.0.1");
    auto bn_464 = convBnRelu(network, weightMap, *relu_462->getOutput(0), width * 4, 3, 2, 1, "stage3.1.fuse_layers.2.0.1.0", "stage3.1.fuse_layers.2.0.1.1", false);
    auto add_467 = convBnUpAdd(network, weightMap, *relu_360->getOutput(0), *bn_464->getOutput(0), width * 4, 3, 2, 1, "stage3.1.fuse_layers.2.1.0.0", "stage3.1.fuse_layers.2.1.0.1", false);
    auto add_468 = network->addElementWise(*add_467->getOutput(0), *relu_388->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_469 = network->addActivation(*add_468->getOutput(0), ActivationType::kRELU);

    auto relu_476 = liteResBlock(network, weightMap, *relu_433->getOutput(0), width, "stage3.2.branches.0.0");
    auto relu_483 = liteResBlock(network, weightMap, *relu_476->getOutput(0), width, "stage3.2.branches.0.1");
    auto relu_490 = liteResBlock(network, weightMap, *relu_483->getOutput(0), width, "stage3.2.branches.0.2");
    auto relu_497 = liteResBlock(network, weightMap, *relu_490->getOutput(0), width, "stage3.2.branches.0.3");

    auto relu_504 = liteResBlock(network, weightMap, *relu_459->getOutput(0), width * 2, "stage3.2.branches.1.0");
    auto relu_511 = liteResBlock(network, weightMap, *relu_504->getOutput(0), width * 2, "stage3.2.branches.1.1");
    auto relu_518 = liteResBlock(network, weightMap, *relu_511->getOutput(0), width * 2, "stage3.2.branches.1.2");
    auto relu_525 = liteResBlock(network, weightMap, *relu_518->getOutput(0), width * 2, "stage3.2.branches.1.3");

    auto relu_532 = liteResBlock(network, weightMap, *relu_469->getOutput(0), width * 4, "stage3.2.branches.2.0");
    auto relu_539 = liteResBlock(network, weightMap, *relu_532->getOutput(0), width * 4, "stage3.2.branches.2.1");
    auto relu_546 = liteResBlock(network, weightMap, *relu_539->getOutput(0), width * 4, "stage3.2.branches.2.2");
    auto relu_553 = liteResBlock(network, weightMap, *relu_546->getOutput(0), width * 4, "stage3.2.branches.2.3");

    auto add_575 = convBnUpAdd(network, weightMap, *relu_525->getOutput(0), *relu_497->getOutput(0), width, 1, 1, 0, "stage3.2.fuse_layers.0.1.0", "stage3.2.fuse_layers.0.1.1", true);
    auto add_597 = convBnUpAdd(network, weightMap, *relu_553->getOutput(0), *add_575->getOutput(0), width, 1, 1, 0, "stage3.2.fuse_layers.0.2.0", "stage3.2.fuse_layers.0.2.1", true);

    auto relu_598 = network->addActivation(*add_597->getOutput(0), ActivationType::kRELU);

    auto add_601 = convBnUpAdd(network, weightMap, *relu_497->getOutput(0), *relu_525->getOutput(0), width * 2, 3, 2, 1, "stage3.2.fuse_layers.1.0.0.0", "stage3.2.fuse_layers.1.0.0.1", false);
    auto add_623 = convBnUpAdd(network, weightMap, *relu_553->getOutput(0), *add_601->getOutput(0), width * 2, 1, 1, 0, "stage3.2.fuse_layers.1.2.0", "stage3.2.fuse_layers.1.2.1", true);
    auto relu_624 = network->addActivation(*add_623->getOutput(0), ActivationType::kRELU);

    auto relu_627 = convBnRelu(network, weightMap, *relu_497->getOutput(0), width, 3, 2, 1, "stage3.2.fuse_layers.2.0.0.0", "stage3.2.fuse_layers.2.0.0.1");
    auto bn_629 = convBnRelu(network, weightMap, *relu_627->getOutput(0), width * 4, 3, 2, 1, "stage3.2.fuse_layers.2.0.1.0", "stage3.2.fuse_layers.2.0.1.1", false);
    auto add_632 = convBnUpAdd(network, weightMap, *relu_525->getOutput(0), *bn_629->getOutput(0), width * 4, 3, 2, 1, "stage3.2.fuse_layers.2.1.0.0", "stage3.2.fuse_layers.2.1.0.1", false);
    auto add_633 = network->addElementWise(*relu_553->getOutput(0), *add_632->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_634 = network->addActivation(*add_633->getOutput(0), ActivationType::kRELU);

    auto relu_641 = liteResBlock(network, weightMap, *relu_598->getOutput(0), width, "stage3.3.branches.0.0");
    auto relu_648 = liteResBlock(network, weightMap, *relu_641->getOutput(0), width, "stage3.3.branches.0.1");
    auto relu_655 = liteResBlock(network, weightMap, *relu_648->getOutput(0), width, "stage3.3.branches.0.2");
    auto relu_662 = liteResBlock(network, weightMap, *relu_655->getOutput(0), width, "stage3.3.branches.0.3");

    auto relu_669 = liteResBlock(network, weightMap, *relu_624->getOutput(0), width * 2, "stage3.3.branches.1.0");
    auto relu_676 = liteResBlock(network, weightMap, *relu_669->getOutput(0), width * 2, "stage3.3.branches.1.1");
    auto relu_683 = liteResBlock(network, weightMap, *relu_676->getOutput(0), width * 2, "stage3.3.branches.1.2");
    auto relu_690 = liteResBlock(network, weightMap, *relu_683->getOutput(0), width * 2, "stage3.3.branches.1.3");

    auto relu_697 = liteResBlock(network, weightMap, *relu_634->getOutput(0), width * 4, "stage3.3.branches.2.0");
    auto relu_704 = liteResBlock(network, weightMap, *relu_697->getOutput(0), width * 4, "stage3.3.branches.2.1");
    auto relu_711 = liteResBlock(network, weightMap, *relu_704->getOutput(0), width * 4, "stage3.3.branches.2.2");
    auto relu_718 = liteResBlock(network, weightMap, *relu_711->getOutput(0), width * 4, "stage3.3.branches.2.3");

    auto add_740 = convBnUpAdd(network, weightMap, *relu_690->getOutput(0), *relu_662->getOutput(0), width, 1, 1, 0, "stage3.3.fuse_layers.0.1.0", "stage3.3.fuse_layers.0.1.1", true);
    auto add_762 = convBnUpAdd(network, weightMap, *relu_718->getOutput(0), *add_740->getOutput(0), width, 1, 1, 0, "stage3.3.fuse_layers.0.2.0", "stage3.3.fuse_layers.0.2.1", true);
    auto relu_763 = network->addActivation(*add_762->getOutput(0), ActivationType::kRELU);

    auto add_766 = convBnUpAdd(network, weightMap, *relu_662->getOutput(0), *relu_690->getOutput(0), width * 2, 3, 2, 1, "stage3.3.fuse_layers.1.0.0.0", "stage3.3.fuse_layers.1.0.0.1", false);
    auto add_788 = convBnUpAdd(network, weightMap, *relu_718->getOutput(0), *add_766->getOutput(0), width * 2, 1, 1, 0, "stage3.3.fuse_layers.1.2.0", "stage3.3.fuse_layers.1.2.1", true);
    auto relu_789 = network->addActivation(*add_788->getOutput(0), ActivationType::kRELU);

    auto relu_792 = convBnRelu(network, weightMap, *relu_662->getOutput(0), width, 3, 2, 1, "stage3.3.fuse_layers.2.0.0.0", "stage3.3.fuse_layers.2.0.0.1");
    auto bn_794 = convBnRelu(network, weightMap, *relu_792->getOutput(0), width * 4, 3, 2, 1, "stage3.3.fuse_layers.2.0.1.0", "stage3.3.fuse_layers.2.0.1.1", false);
    auto add_797 = convBnUpAdd(network, weightMap, *relu_690->getOutput(0), *bn_794->getOutput(0), width * 4, 3, 2, 1, "stage3.3.fuse_layers.2.1.0.0", "stage3.3.fuse_layers.2.1.0.1", false);
    auto add_798 = network->addElementWise(*relu_718->getOutput(0), *add_797->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_799 = network->addActivation(*add_798->getOutput(0), ActivationType::kRELU);

    auto relu_809 = liteResBlock(network, weightMap, *relu_763->getOutput(0), width, "stage4.0.branches.0.0");
    auto relu_816 = liteResBlock(network, weightMap, *relu_809->getOutput(0), width, "stage4.0.branches.0.1");
    auto relu_823 = liteResBlock(network, weightMap, *relu_816->getOutput(0), width, "stage4.0.branches.0.2");
    auto relu_830 = liteResBlock(network, weightMap, *relu_823->getOutput(0), width, "stage4.0.branches.0.3");

    auto relu_837 = liteResBlock(network, weightMap, *relu_789->getOutput(0), width * 2, "stage4.0.branches.1.0");
    auto relu_844 = liteResBlock(network, weightMap, *relu_837->getOutput(0), width * 2, "stage4.0.branches.1.1");
    auto relu_851 = liteResBlock(network, weightMap, *relu_844->getOutput(0), width * 2, "stage4.0.branches.1.2");
    auto relu_858 = liteResBlock(network, weightMap, *relu_851->getOutput(0), width * 2, "stage4.0.branches.1.3");

    auto relu_865 = liteResBlock(network, weightMap, *relu_799->getOutput(0), width * 4, "stage4.0.branches.2.0");
    auto relu_872 = liteResBlock(network, weightMap, *relu_865->getOutput(0), width * 4, "stage4.0.branches.2.1");
    auto relu_879 = liteResBlock(network, weightMap, *relu_872->getOutput(0), width * 4, "stage4.0.branches.2.2");
    auto relu_886 = liteResBlock(network, weightMap, *relu_879->getOutput(0), width * 4, "stage4.0.branches.2.3"); //========

    auto relu_802 = convBnRelu(network, weightMap, *relu_799->getOutput(0), width * 8, 3, 2, 1, "transition3.3.0.0", "transition3.3.0.1");
    auto relu_893 = liteResBlock(network, weightMap, *relu_802->getOutput(0), width * 8, "stage4.0.branches.3.0");
    auto relu_900 = liteResBlock(network, weightMap, *relu_893->getOutput(0), width * 8, "stage4.0.branches.3.1");
    auto relu_907 = liteResBlock(network, weightMap, *relu_900->getOutput(0), width * 8, "stage4.0.branches.3.2");
    auto relu_914 = liteResBlock(network, weightMap, *relu_907->getOutput(0), width * 8, "stage4.0.branches.3.3");

    auto add_936 = convBnUpAdd(network, weightMap, *relu_858->getOutput(0), *relu_830->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.1.0", "stage4.0.fuse_layers.0.1.1", true);
    auto add_958 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_936->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.2.0", "stage4.0.fuse_layers.0.2.1", true);
    auto add_980 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_958->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.3.0", "stage4.0.fuse_layers.0.3.1", true);
    auto relu_981 = network->addActivation(*add_980->getOutput(0), ActivationType::kRELU);

    auto add_984 = convBnUpAdd(network, weightMap, *relu_830->getOutput(0), *relu_858->getOutput(0), width * 2, 3, 2, 1, "stage4.0.fuse_layers.1.0.0.0", "stage4.0.fuse_layers.1.0.0.1", false);
    auto add_1006 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_984->getOutput(0), width * 2, 1, 1, 0, "stage4.0.fuse_layers.1.2.0", "stage4.0.fuse_layers.1.2.1", true);
    auto add_1028 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_1006->getOutput(0), width * 2, 1, 1, 0, "stage4.0.fuse_layers.1.3.0", "stage4.0.fuse_layers.1.3.1", true);
    auto relu_1029 = network->addActivation(*add_1028->getOutput(0), ActivationType::kRELU);

    auto relu_1032 = convBnRelu(network, weightMap, *relu_830->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.2.0.0.0", "stage4.0.fuse_layers.2.0.0.1");
    auto bn_1034 = convBnRelu(network, weightMap, *relu_1032->getOutput(0), width * 4, 3, 2, 1, "stage4.0.fuse_layers.2.0.1.0", "stage4.0.fuse_layers.2.0.1.1", false);

    auto add_1037 = convBnUpAdd(network, weightMap, *relu_858->getOutput(0), *bn_1034->getOutput(0), width * 4, 3, 2, 1,
                                "stage4.0.fuse_layers.2.1.0.0", "stage4.0.fuse_layers.2.1.0.1", false);
    auto add_1038 = network->addElementWise(*relu_886->getOutput(0), *add_1037->getOutput(0), ElementWiseOperation::kSUM);
    auto add_1060 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_1038->getOutput(0), width * 4, 1, 1, 0,
                                "stage4.0.fuse_layers.2.3.0", "stage4.0.fuse_layers.2.3.1", true);
    auto relu_1061 = network->addActivation(*add_1060->getOutput(0), ActivationType::kRELU);

    auto relu_1064 = convBnRelu(network, weightMap, *relu_830->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.3.0.0.0", "stage4.0.fuse_layers.3.0.0.1");
    auto relu_1067 = convBnRelu(network, weightMap, *relu_1064->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.3.0.1.0", "stage4.0.fuse_layers.3.0.1.1");
    auto bn_1069 = convBnRelu(network, weightMap, *relu_1067->getOutput(0), width * 8, 3, 2, 1, "stage4.0.fuse_layers.3.0.2.0", "stage4.0.fuse_layers.3.0.2.1", false);
    auto relu_1072 = convBnRelu(network, weightMap, *relu_858->getOutput(0), width * 2, 3, 2, 1, "stage4.0.fuse_layers.3.1.0.0", "stage4.0.fuse_layers.3.1.0.1");
    auto add_1075 = convBnUpAdd(network, weightMap, *relu_1072->getOutput(0), *bn_1069->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.0.fuse_layers.3.1.1.0", "stage4.0.fuse_layers.3.1.1.1", false);
    auto add_1078 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_1075->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.0.fuse_layers.3.2.0.0", "stage4.0.fuse_layers.3.2.0.1", false);
    auto add_1079 = network->addElementWise(*relu_914->getOutput(0), *add_1078->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_1080 = network->addActivation(*add_1079->getOutput(0), ActivationType::kRELU);

    auto relu_1087 = liteResBlock(network, weightMap, *relu_981->getOutput(0), width, "stage4.1.branches.0.0");
    auto relu_1094 = liteResBlock(network, weightMap, *relu_1087->getOutput(0), width, "stage4.1.branches.0.1");
    auto relu_1101 = liteResBlock(network, weightMap, *relu_1094->getOutput(0), width, "stage4.1.branches.0.2");
    auto relu_1108 = liteResBlock(network, weightMap, *relu_1101->getOutput(0), width, "stage4.1.branches.0.3");

    auto relu_1115 = liteResBlock(network, weightMap, *relu_1029->getOutput(0), width * 2, "stage4.1.branches.1.0");
    auto relu_1122 = liteResBlock(network, weightMap, *relu_1115->getOutput(0), width * 2, "stage4.1.branches.1.1");
    auto relu_1129 = liteResBlock(network, weightMap, *relu_1122->getOutput(0), width * 2, "stage4.1.branches.1.2");
    auto relu_1136 = liteResBlock(network, weightMap, *relu_1129->getOutput(0), width * 2, "stage4.1.branches.1.3");

    auto relu_1143 = liteResBlock(network, weightMap, *relu_1061->getOutput(0), width * 4, "stage4.1.branches.2.0");
    auto relu_1150 = liteResBlock(network, weightMap, *relu_1143->getOutput(0), width * 4, "stage4.1.branches.2.1");
    auto relu_1157 = liteResBlock(network, weightMap, *relu_1150->getOutput(0), width * 4, "stage4.1.branches.2.2");
    auto relu_1164 = liteResBlock(network, weightMap, *relu_1157->getOutput(0), width * 4, "stage4.1.branches.2.3");

    auto relu_1171 = liteResBlock(network, weightMap, *relu_1080->getOutput(0), width * 8, "stage4.1.branches.3.0");
    auto relu_1178 = liteResBlock(network, weightMap, *relu_1171->getOutput(0), width * 8, "stage4.1.branches.3.1");
    auto relu_1185 = liteResBlock(network, weightMap, *relu_1178->getOutput(0), width * 8, "stage4.1.branches.3.2");
    auto relu_1192 = liteResBlock(network, weightMap, *relu_1185->getOutput(0), width * 8, "stage4.1.branches.3.3");

    auto add_1214 = convBnUpAdd(network, weightMap, *relu_1136->getOutput(0), *relu_1108->getOutput(0), width, 1, 1, 0,
                                "stage4.1.fuse_layers.0.1.0", "stage4.1.fuse_layers.0.1.1", true);
    auto add_1236 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1214->getOutput(0), width, 1, 1, 0,
                                "stage4.1.fuse_layers.0.2.0", "stage4.1.fuse_layers.0.2.1", true);
    auto add_1258 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1236->getOutput(0), width, 1, 1, 0,
                                "stage4.1.fuse_layers.0.3.0", "stage4.1.fuse_layers.0.3.1", true);
    auto relu_1259 = network->addActivation(*add_1258->getOutput(0), ActivationType::kRELU);

    auto add_1262 = convBnUpAdd(network, weightMap, *relu_1108->getOutput(0), *relu_1136->getOutput(0), width * 2, 3, 2, 1,
                                "stage4.1.fuse_layers.1.0.0.0", "stage4.1.fuse_layers.1.0.0.1", false);
    auto add_1284 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1262->getOutput(0), width * 2, 1, 1, 0,
                                "stage4.1.fuse_layers.1.2.0", "stage4.1.fuse_layers.1.2.1", true);
    auto add_1306 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1284->getOutput(0), width * 2, 1, 1, 0,
                                "stage4.1.fuse_layers.1.3.0", "stage4.1.fuse_layers.1.3.1", true);
    auto relu_1307 = network->addActivation(*add_1306->getOutput(0), ActivationType::kRELU);

    auto relu_1310 = convBnRelu(network, weightMap, *relu_1108->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.2.0.0.0", "stage4.1.fuse_layers.2.0.0.1");
    auto bn_1312 = convBnRelu(network, weightMap, *relu_1310->getOutput(0), width * 4, 3, 2, 1, "stage4.1.fuse_layers.2.0.1.0", "stage4.1.fuse_layers.2.0.1.1", false);
    auto add_1315 = convBnUpAdd(network, weightMap, *relu_1136->getOutput(0), *bn_1312->getOutput(0), width * 4, 3, 2, 1,
                                "stage4.1.fuse_layers.2.1.0.0", "stage4.1.fuse_layers.2.1.0.1", false);
    auto add_1316 = network->addElementWise(*relu_1164->getOutput(0), *add_1315->getOutput(0), ElementWiseOperation::kSUM);
    auto add_1338 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1316->getOutput(0), width * 4, 1, 1, 0,
                                "stage4.1.fuse_layers.2.3.0", "stage4.1.fuse_layers.2.3.1", true);
    auto relu_1339 = network->addActivation(*add_1338->getOutput(0), ActivationType::kRELU);

    auto relu_1342 = convBnRelu(network, weightMap, *relu_1108->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.3.0.0.0", "stage4.1.fuse_layers.3.0.0.1");
    auto relu_1345 = convBnRelu(network, weightMap, *relu_1342->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.3.0.1.0", "stage4.1.fuse_layers.3.0.1.1");
    auto bn_1347 = convBnRelu(network, weightMap, *relu_1345->getOutput(0), width * 8, 3, 2, 1, "stage4.1.fuse_layers.3.0.2.0", "stage4.1.fuse_layers.3.0.2.1", false);
    auto relu_1350 = convBnRelu(network, weightMap, *relu_1136->getOutput(0), width * 2, 3, 2, 1, "stage4.1.fuse_layers.3.1.0.0", "stage4.1.fuse_layers.3.1.0.1");
    auto add_1353 = convBnUpAdd(network, weightMap, *relu_1350->getOutput(0), *bn_1347->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.1.fuse_layers.3.1.1.0", "stage4.1.fuse_layers.3.1.1.1", false);
    auto add_1356 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1353->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.1.fuse_layers.3.2.0.0", "stage4.1.fuse_layers.3.2.0.1", false);
    auto add_1357 = network->addElementWise(*relu_1192->getOutput(0), *add_1356->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_1358 = network->addActivation(*add_1357->getOutput(0), ActivationType::kRELU);

    auto relu_1365 = liteResBlock(network, weightMap, *relu_1259->getOutput(0), width, "stage4.2.branches.0.0");
    auto relu_1372 = liteResBlock(network, weightMap, *relu_1365->getOutput(0), width, "stage4.2.branches.0.1");
    auto relu_1379 = liteResBlock(network, weightMap, *relu_1372->getOutput(0), width, "stage4.2.branches.0.2");
    auto relu_1386 = liteResBlock(network, weightMap, *relu_1379->getOutput(0), width, "stage4.2.branches.0.3");

    auto relu_1393 = liteResBlock(network, weightMap, *relu_1307->getOutput(0), width * 2, "stage4.2.branches.1.0");
    auto relu_1400 = liteResBlock(network, weightMap, *relu_1393->getOutput(0), width * 2, "stage4.2.branches.1.1");
    auto relu_1407 = liteResBlock(network, weightMap, *relu_1400->getOutput(0), width * 2, "stage4.2.branches.1.2");
    auto relu_1414 = liteResBlock(network, weightMap, *relu_1407->getOutput(0), width * 2, "stage4.2.branches.1.3");

    auto relu_1421 = liteResBlock(network, weightMap, *relu_1339->getOutput(0), width * 4, "stage4.2.branches.2.0");
    auto relu_1428 = liteResBlock(network, weightMap, *relu_1421->getOutput(0), width * 4, "stage4.2.branches.2.1");
    auto relu_1435 = liteResBlock(network, weightMap, *relu_1428->getOutput(0), width * 4, "stage4.2.branches.2.2");
    auto relu_1442 = liteResBlock(network, weightMap, *relu_1435->getOutput(0), width * 4, "stage4.2.branches.2.3");

    auto relu_1449 = liteResBlock(network, weightMap, *relu_1358->getOutput(0), width * 8, "stage4.2.branches.3.0");
    auto relu_1456 = liteResBlock(network, weightMap, *relu_1449->getOutput(0), width * 8, "stage4.2.branches.3.1");
    auto relu_1463 = liteResBlock(network, weightMap, *relu_1456->getOutput(0), width * 8, "stage4.2.branches.3.2");
    auto relu_1470 = liteResBlock(network, weightMap, *relu_1463->getOutput(0), width * 8, "stage4.2.branches.3.3");

    auto add_1492 = convBnUpAdd(network, weightMap, *relu_1414->getOutput(0), *relu_1386->getOutput(0), width, 1, 1, 0,
                                "stage4.2.fuse_layers.0.1.0", "stage4.2.fuse_layers.0.1.1", true);
    auto add_1514 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1492->getOutput(0), width, 1, 1, 0,
                                "stage4.2.fuse_layers.0.2.0", "stage4.2.fuse_layers.0.2.1", true);

    auto add_1536 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1514->getOutput(0), width, 1, 1, 0,
                                "stage4.2.fuse_layers.0.3.0", "stage4.2.fuse_layers.0.3.1", true);
    auto relu_1537 = network->addActivation(*add_1536->getOutput(0), ActivationType::kRELU);

    auto add_1540 = convBnUpAdd(network, weightMap, *relu_1386->getOutput(0), *relu_1414->getOutput(0),
                                width * 2, 3, 2, 1, "stage4.2.fuse_layers.1.0.0.0", "stage4.2.fuse_layers.1.0.0.1", false);
    auto add_1562 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1540->getOutput(0),
                                width * 2, 1, 1, 0, "stage4.2.fuse_layers.1.2.0", "stage4.2.fuse_layers.1.2.1", true);
    auto add_1584 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1562->getOutput(0),
                                width * 2, 1, 1, 0, "stage4.2.fuse_layers.1.3.0", "stage4.2.fuse_layers.1.3.1", true);
    auto relu_1585 = network->addActivation(*add_1584->getOutput(0), ActivationType::kRELU);

    auto relu_1588 = convBnRelu(network, weightMap, *relu_1386->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.2.0.0.0", "stage4.2.fuse_layers.2.0.0.1");
    auto bn_1590 = convBnRelu(network, weightMap, *relu_1588->getOutput(0), width * 4, 3, 2, 1, "stage4.2.fuse_layers.2.0.1.0", "stage4.2.fuse_layers.2.0.1.1", false);
    auto add_1593 = convBnUpAdd(network, weightMap, *relu_1414->getOutput(0), *bn_1590->getOutput(0), width * 4, 3, 2, 1,
                                "stage4.2.fuse_layers.2.1.0.0", "stage4.2.fuse_layers.2.1.0.1", false);
    auto add_1594 = network->addElementWise(*relu_1442->getOutput(0), *add_1593->getOutput(0), ElementWiseOperation::kSUM);
    auto add_1616 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1594->getOutput(0), width * 4, 1, 1, 0,
                                "stage4.2.fuse_layers.2.3.0", "stage4.2.fuse_layers.2.3.1", true);
    auto relu_1617 = network->addActivation(*add_1616->getOutput(0), ActivationType::kRELU);

    auto relu_1620 = convBnRelu(network, weightMap, *relu_1386->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.3.0.0.0", "stage4.2.fuse_layers.3.0.0.1");
    auto relu_1623 = convBnRelu(network, weightMap, *relu_1620->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.3.0.1.0", "stage4.2.fuse_layers.3.0.1.1");
    auto bn_1625 = convBnRelu(network, weightMap, *relu_1623->getOutput(0), width * 8, 3, 2, 1, "stage4.2.fuse_layers.3.0.2.0", "stage4.2.fuse_layers.3.0.2.1", false);
    auto relu_1628 = convBnRelu(network, weightMap, *relu_1414->getOutput(0), width * 2, 3, 2, 1, "stage4.2.fuse_layers.3.1.0.0", "stage4.2.fuse_layers.3.1.0.1");
    auto add_1631 = convBnUpAdd(network, weightMap, *relu_1628->getOutput(0), *bn_1625->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.2.fuse_layers.3.1.1.0", "stage4.2.fuse_layers.3.1.1.1", false);
    auto add_1634 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1631->getOutput(0), width * 8, 3, 2, 1,
                                "stage4.2.fuse_layers.3.2.0.0", "stage4.2.fuse_layers.3.2.0.1", false);
    auto add_1635 = network->addElementWise(*relu_1470->getOutput(0), *add_1634->getOutput(0), ElementWiseOperation::kSUM);
    auto relu_1636 = network->addActivation(*add_1635->getOutput(0), ActivationType::kRELU);

    nvinfer1::Dims dim = relu_1537->getOutput(0)->getDimensions();
    dim.d[0] = relu_1585->getOutput(0)->getDimensions().d[0];
    auto resize_1655 = netAddUpsampleBi(network, relu_1585->getOutput(0), dim);
    dim.d[0] = relu_1617->getOutput(0)->getDimensions().d[0];
    auto resize_1668 = netAddUpsampleBi(network, relu_1617->getOutput(0), dim);
    dim.d[0] = relu_1636->getOutput(0)->getDimensions().d[0];
    auto resize_1681 = netAddUpsampleBi(network, relu_1636->getOutput(0), dim);

    ITensor *concatTensors[] = {relu_1537->getOutput(0), resize_1655->getOutput(0), resize_1668->getOutput(0), resize_1681->getOutput(0)};
    auto concat_1682 = network->addConcatenation(concatTensors, 4);
    concat_1682->setAxis(0);
    auto relu_1685 = convBnRelu(network, weightMap, *concat_1682->getOutput(0), width * 15, 1, 1, 0, "aux_head.0", "aux_head.1", true, true);
    auto conv_1686 = network->addConvolutionNd(*relu_1685->getOutput(0), NUM_CLASSES, DimsHW{1, 1}, weightMap["aux_head.3.weight"], weightMap["aux_head.3.bias"]);
    conv_1686->setStrideNd(DimsHW{1, 1});
    conv_1686->setPaddingNd(DimsHW{0, 0});
    auto reshape_1701 = network->addShuffle(*conv_1686->getOutput(0));
    nvinfer1::Dims reshape_dim;
    reshape_dim.nbDims = 2;
    reshape_dim.d[0] = NUM_CLASSES;
    reshape_dim.d[1] = -1;
    reshape_1701->setReshapeDimensions(reshape_dim);

    auto softmax_1714 = network->addSoftMax(*reshape_1701->getOutput(0));
    softmax_1714->setAxes(2);

    auto relu_1689 = convBnRelu(network, weightMap, *concat_1682->getOutput(0), 512, 3, 1, 1, "conv3x3_ocr.0", "conv3x3_ocr.1", true, true);

    auto reshape_1710 = network->addShuffle(*relu_1689->getOutput(0));
    nvinfer1::Dims reshape_dim1;
    reshape_dim1.nbDims = 2;
    reshape_dim1.d[0] = 512;
    reshape_dim1.d[1] = -1;
    reshape_1710->setReshapeDimensions(reshape_dim1);
    nvinfer1::Permutation permutation1;
    permutation1.order[0] = 1;
    permutation1.order[1] = 0;
    reshape_1710->setSecondTranspose(permutation1);

    auto matmul_1715 = network->addMatrixMultiply(*softmax_1714->getOutput(0), MatrixOperation::kNONE,
                                                  *reshape_1710->getOutput(0), MatrixOperation::kNONE);

    auto transpose_1716 = network->addShuffle(*matmul_1715->getOutput(0));
    nvinfer1::Permutation permutation2;
    permutation2.order[0] = 1;
    permutation2.order[1] = 0;
    transpose_1716->setFirstTranspose(permutation2);

    auto unsqueeze_1717 = network->addShuffle(*transpose_1716->getOutput(0));
    nvinfer1::Dims reshape_dim3;
    reshape_dim3.nbDims = 3;
    reshape_dim3.d[0] = 512;
    reshape_dim3.d[1] = NUM_CLASSES;
    reshape_dim3.d[2] = 1;
    unsqueeze_1717->setReshapeDimensions(reshape_dim3);

    auto relu_1737 = convBnRelu(network, weightMap, *unsqueeze_1717->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_object.0", "ocr_distri_head.object_context_block.f_object.1.0", true, true);

    auto relu_1740 = convBnRelu(network, weightMap, *relu_1737->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_object.2", "ocr_distri_head.object_context_block.f_object.3.0", true, true);

    auto reshape_1747 = network->addShuffle(*relu_1740->getOutput(0));
    nvinfer1::Dims reshape_dim4;
    reshape_dim4.nbDims = 2;
    reshape_dim4.d[0] = 256;
    reshape_dim4.d[1] = -1;
    reshape_1747->setReshapeDimensions(reshape_dim4);

    auto relu_1723 = convBnRelu(network, weightMap, *relu_1689->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_pixel.0", "ocr_distri_head.object_context_block.f_pixel.1.0", true, true);
    auto relu_1726 = convBnRelu(network, weightMap, *relu_1723->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_pixel.2", "ocr_distri_head.object_context_block.f_pixel.3.0", true, true);

    auto reshape_1733 = network->addShuffle(*relu_1726->getOutput(0));
    nvinfer1::Dims reshape_dim5;
    reshape_dim5.nbDims = 2;
    reshape_dim5.d[0] = 256;
    reshape_dim5.d[1] = -1;
    reshape_1733->setReshapeDimensions(reshape_dim5);
    nvinfer1::Permutation permutation3;
    permutation3.order[0] = 1;
    permutation3.order[1] = 0;
    reshape_1733->setSecondTranspose(permutation3);

    auto matmul_1759 = network->addMatrixMultiply(*reshape_1733->getOutput(0), MatrixOperation::kNONE, *reshape_1747->getOutput(0), MatrixOperation::kNONE);
    nvinfer1::Dims constant_dim;
    constant_dim.nbDims = 2;
    int allNum = INPUT_H * INPUT_W / 16;
    constant_dim.d[0] = INPUT_H * INPUT_W / 16;
    constant_dim.d[1] = 1;
    Weights wgt{DataType::kFLOAT, nullptr, allNum};
    float *w = new float[allNum];
    for (int i = 0; i < allNum; i++)
    {
        w[i] = 0.0625;
    }
    wgt.values = w;
    auto constant_1761 = network->addConstant(constant_dim, wgt);

    auto mul_1761 = network->addElementWise(*constant_1761->getOutput(0), *matmul_1759->getOutput(0), ElementWiseOperation::kPROD);

    auto softmax_1762 = network->addSoftMax(*mul_1761->getOutput(0));
    softmax_1762->setAxes(2);

    auto relu_1750 = convBnRelu(network, weightMap, *unsqueeze_1717->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_down.0", "ocr_distri_head.object_context_block.f_down.1.0", true, true);

    auto reshape_1757 = network->addShuffle(*relu_1750->getOutput(0));
    nvinfer1::Dims reshape_dim6;
    reshape_dim6.nbDims = 2;
    reshape_dim6.d[0] = 256;
    reshape_dim6.d[1] = -1;
    reshape_1757->setReshapeDimensions(reshape_dim6);
    nvinfer1::Permutation permutation4;
    permutation4.order[0] = 1;
    permutation4.order[1] = 0;
    reshape_1757->setSecondTranspose(permutation4);

    auto matmul_1763 = network->addMatrixMultiply(*softmax_1762->getOutput(0), MatrixOperation::kNONE, *reshape_1757->getOutput(0), MatrixOperation::kNONE);

    auto reshape_1777 = network->addShuffle(*matmul_1763->getOutput(0));
    nvinfer1::Dims reshape_dim7;
    reshape_dim7.nbDims = 3;
    reshape_dim7.d[0] = 256;
    reshape_dim7.d[1] = INPUT_H / 4;
    reshape_dim7.d[2] = INPUT_W / 4;
    reshape_1777->setReshapeDimensions(reshape_dim7);
    nvinfer1::Permutation permutation5;
    permutation5.order[0] = 1;
    permutation5.order[1] = 0;
    reshape_1777->setFirstTranspose(permutation5);

    auto relu_1780 = convBnRelu(network, weightMap, *reshape_1777->getOutput(0), 512, 1, 1, 0, "ocr_distri_head.object_context_block.f_up.0", "ocr_distri_head.object_context_block.f_up.1.0", true, true);

    ITensor *concatTensors1[] = {relu_1780->getOutput(0), relu_1689->getOutput(0)};
    auto concat_1781 = network->addConcatenation(concatTensors1, 2);

    auto relu_1784 = convBnRelu(network, weightMap, *concat_1781->getOutput(0), 512, 1, 1, 0, "ocr_distri_head.conv_bn_dropout.0", "ocr_distri_head.conv_bn_dropout.1.0", true, true);

    auto conv_1785 = network->addConvolutionNd(*relu_1784->getOutput(0), NUM_CLASSES, DimsHW{1, 1}, weightMap["cls_head.weight"], weightMap["cls_head.bias"]);
    debug_print(conv_1785->getOutput(0), "cls_head");
    dim.nbDims = 3;
    dim.d[0] = NUM_CLASSES;
    dim.d[1] = INPUT_H;
    dim.d[2] = INPUT_W;
    auto feature_map = netAddUpsampleBi(network, conv_1785->getOutput(0), dim);
    debug_print(feature_map->getOutput(0), "upsample");
    auto topk = network->addTopK(*feature_map->getOutput(0), TopKOperation::kMAX, 1, 0X01);

    debug_print(topk->getOutput(0), "topk");

    std::cout << "set name out" << std::endl;
    topk->getOutput(1)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*topk->getOutput(1));
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize((1 << 30)); // 1G
#ifdef USE_FP16
    std::cout << "use fp16" << std::endl;
    config->setFlag(BuilderFlag::kFP16);
#endif
    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build success!" << std::endl;
    network->destroy();
    for (auto &mem : weightMap)
    {
        free((void *)(mem.second.values));
    }
    return engine;
}
/**
 * Build the network with the TensorRT builder API and serialize the engine.
 *
 * @param maxBatchSize  maximum batch size forwarded to createEngine
 * @param modelStream   out-parameter; receives the serialized engine, or
 *                      nullptr when the engine could not be built
 * @param wtsPath       path of the .wts weight file forwarded to createEngine
 * @param width         width argument forwarded to createEngine
 *
 * The builder, the builder config and the engine are all released before
 * returning; the caller owns *modelStream and must destroy() it.
 */
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream, std::string wtsPath, int width)
{
    IBuilder *builder = createInferBuilder(gLogger);
    IBuilderConfig *config = builder->createBuilderConfig();
    ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath, width);
    assert(engine != nullptr);
    if (engine == nullptr)
    {
        // assert() is compiled out with NDEBUG; never dereference a null engine.
        std::cerr << "failed to build the TensorRT engine" << std::endl;
        (*modelStream) = nullptr;
    }
    else
    {
        (*modelStream) = engine->serialize();
        engine->destroy();
    }
    // The config object was previously leaked.
    config->destroy();
    builder->destroy();
}

/**
 * Parse the command line.
 *
 *   -s <wts> <engine> <width>   serialize mode: fills wts, engine and width
 *   -d <engine> <img_dir>       inference mode: fills engine and img_dir
 *
 * @return true when the arguments match one of the two forms above; false
 *         otherwise (including a missing mode flag or a width that is not
 *         an integer). Output parameters are only written on success.
 */
bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, int &width, std::string &img_dir)
{
    // argv[1] must exist before it is inspected; building a std::string
    // from a null pointer is undefined behaviour.
    if (argc < 2 || argv == nullptr || argv[1] == nullptr)
    {
        return false;
    }

    const std::string mode(argv[1]);
    if (mode == "-s" && argc == 5)
    {
        int parsedWidth = 0;
        try
        {
            parsedWidth = std::stoi(argv[4]);
        }
        catch (const std::exception &)
        {
            // Not a number / out of range: report it as a usage error
            // instead of terminating on an uncaught exception.
            return false;
        }
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        width = parsedWidth;
        return true;
    }
    if (mode == "-d" && argc == 4)
    {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        return true;
    }
    return false;
}
/**
 * Enqueue one inference on `stream` and block until it has finished.
 *
 * @param context    execution context used to enqueue the work
 * @param stream     CUDA stream the work is enqueued on
 * @param buffers    binding pointers passed straight to enqueue()
 * @param batchSize  batch size passed straight to enqueue()
 *
 * A failed enqueue is reported on stderr instead of being silently ignored;
 * the synchronisation calls still run so the caller's timing/flow is unchanged.
 */
void doInference(IExecutionContext &context, cudaStream_t &stream, void **buffers, int batchSize)
{
    const bool enqueued = context.enqueue(batchSize, buffers, stream, nullptr);
    if (!enqueued)
    {
        std::cerr << "TensorRT enqueue failed, output buffer is not valid" << std::endl;
    }
    cudaStreamSynchronize(stream);
    cudaDeviceSynchronize();
}

/**
 * Entry point.
 *
 *   ./hrnet_ocr -s [.wts] [.engine] [width]   build the engine and write the plan file
 *   ./hrnet_ocr -d [.engine] [image dir]      load the plan file and run inference on
 *                                             every image found in the directory
 *
 * In inference mode two PNG files are written per image into the current
 * directory: "<index>_false_color_map.png" and "<index>_fusion_img.png".
 *
 * @return 0 on success, -1 on bad arguments or on an I/O failure.
 */
int main(int argc, char **argv)
{
    cudaSetDevice(DEVICE);
    std::string wtsPath = "";
    std::string engine_name = "";
    int width = 0; // only written/used in "-s" mode
    std::string img_dir;
    // parse args; argc is checked here too so argv[1] is never touched when absent
    if (argc < 2 || !parse_args(argc, argv, wtsPath, engine_name, width, img_dir))
    {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./hrnet_ocr -s [.wts] [.engine] [18 or 32 or 48]  // serialize model to plan file" << std::endl;
        std::cerr << "./hrnet_ocr -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }
    // create a model using the API directly and serialize it to a stream
    if (!wtsPath.empty())
    {
        IHostMemory *modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream, wtsPath, width);
        assert(modelStream != nullptr);
        std::ofstream p(engine_name, std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char *>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }

    // deserialize the .engine and run inference
    // A vector owns the bytes so every early return below releases them.
    std::vector<char> trtModelStream;
    {
        std::ifstream file(engine_name, std::ios::binary);
        if (!file.good())
        {
            // Continuing would hand a null buffer to the deserializer.
            std::cerr << "could not open plan file" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        const long long fileSize = static_cast<long long>(file.tellg());
        file.seekg(0, file.beg);
        if (fileSize <= 0)
        {
            std::cerr << "plan file is empty or unreadable" << std::endl;
            return -1;
        }
        trtModelStream.resize(static_cast<size_t>(fileSize));
        file.read(trtModelStream.data(), fileSize);
        if (!file)
        {
            std::cerr << "could not read plan file" << std::endl;
            return -1;
        }
    }

    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0)
    {
        std::cout << "read_files_in_dir failed." << std::endl;
        return -1;
    }
    // prepare input data ---------------------------
    cudaSetDeviceFlags(cudaDeviceMapHost);
    float *data;
    int *prob; // using int. output is index
    CHECK(cudaHostAlloc((void **)&data, BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float), cudaHostAllocMapped));
    CHECK(cudaHostAlloc((void **)&prob, BATCH_SIZE * OUTPUT_SIZE * sizeof(int), cudaHostAllocMapped));

    IRuntime *runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream.data(), trtModelStream.size());
    assert(engine != nullptr);
    IExecutionContext *context = engine->createExecutionContext();
    assert(context != nullptr);
    // The serialized bytes are no longer needed once the engine exists.
    std::vector<char>().swap(trtModelStream);

    void *buffers[2] = {nullptr, nullptr};
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // The mapped host allocations never move, so their device aliases are
    // resolved once instead of on every iteration.
    CHECK(cudaHostGetDevicePointer((void **)&buffers[inputIndex], (void *)data, 0));  // buffers[inputIndex]-->data
    CHECK(cudaHostGetDevicePointer((void **)&buffers[outputIndex], (void *)prob, 0)); // buffers[outputIndex] --> prob

    for (int f = 0; f < (int)file_names.size(); f++)
    {
        std::cout << file_names[f] << std::endl;
        cv::Mat pr_img;
        cv::Mat img_BGR = cv::imread(img_dir + "/" + file_names[f], 1); // BGR
        // Must be tested before cvtColor, which throws on an empty Mat.
        if (img_BGR.empty())
            continue;
        cv::Mat img;
        cv::cvtColor(img_BGR, img, cv::COLOR_BGR2RGB);
        cv::resize(img, pr_img, cv::Size(INPUT_W, INPUT_H));
        img = pr_img.clone(); // for img show
        pr_img.convertTo(pr_img, CV_32FC3);
        if (!pr_img.isContinuous())
        {
            pr_img = pr_img.clone();
        }
        // pr_img holds exactly one image; copy only that many bytes so the
        // read stays inside the Mat even if BATCH_SIZE is larger than 1.
        std::memcpy(data, pr_img.data, 3 * INPUT_W * INPUT_H * sizeof(float));

        // Run inference
        auto start = std::chrono::high_resolution_clock::now();
        doInference(*context, stream, buffers, BATCH_SIZE);
        auto end = std::chrono::high_resolution_clock::now();
        std::cout << "infer time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        // prob holds one int per pixel; narrow it into an 8-bit single channel image
        cv::Mat outimg(INPUT_H, INPUT_W, CV_8UC1);
        for (int row = 0; row < INPUT_H; ++row)
        {
            uchar *uc_pixel = outimg.data + row * outimg.step;
            for (int col = 0; col < INPUT_W; ++col)
            {
                uc_pixel[col] = (uchar)prob[row * INPUT_W + col];
            }
        }
        cv::Mat im_color;
        cv::cvtColor(outimg, im_color, cv::COLOR_GRAY2RGB);
        cv::Mat lut = createLTU(NUM_CLASSES);
        cv::LUT(im_color, lut, im_color);
        // false color
        cv::cvtColor(im_color, im_color, cv::COLOR_RGB2GRAY);
        cv::applyColorMap(im_color, im_color, cv::COLORMAP_HOT);
        // cv::imshow("False Color Map", im_color);
        cv::imwrite(std::to_string(f) + "_false_color_map.png", im_color);
        //fusion
        cv::Mat fusionImg;
        cv::addWeighted(img, 1, im_color, 0.8, 1, fusionImg);
        // cv::imshow("Fusion Img", fusionImg);
        // cv::waitKey(0);
        cv::imwrite(std::to_string(f) + "_fusion_img.png", fusionImg);
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    // cudaFreeHost takes the pointers returned by cudaHostAlloc, not the
    // device aliases stored in buffers[].
    CHECK(cudaFreeHost(data));
    CHECK(cudaFreeHost(prob));
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}


================================================
FILE: hrnet/hrnet-semantic-segmentation/hrnet_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences for hrnet.
"""
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from imgaug import augmenters as iaa

def get_img_path_batches(batch_size, img_dir):
    """
    Walk ``img_dir`` recursively and group every file path into batches.

    param:
        batch_size: number of paths per batch
        img_dir:    directory to walk
    return:
        list of lists of paths; every batch except possibly the last one
        holds ``batch_size`` paths
    """
    batches = []
    current = []
    # Flatten the walk into a single lazy stream of full paths.
    all_paths = (
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(img_dir)
        for file_name in file_names
    )
    for path in all_paths:
        # Close the running batch right before it would overflow.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches

class Hrnet_TRT(object):
    """
    description: A Hrnet class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and
                     allocate one host/device buffer pair per binding.
        param:
            engine_file_path: path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.cfx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        runtime = trt.Runtime(trt.Logger(trt.Logger.INFO))
        assert runtime

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # NOTE(review): indexing [-2]/[-3] presumes the input binding
                # is laid out (H, W, C) -- confirm against the engine.
                self.input_w = engine.get_binding_shape(binding)[-2]
                self.input_h = engine.get_binding_shape(binding)[-3]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, image_raw):
        """
        description: Run one image through the engine.
        param:
            image_raw: numpy image; its first two shape entries are used as
                       the (height, width) the result is resized back to
        return:
            output:   uint8 array resized to the height/width of image_raw
            use_time: seconds spent on host-to-device copy, execution and
                      device-to-host copy
        """
        # Make self the active context, pushing it on top of the context stack.
        self.cfx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            print('ori_shape: ', image_raw.shape)
            # if image_raw is constant, image_raw.shape[1] != self.input_w
            w_ori, h_ori = image_raw.shape[1], image_raw.shape[0]
            # Do image preprocess
            input_image = self.preprocess_image(image_raw)
            # Copy input image to host buffer
            np.copyto(host_inputs[0], input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in `finally` so a failure above cannot leave the stack unbalanced.
            self.cfx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        output = output.reshape(self.input_h, self.input_w).astype('uint8')
        print('output_shape: ', output.shape)
        output = cv2.resize(output, (w_ori, h_ori))
        return output, end - start

    def destroy(self):
        """
        description: Pop the context that was created in __init__.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.cfx.pop()

    def preprocess_image(self, image_raw):
        """
        description: Convert a BGR image to RGB, resize it to the engine
                     input size and cast it to float32.
        param:
            image_raw: numpy, raw image
        return:
            image:  the processed image
        """
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        resize = iaa.Resize({
            'width': self.input_w,
            'height': self.input_h
        })
        image = resize.augment_image(image)
        print('resized', image.shape, image.dtype)
        image = image.astype(np.float32)
        return image

    def get_raw_image(self, image_path_batch):
        """
        description: Read the first image of the batch from disk.
                     Only the first path is used; an empty batch yields None.
        """
        for img_path in image_path_batch:
            return cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup: one all-zero uint8 image of the
                     engine input size (None when batch_size is 0).
        """
        for _ in range(self.batch_size):
            return np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)


class inferThread(threading.Thread):
    """Thread that runs inference for one batch of image paths and writes the result to output/."""

    def __init__(self, hrnet_wrapper, image_path_batch):
        super().__init__()
        self.hrnet_wrapper = hrnet_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.hrnet_wrapper
        raw_image = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(raw_image)
        for img_path in self.image_path_batch:
            # Keep the original file name, but place it under output/.
            filename = os.path.basename(img_path)
            save_name = os.path.join('output', filename)
            # Save image
            cv2.imwrite(save_name, batch_image_raw*255)
        elapsed_ms = use_time * 1000
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, elapsed_ms))


class warmUpThread(threading.Thread):
    """Thread that runs a single warm-up inference on the wrapper's all-zero image."""

    def __init__(self, hrnet_wrapper):
        super().__init__()
        self.hrnet_wrapper = hrnet_wrapper

    def run(self):
        wrapper = self.hrnet_wrapper
        zero_image = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(zero_image)
        first_row_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_row_shape, use_time * 1000))


if __name__ == "__main__":
    # Engine file to load; a path given as the first CLI argument overrides the default.
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "build/hrnet.engine"

    # Start every run from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a hrnet instance
    hrnet_wrapper = Hrnet_TRT(engine_file_path)
    try:
        print('batch size is', hrnet_wrapper.batch_size)  # batch size is set to 1!

        image_dir = "samples/"
        image_path_batches = get_img_path_batches(hrnet_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread, executed one after another.
        for _ in range(10):
            warm_up_worker = warmUpThread(hrnet_wrapper)
            warm_up_worker.start()
            warm_up_worker.join()

        # One inference thread per batch, again run sequentially.
        for batch in image_path_batches:
            infer_worker = inferThread(hrnet_wrapper, batch)
            infer_worker.start()
            infer_worker.join()
    finally:
        # destroy the instance
        hrnet_wrapper.destroy()


================================================
FILE: hrnet/hrnet-semantic-segmentation/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes its contents to a target stream preceded by a
//!  timestamp and a fixed prefix. Nothing is written while logging is disabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    Stream that receives the formatted messages (held by reference).
    //! \param prefix    Text inserted between the timestamp and each message.
    //! \param shouldLog When false, putOutput() emits nothing.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Takes over the target stream, prefix and logging flag of \p other.
    //! Characters still buffered in \p other are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog) // previously left uninitialized, then read in putOutput()
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer contents>" to the target stream, then empties the
    //! buffer and flushes the stream. Does nothing while logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so both stay together
            // when the target is not std::cout
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                // the fill character is sticky, so restore it once the timestamp is written
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // target stream
    std::string mPrefix;   // text written before every message
    bool mShouldLog;       // whether putOutput() emits anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
protected:
    // Buffer owned by this base so that it exists before any later base class is constructed.
    LogStreamConsumerBuffer mBuffer;

public:
    // Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Builds a consumer for messages of the given \p severity.
    //!  Output is enabled only when \p severity <= \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the already-constructed buffer to this stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the same severity and logging flag as \p other.
    //!  A fresh buffer is created; nothing buffered in \p other is carried over.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the already-constructed buffer to this stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity,
    //!  and pushes the result down to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! \brief std::cout for kINFO and numerically greater severities, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Bracketed one-letter tag for the severity; asserts on an unknown value.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            return "[F] ";
        case Severity::kERROR:
            return "[E] ";
        case Severity::kWARNING:
            return "[W] ";
        case Severity::kINFO:
            return "[I] ";
        case Severity::kVERBOSE:
            return "[V] ";
        default:
            assert(0);
            return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //! \param severity Initial reportable severity; defaults to Severity::kWARNING.
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! Declared noexcept so that it can override ILogger::log() in TensorRT releases that declare the base
    //! method noexcept; a noexcept override is also valid when the base declaration is not noexcept.
    //! A null \p msg is logged as an empty message.
    //!
    void log(Severity severity, const char* msg) noexcept override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << (msg != nullptr ? msg : "") << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set by Logger::reportTestStart()
        std::string mName;    // test name printed in result lines
        std::string mCmdline; // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently used to filter messages
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints "&&&& <RESULT> <name> # <cmdline>" on the stream selected for Severity::kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces, in order.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief Returns a LogStreamConsumer that logs at severity kVERBOSE, filtered by the logger's
//!        reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kINFO, filtered by the logger's
//!        reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kWARNING, filtered by the logger's
//!        reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kERROR, filtered by the logger's
//!        reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kINTERNAL_ERROR ("fatal" severity),
//!        filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: ibnnet/CMakeLists.txt
================================================
# Build script for the ibnnet executable (TensorRT + CUDA + OpenCV).

# CMake 4.0 and newer reject a minimum version below 3.5.
cmake_minimum_required(VERSION 3.5)

project(IBNNet)

add_definitions(-std=c++11)

# option() takes <variable> "<help text>" [value]; without the help string, "OFF" was read as the help text.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Every header and source file in this directory is part of the single executable.
file(GLOB SOURCE_FILES "*.h" "*.cpp")

add_executable(ibnnet ${SOURCE_FILES})
target_link_libraries(ibnnet nvinfer)
target_link_libraries(ibnnet cudart)
target_link_libraries(ibnnet ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: ibnnet/InferenceEngine.cpp
================================================
#include "InferenceEngine.h"

namespace trt {

   // Builds a ready-to-run engine from the serialized model in enginecfg: selects the device,
   // deserializes the engine, creates the execution context, and allocates the host buffers,
   // device buffers and CUDA stream, all sized for max_batch_size.
   InferenceEngine::InferenceEngine(const EngineConfig &enginecfg): _engineCfg(enginecfg) { 

        assert(_engineCfg.max_batch_size > 0);

        CHECK(cudaSetDevice(_engineCfg.device_id));

        _runtime = make_holder(nvinfer1::createInferRuntime(gLogger));
        assert(_runtime);

        _engine = make_holder(_runtime->deserializeCudaEngine(_engineCfg.trtModelStream.get(), _engineCfg.stream_size)); 
        assert(_engine);

        _context = make_holder(_engine->createExecutionContext());
        assert(_context);

        // Byte sizes of a full max_batch_size batch; reduced to per-sample sizes at the end.
        _inputSize = _engineCfg.max_batch_size * 3 * _engineCfg.input_h * _engineCfg.input_w * _depth;
        _outputSize = _engineCfg.max_batch_size * _engineCfg.output_size * _depth; 

        CHECK(cudaMallocHost((void**)&_data, _inputSize));
        CHECK(cudaMallocHost((void**)&_prob, _outputSize));

        // Create the stream first, so the deleter is only ever attached to a created handle.
        cudaStream_t stream{};
        CHECK(cudaStreamCreate(&stream));
        _streamptr = std::shared_ptr<cudaStream_t>( new cudaStream_t(stream), 
            [](cudaStream_t* ptr){ 
                // Check for null before dereferencing, not after.
                if(ptr != nullptr){ 
                    cudaStreamDestroy(*ptr);
                    delete ptr;
                } 
            });

        // Pointers to input and output device buffers to pass to engine.
        // Engine requires exactly IEngine::getNbBindings() number of buffers.
        assert(_engine->getNbBindings() == 2);

        // In order to bind the buffers, we need to know the names of the input and output tensors.
        _inputIndex = _engine->getBindingIndex(_engineCfg.input_name);
        _outputIndex = _engine->getBindingIndex(_engineCfg.output_name);

        // getBindingIndex() returns -1 for an unknown tensor name; reject that (and any other
        // value that does not fit _buffers) before the indices are used to subscript the array.
        assert(_inputIndex >= 0 && _inputIndex < 2);
        assert(_outputIndex >= 0 && _outputIndex < 2);
        assert(_inputIndex != _outputIndex);
        
        // Create GPU buffers on device
        CHECK(cudaMalloc(&_buffers[_inputIndex], _inputSize));
        CHECK(cudaMalloc(&_buffers[_outputIndex], _outputSize));

        // From here on the sizes are per-sample; doInference() scales them by the actual batch size.
        _inputSize /= _engineCfg.max_batch_size;
        _outputSize /= _engineCfg.max_batch_size; 

    }

    bool InferenceEngine::doInference(const int inference_batch_size, std::function<void(float*)> preprocessing) {
        assert(inference_batch_size <= _engineCfg.max_batch_size);
        preprocessing(_data);
        CHECK(cudaSetDevice(_engineCfg.device_id));
        CHECK(cudaMemcpyAsync(_buffers[_inputIndex], _data, inference_batch_size * _inputSize, cudaMemcpyHostToDevice, *_streamptr));
        auto status = _context->enqueue(inference_batch_size, _buffers, *_streamptr, nullptr);
        CHECK(cudaMemcpyAsync(_prob, _buffers[_outputIndex], inference_batch_size * _outputSize, cudaMemcpyDeviceToHost, *_streamptr));
        CHECK(cudaStreamSynchronize(*_streamptr));
        return status;
    }

    // Move constructor: takes over the buffers, TensorRT objects and stream of `other`.
    // The configuration, indices and sizes are copied; every owning pointer in `other` is left null.
    InferenceEngine::InferenceEngine(InferenceEngine &&other) noexcept: 
        _engineCfg(other._engineCfg)
        , _data(other._data)
        , _prob(other._prob)
        , _inputIndex(other._inputIndex) 
        , _outputIndex(other._outputIndex)
        , _inputSize(other._inputSize) 
        , _outputSize(other._outputSize)
        , _runtime(std::move(other._runtime))
        , _engine(std::move(other._engine))
        , _context(std::move(other._context))
        , _streamptr(std::move(other._streamptr)) { 

        // Hand over both device buffer slots and clear them in the source.
        for (int slot = 0; slot < 2; ++slot) {
            _buffers[slot] = other._buffers[slot];
            other._buffers[slot] = nullptr;
        }

        // The host buffers were copied in the initializer list; only the source needs clearing.
        other._data = nullptr;
        other._prob = nullptr;
    } 

    // Frees the two host buffers (_data, _prob) allocated with cudaMallocHost and the two
    // device buffers allocated with cudaMalloc. For an instance that has been moved from,
    // all four pointers are null at this point.
    InferenceEngine::~InferenceEngine() {  
        CHECK(cudaFreeHost(_data));
        CHECK(cudaFreeHost(_prob));
        CHECK(cudaFree(_buffers[_inputIndex]));
        CHECK(cudaFree(_buffers[_outputIndex]));
    }
}

================================================
FILE: ibnnet/InferenceEngine.h
================================================
/**************************************************************************
 * Handle memory pre-alloc
 * both on host(pinned memory, allow CUDA DMA) & device
*************************************************************************/

#pragma once

#include <thread>
#include <chrono>
#include <memory>
#include <functional>
#include <opencv2/opencv.hpp>

#include "utils.h"
#include "holder.h"
#include "logging.h"
#include "NvInfer.h"
#include "cuda_runtime_api.h"
// Logger handed to nvinfer1::createInferRuntime(). It is declared `static` in this header,
// so every translation unit that includes the header gets its own instance.
static Logger gLogger;

namespace trt {

    // Settings InferenceEngine needs to deserialize a model and size its buffers.
    struct EngineConfig {
        // Name of the input tensor, used to look up its binding index.
        const char* input_name;
        // Name of the output tensor, used to look up its binding index.
        const char* output_name; 
        // Serialized engine bytes passed to deserializeCudaEngine().
        std::shared_ptr<char> trtModelStream;
        int max_batch_size; /* create engine */
        // Input height and width; the input buffer holds 3 * input_h * input_w floats per sample.
        int input_h;  
        int input_w;
        // Number of floats in the output per sample.
        int output_size;
        // Size passed to deserializeCudaEngine() together with trtModelStream.
        int stream_size;
        // CUDA device selected with cudaSetDevice().
        int device_id;
    };

    // Owns a deserialized TensorRT engine together with pre-allocated host and device buffers
    // and a CUDA stream. Move-constructible only: copying and all assignment are disabled.
    class InferenceEngine {

    public:
        // Deserializes the engine described by enginecfg and allocates all buffers.
        InferenceEngine(const EngineConfig &enginecfg);
        // Takes over the resources of other.
        InferenceEngine(InferenceEngine &&other) noexcept;
        // Frees the host and device buffers.
        ~InferenceEngine();

        InferenceEngine(const InferenceEngine &) = delete;
        InferenceEngine& operator=(const InferenceEngine &) = delete;
        InferenceEngine& operator=(InferenceEngine && other) = delete;

        // Calls preprocessing with the host input buffer, runs the engine for
        // inference_batch_size samples and waits for the result; returns the enqueue status.
        bool doInference(const int inference_batch_size, std::function<void(float*)> preprocessing);
        // Host output buffer written by doInference().
        float* getOutput() { return _prob; }
        // Id of the thread that calls this function.
        std::thread::id getThreadID() { return std::this_thread::get_id(); }

    private:
        EngineConfig _engineCfg;
        // Host input buffer (allocated with cudaMallocHost).
        float* _data{nullptr};
        // Host output buffer (allocated with cudaMallocHost).
        float* _prob{nullptr};

        // Pointers to input and output device buffers to pass to engine.
        // Engine requires exactly IEngine::getNbBindings() number of buffers.
        void* _buffers[2];

        // In order to bind the buffers, we need to know the names of the input and output tensors.
        // Note that indices are guaranteed to be less than IEngine::getNbBindings()
        int _inputIndex;
        int _outputIndex;

        // Buffer sizes in bytes; per-sample once the constructor has finished.
        int _inputSize;
        int _outputSize;

        // Size in bytes of one buffer element.
        static constexpr std::size_t _depth{sizeof(float)};

        // Declared runtime -> engine -> context, so they are destroyed in the reverse order.
        TensorRTHolder<nvinfer1::IRuntime> _runtime{nullptr};
        TensorRTHolder<nvinfer1::ICudaEngine> _engine{nullptr};
        TensorRTHolder<nvinfer1::IExecutionContext> _context{nullptr};
        // CUDA stream used for the copies and the enqueue in doInference().
        std::shared_ptr<cudaStream_t> _streamptr;
    };

}


================================================
FILE: ibnnet/README.md
================================================
# IBN-Net

An implementation of IBN-Net, proposed in ["Two at Once: Enhancing Learning and Generalization Capacities via IBN-Net"](https://arxiv.org/abs/1807.09441), ECCV2018 by Xingang Pan, Ping Luo, Jianping Shi, Xiaoou Tang. 

For the PyTorch implementation, refer to [IBN-Net](https://github.com/XingangPan/IBN-Net).

## Features
- InstanceNorm2d
- bottleneck_ibn
- Resnet50-IBNA
- Resnet50-IBNB
- Multi-thread inference

## How to Run

* 1. generate .wts

  // for ibn-a
  ```
  python gen_wts.py a
  ```
  a file 'resnet50-ibna.wts' will be generated.

  // for ibn-b
  ```
  python gen_wts.py b
  ```
  a file 'resnet50-ibnb.wts' will be generated.
* 2. cmake and make

  ```
  mkdir build
  cd build
  cmake ..
  make
  ```
* 3. build engine and run classification

  // put resnet50-ibna.wts/resnet50-ibnb.wts into tensorrtx/ibnnet
  
  // go to tensorrtx/ibnnet/build (the executable is built there and loads the .wts file from the parent directory)
  ```
  ./ibnnet -s  // serialize model to plan file
  ./ibnnet -d  // deserialize plan file and run inference
  ```
  

================================================
FILE: ibnnet/gen_wts.py
================================================
import torch
import os
import sys
import struct


# Usage: python gen_wts.py {a|b}
# Exports the pretrained ResNet50-IBN-a / ResNet50-IBN-b weights from torch.hub to a text .wts file.
if len(sys.argv) < 2 or sys.argv[1] not in ("a", "b"):
    sys.exit("usage: python gen_wts.py {a|b}")
variant = sys.argv[1]
model_name = "resnet50_ibn_" + variant
# File name that the README documents and that ibnnet.cpp loads ("../resnet50-ibn<variant>.wts").
wts_path = "resnet50-ibn{}.wts".format(variant)

# Use the GPU when one is present; the weights are copied back to the CPU before being written anyway.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
net = torch.hub.load('XingangPan/IBN-Net', model_name, pretrained=True).to(device).eval()

#verify
#input = torch.ones(1, 3, 224, 224).to(device)
#pixel_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, -1, 1, 1).to(device)
#pixel_std = torch.tensor([0.229, 0.224, 0.225]).view(1, -1, 1, 1).to(device)
#input.sub_(pixel_mean).div_(pixel_std)
#out = net(input)
#print(out)

# .wts layout: first line is the number of tensors; then one line per tensor with
# "<name> <element count>" followed by each value as the hex of its big-endian float32 bytes.
state_dict = net.state_dict()
with open(wts_path, 'w') as f:
    f.write("{}\n".format(len(state_dict)))
    for k, v in state_dict.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write("{} {}".format(k, len(vr)))
        for vv in vr:
            f.write(" ")
            f.write(struct.pack(">f", float(vv)).hex())
        f.write("\n")


================================================
FILE: ibnnet/holder.h
================================================
#pragma once

// Move-only owning wrapper around a raw pointer to an object that is released by calling
// its destroy() member. The held object is destroyed when the holder is destroyed or is
// overwritten by move assignment. Accessors are const so a const holder can be inspected;
// as with a plain pointer, constness of the holder does not extend to the held object.
template <typename T>
class TensorRTHolder {
    T* holder;
public:
    // Takes ownership of holder_ (may be nullptr).
    explicit TensorRTHolder(T* holder_) : holder(holder_) {}
    ~TensorRTHolder() {
        if (holder)
            holder->destroy();
    }
    TensorRTHolder(const TensorRTHolder&) = delete;
    TensorRTHolder& operator=(const TensorRTHolder&) = delete;
    // Steals the pointer from rhs and leaves rhs empty.
    TensorRTHolder(TensorRTHolder&& rhs) noexcept : holder(rhs.holder) {
        rhs.holder = nullptr;
    }
    // Destroys the currently held object (if any), then steals the pointer from rhs.
    // Self-assignment is a no-op.
    TensorRTHolder& operator=(TensorRTHolder&& rhs) noexcept {
        if (this == &rhs) {
            return *this;
        }
        if (holder) holder->destroy();
        holder = rhs.holder;
        rhs.holder = nullptr;
        return *this;
    }
    T* operator->() const noexcept {
        return holder;
    }
    // Returns the held pointer without giving up ownership.
    T* get() const noexcept { return holder; }
    // True when a non-null pointer is held.
    explicit operator bool() const noexcept { return holder != nullptr; }
    // Precondition: the holder is not empty.
    T& operator*() const noexcept { return *holder; }
};

// Wraps a raw pointer in an owning TensorRTHolder, deducing T from the argument.
template <typename T>
TensorRTHolder<T> make_holder(T* holder) {
    TensorRTHolder<T> owned(holder);
    return owned;
}

// Alias for a plain raw pointer T*: unlike TensorRTHolder, nothing is destroyed automatically.
template <typename T>
using TensorRTNonHolder = T*;

================================================
FILE: ibnnet/ibnnet.cpp
================================================
#include "ibnnet.h"

//#define USE_FP16

namespace trt {

    // Copies the engine configuration and translates the IBN variant into the
    // string tag used while building the network: "a", "b" or "" (plain resnet50).
    IBNNet::IBNNet(trt::EngineConfig &enginecfg, const IBN ibn) : _engineCfg(enginecfg) {
        if (ibn == IBN::A) {
            _ibn = "a";
        } else if (ibn == IBN::B) {
            _ibn = "b";
        } else {
            // IBN::NONE and any unexpected value select the plain network
            _ibn = "";
        }
    }

    // Assemble resnet50 / resnet50-ibna / resnet50-ibnb layer by layer with the
    // TensorRT network API (no parser) and build an engine from it.
    //
    // The variant is chosen by _ibn ("" / "a" / "b"); weights are loaded from
    // "../resnet50.wts" or "../resnet50-ibn<variant>.wts". Every buffer held in the
    // weight map is released with free() after the build.
    // Returns the pointer produced by buildEngineWithConfig().
    ICudaEngine *IBNNet::createEngine(IBuilder* builder, IBuilderConfig* config) {
        assert(_ibn == "a" or _ibn == "b" or _ibn == "");
        INetworkDefinition* network = builder->createNetworkV2(0U);

        // Input tensor of shape { 3, input_h, input_w } named after _engineCfg.input_name
        ITensor* data = network->addInput(_engineCfg.input_name, _dt, Dims3{3, _engineCfg.input_h, _engineCfg.input_w});
        assert(data);

        std::string path = "../resnet50";
        if (_ibn != "") {
            path += "-ibn" + _ibn;
        }
        path += ".wts";

        std::map<std::string, Weights> weightMap = loadWeights(path);
        Weights emptywts{DataType::kFLOAT, nullptr, 0};

        // Per-variant IBN tag for each of the 16 bottlenecks, in network order
        // (layer1.0 ... layer4.2).
        std::map<std::string, std::vector<std::string>> ibn_layers{
            { "a", {"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "", "", ""}},
            { "b", {"", "", "b", "", "", "","b", "", "", "", "", "", "", "", "", "",}},
            { "", {16, ""}}};
        const std::vector<std::string>& blockTags = ibn_layers[_ibn];

        // Per-channel normalization constants (rgb order). MeanStd stores these
        // pointers in constant-layer weights, so they are kept at function scope.
        const float rgbMean[3] = {0.485, 0.456, 0.406};
        const float rgbStd[3] = {0.229, 0.224, 0.225};
        ITensor* pre_input = MeanStd(network, weightMap, data, "", rgbMean, rgbStd, false);

        // Stem: 7x7 stride-2 convolution, normalization, ReLU
        IConvolutionLayer* conv1 = network->addConvolutionNd(*pre_input, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts);
        assert(conv1);
        conv1->setStrideNd(DimsHW{2, 2});
        conv1->setPaddingNd(DimsHW{3, 3});

        // The "b" variant uses instance norm in the stem, the others batch norm.
        IScaleLayer* norm1 = (_ibn == "b")
            ? addInstanceNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5)
            : addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5);
        IActivationLayer* relu1 = network->addActivation(*norm1->getOutput(0), ActivationType::kRELU);
        assert(relu1);

        // Max pooling with a 3x3 kernel, stride 2x2 and padding 1x1.
        IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
        assert(pool1);
        pool1->setStrideNd(DimsHW{2, 2});
        pool1->setPaddingNd(DimsHW{1, 1});

        // Residual stages: input channels, bottleneck width, stride, weight-name prefix.
        struct BlockSpec {
            int inch;
            int outch;
            int stride;
            const char* lname;
        };
        const BlockSpec blocks[] = {
            {64, 64, 1, "layer1.0."},
            {256, 64, 1, "layer1.1."},
            {256, 64, 1, "layer1.2."},

            {256, 128, 2, "layer2.0."},
            {512, 128, 1, "layer2.1."},
            {512, 128, 1, "layer2.2."},
            {512, 128, 1, "layer2.3."},

            {512, 256, 2, "layer3.0."},
            {1024, 256, 1, "layer3.1."},
            {1024, 256, 1, "layer3.2."},
            {1024, 256, 1, "layer3.3."},
            {1024, 256, 1, "layer3.4."},
            {1024, 256, 1, "layer3.5."},

            {1024, 512, 2, "layer4.0."},
            {2048, 512, 1, "layer4.1."},
            {2048, 512, 1, "layer4.2."},
        };
        const std::size_t numBlocks = sizeof(blocks) / sizeof(blocks[0]);

        ITensor* feat = pool1->getOutput(0);
        for (std::size_t i = 0; i < numBlocks; ++i) {
            IActivationLayer* x = bottleneck_ibn(network, weightMap, *feat, blocks[i].inch, blocks[i].outch, blocks[i].stride, blocks[i].lname, blockTags[i]);
            feat = x->getOutput(0);
        }

        // Head: 7x7 average pooling followed by a 1000-way fully connected layer
        IPoolingLayer* pool2 = network->addPoolingNd(*feat, PoolingType::kAVERAGE, DimsHW{7, 7});
        assert(pool2);
        pool2->setStrideNd(DimsHW{1, 1});

        IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
        assert(fc1);

        ITensor* logits = fc1->getOutput(0);
        logits->setName(_engineCfg.output_name);
        std::cout << "set name out" << std::endl;
        network->markOutput(*logits);

        // Build engine
        builder->setMaxBatchSize(_engineCfg.max_batch_size);
        config->setMaxWorkspaceSize(1 << 20);

    #ifdef USE_FP16
        config->setFlag(BuilderFlag::kFP16);
    #endif
        ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
        std::cout << "build out" << std::endl;

        // The network definition is not needed once the engine exists
        network->destroy();

        // Release the host-side weight buffers
        for (auto it = weightMap.begin(); it != weightMap.end(); ++it) {
            free((void*) (it->second.values));
        }

        return engine;
    }

    // Build the engine and write its serialized plan to "./ibnnet.engine".
    //
    // Returns true when the plan was written, false when any step (builder or
    // config creation, engine build, serialization, opening or writing the file)
    // fails. All TensorRT objects, including the engine itself, are owned by
    // holders so they are destroyed on every exit path; holders are destroyed
    // in reverse declaration order (plan, engine, config, builder).
    bool IBNNet::serializeEngine() {
        auto builder = make_holder(createInferBuilder(gLogger));
        if (!builder) {
            std::cerr << "could not create builder" << std::endl;
            return false;
        }
        auto config = make_holder(builder->createBuilderConfig());
        if (!config) {
            std::cerr << "could not create builder config" << std::endl;
            return false;
        }

        // Populate the network, set the outputs and build the engine
        auto engine = make_holder(createEngine(builder.get(), config.get()));
        if (!engine) {
            std::cerr << "could not build engine" << std::endl;
            return false;
        }

        // Serialize the engine
        auto modelStream = make_holder(engine->serialize());
        if (!modelStream) {
            std::cerr << "could not serialize engine" << std::endl;
            return false;
        }

        std::ofstream p("./ibnnet.engine", std::ios::binary | std::ios::out);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return false;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        if (!p) {
            // e.g. disk full: do not report a truncated plan as success
            std::cerr << "could not write plan output file" << std::endl;
            return false;
        }

        return true;
    }

    // Load the serialized plan from "./ibnnet.engine" into the engine config and
    // create the inference engine from it.
    //
    // Returns false, leaving _engineCfg and _inferEngine untouched, when the file
    // cannot be opened, is empty, its size cannot be determined, or it cannot be
    // read completely. Returns true once the InferenceEngine has been created.
    bool IBNNet::deserializeEngine() {
        std::ifstream file("./ibnnet.engine", std::ios::binary | std::ios::in);
        if (!file.good()) {
            return false;
        }

        file.seekg(0, file.end);
        const std::streamoff size = file.tellg();  // -1 when the position is unavailable
        file.seekg(0, file.beg);
        if (size <= 0) {
            return false;
        }

        // Read into a local buffer first so a short read does not leave a
        // half-filled stream in the configuration.
        std::shared_ptr<char> buffer(new char[size], [](char* ptr) { delete [] ptr; });
        if (!file.read(buffer.get(), size)) {
            return false;
        }
        file.close();

        _engineCfg.stream_size = size;
        _engineCfg.trtModelStream = buffer;

        _inferEngine = make_unique<trt::InferenceEngine>(_engineCfg);
        return true;
    }

    // Convert one 3-channel 8-bit image into three consecutive float planes of
    // `stride` elements each, scaling every value by 1/255.
    // Channel 2 of each pixel goes to the first plane, channel 1 to the second
    // and channel 0 to the third (NOTE(review): i.e. BGR -> RGB if `img` uses
    // OpenCV's usual BGR order - confirm against the caller).
    void IBNNet::preprocessing(const cv::Mat& img, float* const data, const std::size_t stride) {
        float* const plane0 = data;
        float* const plane1 = data + stride;
        float* const plane2 = data + 2 * stride;
        for (std::size_t px = 0; px != stride; ++px) {
            const cv::Vec3b& pixel = img.at<cv::Vec3b>(px);
            plane0[px] = pixel[2] / 255.0;
            plane1[px] = pixel[1] / 255.0;
            plane2[px] = pixel[0] / 255.0;
        }
    }

    // Run inference on a batch of images.
    // Returns false when no engine has been deserialized yet; otherwise returns
    // the result of InferenceEngine::doInference(). The callback handed to
    // doInference() fills the input buffer image by image, each image taking
    // 3 * input_w * input_h floats.
    bool IBNNet::inference(std::vector<cv::Mat> &input) {
        if (_inferEngine == nullptr) {
            return false;
        }

        const std::size_t stride = _engineCfg.input_w * _engineCfg.input_h;
        auto fillBatch = [&](float* data) {
            for (const auto &img : input) {
                preprocessing(img, data, stride);
                data += 3 * stride;
            }
        };
        return _inferEngine->doInference(input.size(), fillBatch);
    }

    // Returns the inference engine's output buffer, or nullptr when no engine
    // has been created yet.
    float* IBNNet::getOutput() {
        if (_inferEngine == nullptr) {
            return nullptr;
        }
        return _inferEngine->getOutput();
    }

    // Returns the device id stored in the engine configuration.
    int IBNNet::getDeviceID() { 
        return _engineCfg.device_id; 
    }

}

================================================
FILE: ibnnet/ibnnet.h
================================================
#pragma once

#include "utils.h"
#include "holder.h"
#include "layers.h"
#include "InferenceEngine.h"
#include <memory>
#include <vector>
#include <chrono>
#include <opencv2/opencv.hpp>
extern Logger gLogger;
using namespace trtxapi;

namespace trt {

    // Network variant to build; selects both the layer layout and the weight file.
    enum IBN {
        A, // resnet50-ibna,
        B, // resnet50-ibnb,
        NONE // resnet50
    };

    // ResNet50 / ResNet50-IBN classifier on top of TensorRT: builds and
    // serializes the engine, loads it back and runs batched inference.
    class IBNNet {
    public:
        // Copies `enginecfg` and records which variant `ibn` selects.
        IBNNet(trt::EngineConfig &enginecfg, const IBN ibn);
        ~IBNNet() {};

        // Builds the engine and writes the plan to "./ibnnet.engine".
        bool serializeEngine(); /* create & serializeEngine */ 
        // Reads "./ibnnet.engine" and creates the inference engine; false if the file cannot be opened.
        bool deserializeEngine();
        // Preprocesses every image in `input` and runs one inference call; false if no engine is loaded.
        bool inference(std::vector<cv::Mat> &input); /* support batch inference */

        // Output buffer of the inference engine, or nullptr if no engine is loaded.
        float* getOutput(); 
        int getDeviceID(); /* cuda deviceid */ 

    private:
        // Defines the network through the TensorRT API and builds the engine.
        ICudaEngine *createEngine(IBuilder *builder, IBuilderConfig *config);
        // Writes one image into `data` as three float planes of `stride` elements, scaled by 1/255.
        void preprocessing(const cv::Mat& img, float* const data, const std::size_t stride);

    private:
        trt::EngineConfig _engineCfg;                                 // copy of the caller's configuration
        std::unique_ptr<trt::InferenceEngine> _inferEngine{nullptr};  // set by deserializeEngine()
        std::string _ibn;                                             // "a", "b" or "" (plain resnet50)
        DataType _dt{DataType::kFLOAT};                               // data type of the network input
    };

}

================================================
FILE: ibnnet/layers.cpp
================================================
#include "layers.h"

namespace trtxapi {

    // Adds input normalization layers: optional division by 255, subtraction of
    // `mean` and, when `std` is not null, division by `std`.
    //
    // `mean` and `std` each point at 3 floats that are referenced (not copied
    // here) by the constant-layer weights. The 255 divisor is malloc'ed and
    // registered in weightMap under lname + ".div".
    // Returns the output tensor of the last layer added.
    ITensor* MeanStd(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor* input, const std::string lname, const float* mean, const float* std, const bool div255) {
        const Dims3 perChannel{ 3, 1, 1 };
        ITensor* current = input;

        if (div255) {
            float* divisor = reinterpret_cast<float*>(malloc(sizeof(float) * 3));
            for (int c = 0; c < 3; ++c) {
                divisor[c] = 255.0f;
            }
            Weights divWeights{ DataType::kFLOAT, divisor, 3 };
            weightMap[lname + ".div"] = divWeights;
            IConstantLayer* divConst = network->addConstant(perChannel, divWeights);
            current = network->addElementWise(*current, *divConst->getOutput(0), ElementWiseOperation::kDIV)->getOutput(0);
        }

        Weights meanWeights{ DataType::kFLOAT, mean, 3 };
        IConstantLayer* meanConst = network->addConstant(perChannel, meanWeights);
        IElementWiseLayer* centered = network->addElementWise(*current, *meanConst->getOutput(0), ElementWiseOperation::kSUB);

        if (std == nullptr) {
            return centered->getOutput(0);
        }

        Weights stdWeights{ DataType::kFLOAT, std, 3 };
        IConstantLayer* stdConst = network->addConstant(perChannel, stdWeights);
        IElementWiseLayer* scaled = network->addElementWise(*centered->getOutput(0), *stdConst->getOutput(0), ElementWiseOperation::kDIV);
        return scaled->getOutput(0);
    }

    // Folds an inference-time batch norm into a per-channel scale layer:
    //   scale = weight / sqrt(running_var + eps)
    //   shift = bias - running_mean * weight / sqrt(running_var + eps)
    //   power = 1
    // The parameters are read from weightMap under lname + ".weight", ".bias",
    // ".running_mean" and ".running_var". The three malloc'ed result buffers are
    // registered under lname + ".scale", ".shift" and ".power".
    IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, const std::string lname, const float eps) {
        const float* gamma = (const float*)weightMap[lname + ".weight"].values;
        const float* beta = (const float*)weightMap[lname + ".bias"].values;
        const float* mean = (const float*)weightMap[lname + ".running_mean"].values;
        const float* var = (const float*)weightMap[lname + ".running_var"].values;
        int len = weightMap[lname + ".running_var"].count;

        float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
        float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
        float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
        for (int ch = 0; ch < len; ++ch) {
            // `auto` keeps whatever type sqrt() yields so the rounding matches
            // evaluating the expression inline.
            const auto denom = sqrt(var[ch] + eps);
            scval[ch] = gamma[ch] / denom;
            shval[ch] = beta[ch] - mean[ch] * gamma[ch] / denom;
            pval[ch] = 1.0;
        }

        Weights wscale{DataType::kFLOAT, scval, len};
        Weights wshift{DataType::kFLOAT, shval, len};
        Weights wpower{DataType::kFLOAT, pval, len};

        weightMap[lname + ".scale"] = wscale;
        weightMap[lname + ".shift"] = wshift;
        weightMap[lname + ".power"] = wpower;

        IScaleLayer* folded = network->addScale(input, ScaleMode::kCHANNEL, wshift, wscale, wpower);
        assert(folded);
        return folded;
    }

    // Instance normalization assembled from primitive layers:
    //   y = weight * (x - mean) / sqrt(var + eps) + bias
    // mean and var are averages over the axes selected by reduce mask 6
    // (bits 1 and 2), keeping dimensions so they broadcast against the input.
    // The affine parameters come from weightMap[lname + ".weight"] and
    // weightMap[lname + ".bias"].
    //
    // The eps-dependent constants are allocated per call and registered in
    // weightMap (lname + ".sqrt_consts"), like the other generated buffers. They
    // used to live in a function-local static, which is initialized only on the
    // first call, so a later call with a different eps silently reused the first
    // value.
    IScaleLayer* addInstanceNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, const std::string lname, const float eps) {

        int len = weightMap[lname + ".weight"].count;

        // mean
        IReduceLayer* reduce1 = network->addReduce(input,
            ReduceOperation::kAVG,
            6,
            true);
        assert(reduce1);

        // x - mean
        IElementWiseLayer* ew1 = network->addElementWise(input,
            *reduce1->getOutput(0),
            ElementWiseOperation::kSUB);
        assert(ew1);

        // (x - mean)^2 as a uniform scale: shift 0, scale 1, power 2.
        // These constants do not depend on the arguments, so static storage is safe.
        const static float pval1[3]{0.0, 1.0, 2.0};
        Weights wshift1{DataType::kFLOAT, pval1, 1};
        Weights wscale1{DataType::kFLOAT, pval1+1, 1};
        Weights wpower1{DataType::kFLOAT, pval1+2, 1};

        IScaleLayer* scale1 = network->addScale(
            *ew1->getOutput(0),
            ScaleMode::kUNIFORM,
            wshift1,
            wscale1,
            wpower1);
        assert(scale1);

        // var = mean((x - mean)^2)
        IReduceLayer* reduce2 = network->addReduce(
            *scale1->getOutput(0),
            ReduceOperation::kAVG,
            6,
            true);
        assert(reduce2);

        // sqrt(var + eps) as a uniform scale: shift eps, scale 1, power 0.5.
        float* pval2 = reinterpret_cast<float*>(malloc(sizeof(float) * 3));
        pval2[0] = eps;
        pval2[1] = 1.0f;
        pval2[2] = 0.5f;
        weightMap[lname + ".sqrt_consts"] = Weights{DataType::kFLOAT, pval2, 3};
        Weights wshift2{DataType::kFLOAT, pval2, 1};
        Weights wscale2{DataType::kFLOAT, pval2+1, 1};
        Weights wpower2{DataType::kFLOAT, pval2+2, 1};

        IScaleLayer* scale2 = network->addScale(
            *reduce2->getOutput(0),
            ScaleMode::kUNIFORM,
            wshift2,
            wscale2,
            wpower2);
        assert(scale2);

        // (x - mean) / sqrt(var + eps)
        IElementWiseLayer* ew2 = network->addElementWise(*ew1->getOutput(0),
            *scale2->getOutput(0),
            ElementWiseOperation::kDIV);
        assert(ew2);

        // Per-channel affine transform: power 1, scale = weight, shift = bias.
        float* pval3 = reinterpret_cast<float*>(malloc(sizeof(float) * len));
        std::fill_n(pval3, len, 1.0);
        Weights wpower3{DataType::kFLOAT, pval3, len};
        weightMap[lname + ".power3"] = wpower3;

        IScaleLayer* scale3 = network->addScale(
            *ew2->getOutput(0),
            ScaleMode::kCHANNEL,
            weightMap[lname + ".bias"],
            weightMap[lname + ".weight"],
            wpower3);
        assert(scale3);
        return scale3;
    }

    // IBN block: slices the input in two along dimension 0 (each slice takes
    // d[0]/2 entries), applies instance norm (weights under lname + "IN") to the
    // first slice and batch norm (weights under lname + "BN") to the second,
    // then concatenates the two results.
    IConcatenationLayer* addIBN(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, const std::string lname) {
        const Dims inDims = input.getDimensions();
        const auto half = inDims.d[0] / 2;
        const Dims3 sliceShape{half, inDims.d[1], inDims.d[2]};
        const Dims3 unitStep{1, 1, 1};

        ISliceLayer* lower = network->addSlice(input, Dims3{0, 0, 0}, sliceShape, unitStep);
        assert(lower);

        ISliceLayer* upper = network->addSlice(input, Dims3{half, 0, 0}, sliceShape, unitStep);
        assert(upper);

        IScaleLayer* instNorm = addInstanceNorm2d(network, weightMap, *lower->getOutput(0), lname + "IN", 1e-5);
        IScaleLayer* batchNorm = addBatchNorm2d(network, weightMap, *upper->getOutput(0), lname + "BN", 1e-5);

        ITensor* parts[] = {instNorm->getOutput(0), batchNorm->getOutput(0)};
        IConcatenationLayer* joined = network->addConcatenation(parts, 2);
        assert(joined);
        return joined;
    }

    // ResNet bottleneck (1x1 -> 3x3 -> 1x1 convolutions plus shortcut) with
    // optional IBN handling:
    //   ibn == "a": the first normalization is an IBN block instead of batch norm
    //   ibn == "b": instance norm is applied to the residual sum before the last ReLU
    //   otherwise : plain bottleneck
    // A projection shortcut (1x1 convolution + batch norm) is used when the
    // stride is not 1 or inch differs from outch * 4. Weight names are built from
    // the `lname` prefix. Returns the final ReLU layer.
    IActivationLayer* bottleneck_ibn(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, const int inch, const int outch, const int stride, const std::string lname, const std::string ibn) {
        Weights noBias{DataType::kFLOAT, nullptr, 0};
        const int expanded = outch * 4;

        // 1x1 convolution + first normalization + ReLU
        IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], noBias);
        assert(conv1);

        ITensor* normed1 = nullptr;
        if (ibn == "a") {
            normed1 = addIBN(network, weightMap, *conv1->getOutput(0), lname + "bn1.")->getOutput(0);
        } else {
            normed1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5)->getOutput(0);
        }
        IActivationLayer* relu1 = network->addActivation(*normed1, ActivationType::kRELU);
        assert(relu1);

        // 3x3 convolution (carries the stride) + batch norm + ReLU
        IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], noBias);
        assert(conv2);
        conv2->setStrideNd(DimsHW{stride, stride});
        conv2->setPaddingNd(DimsHW{1, 1});

        IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);

        IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
        assert(relu2);

        // 1x1 expansion convolution + batch norm
        IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), expanded, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], noBias);
        assert(conv3);

        IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5);

        // Shortcut: identity, or a projection when the shape changes
        ITensor* shortcut = &input;
        const bool needsProjection = (stride != 1) || (inch != expanded);
        if (needsProjection) {
            IConvolutionLayer* conv4 = network->addConvolutionNd(input, expanded, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], noBias);
            assert(conv4);
            conv4->setStrideNd(DimsHW{stride, stride});

            IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5);
            shortcut = bn4->getOutput(0);
        }
        IElementWiseLayer* residual = network->addElementWise(*shortcut, *bn3->getOutput(0), ElementWiseOperation::kSUM);

        // Final activation, preceded by instance norm for the "b" variant
        ITensor* preAct = residual->getOutput(0);
        if (ibn == "b") {
            IScaleLayer* in1 = addInstanceNorm2d(network, weightMap, *preAct, lname + "IN", 1e-5);
            preAct = in1->getOutput(0);
        }
        IActivationLayer* relu3 = network->addActivation(*preAct, ActivationType::kRELU);

        assert(relu3);
        return relu3;
    }

}

================================================
FILE: ibnnet/layers.h
================================================
#pragma once

#include <map>
#include <math.h>
#include <assert.h>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
using namespace nvinfer1;

// Helpers that add groups of TensorRT layers to a network under construction.
// Buffers they allocate are registered in `weightMap`.
namespace trtxapi {

    // Input normalization: optional division by 255, subtraction of `mean` and,
    // when `std` is not null, division by `std` (3 values each).
    // Returns the output tensor of the last layer added.
    ITensor* MeanStd(INetworkDefinition *network, 
        std::map<std::string, Weights>& weightMap, 
        ITensor* input, 
        const std::string lname,
        const float* mean, 
        const float* std, 
        const bool div255);

    // Batch norm folded into a per-channel scale layer, using the
    // lname + ".weight" / ".bias" / ".running_mean" / ".running_var" weights.
    IScaleLayer* addBatchNorm2d(INetworkDefinition *network, 
        std::map<std::string, Weights>& weightMap, 
        ITensor& input, 
        const std::string lname, 
        const float eps);

    // Instance norm built from reduce, element-wise and scale layers, using the
    // lname + ".weight" / ".bias" weights. Returns the final scale layer.
    IScaleLayer* addInstanceNorm2d(INetworkDefinition *network, 
        std::map<std::string, Weights>& weightMap, 
        ITensor& input, 
        const std::string lname, 
        const float eps);

    // IBN block: instance norm on the first half of dimension 0, batch norm on
    // the second half, concatenated.
    IConcatenationLayer* addIBN(INetworkDefinition *network, 
        std::map<std::string, Weights>& weightMap, 
        ITensor& input, 
        const std::string lname);

    // Bottleneck residual block; `ibn` is "a", "b" or anything else for the
    // plain block. Returns the final ReLU layer.
    IActivationLayer* bottleneck_ibn(INetworkDefinition *network, 
        std::map<std::string, Weights>& weightMap, 
        ITensor& input, 
        const int inch, 
        const int outch,
        const int stride, 
        const std::string lname, 
        const std::string ibn);

}

================================================
FILE: ibnnet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
//! It only owns the buffer: deriving from this class first guarantees that mBuffer is fully constructed
//! before its address is handed to the std::ostream base of LogStreamConsumer.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all arguments to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged:
    //!  a message is logged when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a fresh consumer with the same severity and logging flag as \p other.
    //!  The buffer is constructed anew from those values; text buffered in \p other is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity,
    //!  and propagates the result to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! \brief Selects the target stream: std::cout when severity >= kINFO, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Returns the bracketed one-letter prefix for the given severity.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger with the given reportable severity (kWARNING by default)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] " followed by the message through a temporary LogStreamConsumer, which decides
    //! from the reportable severity whether anything is emitted.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the currently configured reportable severity
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits "&&&& <RESULT> <test name> # <command line>" on the stream used for kINFO messages.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

// Helpers with internal linkage: each returns a temporary LogStreamConsumer for one severity,
// configured with the logger's current reportable severity.
namespace
{

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: ibnnet/main.cpp
================================================
#include <thread>
#include <vector>
#include <memory>
#include "ibnnet.h"
#include "InferenceEngine.h"

// stuff we know about the network and the input/output blobs
// All of these values are handed to trt::EngineConfig in main(); INPUT_H/INPUT_W also size the
// dummy image built in run_infer() and OUTPUT_SIZE bounds its print loop.
static const int MAX_BATCH_SIZE = 4;
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;
static const int DEVICE_ID = 0;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// Declared here only; the definition lives in another translation unit.
extern Logger gLogger;

// Worker routine: binds the calling thread to the model's CUDA device, deserializes the engine,
// runs one inference on a dummy all-white image and prints the resulting scores.
//
// model: shared handle to the network wrapper; each worker thread is given its own instance.
//
// Returns early (after printing a message) when the engine cannot be deserialized.
void run_infer(std::shared_ptr<trt::IBNNet> model) {

    CHECK(cudaSetDevice(model->getDeviceID()));

    if(!model->deserializeEngine()) {
        std::cout << "DeserializeEngine Failed." << std::endl;
        return;
    }

    /* support batch input data */
    std::vector<cv::Mat> input;
    input.emplace_back( cv::Mat(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(255,255,255)) ) ;

    /* run inference */
    model->inference(input);

    /* fetch the output buffer owned by the model */
    float* prob = model->getOutput();

    /* print output */
    std::cout << "\nOutput from thread_id: " << std::this_thread::get_id() << std::endl;
    if( prob != nullptr ) {
        for (size_t batch_idx = 0; batch_idx < input.size(); ++batch_idx) {
            // Each image owns OUTPUT_SIZE consecutive scores, so the row for image k starts at
            // k * OUTPUT_SIZE (the previous `batch_idx + p` made neighbouring rows overlap).
            // NOTE(review): assumes getOutput() returns a contiguous, batch-major buffer - confirm.
            const float* scores = prob + batch_idx * OUTPUT_SIZE;
            for (int p = 0; p < OUTPUT_SIZE; ++p) {
                std::cout<< scores[p] << " ";
                if ((p+1) % 10 == 0) {
                    std::cout << std::endl;
                }
            }
        }
    }
}

int main(int argc, char** argv) {

    trt::EngineConfig engineCfg { 
        INPUT_BLOB_NAME,
        OUTPUT_BLOB_NAME,
        nullptr,
        MAX_BATCH_SIZE,
        INPUT_H,
        INPUT_W,
        OUTPUT_SIZE,
        0,
        DEVICE_ID};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        std::cout << "Serializling Engine" << std::endl;
        trt::IBNNet ibnnet{engineCfg, trt::IBN::A}; 
        ibnnet.serializeEngine();
        return 0;
    } else if (argc == 2 && std::string(argv[1]) == "-d") {

        /* 
         * Support multi thread inference (mthreads>1)
         * Each thread holds their own CudaEngine
         * They can run on different cuda device through trt::EngineConfig setting
        */
        int mthreads = 1; 
        std::vector<std::thread> workers;
        std::vector<std::shared_ptr<trt::IBNNet>> models;

        for(int i = 0; i < mthreads; ++i) {
            models.emplace_back( std::make_shared<trt::IBNNet>(engineCfg, trt::IBN::A) ); // For IBNB: trt::IBN::B
        }

        for(int i = 0; i < mthreads; ++i) {
            workers.emplace_back( std::thread(run_infer, models[i]) );
        }

        for(auto & worker : workers) {
            worker.join();
        } 

        return 0;
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./ibnnet -s  // serialize model to plan file" << std::endl;
        std::cerr << "./ibnnet -d  // deserialize plan file and run inference" << std::endl;
        return -1;
    }
}


================================================
FILE: ibnnet/utils.cpp
================================================
#include "utils.h"

// Load weights from a plain-text .wts file.
//
// Layout as read by this function: a decimal blob count, followed by one record per blob of
// the form
//     <name> <size (decimal)> <size 32-bit words in hex>
//
// file: path of the weight file.
//
// Returns a map from blob name to a Weights descriptor of type kFLOAT. Each `values` pointer
// refers to memory obtained from malloc(); this function never frees it.
//
// An unopenable file or a non-positive blob count trips an assert in debug builds; with
// asserts disabled the function returns an empty (or partially filled) map instead.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs; start from 0 so a failed read cannot leave garbage behind
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    for (int32_t blob = 0; blob < count; ++blob) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob: one 32-bit word per element (sizeof(val) would be the pointer size)
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}


================================================
FILE: ibnnet/utils.h
================================================
#pragma once

#include <map>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "assert.h"
#include <fstream>
#include <iostream>
#include <memory>

using namespace nvinfer1;

// Evaluates `status` exactly once; a non-zero result is reported and the process aborts.
// The message ends with std::endl so that it is flushed before abort(), which does not
// flush iostream buffers itself.
#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
        {                                                      \
            std::cout << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)

// Heap-allocates a T from the perfectly forwarded arguments and returns it wrapped in a
// std::unique_ptr<T>.
template <typename T, typename... Args>
std::unique_ptr<T> make_unique(Args&&... args) {
    T* raw = new T(std::forward<Args>(args)...);
    return std::unique_ptr<T>(raw);
}

std::map<std::string, Weights> loadWeights(const std::string file);


================================================
FILE: inception/inceptionv3/CMakeLists.txt
================================================
# Build script for the Inception v3 sample executable.
# NOTE(review): 2.6 is a very old minimum version; recent CMake releases may warn about or
# reject it - confirm against the CMake version in use.
cmake_minimum_required(VERSION 2.6)

project(inception)

# Passes -std=c++11 as a raw compiler flag; CMAKE_CXX_STANDARD below requests the same standard.
add_definitions(-std=c++11)

# NOTE(review): option() takes (variable, help text, initial value); here "OFF" sits in the
# help-text position, so the variable simply receives option()'s default value.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is forced to Debug regardless of what the user passes on the command line.
set(CMAKE_BUILD_TYPE Debug)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# Single-source executable linked against the TensorRT (nvinfer) and CUDA runtime libraries.
add_executable(inception ${PROJECT_SOURCE_DIR}/inception_v3.cpp)
target_link_libraries(inception nvinfer)
target_link_libraries(inception cudart)

# NOTE(review): -O2 and -pthread are compiler/linker options rather than preprocessor
# definitions, and -O2 is combined with the Debug build type set above - confirm this is intended.
add_definitions(-O2 -pthread)


================================================
FILE: inception/inceptionv3/README.md
================================================
# Inception v3

Inception v3 model architecture from "Rethinking the Inception Architecture for Computer Vision" <http://arxiv.org/abs/1512.00567>.

For the details, you can refer to [pytorchx/inception](https://github.com/wang-xinyu/pytorchx/tree/master/inception)

The following tricks are used in this implementation:

- For pooling layers with padding, pay attention to whether the padded elements are included or excluded when the average is computed. PyTorch includes padding in `avgPool` by default, but TensorRT does not, so for pooling layers with padding we need to call `setAverageCountExcludesPadding(false)` in TensorRT.
- The BatchNorm layer is implemented with a scale layer.

```
// 1. generate inception.wts from [pytorchx/inception](https://github.com/wang-xinyu/pytorchx/tree/master/inception)

// 2. put inception.wts into tensorrtx/inception/inceptionv3

// 3. build and run

cd tensorrtx/inception/inceptionv3

mkdir build

cd build

cmake ..

make

sudo ./inception -s   // serialize model to plan file i.e. 'inception.engine'

sudo ./inception -d   // deserialize plan file and run inference

// 4. see if the output is same as pytorchx/inception
```


================================================
FILE: inception/inceptionv3/inception_v3.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>

// Evaluates `status` exactly once; on a non-zero result prints it to std::cerr and aborts.
#define CHECK(status)                                          \
    do {                                                       \
        auto ret = (status);                                   \
        if (ret != 0) {                                        \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)

// stuff we know about the network and the input/output blobs
// INPUT_H x INPUT_W with 3 channels is the shape given to the network input in createEngine();
// OUTPUT_SIZE is the number of floats copied back per image in doInference().
static const int INPUT_H = 299;
static const int INPUT_W = 299;
static const int OUTPUT_SIZE = 1000;

// Tensor names used both when building the network and when looking up binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance passed to createInferBuilder() and createInferRuntime().
static Logger gLogger;

// Load weights from a plain-text .wts file.
//
// Layout as read by this function: a decimal blob count, followed by one record per blob of
// the form
//     <name> <size (decimal)> <size 32-bit words in hex>
//
// file: path of the weight file.
//
// Returns a map from blob name to a Weights descriptor of type kFLOAT. Each `values` pointer
// refers to memory obtained from malloc(); createEngine() releases it with free().
//
// An unopenable file or a non-positive blob count trips an assert in debug builds; with
// asserts disabled the function returns an empty (or partially filled) map instead.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs; start from 0 so a failed read cannot leave garbage behind
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    for (int32_t blob = 0; blob < count; ++blob)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob: one 32-bit word per element (sizeof(val) would be the pointer size)
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds an inference-time batch norm as a per-channel scale layer:
//   scale = gamma / sqrt(var + eps), shift = beta - mean * scale, power = 1.
// The three derived arrays are malloc()ed and registered in weightMap under
// "<lname>.scale", "<lname>.shift" and "<lname>.power".
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    const int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    float* scval = static_cast<float*>(malloc(sizeof(float) * len));
    float* shval = static_cast<float*>(malloc(sizeof(float) * len));
    float* pval = static_cast<float*>(malloc(sizeof(float) * len));

    // One pass fills all three per-channel arrays.
    for (int c = 0; c < len; ++c) {
        scval[c] = gamma[c] / sqrt(var[c] + eps);
        shval[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        pval[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Convolution without bias -> batch norm (eps 1e-3) -> ReLU.
// Weights are looked up as "<lname>conv.weight" and "<lname>bn.*"; `s` is used as the stride
// in both dimensions and `p` as the padding.
IActivationLayer* basicConv2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, DimsHW ksize, int s, DimsHW p, std::string lname) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv = network->addConvolutionNd(
        input, outch, ksize, weightMap[lname + "conv.weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(p);

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + "bn", 1e-3);

    IActivationLayer* relu = network->addActivation(*bn->getOutput(0), ActivationType::kRELU);
    assert(relu);
    return relu;
}

// Inception block "A": four parallel branches over the same input, concatenated at the end.
// pool_proj is the output channel count of the pooling branch's 1x1 convolution.
IConcatenationLayer* inceptionA(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname,
    int pool_proj) {
    // Branch 1: single 1x1 convolution.
    IActivationLayer* branch1x1 = basicConv2d(
        network, weightMap, input, 64, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch1x1.");

    // Branch 2: 1x1 convolution followed by a padded 5x5 convolution.
    IActivationLayer* branch5x5 = basicConv2d(
        network, weightMap, input, 48, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch5x5_1.");
    branch5x5 = basicConv2d(
        network, weightMap, *branch5x5->getOutput(0), 64, DimsHW{5, 5}, 1, DimsHW{2, 2}, lname + "branch5x5_2.");

    // Branch 3: 1x1 convolution followed by two padded 3x3 convolutions.
    IActivationLayer* branch3x3dbl = basicConv2d(
        network, weightMap, input, 64, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3dbl_1.");
    branch3x3dbl = basicConv2d(
        network, weightMap, *branch3x3dbl->getOutput(0), 96, DimsHW{3, 3}, 1, DimsHW{1, 1}, lname + "branch3x3dbl_2.");
    branch3x3dbl = basicConv2d(
        network, weightMap, *branch3x3dbl->getOutput(0), 96, DimsHW{3, 3}, 1, DimsHW{1, 1}, lname + "branch3x3dbl_3.");

    // Branch 4: 3x3 average pooling (padding counted in the average) followed by a 1x1 convolution.
    IPoolingLayer* avgPool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{1, 1});
    avgPool->setPaddingNd(DimsHW{1, 1});
    avgPool->setAverageCountExcludesPadding(false);
    IActivationLayer* branchPool = basicConv2d(
        network, weightMap, *avgPool->getOutput(0), pool_proj, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch_pool.");

    ITensor* branches[] = {
        branch1x1->getOutput(0), branch5x5->getOutput(0), branch3x3dbl->getOutput(0), branchPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 4);
    assert(merged);
    return merged;
}

// Inception block "B": two stride-2 convolution branches plus a stride-2 max-pool branch,
// concatenated at the end.
IConcatenationLayer* inceptionB(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    // Branch 1: a single 3x3 convolution with stride 2.
    IActivationLayer* branch3x3 = basicConv2d(
        network, weightMap, input, 384, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + "branch3x3.");

    // Branch 2: 1x1 -> padded 3x3 -> 3x3 with stride 2.
    IActivationLayer* branch3x3dbl = basicConv2d(
        network, weightMap, input, 64, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3dbl_1.");
    branch3x3dbl = basicConv2d(
        network, weightMap, *branch3x3dbl->getOutput(0), 96, DimsHW{3, 3}, 1, DimsHW{1, 1}, lname + "branch3x3dbl_2.");
    branch3x3dbl = basicConv2d(
        network, weightMap, *branch3x3dbl->getOutput(0), 96, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + "branch3x3dbl_3.");

    // Branch 3: 3x3 max pooling with stride 2.
    IPoolingLayer* maxPool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3});
    assert(maxPool);
    maxPool->setStrideNd(DimsHW{2, 2});

    ITensor* branches[] = {branch3x3->getOutput(0), branch3x3dbl->getOutput(0), maxPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 3);
    assert(merged);
    return merged;
}

// Inception block "C": four parallel branches built from factorised 1x7 / 7x1 convolutions,
// concatenated at the end. c7 is the channel count used inside the 7x7 branches.
IConcatenationLayer* inceptionC(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname,
    int c7) {
    // Branch 1: single 1x1 convolution.
    IActivationLayer* branch1x1 = basicConv2d(
        network, weightMap, input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch1x1.");

    // Branch 2: 1x1 -> 1x7 -> 7x1.
    IActivationLayer* branch7x7 = basicConv2d(
        network, weightMap, input, c7, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch7x7_1.");
    branch7x7 = basicConv2d(
        network, weightMap, *branch7x7->getOutput(0), c7, DimsHW{1, 7}, 1, DimsHW{0, 3}, lname + "branch7x7_2.");
    branch7x7 = basicConv2d(
        network, weightMap, *branch7x7->getOutput(0), 192, DimsHW{7, 1}, 1, DimsHW{3, 0}, lname + "branch7x7_3.");

    // Branch 3: 1x1 -> 7x1 -> 1x7 -> 7x1 -> 1x7.
    IActivationLayer* branch7x7dbl = basicConv2d(
        network, weightMap, input, c7, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch7x7dbl_1.");
    branch7x7dbl = basicConv2d(
        network, weightMap, *branch7x7dbl->getOutput(0), c7, DimsHW{7, 1}, 1, DimsHW{3, 0}, lname + "branch7x7dbl_2.");
    branch7x7dbl = basicConv2d(
        network, weightMap, *branch7x7dbl->getOutput(0), c7, DimsHW{1, 7}, 1, DimsHW{0, 3}, lname + "branch7x7dbl_3.");
    branch7x7dbl = basicConv2d(
        network, weightMap, *branch7x7dbl->getOutput(0), c7, DimsHW{7, 1}, 1, DimsHW{3, 0}, lname + "branch7x7dbl_4.");
    branch7x7dbl = basicConv2d(
        network, weightMap, *branch7x7dbl->getOutput(0), 192, DimsHW{1, 7}, 1, DimsHW{0, 3}, lname + "branch7x7dbl_5.");

    // Branch 4: 3x3 average pooling (padding counted in the average) followed by a 1x1 convolution.
    IPoolingLayer* avgPool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{1, 1});
    avgPool->setPaddingNd(DimsHW{1, 1});
    avgPool->setAverageCountExcludesPadding(false);
    IActivationLayer* branchPool = basicConv2d(
        network, weightMap, *avgPool->getOutput(0), 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch_pool.");

    ITensor* branches[] = {
        branch1x1->getOutput(0), branch7x7->getOutput(0), branch7x7dbl->getOutput(0), branchPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 4);
    assert(merged);
    return merged;
}

// Inception block "D": two convolution branches ending in a stride-2 3x3 convolution plus a
// stride-2 max-pool branch, concatenated at the end.
IConcatenationLayer* inceptionD(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    // Branch 1: 1x1 -> 3x3 with stride 2.
    IActivationLayer* branch3x3 = basicConv2d(
        network, weightMap, input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3_1.");
    branch3x3 = basicConv2d(
        network, weightMap, *branch3x3->getOutput(0), 320, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + "branch3x3_2.");

    // Branch 2: 1x1 -> 1x7 -> 7x1 -> 3x3 with stride 2.
    IActivationLayer* branch7x7x3 = basicConv2d(
        network, weightMap, input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch7x7x3_1.");
    branch7x7x3 = basicConv2d(
        network, weightMap, *branch7x7x3->getOutput(0), 192, DimsHW{1, 7}, 1, DimsHW{0, 3}, lname + "branch7x7x3_2.");
    branch7x7x3 = basicConv2d(
        network, weightMap, *branch7x7x3->getOutput(0), 192, DimsHW{7, 1}, 1, DimsHW{3, 0}, lname + "branch7x7x3_3.");
    branch7x7x3 = basicConv2d(
        network, weightMap, *branch7x7x3->getOutput(0), 192, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + "branch7x7x3_4.");

    // Branch 3: 3x3 max pooling with stride 2.
    IPoolingLayer* maxPool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3});
    assert(maxPool);
    maxPool->setStrideNd(DimsHW{2, 2});

    ITensor* branches[] = {branch3x3->getOutput(0), branch7x7x3->getOutput(0), maxPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 3);
    assert(merged);
    return merged;
}

// Inception block "E": four parallel branches, two of which fan out into a 1x3 / 3x1 pair
// that is concatenated before the final concatenation of all branches.
IConcatenationLayer* inceptionE(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    // Branch 1: single 1x1 convolution.
    IActivationLayer* branch1x1 = basicConv2d(
        network, weightMap, input, 320, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch1x1.");

    // Branch 2: 1x1, then 1x3 and 3x1 side by side on the same tensor.
    IActivationLayer* branch3x3 = basicConv2d(
        network, weightMap, input, 384, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3_1.");
    IActivationLayer* branch3x3a = basicConv2d(
        network, weightMap, *branch3x3->getOutput(0), 384, DimsHW{1, 3}, 1, DimsHW{0, 1}, lname + "branch3x3_2a.");
    IActivationLayer* branch3x3b = basicConv2d(
        network, weightMap, *branch3x3->getOutput(0), 384, DimsHW{3, 1}, 1, DimsHW{1, 0}, lname + "branch3x3_2b.");
    ITensor* pair3x3[] = {branch3x3a->getOutput(0), branch3x3b->getOutput(0)};
    IConcatenationLayer* cat3x3 = network->addConcatenation(pair3x3, 2);
    assert(cat3x3);

    // Branch 3: 1x1 -> padded 3x3, then 1x3 and 3x1 side by side on the same tensor.
    IActivationLayer* branch3x3dbl = basicConv2d(
        network, weightMap, input, 448, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3dbl_1.");
    branch3x3dbl = basicConv2d(
        network, weightMap, *branch3x3dbl->getOutput(0), 384, DimsHW{3, 3}, 1, DimsHW{1, 1}, lname + "branch3x3dbl_2.");
    IActivationLayer* branch3x3dblA = basicConv2d(
        network, weightMap, *branch3x3dbl->getOutput(0), 384, DimsHW{1, 3}, 1, DimsHW{0, 1}, lname + "branch3x3dbl_3a.");
    IActivationLayer* branch3x3dblB = basicConv2d(
        network, weightMap, *branch3x3dbl->getOutput(0), 384, DimsHW{3, 1}, 1, DimsHW{1, 0}, lname + "branch3x3dbl_3b.");
    ITensor* pair3x3dbl[] = {branch3x3dblA->getOutput(0), branch3x3dblB->getOutput(0)};
    IConcatenationLayer* cat3x3dbl = network->addConcatenation(pair3x3dbl, 2);
    assert(cat3x3dbl);

    // Branch 4: 3x3 average pooling (padding counted in the average) followed by a 1x1 convolution.
    IPoolingLayer* avgPool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{1, 1});
    avgPool->setPaddingNd(DimsHW{1, 1});
    avgPool->setAverageCountExcludesPadding(false);
    IActivationLayer* branchPool = basicConv2d(
        network, weightMap, *avgPool->getOutput(0), 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch_pool.");

    ITensor* branches[] = {
        branch1x1->getOutput(0), cat3x3->getOutput(0), cat3x3dbl->getOutput(0), branchPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 4);
    assert(merged);
    return merged;
}

// Create the engine using only the API and not any parser.
//
// Builds the Inception v3 graph layer by layer, loads its weights from "../inception.wts"
// and asks the builder for an engine.
//
// maxBatchSize: forwarded to IBuilder::setMaxBatchSize().
// builder, config: builder and builder configuration owned by the caller.
// dt: data type of the network input tensor.
//
// Returns the result of buildEngineWithConfig() unchanged; the caller is responsible for
// checking it and for destroying the engine. All weight buffers held in the weight map are
// released before returning.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../inception.wts");

    // Per-channel affine transform applied to the raw input before the first convolution.
    // These arrays live on the stack, which is sufficient because the engine is built before
    // this function returns.
    float shval[3] = {(0.485 - 0.5) / 0.5, (0.456 - 0.5) / 0.5, (0.406 - 0.5) / 0.5};
    float scval[3] = {0.229 / 0.5, 0.224 / 0.5, 0.225 / 0.5};
    float pval[3] = {1.0, 1.0, 1.0};
    Weights shift{DataType::kFLOAT, shval, 3};
    Weights scale{DataType::kFLOAT, scval, 3};
    Weights power{DataType::kFLOAT, pval, 3};
    IScaleLayer* scale1 = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale1);

    // Stem: plain convolutions and max-pooling ahead of the inception blocks.
    IActivationLayer* relu1 = basicConv2d(network, weightMap, *scale1->getOutput(0), 32, DimsHW{3, 3}, 2, DimsHW{0, 0}, "Conv2d_1a_3x3.");
    relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 32, DimsHW{3, 3}, 1, DimsHW{0, 0}, "Conv2d_2a_3x3.");
    relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 64, DimsHW{3, 3}, 1, DimsHW{1, 1}, "Conv2d_2b_3x3.");
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    relu1 = basicConv2d(network, weightMap, *pool1->getOutput(0), 80, DimsHW{1, 1}, 1, DimsHW{0, 0}, "Conv2d_3b_1x1.");
    relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 192, DimsHW{3, 3}, 1, DimsHW{0, 0}, "Conv2d_4a_3x3.");
    pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1); // this second pooling layer was previously dereferenced without a check
    pool1->setStrideNd(DimsHW{2, 2});

    // Inception blocks, named after the weight prefixes they load.
    auto cat1 = inceptionA(network, weightMap, *pool1->getOutput(0), "Mixed_5b.", 32);
    cat1 = inceptionA(network, weightMap, *cat1->getOutput(0), "Mixed_5c.", 64);
    cat1 = inceptionA(network, weightMap, *cat1->getOutput(0), "Mixed_5d.", 64);
    cat1 = inceptionB(network, weightMap, *cat1->getOutput(0), "Mixed_6a.");
    cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6b.", 128);
    cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6c.", 160);
    cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6d.", 160);
    cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6e.", 192);
    cat1 = inceptionD(network, weightMap, *cat1->getOutput(0), "Mixed_7a.");
    cat1 = inceptionE(network, weightMap, *cat1->getOutput(0), "Mixed_7b.");
    cat1 = inceptionE(network, weightMap, *cat1->getOutput(0), "Mixed_7c.");

    // Head: 8x8 average pooling followed by the 1000-way fully connected layer.
    IPoolingLayer* pool2 = network->addPoolingNd(*cat1->getOutput(0), PoolingType::kAVERAGE, DimsHW{8, 8});
    assert(pool2);

    IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(fc1);

    fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*fc1->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory (loaded blobs as well as the arrays added by addBatchNorm2d)
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the network through the TensorRT API and hands back the serialized engine.
//
// maxBatchSize: forwarded to createEngine().
// modelStream: out-parameter; receives the serialized engine. The caller owns it and must
//              destroy it.
//
// Builder, config and engine creation are each guarded by an assert.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down in reverse order of creation, so the config is released
    // before the builder that produced it
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference.
//
// context:   execution context of an engine with exactly two bindings (input and output).
// input:     host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats.
// output:    host buffer receiving batchSize * OUTPUT_SIZE floats.
// batchSize: number of images in the batch.
//
// Device buffers and the CUDA stream are created and released on every call. Any CUDA error
// or a failed enqueue aborts the process.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Byte counts, computed in size_t so that large batches cannot overflow int arithmetic
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    const bool enqueued = context.enqueue(batchSize, buffers, stream, nullptr);
    if (!enqueued)
    {
        // Without this check a failed launch would silently leave `output` untouched
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./inception -s   // serialize model to plan file" << std::endl;
        std::cerr << "./inception -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("inception.engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 1;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file("inception.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        return -1;
    }


    // Subtract mean from image
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        std::cout << prob[i] << ", ";
        if (i % 10 == 0) std::cout << i / 10 << std::endl;
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: inception/inceptionv3/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that collects a log message and, when synchronized or destroyed,
//!  forwards it to a target stream preceded by a timestamp and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the prefixed message
    //! \param prefix    text inserted in front of every flushed message
    //! \param shouldLog when false, putOutput() does nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, the prefix and the logging flag of \p other.
    //! Previously only the stream was taken over, which left mShouldLog uninitialized and
    //! mPrefix empty. Text already buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] " followed by the prefix and the buffered text, then
    //! clears the buffer and flushes the target stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            // NOTE(review): the timestamp goes to std::cout even when mOutput is a different
            // stream - confirm this is intended.
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables forwarding of buffered text.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // destination of the prefixed message
    std::string mPrefix;   // text written in front of every message
    bool mShouldLog;       // gate checked by putOutput()
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
public:
    //! \param stream    destination stream handed to the buffer
    //! \param prefix    text the buffer puts in front of every emitted message
    //! \param shouldLog initial enabled/disabled state of the buffer
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    // Owned stream buffer; derived classes pass its address to std::ostream.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!  Messages are emitted when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Move constructor.
    //!  Builds a fresh buffer from the severity and enabled flag of \p other; text already
    //!  buffered in \p other is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer emits output against a new reportable severity
    //!  and forwards the result to the underlying buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! \brief Selects the destination stream: std::cout for kINFO and above (numerically), std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Maps a severity to the bracketed tag placed in front of each message.
    //!  An unknown severity trips the assert and yields an empty prefix when asserts are disabled.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;    // true when messages of mSeverity are currently emitted
    Severity mSeverity; // severity assigned to every message written through this consumer
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger with the given reportable severity (kWARNING unless specified)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is tagged with "[TRT] " and routed through a temporary LogStreamConsumer, which
    //! decides from mReportableSeverity whether it is actually emitted.
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        // Only Logger (a friend) can create atoms; see Logger::defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set by Logger::reportTestStart()
        std::string mName;    // test name printed in result lines
        std::string mCmdline; // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // NOTE(review): the RUNNING line is printed before the precondition is asserted, so a
        // repeated start still emits one extra line before the assert fires.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed.
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed.
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived.
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass.
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently used to filter messages
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints one line of the form "&&&& <RESULT> <name> # <cmdline>" to the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; // threshold passed to every LogStreamConsumer created from this logger
};

namespace
{

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    // The logger's current threshold decides whether kVERBOSE text is actually emitted.
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    // The logger's current threshold decides whether kINFO text is actually emitted.
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    // The logger's current threshold decides whether kWARNING text is actually emitted.
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    // The logger's current threshold decides whether kERROR text is actually emitted.
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//         ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    // The logger's current threshold decides whether kINTERNAL_ERROR text is actually emitted.
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: inception/inceptionv4/CMakeLists.txt
================================================
# CMake 4.x refuses to configure projects that request compatibility with versions older than 3.5.
cmake_minimum_required(VERSION 3.5)

project(InceptionV4)

add_definitions(-std=c++11)

# option() takes (<variable> "<help text>" [value]); without the help text, "OFF" was used as the
# description. It has to be defined before find_package(CUDA) to influence the CUDA module.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
# Default to a Debug build, but honour -DCMAKE_BUILD_TYPE=... given on the command line.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# All headers and sources of this directory make up the single executable.
file(GLOB SOURCE_FILES "*.h" "*.cpp")

add_executable(inceptionv4 ${SOURCE_FILES})
target_link_libraries(inceptionv4 nvinfer)
target_link_libraries(inceptionv4 cudart)
target_link_libraries(inceptionv4 ${OpenCV_LIBS})

# NOTE(review): -O2/-pthread are compiler options rather than preprocessor definitions; left as-is
# because moving them would change their position relative to -Ofast on the command line.
add_definitions(-O2 -pthread)


================================================
FILE: inception/inceptionv4/README.md
================================================
# Inception v4

Inception v4 model architecture from "Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning" <https://arxiv.org/abs/1602.07261v2>.

For the details, you can refer to [rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/inception_v4.py)

The following tricks are used in this implementation:

- For pooling layers with padding, pay attention to whether the padded elements are included or excluded when the average is computed. PyTorch includes the padding in `avgPool` by default, but TensorRT does not, so for pooling layers with padding we need to call `setAverageCountExcludesPadding(false)` in TensorRT.
- Batch-norm layers are implemented with a TensorRT scale layer.

```
// 1. generate inception.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz/blob/main/generate_weights.py)

// 2. put inception.wts into tensorrtx/inception/inceptionv4

// 3. build and run

cd tensorrtx/inception/inceptionv4

mkdir build

cd build

cmake ..

make

sudo ./inceptionv4 -s   // serialize model to plan file i.e. 'inceptionV4.engine'

sudo ./inceptionv4 -d   // deserialize plan file and run inference

// 4. see if the output is same as rwightman/pytorch-image-models/inceptionv4
```


================================================
FILE: inception/inceptionv4/inception_v4.cpp
================================================
# include "inception_v4.h"


namespace trtx {
    // Stores the configuration only. The engine is produced by serializeEngine() or
    // deserializeCudaEngine(), and the execution context by deserializeCudaEngine().
    // Initialisers are listed in member declaration order (mParams, mEngine, mContext),
    // which is the order the compiler uses anyway and avoids a -Wreorder warning.
    InceptionV4::InceptionV4(const InceptionV4Params &params)
    : mParams(params)
    , mEngine(nullptr)
    , mContext(nullptr)
    {
    }

    /**
     * Builds the tensorrt engine and serializes it.
    **/
    // Loads the weights, builds the engine and writes its serialized form to
    // mParams.trtEngineFile. Returns false when the engine cannot be built or the
    // file cannot be opened or written; every TensorRT object created here is
    // released on all return paths.
    bool InceptionV4::serializeEngine()
    {
        // load weights
        weightMap = loadWeights(mParams.weightsFile);

        // create builder
        IBuilder* builder = createInferBuilder(gLogger);
        assert(builder);

        // create builder config
        IBuilderConfig* config = builder -> createBuilderConfig();
        assert(config);

        // create engine
        const bool created = buildEngine(builder, config);
        if (!created)
        {
            std::cerr << "Engine creation failed. Check logs." << std::endl;
            // the builder objects are no longer needed; do not leak them on failure
            config -> destroy();
            builder -> destroy();
            return false;
        }

        // serialize engine
        assert(mEngine != nullptr);
        IHostMemory* modelStream = mEngine -> serialize();
        assert(modelStream != nullptr);

        // destroy
        config -> destroy();
        builder -> destroy();

        // write serialized engine to file
        std::ofstream trtFile(mParams.trtEngineFile, std::ios::binary);
        if (!trtFile)
        {
            std::cerr << "Unable to open engine file." << std::endl;
            modelStream -> destroy();
            return false;
        }

        trtFile.write(reinterpret_cast<const char*>(modelStream -> data()), modelStream -> size());
        // push the data out now so that a write error is visible in the stream state
        trtFile.flush();
        const bool written = trtFile.good();

        // clean
        modelStream -> destroy();

        if (!written)
        {
            std::cerr << "Unable to write engine file." << std::endl;
            return false;
        }

        std::cout << "Engine serialized and saved." << std::endl;
        return true;
    }

    // Assembles the Inception-v4 network from weightMap, builds mEngine from it and releases the
    // host-side weights. Returns true when an engine was produced.
    bool InceptionV4::buildEngine(IBuilder *builder, IBuilderConfig *config) {
        // Frees every host weight buffer and empties the map so no dangling pointers remain.
        auto releaseHostWeights = [this]() {
            for (auto& mem : weightMap)
            {
                free((void*) (mem.second.values));
            }
            weightMap.clear();
        };

        INetworkDefinition* network = builder->createNetworkV2(0U);
        if (network == nullptr)
        {
            releaseHostWeights();
            return false;
        }

        // Create input tensor of shape { 3, inputH, inputW } named mParams.inputTensorName
        ITensor* data = network->addInput(mParams.inputTensorName, dt, Dims3{3, mParams.inputH, mParams.inputW});
        assert(data);

        // Per-channel input rescaling applied before the first convolution. The arrays live on the
        // stack, which is sufficient because the engine is built before this function returns.
        float shval[3] = {(0.485 - 0.5) / 0.5, (0.456 - 0.5) / 0.5, (0.406 - 0.5) / 0.5};
        float scval[3] = {0.229 / 0.5, 0.224 / 0.5, 0.225 / 0.5};
        float pval[3] = {1.0, 1.0, 1.0};
        Weights shift{DataType::kFLOAT, shval, 3};
        Weights scale{DataType::kFLOAT, scval, 3};
        Weights power{DataType::kFLOAT, pval, 3};
        IScaleLayer* scale1 = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, power);
        assert(scale1);

        // Stem: three conv-bn-relu blocks
        IActivationLayer* relu0 = basicConv2d(network, weightMap, *scale1 -> getOutput(0), 32, DimsHW{ 3, 3 }, 2, DimsHW{ 0, 0 }, "features.0");
        relu0 = basicConv2d(network, weightMap, *relu0 -> getOutput(0), 32, DimsHW{ 3, 3 }, 1, DimsHW{ 0, 0 }, "features.1");
        relu0 = basicConv2d(network, weightMap, *relu0 -> getOutput(0), 64, DimsHW{ 3, 3 }, 1, DimsHW{ 1, 1 }, "features.2");

        auto cat0 = mixed_3a(network, weightMap, *relu0 -> getOutput(0), "features.3");
        cat0 = mixed_4a(network, weightMap, *cat0 -> getOutput(0), "features.4");
        cat0 = mixed_5a(network, weightMap, *cat0 -> getOutput(0), "features.5");
        cat0 = inceptionA(network, weightMap, *cat0 -> getOutput(0), "features.6");
        cat0 = inceptionA(network, weightMap, *cat0 -> getOutput(0), "features.7");
        cat0 = inceptionA(network, weightMap, *cat0 -> getOutput(0), "features.8");
        cat0 = inceptionA(network, weightMap, *cat0 -> getOutput(0), "features.9");
        cat0 = reductionA(network, weightMap, *cat0 -> getOutput(0), "features.10");

        cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.11");
        cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.12");
        cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.13");
        cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.14");
        cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.15");
        cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.16");
        cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.17");
        cat0 = reductionB(network, weightMap, *cat0 -> getOutput(0), "features.18");

        cat0 = inceptionC(network, weightMap, *cat0 -> getOutput(0), "features.19");
        cat0 = inceptionC(network, weightMap, *cat0 -> getOutput(0), "features.20");
        cat0 = inceptionC(network, weightMap, *cat0 -> getOutput(0), "features.21");

        // 8x8 average pooling followed by the 1000-way classifier
        IPoolingLayer* pool2 = network->addPoolingNd(*cat0->getOutput(0), PoolingType::kAVERAGE, DimsHW{8, 8});
        assert(pool2);

        IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["last_linear.weight"], weightMap["last_linear.bias"]);
        assert(fc1);

        fc1->getOutput(0)->setName(mParams.outputTensorName);
        std::cout << "set name out" << std::endl;
        network->markOutput(*fc1->getOutput(0));

        // Build engine
        builder->setMaxBatchSize(mParams.batchSize);
        config->setMaxWorkspaceSize(1 << 28);
        if (mParams.fp16)
            config->setFlag(BuilderFlag::kFP16);
        mEngine = builder->buildEngineWithConfig(*network, *config);
        std::cout << "build out" << std::endl;

        // Don't need the network any more
        network->destroy();

        // Release host memory
        releaseHostWeights();

        return mEngine != nullptr;
    }

    // Makes sure both mEngine and mContext exist. The engine is read from
    // mParams.trtEngineFile unless one is already present. Returns false when the
    // file cannot be read or TensorRT fails to produce the engine or the context.
    bool InceptionV4::deserializeCudaEngine() {
        if (mContext != nullptr && mEngine != nullptr)
        {
            return true;
        }

        if (mEngine == nullptr)
        {
            // open file
            std::ifstream f(mParams.trtEngineFile, std::ios::binary);
            if (!f.good())
            {
                return false;
            }

            // get size; tellg() reports -1 on failure, and an empty file cannot hold an engine
            f.seekg(0, f.end);
            const std::streamoff fileSize = f.tellg();
            f.seekg(0, f.beg);
            if (fileSize <= 0)
            {
                return false;
            }

            // read data as a block; the vector releases the memory on every return path
            std::vector<char> trtModelStream(static_cast<size_t>(fileSize));
            f.read(trtModelStream.data(), fileSize);
            if (!f)
            {
                return false;
            }
            f.close();

            // deserialize
            IRuntime* runtime = createInferRuntime(gLogger);
            if (runtime == nullptr)
            {
                return false;
            }

            mEngine = runtime -> deserializeCudaEngine(trtModelStream.data(), trtModelStream.size(), nullptr);

            // clean up
            runtime -> destroy();

            if (mEngine == nullptr)
            {
                std::cerr << "Failed to deserialize engine." << std::endl;
                return false;
            }
        }

        std::cout << "deserialized engine successfully." << std::endl;

        // create execution context
        mContext = mEngine -> createExecutionContext();
        if (mContext == nullptr)
        {
            std::cerr << "Failed to create execution context." << std::endl;
            return false;
        }

        return true;
    }

    // Runs one batch through the engine: copies `input` to the device, enqueues inference and
    // copies the result into `output`. Device buffers and the stream are created and released
    // on every call. `input` must hold batchSize * 3 * inputH * inputW floats and `output`
    // batchSize * 1000 floats, matching the sizes allocated below.
    void InceptionV4::doInference(float* input, float* output, int batchSize) {
        // Pointers to input and output device buffers to pass to engine.
        // Engine requires exactly IEngine::getNbBindings() number of buffers.
        assert(mEngine -> getNbBindings() == 2);
        void* buffers[2];

        // In order to bind the buffers, we need to know the names of the input and output tensors.
        const int inputIndex = mEngine->getBindingIndex(mParams.inputTensorName);
        const int outputIndex = mEngine->getBindingIndex(mParams.outputTensorName);
        // An unknown tensor name would otherwise index outside of `buffers`.
        assert(inputIndex >= 0 && inputIndex < 2);
        assert(outputIndex >= 0 && outputIndex < 2);

        // Byte counts, computed in size_t so the multiplication is not carried out in int.
        const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * mParams.inputH * mParams.inputW * sizeof(float);
        const size_t outputBytes = static_cast<size_t>(batchSize) * 1000 * sizeof(float);

        // Create GPU buffers on device
        CUDA_CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
        CUDA_CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

        // Create stream
        cudaStream_t stream;
        CUDA_CHECK(cudaStreamCreate(&stream));

        // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
        CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
        const bool enqueued = mContext->enqueue(batchSize, buffers, stream, nullptr);
        if (enqueued)
        {
            CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
        }
        else
        {
            // Nothing was computed, so `output` is left untouched.
            std::cerr << "Failed to enqueue inference." << std::endl;
        }
        // Errors of the asynchronous work surface here, so the result must not be ignored.
        CUDA_CHECK(cudaStreamSynchronize(stream));

        // Release stream and buffers
        CUDA_CHECK(cudaStreamDestroy(stream));
        CUDA_CHECK(cudaFree(buffers[inputIndex]));
        CUDA_CHECK(cudaFree(buffers[outputIndex]));
    }

    /**
     * Cleans up any state created in the InceptionV4Trt class
    **/
    // Destroys the execution context and the engine, if present, and always returns true.
    // The pointers are reset afterwards so that a repeated call is harmless and
    // deserializeCudaEngine() does not mistake destroyed objects for live ones.
    bool InceptionV4::cleanUp()
    {
        if (mContext != nullptr)
        {
            mContext -> destroy();
            mContext = nullptr;
        }

        if (mEngine != nullptr)
        {
            mEngine -> destroy();
            mEngine = nullptr;
        }

        return true;
    }
}


================================================
FILE: inception/inceptionv4/inception_v4.h
================================================
#ifndef TRTX_INCEPTION_NETWORK_H
#define TRTX_INCEPTION_NETWORK_H


#include <memory>
#include <vector>
#include <chrono>
#include <opencv2/opencv.hpp>

#include "logging.h"
#include "utils.h"
#include "layers_api.h"


// Logger handed to the TensorRT builder/runtime factories.
// NOTE(review): being `static` in a header, every translation unit that includes this file gets
// its own gLogger instance.
static Logger gLogger;
// NOTE(review): a using-directive in a header exposes all trtxlayers names to every includer.
using namespace trtxlayers;

namespace trtx {
    // Configuration of an InceptionV4 instance. Every member has an initializer so that a
    // default-constructed object never carries indeterminate values; inputW/inputH (and
    // outputSize, if used) still have to be set by the caller before building an engine.
    struct InceptionV4Params
    {
        /* data */
        int32_t batchSize{1};              // Number of inputs in a batch
        bool int8{false};                  // Allow running the network in Int8 mode.
        bool fp16{false};                  // Allow running the network in FP16 mode.
        const char* inputTensorName = "data";   // Name given to the network input tensor.
        const char* outputTensorName = "prob";  // Name given to the network output tensor.

        int inputW{0};             // The input width of the network.
        int inputH{0};             // The input height of the network.
        int outputSize{0};         // The output size of the network.
        std::string weightsFile;   // Weights file filename.
        std::string trtEngineFile; // trt engine file name
    };
    
    // Builds, serializes, loads and runs an Inception-v4 TensorRT engine.
    class InceptionV4 {
    public:
        // Stores a copy of the configuration; no TensorRT objects are created here.
        InceptionV4(const InceptionV4Params &enginecfg);
        // The destructor is empty: the engine and context are released by cleanUp() only.
        ~InceptionV4() {};

        bool serializeEngine();                  // create & serialize network engine
        bool deserializeCudaEngine();            // load the engine file (if needed) and create the execution context

        // Runs the TensorRT network inference engine on one batch.
        void doInference(float* input, float* output, int batchSize);
        bool cleanUp();                          // destroy the execution context and the engine
    private:
        // Defines the network and builds mEngine from it; called by serializeEngine().
        bool buildEngine(IBuilder *builder, IBuilderConfig *config);
    private:
        // Members are initialized in this declaration order.
        InceptionV4Params mParams;
        ICudaEngine* mEngine;  // The tensorrt engine used to run the network.
        std::map<std::string, Weights> weightMap; // The weight value map.
        IExecutionContext* mContext; // The TensorRT execution context to run inference.
        std::string inception; // NOTE(review): not referenced by the visible implementation -- confirm before removing
        DataType dt{DataType::kFLOAT}; // Data type of the network input tensor.
    };
}

#endif

================================================
FILE: inception/inceptionv4/layers_api.cpp
================================================
#include "layers_api.h"

namespace trtxlayers {
    /**
     * Adds an inference-time BatchNorm2d as a single per-channel TensorRT scale layer.
     *
     * Reads `<lname>.weight` (gamma), `<lname>.bias` (beta), `<lname>.running_mean` and
     * `<lname>.running_var` from weightMap and folds them into
     *     scale = gamma / sqrt(var + eps)
     *     shift = beta - mean * gamma / sqrt(var + eps)
     *     power = 1
     * which are passed to addScale() in ScaleMode::kCHANNEL.
     *
     * The three buffers are malloc'd here and registered in weightMap under
     * `<lname>.scale`, `<lname>.shift` and `<lname>.power`; this function never frees
     * them (presumably whoever releases the weightMap values does -- confirm in caller).
     *
     * @param network   network the scale layer is added to
     * @param weightMap weight table; gains the three derived entries described above
     * @param input     tensor to normalize
     * @param lname     key prefix of the batch-norm weights
     * @param eps       epsilon added to the variance before the square root
     * @return the created scale layer (asserted non-null)
     */
    IScaleLayer* addBatchNorm2d(
        INetworkDefinition *network, 
        std::map<std::string, Weights>& weightMap, 
        ITensor& input, 
        std::string lname, 
        float eps
    )
    {
        // std::map::operator[] silently inserts an empty Weights for a missing key, so
        // validate the lookups instead of dereferencing a null `values` pointer below.
        // References into a std::map stay valid across the later insertions.
        const Weights& gammaW = weightMap[lname + ".weight"];
        const Weights& betaW = weightMap[lname + ".bias"];
        const Weights& meanW = weightMap[lname + ".running_mean"];
        const Weights& varW = weightMap[lname + ".running_var"];
        assert(gammaW.values && betaW.values && meanW.values && varW.values);
        assert(gammaW.count == varW.count && betaW.count == varW.count && meanW.count == varW.count);

        const float *gamma = static_cast<const float*>(gammaW.values);
        const float *beta = static_cast<const float*>(betaW.values);
        const float *mean = static_cast<const float*>(meanW.values);
        const float *var = static_cast<const float*>(varW.values);
        int len = static_cast<int>(varW.count);
        assert(len > 0);
        std::cout << "len " << len << std::endl;

        float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
        float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
        float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
        assert(scval && shval && pval);

        for (int i = 0; i < len; i++) {
            // Same expressions as before, with the square root evaluated once per channel.
            const auto stddev = sqrt(var[i] + eps);
            scval[i] = gamma[i] / stddev;
            shval[i] = beta[i] - mean[i] * gamma[i] / stddev;
            pval[i] = 1.0;
        }

        Weights scale{DataType::kFLOAT, scval, len};
        Weights shift{DataType::kFLOAT, shval, len};
        Weights power{DataType::kFLOAT, pval, len};

        // Keep the buffers reachable through the weight map after this function returns.
        weightMap[lname + ".scale"] = scale;
        weightMap[lname + ".shift"] = shift;
        weightMap[lname + ".power"] = power;
        IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
        assert(scale_1);
        return scale_1;
    }

    /**
     * Appends the conv -> batch-norm -> ReLU triple to the network.
     *
     * The convolution uses `<lname>.conv.weight` from weightMap and no bias; the batch
     * norm is built by addBatchNorm2d() from the `<lname>.bn` entries with eps = 1e-3.
     *
     * @param outch number of convolution output maps
     * @param ksize convolution kernel size
     * @param s     stride, applied to both dimensions
     * @param p     convolution padding
     * @return the ReLU activation layer that ends the triple
     */
    IActivationLayer* basicConv2d(
        INetworkDefinition *network,
        std::map<std::string, Weights>& weightMap, 
        ITensor& input,  
        int outch, 
        DimsHW ksize, 
        int s, 
        DimsHW p, 
        std::string lname
    )
    {
        // The convolution carries no bias term, so hand TensorRT an empty weight set.
        const Weights noBias{DataType::kFLOAT, nullptr, 0};

        // Convolution
        IConvolutionLayer* convLayer =
            network->addConvolutionNd(input, outch, ksize, weightMap[lname + ".conv.weight"], noBias);
        assert(convLayer);
        convLayer->setStrideNd(DimsHW{s, s});
        convLayer->setPaddingNd(p);
        ITensor* convOut = convLayer->getOutput(0);

        // Batch normalization
        IScaleLayer* bnLayer = addBatchNorm2d(network, weightMap, *convOut, lname + ".bn", 1e-3);
        ITensor* bnOut = bnLayer->getOutput(0);

        // ReLU
        IActivationLayer* activation = network->addActivation(*bnOut, ActivationType::kRELU);
        assert(activation);
        return activation;
    }

    /**
     * Mixed_3a block: a 3x3 stride-2 max pool and a 96-map 3x3 stride-2 conv
     * (`<lname>.conv`) applied to the same input, then concatenated (pool first).
     */
    IConcatenationLayer* mixed_3a(
        INetworkDefinition *network,
        std::map<std::string, Weights>& weightMap, 
        ITensor& input,  
        std::string lname
    )
    {
        // Branch 0: max pooling.
        IPoolingLayer* maxPool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3});
        assert(maxPool);
        maxPool->setStrideNd(DimsHW{2, 2});

        // Branch 1: conv-bn-relu.
        IActivationLayer* convBranch =
            basicConv2d(network, weightMap, input, 96, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + ".conv");

        // Join both branches.
        ITensor* branches[] = { maxPool->getOutput(0), convBranch->getOutput(0) };
        IConcatenationLayer* merged = network->addConcatenation(branches, 2);
        assert(merged);
        return merged;
    }

    /**
     * Mixed_4a block: two stride-1 conv chains over the same input, concatenated.
     *   branch0: 1x1(64) -> 3x3(96)
     *   branch1: 1x1(64) -> 1x7(64) -> 7x1(64) -> 3x3(96)
     * Weight keys are `<lname>.branch<b>.<i>`.
     */
    IConcatenationLayer* mixed_4a(
        INetworkDefinition *network,
        std::map<std::string, Weights>& weightMap, 
        ITensor& input,  
        std::string lname
    )
    {
        // Stride-1 conv-bn-relu whose weights live under lname + suffix; yields its output tensor.
        auto conv = [&](ITensor& in, int outch, DimsHW kernel, DimsHW pad, const char* suffix) -> ITensor* {
            return basicConv2d(network, weightMap, in, outch, kernel, 1, pad, lname + suffix)->getOutput(0);
        };

        // Branch 0
        ITensor* b0 = conv(input, 64, DimsHW{1, 1}, DimsHW{0, 0}, ".branch0.0");
        b0 = conv(*b0, 96, DimsHW{3, 3}, DimsHW{0, 0}, ".branch0.1");

        // Branch 1
        ITensor* b1 = conv(input, 64, DimsHW{1, 1}, DimsHW{0, 0}, ".branch1.0");
        b1 = conv(*b1, 64, DimsHW{1, 7}, DimsHW{0, 3}, ".branch1.1");
        b1 = conv(*b1, 64, DimsHW{7, 1}, DimsHW{3, 0}, ".branch1.2");
        b1 = conv(*b1, 96, DimsHW{3, 3}, DimsHW{0, 0}, ".branch1.3");

        // Join both branches.
        ITensor* branches[] = { b0, b1 };
        IConcatenationLayer* merged = network->addConcatenation(branches, 2);
        assert(merged);
        return merged;
    }

    /**
     * Mixed_5a block: a 192-map 3x3 stride-2 conv (`<lname>.conv`) and a 3x3 stride-2
     * max pool over the same input, concatenated (conv first).
     * Prints a progress line to stdout on entry and before returning.
     */
    IConcatenationLayer* mixed_5a(
        INetworkDefinition *network,
        std::map<std::string, Weights>& weightMap, 
        ITensor& input,  
        std::string lname
    )
    {
        std::cout << "mixed_5a" << std::endl;

        // Branch 0: conv-bn-relu.
        IActivationLayer* convBranch =
            basicConv2d(network, weightMap, input, 192, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + ".conv");

        // Branch 1: max pooling.
        IPoolingLayer* maxPool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3});
        assert(maxPool);
        maxPool->setStrideNd(DimsHW{2, 2});

        // Join both branches.
        ITensor* branches[] = { convBranch->getOutput(0), maxPool->getOutput(0) };
        IConcatenationLayer* merged = network->addConcatenation(branches, 2);
        assert(merged);

        std::cout << "mixed_5a done" << std::endl;
        return merged;
    }

    /**
     * Inception-A block: four parallel stride-1 branches over the same input, concatenated.
     *   branch0: 1x1(96)
     *   branch1: 1x1(64) -> 3x3(96, pad 1)
     *   branch2: 1x1(64) -> 3x3(96, pad 1) -> 3x3(96, pad 1)
     *   branch3: 3x3 average pool (stride 1, pad 1, padding counted) -> 1x1(96)
     */
    IConcatenationLayer* inceptionA(
        INetworkDefinition *network,
        std::map<std::string, Weights>& weightMap, 
        ITensor& input,  
        std::string lname
    )
    {
        // Stride-1 conv-bn-relu whose weights live under lname + suffix; yields its output tensor.
        auto conv = [&](ITensor& in, int outch, DimsHW kernel, DimsHW pad, const char* suffix) -> ITensor* {
            return basicConv2d(network, weightMap, in, outch, kernel, 1, pad, lname + suffix)->getOutput(0);
        };

        // Branch 0
        ITensor* b0 = conv(input, 96, DimsHW{1, 1}, DimsHW{0, 0}, ".branch0");

        // Branch 1
        ITensor* b1 = conv(input, 64, DimsHW{1, 1}, DimsHW{0, 0}, ".branch1.0");
        b1 = conv(*b1, 96, DimsHW{3, 3}, DimsHW{1, 1}, ".branch1.1");

        // Branch 2
        ITensor* b2 = conv(input, 64, DimsHW{1, 1}, DimsHW{0, 0}, ".branch2.0");
        b2 = conv(*b2, 96, DimsHW{3, 3}, DimsHW{1, 1}, ".branch2.1");
        b2 = conv(*b2, 96, DimsHW{3, 3}, DimsHW{1, 1}, ".branch2.2");

        // Branch 3
        IPoolingLayer* avgPool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3});
        assert(avgPool);
        avgPool->setStrideNd(DimsHW{1, 1});
        avgPool->setPaddingNd(DimsHW{1, 1});
        avgPool->setAverageCountExcludesPadding(false);
        ITensor* b3 = conv(*avgPool->getOutput(0), 96, DimsHW{1, 1}, DimsHW{0, 0}, ".branch3.1");

        // Join all four branches.
        ITensor* branches[] = { b0, b1, b2, b3 };
        IConcatenationLayer* merged = network->addConcatenation(branches, 4);
        assert(merged);
        return merged;
    }

    /**
     * Reduction-A block: three parallel stride-2 branches over the same input, concatenated.
     *   branch0: 3x3(384, stride 2)
     *   branch1: 1x1(192) -> 3x3(224, pad 1) -> 3x3(256, stride 2)
     *   branch2: 3x3 max pool, stride 2
     */
    IConcatenationLayer* reductionA(
        INetworkDefinition *network,
        std::map<std::string, Weights>& weightMap, 
        ITensor& input,  
        std::string lname
    )
    {
        // conv-bn-relu whose weights live under lname + suffix; yields its output tensor.
        auto conv = [&](ITensor& in, int outch, DimsHW kernel, int stride, DimsHW pad, const char* suffix) -> ITensor* {
            return basicConv2d(network, weightMap, in, outch, kernel, stride, pad, lname + suffix)->getOutput(0);
        };

        // Branch 0
        ITensor* b0 = conv(input, 384, DimsHW{3, 3}, 2, DimsHW{0, 0}, ".branch0");

        // Branch 1
        ITensor* b1 = conv(input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, ".branch1.0");
        b1 = conv(*b1, 224, DimsHW{3, 3}, 1, DimsHW{1, 1}, ".branch1.1");
        b1 = conv(*b1, 256, DimsHW{3, 3}, 2, DimsHW{0, 0}, ".branch1.2");

        // Branch 2
        IPoolingLayer* maxPool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3});
        assert(maxPool);
        maxPool->setStrideNd(DimsHW{2, 2});

        // Join all three branches.
        ITensor* branches[] = { b0, b1, maxPool->getOutput(0) };
        IConcatenationLayer* merged = network->addConcatenation(branches, 3);
        assert(merged);
        return merged;
    }

    /**
     * Inception-B block: four parallel stride-1 branches over the same input, concatenated.
     *   branch0: 1x1(384)
     *   branch1: 1x1(192) -> 1x7(224) -> 7x1(256)
     *   branch2: 1x1(192) -> 7x1(192) -> 1x7(224) -> 7x1(224) -> 1x7(256)
     *   branch3: 3x3 average pool (stride 1, pad 1, padding counted) -> 1x1(128)
     * The 1x7 / 7x1 convolutions pad by 3 along their long side.
     */
    IConcatenationLayer* inceptionB(
        INetworkDefinition *network,
        std::map<std::string, Weights>& weightMap, 
        ITensor& input,  
        std::string lname
    )
    {
        // Stride-1 conv-bn-relu whose weights live under lname + suffix; yields its output tensor.
        auto conv = [&](ITensor& in, int outch, DimsHW kernel, DimsHW pad, const char* suffix) -> ITensor* {
            return basicConv2d(network, weightMap, in, outch, kernel, 1, pad, lname + suffix)->getOutput(0);
        };

        // Branch 0
        ITensor* b0 = conv(input, 384, DimsHW{1, 1}, DimsHW{0, 0}, ".branch0");

        // Branch 1
        ITensor* b1 = conv(input, 192, DimsHW{1, 1}, DimsHW{0, 0}, ".branch1.0");
        b1 = conv(*b1, 224, DimsHW{1, 7}, DimsHW{0, 3}, ".branch1.1");
        b1 = conv(*b1, 256, DimsHW{7, 1}, DimsHW{3, 0}, ".branch1.2");

        // Branch 2
        ITensor* b2 = conv(input, 192, DimsHW{1, 1}, DimsHW{0, 0}, ".branch2.0");
        b2 = conv(*b2, 192, DimsHW{7, 1}, DimsHW{3, 0}, ".branch2.1");
        b2 = conv(*b2, 224, DimsHW{1, 7}, DimsHW{0, 3}, ".branch2.2");
        b2 = conv(*b2, 224, DimsHW{7, 1}, DimsHW{3, 0}, ".branch2.3");
        b2 = conv(*b2, 256, DimsHW{1, 7}, DimsHW{0, 3}, ".branch2.4");

        // Branch 3
        IPoolingLayer* avgPool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3});
        assert(avgPool);
        avgPool->setStrideNd(DimsHW{1, 1});
        avgPool->setPaddingNd(DimsHW{1, 1});
        avgPool->setAverageCountExcludesPadding(false);
        ITensor* b3 = conv(*avgPool->getOutput(0), 128, DimsHW{1, 1}, DimsHW{0, 0}, ".branch3.1");

        // Join all four branches.
        ITensor* branches[] = { b0, b1, b2, b3 };
        IConcatenationLayer* merged = network->addConcatenation(branches, 4);
        assert(merged);

        return merged;
    }

    /**
     * Reduction-B block: three parallel stride-2 branches over the same input, concatenated.
     *   branch0: 1x1(192) -> 3x3(192, stride 2)
     *   branch1: 1x1(256) -> 1x7(256) -> 7x1(320) -> 3x3(320, stride 2)
     *   branch2: 3x3 max pool, stride 2
     */
    IConcatenationLayer* reductionB(
        INetworkDefinition *network,
        std::map<std::string, Weights>& weightMap, 
        ITensor& input,  
        std::string lname
    )
    {
        // conv-bn-relu whose weights live under lname + suffix; yields its output tensor.
        auto conv = [&](ITensor& in, int outch, DimsHW kernel, int stride, DimsHW pad, const char* suffix) -> ITensor* {
            return basicConv2d(network, weightMap, in, outch, kernel, stride, pad, lname + suffix)->getOutput(0);
        };

        // Branch 0
        ITensor* b0 = conv(input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, ".branch0.0");
        b0 = conv(*b0, 192, DimsHW{3, 3}, 2, DimsHW{0, 0}, ".branch0.1");

        // Branch 1
        ITensor* b1 = conv(input, 256, DimsHW{1, 1}, 1, DimsHW{0, 0}, ".branch1.0");
        b1 = conv(*b1, 256, DimsHW{1, 7}, 1, DimsHW{0, 3}, ".branch1.1");
        b1 = conv(*b1, 320, DimsHW{7, 1}, 1, DimsHW{3, 0}, ".branch1.2");
        b1 = conv(*b1, 320, DimsHW{3, 3}, 2, DimsHW{0, 0}, ".branch1.3");

        // Branch 2
        IPoolingLayer* maxPool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3});
        assert(maxPool);
        maxPool->setStrideNd(DimsHW{2, 2});

        // Join all three branches.
        ITensor* branches[] = { b0, b1, maxPool->getOutput(0) };
        IConcatenationLayer* merged = network->addConcatenation(branches, 3);
        assert(merged);

        return merged;
    }

    /**
     * Inception-C block: four parallel stride-1 branches over the same input, concatenated.
     *   branch0: 1x1(256)
     *   branch1: 1x1(384) -> { 1x3(256), 3x1(256) } concatenated
     *   branch2: 1x1(384) -> 3x1(448) -> 1x3(512) -> { 1x3(256), 3x1(256) } concatenated
     *   branch3: 3x3 average pool (stride 1, pad 1, padding counted) -> 1x1(256)
     * The 1x3 / 3x1 convolutions pad by 1 along their long side.
     */
    IConcatenationLayer* inceptionC(
        INetworkDefinition *network,
        std::map<std::string, Weights>& weightMap, 
        ITensor& input,  
        std::string lname
    )
    {
        // Stride-1 conv-bn-relu whose weights live under lname + suffix; yields its output tensor.
        auto conv = [&](ITensor& in, int outch, DimsHW kernel, DimsHW pad, const char* suffix) -> ITensor* {
            return basicConv2d(network, weightMap, in, outch, kernel, 1, pad, lname + suffix)->getOutput(0);
        };

        // Branch 0
        ITensor* b0 = conv(input, 256, DimsHW{1, 1}, DimsHW{0, 0}, ".branch0");

        // Branch 1: shared 1x1 stem, then two sibling convs joined together.
        ITensor* stem1 = conv(input, 384, DimsHW{1, 1}, DimsHW{0, 0}, ".branch1_0");
        ITensor* b1a = conv(*stem1, 256, DimsHW{1, 3}, DimsHW{0, 1}, ".branch1_1a");
        ITensor* b1b = conv(*stem1, 256, DimsHW{3, 1}, DimsHW{1, 0}, ".branch1_1b");
        ITensor* pair1[] = { b1a, b1b };
        IConcatenationLayer* join1 = network->addConcatenation(pair1, 2);
        assert(join1);

        // Branch 2: three chained convs, then two sibling convs joined together.
        ITensor* stem2 = conv(input, 384, DimsHW{1, 1}, DimsHW{0, 0}, ".branch2_0");
        stem2 = conv(*stem2, 448, DimsHW{3, 1}, DimsHW{1, 0}, ".branch2_1");
        stem2 = conv(*stem2, 512, DimsHW{1, 3}, DimsHW{0, 1}, ".branch2_2");
        ITensor* b2a = conv(*stem2, 256, DimsHW{1, 3}, DimsHW{0, 1}, ".branch2_3a");
        ITensor* b2b = conv(*stem2, 256, DimsHW{3, 1}, DimsHW{1, 0}, ".branch2_3b");
        ITensor* pair2[] = { b2a, b2b };
        IConcatenationLayer* join2 = network->addConcatenation(pair2, 2);
        assert(join2);

        // Branch 3
        IPoolingLayer* avgPool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3});
        assert(avgPool);
        avgPool->setStrideNd(DimsHW{1, 1});
        avgPool->setPaddingNd(DimsHW{1, 1});
        avgPool->setAverageCountExcludesPadding(false);
        ITensor* b3 = conv(*avgPool->getOutput(0), 256, DimsHW{1, 1}, DimsHW{0, 0}, ".branch3.1");

        // Join all four branches.
        ITensor* branches[] = { b0, join1->getOutput(0), join2->getOutput(0), b3 };
        IConcatenationLayer* merged = network->addConcatenation(branches, 4);
        assert(merged);
        return merged;
    }
}

================================================
FILE: inception/inceptionv4/layers_api.h
================================================
#ifndef TRTX_LAYERS_API_H
#define TRTX_LAYERS_API_H

#include <map>
#include <math.h>
#include <assert.h>
#include <iostream>

#include "NvInfer.h"
#include "cuda_runtime_api.h"

using namespace nvinfer1;

namespace trtxlayers {

    // Layer-builder helpers. Each one appends layers to `network`, looks its weights up
    // in `weightMap` under keys prefixed with `lname`, and returns the last layer it added.

    // Per-channel scale layer built from the `<lname>.weight/.bias/.running_mean/.running_var`
    // entries; `eps` is added to the variance before the square root.
    IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                ITensor& input, std::string lname, float eps);

    // Convolution (`outch` output maps, kernel `ksize`, stride `s` in both dimensions,
    // padding `p`) followed by batch norm and ReLU.
    IActivationLayer* basicConv2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                  ITensor& input, int outch, DimsHW ksize, int s, DimsHW p, std::string lname);

    // Stem blocks.
    IConcatenationLayer* mixed_3a(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                  ITensor& input, std::string lname);

    IConcatenationLayer* mixed_4a(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                  ITensor& input, std::string lname);

    IConcatenationLayer* mixed_5a(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                  ITensor& input, std::string lname);

    // Inception-A and the reduction block that follows it.
    IConcatenationLayer* inceptionA(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                    ITensor& input, std::string lname);

    IConcatenationLayer* reductionA(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                    ITensor& input, std::string lname);

    // Inception-B and the reduction block that follows it.
    IConcatenationLayer* inceptionB(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                    ITensor& input, std::string lname);

    IConcatenationLayer* reductionB(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                    ITensor& input, std::string lname);

    // Inception-C.
    IConcatenationLayer* inceptionC(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                                    ITensor& input, std::string lname);
}

#endif  // TRTX_LAYERS_API_H

================================================
FILE: inception/inceptionv4/logging.h
================================================
/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, emits "[timestamp] <prefix><contents>" to a target stream.
//!
//! Nothing is written (and the buffer is left untouched) while logging is disabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the formatted log lines
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog whether putOutput() actually writes anything
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Copies the target stream, prefix and enable flag of \p other.
    //! Text already buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes the timestamp, prefix and buffered text to the target stream, then empties
    //! the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // Format the timestamp in a private stream: the sticky std::setfill('0') must not
            // leak into the target stream's state, and the timestamp has to land on the same
            // stream as the message (not unconditionally on std::cout).
            std::ostringstream stamp;
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                stamp << "[";
                stamp << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                stamp << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the timestamp and prefix into the stream
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer, so that the buffer is constructed
//!  before the std::ostream base of LogStreamConsumer.
//!
class LogStreamConsumerBase
{
protected:
    LogStreamConsumerBuffer mBuffer;

public:
    //! Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Lets callers log with C++ stream syntax; the text goes through a LogStreamConsumerBuffer.
//!  The base-class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  the buffer member of LogStreamConsumerBase must be fully constructed before its address
//!  is handed to std::ostream. Do not reorder the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  Output is enabled only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the stream to the already-built buffer
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a fresh consumer with the same severity and enable flag as \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the already-built buffer
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer's severity is reportable and updates the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! std::cout for kINFO and anything comparing greater; std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Short bracketed tag for each severity level.
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //! \param severity Initial reportable severity (defaults to kWARNING).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! Declared noexcept so it is a valid override of ILogger::log on TensorRT releases where the base
    //! method is noexcept; a stricter exception specification is also accepted when the base is not.
    //! A null \p msg is logged as an empty message rather than being passed to std::string.
    //!
    void log(Severity severity, const char* msg) noexcept override
    {
        LogStreamConsumer(mReportableSeverity, severity)
            << "[TRT] " << std::string(msg != nullptr ? msg : "") << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the severity threshold currently used for filtering.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces, with no quoting or escaping.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
            {
                ss << " ";
            }
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

// Each helper below returns a LogStreamConsumer for one fixed severity, filtered by the
// reportable severity currently set on `logger`. Typical use:
//
//     LOG_INFO(logger) << "hello world" << std::endl;

//!
//! \brief stream for messages of severity kVERBOSE
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kVERBOSE};
}

//!
//! \brief stream for messages of severity kINFO
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kINFO};
}

//!
//! \brief stream for messages of severity kWARNING
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kWARNING};
}

//!
//! \brief stream for messages of severity kERROR
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kERROR};
}

//!
//! \brief stream for messages of severity kINTERNAL_ERROR ("fatal" severity)
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kINTERNAL_ERROR};
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H

================================================
FILE: inception/inceptionv4/main.cpp
================================================
#include "inception_v4.h"

#include <chrono>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>


/**
 * Builds the InceptionV4Params used by this demo: batch size, precision
 * switch, input/output sizes and the weight/engine file names.
**/
trtx::InceptionV4Params initializeParams()
{
    trtx::InceptionV4Params cfg;

    // execution settings
    cfg.batchSize = 1;
    cfg.fp16 = false;

    // network input height/width and number of output values
    cfg.inputH = 299;
    cfg.inputW = 299;
    cfg.outputSize = 1000;

    // change weights file name here
    cfg.weightsFile = "../inceptionV4.wts";

    // change engine file name here
    cfg.trtEngineFile = "inceptionV4.engine";

    return cfg;
}


/**
 * Entry point.
 *
 *   ./inception -s   build the engine and serialize it to the plan file
 *                    (skipped when the plan file already exists)
 *   ./inception -d   deserialize the plan file, run 100 timed inferences on a
 *                    constant input and print the output values
 *
 * Returns 0 on success and -1 when the command line is not understood.
 */
int main(int argc, char** argv){
    // Validate the mode up front so an unknown flag can never reach the
    // inference code without an engine having been deserialized.
    const std::string mode = (argc == 2) ? argv[1] : "";
    if (mode != "-s" && mode != "-d") {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./inception -s   // serialize model to plan file" << std::endl;
        std::cerr << "./inception -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    trtx::InceptionV4Params params = initializeParams();
    trtx::InceptionV4 inceptionV4(params);

    if (mode == "-s") {
        // check if engine exists already
        std::ifstream f(params.trtEngineFile, std::ios::binary);

        // if engine does not exists build, serialize and save
        if(!f.good())
        {
            std::cout << "Building network ..." << std::endl;
            f.close();
            inceptionV4.serializeEngine();
        }
        else
        {
            std::cout << "Engine file already exists, skipping build: " << params.trtEngineFile << std::endl;
        }

        // Serialization is a successful outcome, so report exit status 0.
        return 0;
    }

    // mode == "-d": deserialize
    inceptionV4.deserializeCudaEngine();

    // create data: constant 1.0 input. Heap storage is used because the
    // buffer is ~1 MB for 3x299x299 floats and its size is only known at
    // run time (a stack array of run-time size is not standard C++).
    std::vector<float> data(static_cast<std::size_t>(3) * params.inputH * params.inputW, 1.0f);

    // run inference
    std::vector<float> prob(params.outputSize);
    for(int i=0; i<100; i++)
    {
        auto start = std::chrono::system_clock::now();
        inceptionV4.doInference(data.data(), prob.data(), 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // cleanup; the returned status is not acted upon
    const bool cleaned = inceptionV4.cleanUp();
    (void)cleaned;

    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < params.outputSize; i++)
    {
        std::cout << prob[i] << ", ";
        if (i % 10 == 0) std::cout << i / 10 << std::endl;
    }
    std::cout << std::endl;

    return 0;
}

================================================
FILE: inception/inceptionv4/utils.cpp
================================================
# include "utils.h"


// Load weights from files.
// TensorRT weight files have a simple whitespace delimited format:
//   <number of blobs>
// followed, for each blob, by:
//   [name] [size] <size values, each a 32-bit word written in hex>
//
// Returns a map from blob name to a Weights record of type kFLOAT whose
// `values` buffer is allocated with malloc(); releasing those buffers is left
// to the caller.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. Size the buffer by the element type (uint32_t); using
        // sizeof the pointer variable would allocate twice what is needed on
        // 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

================================================
FILE: inception/inceptionv4/utils.h
================================================
# ifndef TRTX_UTILS_H
# define TRTX_UTILS_H

#include <map>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "assert.h"
#include <fstream>
#include <iostream>
#include <memory>

#ifndef CUDA_CHECK
// CUDA_CHECK(callstr): evaluates `callstr` once and stores the result as a
// cudaError_t. When the result is not cudaSuccess, the numeric error code and
// the file/line of the call site are written to std::cerr and assert(0) fires.
// NOTE: the macro expands to a plain brace block (not do { } while (0)), and
// the assert compiles away when NDEBUG is defined, in which case a failing
// call is only logged.
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
            assert(0);\
        }\
    }
#endif  // CUDA_CHECK

using namespace nvinfer1;

// Reads the weight file at path `input` and returns its blobs keyed by name.
// NOTE(review): the definition is expected in utils.cpp; confirm buffer
// ownership rules there before freeing or reusing the returned Weights.
std::map<std::string, Weights> loadWeights(const std::string input);

#endif // TRTX_UTILS_H

================================================
FILE: lenet/CMakeLists.txt
================================================
# Build script for the `lenet` executable.
cmake_minimum_required(VERSION 3.17.0)

project(
  lenet
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default list of CUDA architectures; only applied when the caller has not
# already defined CMAKE_CUDA_ARCHITECTURES.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      75
      80
      86
      89
      90
      100
      120)
endif()

# C++17 / CUDA 17 are mandatory (no fallback to an older standard).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Emit compile_commands.json for tooling.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

# Define the TensorRT::TensorRT target via the bundled FindTensorRT.cmake
# unless an enclosing project has already created it.
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

add_executable(${PROJECT_NAME} lenet.cpp)

target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}
                                                   ${OpenCV_INCLUDE_DIRS})

target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart
                                             TensorRT::TensorRT ${OpenCV_LIBS})

# On Windows, select the static MSVC runtime ("MultiThreaded", or
# "MultiThreadedDebug" for Debug configurations).
if(WIN32)
  set_target_properties(
    ${PROJECT_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY
                               "MultiThreaded$<$<CONFIG:Debug>:Debug>")
endif()


================================================
FILE: lenet/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Filters the candidate directories given as trailing arguments, keeping each
# one that exists and contains every entry of the `required_files` list. The
# surviving directories (in their original order) are stored as a list in
# <var_name> in the caller's scope. Configuration stops with a FATAL_ERROR
# when no candidate qualifies.
function(_guess_path var_name required_files)
  set(_accepted "")

  foreach(_candidate IN LISTS ARGN)
    if(EXISTS "${_candidate}")
      # Stop checking a candidate at the first required file it lacks.
      set(_complete TRUE)
      foreach(_needed IN LISTS required_files)
        if(NOT EXISTS "${_candidate}/${_needed}")
          set(_complete FALSE)
          message(DEBUG "'${_candidate}' missing '${_needed}'")
          break()
        endif()
      endforeach()

      if(_complete)
        list(APPEND _accepted "${_candidate}")
        message(DEBUG "accept '${_candidate}'")
      else()
        message(DEBUG "reject '${_candidate}'")
      endif()
    else()
      message(DEBUG "skip non-existing path '${_candidate}'")
    endif()
  endforeach()

  list(LENGTH _accepted _accepted_count)
  if(_accepted_count EQUAL 0)
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_accepted}"
      PARENT_SCOPE)
endfunction()

# Defines the imported interface target `TensorRT` (alias TensorRT::TensorRT)
# that carries the TensorRT include directory and libraries.
#
# The TensorRT version may be supplied through the TRT_VERSION cache variable
# or the TRT_VERSION environment variable (the environment wins when both are
# set). Every expansion of these values is quoted so that an unset/empty
# version degrades to "major version unknown" instead of producing malformed
# if()/string() invocations.

# add library
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

if(NOT "${TRT_VERSION}" STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# The first run of digits is the major version. The input must be quoted:
# string(REGEX MATCH) rejects the call outright when the input argument
# vanishes because TRT_VERSION is empty.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(
      FATAL_ERROR
        "TensorRT_DIR=${TensorRT_DIR} does not exist!"
    )
  endif()

  # Compare through the variable name so an empty major version evaluates to
  # false instead of leaving if() with a missing operand.
  if(TRT_MAJOR_VERSION GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include"
                                "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  set(_modules nvinfer nvinfer_plugin)
  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Resolve each module to a library file, hinting at the detected library dir.
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

# Drop the helper variables so they do not leak into the including project.
unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)


================================================
FILE: lenet/README.md
================================================
# lenet5

lenet5 is one of the simplest nets in this repo. You can use it to learn the basic procedure for building a CNN with the TensorRT API. This demo includes 2 major steps:

1. Build engine
   - define network
   - set input/output
   - serialize model to `.engine` file
2. Do inference
   - load and deserialize model from `.engine` file
   - run inference

## Usage

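1. download the PyTorch model from `https://github.com/SunnyHaze/LeNet5-MNIST-Pytorch/blob/main/model.pt` and save it as `../models/model.pt` relative to the directory you run `gen_wts.py` from (that is the path the script loads)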

2. run `gen_wts.py` to generate `.wts` file

```bash
python3 gen_wts.py
```

output looks like:

```bash
lenet out shape: torch.Size([1, 10])
lenet out: [tensor([0.0725, 0.0730, 0.1056, 0.1201, 0.1059, 0.0741, 0.1328, 0.0953, 0.1230,
        0.0975])]
inference result: 6
```

3. build C++ code

```bash
cd tensorrtx/lenet
cmake -S . -B build
cmake --build build
```

4. serialize wts model to engine file

```bash
./build/lenet -s
```

5. run inference

```bash
./build/lenet -d
```

output looks like:

```bash
...
Execution time: 32us
0.09727, 0.09732, 0.1005, 0.102, 0.1006, 0.09743, 0.1033, 0.09951, 0.1023, 0.09973,
====
Execution time: 33us
0.09727, 0.09732, 0.1005, 0.102, 0.1006, 0.09743, 0.1033, 0.09951, 0.1023, 0.09973,
====
prediction result:
Top: 0 idx: 6, logits: 0.1033, label: 6
Top: 1 idx: 8, logits: 0.1023, label: 8
Top: 2 idx: 3, logits: 0.102, label: 3
```

## Tripy (New TensorRT Python Programming Model)

1. Generate `lenet5.wts`

2. Copy `lenet5.wts` into [tensorrtx/lenet](./)

3. Install Tripy:

   ```bash
   python3 -m pip install nvtripy -f https://nvidia.github.io/TensorRT-Incubator/packages.html
   ```

4. Change directories:

   ```bash
   cd tensorrtx/lenet
   ```

5. Compile and save the model:

   ```bash
   python3 lenet_tripy.py -s
   ```

6. Load and run the model:

   ```bash
   python3 lenet_tripy.py -d
   ```


================================================
FILE: lenet/gen_wts.py
================================================
import struct
from collections import OrderedDict

import cv2
import numpy as np
import torch
import torch.nn as nn


class LeNet(nn.Module):
    """LeNet-style CNN: two conv/ReLU/max-pool stages followed by three linear layers.

    The first linear layer expects 400 features per sample (16 channels of
    5x5, which is what a 1x32x32 input produces after the two conv stages);
    the network outputs 10 raw scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor, stage 1: 1 -> 6 channels, 5x5 kernel, then 2x2 max-pool.
        self.conv1 = nn.Conv2d(1, 6, 5, stride=1, padding=0)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2, stride=2)

        # Feature extractor, stage 2: 6 -> 16 channels, 5x5 kernel, then 2x2 max-pool.
        self.conv2 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(2, stride=2)

        # Classifier head: 400 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(400, 120)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(120, 84)
        self.relu4 = nn.ReLU()
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on ``x`` and return the unnormalised class scores."""
        feature_stages = (
            self.conv1, self.relu1, self.pool1,
            self.conv2, self.relu2, self.pool2,
        )
        for stage in feature_stages:
            x = stage(x)

        # Collapse everything except the batch dimension before the linear layers.
        x = x.view(x.shape[0], -1)

        head_stages = (self.fc1, self.relu3, self.fc2, self.relu4, self.fc3)
        for stage in head_stages:
            x = stage(x)
        return x


def reformat_state_dict(state: OrderedDict) -> OrderedDict:
    """Rename checkpoint keys (``layerX.Y.*``) to this file's ``LeNet`` attribute names.

    ``state`` is modified in place and also returned. Every source key listed
    below must be present (a missing one raises ``KeyError``). When the target
    key already exists its value is kept and the popped value is discarded.
    """
    renames: dict[str, str] = {
        "layer1.0.weight": "conv1.weight",
        "layer1.0.bias": "conv1.bias",
        "layer1.3.weight": "conv2.weight",
        "layer1.3.bias": "conv2.bias",
        "layer2.0.weight": "fc1.weight",
        "layer2.0.bias": "fc1.bias",
        "layer2.2.weight": "fc2.weight",
        "layer2.2.bias": "fc2.bias",
        "layer2.4.weight": "fc3.weight",
        "layer2.4.bias": "fc3.bias",
    }
    for old_key, new_key in renames.items():
        value = state.pop(old_key)
        if new_key not in state:
            state[new_key] = value
    return state


def main():
    """Load the pretrained checkpoint, run one sanity inference and export ``lenet.wts``.

    Paths are relative to the current working directory:
    ``../assets/6.pgm`` (test image), ``../models/model.pt`` (checkpoint) and
    ``../models/lenet.wts`` (output).

    Output format: first line is the number of tensors, then one line per
    tensor of the form ``<name> <count> <hex> <hex> ...`` where each value is
    a big-endian float32 written as 8 hex digits and all fields are separated
    by exactly one space.

    Raises:
        FileNotFoundError: if the test image cannot be read.
    """
    model = LeNet()
    model.eval()
    with torch.inference_mode():
        img = cv2.imread("../assets/6.pgm", cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError("could not read test image '../assets/6.pgm'")
        img = cv2.resize(img, (32, 32), interpolation=cv2.INTER_LINEAR)
        # Scale to [0, 1], then normalise with mean 0.1307 and std 0.3081.
        img = (((img / 255.0) - 0.1307) / 0.3081).astype(np.float32)
        # NOTE(security): weights_only=False unpickles arbitrary Python objects;
        # only run this on a checkpoint obtained from a trusted source.
        # map_location="cpu" lets a checkpoint saved on a GPU load on a
        # machine without CUDA (the model above lives on the CPU anyway).
        state = torch.load("../models/model.pt", map_location="cpu", weights_only=False)
        state = reformat_state_dict(state["state_dict"])
        model.load_state_dict(state)
        input = torch.from_numpy(img)[None, None, ...]
        out = model(input)
        print(f"lenet output shape: {out.shape}")
        print(f"lenet output: {out}")
        print(f"inference result for MNIST data: {int(torch.argmax(out, 1))}")

    # save to wts
    print("Writing into lenet.wts")
    state_dict = model.state_dict()
    with open("../models/lenet.wts", "w") as f:
        f.write("{}\n".format(len(state_dict.keys())))
        for k, v in state_dict.items():
            vr = v.reshape(-1).cpu().numpy()
            # No trailing space here: every value below brings its own leading
            # space, so fields end up separated by exactly one space and
            # parsers that split on a single " " see count + 2 fields.
            f.write("{} {}".format(k, len(vr)))
            for vv in vr:
                f.write(" ")
                f.write(struct.pack(">f", float(vv)).hex())
            f.write("\n")


if __name__ == "__main__":
    main()


================================================
FILE: lenet/lenet.cpp
================================================
#include <NvInfer.h>
#include <cassert>
#include <chrono>
#include <cmath>
#include <exception>
#include <filesystem>
#include <map>
#include <opencv2/opencv.hpp>
#include <vector>
#include "logging.h"
#include "utils.h"

// Short aliases for the TensorRT operation enums used while building the network.
using M = nvinfer1::MatrixOperation;
using E = nvinfer1::ElementWiseOperation;

// parameters we know about the lenet-5
// Input image height/width in pixels.
constexpr static const int64_t INPUT_H = 32;
constexpr static const int64_t INPUT_W = 32;
// Tensor names: index 0 is the network input, index 1 the network output.
constexpr static const std::array<const char*, 2> NAMES = {"data", "prob"};
// Element count per sample for each tensor in NAMES (same indexing).
constexpr static const std::array<const int64_t, 2> SIZES = {1ll * INPUT_H * INPUT_W, 10};
// Weight file read when building the engine, and the serialized engine file;
// both are relative to the current working directory.
constexpr static const char* WTS_PATH = "../models/lenet.wts";
constexpr static const char* ENGINE_PATH = "../models/lenet.engine";

// Logger instance handed to the TensorRT builder and runtime.
static Logger gLogger;

/**
 * @brief Create the engine using only the API and not any parser.
 *
 * Loads the weights from WTS_PATH, defines the LeNet-5 layers (2x conv/ReLU/max-pool,
 * 3x fully connected expressed as constant * matmul + bias, softmax), builds the engine
 * and frees the host-side weight buffers before returning.
 *
 * NOTE(review): the fully connected section produces [out, N] tensors and reshapes them
 * with a -1 dimension instead of transposing, which looks correct only for N == 1 —
 * confirm before building with a larger batch.
 *
 * @param N max batch size
 * @param runtime runtime (used to deserialize the built plan on TensorRT >= 8)
 * @param builder builder
 * @param config config
 * @param dt data type of the network input
 * @return ICudaEngine* the built engine, or nullptr when building/deserializing failed
 */
ICudaEngine* createLenetEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) {
#if TRT_VERSION >= 11200
    auto flag = 1U << static_cast<int>(NetworkDefinitionCreationFlag::kSTRONGLY_TYPED);
#elif TRT_VERSION >= 10000
    auto flag = 0U;
#else
    auto flag = 1U << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
#endif
    auto* network = builder->createNetworkV2(flag);

    // Create input tensor of shape { N, 1, 32, 32 } with name NAMES[0]
    ITensor* data = network->addInput(NAMES[0], dt, Dims4{N, 1, INPUT_H, INPUT_W});
    assert(data);

    // Add convolution layer with 6 outputs and a 5x5 filter.
    std::filesystem::path wts_path{WTS_PATH};
    wts_path = std::filesystem::absolute(wts_path);
    std::map<std::string, Weights> weightMap = loadWeights(wts_path.string());
    auto* conv1 = network->addConvolutionNd(*data, 6, DimsHW{5, 5}, weightMap["conv1.weight"], weightMap["conv1.bias"]);
    assert(conv1);
    conv1->setStrideNd(DimsHW{1, 1});
    conv1->setName("conv1");

    // Add activation layer using the ReLU algorithm.
    IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    assert(relu1);
    relu1->setName("relu1");

    // Add max pooling layer with stride of 2x2 and kernel size of 2x2.
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setName("pool1");

    // Add second convolution layer with 16 outputs and a 5x5 filter.
    auto* conv2 = network->addConvolutionNd(*pool1->getOutput(0), 16, DimsHW{5, 5}, weightMap["conv2.weight"],
                                            weightMap["conv2.bias"]);
    assert(conv2);
    conv2->setStrideNd(DimsHW{1, 1});
    conv2->setName("conv2");

    // Add activation layer using the ReLU algorithm.
    IActivationLayer* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU);
    assert(relu2);

    // Add second max pooling layer with stride of 2x2 and kernel size of 2x2.
    IPoolingLayer* pool2 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool2);
    pool2->setStrideNd(DimsHW{2, 2});
    pool2->setName("pool2");

    // Add fully connected layer: W[120,400] * flatten^T, then add the bias column.
    auto* flatten = network->addShuffle(*pool2->getOutput(0));
    flatten->setReshapeDimensions(Dims2{-1, 400});
    auto* tensor_fc1w = network->addConstant(Dims2{120, 400}, weightMap["fc1.weight"])->getOutput(0);
    auto* fc1w = network->addMatrixMultiply(*tensor_fc1w, M::kNONE, *flatten->getOutput(0), M::kTRANSPOSE);
    assert(tensor_fc1w && fc1w);
    auto tensor_fc1b = network->addConstant(Dims2{120, 1}, weightMap["fc1.bias"])->getOutput(0);
    auto* fc1b = network->addElementWise(*fc1w->getOutput(0), *tensor_fc1b, E::kSUM);
    // Validate before the first dereference of fc1b.
    assert(tensor_fc1b && fc1b);
    fc1b->setName("fc1b");

    // Add activation layer using the ReLU algorithm.
    IActivationLayer* relu3 = network->addActivation(*fc1b->getOutput(0), ActivationType::kRELU);
    assert(relu3);
    auto* flatten_relu3 = network->addShuffle(*relu3->getOutput(0));
    flatten_relu3->setReshapeDimensions(Dims2{-1, 120});

    auto* fc2w = network->addConstant(Dims2{84, 120}, weightMap["fc2.weight"])->getOutput(0);
    auto* fc2b = network->addConstant(Dims2{84, 1}, weightMap["fc2.bias"])->getOutput(0);
    auto* fc3w = network->addConstant(Dims2{10, 84}, weightMap["fc3.weight"])->getOutput(0);
    auto* fc3b = network->addConstant(Dims2{10, 1}, weightMap["fc3.bias"])->getOutput(0);
    assert(fc2w && fc2b && fc3w && fc3b);

    // fully connected layer with relu
    auto* fc2_0 = network->addMatrixMultiply(*fc2w, M::kNONE, *flatten_relu3->getOutput(0), M::kTRANSPOSE);
    assert(fc2_0);
    fc2_0->setName("fc2");
    auto* fc2_1 = network->addElementWise(*fc2_0->getOutput(0), *fc2b, E::kSUM);
    assert(fc2_1);
    IActivationLayer* relu4 = network->addActivation(*fc2_1->getOutput(0), ActivationType::kRELU);
    assert(relu4);
    auto* shuffle = network->addShuffle(*relu4->getOutput(0));
    shuffle->setReshapeDimensions(Dims2{-1, 84});
    auto* fc3_0 = network->addMatrixMultiply(*fc3w, M::kNONE, *shuffle->getOutput(0), M::kTRANSPOSE);
    assert(fc3_0);
    auto* fc3_1 = network->addElementWise(*fc3_0->getOutput(0), *fc3b, E::kSUM);
    assert(fc3_1);

    // Add softmax layer to determine the probability.
    ISoftMaxLayer* prob = network->addSoftMax(*fc3_1->getOutput(0));
    assert(prob);
    prob->getOutput(0)->setName(NAMES[1]);
    network->markOutput(*prob->getOutput(0));

#if TRT_VERSION >= 8400
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE);
#else
    config->setMaxWorkspaceSize(WORKSPACE_SIZE);
    builder->setMaxBatchSize(N);
#endif

    // Build engine
#if TRT_VERSION >= 8000
    ICudaEngine* engine = nullptr;
    IHostMemory* serialized_mem = builder->buildSerializedNetwork(*network, *config);
    if (serialized_mem != nullptr) {
        engine = runtime->deserializeCudaEngine(serialized_mem->data(), serialized_mem->size());
        // The plan buffer is only needed for deserialization; release it so it does not leak.
        delete serialized_mem;
    }
    delete network;
#else
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    network->destroy();
#endif

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return engine;
}

/**
 * @brief Build the LeNet engine through the API and hand back its serialized form.
 *
 * A builder and builder config are created for the duration of the call, the
 * engine is built via createLenetEngine() with a kFLOAT input, serialized into
 * *modelStream, and the engine, config and builder are released again.
 *
 * @param N max batch size
 * @param runtime runtime forwarded to createLenetEngine()
 * @param modelStream receives the serialized engine
 */
void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) {
    // Builder objects live only for the duration of this call.
    auto* trtBuilder = createInferBuilder(gLogger);
    auto* builderConfig = trtBuilder->createBuilderConfig();

    // Define the network and build the engine from it.
    auto* builtEngine = createLenetEngine(N, runtime, trtBuilder, builderConfig, DataType::kFLOAT);
    assert(builtEngine != nullptr);

    // Hand the serialized plan to the caller.
    *modelStream = builtEngine->serialize();

    // Release in the same order as before: engine, config, builder.
#if TRT_VERSION >= 8000
    delete builtEngine;
    delete builderConfig;
    delete trtBuilder;
#else
    builtEngine->destroy();
    builderConfig->destroy();
    trtBuilder->destroy();
#endif
}

/**
 * @brief Run one synchronous inference and return the output tensors as host vectors.
 *
 * For every I/O tensor a device buffer of `element size * batchSize * SIZES[i]` bytes is
 * allocated; tensor 0 is filled from `input`. After enqueueing, every tensor with index
 * >= 1 is copied back as floats. All device buffers and the stream are released before
 * returning.
 *
 * @param context execution context of the deserialized engine
 * @param input host pointer to the input data for tensor 0
 * @param batchSize number of samples in `input`
 * @return one vector<float> per output tensor, each holding batchSize * SIZES[i] values
 */
std::vector<std::vector<float>> doInference(IExecutionContext& context, void* input, int64_t batchSize) {
    const auto& engine = context.getEngine();
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    std::vector<void*> buffers;

    // The name-based tensor API (getNbIOTensors / getIOTensorName / setTensorAddress /
    // enqueueV3) was introduced with TensorRT 8.5; older 8.x releases only provide the
    // binding-index API, so they must take the legacy branch.
#if TRT_VERSION >= 8500
    const int32_t nIO = engine.getNbIOTensors();
#else
    const int32_t nIO = engine.getNbBindings();
#endif
    // SIZES/NAMES are indexed by tensor position below; guard against a mismatching engine.
    assert(nIO >= 0 && static_cast<std::size_t>(nIO) == SIZES.size());

    buffers.resize(nIO);
    for (int32_t i = 0; i < nIO; ++i) {
        std::size_t size = 0;
#if TRT_VERSION >= 8500
        auto* tensor_name = engine.getIOTensorName(i);
        auto s = getSize(engine.getTensorDataType(tensor_name));
        size = s * batchSize * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
        context.setTensorAddress(tensor_name, buffers[i]);
#else
        const int32_t idx = engine.getBindingIndex(NAMES[i]);
        auto s = getSize(engine.getBindingDataType(idx));
        assert(idx == i);
        size = s * batchSize * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
#endif
    }

    // Keep the enqueue call outside of assert(): the assert expression is removed when
    // NDEBUG is defined, which would silently skip inference in release builds.
#if TRT_VERSION >= 8500
    const bool enqueued = context.enqueueV3(stream);
#else
    const bool enqueued = context.enqueueV2(buffers.data(), stream, nullptr);
#endif
    assert(enqueued && "TensorRT enqueue failed");
    (void)enqueued;

    // Copy straight into the vectors that are returned, so the destination of each
    // asynchronous copy stays alive (and is not duplicated) until the stream is synchronized.
    // Reserving up front keeps the outer vector from reallocating in between.
    std::vector<std::vector<float>> prob;
    prob.reserve(nIO > 1 ? static_cast<std::size_t>(nIO - 1) : 0);
    for (int32_t i = 1; i < nIO; ++i) {
        const std::size_t count = static_cast<std::size_t>(batchSize * SIZES[i]);
        prob.emplace_back(count, std::nanf(""));
        std::vector<float>& host = prob.back();
        CHECK(cudaMemcpyAsync(host.data(), buffers[i], count * sizeof(float), cudaMemcpyDeviceToHost, stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    for (auto& buffer : buffers) {
        CHECK(cudaFree(buffer));
    }
    CHECK(cudaStreamDestroy(stream));
    return prob;
}

/**
 * Entry point.
 *
 *   ./lenet -s   build the engine from WTS_PATH and write it to ENGINE_PATH
 *   ./lenet -d   load ENGINE_PATH, run 100 timed inferences on ../assets/6.pgm and print
 *                the outputs plus the top-3 classes of the last run
 *
 * Returns 0 on success and -1 on bad arguments, I/O failure or an exception.
 */
int main(int argc, char** argv) {
    try {
        if (argc != 2) {
            std::cerr << "arguments not right!\n";
            std::cerr << "./lenet -s   // serialize model to plan file\n";
            std::cerr << "./lenet -d   // deserialize plan file and run inference\n";
            return -1;
        }

        IRuntime* runtime = createInferRuntime(gLogger);
        assert(runtime != nullptr);

        // Serialized engine bytes; a vector releases the storage automatically.
        std::vector<char> trtModelStream;

        if (std::string(argv[1]) == "-s") {
            IHostMemory* modelStream{nullptr};
            APIToModel(1, runtime, &modelStream);
            assert(modelStream != nullptr);

            std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc);
            if (!p) {
                std::cerr << "could not open plan output file\n";
                return -1;
            }
            if (modelStream->size() > static_cast<std::size_t>(std::numeric_limits<std::streamsize>::max())) {
                std::cerr << "this model is too large to serialize\n";
                return -1;
            }
            const auto* data_ptr = reinterpret_cast<const char*>(modelStream->data());
            auto data_size = static_cast<std::streamsize>(modelStream->size());
            p.write(data_ptr, data_size);

#if TRT_VERSION >= 8000
            delete modelStream;
#else
            modelStream->destroy();
#endif
            // Report the path that was actually written.
            std::cout << "serialized weights to " << ENGINE_PATH << "\n";
            return 0;
        } else if (std::string(argv[1]) == "-d") {
            std::ifstream file(ENGINE_PATH, std::ios::binary);
            if (!file.good()) {
                // Without this check a missing file would lead to deserializing a null buffer.
                std::cerr << "could not open engine file " << ENGINE_PATH << ", run with -s first\n";
                return -1;
            }
            file.seekg(0, file.end);
            const auto size = static_cast<std::streamsize>(file.tellg());
            file.seekg(0, file.beg);
            if (size <= 0) {
                std::cerr << "engine file " << ENGINE_PATH << " is empty or unreadable\n";
                return -1;
            }
            trtModelStream.resize(static_cast<std::size_t>(size));
            file.read(trtModelStream.data(), size);
            if (!file) {
                std::cerr << "could not read engine file " << ENGINE_PATH << "\n";
                return -1;
            }
            file.close();
        } else {
            std::cerr << "unknown argument: " << argv[1] << "\n";
            std::cerr << "./lenet -s   // serialize model to plan file\n";
            std::cerr << "./lenet -d   // deserialize plan file and run inference\n";
            return -1;
        }

        // prepare input/output data: grayscale 32x32, scaled by 1/255, then normalised with
        // mean 0.1307 and std 0.3081
        auto img = cv::imread("../assets/6.pgm", cv::IMREAD_GRAYSCALE);
        cv::resize(img, img, cv::Size(32, 32), 0, 0, cv::INTER_LINEAR);
        assert(img.channels() == 1);
        img.convertTo(img, CV_32FC1, 0.00392156f, -0.1307f);
        img = img / cv::Scalar(0.3081);
        assert(img.total() * img.elemSize() == SIZES[0] * sizeof(float));

#if TRT_VERSION >= 8000
        ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream.data(), trtModelStream.size());
#else
        ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream.data(), trtModelStream.size(), nullptr);
#endif
        assert(engine != nullptr);
        IExecutionContext* context = engine->createExecutionContext();
        assert(context != nullptr);

        // Run inference
        for (int32_t i = 0; i < 100; ++i) {
            auto _start = std::chrono::system_clock::now();
            auto prob = doInference(*context, img.data, 1);
            auto _end = std::chrono::system_clock::now();
            auto _time = std::chrono::duration_cast<std::chrono::microseconds>(_end - _start).count();
            std::cout << "Execution time: " << _time << "us\n";

            // Print at most the first 10 values of every output tensor.
            for (const auto& vector : prob) {
                int idx = 0;
                for (auto v : vector) {
                    std::cout << std::setprecision(4) << v << ", " << std::flush;
                    if (++idx > 9) {
                        std::cout << "\n====\n";
                        break;
                    }
                }
            }

            // Only the final iteration reports the top-3 classes.
            if (i == 99) {
                std::cout << "prediction result:\n";
                int _top = 0;
                for (auto& [idx, logits] : topk(prob[0], 3)) {
                    std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits << ", label: " << idx
                              << "\n";
                }
            }
        }

#if TRT_VERSION >= 8000
        delete context;
        delete engine;
        delete runtime;
#else
        context->destroy();
        engine->destroy();
        runtime->destroy();
#endif

        return 0;
    } catch (const std::exception& err) {
        std::cerr << "fatal error: " << err.what() << '\n';
        return -1;
    } catch (...) {
        std::cerr << "fatal error: unknown exception\n";
        return -1;
    }
}


================================================
FILE: lenet/lenet.py
================================================
import argparse
import os
import struct
import sys

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Network input height/width in pixels.
INPUT_H = 32
INPUT_W = 32
# Number of output values (presumably one per class; confirm where it is used).
OUTPUT_SIZE = 10
# Names for the network's input and output tensors.
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"

# Weight file and engine file locations, relative to the current working directory.
weight_path = "./lenet5.wts"
engine_path = "./lenet5.engine"

# TensorRT logger created at INFO severity.
gLogger = trt.Logger(trt.Logger.INFO)


def load_weights(file):
    """Parse a ``.wts`` text file into a ``{name: float32 ndarray}`` dict.

    File format: the first line holds the number of tensors; each following
    line is ``<name> <count> <hex> ...`` where every value is a big-endian
    float32 written as hex digits. Fields may be separated by any run of
    whitespace, and blank lines are ignored.

    Args:
        file: path of the weight file.

    Returns:
        dict mapping tensor name to a ``np.float32`` array. Each parsed value
        is kept as a 1-tuple, so a tensor with ``count`` values has shape
        ``(count, 1)``.

    Raises:
        AssertionError: if the file is missing, the tensor count does not
            match the number of lines, or a line's value count is wrong.
    """
    print(f"Loading weights: {file}")

    assert os.path.exists(file), 'Unable to load weight file.'

    weight_map = {}
    with open(file, "r") as f:
        # Drop blank lines so a stray empty line does not break the count check.
        lines = [line.strip() for line in f if line.strip()]
    count = int(lines[0])
    assert count == len(lines) - 1
    for i in range(1, count + 1):
        # split() without an argument tolerates repeated spaces between fields,
        # which split(" ") would turn into empty tokens.
        splits = lines[i].split()
        name = splits[0]
        cur_count = int(splits[1])
        assert cur_count + 2 == len(splits)
        # hex string to bytes to float (struct.unpack returns a 1-tuple per value)
        values = [struct.unpack(">f", bytes.fromhex(token)) for token in splits[2:]]
        weight_map[name] = np.array(values, dtype=np.float32)

    return weight_map


def createLenetEngine(maxBatchSize, builder, config, dt):
    """Assemble the LeNet-5 network layer by layer and build an engine from it.

    Layer order: two stages of (5x5 convolution -> ReLU -> 2x2 average pool
    with stride 2), then fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> softmax. The
    softmax output tensor is named OUTPUT_BLOB_NAME and marked as the network
    output.

    Args:
        maxBatchSize: value assigned to ``builder.max_batch_size``.
        builder: object providing ``create_network`` and ``build_engine``.
        config: builder config; its ``max_workspace_size`` is set to 1 << 20.
        dt: data type passed to ``add_input`` for the input tensor.

    Returns:
        Whatever ``builder.build_engine`` returns.

    NOTE(review): ``max_batch_size``, ``max_workspace_size``,
    ``add_fully_connected`` and ``build_engine`` look like legacy TensorRT
    APIs -- confirm which TensorRT versions this script targets.
    """
    weights = load_weights(weight_path)
    net = builder.create_network()

    net_input = net.add_input(INPUT_BLOB_NAME, dt, (1, INPUT_H, INPUT_W))
    assert net_input

    # (output maps, kernel shape, weight key, bias key) per convolution stage.
    conv_stages = (
        (6, (5, 5), "conv1.weight", "conv1.bias"),
        (16, trt.DimsHW(5, 5), "conv2.weight", "conv2.bias"),
    )
    current = net_input
    for out_maps, kernel_hw, w_key, b_key in conv_stages:
        conv = net.add_convolution(input=current,
                                   num_output_maps=out_maps,
                                   kernel_shape=kernel_hw,
                                   kernel=weights[w_key],
                                   bias=weights[b_key])
        assert conv
        conv.stride = (1, 1)

        act = net.add_activation(conv.get_output(0),
                                 type=trt.ActivationType.RELU)
        assert act

        pool = net.add_pooling(input=act.get_output(0),
                               window_size=trt.DimsHW(2, 2),
                               type=trt.PoolingType.AVERAGE)
        assert pool
        pool.stride = (2, 2)
        current = pool.get_output(0)

    # (number of outputs, weight key, bias key, followed by ReLU?) per dense stage.
    dense_stages = (
        (120, 'fc1.weight', 'fc1.bias', True),
        (84, 'fc2.weight', 'fc2.bias', True),
        (OUTPUT_SIZE, 'fc3.weight', 'fc3.bias', False),
    )
    for width, w_key, b_key, has_relu in dense_stages:
        dense = net.add_fully_connected(input=current,
                                        num_outputs=width,
                                        kernel=weights[w_key],
                                        bias=weights[b_key])
        assert dense
        current = dense.get_output(0)
        if has_relu:
            act = net.add_activation(current, type=trt.ActivationType.RELU)
            assert act
            current = act.get_output(0)

    softmax = net.add_softmax(current)
    assert softmax

    softmax.get_output(0).name = OUTPUT_BLOB_NAME
    net.mark_output(softmax.get_output(0))

    # Build engine
    builder.max_batch_size = maxBatchSize
    config.max_workspace_size = 1 << 20
    built_engine = builder.build_engine(net, config)

    del net
    del weights

    return built_engine


def APIToModel(maxBatchSize):
    """Build the LeNet engine and write its serialized form to ``engine_path``.

    Args:
        maxBatchSize: forwarded unchanged to ``createLenetEngine``.
    """
    trt_builder = trt.Builder(gLogger)
    build_config = trt_builder.create_builder_config()

    built = createLenetEngine(maxBatchSize, trt_builder, build_config, trt.float32)
    assert built

    with open(engine_path, "wb") as plan_file:
        plan_file.write(built.serialize())

    del built
    del trt_builder


def doInference(context, host_in, host_out, batchSize):
    """Run one asynchronous inference and copy the result into ``host_out``.

    Device buffers sized from ``host_in.nbytes`` / ``host_out.nbytes`` are
    allocated, the input is copied to the device, the context is executed and
    the output is copied back, all on one freshly created stream that is
    synchronized before returning.

    Args:
        context: execution context; its engine must expose exactly 2 bindings.
        host_in: host buffer holding the input data.
        host_out: host buffer that receives the output (modified in place).
        batchSize: accepted for interface compatibility; not used in the body.
    """
    trt_engine = context.engine
    assert trt_engine.num_bindings == 2

    dev_in, dev_out = (cuda.mem_alloc(buf.nbytes) for buf in (host_in, host_out))
    binding_ptrs = [int(dev_in), int(dev_out)]
    work_stream = cuda.Stream()

    # host -> device, execute, device -> host; all queued on the same stream.
    cuda.memcpy_htod_async(dev_in, host_in, work_stream)
    context.execute_async(bindings=binding_ptrs, stream_handle=work_stream.handle)
    cuda.memcpy_dtoh_async(host_out, dev_out, work_stream)
    work_stream.synchronize()


if __name__ == '__main__':
    # Command line: exactly one of -s (serialize) / -d (deserialize + infer).
    cli = argparse.ArgumentParser()
    for flag in ("-s", "-d"):
        cli.add_argument(flag, action='store_true')
    args = cli.parse_args()

    # Both flags set, or neither: print usage and stop.
    if args.s == args.d:
        for usage_line in (
            "arguments not right!",
            "python lenet.py -s   # serialize model to plan file",
            "python lenet.py -d   # deserialize plan file and run inference",
        ):
            print(usage_line)
        sys.exit()

    if args.s:
        APIToModel(1)
    else:
        runtime = trt.Runtime(gLogger)
        assert runtime

        with open(engine_path, "rb") as plan_file:
            engine = runtime.deserialize_cuda_engine(plan_file.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        # All-ones input, staged in page-locked host memory.
        pixel_count = INPUT_H * INPUT_W
        data = np.ones((pixel_count), dtype=np.float32)
        host_in = cuda.pagelocked_empty(pixel_count, dtype=np.float32)
        np.copyto(host_in, data.ravel())
        host_out = cuda.pagelocked_empty(OUTPUT_SIZE, dtype=np.float32)

        doInference(context, host_in, host_out, 1)

        print(f'Output: {host_out}')


================================================
FILE: lenet/lenet_tripy.py
================================================
import argparse
import os
import struct

import nvtripy as tp

# Shape used both for tp.InputInfo at compile time and for the all-ones test input.
# NOTE(review): presumably (batch, channels, height, width) -- confirm against tp.Conv.
INPUT_SHAPE = (1, 1, 32, 32)
# Text weight file read by load_weights() in the "-s" branch.
WEIGHT_PATH = "lenet5.wts"
# Compiled model file: saved in the "-s" branch, loaded in the "-d" branch.
COMPILED_MODEL_PATH = "lenet5.tpymodel"


def load_weights(file):
    """Read a text ``.wts`` weight file into a ``{name: tp.Tensor}`` dict.

    The first line is the number of records. Every following line is
    ``<name> <count> <hex> ...``; each hex token is decoded as one big-endian
    float32. The ``<count>`` field is not checked. Tensors are flat (1-D).

    Raises:
        FileNotFoundError: if ``file`` does not exist.
    """
    if not os.path.exists(file):
        raise FileNotFoundError(f"Weight file: {file} does not exist.")

    with open(file, "r") as handle:
        rows = [raw.strip() for raw in handle]

    expected = int(rows[0])
    assert expected == len(rows) - 1, "Mismatch in weight count."

    tensors = {}
    for row in rows[1:]:
        fields = row.split(" ")
        floats = []
        # fields[0] is the name, fields[1] the count, the rest are hex floats.
        for word in fields[2:]:
            (value,) = struct.unpack(">f", bytes.fromhex(word))
            floats.append(value)
        tensors[fields[0]] = tp.Tensor(floats)
    return tensors


class Lenet5(tp.Module):
    """LeNet-5 as a tripy module.

    Two 5x5 convolutions (1->6 and 6->16 channels), each followed by ReLU and
    a 2x2 average pool with stride 2, then three linear layers
    (400->120->84->10) with ReLU between them and a softmax at the end.
    """

    def __init__(self):
        super().__init__()

        kernel = (5, 5)
        self.conv1 = tp.Conv(1, 6, kernel_dims=kernel)
        self.conv2 = tp.Conv(6, 16, kernel_dims=kernel)

        flattened = 16 * 5 * 5
        self.fc1 = tp.Linear(flattened, 120)
        self.fc2 = tp.Linear(120, 84)
        self.fc3 = tp.Linear(84, 10)

    def forward(self, x):
        """Apply the network to ``x`` and return the softmax over dim 1."""
        window = (2, 2)
        for conv in (self.conv1, self.conv2):
            activated = tp.relu(conv(x))
            x = tp.avgpool(activated, kernel_dims=window, stride=window)

        # Collapse everything from dim 1 onwards before the linear layers.
        x = tp.flatten(x, 1)

        for dense in (self.fc1, self.fc2):
            x = tp.relu(dense(x))

        return tp.softmax(self.fc3(x), dim=1)


def main():
    """Command-line entry point.

    ``-s``: build Lenet5, load and reshape the weights from WEIGHT_PATH,
    compile the model for INPUT_SHAPE and save it to COMPILED_MODEL_PATH.
    ``-d``: load the compiled model and print its output for an all-ones
    input. Exactly one of the two flags is required.
    """
    cli = argparse.ArgumentParser()
    mode = cli.add_mutually_exclusive_group(required=True)
    mode.add_argument("-s", action="store_true", help="Save the model")
    mode.add_argument("-d", action="store_true", help="Load a saved model")
    opts = cli.parse_args()

    if not opts.s:
        executable = tp.Executable.load(COMPILED_MODEL_PATH)
        ones = tp.ones(INPUT_SHAPE, dtype=tp.float32).eval()
        result = executable(ones)
        print(f"Output: {result}")
        return

    net = Lenet5()
    state = load_weights(WEIGHT_PATH)

    # The file stores every tensor flattened; give each one the shape the
    # corresponding model parameter expects before loading it.
    state.update(
        {key: tp.reshape(state[key], ref.shape) for key, ref in net.state_dict().items()}
    )
    net.load_state_dict(state)

    input_spec = tp.InputInfo(INPUT_SHAPE, dtype=tp.float32)
    executable = tp.compile(net, args=[input_spec])
    executable.save(COMPILED_MODEL_PATH)


# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: lenet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <cstdint>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include <utility>
#include "NvInferRuntime.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream when synchronized or destroyed.
//!
//! Every emitted chunk is written to the target stream as
//! "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" using local time. The timestamp is written to the same
//! stream as the message, and the stream's fill character is restored afterwards.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    target stream; held by reference, so it must outlive this buffer
    //! \param prefix    text inserted between the timestamp and the buffered message
    //! \param shouldLog when false, putOutput() does nothing
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    //! Takes over the target stream, prefix and flag of \p other. The std::stringbuf base is
    //! default-constructed, so text already buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if they differ there is unflushed text left, so emit it before the buffer goes away
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the target stream, clears the buffer and flushes
    //! the stream. Does nothing (and keeps the buffered text) when logging is disabled.
    void putOutput() {
        if (!mShouldLog) {
            return;
        }

        // prepend timestamp; it goes to the same stream as the message so that the two cannot end up
        // on different streams (e.g. timestamp on stdout, message on stderr)
        std::time_t timestamp = std::time(nullptr);
        const std::tm* tm_local = std::localtime(&timestamp);
        if (tm_local != nullptr) {
            // the fill character is sticky: remember it and put it back when done
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            mOutput.fill(previousFill);
        }

        // std::stringbuf::str() gets the string contents of the buffer
        // insert the buffer contents pre-appended by the appropriate prefix into the stream
        mOutput << mPrefix << str();
        // set the buffer to empty
        str("");
        // flush the stream
        mOutput.flush();
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // destination of timestamp, prefix and message
    std::string mPrefix;    // written before every emitted chunk
    bool mShouldLog;        // gate checked by putOutput()
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
//! The class has no behavior of its own: it only owns the buffer, so that a class deriving from it first
//! gets a fully constructed mBuffer before any later base class is initialized.
//!
class LogStreamConsumerBase {
   public:
    //! \brief Constructs mBuffer, forwarding the target stream, the prefix and the should-log flag to it.
    LogStreamConsumerBase(std::ostream& stream, std::string prefix, bool shouldLog)
        : mBuffer(stream, std::move(prefix), shouldLog) {}

   protected:
    // Protected (not private) so that derived classes can reach the buffer.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!  A message is logged when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Move constructor.
    //!  A fresh buffer is built from the severity and should-log flag of \p other; text that \p other has
    //!  buffered but not yet emitted is not carried over.
    LogStreamConsumer(LogStreamConsumer&& other) noexcept
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity, and updates the buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! \brief Chooses std::cout when severity >= Severity::kINFO, std::cerr otherwise.
    //!  NOTE(review): assumes the Severity enumerators are ordered from most to least severe - confirm against
    //!  the nvinfer1::ILogger::Severity declaration.
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Maps a severity to its short textual tag; asserts on an unknown value.
    static std::string severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    bool mShouldLog;     // cached result of mSeverity <= reportable severity
    Severity mSeverity;  // severity of every message written through this consumer
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   private:
    // Forward declaration; the definition is in the private section further down.
    // It is needed here because TestAtom's constructor takes a TestInfo parameter.
    struct TestInfo;

   public:
    //!
    //! \brief Creates a logger with the given reportable severity (Severity::kWARNING when omitted)
    //!
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult : std::uint8_t {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is written through a temporary LogStreamConsumer as "[TRT] <msg>" followed by a newline.
    //! NOTE(review): msg is passed straight to std::string, so it is assumed to be non-null - confirm.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n';
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        // Logger is the only class allowed to construct a TestAtom and to read its fields.
        friend class Logger;

        TestAtom(bool started, TestInfo info)
            : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {}

        bool mStarted;         // set to true by Logger::reportTestStart()
        std::string mName;     // test name printed in result lines
        std::string mCmdline;  // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom{false, TestInfo{name, cmdline}};
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // The RUNNING line is printed first; the precondition is only checked afterwards.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the reportable severity currently stored in this logger.
    [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief Name and command line of a test; consumed by the TestAtom constructor
    //!
    struct TestInfo {
        std::string name;
        std::string cmdline;
    };
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Output format: "&&&& <RESULT> <test name> # <command line>" followed by a newline.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << '\n';
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    // Threshold handed to every LogStreamConsumer created for this logger.
    Severity mReportableSeverity;
};

namespace {

//!
//! \brief Builds a LogStreamConsumer for messages of severity kVERBOSE, filtered by the logger's
//!        current reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Builds a LogStreamConsumer for messages of severity kINFO, filtered by the logger's
//!        current reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Builds a LogStreamConsumer for messages of severity kWARNING, filtered by the logger's
//!        current reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Builds a LogStreamConsumer for messages of severity kERROR, filtered by the logger's
//!        current reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Builds a LogStreamConsumer for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        filtered by the logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: lenet/macros.h
================================================
#pragma once
#include <NvInfer.h>

// API: symbol visibility decoration.
//  - API_EXPORTS defined:     export the symbol (dllexport on MSVC, default visibility elsewhere).
//  - API_EXPORTS not defined: dllimport on MSVC, nothing elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_VERSION packs the TensorRT version macros into one integer:
// major * 1000 + minor * 100 + patch * 10 + build (e.g. 7.2.2 build 0 -> 7220).
#define TRT_VERSION \
    ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD)

// Refuse to compile against anything older than TensorRT 7.2.2.
#if TRT_VERSION < 7220
#error "TensorRT >= 7.2.2 is required for this demo."
#endif

// From version 8000 on, TRT_NOEXCEPT expands to noexcept and TRT_CONST_ENQUEUE to const;
// for older versions both expand to nothing.
#if TRT_VERSION >= 8000
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: lenet/utils.h
================================================
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <map>
#include <numeric>
#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "macros.h"

using namespace nvinfer1;

// 16 << 20 = 16 MiB. NOTE(review): presumably the builder workspace limit in bytes; its use is not in this header.
enum : std::uint32_t { WORKSPACE_SIZE = 16 << 20 };

// CHECK(status): evaluates `status` exactly once; if the result is not cudaSuccess, prints
// "Cuda failure: <code>" to std::cerr and aborts the process. Wrapped in do { } while (0) so that it
// behaves like a single statement (safe inside unbraced if/else).
#define CHECK(status)                                     \
    do {                                                  \
        auto ret = (status);                              \
        if (ret != cudaSuccess) {                         \
            std::cerr << "Cuda failure: " << ret << "\n"; \
            std::abort();                                 \
        }                                                 \
    } while (0)

static void checkTrtEnv(int device = 0) {
#if TRT_VERSION < 8000
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    CHECK(cudaGetDeviceProperties(&prop, device));
    const int sm = prop.major * 10 + prop.minor;
    if (sm > 86) {
        std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU.";
        std::abort();
    }
#endif
}

/**
 * @brief TensorRT weight files have a simple space delimited format:
 * [type] [size] <data x size in hex>
 * 
 * @param file input weight file path
 * @return std::map<std::string, nvinfer1::Weights> 
 */
static auto loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << "\n";
    std::map<std::string, nvinfer1::Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> wt.count;

        // Load blob
        auto* val = new uint32_t[wt.count];
        input >> std::hex;
        for (auto x = 0ll; x < wt.count; ++x) {
            input >> val[x];
        }
        wt.values = val;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * @brief Return the largest entries of v as (index, value) pairs, ordered by descending value.
 *
 * @param v values to rank
 * @param k number of entries requested; when it exceeds v.size() every entry is returned
 * @return min(k, v.size()) pairs; empty when k <= 0 or v is empty
 */
static std::vector<std::pair<int, float>> topk(const std::vector<float>& v, int64_t k) {
    if (k <= 0 || v.empty())
        return {};

    // Never ask for more entries than exist: everything below is bounded by s, not by k.
    const auto s = std::min<std::ptrdiff_t>(k, static_cast<std::ptrdiff_t>(v.size()));

    // Sort indices instead of values so that the original positions are kept.
    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);

    // Only the first s positions need to be in order.
    std::partial_sort(idx.begin(), std::next(idx.begin(), s), idx.end(), [&v](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(static_cast<std::size_t>(s));
    for (std::ptrdiff_t i = 0; i < s; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}

/**
 * @brief Size in bytes of one element of the given TensorRT data type.
 *
 * Handles kINT8 (and kUINT8 when TRT_VERSION >= 8510), kFLOAT, kHALF and kINT32.
 * Any other type prints "Unsupported data type" to std::cerr and aborts.
 */
static size_t getSize(DataType dt) {
    bool singleByte = (dt == DataType::kINT8);
#if TRT_VERSION >= 8510
    singleByte = singleByte || (dt == DataType::kUINT8);
#endif
    if (singleByte) {
        return sizeof(int8_t);
    }
    if (dt == DataType::kFLOAT) {
        return sizeof(float);
    }
    if (dt == DataType::kHALF) {
        // half precision occupies 16 bits
        return sizeof(int16_t);
    }
    if (dt == DataType::kINT32) {
        return sizeof(int32_t);
    }

    std::cerr << "Unsupported data type\n";
    std::abort();
}


================================================
FILE: lprnet/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.17.0)

project(
  lprnet
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default set of GPU architectures; override with -DCMAKE_CUDA_ARCHITECTURES=...
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      60
      70
      72
      75
      80
      86
      89)
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
# lprnet.cpp includes <opencv2/opencv.hpp> unconditionally, so a missing OpenCV
# must stop the configure step instead of surfacing later as a compile error.
find_package(OpenCV REQUIRED)

# Reuse the TensorRT target when a parent project already created it.
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

# The executable is named after the project, i.e. `lprnet`.
add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cpp)
target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart
                                             TensorRT::TensorRT ${OpenCV_LIBS})

if(WIN32)
  set_target_properties(
    ${PROJECT_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY
                               "MultiThreaded$<$<CONFIG:Debug>:Debug>")
endif()

# The source contains UTF-8 string literals; MSVC needs /utf-8 to read them.
target_compile_options(${PROJECT_NAME} PRIVATE $<$<CXX_COMPILER_ID:MSVC>:/utf-8>)


================================================
FILE: lprnet/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Filters the candidate directories given after the two named arguments and
# keeps those that exist and contain every entry of the `required_files` list.
# The accepted directories are returned to the caller as a list in <var_name>.
# Configuration stops with FATAL_ERROR when no candidate qualifies.
function(_guess_path var_name required_files)
  set(_result "")

  foreach(_dir IN LISTS ARGN)
    if(EXISTS "${_dir}")
      # Assume the directory is complete until a required file is missing.
      set(_complete TRUE)
      foreach(_file IN LISTS required_files)
        if(EXISTS "${_dir}/${_file}")
          continue()
        endif()
        message(DEBUG "'${_dir}' missing '${_file}'")
        set(_complete FALSE)
        break()
      endforeach()

      if(NOT _complete)
        message(DEBUG "reject '${_dir}'")
      else()
        list(APPEND _result "${_dir}")
        message(DEBUG "accept '${_dir}'")
      endif()
    else()
      message(DEBUG "skip non-existing path '${_dir}'")
    endif()
  endforeach()

  if("${_result}" STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_result}"
      PARENT_SCOPE)
endfunction()

# add library
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

# TRT_VERSION can be given as a cache entry (-DTRT_VERSION=...) or through the
# TRT_VERSION environment variable; the environment wins when both are set.
set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# The expansions are quoted so that an unset/empty value is compared as an
# empty string instead of vanishing from the if() argument list.
if(NOT "${TRT_VERSION}" STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# Everything below depends on the version, so fail early with a clear message
# rather than with an argument-count error from string()/if().
if("${TRT_VERSION}" STREQUAL "")
  message(
    FATAL_ERROR
      "TRT_VERSION is not set: pass -DTRT_VERSION=<version> or export the TRT_VERSION environment variable"
  )
endif()

# The major version is the first run of digits in the version string.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

if("${TRT_MAJOR_VERSION}" STREQUAL "")
  message(
    FATAL_ERROR
      "TRT_VERSION='${TRT_VERSION}' does not contain a major version number")
endif()

if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(
      FATAL_ERROR
        "TensorRT_DIR=${TensorRT_DIR} does not exist!"
    )
  endif()

  if(TRT_MAJOR_VERSION GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include"
                                "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  set(_modules nvinfer nvinfer_plugin)
  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  if(TensorRT_${lib}_LIBRARY)
    list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
  elseif(lib MATCHES "^nvinfer(_[0-9]+)?$")
    # Nothing can link without the core runtime library.
    message(
      FATAL_ERROR
        "TensorRT library '${lib}' not found (searched: ${TensorRT_LIBRARY_DIR})"
    )
  else()
    # Do not put a '<var>-NOTFOUND' entry on the link line; some of the
    # auxiliary libraries are not shipped with every TensorRT release.
    message(WARNING "TensorRT library '${lib}' not found, skipping it")
  endif()
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)


================================================
FILE: lprnet/README.md
================================================
# LPRNet

The PyTorch implementation is [xuexingyu24/License_Plate_Detection_Pytorch](https://github.com/xuexingyu24/License_Plate_Detection_Pytorch).

## Usage

1. download the model from [HERE](https://github.com/xuexingyu24/License_Plate_Detection_Pytorch/blob/master/LPRNet/weights/Final_LPRNet_model.pth) and put it into the `models` folder (the scripts look for it at `../models`, relative to this directory)

2. use `gen_wts.py` to generate the wts file

```bash
python3 gen_wts.py
```

3. build C++ code

```bash
pushd tensorrtx/lprnet
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

4. serialize wts model to engine file

```bash
./build/lprnet -s
```

now you may see `LPRNet.engine` under `models`

5. run inference

The sample code uses the image under `assets` by default:

![sample](../assets/car_plate.jpg)

```bash
./build/lprnet -d
```

output looks like:

```bash
...
Execution time: 205us
-65.58, -28.74, -52.1, -70.79, -53.36, -57.58, -70.97, -60.66, -48.18, -57.38, -54.07, -58.56, -49.04, -52.39, -51.94, -53.4, -49.04, -45.89, -49.42, -7.863, -42.12,
====
Execution time: 202us
-65.58, -28.74, -52.1, -70.79, -53.36, -57.58, -70.97, -60.66, -48.18, -57.38, -54.07, -58.56, -49.04, -52.39, -51.94, -53.4, -49.04, -45.89, -49.42, -7.863, -42.12,
====
result: 沪BKB770
```

## Note

If you are running this demo on Windows, you may need to check the console code page first. For example, in Windows PowerShell run:

```ps1
chcp
```

if the output is not **65001**, then use

```ps1
chcp 65001
```

to set the code page to UTF-8, so that the Chinese characters in the result are displayed correctly.


================================================
FILE: lprnet/gen_wts.py
================================================
"""
model codes are borrowed from:
`https://github.com/xuexingyu24/License_Plate_Detection_Pytorch/blob/master/LPRNet/model/LPRNET.py`

check `.pth` model here:
`https://github.com/xuexingyu24/License_Plate_Detection_Pytorch/blob/master/LPRNet/weights/Final_LPRNet_model.pth`

"""

import struct

import cv2
import numpy as np
import torch
import torch.nn as nn

# Class alphabet of the recognizer: len(CHARS) is used as the number of output
# classes and a predicted class index i is decoded as CHARS[i].
# NOTE(review): the trailing "-" is stripped from the decoded string below, so
# it presumably acts as the blank/separator class -- confirm against training.
CHARS = "京沪津渝冀晋蒙辽吉黑苏浙皖闽赣鲁豫鄂湘粤桂琼川贵云藏陕甘青宁新0123456789ABCDEFGHJKLMNPQRSTUVWXYZIO-"


def preprocess(path):
    """Load an image file and convert it into the network input tensor.

    The image is read as 3-channel BGR, resized to width 94 / height 24 with
    bicubic interpolation, scaled to ``pixel / 255 - 0.5`` as float32 and
    rearranged from HxWx3 to a 1x3x24x94 tensor.

    Args:
        path: path of the image file to read.

    Returns:
        torch.Tensor of shape (1, 3, 24, 94) and dtype float32.

    Raises:
        FileNotFoundError: if OpenCV cannot read an image from ``path``.
    """
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # without this check the error would surface inside cv2.resize.
        raise FileNotFoundError(f"cannot read image: {path}")
    image = cv2.resize(image, (94, 24), interpolation=cv2.INTER_CUBIC)
    image = image.astype(np.float32)
    image = image / 255.0 - 0.5  # still HxWx3, BGR
    image = image.transpose(2, 0, 1)[None, ...]  # HxWx3 -> 1x3xHxW
    image = torch.from_numpy(image)
    return image


class small_basic_block(nn.Module):
    """Bottleneck of four convolutions: 1x1 -> 3x1 -> 1x3 -> 1x1.

    The three inner stages work on ``ch_out // 4`` channels and each of the
    first three convolutions is followed by a ReLU; the last 1x1 convolution
    expands to ``ch_out`` channels. The padded 3x1 and 1x3 convolutions keep
    the spatial size unchanged.
    """

    def __init__(self, ch_in, ch_out):
        super().__init__()
        mid = ch_out // 4
        stages = [
            nn.Conv2d(ch_in, mid, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(mid, mid, kernel_size=(3, 1), padding=(1, 0)),
            nn.ReLU(),
            nn.Conv2d(mid, mid, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(),
            nn.Conv2d(mid, ch_out, kernel_size=1),
        ]
        # The attribute name and layer order define the state_dict keys.
        self.block = nn.Sequential(*stages)

    def forward(self, x):
        out = self.block(x)
        return out


class LPRNet(nn.Module):
    """License plate recognition network (backbone + global-context head).

    ``backbone`` is a fixed sequence of 23 layers; the outputs of layers
    2, 6, 13 and 22 are tapped, pooled to a common spatial size, normalized
    and concatenated along the channel axis before ``container`` maps them to
    ``class_num`` channels.

    Args:
        class_num: number of output classes (channels of the logits).
        dropout_rate: probability used by the two dropout layers.
    """

    def __init__(self, class_num, dropout_rate):
        super(LPRNet, self).__init__()
        self.class_num = class_num
        # NOTE: the layer order is part of the checkpoint format (state_dict
        # keys are "backbone.<index>...."), so it must not be changed.
        self.backbone = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1),  # 0
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(),  # 2
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 1, 1)),
            small_basic_block(ch_in=64, ch_out=128),  # 4
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(),  # 6
            # MaxPool3d with a stride on the first pooled axis also subsamples
            # the channels, which is why the next block takes 64 inputs.
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(2, 1, 2)),
            small_basic_block(ch_in=64, ch_out=256),  # 8
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),  # 10
            small_basic_block(ch_in=256, ch_out=256),  # 11
            nn.BatchNorm2d(num_features=256),  # 12
            nn.ReLU(),  # 13
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(4, 1, 2)),  # 14
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=64, out_channels=256, kernel_size=(1, 4), stride=1),  # 16
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),  # 18
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=256, out_channels=class_num, kernel_size=(13, 1), stride=1),  # 20
            nn.BatchNorm2d(num_features=class_num),
            nn.ReLU(),  # 22
        )
        # Input channels = channels of the four tapped feature maps.
        self.container = nn.Sequential(
            nn.Conv2d(
                in_channels=256 + class_num + 128 + 64, out_channels=self.class_num, kernel_size=(1, 1), stride=(1, 1)
            )
        )

    def forward(self, x):
        """Run the network.

        Args:
            x: input batch; a 1x3x24x94 tensor yields logits of shape
                (1, class_num, 18).

        Returns:
            Logits averaged over dim 2 of the head output.
        """
        # Collect the feature maps after the ReLUs at backbone indices 2, 6, 13, 22.
        keep_features = []
        for i, layer in enumerate(self.backbone.children()):
            x = layer(x)
            if i in (2, 6, 13, 22):
                keep_features.append(x)

        global_context = []
        for i, f in enumerate(keep_features):
            # Pool the first three taps down to the spatial size of the last one.
            if i in (0, 1):
                f = nn.functional.avg_pool2d(f, kernel_size=5, stride=5)
            elif i == 2:
                f = nn.functional.avg_pool2d(f, kernel_size=(4, 10), stride=(4, 2))
            # Normalize each tap by the mean of its squared activations.
            f_mean = torch.mean(torch.pow(f, 2))
            global_context.append(torch.div(f, f_mean))

        x = torch.cat(global_context, 1)
        x = self.container(x)
        logits = torch.mean(x, dim=2)

        return logits


if __name__ == "__main__":
    # Script: load the checkpoint, run one sample inference, then dump every
    # state_dict entry to a text ".wts" file.
    model_path = "../models/Final_LPRNet_model.pth"

    model = LPRNet(class_num=len(CHARS), dropout_rate=0)
    print("loading pretrained model from %s" % model_path)
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    # NOTE(review): torch.load unpickles the file; only use checkpoints from a
    # trusted source.
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint)

    img = preprocess("../assets/car_plate.jpg")
    model.eval()
    print(model)
    with torch.inference_mode():
        preds = model(img)
        # Best class per output position, decoded through CHARS; "-" is dropped.
        best = torch.argmax(preds[0], dim=0).tolist()
        res = "".join(CHARS[i] for i in best).replace("-", "")

    state = model.state_dict()
    with open("../models/LPRNet.wts", "w") as f:
        # Header line: number of entries that follow.
        f.write("{}\n".format(len(state)))
        for k, v in state.items():
            print("key: ", k)
            print("value: ", v.shape)
            vr = v.reshape(-1).cpu().numpy()
            # One line per entry: "<name> <count>" followed by each value as the
            # hex dump of its big-endian float32 encoding.
            hex_words = "".join(" " + struct.pack(">f", float(vv)).hex() for vv in vr)
            f.write("{} {}{}\n".format(k, len(vr), hex_words))

    print(f"inference result: {res}")


================================================
FILE: lprnet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <cstdint>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include <utility>
#include "NvInferRuntime.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//! \brief String buffer that forwards its contents to a target stream, prefixed with a timestamp and a
//!        severity prefix, whenever it is synchronized (flushed) or destroyed with pending output.
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted output
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    // Takes over the target stream, prefix and flag; buffered text of `other` is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! \brief Write "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the target stream, then clear the
    //!        buffer and flush the stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // Build the timestamp in a scratch stream: the sticky setfill('0') must not leak into a shared
            // stream such as std::cout, and the timestamp must land on the same stream as the message.
            std::ostringstream stamp;
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {  // localtime may fail; then log without a timestamp
                stamp << std::setfill('0');
                stamp << "[";
                stamp << std::setw(2) << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << tm_local->tm_mday << "/";
                stamp << std::setw(4) << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << tm_local->tm_hour << ":";
                stamp << std::setw(2) << tm_local->tm_min << ":";
                stamp << std::setw(2) << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer. LogStreamConsumer lists it as its first base class so that
//!        the buffer is constructed before the std::ostream base that receives its address.
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, std::string prefix, bool shouldLog)
        : mBuffer{stream, std::move(prefix), shouldLog} {}

   protected:
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for one log message of a fixed severity, usable with the usual C++ `<<` syntax.
//!  The base-class order (LogStreamConsumerBase first, std::ostream second) is deliberate: the base holds
//!  the LogStreamConsumerBuffer, which therefore exists before its address is handed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  A message is emitted only when `severity <= reportableSeverity`.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a fresh consumer with the severity and logging flag of `other`;
    //!  the new object gets its own buffer.
    LogStreamConsumer(LogStreamConsumer&& other) noexcept
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! std::cout for kINFO and anything numerically above it, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Four-character tag placed in front of every message of the given severity.
    static std::string severityPrefix(Severity severity) {
        if (severity == Severity::kINTERNAL_ERROR) {
            return "[F] ";
        }
        if (severity == Severity::kERROR) {
            return "[E] ";
        }
        if (severity == Severity::kWARNING) {
            return "[W] ";
        }
        if (severity == Severity::kINFO) {
            return "[I] ";
        }
        if (severity == Severity::kVERBOSE) {
            return "[V] ";
        }
        assert(0);
        return "";
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   private:
    struct TestInfo;

   public:
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult : std::uint8_t {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n';
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, TestInfo info)
            : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom{false, TestInfo{name, cmdline}};
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    struct TestInfo {
        std::string name;
        std::string cmdline;
    };
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << '\n';
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace {

// Each helper below returns a LogStreamConsumer bound to the logger's current reportable severity and to one
// fixed message severity. Text streamed into the returned object is emitted when it is flushed or destroyed.

//!
//! \brief creates a LogStreamConsumer for messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: lprnet/lprnet.cpp
================================================
#include <NvInfer.h>
#include <algorithm>
#include <array>
#include <chrono>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <map>
#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "logging.h"
#include "utils.h"
#ifdef _WIN32
#define NOMINMAX
#include <Windows.h>
#endif

using namespace nvinfer1;

// Weight blobs keyed by name.
using WeightMap = std::map<std::string, Weights>;
using NDCF = nvinfer1::NetworkDefinitionCreationFlag;

static Logger gLogger;

// 16 << 20 bytes = 16 MiB
static constexpr std::size_t WORKSPACE_SIZE = 16 << 20;
static constexpr int32_t DEVICE = 0;
static constexpr int32_t BATCH_SIZE = 1;
// Paths are relative to the working directory.
static constexpr const char* WTS_PATH = "../models/LPRNet.wts";
static constexpr const char* ENGINE_PATH = "../models/LPRNet.engine";
// stuff we know about the network and the input/output blobs
static constexpr int32_t INPUT_H = 24;
static constexpr int32_t INPUT_W = 94;
static constexpr std::array<const char*, 2> NAMES = {"data", "prob"};
// NOTE(review): presumably the element counts of the two blobs in NAMES (68 matches the
// alphabet size below) -- confirm against the code that allocates the buffers.
static constexpr std::array<int32_t, 2> SIZES = {3 * INPUT_H * INPUT_W, 18 * 68};
static constexpr bool TRT_PREPROCESS = (TRT_VERSION >= 8510);
static constexpr std::array<const float, 3> mean = {0.5f, 0.5f, 0.5f};
static constexpr std::array<const float, 3> stdv = {1.f, 1.f, 1.f};

// Output classes in index order: 31 province characters, digits, letters, then "-".
const std::array<const std::string, 68> alphabet = {
        "京", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "皖", "闽", "赣", "鲁",
        "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新",
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "A", "B", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M", "N",
        "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "I", "O", "-"};

/**
 * @brief Fold an inference-mode BatchNorm2d into a single per-channel TensorRT scale layer.
 *
 * Reads "<lname>.weight", "<lname>.bias", "<lname>.running_mean" and "<lname>.running_var"
 * from the weight map and derives
 *     scale = gamma / sqrt(var + eps)
 *     shift = beta - mean * gamma / sqrt(var + eps)
 *     power = 1
 * The derived buffers are malloc-allocated and stored back into the map under
 * "<lname>.scale", "<lname>.shift" and "<lname>.power" so the owner of the map can release them.
 *
 * @param network   network the layer is added to
 * @param weightMap weight map to read from and to register the derived buffers in
 * @param input     tensor to normalize
 * @param lname     weight-name prefix, also used as the layer name
 * @param eps       value added to the variance before the square root
 * @return the created scale layer
 */
IScaleLayer* addBatchNorm2d(INetworkDefinition* network, WeightMap& weightMap, ITensor& input, const std::string& lname,
                            float eps = 1e-5) {
    const auto* bnGamma = reinterpret_cast<const float*>(weightMap[lname + ".weight"].values);
    const auto* bnBeta = reinterpret_cast<const float*>(weightMap[lname + ".bias"].values);
    const auto* runMean = reinterpret_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const auto* runVar = reinterpret_cast<const float*>(weightMap[lname + ".running_var"].values);
    const int64_t channels = weightMap[lname + ".running_var"].count;

    // One buffer per scale-layer parameter; they must outlive the engine build.
    auto* scaleBuf = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    auto* shiftBuf = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    auto* powerBuf = reinterpret_cast<float*>(malloc(sizeof(float) * channels));

    for (int ch = 0; ch < channels; ++ch) {
        scaleBuf[ch] = bnGamma[ch] / sqrt(runVar[ch] + eps);
        shiftBuf[ch] = bnBeta[ch] - runMean[ch] * bnGamma[ch] / sqrt(runVar[ch] + eps);
        powerBuf[ch] = 1.0f;
    }

    const Weights scaleW{DataType::kFLOAT, scaleBuf, channels};
    const Weights shiftW{DataType::kFLOAT, shiftBuf, channels};
    const Weights powerW{DataType::kFLOAT, powerBuf, channels};

    // Register the buffers so they are released together with the rest of the map.
    weightMap[lname + ".scale"] = scaleW;
    weightMap[lname + ".shift"] = shiftW;
    weightMap[lname + ".power"] = powerW;

    IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shiftW, scaleW, powerW);
    assert(bn);
    bn->setName(lname.c_str());
    return bn;
}

/**
 * @brief Append a four-convolution "small basic block": 1x1 -> 3x1 -> 1x3 -> 1x1.
 *
 * The first three convolutions produce ch_out / 4 channels and are each followed by a ReLU;
 * the last one produces ch_out channels and has no activation. Weights are read from
 * "<lname>.block.{0,2,4,6}.{weight,bias}", while the layers are named "<lname>.block.{0..3}".
 *
 * @param network network the layers are added to
 * @param w       weight map
 * @param input   block input tensor
 * @param ch_out  number of output channels of the block
 * @param lname   weight-name prefix
 * @return the last convolution layer of the block
 */
IConvolutionLayer* smallBasicBlock(INetworkDefinition* network, WeightMap& w, ITensor& input, int ch_out,
                                   const std::string& lname) {
    struct ConvSpec {
        DimsHW kernel, pad;
        int filters;
        std::string weightKey, biasKey;
    };
    const int squeezed = ch_out / 4;
    const std::array<ConvSpec, 4> specs = {{
            {DimsHW{1, 1}, DimsHW{0, 0}, squeezed, lname + ".block.0.weight", lname + ".block.0.bias"},
            {DimsHW{3, 1}, DimsHW{1, 0}, squeezed, lname + ".block.2.weight", lname + ".block.2.bias"},
            {DimsHW{1, 3}, DimsHW{0, 1}, squeezed, lname + ".block.4.weight", lname + ".block.4.bias"},
            {DimsHW{1, 1}, DimsHW{0, 0}, ch_out, lname + ".block.6.weight", lname + ".block.6.bias"},
    }};

    ITensor* x = &input;
    IConvolutionLayer* conv{nullptr};
    for (std::size_t n = 0; n < specs.size(); ++n) {
        const ConvSpec& spec = specs[n];
        conv = network->addConvolutionNd(*x, spec.filters, spec.kernel, w[spec.weightKey], w[spec.biasKey]);
        assert(conv);
        conv->setPaddingNd(spec.pad);
        conv->setName((lname + ".block." + std::to_string(n)).c_str());

        // Only the final convolution is left without an activation.
        const bool isLast = (n + 1 == specs.size());
        if (isLast) {
            x = conv->getOutput(0);
            continue;
        }
        auto* act = network->addActivation(*conv->getOutput(0), ActivationType::kRELU);
        assert(act);
        x = act->getOutput(0);
    }
    return conv;
}

/**
 * @brief Build the LPRNet network from WTS_PATH and return a ready-to-use engine.
 *
 * The network is: backbone (conv/BN/ReLU, small basic blocks, 3D max-pools emulated through
 * 5D reshapes) -> four "global context" branches that are pooled, divided by the mean of their
 * squares and concatenated on the channel axis -> 1x1 convolution -> average over axis 2.
 * The output tensor is named NAMES[1].
 *
 * @param N       batch size of the network input
 * @param runtime runtime used to deserialize the built plan (TensorRT >= 8 path)
 * @param builder TensorRT builder
 * @param config  builder configuration; the workspace limit is set here
 * @param dt      input data type; replaced by kUINT8 when TRT_PREPROCESS is enabled
 * @return the built engine; the caller owns it
 *
 * All host weight buffers in the weight map are released with free() before returning, so
 * every entry must come from a malloc-compatible allocator.
 */
ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    const int nc = 68;
    WeightMap w = loadWeights(WTS_PATH);

#if TRT_VERSION >= 11200
    auto flag = 1U << static_cast<int>(NDCF::kSTRONGLY_TYPED);
#elif TRT_VERSION >= 10000
    auto flag = 0U;
#else
    auto flag = 1U << static_cast<int>(NDCF::kEXPLICIT_BATCH);
#endif
    auto* network = builder->createNetworkV2(flag);
    assert(network);

    ITensor* data{nullptr};
    if constexpr (TRT_PREPROCESS) {
        // `if constexpr` outside a template still compiles the discarded branch, so the
        // kUINT8 reference needs the same preprocessor guard that getSize() uses.
#if TRT_VERSION >= 8510
        // for simplicity, resize image on cpu side
        dt = DataType::kUINT8;
        auto* input = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3});
        assert(input);
        auto* trans = addTransformLayer(network, *input, false, mean, stdv);
        data = trans->getOutput(0);
#endif
    } else {
        data = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W});
    }
    assert(data);

    // CBR (Conv-BatchNorm-ReLU)
    auto* c0 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, w["backbone.0.weight"], w["backbone.0.bias"]);
    assert(c0);
    auto* bn0 = addBatchNorm2d(network, w, *c0->getOutput(0), "backbone.1");
    assert(bn0);
    auto* relu0 = network->addActivation(*bn0->getOutput(0), ActivationType::kRELU);
    assert(relu0);

    auto* f0 = network->addPoolingNd(*relu0->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(f0);
    f0->setStrideNd(DimsHW{1, 1});

    auto* sm0 = smallBasicBlock(network, w, *f0->getOutput(0), 128, "backbone.4");
    assert(sm0);
    auto* bn1 = addBatchNorm2d(network, w, *sm0->getOutput(0), "backbone.5");
    assert(bn1);
    auto* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    // need to unsqueeze to 5D tensor for 3D pooling
    auto* to5d0 = network->addShuffle(*relu1->getOutput(0));
    assert(to5d0);
    to5d0->setReshapeDimensions({5, {N, 1, 128, 20, 90}});
    auto* f1 = network->addPoolingNd(*to5d0->getOutput(0), PoolingType::kMAX, Dims3{1, 3, 3});
    assert(f1);
    f1->setStrideNd(Dims3{2, 1, 2});
    f1->setName("MaxPool3d_1");
    auto* to5d1 = network->addShuffle(*f1->getOutput(0));
    assert(to5d1);
    to5d1->setReshapeDimensions(Dims4{N, 64, 18, 44});

    auto* sm1 = smallBasicBlock(network, w, *to5d1->getOutput(0), 256, "backbone.8");
    auto* bn2 = addBatchNorm2d(network, w, *sm1->getOutput(0), "backbone.9");
    auto* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(relu2);
    auto* sm2 = smallBasicBlock(network, w, *relu2->getOutput(0), 256, "backbone.11");
    auto* bn3 = addBatchNorm2d(network, w, *sm2->getOutput(0), "backbone.12");
    auto* relu3 = network->addActivation(*bn3->getOutput(0), ActivationType::kRELU);
    assert(relu3);

    // need to unsqueeze to 5D tensor for 3D pooling
    auto* to5d2 = network->addShuffle(*relu3->getOutput(0));
    assert(to5d2);
    to5d2->setReshapeDimensions({5, {N, 1, 256, 18, 44}});
    auto* f2 = network->addPoolingNd(*to5d2->getOutput(0), PoolingType::kMAX, Dims3{1, 3, 3});
    assert(f2);
    f2->setStrideNd(Dims3{4, 1, 2});
    f2->setName("MaxPool3d_2");
    auto* to5d3 = network->addShuffle(*f2->getOutput(0));
    assert(to5d3);
    to5d3->setReshapeDimensions(Dims4{N, 64, 16, 21});

    // CBR (Conv-BatchNorm-ReLU)
    c0 = network->addConvolutionNd(*to5d3->getOutput(0), 256, DimsHW{1, 4}, w["backbone.16.weight"],
                                   w["backbone.16.bias"]);
    assert(c0);
    auto* bn4 = addBatchNorm2d(network, w, *c0->getOutput(0), "backbone.17");
    auto* relu5 = network->addActivation(*bn4->getOutput(0), ActivationType::kRELU);
    assert(relu5);

    // CBR (Conv-BatchNorm-ReLU)
    c0 = network->addConvolutionNd(*relu5->getOutput(0), nc, DimsHW{13, 1}, w["backbone.20.weight"],
                                   w["backbone.20.bias"]);
    assert(c0);
    auto* bn5 = addBatchNorm2d(network, w, *c0->getOutput(0), "backbone.21");
    auto* backbone = network->addActivation(*bn5->getOutput(0), ActivationType::kRELU);
    assert(backbone);

    // Builds one global-context branch: optional average pooling, then feat / mean(feat^2).
    // The static counter only serves to give every branch's layers and constants unique names.
    auto makeGlobalContext = [&](ITensor* feat, bool pool5, bool pool4x10) -> ITensor* {
        static int j = 0;
        ITensor* t = feat;
        if (pool5) {
            auto* pool = network->addPoolingNd(*t, PoolingType::kAVERAGE, DimsHW{5, 5});
            assert(pool);
            pool->setStrideNd(DimsHW{5, 5});
            auto _name = "global5." + std::to_string(j);
            pool->setName(_name.c_str());
            t = pool->getOutput(0);
        }
        if (pool4x10) {
            auto* pool = network->addPoolingNd(*t, PoolingType::kAVERAGE, DimsHW{4, 10});
            assert(pool);
            pool->setStrideNd(DimsHW{4, 2});
            auto _name = "global4x10." + std::to_string(j);
            pool->setName(_name.c_str());
            t = pool->getOutput(0);
        }

        // pow: element-wise square, implemented as POW against a constant tensor filled with 2
        Dims dims = t->getDimensions();
        int64_t size = dims.d[0] * dims.d[1] * dims.d[2] * dims.d[3];
        void* exponents = malloc(sizeof(float) * size);
        for (int i = 0; i < size; ++i) {
            reinterpret_cast<float*>(exponents)[i] = 2.0f;
        }
        auto name = "pow." + std::to_string(j);
        // Stored in the weight map so the buffer is released together with the other weights.
        w[name] = {DataType::kFLOAT, exponents, size};
        auto* pow_const = network->addConstant(dims, w[name]);
        assert(pow_const);
        auto* squared = network->addElementWise(*t, *pow_const->getOutput(0), ElementWiseOperation::kPOW);
        assert(squared);
        squared->setName(name.c_str());

        // mean: average over every axis (all nbDims bits set), keeping the rank
        int32_t mask = (1 << dims.nbDims) - 1;
        auto* avg = network->addReduce(*squared->getOutput(0), ReduceOperation::kAVG, mask, true);
        assert(avg);
        auto _mean_name = "mean." + std::to_string(j);
        avg->setName(_mean_name.c_str());

        // div
        auto* div = network->addElementWise(*t, *avg->getOutput(0), ElementWiseOperation::kDIV);
        assert(div);
        auto _div_name = "div." + std::to_string(j);
        div->setName(_div_name.c_str());
        ++j;
        return div->getOutput(0);
    };

    auto* gc0 = makeGlobalContext(relu0->getOutput(0), true, false);
    auto* gc1 = makeGlobalContext(relu1->getOutput(0), true, false);
    auto* gc2 = makeGlobalContext(relu3->getOutput(0), false, true);
    auto* gc3 = makeGlobalContext(backbone->getOutput(0), false, false);
    const std::array<ITensor*, 4> gcs = {gc0, gc1, gc2, gc3};
    auto* cat = network->addConcatenation(gcs.data(), 4);
    assert(cat);
    cat->setAxis(1);

    auto* c = network->addConvolutionNd(*cat->getOutput(0), nc, DimsHW{1, 1}, w["container.0.weight"],
                                        w["container.0.bias"]);
    assert(c);
    // 0x04 selects axis 2; the reduced axis is dropped from the output.
    auto* logits = network->addReduce(*c->getOutput(0), ReduceOperation::kAVG, 0x04, false);
    assert(logits);
    logits->getOutput(0)->setName(NAMES[1]);

    network->markOutput(*logits->getOutput(0));

#if TRT_VERSION >= 8000
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE);
    IHostMemory* plan = builder->buildSerializedNetwork(*network, *config);
    assert(plan && "failed to build the serialized network");
    ICudaEngine* engine = runtime->deserializeCudaEngine(plan->data(), plan->size());
    // The serialized plan is no longer needed once the engine has been deserialized.
    delete plan;
    delete network;
#else
    builder->setMaxBatchSize(N);
    config->setMaxWorkspaceSize(WORKSPACE_SIZE);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    network->destroy();
#endif

    std::cout << "build finished\n";
    // Release host memory
    for (auto& entry : w) {
        free((void*)entry.second.values);
    }

    return engine;
}

/**
 * @brief Build the engine through the TensorRT API and hand back its serialized form.
 *
 * @param N           batch size forwarded to createEngine()
 * @param runtime     runtime forwarded to createEngine()
 * @param modelStream receives the serialized engine; the caller owns the returned memory
 */
void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) {
    auto* trtBuilder = createInferBuilder(gLogger);
    auto* buildConfig = trtBuilder->createBuilderConfig();

    // The network itself is always described with a float input type here.
    auto* built = createEngine(N, runtime, trtBuilder, buildConfig, DataType::kFLOAT);
    assert(built != nullptr);

    *modelStream = built->serialize();

    // Builder-side objects are not needed once the plan has been serialized.
#if TRT_VERSION >= 8000
    delete built;
    delete buildConfig;
    delete trtBuilder;
#else
    built->destroy();
    buildConfig->destroy();
    trtBuilder->destroy();
#endif
}

/**
 * @brief Run one inference on a freshly created CUDA stream and return every output tensor.
 *
 * Device buffers are allocated for all engine I/O tensors (sized from SIZES and the tensor
 * data type), the host input is uploaded into tensor 0, the context is enqueued and each
 * remaining tensor is downloaded as float.
 *
 * @param context   execution context of a deserialized engine
 * @param input     host pointer to the input data for I/O tensor 0
 * @param batchSize number of samples in `input`
 * @return one vector per output tensor (I/O index >= 1), each holding batchSize * SIZES[i] floats
 *
 * The process is aborted if a CUDA call fails or the enqueue is rejected.
 */
auto doInference(IExecutionContext& context, void* input, int64_t batchSize) -> std::vector<std::vector<float>> {
    const auto& engine = context.getEngine();
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    std::vector<void*> buffers;

#if TRT_VERSION >= 8000
    const int32_t nIO = engine.getNbIOTensors();
#else
    const int32_t nIO = engine.getNbBindings();
#endif

    buffers.resize(nIO);
    for (auto i = 0; i < nIO; ++i) {
        std::size_t size = 0;
#if TRT_VERSION >= 8000
        auto* tensor_name = engine.getIOTensorName(i);
        auto s = getSize(engine.getTensorDataType(tensor_name));
        size = s * batchSize * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
        context.setTensorAddress(tensor_name, buffers[i]);
#else
        const int32_t idx = engine.getBindingIndex(NAMES[i]);
        auto s = getSize(engine.getBindingDataType(idx));
        assert(idx == i);
        size = s * batchSize * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
#endif
    }

    // The enqueue call must not live inside assert(): with NDEBUG the whole expression is
    // removed and inference would silently never run.
#if TRT_VERSION >= 8000
    const bool enqueued = context.enqueueV3(stream);
#else
    const bool enqueued = context.enqueueV2(buffers.data(), stream, nullptr);
#endif
    if (!enqueued) {
        std::cerr << "failed to enqueue inference\n";
        std::abort();
    }

    std::vector<std::vector<float>> prob;
    prob.reserve(static_cast<std::size_t>(nIO > 1 ? nIO - 1 : 0));
    for (int i = 1; i < nIO; ++i) {
        // Download straight into the returned storage so the destination of the asynchronous
        // copy stays alive (and is not copied from) until the stream has been synchronized.
        const std::size_t count = static_cast<std::size_t>(batchSize * SIZES[i]);
        prob.emplace_back(count, std::nanf(""));
        CHECK(cudaMemcpyAsync(prob.back().data(), buffers[i], count * sizeof(float), cudaMemcpyDeviceToHost,
                              stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    for (auto& buffer : buffers) {
        CHECK(cudaFree(buffer));
    }
    CHECK(cudaStreamDestroy(stream));
    return prob;
}

/**
 * @brief Entry point of the LPRNet demo.
 *
 *   -s  build the engine from WTS_PATH and write the serialized plan to ENGINE_PATH
 *   -d  load ENGINE_PATH, run inference on ../assets/car_plate.jpg 100 times, print timings
 *       and, after the last run, the decoded plate string
 *
 * @return 0 on success, -1 on a usage or I/O error, 1 for an unknown option
 */
int main(int argc, char** argv) {
#if _WIN32
    SetConsoleOutputCP(CP_UTF8);
#endif
    cudaSetDevice(DEVICE);
    checkTrtEnv(DEVICE);
    if (argc != 2) {
        std::cerr << "arguments not right!\n";
        std::cerr << "./LPRnet -s  // serialize model to plan file\n";
        std::cerr << "./LPRnet -d  // deserialize plan file and run inference\n";
        return -1;
    }

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);

    char* trtModelStream{nullptr};
    std::streamsize size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(BATCH_SIZE, runtime, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc);
        if (!p) {
            std::cerr << "could not open plan output file\n";
            return -1;
        }
        if (modelStream->size() > static_cast<std::size_t>(std::numeric_limits<std::streamsize>::max())) {
            std::cerr << "this model is too large to serialize\n";
            return -1;
        }
        const auto* data_ptr = reinterpret_cast<const char*>(modelStream->data());
        auto data_size = static_cast<std::streamsize>(modelStream->size());
        p.write(data_ptr, data_size);
#if TRT_VERSION >= 8000
        delete modelStream;
#else
        modelStream->destroy();
#endif
        return 0;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file(ENGINE_PATH, std::ios::binary);
        // Without this check a missing plan would reach deserializeCudaEngine() as a null
        // buffer of size 0.
        if (!file.good()) {
            std::cerr << "could not open engine file: " << ENGINE_PATH << "\n";
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        if (size <= 0) {
            std::cerr << "engine file is empty or unreadable: " << ENGINE_PATH << "\n";
            return -1;
        }
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        if (!file) {
            std::cerr << "could not read engine file: " << ENGINE_PATH << "\n";
            delete[] trtModelStream;
            return -1;
        }
        file.close();
    } else {
        return 1;
    }

    void* input = nullptr;
    std::vector<float> data;
    cv::Mat img = cv::imread("../assets/car_plate.jpg");
    // cv::imread() reports failure through an empty matrix; resizing it would throw.
    if (img.empty()) {
        std::cerr << "could not read input image: ../assets/car_plate.jpg\n";
        delete[] trtModelStream;
        return -1;
    }
    if constexpr (TRT_PREPROCESS) {
        // for simplicity, resize image on cpu side
        cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_CUBIC);
        input = static_cast<void*>(img.data);
    } else {
        data = preprocess_img(img, false, mean, stdv, BATCH_SIZE, INPUT_H, INPUT_W);
        input = data.data();
    }

#if TRT_VERSION >= 8000
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
#else
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
#endif
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    for (int32_t i = 0; i < 100; ++i) {
        auto _start = std::chrono::system_clock::now();
        auto prob = doInference(*context, input, 1);
        auto _end = std::chrono::system_clock::now();
        auto _time = std::chrono::duration_cast<std::chrono::microseconds>(_end - _start).count();
        std::cout << "Execution time: " << _time << "us\n";

        // Print the first 21 values of every output as a quick sanity check.
        for (const auto& vector : prob) {
            int idx = 0;
            for (auto v : vector) {
                std::cout << std::setprecision(4) << v << ", " << std::flush;
                if (++idx > 20) {
                    std::cout << "\n====\n";
                    break;
                }
            }
        }

        if (i == 99) {
            // Greedy decoding: take the best class per position, then drop repeats of the
            // previous position's class and the class at index 67 ("-").
            int prev = 67;
            std::string str;
            for (int t = 0; t < 18; ++t) {
                std::array<float, 68> scores{};
                for (int c = 0; c < 68; ++c) {
                    // Output is laid out class-major: 68 classes x 18 positions.
                    scores[c] = prob[0][t + 18 * c];
                }
                int best =
                        static_cast<int>(std::distance(scores.begin(), std::max_element(scores.begin(), scores.end())));
                if (best != prev && best != 67)
                    str += alphabet[best];
                prev = best;
            }
            std::cout << "result: " << str << "\n";
        }
    }

    delete[] trtModelStream;
#if TRT_VERSION >= 8000
    delete context;
    delete engine;
    delete runtime;
#else
    context->destroy();
    engine->destroy();
    runtime->destroy();
#endif

    return 0;
}


================================================
FILE: lprnet/macros.h
================================================
#pragma once
#include <NvInfer.h>

// API: symbol visibility / DLL import-export decoration.
//   - API_EXPORTS defined:     export the symbol (dllexport on MSVC, default visibility elsewhere)
//   - API_EXPORTS not defined: import the symbol on MSVC, no decoration elsewhere
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_VERSION: the TensorRT version packed into one integer for preprocessor comparisons,
//   major * 1000 + minor * 100 + patch * 10 + build
// e.g. 8.5.1.0 -> 8510. The fields are not clamped to one digit, so a minor version of 10 or
// more carries into the thousands (10.12.0.0 -> 11200).
#define TRT_VERSION \
    ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD)

// Refuse to compile against TensorRT releases older than 7.2.2.
#if TRT_VERSION < 7220
#error "TensorRT >= 7.2.2 is required for this demo."
#endif

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE: expand to `noexcept` / `const` for TensorRT >= 8 and to
// nothing for older releases.
#if TRT_VERSION >= 8000
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: lprnet/utils.h
================================================
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <memory>
#include <numeric>
#include <opencv2/opencv.hpp>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "macros.h"

using namespace nvinfer1;

// CHECK(status): evaluate a CUDA runtime call exactly once; if the result is not cudaSuccess,
// print the error code to stderr and abort the process. The do { ... } while (0) wrapper makes
// the macro behave like a single statement.
#define CHECK(status)                                     \
    do {                                                  \
        auto ret = (status);                              \
        if (ret != cudaSuccess) {                         \
            std::cerr << "Cuda failure: " << ret << "\n"; \
            std::abort();                                 \
        }                                                 \
    } while (0)

static inline void checkTrtEnv(int device = 0) {
#if TRT_VERSION < 8000
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    CHECK(cudaGetDeviceProperties(&prop, device));
    const int sm = prop.major * 10 + prop.minor;
    if (sm > 86) {
        std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU.";
        std::abort();
    }
#endif
}

/**
 * @brief Load a TensorRT weight (.wts) text file into a name -> Weights map.
 *
 * File layout: the first token is the number of blobs, followed by one record per blob
 *     <name> <count> <count 32-bit words in hexadecimal>
 * Every blob is exposed as DataType::kFLOAT; the words are stored bit-for-bit.
 *
 * Ownership: each `values` buffer is allocated with malloc() and is owned by the caller,
 * who must release it with free().
 *
 * The process is aborted with a message on stderr if the file cannot be opened or is malformed.
 *
 * @param file input weight file path
 * @return std::map<std::string, nvinfer1::Weights>
 */
static inline auto loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << "\n";
    std::map<std::string, nvinfer1::Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    if (!input.is_open()) {
        std::cerr << "Unable to load weight file.\n";
        std::abort();
    }

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    if (!input || count <= 0) {
        std::cerr << "Invalid weight map file.\n";
        std::abort();
    }

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> wt.count;
        if (!input || wt.count < 0) {
            std::cerr << "Invalid weight map file.\n";
            std::abort();
        }

        // Load blob. malloc() is used on purpose: the buffers are released with free().
        const auto words = static_cast<std::size_t>(wt.count);
        auto* val = static_cast<uint32_t*>(std::malloc(sizeof(uint32_t) * words));
        if (val == nullptr && words != 0) {
            std::cerr << "Out of memory while loading weight '" << name << "'.\n";
            std::abort();
        }
        input >> std::hex;
        for (std::size_t x = 0; x < words; ++x) {
            input >> val[x];
        }
        // A failed extraction means the record is truncated or contains a non-hex token.
        if (!input) {
            std::cerr << "Invalid weight map file.\n";
            std::abort();
        }
        wt.values = val;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * @brief CPU preprocessing matching the torchvision ImageNet pipeline; 3-channel images only.
 *
 * Steps: optional BGR->RGB swap, bilinear resize to (w, h), scaling to [0, 1], per-channel
 * (x - mean) / std, then HWC -> CHW repacking. Note that `img` itself is modified in place.
 *
 * @param img opencv image with BGR layout
 * @param bgr2rgb whether to convert BGR to RGB
 * @param mean subtract mean
 * @param std divide std
 * @param n batch size
 * @param h resize height
 * @param w resize width
 * @return std::vector<float> contiguous flatten image data in float32 type; the same image is
 *         repeated n times
 */
static inline std::vector<float> preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array<const float, 3>& mean,
                                                const std::array<const float, 3>& std, int n, int h, int w) {
    const auto channels = img.channels();
    if (channels != 3) {
        std::cerr << "this demo only supports 3 channel input image.\n";
        std::abort();
    }

    if (bgr2rgb) {
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    }
    cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
    img.convertTo(img, CV_32FC3, 1.f / 255);
    img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]);

    const auto plane = h * w;              // elements in one channel plane
    const auto sample = channels * plane;  // elements in one CHW image
    std::vector<float> chw(static_cast<std::size_t>(n) * channels * h * w, 0.f);

    // fill all batch with the same input image
    for (int b = 0; b < n; ++b) {
        const int base = b * sample;
        for (int row = 0; row < h; ++row) {
            for (int col = 0; col < w; ++col) {
                const cv::Vec3f px = img.at<cv::Vec3f>(row, col);
                const int pixel = base + row * w + col;
                for (int ch = 0; ch < 3; ++ch) {
                    chw[pixel + ch * plane] = px[ch];
                }
            }
        }
    }
    return chw;
}

/**
 * @brief Return the k largest values of v together with their indices, best first.
 *
 * @param v scores
 * @param k number of entries requested; clamped to v.size()
 * @return (index, value) pairs sorted by descending value; empty when k <= 0 or v is empty
 */
static inline std::vector<std::pair<int, float>> topk(const std::vector<float>& v, int64_t k) {
    if (k <= 0 || v.empty())
        return {};
    // Never ask for more entries than exist: everything below must use `s`, not `k`.
    const auto s = std::min<std::ptrdiff_t>(k, static_cast<std::ptrdiff_t>(v.size()));

    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);

    // Only the first s positions need to be ordered.
    std::partial_sort(idx.begin(), std::next(idx.begin(), s), idx.end(), [&](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(static_cast<std::size_t>(s));
    for (std::ptrdiff_t i = 0; i < s; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}

/**
 * @brief Parse a label file whose lines look like `<index>: '<label>'`.
 *
 * For each line, the text before the first ':' is parsed as the integer index and the text
 * between the first pair of single quotes after the colon is taken as the label. Lines that
 * do not match this shape, including lines whose index is not a valid integer, are skipped.
 *
 * @param path label file path
 * @return index -> label map; empty when the file cannot be opened
 */
static inline std::map<int, std::string> loadImagenetLabelMap(const std::string& path) {
    std::map<int, std::string> labels;
    std::ifstream in(path);
    if (!in.is_open()) {
        return labels;
    }
    std::string line;
    while (std::getline(in, line)) {
        auto colon = line.find(':');
        if (colon == std::string::npos) {
            continue;
        }
        auto first_quote = line.find('\'', colon);
        if (first_quote == std::string::npos) {
            continue;
        }
        auto second_quote = line.find('\'', first_quote + 1);
        if (second_quote == std::string::npos) {
            continue;
        }
        int idx = 0;
        try {
            idx = std::stoi(line.substr(0, colon));
        } catch (const std::logic_error&) {
            // std::invalid_argument / std::out_of_range: treat like any other malformed line.
            continue;
        }
        labels[idx] = line.substr(first_quote + 1, second_quote - first_quote - 1);
    }
    return labels;
}

/**
 * @brief Append in-network image preprocessing: cast to float, NHWC -> NCHW, optional
 *        BGR -> RGB swap, then per-channel normalization.
 *
 * The normalization is a channel-wise scale layer computing x * scale + shift with
 *     scale = 1 / (std * 255),  shift = -mean / std
 * which equals (x / 255 - mean) / std.
 *
 * The scale/shift values are kept alive in a function-local static container because the
 * Weights objects passed to addScale() only hold pointers to them.
 *
 * @param network network the layers are added to
 * @param input   NHWC image tensor; a cast to float is inserted when its type is not kFLOAT
 * @param bgr2rgb whether to reverse the channel order
 * @param mean    per-channel mean
 * @param std     per-channel standard deviation
 * @return the final scale layer; its output 0 is the normalized NCHW tensor
 */
static inline ILayer* addTransformLayer(INetworkDefinition* network, ITensor& input, bool bgr2rgb,
                                        const std::array<const float, 3>& mean, const std::array<const float, 3>& std) {
    struct ScaleParams {
        std::array<float, 3> shift;
        std::array<float, 3> scale;
    };
    static std::vector<std::unique_ptr<ScaleParams>> gScaleParams;
    auto params = std::make_unique<ScaleParams>();
    params->shift = {-mean[0] / std[0], -mean[1] / std[1], -mean[2] / std[2]};
    params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)};

    static const Weights empty{DataType::kFLOAT, nullptr, 0ll};
    const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll};
    const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll};

    // Moving the unique_ptr keeps the pointed-to arrays (and thus the Weights above) valid.
    gScaleParams.emplace_back(std::move(params));

    ITensor* in = &input;
    if (input.getType() != DataType::kFLOAT) {
#if TRT_VERSION >= 8000
        auto* cast = network->addCast(input, DataType::kFLOAT);
        assert(cast);
        cast->setName("Cast to FP32");
        in = cast->getOutput(0);
#else
        auto* identity = network->addIdentity(input);
        assert(identity);
        identity->setName("Convert to FP32");
        identity->setOutputType(0, DataType::kFLOAT);
        in = identity->getOutput(0);
#endif
    }
    // Convert from NHWC to NCHW
    auto* perm = network->addShuffle(*in);
    assert(perm);
    perm->setName("NHWC -> NCHW");
    perm->setFirstTranspose(Permutation{0, 3, 1, 2});

    // Convert from BGR to RGB (optional)
    ITensor* data{nullptr};
    if (bgr2rgb) {
        // Extract one channel plane of the NCHW tensor as its own tensor.
        auto add_slice = [&](int c, const char* name) -> ITensor* {
            auto dims = perm->getOutput(0)->getDimensions();
            Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1};
            Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]};
            auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride);
            // Validate before the first dereference.
            assert(_slice && _slice->getNbOutputs() == 1);
            _slice->setName(name);
            return _slice->getOutput(0);
        };
        // Channels are re-concatenated in reverse order: source index 2, 1, 0.
        std::array<ITensor*, 3> channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")};
        auto* cat = network->addConcatenation(channels.data(), 3);
        assert(cat);
        cat->setName("RGB");
        cat->setAxis(1);
        data = cat->getOutput(0);
    } else {
        data = perm->getOutput(0);
    }

    // Normalize
    auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty);
    assert(trans);
    trans->setName("mean & std");
#if TRT_VERSION >= 8000
    trans->setChannelAxis(1);
#endif
    return trans;
}

/**
 * @brief Size in bytes of one element of the given TensorRT data type.
 *
 * Handles kINT8, kFLOAT, kHALF, kINT32 and, for TRT_VERSION >= 8510, kUINT8.
 * Any other type prints an error and aborts the process.
 *
 * @param dt TensorRT data type
 * @return element size in bytes
 */
static inline size_t getSize(DataType dt) {
#if TRT_VERSION >= 8510
    if (dt == DataType::kUINT8) {
        return sizeof(int8_t);
    }
#endif
    if (dt == DataType::kINT8) {
        return sizeof(int8_t);
    }
    if (dt == DataType::kFLOAT) {
        return sizeof(float);
    }
    if (dt == DataType::kHALF) {
        // Half precision values occupy 16 bits.
        return sizeof(int16_t);
    }
    if (dt == DataType::kINT32) {
        return sizeof(int32_t);
    }
    std::cerr << "Unsupported data type\n";
    std::abort();
}


================================================
FILE: mlp/CMakeLists.txt
================================================
# Build script for the `mlp` demo: a single executable built from mlp.cpp and linked against
# the CUDA runtime and TensorRT.
cmake_minimum_required(VERSION 3.17.0)

project(
  mlp
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default GPU architectures, used only when the caller did not set CMAKE_CUDA_ARCHITECTURES.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      60
      70
      72
      75
      80
      86
      89)
endif()

# C++17 / CUDA C++17 are mandatory for every target in this directory.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Emit compile_commands.json for tooling.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

# NOTE(review): this option is declared here but not referenced again in this file.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)

# Define the TensorRT::TensorRT target only once, e.g. when several demos are configured
# from one parent project.
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

add_executable(${PROJECT_NAME} mlp.cpp)

# Headers (logging.h, macros.h, ...) live next to this CMakeLists.txt.
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_LIST_DIR})

target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart
                                             TensorRT::TensorRT)


================================================
FILE: mlp/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Filters the candidate directories passed after the two named arguments (ARGN) and keeps
# every directory that exists and contains all entries of the `required_files` list.
# The accepted directories are returned, in candidate order, as a list in <var_name> in the
# caller's scope. Configuration stops with a FATAL_ERROR when no candidate is accepted.
function(_guess_path var_name required_files)
  set(_result "")

  foreach(path_entry IN LISTS ARGN)
    # Skip candidates that are not present on this machine.
    if(NOT EXISTS "${path_entry}")
      message(DEBUG "skip non-existing path '${path_entry}'")
      continue()
    endif()

    # A candidate is accepted only if every required file is found inside it.
    set(_ok TRUE)
    foreach(required_file IN LISTS required_files)
      if(NOT EXISTS "${path_entry}/${required_file}")
        set(_ok FALSE)
        message(DEBUG "'${path_entry}' missing '${required_file}'")
        break()
      endif()
    endforeach()

    if(_ok)
      list(APPEND _result "${path_entry}")
      message(DEBUG "accept '${path_entry}'")
    else()
      message(DEBUG "reject '${path_entry}'")
    endif()
  endforeach()

  if(_result STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  # Hand the result back to the caller.
  set(${var_name}
      "${_result}"
      PARENT_SCOPE)
endfunction()

# Imported interface target that carries the TensorRT libraries and include directories.
# Consumers link against TensorRT::TensorRT.
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

# The TensorRT version can be given as a cache variable (-DTRT_VERSION=...) or through the
# TRT_VERSION environment variable; the environment variable wins when both are set.
set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# The environment expansion is quoted so that an unset variable still yields one (empty)
# argument instead of disappearing from the condition.
if(NOT TRT_VERSION STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# The major version is the first run of digits in the version string.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

# The major version drives the library selection below, so fail with a clear message instead
# of a malformed if()/string() invocation when it is unknown.
if(TRT_MAJOR_VERSION STREQUAL "")
  message(
    FATAL_ERROR
      "TRT_VERSION is not set or contains no version number; pass -DTRT_VERSION=<version> or set the TRT_VERSION environment variable"
  )
endif()

if(WIN32)
  # Windows: TensorRT is expected in its default, versioned installation directory.
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(
      FATAL_ERROR
        "TensorRT_DIR=${TensorRT_DIR} does not exist!"
    )
  endif()

  # From TensorRT 10 on, the library names carry a "_10" suffix.
  if(${TRT_MAJOR_VERSION} GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  # Unix: probe a list of well-known, architecture-specific locations.
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include"
                                "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  # The extra libraries are only requested for TensorRT 8 and newer.
  set(_modules nvinfer nvinfer_plugin)
  if(${TRT_MAJOR_VERSION} GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Resolve every requested library inside the selected library directories.
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

# Do not leak helper variables into the including scope.
unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)


================================================
FILE: mlp/README.md
================================================
# mlp

MLP is the most basic network in this tensorrtx project and is intended for beginners. You can learn the basic procedure for building a TensorRT application from the provided APIs. The process of building a TensorRT engine is explained in the chart below.

![TensorRT Image](https://user-images.githubusercontent.com/33795294/148565279-795b12da-5243-4e7e-881b-263eb7658683.jpg)

This demo creates a single-layer MLP with `TensorRT >= 7.2.x` version support.

## Helper Files

`logging.h` : A logger file for using NVIDIA TensorRT API (mostly same for all models)

`mlp.wts` : Converted weight file, can be generated from [pytorchx/mlp](https://github.com/wang-xinyu/pytorchx/tree/master/mlp), for mlp, it looks like:

```bash
2
linear.weight 1 3fff7e32
linear.bias 1 3c138a5a
```

(you can create `mlp.wts` and copy this content into it directly)

## TensorRT C++ API

see [HERE](../README.md#how-to-run)

## TensorRT Python API

1. Generate mlp.wts (from `pytorchx` or create on your own)

2. Put mlp.wts into tensorrtx/mlp (if using the generated weights)

3. Run
   ```bash
   cd tensorrtx/mlp
   python mlp.py -s   # serialize model to plan file, i.e. 'mlp.engine'
   python mlp.py -d   # deserialize plan file and run inference
   ```


================================================
FILE: mlp/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <cstdint>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include <utility>
#include "NvInferRuntime.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to a target stream, prefixed with a timestamp and a fixed tag
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted log lines
    //! \param prefix    tag inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    // Copies the target stream reference and the flag and moves the prefix;
    // the buffered characters of `other` are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! \brief Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer contents>" to the target stream, then empties the
    //!        buffer and flushes the target. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp; it is formatted in a local stream so that the fill/width manipulators
            // never leak into the target stream (or into std::cout), and it is written to the same
            // stream as the message so both always end up in the same place
            std::time_t timestamp = std::time(nullptr);
            const tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                std::ostringstream stamp;
                stamp << std::setfill('0') << "[";
                stamp << std::setw(2) << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << tm_local->tm_mday << "/";
                stamp << std::setw(4) << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << tm_local->tm_hour << ":";
                stamp << std::setw(2) << tm_local->tm_min << ":";
                stamp << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput << stamp.str();
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // destination of the formatted log lines
    std::string mPrefix;    // tag written after the timestamp
    bool mShouldLog;        // gate checked by putOutput()
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
   public:
    // Constructs the owned buffer from the target stream, the message prefix and the logging gate.
    LogStreamConsumerBase(std::ostream& stream, std::string prefix, bool shouldLog)
        : mBuffer{stream, std::move(prefix), shouldLog} {}

   protected:
    // Buffer made available to derived classes.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Builds a consumer for messages of level \p severity.
    //!  Output is enabled only when \p severity <= \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Move construction: a fresh buffer is created from the severity and logging flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other) noexcept
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer should emit output and forwards the decision to the buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    // std::cout for kINFO and above in enum order, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    // Bracketed one-letter tag for the given severity.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   private:
    // Forward declaration; the definition is in the private section further down.
    // It is needed here because TestAtom's constructor takes a TestInfo.
    struct TestInfo;

   public:
    //! \brief Creates a logger whose reportable severity defaults to Severity::kWARNING
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult : std::uint8_t {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is streamed into a temporary LogStreamConsumer together with a "[TRT] " tag. No explicit
    //! flush is issued here; the text is written out by the consumer's buffer when the temporary is destroyed.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n';
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        // Only Logger may construct a TestAtom or read its fields.
        friend class Logger;

        // Takes ownership of the name and command line carried by `info`.
        TestAtom(bool started, TestInfo info)
            : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {}

        bool mStarted;         // set to true by Logger::reportTestStart()
        std::string mName;     // test name printed in result lines
        std::string mCmdline;  // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom{false, TestInfo{name, cmdline}};
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        // the arguments are joined with single spaces
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // note: the RUNNING line is printed before the precondition is asserted
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed and returns EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed and returns EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived and returns EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() or reportFail() depending on \p pass
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the severity threshold currently stored in this logger
    [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    // Name and command line of a test, bundled for TestAtom construction.
    struct TestInfo {
        std::string name;
        std::string cmdline;
    };
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    //! NOTE(review): no caller of this helper is visible inside this class - confirm whether it is still needed.
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints a line of the form "&&&& <RESULT> <name> # <cmdline>" to the stream selected for kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << '\n';
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            // separator goes between arguments only, never before the first one
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    // Threshold handed to every LogStreamConsumer created by log().
    Severity mReportableSeverity;
};

namespace {

//!
//! \brief Creates a LogStreamConsumer for messages of severity kVERBOSE
//!
//! Usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief Creates a LogStreamConsumer for messages of severity kINFO
//!
//! Usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief Creates a LogStreamConsumer for messages of severity kWARNING
//!
//! Usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief Creates a LogStreamConsumer for messages of severity kERROR
//!
//! Usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief Creates a LogStreamConsumer for messages of severity kINTERNAL_ERROR ("fatal" severity)
//!
//! Usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: mlp/macros.h
================================================
#pragma once
#include <NvInfer.h>

// API: symbol visibility attribute.
//   - API_EXPORTS defined:     dllexport (MSVC) / default visibility (other compilers)
//   - API_EXPORTS not defined: dllimport (MSVC) / empty (other compilers)
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_VERSION: single integer built from the TensorRT version components so that it can be compared in
// preprocessor conditions (e.g. 7220, 8000).
// NOTE(review): the components are weighted 1000/100/10/1, so a minor, patch or build number above 9
// spills into the next field - keep that in mind when adding new thresholds.
#define TRT_VERSION \
    ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD)

// Refuse to compile against a TensorRT older than the minimum this demo targets.
#if TRT_VERSION < 7220
#error "TensorRT >= 7.2.2 is required for this demo."
#endif

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` from TRT_VERSION 8000 on and to nothing
// below that, so the same override declarations can be compiled against both API generations.
#if TRT_VERSION >= 8000
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: mlp/mlp.cpp
================================================
#include <array>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <map>
#include <numeric>
#include <string>
#include <vector>
#include "logging.h"
#include "utils.h"

using namespace nvinfer1;

// Number of elements per sample used to size the input and output buffers in doInference()
constexpr static const int64_t INPUT_SIZE = 1;
constexpr static const int64_t OUTPUT_SIZE = 1;
// Tensor names given to the network input and output when the engine is built
constexpr static const char* INPUT_NAME = "data";
constexpr static const char* OUTPUT_NAME = "out";
// Weight file read while building and plan file written by `-s` / read by `-d`;
// both are relative paths, so they are resolved against the current working directory
constexpr static const char* WTS_PATH = "../models/mlp.wts";
constexpr static const char* ENGINE_PATH = "../models/mlp.engine";

// Logger from TRT API
// (shared by the builder and the runtime created in this file)
static Logger gLogger;

/**
 * Create a single-layer "MLP" using the TRT Builder and Configurations
 *
 * The network computes `data * linear.weight^T + linear.bias` with a matrix-multiply layer followed by an
 * element-wise sum, using the weights loaded from WTS_PATH.
 *
 * @param N: max batch size for built TRT model (first dimension of the input tensor)
 * @param runtime: runtime used to deserialize the serialized network (only used when TRT_VERSION >= 8000)
 * @param builder: to build engine and networks
 * @param config: configuration related to Hardware
 * @param dt: datatype for model layers
 * @return engine: TRT model
 */
ICudaEngine* createMLPEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    std::cout << "[INFO]: Creating MLP using TensorRT...\n";
    assert(builder != nullptr && config != nullptr);

    // Load Weights from relevant file
    std::map<std::string, Weights> weightMap = loadWeights(WTS_PATH);

    // Create an empty network
#if TRT_VERSION >= 10000
    auto* network = builder->createNetworkV2(0);
#else
    auto* network = builder->createNetworkV2(1u << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
#endif
    assert(network != nullptr);

    // Create an input with proper name
    ITensor* data = network->addInput(INPUT_NAME, dt, Dims4{N, 1, 1, 1});
    assert(data);

    // constant tensors holding the weights; each layer pointer is checked before it is dereferenced
    auto* fc1wLayer = network->addConstant(Dims4{1, 1, 1, 1}, weightMap["linear.weight"]);
    auto* fc1bLayer = network->addConstant(Dims4{1, 1, 1, 1}, weightMap["linear.bias"]);
    assert(fc1wLayer && fc1bLayer);
    auto* fc1w = fc1wLayer->getOutput(0);
    auto* fc1b = fc1bLayer->getOutput(0);
    assert(fc1w && fc1b);

    // fc layer: matrix multiply followed by the bias addition
    auto* fc1_0 = network->addMatrixMultiply(*data, MatrixOperation::kNONE, *fc1w, MatrixOperation::kTRANSPOSE);
    assert(fc1_0);
    auto* fc1_1 = network->addElementWise(*fc1_0->getOutput(0), *fc1b, ElementWiseOperation::kSUM);
    assert(fc1_1);
    fc1_0->setName("fc1_0");

    // set output with name
    auto* output = fc1_1->getOutput(0);
    output->setName(OUTPUT_NAME);

    // mark the output
    network->markOutput(*output);

#if TRT_VERSION >= 8000
    assert(runtime != nullptr);
    IHostMemory* serialized_mem = builder->buildSerializedNetwork(*network, *config);
    assert(serialized_mem != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(serialized_mem->data(), serialized_mem->size());
    // the serialized plan is only an intermediate here; release it once the engine has been created
    delete serialized_mem;
    delete network;
#else
    (void)runtime;  // not needed on this path
    builder->setMaxBatchSize(N);
    config->setMaxWorkspaceSize(WORKSPACE_SIZE);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    network->destroy();
#endif
    assert(engine != nullptr);

    // Release host memory (only after the build, which still reads the weight values)
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return engine;
}

/**
 * Create engine using TensorRT APIs and serialize it
 *
 * @param maxBatchSize: for the deployed model configs
 * @param runtime: runtime forwarded to createMLPEngine()
 * @param modelStream: out-parameter that receives the serialized model; the caller releases it
 */
void APIToModel(int32_t maxBatchSize, IRuntime* runtime, IHostMemory** modelStream) {
    assert(modelStream != nullptr);

    // Create builder with the logger; both objects are checked before use
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Build an engine
    ICudaEngine* engine = createMLPEngine(maxBatchSize, runtime, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // serialize the engine into binary stream
    (*modelStream) = engine->serialize();

#if TRT_VERSION >= 8000
    delete engine;
    delete config;
    delete builder;
#else
    engine->destroy();
    config->destroy();
    builder->destroy();
#endif
}

/**
 * Perform inference using the CUDA ctx
 *
 * Device buffers are allocated, the input is copied to the device, the network is enqueued on a
 * dedicated stream, the output is copied back and all resources are released before returning.
 *
 * @param ctx: context created by engine
 * @param input: input from the host
 * @param output: output to save on host
 * @param batchSize: batch size for TRT model
 */
void doInference(IExecutionContext& ctx, void* input, float* output, int64_t batchSize = 1) {
    // Get engine from the ctx
    const ICudaEngine& engine = ctx.getEngine();

    // The name-based I/O tensor API (getNbIOTensors/getIOTensorName/setTensorAddress/enqueueV3)
    // only exists from TensorRT 8.5 on; older 8.x releases must use the binding-based API.
#if TRT_VERSION >= 8500
    const int32_t nIO = engine.getNbIOTensors();
    const int inputIndex = 0;
    const int outputIndex = nIO - 1;
#else
    const int32_t nIO = engine.getNbBindings();
    const int inputIndex = engine.getBindingIndex(INPUT_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_NAME);
#endif
    // mlp contains 1 input and 1 output; checked at run time so that release builds
    // never index the buffer vector out of range
    if (nIO != 2) {
        std::cerr << "[ERROR]: expected 2 I/O tensors but the engine has " << nIO << "\n";
        std::abort();
    }

    // create cuda stream for async cuda operations
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // create GPU buffers on cuda device and copy input data from host
    std::vector<void*> buffers(nIO, nullptr);
    const size_t outputSize = batchSize * OUTPUT_SIZE * sizeof(float);
#if TRT_VERSION >= 8500
    auto* input_name = engine.getIOTensorName(inputIndex);
    const size_t inputSize = batchSize * INPUT_SIZE * getSize(engine.getTensorDataType(input_name));
#else
    const size_t inputSize = batchSize * INPUT_SIZE * getSize(engine.getBindingDataType(inputIndex));
#endif
    CHECK(cudaMalloc(&buffers[inputIndex], inputSize));
    CHECK(cudaMalloc(&buffers[outputIndex], outputSize));
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputSize, cudaMemcpyHostToDevice, stream));

    // execute inference using ctx provided by engine.
    // The enqueue call must NOT live inside assert(): with NDEBUG the whole expression would be
    // compiled out and no inference would run.
#if TRT_VERSION >= 8500
    for (int32_t i = 0; i < nIO; i++) {
        auto const name = engine.getIOTensorName(i);
        auto dims = ctx.getTensorShape(name);
        auto total = std::accumulate(dims.d, dims.d + dims.nbDims, 1ll, std::multiplies<>());
        std::cout << name << "\t" << total << "\n";
        ctx.setTensorAddress(name, buffers[i]);
    }
    const bool enqueued = ctx.enqueueV3(stream);
#else
    const bool enqueued = ctx.enqueueV2(buffers.data(), stream, nullptr);
#endif
    if (!enqueued) {
        std::cerr << "[ERROR]: failed to enqueue inference\n";
        std::abort();
    }

    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputSize, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));
    for (auto& buffer : buffers) {
        CHECK(cudaFree(buffer));
    }
    CHECK(cudaStreamDestroy(stream));
}

int main(int argc, char** argv) {
    checkTrtEnv();
    if (argc != 2) {
        std::cerr << "[ERROR]: Arguments not right!\n";
        std::cerr << "./mlp -s   // serialize model to plan file\n";
        std::cerr << "./mlp -d   // deserialize plan file and run inference\n";
        return 1;
    }

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    char* trtModelStream{nullptr};
    std::streamsize size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, runtime, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc);
        if (!p.good()) {
            std::cerr << "could not open plan output file\n";
            return 1;
        }
        if (modelStream->size() > static_cast<std::size_t>(std::numeric_limits<std::streamsize>::max())) {
            std::cerr << "this model is too large to serialize\n";
            return -1;
        }
        const auto* data_ptr = reinterpret_cast<const char*>(modelStream->data());
        auto data_size = static_cast<std::streamsize>(modelStream->size());
        p.write(data_ptr, data_size);

#if TRT_VERSION >= 8000
        delete modelStream;
#else
        modelStream->destroy();
#endif
        std::cout << "[INFO]: Successfully created TensorRT engine.\n";
        return 0;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file(ENGINE_PATH, std::ios::binary);

        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    }

#if TRT_VERSION >= 8000
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
#else
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
#endif
    assert(engine != nullptr);
    delete[] trtModelStream;

    IExecutionContext* ctx = engine->createExecutionContext();
    assert(ctx != nullptr);

    std::array<float, 1> output = {-1.f};
    std::array<float, 1> input = {12.0f};

    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::high_resolution_clock::now();
        doInference(*ctx, input.data(), output.data());
        auto end = std::chrono::high_resolution_clock::now();
        auto time = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Execution time: " << time << "us\n"
                  << "output: " << output[0] << "\n";
    }

#if TRT_VERSION >= 8000
    delete ctx;
    delete engine;
    delete runtime;
#else
    ctx->destroy();
    engine->destroy();
    runtime->destroy();
#endif

    return 0;
}


================================================
FILE: mlp/mlp.py
================================================
import argparse
import os
import numpy as np
import struct

# required for the model creation
import tensorrt as trt

# required for the inference using TRT engine
import pycuda.driver as cuda

# Sizes of input and output for TensorRT model
INPUT_SIZE = 1
OUTPUT_SIZE = 1

# path of .wts (weight file) and .engine (model file)
WEIGHT_PATH = "./mlp.wts"
ENGINE_PATH = "./mlp.engine"

# input and output names are must for the TRT model
INPUT_BLOB_NAME = 'data'
OUTPUT_BLOB_NAME = 'out'

# A logger provided by NVIDIA-TRT
gLogger = trt.Logger(trt.Logger.INFO)


################################
# DEPLOYMENT RELATED ###########
################################
def load_weights(file_path):
    """
    Parse the .wts file and store weights in dict format

    Expected layout: the first non-empty line holds the number of entries; every following
    line is `<name> <value_count> <hex> <hex> ...` where each hex token is a big-endian
    IEEE-754 float32.

    :param file_path: path of the .wts file
    :return weight_map: dictionary mapping each weight name to a 1-D float32 numpy array
    :raises AssertionError: if the file is missing, empty, or its counts are inconsistent
    """
    print(f"[INFO]: Loading weights: {file_path}")
    assert os.path.exists(file_path), '[ERROR]: Unable to load weight file.'

    with open(file_path, "r") as f:
        # blank lines (e.g. a trailing newline at the end of the file) carry no entry
        lines = [line.strip() for line in f if line.strip()]
    assert lines, '[ERROR]: Weight file is empty.'

    # count for total # of weights
    count = int(lines[0])
    assert count == len(lines) - 1

    weight_map = {}
    for entry in lines[1:count + 1]:
        # split on any run of whitespace so tabs / double spaces do not produce empty tokens
        splits = entry.split()
        name = splits[0]
        cur_count = int(splits[1])

        # the line must hold exactly `cur_count` values after the name and the count
        assert cur_count + 2 == len(splits)

        # hex string -> bytes -> float; unpack returns a 1-tuple, so take its only element
        # to get a flat array of shape (cur_count,) instead of (cur_count, 1)
        values = [struct.unpack(">f", bytes.fromhex(token))[0] for token in splits[2:]]

        # store in format of { 'weight.name': [weights_val0, weight_val1, ..] }
        weight_map[name] = np.array(values, dtype=np.float32)

    return weight_map


def create_mlp_engine(max_batch_size, builder, config, dt):
    """
    Build the single fully-connected-layer network and turn it into an engine.

    NOTE(review): `create_network()` without flags, `add_fully_connected`,
    `builder.max_batch_size` and `build_engine` look like legacy TensorRT Python APIs -
    confirm they exist in the TensorRT version being targeted.

    :param max_batch_size: batch size for built TRT model
    :param builder: to build engine and networks
    :param config: configuration related to Hardware
    :param dt: datatype for model layers
    :return engine: TRT model
    """
    print("[INFO]: Creating MLP using TensorRT...")
    # weights parsed from the .wts file, keyed by layer parameter name
    weights = load_weights(WEIGHT_PATH)

    # start from an empty network definition
    network = builder.create_network()

    # register the named network input
    input_tensor = network.add_input(INPUT_BLOB_NAME, dt, (1, 1, INPUT_SIZE))
    assert input_tensor

    # the only layer: fully connected with OUTPUT_SIZE outputs
    fc_layer = network.add_fully_connected(
        input=input_tensor,
        num_outputs=OUTPUT_SIZE,
        kernel=weights['linear.weight'],
        bias=weights['linear.bias'],
    )
    assert fc_layer

    # name the layer's output tensor, then declare it as the network output
    fc_layer.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(fc_layer.get_output(0))

    # batch size is configured on the builder before building
    builder.max_batch_size = max_batch_size

    # build the engine from the network definition and the builder config
    built_engine = builder.build_engine(network, config)

    # drop local references that are no longer needed
    del network
    del weights

    return built_engine


def api_to_model(max_batch_size):
    """
    Build the MLP engine and write its serialized form to ENGINE_PATH.

    :param max_batch_size: for the deployed model configs
    :return: None
    """
    # builder bound to the module-level TensorRT logger
    trt_builder = trt.Builder(gLogger)

    # builder configuration object
    builder_config = trt_builder.create_builder_config()

    # float32 MLP engine
    mlp_engine = create_mlp_engine(max_batch_size, trt_builder, builder_config, trt.float32)
    assert mlp_engine

    # persist the serialized engine
    print("[INFO]: Writing engine into binary...")
    with open(ENGINE_PATH, "wb") as engine_file:
        engine_file.write(mlp_engine.serialize())

    # drop local references
    del mlp_engine
    del trt_builder


################################
# INFERENCE RELATED ############
################################
def perform_inference(input_val):
    """
    Get inference using the pre-trained model

    Deserializes the engine stored at ENGINE_PATH, runs it once on `input_val`
    and prints the input together with the predicted output.

    :param input_val: a number as an input
    :return:
    """
    # PyCUDA needs an active CUDA context before any allocation or stream can be created;
    # importing pycuda.autoinit initialises the driver and creates that context. The import is
    # kept local because only the inference path needs a context.
    import pycuda.autoinit  # noqa: F401

    def do_inference(inf_context, inf_host_in, inf_host_out):
        """
        Perform inference using the CUDA context
        :param inf_context: context created by engine
        :param inf_host_in: input from the host
        :param inf_host_out: output to save on host
        :return:
        """

        inference_engine = inf_context.engine
        # Input and output bindings are required for inference
        assert inference_engine.num_bindings == 2

        # allocate device (GPU) memory matching the size of the host buffers
        device_in = cuda.mem_alloc(inf_host_in.nbytes)
        device_out = cuda.mem_alloc(inf_host_out.nbytes)

        # bindings are passed as raw device addresses: input first, output second
        bindings = [int(device_in), int(device_out)]

        # create CUDA stream on which the copies and the execution are enqueued
        stream = cuda.Stream()

        # copy input from host (CPU) to device (GPU) in stream
        cuda.memcpy_htod_async(device_in, inf_host_in, stream)

        # execute inference using context provided by engine
        inf_context.execute_async(bindings=bindings, stream_handle=stream.handle)

        # copy output back from device (GPU) to host (CPU)
        cuda.memcpy_dtoh_async(inf_host_out, device_out, stream)

        # wait until every operation enqueued on the stream has completed,
        # so that the host output buffer is valid when this function returns
        stream.synchronize()

    # create a runtime (required for deserialization of model) with NVIDIA's logger
    runtime = trt.Runtime(gLogger)
    assert runtime

    # read and deserialize engine for inference
    with open(ENGINE_PATH, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    assert engine

    # create execution context -- required for inference executions
    context = engine.create_execution_context()
    assert context

    # create input as array
    data = np.array([input_val], dtype=np.float32)

    # page-locked (pinned) HOST buffer for the input
    host_in = cuda.pagelocked_empty((INPUT_SIZE), dtype=np.float32)

    # copy the flattened input array into the pinned host buffer
    np.copyto(host_in, data.ravel())

    # page-locked (pinned) HOST buffer that receives the output
    host_out = cuda.pagelocked_empty(OUTPUT_SIZE, dtype=np.float32)

    # do inference using required parameters
    do_inference(context, host_in, host_out)

    print(f'\n[INFO]: Predictions using pre-trained model..\n\tInput:\t{input_val}\n\tOutput:\t{host_out[0]:.4f}')


def get_args():
    """
    Parse command line arguments.

    Exactly one of the two flags must be supplied:
      ``-s``  serialize model to engine file
      ``-d``  deserialize engine file and run inference

    :return arguments: parsed arguments as a dict with the boolean keys 's' and 'd'
    :raises SystemExit: with status 1 when neither or both flags are given
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-s', action='store_true')
    arg_parser.add_argument('-d', action='store_true')
    arguments = vars(arg_parser.parse_args())
    # the flags are mutually exclusive and one of them is mandatory (XOR)
    if not (arguments['s'] ^ arguments['d']):
        print("[ERROR]: Arguments not right!\n")
        print("\tpython mlp.py -s   # serialize model to engine file")
        print("\tpython mlp.py -d   # deserialize engine file and run inference")
        # A bare exit() would terminate with status 0 (success) and is only
        # available when the `site` module is loaded; report the failure
        # explicitly so shells and scripts can detect the usage error.
        raise SystemExit(1)

    return arguments


if __name__ == "__main__":
    # get_args() guarantees that exactly one of the two flags is set.
    cli_flags = get_args()
    if not cli_flags['s']:
        # '-d': load the serialized engine and run a single prediction
        perform_inference(input_val=4.0)
    else:
        # '-s': build the network and write the engine file
        api_to_model(max_batch_size=1)
        print("[INFO]: Successfully created TensorRT engine...")
        print("\n\tRun inference using `python mlp.py -d`\n")


================================================
FILE: mlp/utils.h
================================================
#pragma once
#include <cuda_runtime_api.h>
#include <cassert>
#include <fstream>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include "macros.h"

// NOTE(review): a using-directive at header scope exposes every nvinfer1 name
// to all translation units that include this header.
using namespace nvinfer1;

// 16 << 20 bytes, i.e. 16 MiB. The name suggests a TensorRT builder workspace
// limit; the place where it is consumed is not visible in this header.
constexpr const std::size_t WORKSPACE_SIZE = 16 << 20;

/**
 * @brief Abort the process when a CUDA runtime call does not succeed.
 *
 * Evaluates `status` exactly once; if the result differs from cudaSuccess the
 * numeric error code is written to std::cerr and std::abort() is called.
 * The do { ... } while (0) wrapper makes the macro behave like a single
 * statement, so it is safe inside unbraced if/else bodies.
 */
#define CHECK(status)                                     \
    do {                                                  \
        auto ret = (status);                              \
        if (ret != cudaSuccess) {                         \
            std::cerr << "Cuda failure: " << ret << "\n"; \
            std::abort();                                 \
        }                                                 \
    } while (0)

static void checkTrtEnv(int device = 0) {
#if TRT_VERSION < 8000
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    CHECK(cudaGetDeviceProperties(&prop, device));
    const int sm = prop.major * 10 + prop.minor;
    if (sm > 86) {
        std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU.";
        std::abort();
    }
#endif
}

/**
 * @brief TensorRT weight files have a simple space delimited format:
 * [type] [size] <data x size in hex>
 * 
 * @param file input weight file path
 * @return std::map<std::string, nvinfer1::Weights> 
 */
static auto loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << "\n";
    std::map<std::string, nvinfer1::Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> wt.count;

        // Load blob
        auto* val = new uint32_t[wt.count];
        input >> std::hex;
        for (auto x = 0ll; x < wt.count; ++x) {
            input >> val[x];
        }
        wt.values = val;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * @brief Size in bytes of one element of the given TensorRT data type.
 *
 * Handles kFLOAT, kHALF, kINT32, kINT8 and, when TRT_VERSION >= 8510, kUINT8.
 * Any other type prints a message to std::cerr and aborts the process.
 *
 * @param dt element data type
 * @return element size in bytes
 */
static size_t getSize(DataType dt) {
    size_t bytes = 0;
    switch (dt) {
        case DataType::kFLOAT:
            bytes = sizeof(float);
            break;
        case DataType::kHALF:
            // half precision is stored in 16 bits
            bytes = sizeof(int16_t);
            break;
        case DataType::kINT32:
            bytes = sizeof(int32_t);
            break;
        case DataType::kINT8:
#if TRT_VERSION >= 8510
        case DataType::kUINT8:
#endif
            bytes = sizeof(int8_t);
            break;
        default:
            std::cerr << "Unsupported data type\n";
            std::abort();
    }
    return bytes;
}


================================================
FILE: mnasnet/CMakeLists.txt
================================================
# CMake 3.18 is the first release that provides everything used below and in
# FindTensorRT.cmake: the CUDAToolkit package (3.17), CMAKE_CUDA_ARCHITECTURES
# (3.18) and ALIAS targets that refer to a non-GLOBAL imported target (3.18).
# Declaring an older minimum only postpones the failure to a less readable
# error later in the configure step.
cmake_minimum_required(VERSION 3.18)

project(
  mnasnet
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default GPU architectures; override with -DCMAKE_CUDA_ARCHITECTURES=...
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      60
      70
      72
      75
      80
      86
      89)
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

# Reuse the TensorRT target when an enclosing project already created it.
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

add_executable(${PROJECT_NAME} mnasnet.cpp)

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_LIST_DIR}
                                                  ${OpenCV_INCLUDE_DIRS})

target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart
                                             TensorRT::TensorRT ${OpenCV_LIBS})


================================================
FILE: mnasnet/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Stores in <var_name> (parent scope) the list of all candidate directories
# that exist and contain every entry of the ;-separated <required_files> list.
# Stops the configure step with FATAL_ERROR when no candidate qualifies.
function(_guess_path var_name required_files)
  set(_accepted "")

  foreach(path_entry IN LISTS ARGN)
    if(EXISTS "${path_entry}")
      # assume the directory is complete until a required file is missing
      set(_complete TRUE)
      foreach(required_file IN LISTS required_files)
        if(NOT EXISTS "${path_entry}/${required_file}")
          set(_complete FALSE)
          message(DEBUG "'${path_entry}' missing '${required_file}'")
          break()
        endif()
      endforeach()

      if(NOT _complete)
        message(DEBUG "reject '${path_entry}'")
      else()
        list(APPEND _accepted "${path_entry}")
        message(DEBUG "accept '${path_entry}'")
      endif()
    else()
      message(DEBUG "skip non-existing path '${path_entry}'")
    endif()
  endforeach()

  if("${_accepted}" STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_accepted}"
      PARENT_SCOPE)
endfunction()

# Imported interface target that carries the TensorRT include directories and
# libraries; consumers link against TensorRT::TensorRT.
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# The environment lookups are quoted on purpose: an unset variable expands to
# nothing, and an unquoted empty expansion removes the operand from the if()
# expression altogether.
if(NOT TRT_VERSION STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# Quoted input: string(REGEX MATCH) is a hard error when the input argument is
# missing, which is what an unquoted empty TRT_VERSION would produce.
# TRT_MAJOR_VERSION stays empty when no version was supplied.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(
      FATAL_ERROR
        "TensorRT_DIR=${TensorRT_DIR} does not exist!"
    )
  endif()

  # if() dereferences the bare variable name; an empty major version simply
  # compares false instead of producing a malformed expression.
  if(TRT_MAJOR_VERSION GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    set(_required_modules nvinfer_10 nvinfer_plugin_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
    set(_required_modules nvinfer nvinfer_plugin)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include"
                                "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  set(_modules nvinfer nvinfer_plugin)
  set(_required_modules nvinfer nvinfer_plugin)
  # The additional libraries are probed when the version is >= 8 or unknown;
  # whichever of them is not installed is skipped by the loop below.
  if(TRT_MAJOR_VERSION STREQUAL "" OR TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Collect the libraries that are actually installed. A missing core library is
# fatal; a missing optional one is skipped so that no "<var>-NOTFOUND" entry
# ends up on the link line.
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  if(TensorRT_${lib}_LIBRARY)
    list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
  elseif(lib IN_LIST _required_modules)
    message(
      FATAL_ERROR
        "Required TensorRT library '${lib}' not found in '${TensorRT_LIBRARY_DIR}'"
    )
  else()
    message(STATUS "Optional TensorRT library '${lib}' not found, skipping")
  endif()
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

# do not leak helper variables into the including scope
unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_required_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)


================================================
FILE: mnasnet/README.md
================================================
# mnasnet

MNASNet with depth multiplier of 0.5 from
"MnasNet: Platform-Aware Neural Architecture Search for Mobile" <https://arxiv.org/pdf/1807.11626.pdf>

For the PyTorch implementation, refer to [pytorchx/mnasnet](https://github.com/wang-xinyu/pytorchx/tree/master/mnasnet).

No special tricks are needed for this MNASNet implementation; only group convolution and batch normalization are used.

- The batch normalization layer is implemented with a TensorRT scale layer.

## Usage

1. use `gen_wts.py` to generate wts file

```bash
python gen_wts.py
```

2. build C++ code

```bash
pushd tensorrtx/mnasnet
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

3. serialize wts model to engine file

```bash
./build/mnasnet -s
```

4. run inference

```bash
./build/mnasnet -d
```

The output looks like:

```bash
...
====
Execution time: 0ms
-2.024, -1.266, -1.602, -1.465, -0.7756, -0.2096, 0.05945, 1.342, -0.2382, 1.279, 1.251, 0.2579, 1.836, -0.5296, 0.3196, 0.9055, -0.4915, 0.1604, -0.6305, -0.1019, -0.8816,
====
prediction result:
Top: 0 idx: 285, logits: 4.869, label: Egyptian cat
Top: 1 idx: 281, logits: 4.837, label: tabby, tabby cat
Top: 2 idx: 282, logits: 4.019, label: tiger cat
```


================================================
FILE: mnasnet/gen_wts.py
================================================
import struct

import cv2
import numpy as np
import torch
from torchvision.models import mnasnet0_5


# (name, model) pairs to export; main() uses the name as the stem of the
# generated `../models/<name>.wts` file.
# NOTE(review): the model is instantiated when this module is imported;
# `pretrained=True` presumably fetches the torchvision weights on first use — confirm.
MODELS = [("mnasnet0_5", mnasnet0_5(pretrained=True))]


def read_imagenet_labels() -> dict[int, str]:
    """
    Load the class-index to label table from the assets directory.

    Each line is expected to look like ``<index>: '<label>',``. The text after
    the ``": "`` separator is trimmed by one leading character and three
    trailing characters. When an index occurs more than once, the first
    occurrence is kept.

    Returns:
        dict[int, str]: class index -> label text
    """
    mapping: dict[int, str] = {}
    with open("../assets/imagenet1000_clsidx_to_labels.txt", "r") as handle:
        for line in handle:
            index_text, label_text = line.split(": ")
            index = int(index_text)
            if index not in mapping:
                # drop the opening quote and the closing quote, comma and newline
                mapping[index] = label_text[1:-3]
    return mapping


def preprocess(img: np.array) -> torch.Tensor:
    """
    Convert an OpenCV BGR image into a normalized network input.

    Steps: BGR -> RGB, scale to [0, 1] as float32, resize to 224x224 with
    bilinear interpolation, normalize per channel with the mean/std constants
    below, then reorder HWC -> CHW and prepend a batch axis.

    Args:
        img (np.array): input image as returned by ``cv2.imread``

    Returns:
        torch.Tensor: preprocessed image in `NCHW` layout
    """
    channel_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    channel_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    scaled = rgb.astype(np.float32) / 255.0
    resized = cv2.resize(scaled, (224, 224), interpolation=cv2.INTER_LINEAR)
    normalized = (resized - channel_mean) / channel_std
    # HWC -> CHW, then add the leading batch dimension
    chw = normalized.transpose(2, 0, 1)
    return torch.from_numpy(chw[None, ...])


def main():
    """
    Run every model in MODELS on the sample image and export its weights.

    For each model the top-3 predictions for ``../assets/cats.jpg`` are
    printed, then the state dict is written to ``../models/<name>.wts``: a
    first line with the number of entries, followed by one line per entry with
    the key, the element count and each element as big-endian float32 hex.
    """
    labels = read_imagenet_labels()
    img = preprocess(cv2.imread("../assets/cats.jpg", cv2.IMREAD_COLOR))

    for name, model in MODELS:
        model.eval()
        with torch.inference_mode():
            output = model(img)

        top3 = torch.topk(output, k=3).indices
        for i, batch in enumerate(top3):
            for j, idx in enumerate(batch):
                print(f"\tBatch: {i}, Top: {j}, logits: {output[i][idx]:.4f}, label: {labels[int(idx)]}")
        print("=" * 32)

        state = model.state_dict()
        with open(f"../models/{name}.wts", "w") as f:
            f.write("{}\n".format(len(state)))
            for key, tensor in state.items():
                print("key: ", key)
                print("value: ", tensor.shape)
                flat = tensor.reshape(-1).cpu().numpy()
                # one 8-digit hex word per element, big-endian IEEE-754 float32
                hex_words = [struct.pack(">f", float(item)).hex() for item in flat]
                f.write(" ".join(["{} {}".format(key, len(flat))] + hex_words))
                f.write("\n")


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


================================================
FILE: mnasnet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <cstdint>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include <utility>
#include "NvInferRuntime.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream, prefixed with a timestamp and a
//!  caller-supplied prefix, every time it is synchronized (flushed) and once more on destruction.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    destination stream; stored by reference and must outlive this buffer
    //! \param prefix    text inserted between the timestamp and the buffered message
    //! \param shouldLog when false, putOutput() writes nothing
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    //! Takes over destination, prefix and logging flag; text still buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the destination stream, empties the buffer and
    //! flushes the stream. Does nothing (and keeps the buffered text) when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // Prepend the timestamp on the destination stream itself, so that it stays attached to the message
            // when the destination is not std::cout. std::put_time does not modify the stream's fill character
            // or width, so the formatting state of the destination is left untouched.
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {  // localtime() may fail; then the message is logged without timestamp
                mOutput << "[" << std::put_time(tm_local, "%m/%d/%Y-%H:%M:%S") << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // destination stream (not owned)
    std::string mPrefix;    // severity tag written in front of every message
    bool mShouldLog;        // false suppresses all output
};

//!
//! \class LogStreamConsumerBase
//! \brief Holds the LogStreamConsumerBuffer so that, as a base class, it is constructed before the std::ostream
//!  base of LogStreamConsumer that receives the buffer's address.
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& destination, std::string tag, bool enabled)
        : mBuffer{destination, std::move(tag), enabled} {}

   protected:
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for log messages of one fixed severity, usable with the regular operator<< syntax.
//!  The base class order (LogStreamConsumerBase first, std::ostream second) is significant: the buffer owned by
//!  LogStreamConsumerBase has to be fully constructed before its address is handed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a stream for messages of level \p severity.
    //!  Output is produced only while `severity <= reportableSeverity` holds.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the already constructed buffer to the stream
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a fresh stream with the severity and logging flag of \p other.
    //!  Text that \p other has buffered but not yet flushed is not carried over.
    LogStreamConsumer(LogStreamConsumer&& other) noexcept
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the already constructed buffer to the stream
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this stream emits output against a new threshold.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! Severities below kINFO are routed to std::cerr, everything else to std::cout.
    static std::ostream& severityOstream(Severity severity) {
        if (severity < Severity::kINFO) {
            return std::cerr;
        }
        return std::cout;
    }

    //! Four-character tag identifying the severity; empty for values outside the enum.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   private:
    struct TestInfo;

   public:
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult : std::uint8_t {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n';
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, TestInfo info)
            : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom{false, TestInfo{name, cmdline}};
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    struct TestInfo {
        std::string name;
        std::string cmdline;
    };
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << '\n';
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief Creates a LogStreamConsumer for messages of severity kVERBOSE, using the logger's current threshold
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Creates a LogStreamConsumer for messages of severity kINFO, using the logger's current threshold
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Creates a LogStreamConsumer for messages of severity kWARNING, using the logger's current threshold
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Creates a LogStreamConsumer for messages of severity kERROR, using the logger's current threshold
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Creates a LogStreamConsumer for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        using the logger's current threshold
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: mnasnet/macros.h
================================================
#pragma once
#include <NvInfer.h>

// API: symbol visibility decoration.
//   - API_EXPORTS defined:  export the symbol (dllexport on MSVC, default visibility elsewhere)
//   - otherwise:            import on MSVC (dllimport), empty on other compilers
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TensorRT version folded into one integer: major*1000 + minor*100 + patch*10 + build,
// e.g. 8.5.1 build 0 -> 8510.
// NOTE(review): the components are not range-limited, so a minor, patch or build
// value of 10 or more carries into the next higher position — confirm that the
// comparisons against this macro still behave as intended for such releases.
#define TRT_VERSION \
    ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD)

// Reject TensorRT releases that encode below 7220 at compile time.
#if TRT_VERSION < 7220
#error "TensorRT >= 7.2.2 is required for this demo."
#endif

// From 8000 on, TRT_NOEXCEPT expands to `noexcept` and TRT_CONST_ENQUEUE to `const`;
// for older releases both expand to nothing, so one declaration can serve both.
#if TRT_VERSION >= 8000
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: mnasnet/mnasnet.cpp
================================================
#include <NvInfer.h>
#include <array>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <opencv2/opencv.hpp>
#include <stdexcept>
#include <string>
#include <vector>
#include "logging.h"

#include "utils.h"

// stuff we know about mnasnet and the input/output blobs
// Input height and width; SIZES below multiplies them by 3 for the input element count.
static constexpr const int INPUT_H = 224;
static constexpr const int INPUT_W = 224;
// Number of output values (second entry of SIZES).
static constexpr const int OUTPUT_SIZE = 1000;
// NOTE(review): presumably the batch size — its use is not visible in this part of the file.
static constexpr int N = 1;
// NOTE(review): looks like the input/output tensor names, index-aligned with SIZES — confirm.
static constexpr const std::array<const char*, 2> NAMES = {"data", "prob"};
// Element counts: 3 * INPUT_H * INPUT_W for the first entry, OUTPUT_SIZE for the second.
static constexpr const std::array<const int, 2> SIZES = {3 * INPUT_H * INPUT_W, OUTPUT_SIZE};
// File locations, relative to the working directory of the process.
static const std::string WTS_PATH = "../models/mnasnet0_5.wts";
static const std::string ENGINE_PATH = "../models/mnasnet0_5.engine";
static constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt";
// True when TRT_VERSION >= 8510.
// NOTE(review): presumably selects preprocessing inside the TensorRT network — confirm at the use site.
static constexpr const bool TRT_PREPROCESS = TRT_VERSION >= 8510 ? true : false;
// Per-channel normalization constants; same values as in gen_wts.py's preprocess().
static constexpr const std::array<const float, 3> mean = {0.485f, 0.456f, 0.406f};
static constexpr const std::array<const float, 3> stdv = {0.229f, 0.224f, 0.225f};

using namespace nvinfer1;
// Short aliases used throughout this file.
using WeightMap = std::map<std::string, Weights>;
using M = nvinfer1::MatrixOperation;
using E = nvinfer1::ElementWiseOperation;
using NDCF = nvinfer1::NetworkDefinitionCreationFlag;

// File-local logger instance, default-constructed (threshold kWARNING).
static Logger gLogger;

// Parameters of one conv + batch-norm (+ optional ReLU) unit built by CBR().
// Callers use positional aggregate initialization, so the member order must not change.
struct ConvParams {
    int o;               // number of output feature maps of the convolution
    int k;               // square kernel size (k x k)
    int s;               // stride, applied to both spatial dimensions
    int p;               // padding, applied to both spatial dimensions
    int d;               // dilation, applied to both spatial dimensions
    int g;               // number of convolution groups
    float eps = 1e-5f;   // epsilon forwarded to addBatchNorm2d
};

// Parameters of one inverted-residual block built by invertedRes().
// Callers use positional aggregate initialization, so the member order must not change.
struct InvertedResParams {
    int inch;  // input channel count; also compared with `o` to decide on the residual add
    int o;     // output channel count of the final 1x1 convolution
    int k;     // kernel size of the grouped (middle) convolution; its padding is k / 2
    int s;     // stride of the grouped (middle) convolution
    int exp;   // expansion factor: the middle convolutions use inch * exp channels
};

/**
 * @brief Add an inference-time BatchNorm2d as a per-channel TensorRT scale layer.
 *
 * The layer computes `input * scale + shift` per channel, with
 * `scale = gamma / sqrt(var + eps)` and `shift = beta - mean * gamma / sqrt(var + eps)`.
 * The three generated buffers are allocated with malloc() and stored in @p weightMap under
 * `<lname>.scale`, `<lname>.shift` and `<lname>.power`, so whoever releases the map releases them.
 *
 * Aborts with a diagnostic when a required tensor is missing, when the four tensors disagree
 * in length, or when an allocation fails.
 *
 * @param network   network definition the layer is added to
 * @param weightMap weight map; must contain `<lname>.weight`, `.bias`, `.running_mean`, `.running_var`
 * @param input     tensor to normalize
 * @param lname     name prefix of the batch-norm tensors in @p weightMap
 * @param eps       epsilon added to the variance before the square root
 * @return the created scale layer
 */
ILayer* addBatchNorm2d(INetworkDefinition* network, WeightMap& weightMap, ITensor& input, const std::string& lname,
                       float eps) {
    // Look the tensors up with find(): operator[] would insert an empty entry for a missing key
    // and the null `values` pointer would then be dereferenced below.
    auto require = [&](const char* suffix) -> const Weights& {
        auto it = weightMap.find(lname + suffix);
        if (it == weightMap.end() || it->second.values == nullptr) {
            std::cerr << "KeyError: " << lname << suffix << " is not in weight map\n";
            std::abort();
        }
        return it->second;
    };
    const Weights& w_gamma = require(".weight");
    const Weights& w_beta = require(".bias");
    const Weights& w_mean = require(".running_mean");
    const Weights& w_var = require(".running_var");

    const auto len = w_var.count;
    std::cout << lname << " running_var's len: " << len << "\n";
    if (len <= 0 || w_gamma.count != len || w_beta.count != len || w_mean.count != len) {
        std::cerr << "batch norm tensors of " << lname << " have inconsistent sizes\n";
        std::abort();
    }

    const float* gamma = static_cast<const float*>(w_gamma.values);
    const float* beta = static_cast<const float*>(w_beta.values);
    const float* mean = static_cast<const float*>(w_mean.values);
    const float* var = static_cast<const float*>(w_var.values);

    const std::size_t bytes = sizeof(float) * static_cast<std::size_t>(len);
    auto* scval = static_cast<float*>(malloc(bytes));
    auto* shval = static_cast<float*>(malloc(bytes));
    auto* pval = static_cast<float*>(malloc(bytes));
    if (scval == nullptr || shval == nullptr || pval == nullptr) {
        std::cerr << "out of memory while folding batch norm " << lname << "\n";
        std::abort();
    }

    // The index has the same (64-bit) type as the element count.
    for (decltype(w_var.count) i = 0; i < len; ++i) {
        const float denom = std::sqrt(var[i] + eps);
        scval[i] = gamma[i] / denom;
        shval[i] = beta[i] - mean[i] * gamma[i] / denom;
        pval[i] = 1.0f;
    }
    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Registering the buffers in the map keeps them alive until the map's blobs are released.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

/**
 * @brief Build a convolution -> batch norm -> (optional) ReLU unit.
 *
 * The convolution weights are read from `<name>.<start_index>.weight` and the batch-norm tensors
 * from the prefix `<name>.<start_index + 1>`. The convolution has no bias.
 * Aborts with a diagnostic when the convolution weights are missing or a layer cannot be created.
 *
 * @param net         network definition the layers are added to
 * @param map         weight map
 * @param name        common name prefix of the unit's tensors
 * @param input       input tensor
 * @param cp          convolution / batch-norm parameters
 * @param start_index index of the convolution inside @p name
 * @param has_relu    append a ReLU activation when true
 * @return the last layer of the unit (ReLU when @p has_relu, otherwise the batch-norm scale layer)
 */
ILayer* CBR(INetworkDefinition* net, WeightMap& map, const std::string& name, ITensor& input, const ConvParams& cp,
            int start_index = 0, bool has_relu = true) {
    Weights bias{DataType::kFLOAT, nullptr, 0};

    // conv -> bn -> relu
    auto conv_name = name + "." + std::to_string(start_index++) + ".weight";
    auto conv_weights = map.find(conv_name);
    if (conv_weights == map.end()) {
        // Report the key that was actually looked up, not just the prefix.
        std::cerr << "KeyError: " << conv_name << " is not in weight map\n";
        std::abort();
    }
    auto* conv = net->addConvolutionNd(input, cp.o, DimsHW{cp.k, cp.k}, conv_weights->second, bias);
    if (conv == nullptr) {
        std::cerr << "build conv layer failed in " << name;
        std::abort();
    }
    conv->setStrideNd(DimsHW{cp.s, cp.s});
    conv->setPaddingNd(DimsHW{cp.p, cp.p});
    conv->setDilationNd(DimsHW{cp.d, cp.d});
    conv->setNbGroups(cp.g);
    conv->setName(conv_name.c_str());

    // start_index was advanced above, so this is the entry right after the convolution.
    std::string bn_name = name + "." + std::to_string(start_index);
    auto* bn = addBatchNorm2d(net, map, *conv->getOutput(0), bn_name, cp.eps);
    if (!has_relu) {
        return bn;
    }

    auto* relu = net->addActivation(*bn->getOutput(0), ActivationType::kRELU);
    if (relu == nullptr) {
        std::cerr << "build relu layer failed in " << name;
        std::abort();
    }
    return relu;
}

/**
 * @brief Build one inverted-residual block: 1x1 conv -> BN -> ReLU -> grouped kxk conv -> BN -> ReLU
 *        -> 1x1 conv -> BN, plus an element-wise sum with the input when shapes allow it.
 *
 * Weights are read from @p w using the keys `<lname>layers.{0,3,6}.weight` for the convolutions
 * and the prefixes `<lname>layers.{1,4,7}` for the batch norms.
 *
 * @param network network definition the layers are added to
 * @param w       weight map
 * @param input   input tensor of the block
 * @param lname   name prefix of the block (including its trailing dot)
 * @param irp     block parameters
 * @return the element-wise sum layer when `inch == o` and the stride is 1, otherwise the last batch norm
 */
ILayer* invertedRes(INetworkDefinition* network, WeightMap& w, ITensor& input, const std::string& lname,
                    const InvertedResParams& irp) {
    std::cout << "Building layer: " << lname << "\n";
    static const Weights kNoBias{DataType::kFLOAT, nullptr, 0};
    const int hidden = irp.inch * irp.exp;
    const float bn_eps = 1e-5f;

    // 1x1 convolution to `hidden` channels
    auto* expand = network->addConvolutionNd(input, hidden, DimsHW{1, 1}, w[lname + "layers.0.weight"], kNoBias);
    assert(expand);
    auto* expand_bn = addBatchNorm2d(network, w, *expand->getOutput(0), lname + "layers.1", bn_eps);
    auto* expand_act = network->addActivation(*expand_bn->getOutput(0), ActivationType::kRELU);
    assert(expand_act);

    // kxk convolution with one group per channel
    const int pad = irp.k / 2;
    auto* grouped = network->addConvolutionNd(*expand_act->getOutput(0), hidden, DimsHW{irp.k, irp.k},
                                              w[lname + "layers.3.weight"], kNoBias);
    assert(grouped);
    grouped->setStrideNd(DimsHW{irp.s, irp.s});
    grouped->setPaddingNd(DimsHW{pad, pad});
    grouped->setNbGroups(hidden);
    auto* grouped_bn = addBatchNorm2d(network, w, *grouped->getOutput(0), lname + "layers.4", bn_eps);
    auto* grouped_act = network->addActivation(*grouped_bn->getOutput(0), ActivationType::kRELU);
    assert(grouped_act);

    // 1x1 convolution down to the block's output channels (no activation afterwards)
    auto* project = network->addConvolutionNd(*grouped_act->getOutput(0), irp.o, DimsHW{1, 1},
                                              w[lname + "layers.6.weight"], kNoBias);
    assert(project);
    auto* project_bn = addBatchNorm2d(network, w, *project->getOutput(0), lname + "layers.7", bn_eps);

    // The skip connection is only added when input and output have the same channel count and stride 1.
    const bool use_residual = (irp.s == 1) && (irp.inch == irp.o);
    if (!use_residual) {
        return project_bn;
    }
    auto* sum = network->addElementWise(*project_bn->getOutput(0), input, ElementWiseOperation::kSUM);
    assert(sum);
    return sum;
}

/**
 * @brief Create the engine using only the TensorRT network-definition API, not any parser.
 *
 * Loads the weights from WTS_PATH, builds the MnasNet graph, builds the engine and releases the
 * host weight blobs with free() before returning.
 *
 * @param maxBatchSize not used by this function; the network is built for batch size N
 * @param runtime      runtime used to deserialize the built plan (TRT_VERSION >= 8000 path)
 * @param builder      builder that creates the network and builds it
 * @param config       builder configuration; the workspace limit is set here
 * @param dt           input data type; overridden with kUINT8 when TRT_PREPROCESS is enabled
 * @return the built engine, or nullptr when the network could not be created or built
 *
 * NOTE(review): the TRT_PREPROCESS branch names DataType::kUINT8 inside a plain `if constexpr`,
 * so it is still compiled when TRT_PREPROCESS is false - confirm this builds on every TensorRT
 * version accepted by macros.h.
 */
ICudaEngine* createEngine(unsigned int maxBatchSize, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config,
                          DataType dt) {
    auto weightMap = loadWeights(WTS_PATH);

    // Release host memory of every blob in the map; used on all exit paths.
    auto releaseWeights = [&weightMap]() {
        for (auto& mem : weightMap) {
            free((void*)(mem.second.values));
        }
    };

#if TRT_VERSION >= 11200
    auto flag = 1U << static_cast<int>(NDCF::kSTRONGLY_TYPED);
#elif TRT_VERSION >= 10000
    auto flag = 0U;
#else
    auto flag = 1U << static_cast<int>(NDCF::kEXPLICIT_BATCH);
#endif
    auto* network = builder->createNetworkV2(flag);
    if (network == nullptr) {
        std::cerr << "failed to create network definition\n";
        releaseWeights();
        return nullptr;
    }

    ITensor* data{nullptr};
    if constexpr (TRT_PREPROCESS) {
        // uint8 NHWC image in, normalization done by the network itself
        dt = DataType::kUINT8;
        data = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3});
        assert(data);  // checked before it is dereferenced below
        auto* trans = addTransformLayer(network, *data, true, mean, stdv);
        data = trans->getOutput(0);
    } else {
        data = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W});
    }
    assert(data);

    // Stem: three conv/bn units stored under "layers.<idx>", each unit spanning three indices.
    int start_idx = 0;
    auto* cbr_0 = CBR(network, weightMap, "layers", *data, {16, 3, 2, 1, 1, 1}, start_idx, true);
    start_idx += 3;
    auto* cbr_1 = CBR(network, weightMap, "layers", *cbr_0->getOutput(0), {16, 3, 1, 1, 1, 16}, start_idx, true);
    start_idx += 3;
    auto* cbr_2 = CBR(network, weightMap, "layers", *cbr_1->getOutput(0), {8, 1, 1, 1, 1, 1}, start_idx, false);

    // Inverted-residual stages; the tuples are {inch, o, k, s, exp}.
    ILayer* ir1 = invertedRes(network, weightMap, *cbr_2->getOutput(0), "layers.8.0.", {8, 16, 3, 2, 3});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.8.1.", {16, 16, 3, 1, 3});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.8.2.", {16, 16, 3, 1, 3});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.9.0.", {16, 24, 5, 2, 3});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.9.1.", {24, 24, 5, 1, 3});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.9.2.", {24, 24, 5, 1, 3});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.10.0.", {24, 40, 5, 2, 6});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.10.1.", {40, 40, 5, 1, 6});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.10.2.", {40, 40, 5, 1, 6});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.11.0.", {40, 48, 3, 1, 6});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.11.1.", {48, 48, 3, 1, 6});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.12.0.", {48, 96, 5, 2, 6});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.12.1.", {96, 96, 5, 1, 6});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.12.2.", {96, 96, 5, 1, 6});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.12.3.", {96, 96, 5, 1, 6});
    ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.13.0.", {96, 160, 3, 1, 6});

    auto* cbr_3 = CBR(network, weightMap, "layers", *ir1->getOutput(0), {1280, 1, 1, 0, 1, 1}, 14, true);

    // Average over axes 2 and 3 (bit mask 0xc), then the classifier as matmul + bias add.
    auto* avg = network->addReduce(*cbr_3->getOutput(0), ReduceOperation::kAVG, 0xc, false);
    auto* _fcw = network->addConstant(DimsHW{1000, 1280}, weightMap["classifier.1.weight"]);
    auto* _fcb = network->addConstant(DimsHW{1, 1000}, weightMap["classifier.1.bias"]);
    auto* _fc1 = network->addMatrixMultiply(*avg->getOutput(0), M::kNONE, *_fcw->getOutput(0), M::kTRANSPOSE);
    auto* fc1 = network->addElementWise(*_fc1->getOutput(0), *_fcb->getOutput(0), E::kSUM);
    assert(fc1);

    fc1->getOutput(0)->setName(NAMES[1]);
    network->markOutput(*fc1->getOutput(0));

    // Build engine
#if TRT_VERSION >= 8000
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE);
    ICudaEngine* engine = nullptr;
    auto* _serialized = builder->buildSerializedNetwork(*network, *config);
    if (_serialized == nullptr) {
        // A failed build returns no plan; report it instead of dereferencing a null pointer.
        std::cerr << "failed to build serialized network\n";
    } else {
        engine = runtime->deserializeCudaEngine(_serialized->data(), _serialized->size());
        delete _serialized;
    }
    delete network;
#else
    builder->setMaxBatchSize(N);
    config->setMaxWorkspaceSize(WORKSPACE_SIZE);
    auto* engine = builder->buildEngineWithConfig(*network, *config);
    network->destroy();
#endif
    std::cout << "build out\n";

    // Release host memory (only after the build, which still reads the weights)
    releaseWeights();

    return engine;
}

/**
 * @brief Build the engine through the network API and serialize it.
 *
 * @param maxBatchSize forwarded to createEngine
 * @param runtime      runtime forwarded to createEngine
 * @param modelStream  receives the serialized engine; set to nullptr when any step fails
 */
void APIToModel(unsigned int maxBatchSize, IRuntime* runtime, IHostMemory** modelStream) {
    assert(modelStream != nullptr);
    *modelStream = nullptr;

    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    if (builder == nullptr) {
        std::cerr << "failed to create TensorRT builder\n";
        return;
    }
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = nullptr;
    if (config != nullptr) {
        engine = createEngine(maxBatchSize, runtime, builder, config, DataType::kFLOAT);
    }

    // Serialize the engine. The check is explicit so that it also holds when asserts are compiled out.
    if (engine != nullptr) {
        (*modelStream) = engine->serialize();
    } else {
        std::cerr << "failed to build TensorRT engine\n";
    }

    // Close everything down
#if TRT_VERSION >= 8000
    // deleting a null pointer is a no-op
    delete engine;
    delete config;
    delete builder;
#else
    if (engine != nullptr) {
        engine->destroy();
    }
    if (config != nullptr) {
        config->destroy();
    }
    builder->destroy();
#endif
}

/**
 * @brief Run one synchronous inference and return every output tensor as float data.
 *
 * Device buffers and the CUDA stream are created and destroyed on each call. Tensor 0 is treated
 * as the input, every further I/O tensor as an output; per-sample element counts come from SIZES.
 * Outputs are copied back as `float`.
 *
 * @param context    execution context of the deserialized engine
 * @param input      host pointer to the input data (batch_size * SIZES[0] elements of the input type)
 * @param batch_size number of samples in @p input
 * @return one vector per output tensor, each holding batch_size * SIZES[i] values
 */
std::vector<std::vector<float>> do_inference(IExecutionContext& context, void* input, std::size_t batch_size) {
    const ICudaEngine& engine = context.getEngine();
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // The name-based I/O tensor API (getNbIOTensors / setTensorAddress / enqueueV3) is only used
    // from TensorRT 8.5 on; older releases go through the binding-index API.
#if TRT_VERSION >= 8500
    const int32_t nIO = engine.getNbIOTensors();
#else
    const int32_t nIO = engine.getNbBindings();
#endif
    // SIZES / NAMES describe a fixed number of tensors; refuse engines that do not match them.
    if (nIO < 0 || static_cast<std::size_t>(nIO) > SIZES.size()) {
        std::cerr << "unexpected number of I/O tensors: " << nIO << "\n";
        std::abort();
    }

    std::vector<void*> buffers(static_cast<std::size_t>(nIO), nullptr);
    for (int32_t i = 0; i < nIO; ++i) {
#if TRT_VERSION >= 8500
        auto* tensor_name = engine.getIOTensorName(i);
        const std::size_t elem_size = getSize(engine.getTensorDataType(tensor_name));
#else
        const int32_t idx = engine.getBindingIndex(NAMES[i]);
        assert(idx == i);
        const std::size_t elem_size = getSize(engine.getBindingDataType(idx));
#endif
        const std::size_t size = elem_size * batch_size * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
#if TRT_VERSION >= 8500
        context.setTensorAddress(tensor_name, buffers[i]);
#endif
    }

    // The enqueue call must not live inside assert(): it would vanish when NDEBUG is defined.
#if TRT_VERSION >= 8500
    const bool enqueued = context.enqueueV3(stream);
#else
    const bool enqueued = context.enqueueV2(buffers.data(), stream, nullptr);
#endif
    if (!enqueued) {
        std::cerr << "TensorRT enqueue failed\n";
        std::abort();
    }

    std::vector<std::vector<float>> prob;
    prob.reserve(nIO > 1 ? static_cast<std::size_t>(nIO - 1) : 0);
    for (int32_t i = 1; i < nIO; ++i) {
        // Copy straight into the vector that is returned, so the destination of the asynchronous
        // copy stays alive until the stream has been synchronized.
        prob.emplace_back(batch_size * SIZES[i], std::nanf(""));
        std::vector<float>& host = prob.back();
        CHECK(cudaMemcpyAsync(host.data(), buffers[i], host.size() * sizeof(float), cudaMemcpyDeviceToHost,
                              stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    CHECK(cudaStreamDestroy(stream));
    for (int32_t i = 0; i < nIO; ++i) {
        CHECK(cudaFree(buffers[i]));
    }
    return prob;
}

/**
 * @brief Entry point: `-s` builds the engine and writes it to ENGINE_PATH, `-d` loads the engine
 *        from ENGINE_PATH, runs inference 100 times on ../assets/cats.jpg and prints the top-3 classes.
 *
 * @return 0 on success, -1 on wrong usage or when a file / TensorRT object cannot be obtained
 */
int main(int argc, char** argv) {
    checkTrtEnv();
    if (argc != 2) {
        std::cerr << "arguments not right!\n";
        std::cerr << "./mnasnet -s   // serialize model to plan file\n";
        std::cerr << "./mnasnet -d   // deserialize plan file and run inference\n";
        return -1;
    }

    auto* runtime = createInferRuntime(gLogger);
    if (runtime == nullptr) {
        std::cerr << "could not create TensorRT runtime\n";
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char* trt_model_stream{nullptr};
    std::streamsize size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(N, runtime, &modelStream);
        if (modelStream == nullptr) {
            std::cerr << "could not build the engine\n";
            return -1;
        }

        std::ofstream p(ENGINE_PATH, std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file\n";
            return -1;
        }
        if (modelStream->size() > static_cast<std::size_t>(std::numeric_limits<std::streamsize>::max())) {
            std::cerr << "this model is too large to serialize\n";
            return -1;
        }
        const auto* data_ptr = reinterpret_cast<const char*>(modelStream->data());
        auto data_size = static_cast<std::streamsize>(modelStream->size());
        p.write(data_ptr, data_size);
#if TRT_VERSION >= 8000
        delete modelStream;
#else
        modelStream->destroy();
#endif
        return 0;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file(ENGINE_PATH, std::ios::binary);
        if (!file.good()) {
            // Without the plan file there is nothing to deserialize; say so instead of passing
            // a null buffer on to TensorRT.
            std::cerr << "could not open plan file " << ENGINE_PATH << ", run with -s first\n";
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        if (size <= 0) {
            std::cerr << "plan file " << ENGINE_PATH << " is empty or unreadable\n";
            return -1;
        }
        trt_model_stream = new char[size];
        file.read(trt_model_stream, size);
        if (!file) {
            std::cerr << "could not read plan file " << ENGINE_PATH << "\n";
            delete[] trt_model_stream;
            return -1;
        }
        file.close();
    } else {
        return -1;
    }

#if TRT_VERSION >= 8000
    auto* engine = runtime->deserializeCudaEngine(trt_model_stream, size);
#else
    auto* engine = runtime->deserializeCudaEngine(trt_model_stream, size, nullptr);
#endif
    if (engine == nullptr) {
        std::cerr << "could not deserialize engine from " << ENGINE_PATH << "\n";
        delete[] trt_model_stream;
        return -1;
    }
    auto* context = engine->createExecutionContext();
    if (context == nullptr) {
        std::cerr << "could not create execution context\n";
        delete[] trt_model_stream;
        return -1;
    }

    void* input = nullptr;
    std::vector<float> flat_img;
    cv::Mat img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR);
    if (img.empty()) {
        std::cerr << "could not read image ../assets/cats.jpg\n";
        delete[] trt_model_stream;
        return -1;
    }
    if constexpr (TRT_PREPROCESS) {
        // for simplicity, resize image on cpu side
        cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        input = static_cast<void*>(img.data);
    } else {
        flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W);
        input = flat_img.data();
    }

    for (int32_t i = 0; i < 100; ++i) {
        auto _start = std::chrono::system_clock::now();
        auto prob = do_inference(*context, input, 1);
        auto _end = std::chrono::system_clock::now();
        auto _time = std::chrono::duration_cast<std::chrono::milliseconds>(_end - _start).count();
        std::cout << "Execution time: " << _time << "ms\n";

        // print the leading values of every output tensor
        for (const auto& vector : prob) {
            int idx = 0;
            for (auto v : vector) {
                std::cout << std::setprecision(4) << v << ", " << std::flush;
                if (++idx > 20) {
                    std::cout << "\n====\n";
                    break;
                }
            }
        }

        // only the last iteration reports the classification result
        if (i == 99 && !prob.empty()) {
            std::cout << "prediction result:\n";
            auto labels = loadImagenetLabelMap(LABELS_PATH);
            int _top = 0;
            for (auto& [idx, logits] : topk(prob[0], 3)) {
                std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits
                          << ", label: " << labels[idx] << "\n";
            }
        }
    }

    // Release TensorRT objects in reverse order of creation.
#if TRT_VERSION >= 8000
    delete context;
    delete engine;
    delete runtime;
#else
    context->destroy();
    engine->destroy();
    runtime->destroy();
#endif
    delete[] trt_model_stream;
    return 0;
}


================================================
FILE: mnasnet/utils.h
================================================
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <numeric>
#include <opencv2/opencv.hpp>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "macros.h"

// Makes the TensorRT types (ILayer, ITensor, DataType, ...) usable unqualified in this header
// and in every file that includes it.
using namespace nvinfer1;

// Builder workspace limit in bytes: 16 << 20 = 16 MiB.
constexpr const std::size_t WORKSPACE_SIZE = 16 << 20;

// Evaluate a CUDA runtime call once and abort with a diagnostic when it did not return cudaSuccess.
// The message carries the numeric code, the runtime's description of it and the call site.
// The local is named check_status_ so it cannot capture a variable used inside `status`.
#define CHECK(status)                                                                                  \
    do {                                                                                               \
        const auto check_status_ = (status);                                                           \
        if (check_status_ != cudaSuccess) {                                                            \
            std::cerr << "Cuda failure: " << check_status_ << " ("                                     \
                      << cudaGetErrorString(static_cast<cudaError_t>(check_status_)) << ") at "        \
                      << __FILE__ << ":" << __LINE__ << "\n";                                          \
            std::abort();                                                                              \
        }                                                                                              \
    } while (0)

static void checkTrtEnv(int device = 0) {
#if TRT_VERSION < 8000
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    CHECK(cudaGetDeviceProperties(&prop, device));
    const int sm = prop.major * 10 + prop.minor;
    if (sm > 86) {
        std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU.";
        std::abort();
    }
#endif
}

/**
 * @brief TensorRT weight files have a simple space delimited format:
 * [type] [size] <data x size in hex>
 * 
 * @param file input weight file path
 * @return std::map<std::string, nvinfer1::Weights> 
 */
static std::map<std::string, nvinfer1::Weights> loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << "\n";
    std::map<std::string, nvinfer1::Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> wt.count;

        // Load blob
        auto* val = new uint32_t[wt.count];
        input >> std::hex;
        for (auto x = 0ll; x < wt.count; ++x) {
            input >> val[x];
        }
        wt.values = val;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * @brief CPU preprocessing for a 3-channel image: optional BGR->RGB swap, resize, scaling to
 * [0, 1], per-channel mean/std normalization and repacking from interleaved HWC to planar CHW.
 *
 * The image passed in is modified in place (converted, resized and normalized).
 * Aborts when the image does not have exactly 3 channels.
 *
 * @param img opencv image with BGR layout
 * @param bgr2rgb whether to convert BGR to RGB
 * @param mean per-channel mean to subtract
 * @param std per-channel standard deviation to divide by
 * @param n batch size; the same image is written to every batch slot
 * @param h resize height
 * @param w resize width
 * @return std::vector<float> contiguous n * 3 * h * w float32 values in NCHW order
 */
static std::vector<float> preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array<const float, 3>& mean,
                                         const std::array<const float, 3>& std, int n, int h, int w) {
    const auto channels = img.channels();
    const auto sample_size = channels * h * w;
    if (channels != 3) {
        std::cerr << "this demo only supports 3 channel input image.\n";
        std::abort();
    }

    if (bgr2rgb) {
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    }
    cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
    img.convertTo(img, CV_32FC3, 1.f / 255);
    img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]);

    std::vector<float> planar(static_cast<std::size_t>(n) * channels * h * w, 0.f);
    const int plane = h * w;

    // every batch slot receives the same image
    for (int b = 0; b < n; ++b) {
        const int base = b * sample_size;
        for (int row = 0; row < h; ++row) {
            for (int col = 0; col < w; ++col) {
                const cv::Vec3f pixel = img.at<cv::Vec3f>(row, col);
                const int offset = base + row * w + col;
                for (int ch = 0; ch < 3; ++ch) {
                    planar[offset + ch * plane] = pixel[ch];
                }
            }
        }
    }
    return planar;
}

/**
 * @brief Return the k largest values of @p v together with their indices, largest first.
 *
 * @param v scores
 * @param k number of entries wanted; values larger than v.size() are clamped to v.size()
 * @return up to k (index, value) pairs in descending value order; empty when k <= 0 or v is empty
 */
static auto topk(const std::vector<float>& v, int k) -> std::vector<std::pair<int, float>> {
    if (k <= 0 || v.empty())
        return {};
    // Clamp once and use the clamped count everywhere: an iterator at begin() + k would lie
    // past end() whenever k exceeds the number of scores.
    const std::size_t count = std::min<std::size_t>(static_cast<std::size_t>(k), v.size());

    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);

    std::partial_sort(idx.begin(), idx.begin() + static_cast<std::ptrdiff_t>(count), idx.end(),
                      [&](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(count);
    for (std::size_t i = 0; i < count; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}

/**
 * @brief Read a class-index -> label table from a text file with one `<index>: '<label>'` entry per line.
 *
 * The label is the text between the first two single quotes after the colon. Whitespace and an
 * opening '{' in front of the index are skipped, so a dict-literal style file is accepted as well.
 * Lines that do not match, or whose index is not a valid int, are ignored.
 *
 * @param path label file path
 * @return index -> label; empty when the file cannot be opened
 */
static std::map<int, std::string> loadImagenetLabelMap(const std::string& path) {
    std::map<int, std::string> labels;
    std::ifstream in(path);
    if (!in.is_open()) {
        return labels;
    }
    std::string line;
    while (std::getline(in, line)) {
        const auto colon = line.find(':');
        if (colon == std::string::npos) {
            continue;
        }
        const auto first_quote = line.find('\'', colon);
        if (first_quote == std::string::npos) {
            continue;
        }
        const auto second_quote = line.find('\'', first_quote + 1);
        if (second_quote == std::string::npos) {
            continue;
        }

        // Skip decoration in front of the number, e.g. the '{' that opens a dict literal.
        const auto key_begin = line.find_first_not_of(" \t{");
        if (key_begin == std::string::npos || key_begin >= colon) {
            continue;
        }
        int idx = 0;
        try {
            idx = std::stoi(line.substr(key_begin, colon - key_begin));
        } catch (const std::invalid_argument&) {
            continue;  // key is not a number
        } catch (const std::out_of_range&) {
            continue;  // key does not fit into an int
        }
        labels[idx] = line.substr(first_quote + 1, second_quote - first_quote - 1);
    }
    return labels;
}

/**
 * @brief Add in-network image preprocessing: cast to FP32, NHWC -> NCHW, optional BGR -> RGB
 * channel swap and per-channel normalization `x * 1 / (255 * std) - mean / std`.
 *
 * The shift/scale values handed to TensorRT live in a function-local static list, so the
 * pointers stored in the Weights structs stay valid after this function returns.
 *
 * @param network network definition the layers are added to
 * @param input   image tensor in NHWC layout
 * @param bgr2rgb reverse the channel order when true
 * @param mean    per-channel mean
 * @param std     per-channel standard deviation
 * @return the final scale layer; its output 0 is the normalized NCHW tensor
 */
static ILayer* addTransformLayer(INetworkDefinition* network, ITensor& input, bool bgr2rgb,
                                 const std::array<const float, 3>& mean, const std::array<const float, 3>& std) {
    struct ScaleParams {
        std::array<float, 3> shift;
        std::array<float, 3> scale;
    };
    static std::vector<std::unique_ptr<ScaleParams>> gScaleParams;
    auto params = std::make_unique<ScaleParams>();
    params->shift = {-mean[0] / std[0], -mean[1] / std[1], -mean[2] / std[2]};
    params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)};

    static const Weights empty{DataType::kFLOAT, nullptr, 0ll};
    const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll};
    const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll};

    // Moving the unique_ptr does not move the pointee, so the data() pointers above stay valid.
    gScaleParams.emplace_back(std::move(params));

    ITensor* in = &input;
    if (input.getType() != DataType::kFLOAT) {
        // NOTE(review): confirm addCast exists on every TensorRT version matched by this guard.
#if TRT_VERSION >= 8000
        auto* cast = network->addCast(input, DataType::kFLOAT);
        assert(cast);
        cast->setName("Cast to FP32");
        in = cast->getOutput(0);
#else
        auto* identity = network->addIdentity(input);
        assert(identity);
        identity->setName("Convert to FP32");
        identity->setOutputType(0, DataType::kFLOAT);
        in = identity->getOutput(0);
#endif
    }
    // Convert from NHWC to NCHW
    auto* perm = network->addShuffle(*in);
    assert(perm);
    perm->setName("NHWC -> NCHW");
    perm->setFirstTranspose(Permutation{0, 3, 1, 2});

    // Convert from BGR to RGB (optional)
    ITensor* data{nullptr};
    if (bgr2rgb) {
        // Cut out channel `c` as an N x 1 x H x W tensor.
        auto add_slice = [&](int c, const char* name) -> ITensor* {
            auto dims = perm->getOutput(0)->getDimensions();
            Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1};
            Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]};
            auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride);
            // validate the layer before it is used
            assert(_slice && _slice->getNbOutputs() == 1);
            _slice->setName(name);
            return _slice->getOutput(0);
        };
        // Source channels 2, 1, 0 concatenated in that order reverse the channel order.
        std::array<ITensor*, 3> channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")};
        auto* cat = network->addConcatenation(channels.data(), 3);
        assert(cat);
        cat->setName("RGB");
        cat->setAxis(1);
        data = cat->getOutput(0);
    } else {
        data = perm->getOutput(0);
    }

    // Normalize
    auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty);
    assert(trans);
    trans->setName("mean & std");
#if TRT_VERSION >= 8000
    trans->setChannelAxis(1);
#endif
    return trans;
}

/**
 * @brief Size in bytes of one element of the given TensorRT data type.
 *
 * Handles kFLOAT, kINT32, kHALF, kINT8 and, when TRT_VERSION >= 8510, kUINT8.
 * Any other type prints a message and aborts.
 */
static size_t getSize(DataType dt) {
    size_t bytes = 0;
    switch (dt) {
        case DataType::kFLOAT:
            bytes = sizeof(float);
            break;
        case DataType::kINT32:
            bytes = sizeof(int32_t);
            break;
        case DataType::kHALF:
            bytes = sizeof(int16_t);
            break;
#if TRT_VERSION >= 8510
        case DataType::kUINT8:
#endif
        case DataType::kINT8:
            bytes = sizeof(int8_t);
            break;
        default:
            std::cerr << "Unsupported data type\n";
            std::abort();
    }
    return bytes;
}


================================================
FILE: mobilenet/mobilenetv2/CMakeLists.txt
================================================
# CMake 4.x refuses projects that request compatibility with versions older than 3.5.
cmake_minimum_required(VERSION 3.5)

project(mobilenet)

# Compiler flags belong in compile options; add_definitions is meant for -D defines.
add_compile_options(-std=c++11)

# option() takes <variable> "<help text>" <default value>.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
# Debug stays the default, but a build type given on the command line is respected.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

add_executable(mobilenet ${PROJECT_SOURCE_DIR}/mobilenet_v2.cpp)
target_compile_options(mobilenet PRIVATE -O2 -pthread)
target_link_libraries(mobilenet nvinfer)
target_link_libraries(mobilenet cudart)


================================================
FILE: mobilenet/mobilenetv2/README.md
================================================
# mobilenet v2

MobileNetV2 architecture from
     "MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>.

For the PyTorch implementation, refer to [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet).

The following tricks are used in this MobileNet implementation:

- ReLU6 is used in MobileNetV2. It is implemented in TensorRT as `Relu6(x) = Relu(x) - Relu(x-6)`.
- The BatchNorm layer is implemented with a scale layer.

```
// 1. generate mobilenet.wts from [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet)

// 2. put mobilenet.wts into tensorrtx/mobilenet

// 3. build and run

cd tensorrtx/mobilenet/mobilenetv2

mkdir build

cd build

cmake ..

make

sudo ./mobilenet -s   // serialize model to plan file i.e. 'mobilenet.engine'

sudo ./mobilenet -d   // deserialize plan file and run inference

// 4. see if the output is same as pytorchx/mobilenet
```

### TensorRT Python API

```
# 1. generate mobilenetv2.wts from [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet)

# 2. put mobilenetv2.wts into tensorrtx/mobilenet/mobilenetv2

# 3. install Python dependencies (tensorrt/pycuda/numpy)

cd tensorrtx/mobilenet/mobilenetv2

python mobilenet_v2.py -s   // serialize model to plan file i.e. 'mobilenetv2.engine'
python mobilenet_v2.py -d   // deserialize plan file and run inference

# 4. see if the output is same as pytorchx/mobilenet
```


================================================
FILE: mobilenet/mobilenetv2/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, on every sync and on destruction, writes its accumulated text to a
//!  target stream, preceded by a timestamp (printed on std::cout) and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream the prefixed text is written to
    //! \param prefix    text put in front of every flushed message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Takes over the configuration of \p other. Every member is initialized; leaving mShouldLog
    //! unset would make putOutput() read an indeterminate value. The prefix is copied rather than
    //! moved because \p other may still flush its own pending text when it is destroyed. Text
    //! already buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Emits "[MM/DD/YYYY-HH:MM:SS] " on std::cout, then prefix + buffered text on the target
    //! stream, and clears the buffer. Does nothing while logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp
            // It is formatted in a local stream so the sticky fill character of std::cout is
            // left untouched for the rest of the program.
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                std::ostringstream stamp;
                stamp << std::setfill('0');
                stamp << "[";
                stamp << std::setw(2) << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << tm_local->tm_mday << "/";
                stamp << std::setw(4) << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << tm_local->tm_hour << ":";
                stamp << std::setw(2) << tm_local->tm_min << ":";
                stamp << std::setw(2) << tm_local->tm_sec << "] ";
                std::cout << stamp.str();
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // target stream for prefix + message
    std::string mPrefix;    // text written in front of every message
    bool mShouldLog;        // gate checked by putOutput()
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
   public:
    //! \brief Constructs the owned buffer; all three arguments are forwarded to it unchanged.
    //! \param stream    stream the buffer writes to
    //! \param prefix    text the buffer puts in front of each message
    //! \param shouldLog whether the buffer emits output at all
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog) {}

   protected:
    // Buffer owned by this base; derived classes hand its address to std::ostream.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Builds a stream that logs messages at \p severity.
    //!  Output is enabled only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          // mBuffer is already constructed by the first base, so its address is safe to hand over here
          std::ostream(&mBuffer),
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Move constructor: sets up a fresh buffer from other's severity and logging flag.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          // same ordering requirement as in the primary constructor
          std::ostream(&mBuffer),
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Recomputes the logging flag against a new reportable severity and forwards it to the buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! \brief Selects std::cout for severities >= kINFO and std::cerr for everything else.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter prefix.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                // unknown severity: trips in debug builds, yields an empty prefix otherwise
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Constructs a Logger whose reportable severity is \p severity (kWARNING when omitted).
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief State of a test as reported through reportTestStart()/reportTestEnd()
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible accessor for the nvinfer1::ILogger associated with this Logger
    //! \return This object, viewed through its nvinfer1::ILogger base
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] " followed by \p msg through a LogStreamConsumer built from the current
    //! reportable severity and the message severity.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) override {
        LogStreamConsumer consumer(mReportableSeverity, severity);
        consumer << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity New reportable severity; it is handed to each LogStreamConsumer created by log().
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}(). Only Logger can construct one or read its fields.
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;         //!< set by reportTestStart()
        std::string mName;     //!< test name printed in result lines
        std::string mCmdline;  //!< command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // the RUNNING line is printed before the precondition is asserted
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED for the test and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED for the test and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED for the test and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() or reportFail() depending on \p pass.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        if (pass) {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current reportable severity.
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        const char* text = "";
        switch (result) {
            case TestResult::kRUNNING:
                text = "RUNNING";
                break;
            case TestResult::kPASSED:
                text = "PASSED";
                break;
            case TestResult::kFAILED:
                text = "FAILED";
                break;
            case TestResult::kWAIVED:
                text = "WAIVED";
                break;
            default:
                assert(0);
                break;
        }
        return text;
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; nothing is quoted or escaped.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream joined;
        for (int idx = 0; idx < argc; ++idx) {
            if (idx != 0) {
                joined << " ";
            }
            joined << argv[idx];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: mobilenet/mobilenetv2/mobilenet_v2.cpp
================================================
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"

// CHECK(status): evaluates `status` exactly once and treats any non-zero result as fatal.
// On failure it prints "Cuda failure: <value>" to std::cerr and calls abort().
// The do { ... } while (0) wrapper makes the macro usable as a single statement.
// Note: the expansion declares a local named `ret`, so `status` must not refer to an outer `ret`.
#define CHECK(status)                                          \
    do {                                                       \
        auto ret = (status);                                   \
        if (ret != 0) {                                        \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)

// Fixed properties of the network and its input/output blobs.
// INPUT_H / INPUT_W: spatial size of the 3-channel input tensor (see addInput in createEngine).
// OUTPUT_SIZE: number of floats per sample in the output buffer (see doInference).
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;

// Tensor names used when building the network and when looking up binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance passed to createInferBuilder() and createInferRuntime().
static Logger gLogger;

// Load weights from a plain-text .wts file.
// Layout as parsed below: the first token is the number of blobs, then for each blob
//   <name> <size (decimal)> <size values, each a 32-bit word written in hex>
// Every blob is recorded as DataType::kFLOAT; the hex words are stored bit-for-bit in a
// malloc'ed buffer referenced by Weights::values. The caller owns those buffers and must
// release them with free() (createEngine does this once the engine is built).
//
// \param file path of the weight file
// \return map from blob name to its Weights descriptor
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The buffer is sized per element: the previous sizeof(val) measured the
        // pointer itself and therefore allocated twice the needed memory on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer computing
//   y = x * gamma / sqrt(var + eps) + (beta - mean * gamma / sqrt(var + eps))
// from the "<lname>.weight", ".bias", ".running_mean" and ".running_var" entries of weightMap.
// The computed scale/shift/power arrays are malloc'ed and registered in weightMap under
// "<lname>.scale", ".shift" and ".power", so whoever frees the map frees them as well.
IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                            std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    const int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // One pass fills all three arrays; each expression is kept exactly as in the separate loops.
    for (int c = 0; c < len; ++c) {
        scval[c] = gamma[c] / sqrt(var[c] + eps);
        shval[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        pval[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Keep the buffers reachable through the map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn);
    return bn;
}

// Adds convolution -> batch norm -> ReLU6 and returns the final element-wise layer.
// ReLU6 is assembled from existing layers as relu(x) - relu(x - 6).
//   outch: output channels, ksize: square kernel size, s: stride, g: group count.
// Convolution weights come from "<lname>0.weight" and batch-norm parameters from "<lname>1.*".
// The three one-element arrays for the (x - 6) scale layer are malloc'ed and stored in
// weightMap under "<lname>cbr.{scale,shift,power}".
IElementWiseLayer* convBnRelu(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                              int outch, int ksize, int s, int g, std::string lname) {
    // Allocates a one-element float array holding `v` and wraps it in a Weights descriptor.
    auto scalarWeights = [](float v) {
        float* buf = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
        buf[0] = v;
        return Weights{DataType::kFLOAT, buf, 1};
    };

    const Weights noBias{DataType::kFLOAT, nullptr, 0};
    const int pad = (ksize - 1) / 2;

    IConvolutionLayer* conv =
            network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + "0.weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{pad, pad});
    conv->setNbGroups(g);

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + "1", 1e-5);

    // Branch 1: relu(x)
    IActivationLayer* reluX = network->addActivation(*bn->getOutput(0), ActivationType::kRELU);
    assert(reluX);

    // Branch 2: relu(x - 6), where the subtraction is a uniform scale layer (shift = -6, scale = 1, power = 1)
    const Weights shift = scalarWeights(-6.0f);
    const Weights scale = scalarWeights(1.0f);
    const Weights power = scalarWeights(1.0f);
    weightMap[lname + "cbr.scale"] = scale;
    weightMap[lname + "cbr.shift"] = shift;
    weightMap[lname + "cbr.power"] = power;
    IScaleLayer* minusSix = network->addScale(*bn->getOutput(0), ScaleMode::kUNIFORM, shift, scale, power);
    assert(minusSix);

    IActivationLayer* reluShifted = network->addActivation(*minusSix->getOutput(0), ActivationType::kRELU);
    assert(reluShifted);

    // relu(x) - relu(x - 6)
    IElementWiseLayer* relu6 =
            network->addElementWise(*reluX->getOutput(0), *reluShifted->getOutput(0), ElementWiseOperation::kSUB);
    assert(relu6);
    return relu6;
}

// Adds one inverted-residual block and returns its last layer.
//   exp != 1: 1x1 expansion (conv.0) -> 3x3 depthwise (conv.1) -> 1x1 projection (conv.2) -> BN (conv.3)
//   exp == 1:                           3x3 depthwise (conv.0) -> 1x1 projection (conv.1) -> BN (conv.2)
// When s == 1 and inch == outch, the block input is added to the BN output and the sum layer
// is returned; otherwise the BN layer itself is returned.
ILayer* invertedRes(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                    std::string lname, int inch, int outch, int s, int exp) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};
    const int hidden = inch * exp;
    const bool expand = (exp != 1);

    // Sub-module names shift by one when the expansion stage is present.
    const std::string depthwiseName = lname + (expand ? "conv.1." : "conv.0.");
    const std::string projectionKey = lname + (expand ? "conv.2.weight" : "conv.1.weight");
    const std::string bnName = lname + (expand ? "conv.3" : "conv.2");

    ITensor* x = &input;
    if (expand) {
        IElementWiseLayer* expanded = convBnRelu(network, weightMap, *x, hidden, 1, 1, 1, lname + "conv.0.");
        x = expanded->getOutput(0);
    }

    // Depthwise 3x3: the group count equals the channel count.
    IElementWiseLayer* depthwise = convBnRelu(network, weightMap, *x, hidden, 3, s, hidden, depthwiseName);

    IConvolutionLayer* projection = network->addConvolutionNd(*depthwise->getOutput(0), outch, DimsHW{1, 1},
                                                              weightMap[projectionKey], noBias);
    assert(projection);
    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *projection->getOutput(0), bnName, 1e-5);

    if (s == 1 && inch == outch) {
        IElementWiseLayer* residual = network->addElementWise(input, *bn->getOutput(0), ElementWiseOperation::kSUM);
        assert(residual);
        return residual;
    }
    return bn;
}

// Create the engine using only the network-definition API and not any parser.
// Loads "../mobilenet.wts", assembles the layer stack, builds the engine with the given
// builder/config, then destroys the network and frees every host buffer held by the weight map.
// Returns whatever buildEngineWithConfig() returns; the caller checks it for null.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../mobilenet.wts");

    // Stem convolution
    IElementWiseLayer* stem = convBnRelu(network, weightMap, *data, 32, 3, 2, 1, "features.0.");

    // Inverted-residual stack: {weight prefix, in channels, out channels, stride, expansion}
    struct BlockSpec {
        const char* prefix;
        int inch;
        int outch;
        int stride;
        int expand;
    };
    const BlockSpec blocks[] = {
            {"features.1.", 32, 16, 1, 1},     {"features.2.", 16, 24, 2, 6},    {"features.3.", 24, 24, 1, 6},
            {"features.4.", 24, 32, 2, 6},     {"features.5.", 32, 32, 1, 6},    {"features.6.", 32, 32, 1, 6},
            {"features.7.", 32, 64, 2, 6},     {"features.8.", 64, 64, 1, 6},    {"features.9.", 64, 64, 1, 6},
            {"features.10.", 64, 64, 1, 6},    {"features.11.", 64, 96, 1, 6},   {"features.12.", 96, 96, 1, 6},
            {"features.13.", 96, 96, 1, 6},    {"features.14.", 96, 160, 2, 6},  {"features.15.", 160, 160, 1, 6},
            {"features.16.", 160, 160, 1, 6},  {"features.17.", 160, 320, 1, 6},
    };

    ITensor* x = stem->getOutput(0);
    for (const BlockSpec& spec : blocks) {
        ILayer* block = invertedRes(network, weightMap, *x, spec.prefix, spec.inch, spec.outch, spec.stride,
                                    spec.expand);
        x = block->getOutput(0);
    }

    // Final 1x1 convolution to 1280 channels
    IElementWiseLayer* head = convBnRelu(network, weightMap, *x, 1280, 1, 1, 1, "features.18.");

    // 7x7 average pooling followed by the 1000-way classifier
    IPoolingLayer* pool = network->addPoolingNd(*head->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(pool);

    IFullyConnectedLayer* fc = network->addFullyConnected(*pool->getOutput(0), 1000, weightMap["classifier.1.weight"],
                                                          weightMap["classifier.1.bias"]);
    assert(fc);

    fc->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*fc->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory: file-loaded blobs and the arrays added by the layer helpers
    for (auto& entry : weightMap) {
        free((void*)(entry.second.values));
    }

    return engine;
}

// Builds the network through createEngine() in FP32 and stores the serialized engine in
// *modelStream. The builder, config and engine created here are destroyed before returning;
// the caller owns the returned IHostMemory.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    auto* builder = createInferBuilder(gLogger);
    auto* config = builder->createBuilderConfig();

    // Populate the network, mark its output and build the engine
    auto* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Hand the serialized plan to the caller
    *modelStream = engine->serialize();

    // Tear down in reverse order of creation
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference pass.
//   context:   execution context of an engine with exactly two bindings (input and output)
//   input:     host buffer with batchSize * 3 * INPUT_H * INPUT_W floats
//   output:    host buffer receiving batchSize * OUTPUT_SIZE floats
//   batchSize: number of samples in this call
// Device buffers and the CUDA stream are created and released inside the call.
// Any CUDA error, or a failed enqueue, aborts the process.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Byte sizes are computed once, in size_t, and reused for allocation and copies
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    // enqueue() reports failure through its return value; without this check the copy below
    // would hand back the uninitialized contents of the output buffer
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work surface here, so the result is checked like every other CUDA call
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//   ./mobilenet -s   build the network and write the serialized engine to "mobilenet.engine"
//   ./mobilenet -d   load "mobilenet.engine", run 100 timed inferences on a constant input
//                    and print the last output vector
// Returns 0 on success and -1 on a usage or file error.
int main(int argc, char** argv) {
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./mobilenet -s   // serialize model to plan file" << std::endl;
        std::cerr << "./mobilenet -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char* trtModelStream{nullptr};
    size_t size{0};

    const std::string mode(argv[1]);
    if (mode == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("mobilenet.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();  // do not leak the serialized engine on the error path
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        // Serialization succeeded: report success to the shell (this used to return 1)
        return 0;
    } else if (mode == "-d") {
        std::ifstream file("mobilenet.engine", std::ios::binary);
        if (!file.good()) {
            // Without this check a null buffer of size 0 would be passed to deserializeCudaEngine()
            std::cerr << "could not read mobilenet.engine, run with -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        file.read(trtModelStream, size);
        if (!file) {
            std::cerr << "failed to read mobilenet.engine completely" << std::endl;
            delete[] trtModelStream;
            return -1;
        }
        file.close();
    } else {
        return -1;
    }

    // Dummy input: every element is set to 1.0 (no real image is loaded or preprocessed)
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference 100 times and print the wall-clock duration of each call
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the output values; a row index and newline follow every value whose index is a multiple of 10
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++) {
        std::cout << prob[i] << ", ";
        if (i % 10 == 0)
            std::cout << i / 10 << std::endl;
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: mobilenet/mobilenetv2/mobilenet_v2.py
================================================
import os
import sys
import struct
import argparse

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401
import tensorrt as trt

# Fixed properties of the network and its input/output tensors.
BATCH_SIZE = 1
# Spatial size of the 3-channel input tensor (see network.add_input in create_engine).
INPUT_H = 224
INPUT_W = 224
OUTPUT_SIZE = 1000
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"
# Epsilon passed to add_batch_norm_2d for every batch-norm layer.
EPS = 1e-5

# Default locations of the text weight file and the engine file, relative to the working directory.
WEIGHT_PATH = "./mobilenetv2.wts"
ENGINE_PATH = "./mobilenetv2.engine"

# TensorRT logger created at INFO level.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)


def load_weights(file):
    """Parse a text ``.wts`` file into a dict of float32 arrays.

    The first line holds the number of entries. Each following line is
    ``<name> <count> <count big-endian float32 values as hex>``, separated
    by single spaces.

    Args:
        file: path of the weight file.

    Returns:
        dict mapping each name to a ``np.float32`` array. Every hex word is
        unpacked into a 1-tuple, so a non-empty entry has shape ``(count, 1)``.

    Raises:
        AssertionError: if the file does not exist, the entry count does not
            match the number of lines, or a line's declared count does not
            match its number of values.
    """
    print(f"Loading weights: {file}")

    assert os.path.exists(file), 'Unable to load weight file.'

    with open(file, "r") as handle:
        lines = [raw.strip() for raw in handle]

    count = int(lines[0])
    assert count == len(lines) - 1

    weight_map = {}
    for row in range(1, count + 1):
        fields = lines[row].split(" ")
        name = fields[0]
        declared = int(fields[1])
        assert declared + 2 == len(fields)
        # hex string -> 4 bytes -> big-endian float (struct.unpack yields a 1-tuple)
        values = [struct.unpack(">f", bytes.fromhex(word)) for word in fields[2:]]
        weight_map[name] = np.array(values, dtype=np.float32)

    return weight_map


def add_batch_norm_2d(network, weight_map, input, layer_name, eps):
    """Add a per-channel scale layer built from batch-norm statistics.

    Reads ``<layer_name>.weight``, ``.bias``, ``.running_mean`` and
    ``.running_var`` from ``weight_map`` and computes::

        scale = gamma / sqrt(var + eps)
        shift = -mean / sqrt(var + eps) * gamma + beta

    Args:
        network: object providing ``add_scale``.
        weight_map: dict of arrays keyed by parameter name.
        input: tensor fed to the scale layer.
        layer_name: key prefix of the batch-norm parameters.
        eps: value added to the variance before the square root.

    Returns:
        Whatever ``network.add_scale`` returns for CHANNEL mode.
    """
    gamma, beta, mean = (
        weight_map[layer_name + suffix] for suffix in (".weight", ".bias", ".running_mean")
    )
    std = np.sqrt(weight_map[layer_name + ".running_var"] + eps)

    per_channel_scale = gamma / std
    per_channel_shift = -mean / std * gamma + beta
    return network.add_scale(input=input,
                             mode=trt.ScaleMode.CHANNEL,
                             shift=per_channel_shift,
                             scale=per_channel_scale)


def conv_bn_relu(network, weight_map, input, outch, ksize, s, g, lname):
    """Add convolution -> batch norm -> ReLU6 and return the final layer.

    ReLU6 is assembled as ``relu(x) - relu(x - 6)``; the subtraction of 6 is
    a UNIFORM scale layer with shift -6, scale 1 and power 1.

    Args:
        network: network being populated.
        weight_map: dict of arrays; the kernel is ``<lname>0.weight`` and the
            batch-norm parameters are read from ``<lname>1.*``.
        input: input tensor.
        outch: number of output maps.
        ksize: square kernel size; padding is ``(ksize - 1) // 2``.
        s: stride applied in both dimensions.
        g: number of groups.
        lname: key prefix.

    Returns:
        The element-wise SUB layer producing the ReLU6 output.
    """
    pad = (ksize - 1) // 2

    conv = network.add_convolution(input=input,
                                   num_output_maps=outch,
                                   kernel_shape=(ksize, ksize),
                                   kernel=weight_map[lname + "0.weight"],
                                   bias=trt.Weights())
    assert conv
    conv.stride = (s, s)
    conv.padding = (pad, pad)
    conv.num_groups = g

    bn = add_batch_norm_2d(network, weight_map, conv.get_output(0), lname + "1", EPS)
    assert bn

    # Branch 1: relu(x)
    relu_x = network.add_activation(bn.get_output(0), type=trt.ActivationType.RELU)
    assert relu_x

    # Branch 2: relu(x - 6)
    minus_six = network.add_scale(input=bn.get_output(0),
                                  mode=trt.ScaleMode.UNIFORM,
                                  shift=np.array(-6.0, dtype=np.float32),
                                  scale=np.array(1.0, dtype=np.float32),
                                  power=np.array(1.0, dtype=np.float32))
    assert minus_six

    relu_shifted = network.add_activation(minus_six.get_output(0), type=trt.ActivationType.RELU)
    assert relu_shifted

    # relu(x) - relu(x - 6)
    relu6 = network.add_elementwise(relu_x.get_output(0),
                                    relu_shifted.get_output(0),
                                    trt.ElementWiseOperation.SUB)
    assert relu6

    return relu6


def inverted_res(network, weight_map, input, lname, inch, outch, s, exp):
    """Add one inverted-residual block and return its last layer.

    With ``exp != 1`` the block is a 1x1 expansion (``conv.0``), a 3x3
    depthwise stage (``conv.1``), a 1x1 projection (``conv.2``) and a batch
    norm (``conv.3``). With ``exp == 1`` the expansion is omitted and the
    sub-module indices shift down by one.

    Args:
        network: network being populated.
        weight_map: dict of arrays keyed by parameter name.
        input: block input tensor.
        lname: key prefix of the block.
        inch: input channels.
        outch: output channels.
        s: stride of the depthwise stage.
        exp: expansion factor; the hidden width is ``inch * exp``.

    Returns:
        The element-wise SUM layer when ``s == 1 and inch == outch``,
        otherwise the batch-norm scale layer.
    """
    hidden = inch * exp
    expand = exp != 1

    x = input
    if expand:
        x = conv_bn_relu(network, weight_map, x, hidden, 1, 1, 1, lname + "conv.0.").get_output(0)

    # Sub-module names depend on whether the expansion stage exists.
    if expand:
        depthwise_tag, projection_key, bn_tag = "conv.1.", "conv.2.weight", "conv.3"
    else:
        depthwise_tag, projection_key, bn_tag = "conv.0.", "conv.1.weight", "conv.2"

    # Depthwise 3x3: group count equals channel count.
    depthwise = conv_bn_relu(network, weight_map, x, hidden, 3, s, hidden, lname + depthwise_tag)

    projection = network.add_convolution(input=depthwise.get_output(0),
                                         num_output_maps=outch,
                                         kernel_shape=(1, 1),
                                         kernel=weight_map[lname + projection_key],
                                         bias=trt.Weights())
    assert projection
    bn = add_batch_norm_2d(network, weight_map, projection.get_output(0), lname + bn_tag, EPS)

    if s == 1 and inch == outch:
        residual = network.add_elementwise(input, bn.get_output(0), trt.ElementWiseOperation.SUM)
        assert residual
        return residual

    return bn


def create_engine(max_batch_size, builder, config, dt):
    """Define the network layer by layer and build a TensorRT engine from it.

    Args:
        max_batch_size: value assigned to ``builder.max_batch_size``.
        builder: ``trt.Builder`` used to create the network and the engine.
        config: builder config passed to ``builder.build_engine``.
        dt: data type of the network input tensor.

    Returns:
        The engine returned by ``builder.build_engine`` (callers check it for
        truthiness).
    """
    weight_map = load_weights(WEIGHT_PATH)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    stem = conv_bn_relu(network, weight_map, data, 32, 3, 2, 1, "features.0.")

    # (features index, in channels, out channels, stride, expansion factor)
    block_specs = (
        (1, 32, 16, 1, 1),
        (2, 16, 24, 2, 6),
        (3, 24, 24, 1, 6),
        (4, 24, 32, 2, 6),
        (5, 32, 32, 1, 6),
        (6, 32, 32, 1, 6),
        (7, 32, 64, 2, 6),
        (8, 64, 64, 1, 6),
        (9, 64, 64, 1, 6),
        (10, 64, 64, 1, 6),
        (11, 64, 96, 1, 6),
        (12, 96, 96, 1, 6),
        (13, 96, 96, 1, 6),
        (14, 96, 160, 2, 6),
        (15, 160, 160, 1, 6),
        (16, 160, 160, 1, 6),
        (17, 160, 320, 1, 6),
    )
    layer = stem
    for index, inch, outch, stride, expansion in block_specs:
        layer = inverted_res(network, weight_map, layer.get_output(0),
                             f"features.{index}.", inch, outch, stride, expansion)

    head = conv_bn_relu(network, weight_map, layer.get_output(0), 1280, 1, 1, 1, "features.18.")

    pool1 = network.add_pooling(input=head.get_output(0),
                                type=trt.PoolingType.AVERAGE,
                                window_size=trt.DimsHW(7, 7))
    assert pool1

    fc1 = network.add_fully_connected(input=pool1.get_output(0),
                                      num_outputs=OUTPUT_SIZE,
                                      kernel=weight_map["classifier.1.weight"],
                                      bias=weight_map["classifier.1.bias"])
    assert fc1

    fc1.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(fc1.get_output(0))

    # Build Engine
    builder.max_batch_size = max_batch_size
    # The engine is built from `config`, so the workspace limit has to be set
    # on the config; a limit set on the builder is not used by
    # build_engine(network, config).
    config.max_workspace_size = 1 << 32
    engine = builder.build_engine(network, config)

    del network
    del weight_map

    return engine


def API_to_model(max_batch_size):
    """Build the engine with the layer API and write its serialized form to ENGINE_PATH.

    Args:
        max_batch_size: forwarded to ``create_engine``.
    """
    trt_builder = trt.Builder(TRT_LOGGER)
    build_config = trt_builder.create_builder_config()
    built_engine = create_engine(max_batch_size, trt_builder, build_config, trt.float32)
    assert built_engine

    with open(ENGINE_PATH, "wb") as plan_file:
        plan_file.write(built_engine.serialize())

    # Drop the references in the same order as before: engine, builder, config.
    del built_engine, trt_builder, build_config


class HostDeviceMem(object):
    """Simple pair holding a host buffer and the matching device buffer."""

    def __init__(self, host_mem, device_mem):
        # Stored as-is; no copy or validation is performed.
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        """Return both members, each introduced by a "Host:" / "Device:" line."""
        return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

    def __repr__(self):
        """Same text as ``__str__``."""
        return self.__str__()


def allocate_buffers(engine):
    """Allocate one host/device buffer pair per engine binding.

    For every binding the element count is the volume of the binding shape
    multiplied by ``engine.max_batch_size``. The host buffer is created with
    ``cuda.pagelocked_empty`` and the device buffer with ``cuda.mem_alloc``.

    Returns:
        ``(inputs, outputs, bindings, stream)`` where ``inputs`` / ``outputs``
        are lists of ``HostDeviceMem``, ``bindings`` holds the device buffers
        converted with ``int()`` in binding order, and ``stream`` is a new
        ``cuda.Stream``.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    for name in engine:
        element_count = trt.volume(engine.get_binding_shape(name)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(name))

        host_buf = cuda.pagelocked_empty(element_count, np_dtype)
        dev_buf = cuda.mem_alloc(host_buf.nbytes)
        bindings.append(int(dev_buf))

        # Sort the pair into the input or output list.
        destination = inputs if engine.binding_is_input(name) else outputs
        destination.append(HostDeviceMem(host_buf, dev_buf))

    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference pass and return the host output buffers.

    Args:
        context: execution context whose ``execute_async`` is called.
        bindings: binding list passed to ``execute_async``.
        inputs: ``HostDeviceMem`` objects copied host -> device before the run.
        outputs: ``HostDeviceMem`` objects copied device -> host after the run.
        stream: CUDA stream used for the copies and the execution.
        batch_size: batch size passed to ``execute_async``.

    Returns:
        List with the ``host`` member of every entry in ``outputs``.
    """
    # Upload every input on the stream.
    for buf in inputs:
        cuda.memcpy_htod_async(buf.device, buf.host, stream)

    # Enqueue the inference itself.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)

    # Download every output on the same stream.
    for buf in outputs:
        cuda.memcpy_dtoh_async(buf.host, buf.device, stream)

    # Wait until all work enqueued above has finished.
    stream.synchronize()

    return [buf.host for buf in outputs]


if __name__ == "__main__":
    # Exactly one of -s (serialize) / -d (deserialize and infer) must be given.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", action='store_true')
    parser.add_argument("-d", action='store_true')
    args = parser.parse_args()

    if not (args.s ^ args.d):
        print(
            "arguments not right!\n"
            "python mobilenet_v2.py -s   # serialize model to plan file\n"
            "python mobilenet_v2.py -d   # deserialize plan file and run inference"
        )
        # Invalid usage is an error: report it through a non-zero exit status.
        sys.exit(1)

    if args.s:
        API_to_model(BATCH_SIZE)
    else:
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime

        with open(ENGINE_PATH, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32)
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        # Fill the page-locked buffer in place instead of rebinding `.host` to
        # a pageable array: the asynchronous host-to-device copy expects
        # page-locked memory, and copyto also rejects a size mismatch between
        # the test input and the buffer allocated for the engine.
        np.copyto(inputs[0].host, data)

        trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

        print(f'Output: \n{trt_outputs[0][:10]}\n{trt_outputs[0][-10:]}')


================================================
FILE: mobilenet/mobilenetv3/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.6)

project(mobilenetv3)

add_definitions(-std=c++11)

# option() expects (<variable> "<help text>" [value]); without a help string
# the word OFF was parsed as the help text. The default stays OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# single executable built from mobilenet_v3.cpp, linked against TensorRT and the CUDA runtime
add_executable(mobilenetv3  ${PROJECT_SOURCE_DIR}/mobilenet_v3.cpp)
target_link_libraries(mobilenetv3 nvinfer)
target_link_libraries(mobilenetv3 cudart)

add_definitions(-O2 -pthread)


================================================
FILE: mobilenet/mobilenetv3/README.md
================================================
# mobilenet v3

MobileNetV3 architecture from
     "Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244?context=cs>.

For the PyTorch implementation, you can refer to [mobilenetv3.pytorch](https://github.com/chufei1995/mobilenetv3.pytorch)

## Run

1. generate mbv3_small.wts/mbv3_large.wts from the PyTorch implementation

2. put mbv3_small.wts/mbv3_large.wts into tensorrtx/mobilenet/mobilenetv3

3. build and run

```
cd tensorrtx/mobilenet/mobilenetv3
mkdir build
cd build
cmake ..
make
sudo ./mobilenetv3 -s small(or large) // serialize model to plan file i.e. 'mobilenetv3_small.engine'
sudo ./mobilenetv3 -d small(or large)  // deserialize plan file and run inference
```

4. check whether the output matches the output of the PyTorch implementation

### TensorRT Python API

```
# 1. generate mobilenetv3.wts from [mobilenetv3.pytorch](https://github.com/chufei1995/mobilenetv3.pytorch)

# 2. put mobilenetv3.wts into tensorrtx/mobilenet/mobilenetv3

# 3. install Python dependencies (tensorrt/pycuda/numpy)

cd tensorrtx/mobilenet/mobilenetv3

python mobilenet_v3.py -s small(or large)  // serialize model to plan file i.e. 'mobilenetv2.engine'
python mobilenet_v3.py -d small(or large)  // deserialize plan file and run inference

```


================================================
FILE: mobilenet/mobilenetv3/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents, preceded by a prefix, to an output stream when synchronized.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the prefixed message
    //! \param prefix    text inserted in front of every message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Carries over the target stream, the prefix and the logging flag.
    //! Previously only the stream was taken over, which left the prefix empty and mShouldLog uninitialized.
    //! Text still buffered in \p other is not transferred (the std::stringbuf base is default-constructed).
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes a timestamp to std::cout and the prefixed buffer contents to the target stream, then clears the
    //! buffer and flushes the target stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output from putOutput().
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // stream that receives prefix + message
    std::string mPrefix;    // text written in front of every message
    bool mShouldLog;        // when false, putOutput() is a no-op
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments unchanged to the LogStreamConsumerBuffer member.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog} {}

   protected:
    //! Buffer owned by this base so that it exists before any later base class is constructed.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Builds a stream for messages of level \p severity.
    //!  Output is enabled when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          // hand the already-constructed buffer of the first base to std::ostream
          std::ostream(&mBuffer),
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Creates a new consumer with the severity and logging flag of \p other.
    //!  A fresh buffer is constructed; nothing buffered in \p other is carried over.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          // hand the already-constructed buffer of the first base to std::ostream
          std::ostream(&mBuffer),
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! Severities comparing >= kINFO are written to std::cout, all others to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed one-letter tag.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \param severity initial reportable severity (defaults to kWARNING)
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief State of a test as reported by reportTestStart() / reportTestEnd()
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Access this object through its nvinfer1::ILogger interface
    //! \return reference to this Logger as an nvinfer1::ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief nvinfer1::ILogger::log() implementation
    //!
    //! Streams "[TRT] " followed by \p msg and std::endl through a LogStreamConsumer created with the
    //! current reportable severity and the severity of the message.
    //!
    void log(Severity severity, const char* msg) override {
        LogStreamConsumer sink(mReportableSeverity, severity);
        sink << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Change the reportable severity used for subsequent log() calls
    //!
    //! \param severity new reportable severity
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Handle describing one test (started flag, name, command line)
    //!
    //! Instances are created through Logger::defineTest() and passed to Logger::reportTest{Start,End}().
    //! Only Logger can construct a TestAtom or read its members.
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Create a not-yet-started TestAtom
    //!
    //! \param[in] name The name of the test
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() that joins the command-line arguments with single spaces
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Print the RUNNING line for a test and mark it as started
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom (checked with assert)
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Print the final result line for a test
    //!
    //! \pre reportTestStart() has been called for the given testAtom (checked with assert)
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test; must not be TestResult::kRUNNING (checked with assert)
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Report kPASSED and return EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Report kFAILED and return EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Report kWAIVED and return EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatch to reportPass() or reportFail() depending on \p pass
    static int reportTest(const TestAtom& testAtom, bool pass) {
        if (pass) {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Current reportable severity
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief bracketed one-letter tag for the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    //!
    //! \brief upper-case word printed for the given test result
    //!
    static const char* testResultString(TestResult result) {
        const char* word = "";
        switch (result) {
            case TestResult::kRUNNING:
                word = "RUNNING";
                break;
            case TestResult::kPASSED:
                word = "PASSED";
                break;
            case TestResult::kFAILED:
                word = "FAILED";
                break;
            case TestResult::kWAIVED:
                word = "WAIVED";
                break;
            default:
                assert(0);
                break;
        }
        return word;
    }

    //!
    //! \brief std::cout for severities comparing >= kINFO, std::cerr otherwise
    //!
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief print one "&&&& <RESULT> <name> # <cmdline>" line on the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief join argv[0..argc) with single spaces
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream joined;
        for (int idx = 0; idx < argc; ++idx) {
            // separator goes in front of every argument except the first
            joined << (idx > 0 ? " " : "") << argv[idx];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief returns a LogStreamConsumer for kVERBOSE messages, gated by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief returns a LogStreamConsumer for kINFO messages, gated by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief returns a LogStreamConsumer for kWARNING messages, gated by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief returns a LogStreamConsumer for kERROR messages, gated by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief returns a LogStreamConsumer for kINTERNAL_ERROR ("fatal") messages, gated by the logger's
//!        reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: mobilenet/mobilenetv3/mobilenet_v3.cpp
================================================
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"

// Evaluates `status` exactly once; when the result is non-zero, prints it to
// std::cerr and aborts. Wrapped in do/while(0) so it behaves as one statement.
#define CHECK(status)                                                   \
    do {                                                                \
        auto checkedStatus_ = (status);                                 \
        if (checkedStatus_ != 0) {                                      \
            std::cerr << "Cuda failure: " << checkedStatus_ << std::endl; \
            abort();                                                    \
        }                                                               \
    } while (0)

// stuff we know about the network and the input/output blobs
// Height and width of the network input; the input tensor is declared as 3 x INPUT_H x INPUT_W.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// NOTE(review): presumably the number of values in the output blob - its use is outside this part of the file.
static const int OUTPUT_SIZE = 1000;
// Multiplier applied to the output counts of the fully connected layers in seLayer().
static const int BS = 1;

// Name given to the network input tensor.
const char* INPUT_BLOB_NAME = "data";
// NOTE(review): presumably the name of the output tensor - confirm where the output is marked.
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// File-local logger instance (Logger is declared in logging.h).
static Logger gLogger;

// Load weights from a whitespace-delimited text file.
// The file starts with the number of blobs; every blob is then stored as
//   <name> <size in decimal> <size values in hex>
// Each value is read as a 32-bit word and stored in a malloc'ed buffer that the
// returned map refers to (type kFLOAT); the buffers are not freed here.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and size of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The buffer holds `size` 32-bit words, so the element size
        // is sizeof(uint32_t); sizeof(val) would be the size of the pointer.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer that applies batch normalization using the
// stored statistics "<lname>.weight", ".bias", ".running_mean" and
// ".running_var":
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The three computed arrays are malloc'ed and also stored in weightMap under
// "<lname>.scale", ".shift" and ".power".
IScaleLayer* addBatchNorm(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                          std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // One pass fills all three arrays; the entries are independent per channel.
    for (int ch = 0; ch < len; ++ch) {
        const auto denom = sqrt(var[ch] + eps);
        scval[ch] = gamma[ch] / denom;
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / denom;
        pval[ch] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn);
    return bn;
}

// Multiplies `input` element-wise with a hard-sigmoid activation of `input`
// (alpha = 1/6, beta = 0.5) and returns the element-wise layer.
// The `name` argument is not used.
ILayer* hSwish(INetworkDefinition* network, ITensor& input, std::string name) {
    IActivationLayer* gate = network->addActivation(input, ActivationType::kHARD_SIGMOID);
    assert(gate);
    gate->setAlpha(1.0 / 6.0);
    gate->setBeta(0.5);

    ILayer* product = network->addElementWise(input, *gate->getOutput(0), ElementWiseOperation::kPROD);
    assert(product);
    return product;
}

// Convolution (no bias, weights "<lname>0.weight") -> batch norm ("<lname>1")
// -> hSwish. The padding is (ksize - 1) / 2 on both axes, `s` is the stride
// and `g` the number of groups. Returns the layer produced by hSwish().
ILayer* convBnHswish(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch,
                     int ksize, int s, int g, std::string lname) {
    const int pad = (ksize - 1) / 2;
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv =
            network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + "0.weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{pad, pad});
    conv->setNbGroups(g);

    IScaleLayer* bn = addBatchNorm(network, weightMap, *conv->getOutput(0), lname + "1", 1e-5);

    ILayer* activated = hSwish(network, *bn->getOutput(0), lname + "2");
    assert(activated);
    return activated;
}

// Channel re-weighting block: average pooling over a w x w window (stride w),
// fully connected layer to BS * c / 4 outputs ("<lname>fc.0.*"), ReLU, fully
// connected layer to BS * c outputs ("<lname>fc.2.*"), hard sigmoid
// (alpha = 1/6, beta = 0.5), and finally an element-wise product with `input`.
// Every created layer is checked with assert before it is used, matching the
// other builder helpers in this file.
ILayer* seLayer(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c, int w,
                std::string lname) {
    int h = w;
    IPoolingLayer* l1 = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW(w, h));
    assert(l1);
    l1->setStrideNd(DimsHW{w, h});
    IFullyConnectedLayer* l2 = network->addFullyConnected(
            *l1->getOutput(0), BS * c / 4, weightMap[lname + "fc.0.weight"], weightMap[lname + "fc.0.bias"]);
    assert(l2);
    IActivationLayer* relu1 = network->addActivation(*l2->getOutput(0), ActivationType::kRELU);
    assert(relu1);
    IFullyConnectedLayer* l4 = network->addFullyConnected(
            *relu1->getOutput(0), BS * c, weightMap[lname + "fc.2.weight"], weightMap[lname + "fc.2.bias"]);
    assert(l4);

    auto hsig = network->addActivation(*l4->getOutput(0), ActivationType::kHARD_SIGMOID);
    assert(hsig);
    hsig->setAlpha(1.0 / 6.0);
    hsig->setBeta(0.5);

    ILayer* se = network->addElementWise(input, *hsig->getOutput(0), ElementWiseOperation::kPROD);
    assert(se);
    return se;
}

// Block body used when no expansion convolution is needed:
//   k x k convolution with `hdim` groups, stride s, padding (k - 1) / 2 ("<lname>0.weight")
//   -> batch norm ("<lname>1")
//   -> hSwish (use_hs) or ReLU
//   -> optional seLayer ("<lname>3.", only when use_se)
//   -> 1x1 convolution to `output` channels ("<lname>4.weight")
//   -> batch norm ("<lname>5")
// Returns the final batch-norm layer. Layers created here are checked with
// assert before use, matching the other builder helpers in this file.
ILayer* convSeq1(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int output,
                 int hdim, int k, int s, bool use_se, bool use_hs, int w, std::string lname) {
    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    int p = (k - 1) / 2;
    IConvolutionLayer* conv1 =
            network->addConvolutionNd(input, hdim, DimsHW{k, k}, weightMap[lname + "0.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{p, p});
    conv1->setNbGroups(hdim);

    IScaleLayer* bn1 = addBatchNorm(network, weightMap, *conv1->getOutput(0), lname + "1", 1e-5);

    // Activation after the first batch norm.
    ITensor* tensor3 = nullptr;
    if (use_hs) {
        ILayer* hsw = hSwish(network, *bn1->getOutput(0), lname + "2");
        tensor3 = hsw->getOutput(0);
    } else {
        IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
        assert(relu1);
        tensor3 = relu1->getOutput(0);
    }

    // Optional channel re-weighting; otherwise the activation output is passed through.
    ITensor* tensor4 = tensor3;
    if (use_se) {
        ILayer* se1 = seLayer(network, weightMap, *tensor3, hdim, w, lname + "3.");
        tensor4 = se1->getOutput(0);
    }

    IConvolutionLayer* conv2 =
            network->addConvolutionNd(*tensor4, output, DimsHW{1, 1}, weightMap[lname + "4.weight"], emptywts);
    assert(conv2);
    IScaleLayer* bn2 = addBatchNorm(network, weightMap, *conv2->getOutput(0), lname + "5", 1e-5);
    assert(bn2);
    return bn2;
}

// Block body with an expansion convolution:
//   1x1 convolution to `hdim` channels ("<lname>0.weight") -> batch norm ("<lname>1")
//   -> hSwish (use_hs) or ReLU
//   -> k x k convolution with `hdim` groups, stride s, padding (k - 1) / 2 ("<lname>3.weight")
//   -> batch norm ("<lname>4")
//   -> optional seLayer ("<lname>5.", only when use_se)
//   -> hSwish (use_hs) or ReLU
//   -> 1x1 convolution to `output` channels ("<lname>7.weight") -> batch norm ("<lname>8")
// Returns the final batch-norm layer. Layers created here are checked with
// assert before use, matching the other builder helpers in this file.
ILayer* convSeq2(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int output,
                 int hdim, int k, int s, bool use_se, bool use_hs, int w, std::string lname) {
    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    int p = (k - 1) / 2;
    IConvolutionLayer* conv1 =
            network->addConvolutionNd(input, hdim, DimsHW{1, 1}, weightMap[lname + "0.weight"], emptywts);
    assert(conv1);
    IScaleLayer* bn1 = addBatchNorm(network, weightMap, *conv1->getOutput(0), lname + "1", 1e-5);

    // Activation after the expansion.
    ITensor* tensor3 = nullptr;
    if (use_hs) {
        ILayer* hsw1 = hSwish(network, *bn1->getOutput(0), lname + "2");
        tensor3 = hsw1->getOutput(0);
    } else {
        IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
        assert(relu1);
        tensor3 = relu1->getOutput(0);
    }

    IConvolutionLayer* conv2 =
            network->addConvolutionNd(*tensor3, hdim, DimsHW{k, k}, weightMap[lname + "3.weight"], emptywts);
    assert(conv2);
    conv2->setStrideNd(DimsHW{s, s});
    conv2->setPaddingNd(DimsHW{p, p});
    conv2->setNbGroups(hdim);
    IScaleLayer* bn2 = addBatchNorm(network, weightMap, *conv2->getOutput(0), lname + "4", 1e-5);

    // Optional channel re-weighting; otherwise the batch-norm output is passed through.
    ITensor* tensor6 = bn2->getOutput(0);
    if (use_se) {
        ILayer* se1 = seLayer(network, weightMap, *bn2->getOutput(0), hdim, w, lname + "5.");
        tensor6 = se1->getOutput(0);
    }

    // Activation before the projection.
    ITensor* tensor7 = nullptr;
    if (use_hs) {
        ILayer* hsw2 = hSwish(network, *tensor6, lname + "6");
        tensor7 = hsw2->getOutput(0);
    } else {
        IActivationLayer* relu2 = network->addActivation(*tensor6, ActivationType::kRELU);
        assert(relu2);
        tensor7 = relu2->getOutput(0);
    }

    IConvolutionLayer* conv3 =
            network->addConvolutionNd(*tensor7, output, DimsHW{1, 1}, weightMap[lname + "7.weight"], emptywts);
    assert(conv3);
    IScaleLayer* bn3 = addBatchNorm(network, weightMap, *conv3->getOutput(0), lname + "8", 1e-5);
    assert(bn3);
    return bn3;
}

// Builds one inverted-residual block under the weight prefix "<lname>conv.".
// convSeq1 is used when inch == hidden, convSeq2 otherwise. When s == 1 and
// inch == outch the block input is added to the result with an element-wise
// sum, and that sum layer is returned instead of the convolution sequence.
ILayer* invertedRes(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                    std::string lname, int inch, int outch, int s, int hidden, int k, bool use_se, bool use_hs, int w) {
    ILayer* body = (inch == hidden)
                           ? convSeq1(network, weightMap, input, outch, hidden, k, s, use_se, use_hs, w, lname + "conv.")
                           : convSeq2(network, weightMap, input, outch, hidden, k, s, use_se, use_hs, w, lname + "conv.");

    const bool addShortcut = (s == 1 && inch == outch);
    if (addShortcut) {
        IElementWiseLayer* sum = network->addElementWise(input, *body->getOutput(0), ElementWiseOperation::kSUM);
        assert(sum);
        return sum;
    }
    return body;
}

// Create the "small" engine using only the network-definition API and not any parser.
//
// Weights are read from "../mbv3_small.wts". The network is: a stem
// conv+BN+h-swish, eleven inverted-residual blocks ("features.1." ..
// "features.11."), a 1x1 conv+BN+h-swish, an SE layer, 7x7 average pooling and
// a two-stage fully-connected classifier whose final h-swish output is marked
// as OUTPUT_BLOB_NAME. The network and the host-side weight buffers are
// released before the engine is returned.
ICudaEngine* createEngineSmall(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../mbv3_small.wts");

    // One row per inverted-residual block, in the argument order of invertedRes().
    struct BlockSpec {
        const char* name;
        int inch;
        int outch;
        int stride;
        int hidden;
        int kernel;
        bool useSe;
        bool useHs;
        int w;
    };
    const BlockSpec blocks[] = {
            {"features.1.", 16, 16, 2, 16, 3, true, false, 56},   {"features.2.", 16, 24, 2, 72, 3, false, false, 28},
            {"features.3.", 24, 24, 1, 88, 3, false, false, 28},  {"features.4.", 24, 40, 2, 96, 5, true, true, 14},
            {"features.5.", 40, 40, 1, 240, 5, true, true, 14},   {"features.6.", 40, 40, 1, 240, 5, true, true, 14},
            {"features.7.", 40, 48, 1, 120, 5, true, true, 14},   {"features.8.", 48, 48, 1, 144, 5, true, true, 14},
            {"features.9.", 48, 96, 2, 288, 5, true, true, 7},    {"features.10.", 96, 96, 1, 576, 5, true, true, 7},
            {"features.11.", 96, 96, 1, 576, 5, true, true, 7},
    };

    // Stem, then the chain of inverted-residual blocks.
    ILayer* stem = convBnHswish(network, weightMap, *data, 16, 3, 2, 1, "features.0.");
    ITensor* features = stem->getOutput(0);
    for (const BlockSpec& spec : blocks) {
        ILayer* block = invertedRes(network, weightMap, *features, spec.name, spec.inch, spec.outch, spec.stride,
                                    spec.hidden, spec.kernel, spec.useSe, spec.useHs, spec.w);
        features = block->getOutput(0);
    }

    ILayer* headConv = convBnHswish(network, weightMap, *features, 576, 1, 1, 1, "conv.0.");
    ILayer* headSe = seLayer(network, weightMap, *headConv->getOutput(0), 576, 7, "conv.1.");

    IPoolingLayer* avgPool = network->addPoolingNd(*headSe->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{7, 7});
    ILayer* pooledAct = hSwish(network, *avgPool->getOutput(0), "hSwish.0");

    // Classifier: FC -> BN -> h-swish, twice.
    IFullyConnectedLayer* fcHidden = network->addFullyConnected(
            *pooledAct->getOutput(0), 1280, weightMap["classifier.0.weight"], weightMap["classifier.0.bias"]);
    assert(fcHidden);
    ILayer* bnHidden = addBatchNorm(network, weightMap, *fcHidden->getOutput(0), "classifier.1", 1e-5);
    ILayer* actHidden = hSwish(network, *bnHidden->getOutput(0), "hSwish.1");
    IFullyConnectedLayer* fcOut = network->addFullyConnected(
            *actHidden->getOutput(0), 1000, weightMap["classifier.3.weight"], weightMap["classifier.3.bias"]);
    ILayer* bnOut = addBatchNorm(network, weightMap, *fcOut->getOutput(0), "classifier.4", 1e-5);
    ILayer* actOut = hSwish(network, *bnOut->getOutput(0), "hSwish.2");

    ITensor* result = actOut->getOutput(0);
    result->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*result);

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // The network definition is no longer needed once the engine is built.
    network->destroy();

    // Release the host-side weight buffers.
    for (std::map<std::string, Weights>::iterator it = weightMap.begin(); it != weightMap.end(); ++it) {
        free((void*)(it->second.values));
    }

    return engine;
}

// Create the "large" engine using only the network-definition API.
//
// Weights are read from "../mbv3_large.wts". The network is: a stem
// conv+BN+h-swish, fifteen inverted-residual blocks ("features.1." ..
// "features.15."), a 1x1 conv+BN+h-swish, 7x7 average pooling and two
// fully-connected layers separated by h-swish. The second FC output is marked
// as OUTPUT_BLOB_NAME. The network and the host-side weight buffers are
// released before the engine is returned.
ICudaEngine* createEngineLarge(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../mbv3_large.wts");

    // One row per inverted-residual block, in the argument order of invertedRes().
    struct BlockSpec {
        const char* name;
        int inch;
        int outch;
        int stride;
        int hidden;
        int kernel;
        bool useSe;
        bool useHs;
        int w;
    };
    const BlockSpec blocks[] = {
            {"features.1.", 16, 16, 1, 16, 3, false, false, 112},   {"features.2.", 16, 24, 2, 64, 3, false, false, 56},
            {"features.3.", 24, 24, 1, 72, 3, false, false, 56},    {"features.4.", 24, 40, 2, 72, 5, true, false, 28},
            {"features.5.", 40, 40, 1, 120, 5, true, false, 28},    {"features.6.", 40, 40, 1, 120, 5, true, false, 28},
            {"features.7.", 40, 80, 2, 240, 3, false, true, 14},    {"features.8.", 80, 80, 1, 200, 3, false, true, 14},
            {"features.9.", 80, 80, 1, 184, 3, false, true, 14},    {"features.10.", 80, 80, 1, 184, 3, false, true, 14},
            {"features.11.", 80, 112, 1, 480, 3, true, true, 14},   {"features.12.", 112, 112, 1, 672, 3, true, true, 14},
            {"features.13.", 112, 160, 1, 672, 5, true, true, 14},  {"features.14.", 160, 160, 2, 672, 5, true, true, 7},
            {"features.15.", 160, 160, 1, 960, 5, true, true, 7},
    };

    // Stem, then the chain of inverted-residual blocks.
    ILayer* stem = convBnHswish(network, weightMap, *data, 16, 3, 2, 1, "features.0.");
    ITensor* features = stem->getOutput(0);
    for (const BlockSpec& spec : blocks) {
        ILayer* block = invertedRes(network, weightMap, *features, spec.name, spec.inch, spec.outch, spec.stride,
                                    spec.hidden, spec.kernel, spec.useSe, spec.useHs, spec.w);
        features = block->getOutput(0);
    }

    ILayer* headConv = convBnHswish(network, weightMap, *features, 960, 1, 1, 1, "conv.0.");

    IPoolingLayer* avgPool = network->addPoolingNd(*headConv->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{7, 7});
    ILayer* pooledAct = hSwish(network, *avgPool->getOutput(0), "hSwish.0");

    // Classifier: FC -> h-swish -> FC.
    IFullyConnectedLayer* fcHidden = network->addFullyConnected(
            *pooledAct->getOutput(0), 1280, weightMap["classifier.0.weight"], weightMap["classifier.0.bias"]);
    assert(fcHidden);
    ILayer* actHidden = hSwish(network, *fcHidden->getOutput(0), "hSwish.1");
    IFullyConnectedLayer* fcOut = network->addFullyConnected(
            *actHidden->getOutput(0), 1000, weightMap["classifier.3.weight"], weightMap["classifier.3.bias"]);

    ITensor* result = fcOut->getOutput(0);
    result->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*result);

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // The network definition is no longer needed once the engine is built.
    network->destroy();

    // Release the host-side weight buffers.
    for (std::map<std::string, Weights>::iterator it = weightMap.begin(); it != weightMap.end(); ++it) {
        free((void*)(it->second.values));
    }

    return engine;
}

// Builds the engine selected by `mode` ("small" or "large") and serializes it.
//
// On success `*modelStream` receives the serialized engine; the caller owns it
// and must destroy it. On failure (unknown mode or engine build failure) an
// error is written to std::cerr, `*modelStream` is left untouched, and the
// builder and config are still released.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, std::string mode) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Start from nullptr so an unrecognised mode can never leave the pointer
    // indeterminate (the original read it uninitialised in that case).
    ICudaEngine* engine = nullptr;

    if (mode == "small") {
        std::cout << "create engine small" << std::endl;
        engine = createEngineSmall(maxBatchSize, builder, config, DataType::kFLOAT);
    } else if (mode == "large") {
        engine = createEngineLarge(maxBatchSize, builder, config, DataType::kFLOAT);
    } else {
        std::cerr << "unknown model mode '" << mode << "', expected 'small' or 'large'" << std::endl;
    }

    if (engine == nullptr) {
        std::cerr << "failed to create engine" << std::endl;
        config->destroy();
        builder->destroy();
        return;
    }

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one inference on `batchSize` inputs.
//
// `input` is a host buffer of batchSize * 3 * INPUT_H * INPUT_W floats and
// `output` a host buffer of batchSize * OUTPUT_SIZE floats. Device buffers and
// a CUDA stream are created for the call and released before returning. If
// TensorRT rejects the enqueue, an error is printed and `output` is left
// unmodified.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // An unknown tensor name yields a negative index, which must not be used to index `buffers`.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (context.enqueue(batchSize, buffers, stream, nullptr)) {
        CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    } else {
        std::cerr << "enqueue failed, output buffer not updated" << std::endl;
    }
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv) {
    if (argc != 3) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./mobilenet -s small  // serialize small model to plan file" << std::endl;
        std::cerr << "./mobilenet -s large  // serialize large model to plan file" << std::endl;
        std::cerr << "./mobilenet -d small  // deserialize small model plan file and run inference" << std::endl;
        std::cerr << "./mobilenet -d large  // deserialize large model plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char* trtModelStream{nullptr};
    size_t size{0};
    std::string mode = std::string(argv[2]);
    std::cout << mode << std::endl;

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream, mode);
        assert(modelStream != nullptr);

        std::ofstream p("mobilenetv3_" + mode + ".engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 1;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file("mobilenetv3_" + mode + ".engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        return -1;
    }

    // Subtract mean from image
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 10; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++) {
        std::cout << prob[i] << ", ";
        //if (i % 10 == 0) std::cout << i / 10 << std::endl;
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: mobilenet/mobilenetv3/mobilenet_v3.py
================================================
import os
import sys
import struct
import argparse

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401
import tensorrt as trt

# Batch size passed to API_to_model() and used to size the dummy input in __main__.
BATCH_SIZE = 1
# Height and width of the network input; the input tensor is (3, INPUT_H, INPUT_W).
INPUT_H = 224
INPUT_W = 224
# Number of outputs of the final fully-connected layer.
OUTPUT_SIZE = 1000
# Multiplier applied to the channel counts of the two FC layers in add_se_layer().
BS = 1
# Names given to the network input tensor and to the marked output tensor.
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"
# Epsilon passed to add_batch_norm_2d() for every batch-norm layer.
EPS = 1e-5

# Weight file loaded by BOTH create_engine_small() and create_engine_large(),
# despite the "_SMALL" suffix.
WEIGHT_PATH_SMALL = "./mobilenetv3.wts"
# Serialized engine file: written by API_to_model(), read back in __main__.
ENGINE_PATH = "./mobilenetv3.engine"

TRT_LOGGER = trt.Logger(trt.Logger.INFO)


def load_weights(file):
    """Read a text ``.wts`` file into a ``{name: float32 ndarray}`` dict.

    The first line holds the number of entries. Every following line is
    ``<name> <count> <hex> <hex> ...`` separated by single spaces, where each
    ``<hex>`` word is a big-endian IEEE-754 float32. Each decoded word is kept
    as a one-element tuple, so the resulting arrays have shape ``(count, 1)``.

    Assertions fail if the file does not exist, if the entry count does not
    match the number of lines, or if a line's value count does not match the
    number of hex words on it.
    """
    print(f"Loading weights: {file}")

    assert os.path.exists(file), 'Unable to load weight file.'

    with open(file, "r") as handle:
        rows = [row.strip() for row in handle]

    expected = int(rows[0])
    assert expected == len(rows) - 1

    weight_map = {}
    for row_no in range(1, expected + 1):
        fields = rows[row_no].split(" ")
        name, declared = fields[0], int(fields[1])
        assert declared + 2 == len(fields)
        # hex string -> 4 bytes -> big-endian float (struct.unpack yields a 1-tuple)
        decoded = [struct.unpack(">f", bytes.fromhex(word)) for word in fields[2:]]
        weight_map[name] = np.array(decoded, dtype=np.float32)

    return weight_map


def add_batch_norm_2d(network, weight_map, input, layer_name, eps):
    """Add batch normalisation as a single per-channel scale layer.

    Reads ``<layer_name>.weight``, ``.bias``, ``.running_mean`` and
    ``.running_var`` from ``weight_map`` and folds them into
    ``scale = weight / sqrt(running_var + eps)`` and
    ``shift = -running_mean / sqrt(running_var + eps) * weight + bias``.

    Returns the layer created by ``network.add_scale`` in CHANNEL mode.
    """
    gamma, beta, running_mean, running_var = (
        weight_map[layer_name + suffix]
        for suffix in (".weight", ".bias", ".running_mean", ".running_var")
    )
    std = np.sqrt(running_var + eps)

    folded_scale = gamma / std
    folded_shift = -running_mean / std * gamma + beta
    return network.add_scale(input=input,
                             mode=trt.ScaleMode.CHANNEL,
                             shift=folded_shift,
                             scale=folded_scale)


def add_h_swish(network, input):
    """Add an h-swish activation: ``input * hard_sigmoid(input)``.

    The hard-sigmoid layer is configured with alpha = 1/6 and beta = 0.5.
    Returns the element-wise PROD layer.
    """
    gate = network.add_activation(input, type=trt.ActivationType.HARD_SIGMOID)
    assert gate
    gate.alpha, gate.beta = 1.0 / 6.0, 0.5

    product = network.add_elementwise(input, gate.get_output(0), trt.ElementWiseOperation.PROD)
    assert product
    return product


def conv_bn_h_swish(network, weight_map, input, outch, ksize, s, g, lname):
    """Add convolution -> batch norm -> h-swish and return the h-swish layer.

    The convolution has ``outch`` output maps, a ``ksize`` x ``ksize`` kernel,
    stride ``s``, padding ``(ksize - 1) // 2``, ``g`` groups and no bias. Its
    kernel is ``weight_map[lname + "0.weight"]``; the batch norm uses the
    ``lname + "1"`` entries.
    """
    pad = (ksize - 1) // 2
    conv = network.add_convolution(input=input,
                                   num_output_maps=outch,
                                   kernel_shape=(ksize, ksize),
                                   kernel=weight_map[lname + "0.weight"],
                                   bias=trt.Weights())
    assert conv
    conv.stride = (s, s)
    conv.padding = (pad, pad)
    conv.num_groups = g

    norm = add_batch_norm_2d(network, weight_map, conv.get_output(0), lname + "1", EPS)
    activated = add_h_swish(network, norm.get_output(0))
    assert activated
    return activated


def add_se_layer(network, weight_map, input, c, w, lname):
    """Add a squeeze-and-excitation block that re-weights ``input`` per channel.

    Steps: average-pool ``input`` with a ``w`` x ``w`` window and stride,
    FC to ``BS * c // 4`` outputs, ReLU, FC to ``BS * c`` outputs, hard-sigmoid
    (alpha = 1/6, beta = 0.5), then multiply ``input`` element-wise by that gate.

    The previous implementation returned ``h_swish`` of the second FC output
    instead, i.e. the gate multiplied by itself rather than by ``input``. That
    drops the spatial extent of ``input``, while create_engine_small() applies
    a 7x7 average pool directly to this layer's output.

    Args:
        network: network definition the layers are added to.
        weight_map: dict with ``lname + "fc.0.weight"/"fc.0.bias"`` and
            ``lname + "fc.2.weight"/"fc.2.bias"``.
        input: tensor to gate.
        c: channel count used to size the two FC layers.
        w: pooling window/stride side; presumably the spatial size of ``input``
            so that pooling yields one value per channel - verify against callers.
        lname: weight-name prefix.

    Returns:
        The element-wise PROD layer (``input`` * gate).
    """
    h = w
    squeeze = network.add_pooling(input=input,
                                  type=trt.PoolingType.AVERAGE,
                                  window_size=trt.DimsHW(w, h))
    assert squeeze
    squeeze.stride_nd = (w, h)

    reduce_fc = network.add_fully_connected(input=squeeze.get_output(0),
                                            num_outputs=BS * c // 4,
                                            kernel=weight_map[lname + "fc.0.weight"],
                                            bias=weight_map[lname + "fc.0.bias"])
    relu = network.add_activation(reduce_fc.get_output(0), type=trt.ActivationType.RELU)
    expand_fc = network.add_fully_connected(input=relu.get_output(0),
                                            num_outputs=BS * c,
                                            kernel=weight_map[lname + "fc.2.weight"],
                                            bias=weight_map[lname + "fc.2.bias"])

    # Per-channel gate in [0, 1].
    gate = network.add_activation(expand_fc.get_output(0), type=trt.ActivationType.HARD_SIGMOID)
    assert gate
    gate.alpha = 1.0 / 6.0
    gate.beta = 0.5

    # Scale the ORIGINAL feature map by the gate; the size-1 spatial
    # dimensions of the gate are expected to broadcast over input's.
    se = network.add_elementwise(input, gate.get_output(0), trt.ElementWiseOperation.PROD)
    assert se

    return se


def conv_seq_1(network, weight_map, input, output, hdim, k, s, use_se, use_hs, w, lname):
    """Add the convolution stack used when a block has no expansion conv.

    Layers, in order:
      1. ``k`` x ``k`` convolution to ``hdim`` maps with ``hdim`` groups,
         stride ``s``, padding ``(k - 1) // 2`` (weights ``lname + "0.weight"``);
      2. batch norm (``lname + "1"``);
      3. h-swish if ``use_hs`` else ReLU;
      4. optional SE layer (``lname + "3."``) when ``use_se``;
      5. 1x1 convolution to ``output`` maps (``lname + "4.weight"``);
      6. batch norm (``lname + "5"``), which is the returned layer.
    """
    pad = (k - 1) // 2
    grouped = network.add_convolution(input=input,
                                      num_output_maps=hdim,
                                      kernel_shape=(k, k),
                                      kernel=weight_map[lname + "0.weight"],
                                      bias=trt.Weights())
    assert grouped
    grouped.stride = (s, s)
    grouped.padding = (pad, pad)
    grouped.num_groups = hdim

    grouped_bn = add_batch_norm_2d(network, weight_map, grouped.get_output(0), lname + "1", EPS)

    # Non-linearity: h-swish or plain ReLU.
    if use_hs:
        act = add_h_swish(network, grouped_bn.get_output(0))
    else:
        act = network.add_activation(grouped_bn.get_output(0), type=trt.ActivationType.RELU)
    features = act.get_output(0)

    # Optional squeeze-and-excitation stage.
    if use_se:
        features = add_se_layer(network, weight_map, features, hdim, w, lname + "3.").get_output(0)

    project = network.add_convolution(input=features,
                                      num_output_maps=output,
                                      kernel_shape=(1, 1),
                                      kernel=weight_map[lname + "4.weight"],
                                      bias=trt.Weights())
    project_bn = add_batch_norm_2d(network, weight_map, project.get_output(0), lname + "5", EPS)
    assert project_bn

    return project_bn


def conv_seq_2(network, weight_map, input, output, hdim, k, s, use_se, use_hs, w, lname):
    """Add the convolution stack used when a block has an expansion conv.

    Layers, in order:
      1. 1x1 convolution to ``hdim`` maps (``lname + "0.weight"``);
      2. batch norm (``lname + "1"``);
      3. h-swish if ``use_hs`` else ReLU;
      4. ``k`` x ``k`` convolution to ``hdim`` maps with ``hdim`` groups,
         stride ``s``, padding ``(k - 1) // 2`` (``lname + "3.weight"``);
      5. batch norm (``lname + "4"``);
      6. optional SE layer (``lname + "5."``) when ``use_se``;
      7. h-swish if ``use_hs`` else ReLU;
      8. 1x1 convolution to ``output`` maps (``lname + "7.weight"``);
      9. batch norm (``lname + "8"``), which is the returned layer.
    """

    def activate(tensor):
        # Shared non-linearity choice for both activation stages.
        if use_hs:
            return add_h_swish(network, tensor).get_output(0)
        return network.add_activation(tensor, type=trt.ActivationType.RELU).get_output(0)

    pad = (k - 1) // 2

    expand = network.add_convolution(input=input,
                                     num_output_maps=hdim,
                                     kernel_shape=(1, 1),
                                     kernel=weight_map[lname + "0.weight"],
                                     bias=trt.Weights())
    expand_bn = add_batch_norm_2d(network, weight_map, expand.get_output(0), lname + "1", EPS)
    expanded = activate(expand_bn.get_output(0))

    grouped = network.add_convolution(input=expanded,
                                      num_output_maps=hdim,
                                      kernel_shape=(k, k),
                                      kernel=weight_map[lname + "3.weight"],
                                      bias=trt.Weights())
    grouped.stride = (s, s)
    grouped.padding = (pad, pad)
    grouped.num_groups = hdim
    grouped_bn = add_batch_norm_2d(network, weight_map, grouped.get_output(0), lname + "4", EPS)

    # Optional squeeze-and-excitation stage before the second activation.
    features = grouped_bn.get_output(0)
    if use_se:
        features = add_se_layer(network, weight_map, features, hdim, w, lname + "5.").get_output(0)
    features = activate(features)

    project = network.add_convolution(input=features,
                                      num_output_maps=output,
                                      kernel_shape=(1, 1),
                                      kernel=weight_map[lname + "7.weight"],
                                      bias=trt.Weights())
    project_bn = add_batch_norm_2d(network, weight_map, project.get_output(0), lname + "8", EPS)
    assert project_bn

    return project_bn


def inverted_res(network, weight_map, input, lname, inch, outch, s, hidden, k, use_se, use_hs, w):
    """Add one inverted-residual block and return its last layer.

    ``conv_seq_1`` builds the stack when ``inch == hidden``; otherwise
    ``conv_seq_2`` does. Both use the weight prefix ``lname + "conv."``.
    With stride 1 and ``inch == outch`` the stack output is summed with
    ``input`` and the sum layer is returned; otherwise the stack is returned.
    """
    build_stack = conv_seq_1 if inch == hidden else conv_seq_2
    stack = build_stack(network, weight_map, input, outch, hidden, k, s, use_se, use_hs, w, lname + "conv.")

    has_shortcut = s == 1 and inch == outch
    if has_shortcut:
        shortcut_sum = network.add_elementwise(input, stack.get_output(0), trt.ElementWiseOperation.SUM)
        assert shortcut_sum
        return shortcut_sum

    return stack


def create_engine_small(max_batch_size, builder, config, dt):
    """Define the "small" network layer by layer and build an engine from it.

    Weights come from ``WEIGHT_PATH_SMALL``. The network is a stem
    conv+BN+h-swish, eleven inverted-residual blocks (``features.1.`` ..
    ``features.11.``), a 1x1 conv+BN+h-swish, an SE layer, 7x7 average
    pooling and an FC -> BN -> h-swish classifier applied twice. The final
    h-swish output is named ``OUTPUT_BLOB_NAME`` and marked as the output.

    Returns the object produced by ``builder.build_engine(network, config)``.
    """
    weight_map = load_weights(WEIGHT_PATH_SMALL)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    # (inch, outch, s, hidden, k, use_se, use_hs, w) for features.1 .. features.11
    block_specs = (
        (16, 16, 2, 16, 3, 1, 0, 56),
        (16, 24, 2, 72, 3, 0, 0, 28),
        (24, 24, 1, 88, 3, 0, 0, 28),
        (24, 40, 2, 96, 5, 1, 1, 14),
        (40, 40, 1, 240, 5, 1, 1, 14),
        (40, 40, 1, 240, 5, 1, 1, 14),
        (40, 48, 1, 120, 5, 1, 1, 14),
        (48, 48, 1, 144, 5, 1, 1, 14),
        (48, 96, 2, 288, 5, 1, 1, 7),
        (96, 96, 1, 576, 5, 1, 1, 7),
        (96, 96, 1, 576, 5, 1, 1, 7),
    )

    layer = conv_bn_h_swish(network, weight_map, data, 16, 3, 2, 1, "features.0.")
    for number, spec in enumerate(block_specs, start=1):
        layer = inverted_res(network, weight_map, layer.get_output(0), "features.%d." % number, *spec)

    head_conv = conv_bn_h_swish(network, weight_map, layer.get_output(0), 576, 1, 1, 1, "conv.0.")
    head_se = add_se_layer(network, weight_map, head_conv.get_output(0), 576, 7, "conv.1.")

    avg_pool = network.add_pooling(input=head_se.get_output(0),
                                   type=trt.PoolingType.AVERAGE,
                                   window_size=trt.DimsHW(7, 7))
    assert avg_pool
    avg_pool.stride_nd = (7, 7)
    pooled_act = add_h_swish(network, avg_pool.get_output(0))

    # Classifier stage 1: FC -> BN -> h-swish
    fc_hidden = network.add_fully_connected(input=pooled_act.get_output(0),
                                            num_outputs=1280,
                                            kernel=weight_map["classifier.0.weight"],
                                            bias=weight_map["classifier.0.bias"])
    assert fc_hidden
    bn_hidden = add_batch_norm_2d(network, weight_map, fc_hidden.get_output(0), "classifier.1", EPS)
    act_hidden = add_h_swish(network, bn_hidden.get_output(0))

    # Classifier stage 2: FC -> BN -> h-swish
    fc_out = network.add_fully_connected(input=act_hidden.get_output(0),
                                         num_outputs=OUTPUT_SIZE,
                                         kernel=weight_map["classifier.3.weight"],
                                         bias=weight_map["classifier.3.bias"])
    bn_out = add_batch_norm_2d(network, weight_map, fc_out.get_output(0), "classifier.4", EPS)
    act_out = add_h_swish(network, bn_out.get_output(0))

    result = act_out.get_output(0)
    result.name = OUTPUT_BLOB_NAME
    network.mark_output(result)

    # Build Engine
    # NOTE(review): the workspace size is set on the builder while the engine
    # is built from `config` - confirm the limit is applied where intended.
    builder.max_batch_size = max_batch_size
    builder.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)

    del network
    del weight_map

    return engine


def create_engine_large(max_batch_size, builder, config, dt):
    """Define the "large" network layer by layer and build an engine from it.

    Weights come from ``WEIGHT_PATH_SMALL`` (the same path constant the small
    variant uses). The network is a stem conv+BN+h-swish, fifteen
    inverted-residual blocks (``features.1.`` .. ``features.15.``), a 1x1
    conv+BN+h-swish, 7x7 average pooling and FC -> h-swish -> FC. The second
    FC output is named ``OUTPUT_BLOB_NAME`` and marked as the output.

    Returns the object produced by ``builder.build_engine(network, config)``.
    """
    weight_map = load_weights(WEIGHT_PATH_SMALL)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    # (inch, outch, s, hidden, k, use_se, use_hs, w) for features.1 .. features.15
    block_specs = (
        (16, 16, 1, 16, 3, 0, 0, 112),
        (16, 24, 2, 64, 3, 0, 0, 56),
        (24, 24, 1, 72, 3, 0, 0, 56),
        (24, 40, 2, 72, 5, 1, 0, 28),
        (40, 40, 1, 120, 5, 1, 0, 28),
        (40, 40, 1, 120, 5, 1, 0, 28),
        (40, 80, 2, 240, 3, 0, 1, 14),
        (80, 80, 1, 200, 3, 0, 1, 14),
        (80, 80, 1, 184, 3, 0, 1, 14),
        (80, 80, 1, 184, 3, 0, 1, 14),
        (80, 112, 1, 480, 3, 1, 1, 14),
        (112, 112, 1, 672, 3, 1, 1, 14),
        (112, 160, 1, 672, 5, 1, 1, 14),
        (160, 160, 2, 672, 5, 1, 1, 7),
        (160, 160, 1, 960, 5, 1, 1, 7),
    )

    layer = conv_bn_h_swish(network, weight_map, data, 16, 3, 2, 1, "features.0.")
    for number, spec in enumerate(block_specs, start=1):
        layer = inverted_res(network, weight_map, layer.get_output(0), "features.%d." % number, *spec)

    head_conv = conv_bn_h_swish(network, weight_map, layer.get_output(0), 960, 1, 1, 1, "conv.0.")

    avg_pool = network.add_pooling(input=head_conv.get_output(0),
                                   type=trt.PoolingType.AVERAGE,
                                   window_size=trt.DimsHW(7, 7))
    assert avg_pool
    avg_pool.stride_nd = (7, 7)
    pooled_act = add_h_swish(network, avg_pool.get_output(0))

    # Classifier: FC -> h-swish -> FC
    fc_hidden = network.add_fully_connected(input=pooled_act.get_output(0),
                                            num_outputs=1280,
                                            kernel=weight_map["classifier.0.weight"],
                                            bias=weight_map["classifier.0.bias"])
    assert fc_hidden
    act_hidden = add_h_swish(network, fc_hidden.get_output(0))

    fc_out = network.add_fully_connected(input=act_hidden.get_output(0),
                                         num_outputs=OUTPUT_SIZE,
                                         kernel=weight_map["classifier.3.weight"],
                                         bias=weight_map["classifier.3.bias"])

    result = fc_out.get_output(0)
    result.name = OUTPUT_BLOB_NAME
    network.mark_output(result)

    # Build Engine
    # NOTE(review): the workspace size is set on the builder while the engine
    # is built from `config` - confirm the limit is applied where intended.
    builder.max_batch_size = max_batch_size
    builder.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)

    del network
    del weight_map

    return engine


def API_to_model(max_batch_size, model_type):
    """Build an engine and write its serialized form to ``ENGINE_PATH``.

    ``model_type == "small"`` selects ``create_engine_small``; any other value
    (including ``None``) selects ``create_engine_large``.
    """
    builder = trt.Builder(TRT_LOGGER)
    config = builder.create_builder_config()

    create_engine = create_engine_small if model_type == "small" else create_engine_large
    engine = create_engine(max_batch_size, builder, config, trt.float32)
    assert engine

    with open(ENGINE_PATH, "wb") as plan_file:
        plan_file.write(engine.serialize())

    del engine
    del builder
    del config


class HostDeviceMem(object):
    """Pairs a host buffer with the device buffer allocated for it."""

    def __init__(self, host_mem, device_mem):
        # Stored as given; no copies are made.
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        pieces = ["Host:\n", str(self.host), "\nDevice:\n", str(self.device)]
        return "".join(pieces)

    def __repr__(self):
        # repr and str are intentionally the same text.
        return str(self)


def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    For each binding the element count is the volume of the binding shape
    times ``engine.max_batch_size``. The host buffer comes from
    ``cuda.pagelocked_empty`` and the device buffer from ``cuda.mem_alloc``
    with the host buffer's byte size.

    Returns ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
    ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds the device
    buffers as ints in binding order, and ``stream`` is a new ``cuda.Stream``.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        element_count = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        element_type = trt.nptype(engine.get_binding_dtype(binding))

        host_buf = cuda.pagelocked_empty(element_count, element_type)
        device_buf = cuda.mem_alloc(host_buf.nbytes)
        bindings.append(int(device_buf))

        # File the pair under inputs or outputs depending on the binding kind.
        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_buf, device_buf))

    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Copy inputs to the device, execute, copy outputs back and synchronise.

    All copies and the execution are issued on ``stream``. Returns the list of
    host buffers of ``outputs`` after the stream has been synchronised.
    """
    # Host -> device for every input.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)

    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)

    # Device -> host for every output.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)

    # Wait until the copies and the execution queued above have finished.
    stream.synchronize()

    host_results = []
    for out in outputs:
        host_results.append(out.host)
    return host_results


if __name__ == "__main__":
    # Command line:
    #   -s            build the engine and write it to ENGINE_PATH
    #   -d            load ENGINE_PATH and run one inference on an all-ones input
    #   -t <type>     model variant for -s; "small" builds the small network,
    #                 any other value (or omitting -t) builds the large one
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", action='store_true')
    parser.add_argument("-d", action='store_true')
    parser.add_argument("-t", help='indicate small or large model')
    args = parser.parse_args()

    # Exactly one of -s / -d must be given.
    if not (args.s ^ args.d):
        print(
            "arguments not right!\n"
            "python mobilenet_v3.py -s -t small|large   # serialize model to plan file\n"
            "python mobilenet_v3.py -d   # deserialize plan file and run inference"
        )
        # Non-zero status so callers can detect the usage error.
        sys.exit(1)

    if args.s:
        API_to_model(BATCH_SIZE, args.t)
    else:
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime

        with open(ENGINE_PATH, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        # Dummy input: a flat float32 array of ones.
        data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32)
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        inputs[0].host = data

        trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

        # Show the first and last ten values of the first output.
        print(f'Output: \n{trt_outputs[0][:10]}\n{trt_outputs[0][-10:]}')


================================================
FILE: psenet/CMakeLists.txt
================================================
# Build script for the `psenet` executable (TensorRT + CUDA runtime + OpenCV).
cmake_minimum_required(VERSION 2.6)

project(PSENet)

# C++11 is requested here, again via CMAKE_CXX_STANDARD, and again in CMAKE_CXX_FLAGS below.
add_definitions(-std=c++11)

# NOTE(review): option() takes a help string as its second argument; here "OFF"
# occupies that position - confirm the variable really ends up disabled.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# NOTE(review): the build type is forced to Debug while -Ofast is added to
# CMAKE_CXX_FLAGS below - confirm which optimisation level is intended.
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need to adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)


# Compiler flags applied to every C++ source: warnings on, stop at the first error.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")


# OpenCV is looked up without REQUIRED, so configuration continues even if it is not found.
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Every header and source file in this directory is compiled into the executable.
file(GLOB SOURCE_FILES "*.h" "*.cpp")

add_executable(psenet ${SOURCE_FILES})
target_link_libraries(psenet nvinfer)
target_link_libraries(psenet cudart)
target_link_libraries(psenet ${OpenCV_LIBS})

# NOTE(review): these are compiler/linker options passed through add_definitions(),
# and they are declared after the target - confirm they take effect as intended.
add_definitions(-O2 -pthread)


================================================
FILE: psenet/README.md
================================================
# PSENet

**preprocessing + inference + postprocessing = 30ms** with fp32 on Tesla P40. 
The original Tensorflow implementation is [tensorflow_PSENet](https://github.com/liuheng92/tensorflow_PSENet). A TensorRT Python api implementation is [TensorRT-Python-PSENet](https://github.com/upczww/TensorRT-Python-PSENet).

## Key Features
- Generating `.wts` from `Tensorflow`.
- Dynamic batch and dynamic shape input.
- Object-Oriented Programming.
- Practice with C++ 11.


<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/105487078-821d6800-5cea-11eb-87dc-e3317a941763.jpeg">
</p>

## How to Run

* 1. generate .wts

  Download pretrained model from https://github.com/liuheng92/tensorflow_PSENet
  and put `model.ckpt.*` to `model` dir. Add a file `model/checkpoint` with content
    ```
    model_checkpoint_path: "model.ckpt"
    all_model_checkpoint_paths: "model.ckpt"
    ```
    Then run

    ```
    python gen_tf_wts.py
    ```
    which will generate a `psenet.wts`.
* 2. cmake and make

  ```
  mkdir build
  cd build
  cmake ..
  make
  ```
* 3. build engine and run detection
  ```
  cp ../psenet.wts ./
  cp ../test.jpg ./
  ./psenet -s  // serialize model to plan file
  ./psenet -d  // deserialize plan file and run inference
  ```

## Known Issues
None

## Todo

* use `ExponentialMovingAverage` weight.


================================================
FILE: psenet/gen_tf_wts.py
================================================
from sys import prefix
import tensorflow as tf
from tensorflow.python import pywrap_tensorflow
import numpy as np
import struct

# Directory that holds `model.ckpt.*` and the `checkpoint` index file.
model_dir = "model"

# Resolve the checkpoint path recorded in `<model_dir>/checkpoint`.
ckpt = tf.train.get_checkpoint_state(model_dir)
if ckpt is None:
    # get_checkpoint_state returns None when no checkpoint index is found;
    # fail with a readable message instead of an AttributeError below.
    raise RuntimeError(
        "No checkpoint found in '{}'; expected a 'checkpoint' file there.".format(model_dir)
    )
ckpt_path = ckpt.model_checkpoint_path

reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path)
param_dict = reader.get_variable_to_shape_map()

keys = param_dict.keys()

# Output format, one blob per line after a leading count line:
#   <name> <element count>  <hex float> <hex float> ...
# Each value is the big-endian IEEE-754 float32 bit pattern written as hex.
# The context manager guarantees the file is flushed and closed even if a
# tensor read fails part-way through.
with open(r"psenet.wts", "w") as f:
    f.write("{}\n".format(len(keys)))

    for key in keys:
        weight = reader.get_tensor(key)
        print(key, weight.shape)
        if len(weight.shape) == 4:
            # Move the last two axes to the front: (a, b, c, d) -> (d, c, a, b).
            weight = np.transpose(weight, (3, 2, 0, 1))
            print(weight.shape)
        weight = np.reshape(weight, -1)
        f.write("{} {} ".format(key, len(weight)))
        for w in weight:
            f.write(" ")
            f.write(struct.pack(">f", float(w)).hex())
        f.write("\n")

================================================
FILE: psenet/layers.cpp
================================================
#include "layers.h"

// Adds an inference-time batch normalisation to `network` as a single
// per-channel scale layer computing  y = x * scale + shift  with
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//
// network   : network definition the layer is added to.
// weightMap : loaded weights; entries `lname + "gamma"`, `"beta"`,
//             `"moving_mean"` and `"moving_variance"` are read.
// input     : tensor to normalise.
// lname     : key prefix of the batch-norm weights (callers pass a prefix
//             ending in '/').
// eps       : value added to the variance before the square root.
//
// The scale/shift/power buffers are allocated with malloc and must outlive
// this call until the engine is built. They are registered in `weightMap`
// under `lname + "trt_scale" / "trt_shift" / "trt_power"` so that the code
// that frees every entry of the map after the build also releases them
// (previously they were never freed).
IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps)
{
    float* gamma = (float*)weightMap[lname + "gamma"].values; // scale
    float* beta = (float*)weightMap[lname + "beta"].values;   // offset
    float* mean = (float*)weightMap[lname + "moving_mean"].values;
    float* var = (float*)weightMap[lname + "moving_variance"].values;
    int len = weightMap[lname + "moving_variance"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (auto i = 0; i < len; i++)
    {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
        pval[i] = 1.0;
    }
    Weights scale{ DataType::kFLOAT, scval, len };
    Weights shift{ DataType::kFLOAT, shval, len };
    Weights power{ DataType::kFLOAT, pval, len };

    // Hand ownership to the weight map so the buffers are freed with the rest
    // of the weights once the engine has been built.
    weightMap[lname + "trt_scale"] = scale;
    weightMap[lname + "trt_shift"] = shift;
    weightMap[lname + "trt_power"] = power;

    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

// Builds one residual bottleneck unit:
//   1x1 conv -> BN -> ReLU -> 3x3 conv (stride `stride`, pad 1) -> BN -> ReLU
//   -> 1x1 conv (ch * 4 outputs) -> BN, summed with a shortcut, then ReLU.
//
// network     : network definition the layers are added to.
// weightMap   : loaded weights, looked up under `lname + "conv{1,2,3}/..."`
//               and, for branch_type 1, `lname + "shortcut/..."`.
// input       : input tensor of the unit.
// ch          : channel count of the first two convolutions; the third
//               convolution and the projection shortcut output ch * 4.
// stride      : stride of the 3x3 convolution and of the projection shortcut.
// lname       : key prefix of this unit's weights.
// branch_type : 0 = identity shortcut,
//               1 = 1x1 conv + BN shortcut,
//               anything else = 1x1 max-pool with stride 2 as shortcut.
//
// Returns the final ReLU layer. Every layer pointer is asserted before it is
// first dereferenced.
IActivationLayer* bottleneck(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int ch, int stride, std::string lname, int branch_type)
{

    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    IConvolutionLayer* conv1 = network->addConvolutionNd(input, ch, DimsHW{ 1, 1 }, weightMap[lname + "conv1/weights"], emptywts);
    assert(conv1);

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "conv1/BatchNorm/", 1e-5);
    assert(bn1);

    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), ch, DimsHW{ 3, 3 }, weightMap[lname + "conv2/weights"], emptywts);
    assert(conv2); // check before configuring the layer
    conv2->setStrideNd(DimsHW{ stride, stride });
    conv2->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "conv2/BatchNorm/", 1e-5);
    assert(bn2);

    IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(relu2);

    IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), ch * 4, DimsHW{ 1, 1 }, weightMap[lname + "conv3/weights"], emptywts);
    assert(conv3);

    IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "conv3/BatchNorm/", 1e-5);
    assert(bn3);
    IElementWiseLayer* ew1;
    // branch_type 0:shortcut,1:conv+bn+shortcut,2:maxpool+shortcut
    if (branch_type == 0)
    {
        ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM);
        assert(ew1);
    }
    else if (branch_type == 1)
    {
        IConvolutionLayer* conv4 = network->addConvolutionNd(input, ch * 4, DimsHW{ 1, 1 }, weightMap[lname + "shortcut/weights"], emptywts);
        assert(conv4); // check before configuring the layer
        conv4->setStrideNd(DimsHW{ stride, stride });
        IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "shortcut/BatchNorm/", 1e-5);
        assert(bn4);
        ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
        assert(ew1);
    }
    else
    {
        // 1x1 max-pool with stride 2: subsamples the input so it matches the
        // main path of a unit whose 3x3 convolution is strided.
        IPoolingLayer* pool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{ 1, 1 });
        assert(pool); // check before configuring the layer
        pool->setStrideNd(DimsHW{ 2, 2 });
        ew1 = network->addElementWise(*pool->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
        assert(ew1);
    }
    IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU);
    assert(relu3);
    return relu3;
}

// Adds a biased convolution followed by a ReLU activation.
//
// network   : network definition the layers are added to.
// weightMap : loaded weights; `lname + "weights"` and `lname + "biases"`
//             are used as kernel and bias.
// input     : input tensor.
// outch     : number of output feature maps of the convolution. The value was
//             previously ignored in favour of a hard-coded 256; all existing
//             callers pass 256, so honouring it does not change them.
// kernel    : square kernel size; a 3x3 kernel gets padding 1, other sizes
//             get no explicit padding.
// stride    : stride applied in both spatial dimensions.
// lname     : key prefix of the layer's weights.
//
// Returns the ReLU layer.
IActivationLayer* addConvRelu(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int kernel, int stride, std::string lname)
{
    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ kernel, kernel }, weightMap[lname + "weights"], weightMap[lname + "biases"]);
    assert(conv); // check before configuring the layer
    conv->setStrideNd(DimsHW{ stride, stride });
    if (kernel == 3)
    {
        conv->setPaddingNd(DimsHW{ 1, 1 });
    }

    IActivationLayer* ac = network->addActivation(*conv->getOutput(0), ActivationType::kRELU);
    assert(ac);
    return ac;
}

================================================
FILE: psenet/layers.h
================================================
#ifndef TENSORRTX_LAYERS_H
#define TENSORRTX_LAYERS_H

#include <map>
#include <math.h>
#include <assert.h>

#include "NvInfer.h"
#include "cuda_runtime_api.h"
using namespace nvinfer1;

// Adds a batch-normalisation step to `network`, expressed as one per-channel
// scale layer built from the `lname`-prefixed gamma/beta/moving_mean/
// moving_variance entries of `weightMap`; `eps` is added to the variance
// before the square root. Returns the scale layer.
IScaleLayer *addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, std::string lname, float eps);

// Adds one residual bottleneck unit (three conv+BN stages, a shortcut chosen
// by `branch_type`, element-wise sum, ReLU) and returns the final ReLU layer.
// branch_type: 0 = identity shortcut, 1 = conv+BN shortcut, otherwise a
// max-pool shortcut.
IActivationLayer *bottleneck(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, int ch, int stride, std::string lname, int branch_type);

// Adds a biased convolution (`lname + "weights"` / `lname + "biases"`) with the
// given square kernel size and stride, followed by ReLU. Returns the ReLU layer.
IActivationLayer *addConvRelu(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, int outch, int kernel, int stride, std::string lname);

#endif


================================================
FILE: psenet/main.cpp
================================================
#include "psenet.h"

// Command-line entry point.
//   ./psenet -s   build the network and write the serialized engine
//   ./psenet -d   load the engine and run detection on test.jpg ten times
// Any other invocation prints usage to stderr and returns -1.
int main(int argc, char** argv)
{
    PSENet psenet(1200, 640, 0.90, 6, 4);

    // Exactly one argument is accepted; anything else leaves `mode` empty and
    // falls through to the usage message.
    const std::string mode = (argc == 2) ? std::string(argv[1]) : std::string();

    if (mode == "-s")
    {
        std::cout << "Serializling Engine" << std::endl;
        psenet.serializeEngine();
        return 0;
    }

    if (mode == "-d")
    {
        psenet.init();

        // The same image is processed repeatedly.
        const std::vector<std::string> files(10, "test.jpg");
        for (size_t idx = 0; idx < files.size(); ++idx)
        {
            std::cout << "Detect " << files[idx] << std::endl;
            psenet.detect(files[idx]);
        }
        return 0;
    }

    std::cerr << "arguments not right!" << std::endl;
    std::cerr << "./psenet -s  // serialize model to plan file" << std::endl;
    std::cerr << "./psenet -d  // deserialize plan file and run inference" << std::endl;
    return -1;
}


================================================
FILE: psenet/psenet.cpp
================================================
#include "psenet.h"
#include <string>
#include <queue>
#define MAX_INPUT_SIZE 1200
#define MIN_INPUT_SIZE 128
#define OPT_INPUT_W 640
#define OPT_INPUT_H 640

// Stores the pre-/post-processing parameters; no TensorRT objects are created
// here (see init() and serializeEngine()).
//   max_side_len : upper bound used when scaling the input image.
//   min_side_len : lower bound used when scaling the input image.
//   threshold    : binarisation threshold applied to the network output.
//   num_kernel   : number of output maps consumed by post-processing.
//   stride       : ratio between input resolution and output-map resolution.
PSENet::PSENet(int max_side_len, int min_side_len, float threshold, int num_kernel, int stride)
    : max_side_len_(max_side_len),
      min_side_len_(min_side_len),
      post_threshold_(threshold),
      num_kernels_(num_kernel),
      stride_(stride)
{
}

// Nothing to release explicitly: the TensorRT objects are held by shared_ptr
// members with their own deleter.
PSENet::~PSENet() {}

// create the engine using only the API and not any parser.
//
// Loads the weights from ./psenet.wts, builds the network layer by layer
// (ResNet-style backbone, feature-pyramid merge, two-convolution head with a
// sigmoid), registers one optimization profile for the dynamic input height
// and width, and builds the engine.
//
// builder : builder used to create the network, the profile and the engine.
// config  : builder configuration; receives the profile, the workspace limit
//           and, when USE_FP16 is defined, the FP16 flag.
//
// Returns the engine produced by buildEngineWithConfig; the caller checks it
// for null and owns it. The network definition and the weights loaded here are
// released before returning.
ICudaEngine* PSENet::createEngine(IBuilder* builder, IBuilderConfig* config)
{
    std::map<std::string, Weights> weightMap = loadWeights("./psenet.wts");
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    // Batch, height and width are dynamic (-1); only the channel count is fixed.
    ITensor* data = network->addInput(input_name_, dt, Dims4{ -1, 3, -1, -1 });
    assert(data);

    // Stem: 7x7 stride-2 convolution, batch norm, ReLU.
    IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{ 7, 7 }, weightMap["resnet_v1_50/conv1/weights"], emptywts);
    conv1->setStrideNd(DimsHW{ 2, 2 });
    conv1->setPaddingNd(DimsHW{ 3, 3 });
    assert(conv1);

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "resnet_v1_50/conv1/BatchNorm/", 1e-5);
    assert(bn1);
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    // C2
    // 3x3 stride-2 max-pool with asymmetric padding: none before, one after.
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 });
    pool1->setStrideNd(DimsHW{ 2, 2 });
    pool1->setPrePadding(DimsHW{ 0, 0 });
    pool1->setPostPadding(DimsHW{ 1, 1 });
    assert(pool1);

    IActivationLayer* x;

    // Backbone blocks. In each block the first unit uses a conv shortcut (1),
    // middle units an identity shortcut (0); in blocks 1-3 the last unit is
    // strided with a max-pool shortcut (2).
    x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 1, "resnet_v1_50/block1/unit_1/bottleneck_v1/", 1);
    x = bottleneck(network, weightMap, *x->getOutput(0), 64, 1, "resnet_v1_50/block1/unit_2/bottleneck_v1/", 0);
    // C3
    IActivationLayer* block1 = bottleneck(network, weightMap, *x->getOutput(0), 64, 2, "resnet_v1_50/block1/unit_3/bottleneck_v1/", 2);

    x = bottleneck(network, weightMap, *block1->getOutput(0), 128, 1, "resnet_v1_50/block2/unit_1/bottleneck_v1/", 1);
    x = bottleneck(network, weightMap, *x->getOutput(0), 128, 1, "resnet_v1_50/block2/unit_2/bottleneck_v1/", 0);
    x = bottleneck(network, weightMap, *x->getOutput(0), 128, 1, "resnet_v1_50/block2/unit_3/bottleneck_v1/", 0);
    // C4
    IActivationLayer* block2 = bottleneck(network, weightMap, *x->getOutput(0), 128, 2, "resnet_v1_50/block2/unit_4/bottleneck_v1/", 2);

    x = bottleneck(network, weightMap, *block2->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_1/bottleneck_v1/", 1);
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_2/bottleneck_v1/", 0);
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_3/bottleneck_v1/", 0);
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_4/bottleneck_v1/", 0);
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_5/bottleneck_v1/", 0);
    IActivationLayer* block3 = bottleneck(network, weightMap, *x->getOutput(0), 256, 2, "resnet_v1_50/block3/unit_6/bottleneck_v1/", 2);

    x = bottleneck(network, weightMap, *block3->getOutput(0), 512, 1, "resnet_v1_50/block4/unit_1/bottleneck_v1/", 1);
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 1, "resnet_v1_50/block4/unit_2/bottleneck_v1/", 0);
    // C5
    IActivationLayer* block4 = bottleneck(network, weightMap, *x->getOutput(0), 512, 1, "resnet_v1_50/block4/unit_3/bottleneck_v1/", 0);

    // Feature pyramid, top-down: each level is a 1x1 reduction of a backbone
    // output plus the coarser level resized (nearest) to the same shape, then
    // smoothed by a 3x3 convolution.
    IActivationLayer* build_p5_r1 = addConvRelu(network, weightMap, *block4->getOutput(0), 256, 1, 1, "build_feature_pyramid/build_P5/");
    assert(build_p5_r1);
    IActivationLayer* build_p4_r1 = addConvRelu(network, weightMap, *block2->getOutput(0), 256, 1, 1, "build_feature_pyramid/build_P4/reduce_dimension/");
    assert(build_p4_r1);

    // The resize target is supplied as a shape tensor (input 1), so it follows
    // the dynamic input size at run time.
    IResizeLayer* bfp_layer4_resize = network->addResize(*build_p5_r1->getOutput(0));
    auto build_p4_r1_shape = network->addShape(*build_p4_r1->getOutput(0))->getOutput(0);
    bfp_layer4_resize->setInput(1, *build_p4_r1_shape);
    bfp_layer4_resize->setResizeMode(ResizeMode::kNEAREST);
    bfp_layer4_resize->setAlignCorners(false);
    assert(bfp_layer4_resize);

    IElementWiseLayer* bfp_add = network->addElementWise(*bfp_layer4_resize->getOutput(0), *build_p4_r1->getOutput(0), ElementWiseOperation::kSUM);
    assert(bfp_add);

    IActivationLayer* build_p4_r2 = addConvRelu(network, weightMap, *bfp_add->getOutput(0), 256, 3, 1, "build_feature_pyramid/build_P4/avoid_aliasing/");
    assert(build_p4_r2);

    IActivationLayer* build_p3_r1 = addConvRelu(network, weightMap, *block1->getOutput(0), 256, 1, 1, "build_feature_pyramid/build_P3/reduce_dimension/");
    assert(build_p3_r1);

    IResizeLayer* bfp_layer3_resize = network->addResize(*build_p4_r2->getOutput(0));
    bfp_layer3_resize->setResizeMode(ResizeMode::kNEAREST);
    auto build_p3_r1_shape = network->addShape(*build_p3_r1->getOutput(0))->getOutput(0);
    bfp_layer3_resize->setInput(1, *build_p3_r1_shape);
    bfp_layer3_resize->setAlignCorners(false);
    assert(bfp_layer3_resize);
    IElementWiseLayer* bfp_add1 = network->addElementWise(*bfp_layer3_resize->getOutput(0), *build_p3_r1->getOutput(0), ElementWiseOperation::kSUM);
    assert(bfp_add1);

    IActivationLayer* build_p3_r2 = addConvRelu(network, weightMap, *bfp_add1->getOutput(0), 256, 3, 1, "build_feature_pyramid/build_P3/avoid_aliasing/");
    assert(build_p3_r2);

    IActivationLayer* build_p2_r1 = addConvRelu(network, weightMap, *pool1->getOutput(0), 256, 1, 1, "build_feature_pyramid/build_P2/reduce_dimension/");
    assert(build_p2_r1);
    IResizeLayer* bfp_layer2_resize = network->addResize(*build_p3_r2->getOutput(0));
    bfp_layer2_resize->setResizeMode(ResizeMode::kNEAREST);
    auto build_p2_r1_shape = network->addShape(*build_p2_r1->getOutput(0))->getOutput(0);
    bfp_layer2_resize->setInput(1, *build_p2_r1_shape);
    bfp_layer2_resize->setAlignCorners(false);
    assert(bfp_layer2_resize);
    IElementWiseLayer* bfp_add2 = network->addElementWise(*bfp_layer2_resize->getOutput(0), *build_p2_r1->getOutput(0), ElementWiseOperation::kSUM);
    assert(bfp_add2);

    // P2
    IActivationLayer* build_p2_r2 = addConvRelu(network, weightMap, *bfp_add2->getOutput(0), 256, 3, 1, "build_feature_pyramid/build_P2/avoid_aliasing/");
    assert(build_p2_r2);
    // All coarser pyramid levels are resized (linear) to the shape of P2
    // before concatenation.
    auto build_p2_r2_shape = network->addShape(*build_p2_r2->getOutput(0))->getOutput(0);
    // P3 x2
    IResizeLayer* layer1_resize = network->addResize(*build_p3_r2->getOutput(0));
    layer1_resize->setResizeMode(ResizeMode::kLINEAR);
    layer1_resize->setInput(1, *build_p2_r2_shape);
    layer1_resize->setAlignCorners(false);
    assert(layer1_resize);

    // P4 x4
    IResizeLayer* layer2_resize = network->addResize(*build_p4_r2->getOutput(0));
    layer2_resize->setResizeMode(ResizeMode::kLINEAR);
    layer2_resize->setInput(1, *build_p2_r2_shape);
    layer2_resize->setAlignCorners(false);
    assert(layer2_resize);

    // P5 x8
    IResizeLayer* layer3_resize = network->addResize(*build_p5_r1->getOutput(0));
    layer3_resize->setResizeMode(ResizeMode::kLINEAR);
    layer3_resize->setInput(1, *build_p2_r2_shape);
    layer3_resize->setAlignCorners(false);
    assert(layer3_resize);

    // C(P5,P4,P3,P2)
    ITensor* inputTensors[] = { layer3_resize->getOutput(0), layer2_resize->getOutput(0), layer1_resize->getOutput(0), build_p2_r2->getOutput(0) };

    IConcatenationLayer* concat = network->addConcatenation(inputTensors, 4);
    assert(concat);

    // Head: 3x3 conv + BN + ReLU, then a 1x1 conv producing 6 maps and a sigmoid.
    IConvolutionLayer* feature_result_conv = network->addConvolutionNd(*concat->getOutput(0), 256, DimsHW{ 3, 3 }, weightMap["feature_results/Conv/weights"], emptywts);
    feature_result_conv->setPaddingNd(DimsHW{ 1, 1 });
    assert(feature_result_conv);

    IScaleLayer* feature_result_bn = addBatchNorm2d(network, weightMap, *feature_result_conv->getOutput(0), "feature_results/Conv/BatchNorm/", 1e-5);
    assert(feature_result_bn);

    IActivationLayer* feature_result_relu = network->addActivation(*feature_result_bn->getOutput(0), ActivationType::kRELU);
    assert(feature_result_relu);
    IConvolutionLayer* feature_result_conv_1 = network->addConvolutionNd(*feature_result_relu->getOutput(0), 6, DimsHW{ 1, 1 }, weightMap["feature_results/Conv_1/weights"], weightMap["feature_results/Conv_1/biases"]);
    assert(feature_result_conv_1);

    IActivationLayer* sigmoid = network->addActivation(*feature_result_conv_1->getOutput(0), ActivationType::kSIGMOID);
    assert(sigmoid);

    sigmoid->getOutput(0)->setName(output_name_);
    std::cout << "Set name out" << std::endl;
    network->markOutput(*sigmoid->getOutput(0));

    // Set profile
    // Batch is pinned to 1; height and width may range from MIN_INPUT_SIZE to
    // MAX_INPUT_SIZE, with OPT_INPUT_H x OPT_INPUT_W as the tuning target.
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions(input_name_, OptProfileSelector::kMIN, Dims4(1, 3, MIN_INPUT_SIZE, MIN_INPUT_SIZE));
    profile->setDimensions(input_name_, OptProfileSelector::kOPT, Dims4(1, 3, OPT_INPUT_H, OPT_INPUT_W));
    profile->setDimensions(input_name_, OptProfileSelector::kMAX, Dims4(1, 3, MAX_INPUT_SIZE, MAX_INPUT_SIZE));
    config->addOptimizationProfile(profile);

    // Build engine
    config->setMaxWorkspaceSize(1 << 30); // 1G
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    ;
    std::cout << "Build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
    }
    return engine;
}

// Builds the engine and writes its serialized form to ./psenet.engine.
//
// If the output file cannot be opened an error is printed and nothing is
// written. The builder, builder config, engine and serialized blob are all
// destroyed before returning on every path (previously none of them were
// released).
void PSENet::serializeEngine()
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);
    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(builder, config);
    assert(engine != nullptr);

    // Serialize the engine
    IHostMemory* modelStream = engine->serialize();
    assert(modelStream != nullptr);

    std::ofstream p("./psenet.engine", std::ios::binary | std::ios::out);
    if (!p)
    {
        std::cerr << "Could not open plan output file" << std::endl;
    }
    else
    {
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    }

    // Release TensorRT objects; the serialized blob has already been copied
    // to the file (or could not be written at all).
    modelStream->destroy();
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Reads ./psenet.engine and deserializes it into mCudaEngine using mRuntime
// (which must already have been created).
//
// If the file cannot be opened an error is printed and mCudaEngine is left
// unchanged. The file contents are held in a std::vector so the temporary
// buffer is released automatically once deserialization has finished
// (the previous raw `new char[]` buffer was never freed).
void PSENet::deserializeEngine()
{
    std::ifstream file("./psenet.engine", std::ios::binary | std::ios::in);
    if (!file.good())
    {
        std::cerr << "Could not open plan file ./psenet.engine" << std::endl;
        return;
    }

    // Determine the file size, then read the whole plan into memory.
    file.seekg(0, file.end);
    const size_t size = file.tellg();
    file.seekg(0, file.beg);
    std::vector<char> trtModelStream(size);
    file.read(trtModelStream.data(), size);
    file.close();

    mCudaEngine = std::shared_ptr<nvinfer1::ICudaEngine>(mRuntime->deserializeCudaEngine(trtModelStream.data(), size), InferDeleter());
    assert(mCudaEngine != nullptr);
}

// Runs one synchronous inference.
//
// context : execution context of the deserialized engine.
// input   : host buffer with 3 * input_h * input_w floats.
// output  : host buffer receiving input_h * input_w * 6 / 16 floats.
// input_h, input_w : spatial size of this input; set as the dynamic binding
//                    dimensions before execution.
//
// Device buffers and the CUDA stream are created and released inside the call.
// A rejected input shape, a failed enqueue or any CUDA error aborts the
// process instead of silently returning an unwritten output buffer.
void PSENet::inferenceOnce(IExecutionContext& context, float* input, float* output, int input_h, int input_w)
{
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(input_name_);
    const int outputIndex = engine.getBindingIndex(output_name_);

    // The shape must lie inside the optimization profile the engine was built with.
    if (!context.setBindingDimensions(inputIndex, Dims4(1, 3, input_h, input_w)))
    {
        std::cerr << "Input shape 1x3x" << input_h << "x" << input_w << " was rejected by the engine" << std::endl;
        abort();
    }

    // Byte sizes, computed in size_t to avoid int overflow on large inputs.
    const size_t input_size = static_cast<size_t>(3) * input_h * input_w * sizeof(float);
    const size_t output_size = static_cast<size_t>(input_h) * input_w * 6 / 16 * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], input_size));
    CHECK(cudaMalloc(&buffers[outputIndex], output_size));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, input_size, cudaMemcpyHostToDevice, stream));
    if (!context.enqueueV2(buffers, stream, nullptr))
    {
        std::cerr << "Failed to enqueue inference" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Prepares the object for inference: creates the TensorRT runtime,
// deserializes the engine from disk, creates an execution context and selects
// optimization profile 0. Must be called before detect().
void PSENet::init()
{
    mRuntime = std::shared_ptr<nvinfer1::IRuntime>(createInferRuntime(gLogger), InferDeleter());
    assert(mRuntime != nullptr);

    std::cout << "Deserialize Engine" << std::endl;
    deserializeEngine();
    // deserializeEngine() leaves mCudaEngine empty when the engine file cannot
    // be read; stop here rather than dereferencing a null engine below.
    assert(mCudaEngine != nullptr && "Engine not loaded; run with -s first to create ./psenet.engine");

    mContext = std::shared_ptr<nvinfer1::IExecutionContext>(mCudaEngine->createExecutionContext(), InferDeleter());
    assert(mContext != nullptr);

    mContext->setOptimizationProfile(0);

    std::cout << "Finished init" << std::endl;
}
// Runs the full pipeline on one image file: read, pre-process, infer,
// post-process, draw the detected boxes and write the annotated image to
// "result_" + image_path. Prints the elapsed time of pre-processing through
// drawing in milliseconds.
//
// image_path : path of the image to read. If the image cannot be read, an
//              error is printed and the call returns without running inference.
void PSENet::detect(std::string image_path)
{
    // Run inference
    cv::Mat image = cv::imread(image_path);
    if (image.empty())
    {
        std::cerr << "Could not read image " << image_path << std::endl;
        return;
    }
    int resize_h, resize_w;
    float ratio_h, ratio_w;

    auto start = std::chrono::system_clock::now();

    float* input = preProcess(image, resize_h, resize_w, ratio_h, ratio_w);
    float* output = new float[resize_h * resize_w * 6 / 16];

    inferenceOnce(*mContext, input, output, resize_h, resize_w);

    std::vector<cv::RotatedRect> boxes = postProcess(output, resize_h, resize_w);
    drawRects(image, boxes, stride_, ratio_h, ratio_w, 1.0);
    auto end = std::chrono::system_clock::now();

    cv::imwrite("result_" + image_path, image);

    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    // Both buffers were allocated with new[], so they must be released with delete[].
    delete[] input;
    delete[] output;
}

// Converts an image into the planar float input of the network.
//
// Steps: BGR->RGB conversion, scaling so the sides respect max_side_len_ /
// min_side_len_, rounding each side up to a multiple of 32, resizing,
// per-channel mean subtraction and HWC -> CHW split.
//
// image    : source image (as loaded by cv::imread in detect()).
// resize_h : [out] height of the network input.
// resize_w : [out] width of the network input.
// ratio_h  : [out] resize_h divided by the original height.
// ratio_w  : [out] resize_w divided by the original width.
//
// Returns a buffer of 3 * resize_h * resize_w floats allocated with new[];
// the caller owns it and must release it with delete[].
float* PSENet::preProcess(cv::Mat image, int& resize_h, int& resize_w, float& ratio_h, float& ratio_w)
{
    cv::Mat imageRGB;
    cv::cvtColor(image, imageRGB, cv::COLOR_BGR2RGB);
    cv::Mat imageProcessed;
    int h = imageRGB.size().height;
    int w = imageRGB.size().width;
    resize_w = w;
    resize_h = h;

    float ratio = 1.0;
    // limit the max side and min side
    if (resize_h > max_side_len_ || resize_w > max_side_len_)
    {
        if (resize_h > resize_w)
            ratio = float(max_side_len_) / float(resize_h);
        else
            ratio = float(max_side_len_) / float(resize_w);
    }
    if (resize_h < min_side_len_ || resize_w < min_side_len_)
    {
        if (resize_h < resize_w)
            ratio = float(min_side_len_) / float(resize_h);
        else
            ratio = float(min_side_len_) / float(resize_w);
    }
    resize_h = int(resize_h * ratio);
    resize_w = int(resize_w * ratio);

    // Round each side up to the next multiple of 32.
    if (resize_h % 32 != 0)
        resize_h = (resize_h / 32 + 1) * 32;
    if (resize_w % 32 != 0)
        resize_w = (resize_w / 32 + 1) * 32;

    // Rounding up (or the min-side rescale) can push a side past max_side_len_,
    // e.g. 1200 -> 1216, which is larger than the maximum the engine profile
    // accepts when max_side_len_ equals that maximum. Clamp to the largest
    // multiple of 32 that does not exceed max_side_len_. The aspect ratio may
    // change slightly; ratio_h / ratio_w below account for that per axis.
    const int max_aligned = max_side_len_ / 32 * 32;
    if (max_aligned >= 32)
    {
        if (resize_h > max_aligned)
            resize_h = max_aligned;
        if (resize_w > max_aligned)
            resize_w = max_aligned;
    }

    ratio_h = resize_h / float(h);
    ratio_w = resize_w / float(w);

    cv::resize(imageRGB, imageProcessed, cv::Size(resize_w, resize_h));
    float* input = new float[3 * resize_h * resize_w];
    cv::Mat imgFloat;
    imageProcessed.convertTo(imgFloat, CV_32FC3);
    cv::subtract(imgFloat, cv::Scalar(123.68, 116.78, 103.94), imgFloat, cv::noArray(), -1);
    // Wrap the three planes of `input` in Mat headers of the right size and
    // type so that cv::split writes the channels straight into the buffer.
    std::vector<cv::Mat> chw;
    for (auto i = 0; i < 3; ++i)
        chw.emplace_back(cv::Mat(cv::Size(resize_w, resize_h), CV_32FC1, input + i * resize_w * resize_h));
    cv::split(imgFloat, chw);
    return input;
}

std::vector<cv::RotatedRect> PSENet::postProcess(float* origin_output, int resize_h, int resize_w)
{
    // BxCxHxW  S0 ===> S5  small ===> large
    const int h = resize_h / stride_;
    const int w = resize_w / stride_;
    const int length = h * w;
    // get kernels, sequence: 0->n, max -> min
    std::vector<cv::Mat> kernels(num_kernels_);
    for (auto i = num_kernels_ - 1; i >= 0; --i)
    {
        cv::Mat tmp_kernel(h, w, CV_32FC1, (void*)(origin_output + i * length), 0);
        cv::threshold(tmp_kernel, tmp_kernel, post_threshold_, 255, cv::THRESH_BINARY);
        tmp_kernel.convertTo(tmp_kernel, CV_8UC1);
        assert(tmp_kernel.rows == h && tmp_kernel.cols == w);
        kernels[num_kernels_ - 1 - i] = tmp_kernel;
    }
    cv::Mat stats, centroids, label_image;
    int label_num = cv::connectedComponents(kernels[num_kernels_ - 1], label_image, 4);

    label_image.convertTo(label_image, CV_8U);
    assert(label_image.rows == h && label_image.cols == w);

    cv::Mat out = cv::Mat::zeros(h, w, CV_8UC1);
    std::queue<std::tuple<int, int, int>> q;
    std::queue<std::tuple<int, int, int>> next_q;
    for (int i = 0; i < h; i++)
    {
        for (int j = 0; j < w; j++)
        {
            auto label = *label_image.ptr(i, j);
            if (label > 0)
            {
                q.push(std::make_tuple(i, j, label));
                *out.ptr(i, j) = label;
            }
        }
    }

    int dx[4] = { -1, 1, 0, 0 };
    int dy[4] = { 0, 0, -1, 1 };
    for (int i = num_kernels_ - 2; i >= 0; i--)
    {
        //get each kernels
        auto kernel = kernels[i];
        while (!q.empty())
        {
            //get each queue menber in q
            auto q_n = q.front();
            q.pop();
            int y = std::get<0>(q_n); //i
            int x = std::get<1>(q_n); //j
            int l = std::get<2>(q_n); //label
            //store the edge pixel after one expansion
            bool is_edge = true;
            for (int idx = 0; idx < 4; idx++)
            {
                int index_y = y + dy[idx];
                int index_x = x + dx[idx];
                if (index_y < 0 || index_y >= h || index_x < 0 || index_x >= w)
                    continue;
                if (!*kernel.ptr(index_y, index_x) || *out.ptr(index_y, index_x) > 0)
                    continue;
                q.push(std::make_tuple(index_y, index_x, l));
                *out.ptr(index_y, index_x) = l;
                is_edge = false;
            }
            if (is_edge)
            {
                next_q.push(std::make_tuple(y, x, l));
            }
        }
        std::swap(q, next_q);
    }
    std::vector<cv::RotatedRect> boxes;
    for (auto n = 1; n < label_num; ++n)
    {
        std::vector<cv::Point> points;
        cv::findNonZero(out == n, points);
        cv::Mat fuck = out == n;
        cv::RotatedRect rect = cv::minAreaRect(points);
        boxes.emplace_back(rect);
    }
    return boxes;
}


================================================
FILE: psenet/psenet.h
================================================
#ifndef TENSORRTX_PSENET_H
#define TENSORRTX_PSENET_H
#include <memory>
#include <vector>
#include <chrono>
#include <opencv2/opencv.hpp>
#include "utils.h"
#include "layers.h"
// PSENet text detector built on TensorRT.
//
// Typical use: call serializeEngine() once to build and store the engine,
// then init() followed by detect() for each image.
class PSENet
{
public:
	// Stores the pre-/post-processing parameters; see the private members below.
	PSENet(int max_side_len, int min_side_len, float threshold, int num_kernel, int stride);
	~PSENet();

	// Builds the network from ./psenet.wts and returns the built engine.
	ICudaEngine* createEngine(IBuilder* builder, IBuilderConfig* config);
	// Builds the engine and writes it to ./psenet.engine.
	void serializeEngine();
	// Loads ./psenet.engine into mCudaEngine (requires mRuntime).
	void deserializeEngine();
	// Creates runtime, engine and execution context for inference.
	void init();
	// Runs one inference; input/output are host buffers.
	void inferenceOnce(IExecutionContext& context, float* input, float* output, int input_h, int input_w);
	// Full pipeline on one image file; writes "result_" + image_path.
	void detect(std::string image_path);
	// Converts an image to the planar float network input; returns a new[] buffer
	// and reports the chosen input size and scale ratios through the references.
	float* preProcess(cv::Mat image, int& resize_h, int& resize_w, float& ratio_h, float& ratio_w);
	// Converts the raw network output into rotated boxes.
	std::vector<cv::RotatedRect> postProcess(float* origin_output, int resize_h, int resize_w);

private:
	Logger gLogger;
	// TensorRT objects, released through InferDeleter.
	std::shared_ptr<nvinfer1::IRuntime> mRuntime;
	std::shared_ptr<nvinfer1::ICudaEngine> mCudaEngine;
	std::shared_ptr<nvinfer1::IExecutionContext> mContext;
	// Data type of the network input.
	DataType dt = DataType::kFLOAT;
	// Names of the input and output tensors of the network.
	const char* input_name_ = "input";
	const char* output_name_ = "output";
	// The defaults below are overwritten by the constructor arguments.
	int max_side_len_ = 1024;
	int min_side_len_ = 640;
	// Threshold used to binarise the output maps in postProcess().
	float post_threshold_ = 0.9;
	// Number of output maps read by postProcess().
	int num_kernels_ = 6;
	// Ratio between network input size and output-map size.
	int stride_ = 4;
};

#endif // TENSORRTX_PSENET_H


================================================
FILE: psenet/utils.cpp
================================================
#include "utils.h"

// Load weights from files shared with TensorRT samples.
// TensorRT weight files have a simple space delimited format:
// [type] [size] <data x size in hex>
//
// The first token of the file is the number of blobs. Each blob is a name, a
// decimal element count and that many 32-bit values written in hex.
//
// file : path of the .wts file.
//
// Returns a map from blob name to Weights (type kFLOAT). Every `values`
// pointer is allocated with malloc; the caller is responsible for freeing it.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::cout << "Model weight is large, it will take some time." << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size;

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob
        // Allocate one 32-bit slot per element. (sizeof(val) would be the size
        // of the pointer and over-allocate every blob.)
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }
    std::cout << "Finish load weight" << std::endl;
    return weightMap;
}

// Returns a copy of `inBox` with the same centre and angle whose width and
// height are multiplied by `ratio`.
//
// The size is first converted to an integer cv::Size, and each scaled
// dimension is converted back to int, so the result has whole-number sides.
cv::RotatedRect expandBox(const cv::RotatedRect& inBox, float ratio)
{
    const cv::Size intSize = inBox.size;
    const int scaledWidth = int(intSize.width * ratio);
    const int scaledHeight = int(intSize.height * ratio);
    const cv::Size scaledSize(scaledWidth, scaledHeight);
    return cv::RotatedRect(inBox.center, scaledSize, inBox.angle);
}


// Draws the outline of every box onto `image`.
//
// image        : image drawn on, modified in place.
// boxes        : rectangles in output-map coordinates.
// stride       : factor from output-map to network-input coordinates.
// ratio_h/w    : network-input size divided by original image size, per axis.
// expand_ratio : factor applied to each box's size via expandBox before drawing.
//
// Each corner is mapped as  coordinate / ratio * stride  and truncated to int.
void drawRects(cv::Mat& image, std::vector<cv::RotatedRect> boxes, float stride, float ratio_h, float ratio_w, float expand_ratio)
{
    // Maps one corner from output-map coordinates to image pixels.
    auto toImagePoint = [&](const cv::Point2f& corner) {
        return cv::Point{ int(corner.x / ratio_w * stride), int(corner.y / ratio_h * stride) };
    };

    const cv::Scalar lineColor(0, 0, 255);
    cv::Point2f corners[4];
    for (const cv::RotatedRect& box : boxes)
    {
        const cv::RotatedRect grown = expandBox(box, expand_ratio);
        grown.points(corners);
        // Connect each corner to the next one, wrapping around at the end.
        for (int k = 0; k < 4; ++k)
        {
            const cv::Point from = toImagePoint(corners[k]);
            const cv::Point to = toImagePoint(corners[(k + 1) % 4]);
            cv::line(image, from, to, lineColor, 2, 8);
        }
    }
}


================================================
FILE: psenet/utils.h
================================================
#ifndef TENSORRTX_UTILS_H
#define TENSORRTX_UTILS_H

#include <map>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "assert.h"
#include <fstream>

using namespace nvinfer1;

// Reads a .wts weight file and returns a map from blob name to Weights.
std::map<std::string, Weights> loadWeights(const std::string file);

// Returns `inBox` with its width and height scaled by `ratio` (centre and
// angle unchanged).
cv::RotatedRect expandBox(const cv::RotatedRect& inBox, float ratio = 1.0);

// Draws the outlines of `boxes` on `image`, mapping box coordinates to image
// pixels with `stride` and the per-axis resize ratios.
void drawRects(cv::Mat& image, std::vector<cv::RotatedRect> boxes, float stride, float ratio_h, float ratio_w, float expand_ratio);

// NOTE(review): no definition of renderSegment is visible alongside the other
// utilities; confirm it is implemented somewhere or remove the declaration.
cv::Mat renderSegment(cv::Mat image, const cv::Mat& mask);

// <============== Operator =============>
// Deleter functor for smart pointers holding objects that are released by
// calling their destroy() member. A null pointer is ignored.
struct InferDeleter
{
    template <typename T>
    void operator()(T* obj) const
    {
        if (obj == nullptr)
        {
            return;
        }
        obj->destroy();
    }
};

// Evaluates `status` exactly once; if the result is non-zero, prints
// "Cuda failure: <value>" to std::cout and aborts the process.
// Wrapped in do { } while (0) so it behaves as a single statement.
#define CHECK(status)                             \
    do                                            \
    {                                             \
        auto ret = (status);                      \
        if (ret != 0)                             \
        {                                         \
            std::cout << "Cuda failure: " << ret; \
            abort();                              \
        }                                         \
    } while (0)

// Logger for TensorRT info/warning/errors
class Logger : public nvinfer1::ILogger
{
public:
    Logger() : Logger(Severity::kWARNING) {}

    Logger(Severity severity) : reportableSeverity(severity) {}

    void log(Severity severity, const char* msg) override
    {
        // suppress messages with severity enum value greater than the reportable
        if (severity > reportableSeverity)
            return;

        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            std::cerr << "INTERNAL_ERROR: ";
            break;
        case Severity::kERROR:
            std::cerr << "ERROR: ";
            break;
        case Severity::kWARNING:
            std::cerr << "WARNING: ";
            break;
        case Severity::kINFO:
            std::cerr << "INFO: ";
            break;
        default:
            std::cerr << "UNKNOWN: ";
            break;
        }
        std::cerr << msg << std::endl;
    }

    Severity reportableSeverity{ Severity::kWARNING };
};

#endif


================================================
FILE: rcnn/BatchedNms.cu
================================================
#include <cuda.h>
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/execution_policy.h>
#include <thrust/gather.h>
#include <cmath>
#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <cstdint>
#include <vector>
#include "BatchedNmsPlugin.h"
#include "./cuda_utils.h"
#include "macros.h"

#ifdef CUDA_11
#include <cub/device/device_radix_sort.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#else
#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
#include <thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh>
namespace cub = thrust::cuda_cub::cub;

#endif

namespace nvinfer1 {

// Greedy (soft-)NMS over detections that are already sorted by descending score.
//
//   nms_method     0 (or any unknown value): hard NMS, 1: linear soft-NMS, 2: gaussian soft-NMS
//   threshold      IoU above which a lower-ranked detection of the same class is suppressed/decayed
//   num_detections number of entries in `indices` / `scores`
//   indices        indices[k] is the position in `classes`/`boxes` of the k-th best detection
//   scores         sorted scores, updated in place (scores[k] belongs to indices[k])
//   classes, boxes unsorted per-detection class ids and XYXY boxes
//
// MUST be launched with exactly ONE thread block. Iteration m reads scores[m],
// which may have been zeroed/decayed by other threads in earlier iterations;
// __syncthreads() only orders threads of the same block, so spreading the
// detections over several blocks would race on `scores`. Each thread therefore
// walks the detections with a block-sized stride instead.
__global__ void batched_nms_kernel(
    const int nms_method, const float threshold, const int num_detections,
    const int *indices, float *scores, const float *classes, const float4 *boxes) {

    const float sigma = 0.5f;  // this is an empirical value (gaussian soft-NMS)

    // Go through detections by descending score
    for (int m = 0; m < num_detections; m++) {
        // Only detections ranked below m (i > m) can be affected by detection m.
        for (int i = threadIdx.x; i < num_detections; i += blockDim.x) {
            if (m < i && scores[m] > 0.0f) {
                int idx = indices[i];
                int max_idx = indices[m];
                int icls = classes[idx];
                int mcls = classes[max_idx];
                if (mcls == icls) {
                    float4 ibox = boxes[idx];
                    float4 mbox = boxes[max_idx];
                    float x1 = max(ibox.x, mbox.x);
                    float y1 = max(ibox.y, mbox.y);
                    float x2 = min(ibox.z, mbox.z);
                    float y2 = min(ibox.w, mbox.w);
                    float w = max(0.0f, x2 - x1);
                    float h = max(0.0f, y2 - y1);
                    float iarea = (ibox.z - ibox.x) * (ibox.w - ibox.y);
                    float marea = (mbox.z - mbox.x) * (mbox.w - mbox.y);
                    float inter = w * h;
                    float overlap = inter / (iarea + marea - inter);

                    if (overlap > threshold) {
                        switch (nms_method) {
                        case 1:  // soft-nms (linear)
                            scores[i] = (1 - overlap) * scores[i];
                            break;
                        case 2:  // soft-nms (gaussian)
                            scores[i] = expf(-(overlap * overlap) / sigma) * scores[i];
                            break;
                        case 0:  // original nms
                        default:
                            scores[i] = 0.0f;
                            break;
                        }
                    }
                }
            }
        }
        // Sync discarded detections before the next (lower-ranked) reference box is used
        __syncthreads();
    }
}

// Runs class-aware NMS for every image of the batch and writes the
// `detections_per_im` best surviving detections per image.
//
//   inputs[0..2]   scores (float), boxes (float4, XYXY), classes (float); `count` entries per image
//   outputs[0..2]  scores, boxes, classes; `detections_per_im` entries per image
//   workspace      device scratch memory; when it is null or `workspace_size` is 0 the function
//                  does no work and returns the number of scratch bytes it needs instead
//
// Returns 0 after enqueueing all work on `stream` (nothing is synchronised here).
int batchedNms(int nms_method, int batch_size,
    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
    size_t count, int detections_per_im, float nms_thresh,
    void *workspace, size_t workspace_size, cudaStream_t stream) {

    if (!workspace || !workspace_size) {
        // Return required scratch space size cub style.
        // Start from zero (as predictorDecode does) so a stale caller-supplied
        // size can never leak into the result.
        workspace_size = get_size_aligned<int>(count);    // indices
        workspace_size += get_size_aligned<int>(count);   // indices_sorted
        workspace_size += get_size_aligned<float>(count);  // scores_sorted

        size_t temp_size_sort = 0;
        cub::DeviceRadixSort::SortPairsDescending(
            static_cast<void*>(nullptr), temp_size_sort,
            static_cast<float*>(nullptr),
            static_cast<float*>(nullptr),
            static_cast<int*>(nullptr),
            static_cast<int*>(nullptr), count);
        workspace_size += temp_size_sort;

        return workspace_size;
    }

    auto on_stream = thrust::cuda::par.on(stream);

    // Carve the three arrays out of the workspace; what is left is cub's temp storage.
    auto indices = get_next_ptr<int>(count, workspace, workspace_size);
    auto indices_sorted = get_next_ptr<int>(count, workspace, workspace_size);
    auto scores_sorted = get_next_ptr<float>(count, workspace, workspace_size);

    // Identity permutation 0..count-1, uploaded at the start of every batch item.
    std::vector<int> indices_h(count);
    for (size_t i = 0; i < count; i++)
        indices_h[i] = static_cast<int>(i);

    for (int batch = 0; batch < batch_size; batch++) {
        auto in_scores = static_cast<const float *>(inputs[0]) + batch * count;
        auto in_boxes = static_cast<const float4 *>(inputs[1]) + batch * count;
        auto in_classes = static_cast<const float *>(inputs[2]) + batch * count;

        auto out_scores = static_cast<float *>(outputs[0]) + batch * detections_per_im;
        auto out_boxes = static_cast<float4 *>(outputs[1]) + batch * detections_per_im;
        auto out_classes = static_cast<float *>(outputs[2]) + batch * detections_per_im;

        // `indices` is overwritten by the re-sort below, so it has to be reset
        // for every image; otherwise images after the first one would be sorted
        // against the previous image's permutation.
        cudaMemcpyAsync(indices, indices_h.data(), count * sizeof(*indices), cudaMemcpyHostToDevice, stream);

        // Sort scores and corresponding indices
        int num_detections = count;
        cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
            in_scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores_sorted) * 8, stream);

        // Launch actual NMS kernel - 1 block, each thread striding over the detections.
        // A single block is required because the kernel synchronises with __syncthreads().
        // TODO: different devices have different max threads per block
        const int max_threads = 1024;
        batched_nms_kernel << <1, max_threads, 0, stream >> > (nms_method, nms_thresh, num_detections,
            indices_sorted, scores_sorted, in_classes, in_boxes);

        // Re-sort with updated scores
        // NOTE(review): the keys are sorted in place (scores_sorted is both input
        // and output). Confirm against the CUB version in use that overlapping
        // key ranges are supported by this overload.
        cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
            scores_sorted, scores_sorted, indices_sorted, indices,
            num_detections, 0, sizeof(*scores_sorted) * 8, stream);

        // Gather filtered scores, boxes, classes
        num_detections = min(detections_per_im, num_detections);
        cudaMemcpyAsync(out_scores, scores_sorted, num_detections * sizeof *scores_sorted,
        cudaMemcpyDeviceToDevice, stream);
        if (num_detections < detections_per_im) {
            // Fewer candidates than output slots: zero the scores of the unused slots.
            thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0);
        }
        thrust::gather(on_stream, indices, indices + num_detections, in_boxes, out_boxes);
        thrust::gather(on_stream, indices, indices + num_detections, in_classes, out_classes);
    }

    return 0;
}
}  // namespace nvinfer1


================================================
FILE: rcnn/BatchedNmsPlugin.h
================================================
#pragma once

#include <NvInfer.h>

#include <vector>
#include <cassert>
#include "macros.h"

using namespace nvinfer1;

#define PLUGIN_NAME "BatchedNms"
#define PLUGIN_VERSION "1"
#define PLUGIN_NAMESPACE ""

namespace nvinfer1 {
// Batched NMS entry point (implemented in BatchedNms.cu).
// - nms_method: 0 = original NMS, 1 = linear soft-NMS, 2 = gaussian soft-NMS.
// - inputs/outputs: device pointers to scores, boxes and classes; `count` input
//   detections and `detections_per_im` output detections per image.
// - When `workspace` is null or `workspace_size` is 0, no work is done and the
//   required scratch size in bytes is returned; otherwise the work is enqueued
//   on `stream` and 0 is returned.
int batchedNms(int nms_method, int batchSize,
    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
    size_t count, int detections_per_im, float nms_thresh,
    void *workspace, size_t workspace_size, cudaStream_t stream);

/*
    input1: scores{C, 1} C->topk
    input2: boxes{C, 4} C->topk format:XYXY
    input3: classes{C, 1} C->topk
    output1: scores{C, 1} C->detections_per_img
    output2: boxes{C, 4} C->detections_per_img format:XYXY
    output3: classes{C, 1} C->detections_per_img
    Description: implement batched nms
*/
// TensorRT plugin wrapping batchedNms(): takes scores/boxes/classes for
// `_count` candidates per image and emits the `_detections_per_im` best
// detections that survive NMS.
class BatchedNmsPlugin : public IPluginV2Ext {
    int _nms_method;
    float _nms_thresh;
    int _detections_per_im;

    size_t _count = 1;

    // Scratch size reported by batchedNms() for the current `_count`;
    // -1 means "not computed yet". Kept per instance (not in a function-local
    // static) so that plugins with different input sizes never share a value,
    // and reset whenever `_count` changes. Not serialized.
    mutable int _workspace_size = -1;

 protected:
    // Restores the fields in exactly the order serialize() wrote them.
    void deserialize(void const* data, size_t length) {
        const char* d = static_cast<const char*>(data);
        read(d, _nms_method);
        read(d, _nms_thresh);
        read(d, _detections_per_im);
        read(d, _count);
        // The buffer must contain exactly the bytes produced by serialize().
        assert(d == static_cast<const char*>(data) + length);
    }

    size_t getSerializationSize() const TRT_NOEXCEPT override {
        return sizeof(_nms_method) + sizeof(_nms_thresh) + sizeof(_detections_per_im)
            + sizeof(_count);
    }

    void serialize(void *buffer) const TRT_NOEXCEPT override {
        char* d = static_cast<char*>(buffer);
        write(d, _nms_method);
        write(d, _nms_thresh);
        write(d, _detections_per_im);
        write(d, _count);
    }

 public:
    // Build-time constructor; `_count` keeps its default until configurePlugin() runs.
    BatchedNmsPlugin(int nms_method, float nms_thresh, int detections_per_im)
        : _nms_method(nms_method), _nms_thresh(nms_thresh), _detections_per_im(detections_per_im) {
        assert(nms_method >= 0);
        assert(nms_thresh > 0);
        assert(detections_per_im > 0);
    }

    // Used by clone(): carries over the already configured input count.
    BatchedNmsPlugin(int nms_method, float nms_thresh, int detections_per_im, size_t count)
        : _nms_method(nms_method), _nms_thresh(nms_thresh), _detections_per_im(detections_per_im), _count(count) {
        assert(nms_method >= 0);
        assert(nms_thresh > 0);
        assert(detections_per_im > 0);
        assert(count > 0);
    }

    // Runtime constructor: rebuilds the plugin from a serialized engine.
    BatchedNmsPlugin(void const* data, size_t length) {
        this->deserialize(data, length);
    }

    const char *getPluginType() const TRT_NOEXCEPT override {
        return PLUGIN_NAME;
    }

    const char *getPluginVersion() const TRT_NOEXCEPT override {
        return PLUGIN_VERSION;
    }

    int getNbOutputs() const TRT_NOEXCEPT override {
        return 3;
    }

    // Outputs 0 and 2 (scores, classes) are {detections_per_im, 1}; output 1 (boxes) is {detections_per_im, 4}.
    Dims getOutputDimensions(int index,
        const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override {
        assert(nbInputDims == 3);
        assert(index < this->getNbOutputs());
        return Dims2(_detections_per_im, index == 1 ? 4 : 1);
    }

    bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override {
        return type == DataType::kFLOAT && format == PluginFormat::kLINEAR;
    }

    int initialize() TRT_NOEXCEPT override { return 0; }

    void terminate() TRT_NOEXCEPT override {}

    // Asks batchedNms() (called without a workspace) how much scratch memory it
    // needs and caches the answer for this instance. The same scratch buffer is
    // reused for every image of a batch, so the value does not depend on
    // `maxBatchSize`.
    size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override {
        if (_workspace_size < 0) {
            _workspace_size = batchedNms(_nms_method, maxBatchSize, nullptr, nullptr, _count,
                _detections_per_im, _nms_thresh,
                nullptr, 0, nullptr);
        }
        return _workspace_size;
    }

    int enqueue(int batchSize,
        const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
        void *workspace, cudaStream_t stream) TRT_NOEXCEPT override {
        return batchedNms(_nms_method, batchSize, inputs, outputs, _count,
            _detections_per_im, _nms_thresh,
            workspace, getWorkspaceSize(batchSize), stream);
    }

    void destroy() TRT_NOEXCEPT override {
        delete this;
    }

    const char *getPluginNamespace() const TRT_NOEXCEPT override {
        return PLUGIN_NAMESPACE;
    }

    // The namespace is fixed to PLUGIN_NAMESPACE; the argument is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {
    }

    // IPluginV2Ext Methods
    DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override {
        assert(index < 3);
        return DataType::kFLOAT;
    }

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
        int nbInputs) const TRT_NOEXCEPT override {
        return false;
    }

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; }

    // Validates the three inputs and records how many candidates each image provides.
    void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
        const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
        const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override {
        assert(*inputTypes == nvinfer1::DataType::kFLOAT &&
            floatFormat == nvinfer1::PluginFormat::kLINEAR);
        assert(nbInputs == 3);
        assert(inputDims[0].d[0] == inputDims[2].d[0]);
        assert(inputDims[1].d[0] == inputDims[2].d[0]);
        _count = inputDims[0].d[0];
        // The scratch size depends on `_count`; force it to be recomputed.
        _workspace_size = -1;
    }

    IPluginV2Ext *clone() const TRT_NOEXCEPT override {
        return new BatchedNmsPlugin(_nms_method, _nms_thresh, _detections_per_im, _count);
    }

 private:
    // Copies `val` into the buffer and advances the cursor past it.
    template<typename T> void write(char*& buffer, const T& val) const {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    // Reads a `T` from the buffer into `val` and advances the cursor past it.
    template<typename T> void read(const char*& buffer, T& val) {
        val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
    }
};

// Factory registered with TensorRT so that serialized engines containing a
// "BatchedNms" layer can be deserialized. Creation from a field collection is
// not supported: getFieldNames() and createPlugin() both return nullptr.
class BatchedNmsPluginCreator : public IPluginCreator {
 public:
    BatchedNmsPluginCreator() = default;

    // --- identification -----------------------------------------------------
    const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; }

    const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; }

    const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; }

    // The namespace is fixed; the requested one is ignored.
    void setPluginNamespace(const char * /*N*/) TRT_NOEXCEPT override {}

    // --- construction -------------------------------------------------------
    // Rebuilds a plugin instance from the bytes written by its serialize().
    IPluginV2 *deserializePlugin(const char * /*name*/, const void *serialData,
        size_t serialLength) TRT_NOEXCEPT override {
        return new BatchedNmsPlugin(serialData, serialLength);
    }

    const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; }

    IPluginV2 *createPlugin(const char * /*name*/,
        const PluginFieldCollection * /*fc*/) TRT_NOEXCEPT override {
        return nullptr;
    }
};

REGISTER_TENSORRT_PLUGIN(BatchedNmsPluginCreator);

}  // namespace nvinfer1

#undef PLUGIN_NAME
#undef PLUGIN_VERSION
#undef PLUGIN_NAMESPACE


================================================
FILE: rcnn/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.1)

project(rcnn)

add_definitions(-std=c++14)

# option(<name> "<help text>" <default>): the help text was missing, so "OFF"
# was being taken as the description instead of the default value.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 14)
# Debug stays the default, but -DCMAKE_BUILD_TYPE=... on the command line is honoured.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

# PredictorDecode.cu passes a __device__ lambda to thrust, which needs this flag.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--extended-lambda)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt: override with -DTENSORRT_ROOT=/path/to/TensorRT instead of editing this file
set(TENSORRT_ROOT "/home/jushi/TensorRT-8.2.1.6" CACHE PATH "TensorRT installation root (contains include/ and lib/)")
include_directories(${TENSORRT_ROOT}/include)
link_directories(${TENSORRT_ROOT}/lib)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# All custom TensorRT plugins are built into one shared library.
cuda_add_library(myplugins SHARED
  ${PROJECT_SOURCE_DIR}/BatchedNms.cu
  ${PROJECT_SOURCE_DIR}/PredictorDecode.cu
  ${PROJECT_SOURCE_DIR}/RoiAlign.cu
  ${PROJECT_SOURCE_DIR}/RpnDecode.cu
  ${PROJECT_SOURCE_DIR}/RpnNms.cu
  ${PROJECT_SOURCE_DIR}/MaskRcnnInference.cu)
target_link_libraries(myplugins nvinfer cudart)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(rcnn ${PROJECT_SOURCE_DIR}/rcnn.cpp)
target_link_libraries(rcnn nvinfer)
target_link_libraries(rcnn cudart)
target_link_libraries(rcnn myplugins)
target_link_libraries(rcnn ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: rcnn/MaskRcnnInference.cu
================================================
#include "MaskRcnnInferencePlugin.h"
#include "macros.h"

namespace nvinfer1 {

// Logistic sigmoid 1 / (1 + e^(-data)), evaluated in single precision.
__device__ float Logist(float data) {
    const float decay = expf(-data);
    return 1.0f / (1.0f + decay);
}

// Selects, for every detection, the mask channel of its predicted class and
// applies the logistic sigmoid to it.
//
// One thread per element of `masks`; `index` is decomposed as
// ((detection * num_classes + class) * output_size + h) * output_size + w.
// Only threads whose class equals the detection's class (indices[detection])
// write, so each out_masks element is written by at most one thread.
//
//   nthreads   total number of mask elements (detections * num_classes * size * size)
//   indices    class index of each detection, stored as float
//   masks      input masks, laid out as decomposed above
//   out_masks  output, one size x size mask per detection
__global__ void MaskRcnnInferenceKernel(
    const int nthreads,
    const int detections_per_im,
    const int output_size,
    const int num_classes,
    const float* indices,
    const float* masks,
    float* out_masks) {
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < nthreads) {
        int ind = index / output_size / output_size / num_classes;
        int ind_class = indices[ind];
        int cur_class = index / output_size / output_size % num_classes;
        if (ind_class == cur_class) {
            int w = index % output_size;
            int h = index / output_size % output_size;
            // Offset of this element inside `masks`, computed once and reused.
            int mask_offset = ind * num_classes * output_size * output_size +
              cur_class * output_size * output_size + h * output_size + w;
            out_masks[ind * output_size * output_size + h * output_size + w] = Logist(masks[mask_offset]);
        }
    }
}

int maskRcnnInference(int batchSize,
    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
    int detections_per_im, int output_size, int num_classes, cudaStream_t stream) {

    for (int batch = 0; batch < batchSize; batch++) {
        auto in_indices = static_cast<const float *>(inputs[0]) + batch * detections_per_im;
        auto in_masks = static_cast<const float *>(inputs[1]) + batch * detections_per_im *
          num_classes * output_size * output_size;

        auto out_masks = static_cast<float *>(outputs[0]) + batch * detections_per_im * output_size * output_size;

        int nthreads = detections_per_im * num_classes * output_size * output_size;
        const int max_threads = 1024;
        int blocksPerGrid = ceil(static_cast<float>(nthreads) / max_threads);
        // TODO: can implement this function with thrust?
        MaskRcnnInferenceKernel << <blocksPerGrid, max_threads, 0, stream >> > (
            nthreads,
            detections_per_im,
            output_size,
            num_classes,
            in_indices,
            in_masks,
            out_masks);
        cudaDeviceSynchronize();
    }

    return 0;
}

}  // namespace nvinfer1


================================================
FILE: rcnn/MaskRcnnInferencePlugin.h
================================================
#pragma once

#include <NvInfer.h>

#include <vector>
#include <cassert>
#include "macros.h"

using namespace nvinfer1;

#define PLUGIN_NAME "MaskRcnnInference"
#define PLUGIN_VERSION "1"
#define PLUGIN_NAMESPACE ""

namespace nvinfer1 {
// Mask selection entry point (implemented in MaskRcnnInference.cu).
// For each image of the batch, launches a kernel on `stream` that reads the
// class indices (inputs[0]) and per-class masks (inputs[1]) and writes the
// sigmoid of the selected masks to outputs[0]. Returns 0.
int maskRcnnInference(int batchSize,
    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
    int detections_per_im, int output_size, int num_classes, cudaStream_t stream);
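/*
    input1: indices{C, 1} C->detections_per_img (class index of each detection)
    input2: masks{C, NUM_CLASS, size, size} C->detections_per_img
    output1: masks{C, 1, size, size} C->detections_per_img
    Description: for each detection, select the mask of its class (index select)
                 and apply the sigmoid to it
*/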

// TensorRT plugin wrapping maskRcnnInference(): from per-class masks it keeps,
// for every detection, the single mask that belongs to the detection's class.
class MaskRcnnInferencePlugin : public IPluginV2Ext {
    int _detections_per_im;
    int _output_size;
    int _num_classes = 1;

 protected:
    // Restores the three int fields in the order serialize() writes them.
    void deserialize(void const* data, size_t length) {
        const char* cursor = static_cast<const char*>(data);
        read(cursor, _detections_per_im);
        read(cursor, _output_size);
        read(cursor, _num_classes);
    }

    size_t getSerializationSize() const TRT_NOEXCEPT override {
        return sizeof(_detections_per_im) + sizeof(_output_size) + sizeof(_num_classes);
    }

    void serialize(void *buffer) const TRT_NOEXCEPT override {
        char* cursor = static_cast<char*>(buffer);
        write(cursor, _detections_per_im);
        write(cursor, _output_size);
        write(cursor, _num_classes);
    }

 public:
    // Build-time constructor; `_num_classes` keeps its default until configurePlugin() runs.
    MaskRcnnInferencePlugin(int detections_per_im, int output_size)
        : _detections_per_im(detections_per_im), _output_size(output_size) {
        assert(detections_per_im > 0);
        assert(output_size > 0);
    }

    // Used by clone(): carries over the already configured class count.
    MaskRcnnInferencePlugin(int detections_per_im, int output_size, int num_classes)
        : _detections_per_im(detections_per_im), _output_size(output_size), _num_classes(num_classes) {
        assert(detections_per_im > 0);
        assert(output_size > 0);
        assert(num_classes > 0);
    }

    // Runtime constructor: rebuilds the plugin from a serialized engine.
    MaskRcnnInferencePlugin(void const* data, size_t length) {
        this->deserialize(data, length);
    }

    // ---- IPluginV2: identification ----
    const char *getPluginType() const TRT_NOEXCEPT override { return PLUGIN_NAME; }

    const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; }

    const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; }

    // The namespace is fixed to PLUGIN_NAMESPACE; the argument is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {}

    // ---- IPluginV2: shapes and formats ----
    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    // Single output of shape {detections_per_im, 1, output_size, output_size}.
    Dims getOutputDimensions(int index,
        const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override {
        assert(index < this->getNbOutputs());
        return Dims4(_detections_per_im, 1, _output_size, _output_size);
    }

    bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override {
        const bool is_float = (type == DataType::kFLOAT);
        return is_float && format == PluginFormat::kLINEAR;
    }

    // ---- IPluginV2: lifecycle and execution ----
    int initialize() TRT_NOEXCEPT override { return 0; }

    void terminate() TRT_NOEXCEPT override {}

    // No scratch memory is needed.
    size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

    int enqueue(int batchSize,
        const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
        void *workspace, cudaStream_t stream) TRT_NOEXCEPT override {
        return maskRcnnInference(batchSize, inputs, outputs,
            _detections_per_im, _output_size, _num_classes, stream);
    }

    void destroy() TRT_NOEXCEPT override { delete this; }

    // ---- IPluginV2Ext ----
    DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override {
        assert(index < 1);
        return DataType::kFLOAT;
    }

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
        int nbInputs) const TRT_NOEXCEPT override {
        return false;
    }

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; }

    // Checks the two inputs against the constructor arguments and takes the
    // number of classes from dimension 1 of the masks input.
    void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
        const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
        const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override {
        assert(*inputTypes == nvinfer1::DataType::kFLOAT &&
            floatFormat == nvinfer1::PluginFormat::kLINEAR);
        assert(nbInputs == 2);

        const Dims& indices_dims = inputDims[0];
        const Dims& masks_dims = inputDims[1];
        assert(indices_dims.d[0] == _detections_per_im);
        assert(masks_dims.d[0] == _detections_per_im);
        assert(masks_dims.d[2] == _output_size);
        assert(masks_dims.d[3] == _output_size);
        _num_classes = masks_dims.d[1];
    }

    IPluginV2Ext *clone() const TRT_NOEXCEPT override {
        return new MaskRcnnInferencePlugin(_detections_per_im, _output_size, _num_classes);
    }

 private:
    // Copies `val` into the buffer and advances the cursor past it.
    template<typename T> void write(char*& buffer, const T& val) const {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    // Reads a `T` from the buffer into `val` and advances the cursor past it.
    template<typename T> void read(const char*& buffer, T& val) {
        val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
    }
};

// Factory registered with TensorRT so that serialized engines containing a
// "MaskRcnnInference" layer can be deserialized. Creation from a field
// collection is not supported: getFieldNames() and createPlugin() both
// return nullptr.
class MaskRcnnInferencePluginCreator : public IPluginCreator {
 public:
    MaskRcnnInferencePluginCreator() = default;

    // --- identification -----------------------------------------------------
    const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; }

    const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; }

    const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; }

    // The namespace is fixed; the requested one is ignored.
    void setPluginNamespace(const char * /*N*/) TRT_NOEXCEPT override {}

    // --- construction -------------------------------------------------------
    // Rebuilds a plugin instance from the bytes written by its serialize().
    IPluginV2 *deserializePlugin(const char * /*name*/, const void *serialData,
        size_t serialLength) TRT_NOEXCEPT override {
        return new MaskRcnnInferencePlugin(serialData, serialLength);
    }

    const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; }

    IPluginV2 *createPlugin(const char * /*name*/,
        const PluginFieldCollection * /*fc*/) TRT_NOEXCEPT override {
        return nullptr;
    }
};

REGISTER_TENSORRT_PLUGIN(MaskRcnnInferencePluginCreator);

}  // namespace nvinfer1

#undef PLUGIN_NAME
#undef PLUGIN_VERSION
#undef PLUGIN_NAMESPACE


================================================
FILE: rcnn/PredictorDecode.cu
================================================
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/execution_policy.h>
#include <thrust/gather.h>

#include <algorithm>
#include <cstdint>

#include "PredictorDecodePlugin.h"
#include "./cuda_utils.h"
#include "macros.h"

#ifdef CUDA_11
#include <cub/device/device_radix_sort.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#else
#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
#include <thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh>
namespace cub = thrust::cuda_cub::cub;
#endif

namespace nvinfer1 {

// Decodes the box-head output of a Fast R-CNN style predictor.
//
// For every image, the `num_boxes` highest (box, class) scores are selected,
// the corresponding regression deltas are applied to their proposals and the
// resulting boxes are clipped to the image.
//
//   inputs[0]   scores, num_boxes * num_classes floats per image
//   inputs[1]   box deltas, one float4 per (box, class) pair
//   inputs[2]   proposals, one float4 (XYXY) per box
//   outputs[0..2] scores, boxes (float4, XYXY) and classes; num_boxes entries per image
//   bbox_reg_weights  the four weights the deltas (dx, dy, dw, dh) are divided by
//   workspace   device scratch memory; when it is null or `workspace_size` is 0 the
//               function does no work and returns the number of scratch bytes it needs
//
// Returns 0 after enqueueing all work on `stream`.
int predictorDecode(int batchSize, const void *const *inputs,
void *TRT_CONST_ENQUEUE*outputs, unsigned int num_boxes, unsigned int num_classes,
unsigned int image_height, unsigned int image_width,
const std::vector<float>& bbox_reg_weights, void *workspace,
size_t workspace_size, cudaStream_t stream) {
    int scores_size = num_boxes * num_classes;

    if (!workspace || !workspace_size) {
        // Return required scratch space size cub style
        workspace_size = get_size_aligned<float>(bbox_reg_weights.size());  // anchors
        workspace_size += get_size_aligned<int>(scores_size);      // indices
        workspace_size += get_size_aligned<int>(scores_size);      // indices_sorted
        workspace_size += get_size_aligned<float>(scores_size);    // scores_sorted

        size_t temp_size_sort = 0;
        cub::DeviceRadixSort::SortPairsDescending(
            static_cast<void*>(nullptr), temp_size_sort,
            static_cast<float*>(nullptr),
            static_cast<float*>(nullptr),
            static_cast<int*>(nullptr),
            static_cast<int*>(nullptr),
            scores_size);
        workspace_size += temp_size_sort;

        return workspace_size;
    }

    // Upload the regression weights so the device lambda below can read them.
    auto bbox_reg_weights_d = get_next_ptr<float>(bbox_reg_weights.size(), workspace, workspace_size);
    cudaMemcpyAsync(bbox_reg_weights_d, bbox_reg_weights.data(),
    bbox_reg_weights.size() * sizeof *bbox_reg_weights_d,
    cudaMemcpyHostToDevice, stream);

    auto on_stream = thrust::cuda::par.on(stream);

    // Identity permutation 0..scores_size-1; it is only read by the sort below,
    // so one upload serves every image of the batch.
    auto indices = get_next_ptr<int>(scores_size, workspace, workspace_size);
    std::vector<int> indices_h(scores_size, 0);
    for (int i = 0; i < scores_size; i++) indices_h[i] = i;
    cudaMemcpyAsync(indices, indices_h.data(), scores_size * sizeof(int), cudaMemcpyHostToDevice, stream);
    auto indices_sorted = get_next_ptr<int>(scores_size, workspace, workspace_size);
    auto scores_sorted = get_next_ptr<float>(scores_size, workspace, workspace_size);

    for (int batch = 0; batch < batchSize; batch++) {
        auto in_scores = static_cast<const float *>(inputs[0]) + batch * scores_size;
        auto in_boxes = static_cast<const float4 *>(inputs[1]) + batch * scores_size;
        auto in_proposals = static_cast<const float4 *>(inputs[2]) + batch * num_boxes;

        auto out_scores = static_cast<float *>(outputs[0]) + batch * num_boxes;
        auto out_boxes = static_cast<float4 *>(outputs[1]) + batch * num_boxes;
        auto out_classes = static_cast<float *>(outputs[2]) + batch * num_boxes;

        // Only keep top n scores
        cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
            in_scores, scores_sorted, indices, indices_sorted, scores_size, 0, sizeof(*scores_sorted) * 8, stream);

        // Gather boxes: decode the num_boxes best (box, class) pairs
        thrust::transform(on_stream, indices_sorted, indices_sorted + num_boxes,
            thrust::make_zip_iterator(thrust::make_tuple(out_scores, out_boxes, out_classes)),
            [=] __device__(int i) {
            // i indexes the flattened {num_boxes, num_classes} score matrix
            int cls = i % num_classes;
            int n = i / num_classes;
            float4 deltas = in_boxes[i];

            float4 boxes = in_proposals[n];

            float w = boxes.z - boxes.x;
            float h = boxes.w - boxes.y;
            float pred_ctr_x = (deltas.x / bbox_reg_weights_d[0]) * w + boxes.x + 0.5f * w;
            float pred_ctr_y = (deltas.y / bbox_reg_weights_d[1]) * h + boxes.y + 0.5f * h;
            float pred_w = exp(deltas.z / bbox_reg_weights_d[2]) * w;
            float pred_h = exp(deltas.w / bbox_reg_weights_d[3]) * h;

            // Clip to the image: x against the width, y against the height.
            boxes = float4{
              max(0.0f, pred_ctr_x - 0.5f * pred_w),
              max(0.0f, pred_ctr_y - 0.5f * pred_h),
              min(pred_ctr_x + 0.5f * pred_w, static_cast<float>(image_width)),
              min(pred_ctr_y + 0.5f * pred_h, static_cast<float>(image_height))
            };

            // filter empty boxes
            if (boxes.z - boxes.x <= 0.0f || boxes.w - boxes.y <= 0.0f) return thrust::make_tuple(0.0f, boxes, cls);
            else
                return thrust::make_tuple(in_scores[i], boxes, cls);
        });
    }

    return 0;
}

}  // namespace nvinfer1


================================================
FILE: rcnn/PredictorDecodePlugin.h
================================================
#pragma once

#include <NvInfer.h>

#include <cassert>
#include <vector>
#include "macros.h"

using namespace nvinfer1;

#define PLUGIN_NAME "PredictorDecode"
#define PLUGIN_VERSION "1"
#define PLUGIN_NAMESPACE ""

namespace nvinfer1 {

// Box-head decode entry point (implemented in PredictorDecode.cu).
// Selects the `num_boxes` best (box, class) scores per image, applies the box
// deltas (divided by `bbox_reg_weights`) to the proposals and clips the boxes
// to the image. When `workspace` is null or `workspace_size` is 0, no work is
// done and the required scratch size in bytes is returned; otherwise the work
// is enqueued on `stream` and 0 is returned.
int predictorDecode(int batchSize,
const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, unsigned int num_boxes,
unsigned int num_classes, unsigned int image_height,
unsigned int image_width, const std::vector<float>& bbox_reg_weights,
void *workspace, size_t workspace_size, cudaStream_t stream);

/*
    input1: scores{N,C,1,1} N->nums C->num of classes
    input2: boxes{N,C*4,1,1} N->nums C->num of classes
    input3: proposals{N,4} N->nums
    output1: scores{N, 1} N->nums
    output2: boxes{N, 4} N->nums format:XYXY
    output3: classes{N, 1} N->nums
    Description: implement fast rcnn decode
*/
class PredictorDecodePlugin : public IPluginV2Ext {
    // Every scalar member has a default initializer so that no constructor can
    // leave a field indeterminate. In particular the 4-argument constructor does
    // not receive a class count, yet clone() and serialize() read _num_classes.
    unsigned int _num_boxes = 0;     // number of proposals; dimension 0 of every input and output
    unsigned int _num_classes = 0;   // set from the scores input in configurePlugin()
    unsigned int _image_height = 0;  // forwarded to predictorDecode()
    unsigned int _image_width = 0;   // forwarded to predictorDecode()
    std::vector<float> _bbox_reg_weights;  // forwarded to predictorDecode()
    mutable int size = -1;  // cached workspace size; negative means "not queried yet"

 protected:
    // Restores the plugin state from a buffer written by serialize().
    // `length` is the number of bytes available at `data`; in debug builds the
    // number of consumed bytes is checked against it.
    void deserialize(void const* data, size_t length) {
        const char* d = static_cast<const char*>(data);
        read(d, _num_boxes);
        read(d, _num_classes);
        read(d, _image_height);
        read(d, _image_width);
        size_t bbox_reg_weights_size;
        read(d, bbox_reg_weights_size);
        while (bbox_reg_weights_size--) {
            float val;
            read(d, val);
            _bbox_reg_weights.push_back(val);
        }
        // The layout read here mirrors getSerializationSize()/serialize(), so the
        // whole buffer must have been consumed.
        assert(d == static_cast<const char*>(data) + length);
    }

    // Number of bytes serialize() writes: four unsigned ints, the element count
    // of the weight vector (size_t) and the weights themselves.
    size_t getSerializationSize() const TRT_NOEXCEPT override {
        return sizeof(_num_boxes) + sizeof(_num_classes) +
        sizeof(_image_height) + sizeof(_image_width) + sizeof(size_t) +
        sizeof(float)*_bbox_reg_weights.size();
    }

    // Writes the plugin state to `buffer` in the order deserialize() reads it.
    void serialize(void *buffer) const TRT_NOEXCEPT override {
        char* d = static_cast<char*>(buffer);
        write(d, _num_boxes);
        write(d, _num_classes);
        write(d, _image_height);
        write(d, _image_width);
        write(d, _bbox_reg_weights.size());
        for (auto &val : _bbox_reg_weights) {
            write(d, val);
        }
    }

 public:
    // Construction without a class count: _num_classes stays 0 until
    // configurePlugin() derives it from the scores input.
    PredictorDecodePlugin(unsigned int num_boxes, unsigned int image_height,
    unsigned int image_width, std::vector<float> const& bbox_reg_weights)
        : _num_boxes(num_boxes), _image_height(image_height),
        _image_width(image_width), _bbox_reg_weights(bbox_reg_weights) {}

    // Construction with every parameter known (used by clone()).
    PredictorDecodePlugin(unsigned int num_boxes, unsigned int num_classes,
    unsigned int image_height, unsigned int image_width,
    std::vector<float> const& bbox_reg_weights)
        : _num_boxes(num_boxes), _num_classes(num_classes),
        _image_height(image_height), _image_width(image_width),
        _bbox_reg_weights(bbox_reg_weights) {}

    // Construction from a serialized buffer (used by the plugin creator).
    PredictorDecodePlugin(void const* data, size_t length) {
        this->deserialize(data, length);
    }

    const char *getPluginType() const TRT_NOEXCEPT override {
        return PLUGIN_NAME;
    }

    const char *getPluginVersion() const TRT_NOEXCEPT override {
        return PLUGIN_VERSION;
    }

    // Three outputs: scores, boxes and classes.
    int getNbOutputs() const TRT_NOEXCEPT override {
        return 3;
    }

    // Output 1 (boxes) is {_num_boxes, 4}; outputs 0 and 2 are {_num_boxes, 1}.
    Dims getOutputDimensions(int index,
        const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override {
        assert(nbInputDims == 3);
        assert(index < this->getNbOutputs());
        return Dims2(_num_boxes, (index == 1 ? 4 : 1));
    }

    // Only linear-format float tensors are supported.
    bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override {
        return type == DataType::kFLOAT && format == PluginFormat::kLINEAR;
    }

    int initialize() TRT_NOEXCEPT override { return 0; }

    void terminate() TRT_NOEXCEPT override {}

    // Asks predictorDecode() for its scratch-space requirement by calling it
    // with null buffers, and caches the answer in `size` for later calls.
    size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override {
        if (size < 0) {
            size = predictorDecode(maxBatchSize, nullptr, nullptr,
            _num_boxes, _num_classes, _image_height, _image_width,
            _bbox_reg_weights, nullptr, 0, nullptr);
        }
        return size;
    }

    // Runs the decode on `stream` and returns predictorDecode()'s status.
    int enqueue(int batchSize,
        const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
        void *workspace, cudaStream_t stream) TRT_NOEXCEPT override {
        return predictorDecode(batchSize, inputs, outputs, _num_boxes,
        _num_classes, _image_height, _image_width, _bbox_reg_weights,
        workspace, getWorkspaceSize(batchSize), stream);
    }

    void destroy() TRT_NOEXCEPT override {
        delete this;
    }

    const char *getPluginNamespace() const TRT_NOEXCEPT override {
        return PLUGIN_NAMESPACE;
    }

    // The namespace is fixed by PLUGIN_NAMESPACE; the requested one is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {}

    // IPluginV2Ext Methods
    DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override {
        assert(index < this->getNbOutputs());
        return DataType::kFLOAT;
    }

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
        int nbInputs) const TRT_NOEXCEPT override {
        return false;
    }

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; }

    // Validates the three inputs (scores, boxes, proposals) against the
    // configured proposal count and records the class count from the scores
    // input. The checks are asserts and therefore only active in debug builds.
    void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
        const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
        const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override {
        assert(*inputTypes == nvinfer1::DataType::kFLOAT &&
            floatFormat == nvinfer1::PluginFormat::kLINEAR);
        assert(nbInputs == 3);
        assert(nbOutputs == 3);
        auto const& scores_dims = inputDims[0];
        auto const& boxes_dims = inputDims[1];
        auto const& proposals_dims = inputDims[2];
        assert(scores_dims.d[0] == _num_boxes);
        assert(scores_dims.d[0] == boxes_dims.d[0]);
        assert(scores_dims.d[0] == proposals_dims.d[0]);
        assert(scores_dims.d[1] * 4 == boxes_dims.d[1]);  // four box deltas per class
        assert(proposals_dims.d[1] == 4);
        _num_classes = scores_dims.d[1];
    }

    // Copies every configuration value; the cached workspace size is not copied
    // and is recomputed on demand by the clone.
    IPluginV2Ext *clone() const TRT_NOEXCEPT override {
        return new PredictorDecodePlugin(_num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights);
    }

 private:
    // Stores `val` at `buffer` and advances `buffer` past it.
    template<typename T> void write(char*& buffer, const T& val) const {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    // Loads `val` from `buffer` and advances `buffer` past it.
    template<typename T> void read(const char*& buffer, T& val) {
        val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
    }
};

// Creator for PredictorDecodePlugin. It can only rebuild plugins from
// serialized data; creation from a PluginFieldCollection is not offered.
class PredictorDecodePluginCreator : public IPluginCreator {
 public:
    PredictorDecodePluginCreator() {}

    // Identity strings of the plugin this creator produces.
    const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; }
    const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; }
    const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; }

    // The namespace is fixed by PLUGIN_NAMESPACE; the requested one is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {}

    // No plugin fields are advertised.
    const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override {
        return nullptr;
    }

    // Creation from a field collection is not implemented; always nullptr.
    IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override {
        return nullptr;
    }

    // Rebuilds a plugin from the bytes of a serialized engine.
    IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override {
        IPluginV2 *restored = new PredictorDecodePlugin(serialData, serialLength);
        return restored;
    }
};

REGISTER_TENSORRT_PLUGIN(PredictorDecodePluginCreator);

}  // namespace nvinfer1

#undef PLUGIN_NAME
#undef PLUGIN_VERSION
#undef PLUGIN_NAMESPACE


================================================
FILE: rcnn/README.md
================================================
# Rcnn

The Pytorch implementation is [facebookresearch/detectron2](https://github.com/facebookresearch/detectron2). Now, outputting instance segmentation results on the original image size and selecting different nms methods are available, which is more convenient for engineering applications.

## Models

- [x] Faster R-CNN(C4)

- [x] Mask R-CNN(C4)

## Test Environment

- GTX3090 / Ubuntu20.04 / cuda11 / cudnn8.0.4 / TensorRT8.1.1 / OpenCV4.5 from docker hakuyyf/tensorrtx:trt8_cuda11
- GTX2080Ti / Ubuntu16.04 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2
- GTX2080Ti / win10 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2 / VS2017 (need to replace function corresponding to the dirent.h and add "--extended-lambda" in CUDA C/C++ -> Command Line -> Other options)

TensorRT7.2 is recommended because the Resize layer in 7.0 with kLINEAR mode behaves a little differently from opencv. You can also implement the data preprocessing outside of tensorrt if you want to use TensorRT7.0 or an earlier version.
TensorRT 8.x is also supported.

**The result under fp32 matches pytorch to about 4 decimal places**!

## Contributors

<a href="https://github.com/HaiyangPeng"><img src="https://avatars.githubusercontent.com/u/46739135?v=4" width="40px;" alt=""/></a>
<a href="https://github.com/nengwp"><img src="https://avatars.githubusercontent.com/u/44516353?s=96&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/freedenS"><img src="https://avatars.githubusercontent.com/u/26213470?v=4" width="40px;" alt=""/></a>

## How to Run

1. generate .wts from pytorch with .pkl or .pth

```
// git clone -b v0.4 https://github.com/facebookresearch/detectron2.git
// go to facebookresearch/detectron2
python setup.py build develop // more install information see https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md
// download https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl
// download https://raw.githubusercontent.com/freedenS/TestImage/main/demo.jpg
// copy tensorrtx/rcnn/gen_wts.py and demo.jpg into facebookresearch/detectron2
// ensure cfg.MODEL.WEIGHTS in gen_wts.py is correct
// go to facebookresearch/detectron2
python gen_wts.py
// a file 'faster.wts' will be generated.
```

2. build tensorrtx/rcnn and run

```
// put faster.wts into tensorrtx/rcnn
// go to tensorrtx/rcnn
// update parameters in rcnn.cpp if your model is trained on custom dataset.The parameters are corresponding to config in detectron2.
mkdir build
cd build
cmake ..
make
sudo ./rcnn -s [.wts] [m] // serialize model to plan file, add m for maskrcnn
sudo ./rcnn -d [.engine] [image folder] [m] // deserialize and run inference, the images in [image folder] will be processed. add m for maskrcnn
// For example
sudo ./rcnn -s faster.wts faster.engine
sudo ./rcnn -d faster.engine ../samples
// sudo ./rcnn -s mask.wts mask.engine m
// sudo ./rcnn -d mask.engine ../samples m
```

3. check the images generated, as follows. _demo.jpg and so on.

## Backbone

#### R18, R34, R152

```
// python
1.download pretrained model
  R18: https://download.pytorch.org/models/resnet18-f37072fd.pth
  R34: https://download.pytorch.org/models/resnet34-b627a593.pth
  R50: https://download.pytorch.org/models/resnet50-0676ba61.pth
  R101: https://download.pytorch.org/models/resnet101-63fe2227.pth
  R152: https://download.pytorch.org/models/resnet152-394f9c45.pth
2.convert pth to pkl by facebookresearch/detectron2/tools/convert-torchvision-to-d2.py
3.set merge_from_file in gen_wts.py
  ./configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml for fasterRcnn
  ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml for maskRcnn
4.set cfg.MODEL.RESNETS.DEPTH = 18(34,50,101,152),
      cfg.MODEL.RESNETS.STRIDE_IN_1X1 = False,
      cfg.MODEL.RESNETS.RES2_OUT_CHANNELS = 64, // for R18, R34; 256 for others
      cfg.MODEL.PIXEL_MEAN = [123.675, 116.280, 103.530],
      cfg.MODEL.PIXEL_STD = [58.395, 57.120, 57.375],
      cfg.INPUT.FORMAT = "RGB"
  and then train your own model
5.generate your wts file.
// c++
6.set BACKBONE_RESNETTYPE = R18(R34,R50,R101,R152) in rcnn.cpp line 14
7.modify PIXEL_MEAN and PIXEL_STD in rcnn.cpp
8.set STRIDE_IN_1X1=false in backbone.hpp line 9
9.set other parameters if it's not same with default
10.build your engine, refer to how to run
11.convert your image to RGB before inference
```

#### R50, R101

```
1.download pretrained model
  R50: https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl for fasterRcnn
       https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/model_final_9243eb.pkl for maskRcnn
  R101: https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl for fasterRcnn
        https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x/138363239/model_final_a2914c.pkl for maskRcnn
2.set merge_from_file in gen_wts.py
  R50-faster: ./configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml
  R101-faster: ./configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml
  R50-mask: ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml
  R101-mask: ./configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml
3.set BACKBONE_RESNETTYPE = R50(R101) rcnn.cpp line 14
4.set STRIDE_IN_1X1=true in backbone.hpp
5.follow how to run
```

## NOTE

- if you meet the error below, just try to make again. The flag has been added in CMakeLists.txt

  ```
  error: __host__ or __device__ annotation on lambda requires --extended-lambda nvcc flag
  ```

- the image preprocess of sizing and padding was moved out from tensorrt, see DataPreprocess in rcnn.cpp, so the input data is {H, W, C}
- now, left-right and top-bottom padding preprocessings are optionally available in preprocessImg of common.hpp, and you can set arbitrary sizes of INPUT_H_ and INPUT_W_

- the predicted boxes correspond to the new image size including padding, so the final boxes need to have the padding size subtracted and then be multiplied by the ratio, see preprocessImg in common.hpp and calculateSize in rcnn.cpp

- tensorrt uses a fixed input size; if the size of your data is different from the engine's, you need to adjust your data and the result.

- if you want to use maskrcnn with cuda10.2, please be sure that you have upgraded cuda to the latest patch. see https://github.com/NVIDIA/TensorRT/issues/1151 for detail.

- you can build fasterRcnn with maskRcnn weights file.

- initialize _pre_nms_topk in RpnNmsPlugin, _count in BatchedNmsPlugin and _num_classes in MaskRcnnInferencePlugin inside the class to prevent assertion errors, because the configurePlugin function is called after clone() and before serialize(). One can also set them through the constructor.

## Quantization

1. quantizationType:fp32,fp16,int8. see BuildRcnnModel(rcnn.cpp line 345) for detail.

2. the usage of int8 is same with [tensorrtx/yolov5](../yolov5/README.md).

## Latency

average cost of doInference(in rcnn.cpp) from second time with batch=1 under the ubuntu environment above, input size: 640(w)*480(h)

|               | fp32  | fp16 | int8 |
| ------------- | ----- | ---- | ---- |
| Faster-R50C4  | 138ms | 36ms | 30ms |
| Faster-R101C4 | 146ms | 38ms | 32ms |
| Mask-R50C4    | 153ms | 44ms | 33ms |
| Mask-R101C4   | 168ms | 45ms | 35ms |

## Plugins

decode and nms plugins are modified from [retinanet-examples](https://github.com/NVIDIA/retinanet-examples/tree/master/csrc/plugins)

- RpnDecodePlugin: calculate the coordinates of the first n (top_n) proposals

```
parameters:
  top_n: num of proposals to select
  anchors: coordinates of all anchors
  stride: stride of current feature map
  image_height: image height after DataPreprocess for clipping the box beyond the boundary
  image_width: image width after DataPreprocess for clipping the box beyond the boundary

Inputs:
  scores{C,H,W} C is number of anchors, H and W are the size of feature map
  boxes{C,H,W} C is 4*number of anchors, H and W are the size of feature map
Outputs:
  scores{C,1} C is equal to top_n
  boxes{C,4} C is equal to top_n
```

- RpnNmsPlugin: apply nms to proposals

```
parameters:
  nms_thresh: thresh of nms
  post_nms_topk: number of proposals to select
  
Inputs:
  scores{C,1} C is equal to top_n
  boxes{C,4} C is equal to top_n
Outputs:
  boxes{C,4} C is equal to post_nms_topk
```

- RoiAlignPlugin: implementation of RoiAlign(align=True). see https://github.com/facebookresearch/detectron2/blob/f50ec07cf220982e2c4861c5a9a17c4864ab5bfd/detectron2/layers/roi_align.py#L7 for detail

```
parameters:
  pooler_resolution: output size
  spatial_scale: scale the input boxes by this number
  sampling_ratio: number of inputs samples to take for each output
  num_proposals: number of proposals
  
Inputs:
  boxes{N,4} N is number of boxes
  features{C,H,W} C is channels of feature map, H and W are sizes of feature map
Outputs:
  features{N,C,H,W} N is number of boxes, C is channels of feature map, H and W are equal to pooler_resolution
```

- PredictorDecodePlugin: calculate coordinates of predicted boxes by applying delta to proposals

```
parameters:
  num_boxes: num of proposals
  image_height: image height after DataPreprocess for clipping the box beyond the boundary
  image_width: image width after DataPreprocess for clipping the box beyond the boundary
  bbox_reg_weights: the weights for dx,dy,dw,dh. see https://github.com/facebookresearch/detectron2/blob/master/detectron2/config/defaults.py#L292 for detail

Inputs:
  scores{N,C,1,1} N is equal to num_boxes, C is the num of classes
  boxes{N,C*4,1,1} N is equal to num_boxes, C is the num of classes
  proposals{N,4} N is equal to num_boxes
Outputs:
  scores{N,1} N is equal to num_boxes
  boxes{N,4} N is equal to num_boxes
  classes{N,1} N is equal to num_boxes
```

- BatchedNmsPlugin: apply nms to predicted boxes with different classes. same with https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/nms.py#L19

```
parameters:
  nms_thresh: thresh of nms
  detections_per_im: number of detections to return per image

Inputs:
  scores{N,1} N is the number of the boxes
  boxes{N,4} N is the number of the boxes
  classes{N,1} N is the number of the boxes
Outputs:
  scores{N,1} N is equal to detections_per_im
  boxes{N,4} N is equal to detections_per_im
  classes{N,1} N is equal to detections_per_im
```

- MaskRcnnInferencePlugin:  extract the masks for the predicted classes and do sigmoid. same with https://github.com/facebookresearch/detectron2/blob/9c7f8a142216ebc52d3617c11f8fafd75b74e637/detectron2/modeling/roi_heads/mask_head.py#L114

```
parameters:
  detections_per_im: number of detections to return per image
  output_size: same with output size of RoiAlign

Inputs:
  indices{N,1} N is the number of the predicted boxes
  masks{N,C,H,W} N is the number of the predicted boxes
Outputs:
  selected_masks{N,1,H,W} N is the number of the predicted boxes, H and W is equal to output_size
```


================================================
FILE: rcnn/RoiAlign.cu
================================================
#include <cuda.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/execution_policy.h>
#include <thrust/gather.h>

#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <cstdint>
#include <vector>
#include <cmath>

#include "RoiAlignPlugin.h"
#include "./cuda_utils.h"
#include "macros.h"

#ifdef CUDA_11
#include <cub/device/device_radix_sort.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#else
#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
#include <thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh>
namespace cub = thrust::cuda_cub::cub;
#endif

namespace nvinfer1 {
// Bilinearly samples a single-channel, row-major `height` x `width` map at the
// continuous location (y, x).
//
// Locations more than one pixel before the first row/column, or beyond
// `height`/`width`, yield 0. Remaining negative coordinates are clamped to 0,
// and locations in the last row/column are snapped onto it, so the four taps
// always lie inside the map.
template <typename T>
__device__ T bilinear_interpolate(
    const T* bottom_data,
    const int height,
    const int width,
    T y,
    T x) {
    // Reject sample points that lie outside the map.
    const bool outside = (y < -1.0) || (y > height) || (x < -1.0) || (x > width);
    if (outside) {
        return 0;
    }

    // Clamp coordinates in [-1, 0] onto the first row / column.
    y = (y <= 0) ? static_cast<T>(0) : y;
    x = (x <= 0) ? static_cast<T>(0) : x;

    int row_lo = static_cast<int>(y);
    int col_lo = static_cast<int>(x);

    // On the last row both vertical taps collapse onto it and y is snapped to
    // the integer row, which makes the vertical fraction zero.
    const bool on_last_row = row_lo >= height - 1;
    if (on_last_row) {
        row_lo = height - 1;
        y = (T)row_lo;
    }
    const int row_hi = on_last_row ? row_lo : row_lo + 1;

    // Same treatment for the last column.
    const bool on_last_col = col_lo >= width - 1;
    if (on_last_col) {
        col_lo = width - 1;
        x = (T)col_lo;
    }
    const int col_hi = on_last_col ? col_lo : col_lo + 1;

    // Fractional offsets inside the cell and their complements.
    T frac_y = y - row_lo;
    T frac_x = x - col_lo;
    T rest_y = 1. - frac_y, rest_x = 1. - frac_x;

    // The four neighbouring samples: top-left, top-right, bottom-left, bottom-right.
    T tl = bottom_data[row_lo * width + col_lo];
    T tr = bottom_data[row_lo * width + col_hi];
    T bl = bottom_data[row_hi * width + col_lo];
    T br = bottom_data[row_hi * width + col_hi];
    T w_tl = rest_y * rest_x, w_tr = rest_y * frac_x, w_bl = frac_y * rest_x, w_br = frac_y * frac_x;

    T blended = w_tl * tl + w_tr * tr + w_bl * bl + w_br * br;  // mode Avg

    return blended;
}

// RoIAlign forward kernel: every pooled output element (n, c, ph, pw) is the
// average of bilinear samples taken inside its bin of ROI n on channel c.
//
//   nthreads        total number of output elements
//   bottom_data     feature map, indexed as [c][height][width]
//   spatial_scale   factor applied to the ROI coordinates
//   channels/height/width          shape of the feature map
//   pooled_height/pooled_width     shape of each pooled output
//   sampling_ratio  samples per bin edge; <= 0 selects an adaptive grid of
//                   ceil(roi_size / pooled_size) samples
//   bottom_rois     one float4 (x, y, z, w used as x1, y1, x2, y2) per ROI
//   top_data        output, indexed as [n][c][ph][pw]
//
// The loop is grid-stride, so any launch configuration covers all elements.
__global__ void RoIAlignForward(
    const int nthreads,
    const float* bottom_data,
    const float spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    const float4* bottom_rois,
    float* top_data) {
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) {
        // (n, c, ph, pw) is an element in the pooled output
        int pw = index % pooled_width;
        int ph = (index / pooled_width) % pooled_height;
        int c = (index / pooled_width / pooled_height) % channels;
        int n = index / pooled_width / pooled_height / channels;

        const float4* offset_bottom_rois = bottom_rois + n;

        // Do not use rounding; this implementation detail is critical.
        // The half-pixel shift is applied after scaling.
        float roi_offset = 0.5f;
        float roi_start_w = offset_bottom_rois->x * spatial_scale - roi_offset;
        float roi_start_h = offset_bottom_rois->y * spatial_scale - roi_offset;
        float roi_end_w = offset_bottom_rois->z * spatial_scale - roi_offset;
        float roi_end_h = offset_bottom_rois->w * spatial_scale - roi_offset;

        float roi_width = roi_end_w - roi_start_w;
        float roi_height = roi_end_h - roi_start_h;

        float bin_size_h = static_cast<float>(roi_height) / static_cast<float>(pooled_height);
        float bin_size_w = static_cast<float>(roi_width) / static_cast<float>(pooled_width);

        const float* offset_bottom_data =
            bottom_data + static_cast<int>(c * height * width);

        // We use roi_bin_grid to sample the grid and mimic integral
        int roi_bin_grid_h = (sampling_ratio > 0)
            ? sampling_ratio
            : ceil(roi_height / pooled_height);  // e.g., = 2
        int roi_bin_grid_w =
            (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

        // We do average (integral) pooling inside a bin.
        // With the adaptive grid a degenerate ROI (zero or negative extent)
        // produces an empty grid; clamp the divisor to 1 so that such bins
        // output 0 instead of the NaN that 0/0 would give.
        const int num_samples = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
        const float count = static_cast<float>(num_samples > 0 ? num_samples : 1);

        float output_val = 0.f;
        // e.g., iy = 0, 1
        for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            const float y = roi_start_h + ph * bin_size_h +
                static_cast<float>(iy + .5f) * bin_size_h /
                static_cast<float>(roi_bin_grid_h);  // e.g., 0.5, 1.5
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
                const float x = roi_start_w + pw * bin_size_w +
                    static_cast<float>(ix + .5f) * bin_size_w /
                    static_cast<float>(roi_bin_grid_w);

                float val = bilinear_interpolate(
                    offset_bottom_data, height, width, y, x);

                output_val += val;
            }
        }

        output_val /= count;

        top_data[index] = output_val;
    }
}

int roiAlign(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, int pooler_resolution, float spatial_scale,
    int sampling_ratio, int num_proposals, int out_channels, int feature_h, int feature_w, cudaStream_t stream) {
    for (int batch = 0; batch < batchSize; batch++) {
        auto in_boxes = static_cast<const float4 *>(inputs[0]) + batch * num_proposals;
        auto in_features = static_cast<const float *>(inputs[1]) + batch * out_channels * feature_h * feature_w;

        int nthreads = num_proposals * out_channels * pooler_resolution * pooler_resolution;
        auto out_features = static_cast<float *>(outputs[0]) + batch * nthreads;
        const int max_threads = 1024;

        int blocksPerGrid = ceil(static_cast<float>(nthreads) / max_threads);
        RoIAlignForward<< <blocksPerGrid, max_threads, 0, stream>> > (
            nthreads,
            in_features,
            spatial_scale,
            out_channels,
            feature_h,
            feature_w,
            pooler_resolution,
            pooler_resolution,
            sampling_ratio,
            in_boxes,
            out_features);
        cudaDeviceSynchronize();
    }

    return 0;
}
}  // namespace nvinfer1


================================================
FILE: rcnn/RoiAlignPlugin.h
================================================
#pragma once

#include <NvInfer.h>

#include <cassert>
#include <vector>
#include "macros.h"

// NOTE: this using-directive sits at header scope, so it also takes effect in
// every translation unit that includes this header.
using namespace nvinfer1;

// Identity strings returned by the plugin and its creator declared below.
// They are header-local helpers: all three are #undef'd at the end of this file.
#define PLUGIN_NAME "RoiAlign"
#define PLUGIN_VERSION "1"
#define PLUGIN_NAMESPACE ""

namespace nvinfer1 {
// Runs RoIAlign for every element of the batch on `stream`.
// inputs[0] holds the boxes and inputs[1] the feature map; the pooled
// features are written to outputs[0]. RoiAlignPlugin::enqueue() hands the
// returned status code back to TensorRT.
int roiAlign(
    int batchSize,
    const void *const *inputs,
    void *TRT_CONST_ENQUEUE*outputs,
    int pooler_resolution,
    float spatial_scale,
    int sampling_ratio,
    int num_proposals,
    int out_channels,
    int feature_h,
    int feature_w,
    cudaStream_t stream);

    /*
        input1: boxes{N,4} N->post_nms_topk
        input2: features{C,H,W} C->num of feature map channels
        output1: features{N, C, H, W} N:nums of proposals C:output out_channels H,W:roialign size
        Description: roialign
    */
class RoiAlignPlugin : public IPluginV2Ext {
    // Every member has a default initializer so that no constructor can leave a
    // field indeterminate. In particular the 5-argument constructor does not
    // receive the feature-map size, yet clone() and serialize() read
    // _feature_h / _feature_w.
    int _pooler_resolution = 0;   // height and width of each pooled output
    float _spatial_scale = 0.0f;  // forwarded to roiAlign()
    int _sampling_ratio = 0;      // forwarded to roiAlign()
    int _num_proposals = 0;       // dimension 0 of the boxes input and of the output
    int _out_channels = 0;        // dimension 0 of the features input
    int _feature_h = 0;           // set from the features input in configurePlugin()
    int _feature_w = 0;           // set from the features input in configurePlugin()

 protected:
    // Restores the plugin state from a buffer written by serialize().
    // `length` is the number of bytes available at `data`; in debug builds the
    // number of consumed bytes is checked against it.
    void deserialize(void const* data, size_t length) {
        const char* d = static_cast<const char*>(data);
        read(d, _pooler_resolution);
        read(d, _spatial_scale);
        read(d, _sampling_ratio);
        read(d, _num_proposals);
        read(d, _out_channels);
        read(d, _feature_h);
        read(d, _feature_w);
        // The layout read here mirrors getSerializationSize()/serialize(), so the
        // whole buffer must have been consumed.
        assert(d == static_cast<const char*>(data) + length);
    }

    // Number of bytes serialize() writes: one entry per member.
    size_t getSerializationSize() const TRT_NOEXCEPT override {
        return sizeof(_pooler_resolution) + sizeof(_spatial_scale) + sizeof(_sampling_ratio) +
            sizeof(_num_proposals) + sizeof(_out_channels) + sizeof(_feature_h) + sizeof(_feature_w);
    }

    // Writes the plugin state to `buffer` in the order deserialize() reads it.
    void serialize(void *buffer) const TRT_NOEXCEPT override {
        char* d = static_cast<char*>(buffer);
        write(d, _pooler_resolution);
        write(d, _spatial_scale);
        write(d, _sampling_ratio);
        write(d, _num_proposals);
        write(d, _out_channels);
        write(d, _feature_h);
        write(d, _feature_w);
    }

 public:
    // Construction without the feature-map size: _feature_h / _feature_w stay 0
    // until configurePlugin() derives them from the features input.
    RoiAlignPlugin(int pooler_resolution, float spatial_scale, int sampling_ratio, int num_proposals,
        int out_channels)
        : _pooler_resolution(pooler_resolution), _spatial_scale(spatial_scale), _sampling_ratio(sampling_ratio),
        _num_proposals(num_proposals), _out_channels(out_channels) {}

    // Construction with every parameter known (used by clone()).
    RoiAlignPlugin(int pooler_resolution, float spatial_scale, int sampling_ratio, int num_proposals,
        int out_channels, int feature_h, int feature_w)
        : _pooler_resolution(pooler_resolution), _spatial_scale(spatial_scale), _sampling_ratio(sampling_ratio),
        _num_proposals(num_proposals), _out_channels(out_channels), _feature_h(feature_h), _feature_w(feature_w) {}

    // Construction from a serialized buffer (used by the plugin creator).
    RoiAlignPlugin(void const* data, size_t length) {
        this->deserialize(data, length);
    }

    const char *getPluginType() const TRT_NOEXCEPT override {
        return PLUGIN_NAME;
    }

    const char *getPluginVersion() const TRT_NOEXCEPT override {
        return PLUGIN_VERSION;
    }

    // Single output: the pooled features.
    int getNbOutputs() const TRT_NOEXCEPT override {
        return 1;
    }

    // Output shape is {_num_proposals, _out_channels, _pooler_resolution, _pooler_resolution}.
    Dims getOutputDimensions(int index,
        const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override {
        assert(index < this->getNbOutputs());
        return Dims4(_num_proposals, _out_channels, _pooler_resolution, _pooler_resolution);
    }

    // Only linear-format float tensors are supported.
    bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override {
        return type == DataType::kFLOAT && format == PluginFormat::kLINEAR;
    }

    int initialize() TRT_NOEXCEPT override { return 0; }

    void terminate() TRT_NOEXCEPT override {}

    // roiAlign() takes no workspace argument, so none is requested.
    size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override {
        return 0;
    }

    // Runs roiAlign() on `stream` and returns its status.
    int enqueue(int batchSize,
        const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
        void *workspace, cudaStream_t stream) TRT_NOEXCEPT override {
        return roiAlign(batchSize, inputs, outputs, _pooler_resolution, _spatial_scale, _sampling_ratio,
            _num_proposals, _out_channels, _feature_h, _feature_w, stream);
    }

    void destroy() TRT_NOEXCEPT override {
        delete this;
    }

    const char *getPluginNamespace() const TRT_NOEXCEPT override {
        return PLUGIN_NAMESPACE;
    }

    // The namespace is fixed by PLUGIN_NAMESPACE; the requested one is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {
    }

    // IPluginV2Ext Methods
    DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override {
        assert(index < this->getNbOutputs());
        return DataType::kFLOAT;
    }

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
        int nbInputs) const TRT_NOEXCEPT override {
        return false;
    }

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; }

    // Validates the two inputs (boxes, features) against the configured
    // proposal and channel counts and records the feature-map height and width.
    // The checks are asserts and therefore only active in debug builds.
    void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
        const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
        const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override {
        assert(*inputTypes == nvinfer1::DataType::kFLOAT &&
            floatFormat == nvinfer1::PluginFormat::kLINEAR);
        assert(nbInputs == 2);
        assert(nbOutputs == 1);
        auto const& boxes_dims = inputDims[0];
        auto const& feature_dims = inputDims[1];
        assert(_num_proposals == boxes_dims.d[0]);
        assert(_out_channels == feature_dims.d[0]);
        _feature_h = feature_dims.d[1];
        _feature_w = feature_dims.d[2];
    }

    // Copies every configuration value, including the recorded feature-map size.
    IPluginV2Ext *clone() const TRT_NOEXCEPT override {
        return new RoiAlignPlugin(_pooler_resolution, _spatial_scale, _sampling_ratio, _num_proposals,
            _out_channels, _feature_h, _feature_w);
    }

 private:
    // Stores `val` at `buffer` and advances `buffer` past it.
    template<typename T> void write(char*& buffer, const T& val) const {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    // Loads `val` from `buffer` and advances `buffer` past it.
    template<typename T> void read(const char*& buffer, T& val) {
        val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
    }
};

// Creator for RoiAlignPlugin. It can only rebuild plugins from serialized
// data; creation from a PluginFieldCollection is not offered.
class RoiAlignPluginCreator : public IPluginCreator {
 public:
    RoiAlignPluginCreator() {}

    // Identity strings of the plugin this creator produces.
    const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; }
    const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; }
    const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; }

    // The namespace is fixed by PLUGIN_NAMESPACE; the requested one is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {}

    // No plugin fields are advertised.
    const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override {
        return nullptr;
    }

    // Creation from a field collection is not implemented; always nullptr.
    IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override {
        return nullptr;
    }

    // Rebuilds a plugin from the bytes of a serialized engine.
    IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override {
        IPluginV2 *restored = new RoiAlignPlugin(serialData, serialLength);
        return restored;
    }
};

REGISTER_TENSORRT_PLUGIN(RoiAlignPluginCreator);
}  // namespace nvinfer1

#undef PLUGIN_NAME
#undef PLUGIN_VERSION
#undef PLUGIN_NAMESPACE


================================================
FILE: rcnn/RpnDecode.cu
================================================
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/execution_policy.h>
#include <thrust/gather.h>
#include <thrust/tabulate.h>
#include <thrust/count.h>
#include <thrust/find.h>

#include <algorithm>
#include <cstdint>

#include "RpnDecodePlugin.h"
#include "./cuda_utils.h"
#include "macros.h"

#ifdef CUDA_11
#include <cub/device/device_radix_sort.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#else
#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
#include <thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh>
namespace cub = thrust::cuda_cub::cub;
#endif

namespace nvinfer1 {

// Decodes raw RPN outputs into clipped XYXY proposals, keeping at most `top_n` per image.
//
// Two modes, selected cub style by `workspace` / `workspace_size`:
//   * query   (workspace == nullptr or workspace_size == 0): nothing is launched and the
//             number of scratch bytes required is returned;
//   * execute (otherwise): all work is enqueued on `stream` and 0 is returned.
//
// inputs[0]  : scores, num_anchors * height * width floats per image
// inputs[1]  : box deltas, 4 * num_anchors * height * width floats per image
// outputs[0] : top_n scores per image; -FLT_MAX marks empty boxes and unused slots
// outputs[1] : top_n float4 boxes per image (x1, y1, x2, y2), clipped to the image size
// anchors    : 4 floats per anchor, added to the stride-scaled cell origin; when empty the
//              deltas are emitted unchanged
int rpnDecode(int batch_size,
    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
    size_t height, size_t width, size_t image_height, size_t image_width, float stride,
    const std::vector<float> &anchors, int top_n,
    void *workspace, size_t workspace_size, cudaStream_t stream) {

    size_t num_anchors = anchors.size() / 4;
    int scores_size = num_anchors * height * width;

    if (!workspace || !workspace_size) {
        // Return required scratch space size cub style
        workspace_size = get_size_aligned<float>(anchors.size());  // anchors
        workspace_size += get_size_aligned<int>(scores_size);      // indices
        workspace_size += get_size_aligned<int>(scores_size);      // indices_sorted
        workspace_size += get_size_aligned<float>(scores_size);    // scores_sorted

        // Sorting is only performed when there are more candidates than slots.
        size_t temp_size_sort = 0;
        if (scores_size > top_n) {
            cub::DeviceRadixSort::SortPairsDescending(
                static_cast<void*>(nullptr), temp_size_sort,
                static_cast<float*>(nullptr),
                static_cast<float*>(nullptr),
                static_cast<int*>(nullptr),
                static_cast<int*>(nullptr), scores_size);
            workspace_size += temp_size_sort;
        }

        return workspace_size;
    }

    auto anchors_d = get_next_ptr<float>(anchors.size(), workspace, workspace_size);
    cudaMemcpyAsync(anchors_d, anchors.data(), anchors.size() * sizeof *anchors_d, cudaMemcpyHostToDevice, stream);

    auto on_stream = thrust::cuda::par.on(stream);

    // Identity permutation 0..scores_size-1 used as the sort payload. It is generated directly
    // on the device, which avoids a host allocation and a pageable host-to-device copy on
    // every enqueue.
    auto indices = get_next_ptr<int>(scores_size, workspace, workspace_size);
    thrust::sequence(on_stream, indices, indices + scores_size);
    auto indices_sorted = get_next_ptr<int>(scores_size, workspace, workspace_size);
    auto scores_sorted = get_next_ptr<float>(scores_size, workspace, workspace_size);

    for (int batch = 0; batch < batch_size; batch++) {
        auto in_scores = static_cast<const float *>(inputs[0]) + batch * scores_size;
        auto in_boxes = static_cast<const float *>(inputs[1]) + batch * scores_size * 4;

        auto out_scores = static_cast<float *>(outputs[0]) + batch * top_n;
        auto out_boxes = static_cast<float4 *>(outputs[1]) + batch * top_n;

        // Only keep top n scores
        int num_detections = scores_size;
        auto indices_filtered = indices;
        if (num_detections > top_n) {
            // What is left of the workspace after the carve-outs above is the sort's temp storage.
            cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
                in_scores, scores_sorted, indices, indices_sorted, scores_size, 0, sizeof(*scores_sorted) * 8, stream);
            indices_filtered = indices_sorted;
            num_detections = top_n;
        }

        // Gather boxes
        bool has_anchors = !anchors.empty();
        thrust::transform(on_stream, indices_filtered, indices_filtered + num_detections,
            thrust::make_zip_iterator(thrust::make_tuple(out_scores, out_boxes)),
            [=] __device__(int i) {
            // Split the flat score index into (anchor, y, x).
            int x = i % width;
            int y = (i / width) % height;
            int a = (i / height / width) % num_anchors;
            // The four deltas of an anchor are stored in consecutive channels.
            float4 box = float4{
              in_boxes[((a * 4 + 0) * height + y) * width + x],
              in_boxes[((a * 4 + 1) * height + y) * width + x],
              in_boxes[((a * 4 + 2) * height + y) * width + x],
              in_boxes[((a * 4 + 3) * height + y) * width + x]
            };

            if (has_anchors) {
                // Add anchors offsets to deltas
                float x = (i % width) * stride;
                float y = ((i / width) % height) * stride;
                float *d = anchors_d + 4 * a;

                float x1 = x + d[0];
                float y1 = y + d[1];
                float x2 = x + d[2];
                float y2 = y + d[3];
                float w = x2 - x1;
                float h = y2 - y1;
                float pred_ctr_x = box.x * w + x1 + 0.5f * w;
                float pred_ctr_y = box.y * h + y1 + 0.5f * h;
                float pred_w = exp(box.z) * w;
                float pred_h = exp(box.w) * h;

                // Convert centre/size to corners and clip to the image
                box = float4{
                  max(0.0f, pred_ctr_x - 0.5f * pred_w),
                  max(0.0f, pred_ctr_y - 0.5f * pred_h),
                  min(pred_ctr_x + 0.5f * pred_w, static_cast<float>(image_width)),
                  min(pred_ctr_y + 0.5f * pred_h, static_cast<float>(image_height))
                };
            }
            // filter empty boxes
            if (box.z - box.x <= 0.0f || box.w - box.y <= 0.0f)
                return thrust::make_tuple(-FLT_MAX, box);
            else
                return thrust::make_tuple(in_scores[i], box);
        });

        // Mark unused score slots with the lowest possible score
        if (num_detections < top_n) {
            thrust::fill(on_stream, out_scores + num_detections,
                out_scores + top_n, -FLT_MAX);
        }
    }

    return 0;
}
}  // namespace nvinfer1


================================================
FILE: rcnn/RpnDecodePlugin.h
================================================
#pragma once

#include <NvInfer.h>

#include <cassert>
#include <vector>
#include "macros.h"

using namespace nvinfer1;

#define PLUGIN_NAME "RpnDecode"
#define PLUGIN_VERSION "1"
#define PLUGIN_NAMESPACE ""

namespace nvinfer1 {

// RPN box decoding, implemented in RpnDecode.cu.
// When `workspace` is null or `workspace_size` is 0 nothing is executed and the required
// workspace size in bytes is returned; otherwise the decode is enqueued on `stream` and 0 is
// returned.
int rpnDecode(int batchSize, const void *const *inputs,
void *TRT_CONST_ENQUEUE*outputs, size_t height, size_t width, size_t image_height,
size_t image_width, float stride, const std::vector<float> &anchors,
int top_n, void *workspace, size_t workspace_size, cudaStream_t stream);

/*
    input1: scores{C,H,W} C->anchors
    input2: boxes{C,H,W} C->4*anchors
    output1: scores{C, 1} C->topk
    output2: boxes{C, 4} C->topk format:XYXY
    Description: implement anchor decode
*/
// TensorRT plugin wrapping rpnDecode(): takes a score map and a box-delta map and emits the
// `_top_n` highest scoring boxes (XYXY) together with their scores.
class RpnDecodePlugin : public IPluginV2Ext {
    int _top_n;
    std::vector<float> _anchors;
    float _stride;

    // Feature-map extent. It is supplied by configurePlugin(), the 7-argument constructor or
    // deserialization. Start from 0 rather than an indeterminate value because clone() and
    // getWorkspaceSize() read these members unconditionally.
    size_t _height = 0;
    size_t _width = 0;
    size_t _image_height;  // for clipping the boxes by limiting y coordinates to the range [0, height]
    size_t _image_width;  // for clipping the boxes by limiting x coordinates to the range [0, width]
    mutable int size = -1;  // cached workspace size in bytes; negative means "not computed yet"

 protected:
    // Restores the members in exactly the order serialize() wrote them.
    void deserialize(void const* data, size_t length) {
        const char* d = static_cast<const char*>(data);
        read(d, _top_n);
        size_t anchors_size;
        read(d, anchors_size);
        while (anchors_size--) {
            float val;
            read(d, val);
            _anchors.push_back(val);
        }
        read(d, _stride);
        read(d, _height);
        read(d, _width);
        read(d, _image_height);
        read(d, _image_width);
    }

    // Number of bytes serialize() writes; must stay in sync with serialize()/deserialize().
    size_t getSerializationSize() const TRT_NOEXCEPT override {
        return sizeof(_top_n)
            + sizeof(size_t) + sizeof(float) * _anchors.size() + sizeof(_stride)
            + sizeof(_height) + sizeof(_width) + sizeof(_image_height) + sizeof(_image_width);
    }

    // Layout: top_n, anchor count, anchors..., stride, height, width, image_height, image_width.
    void serialize(void *buffer) const TRT_NOEXCEPT override {
        char* d = static_cast<char*>(buffer);
        write(d, _top_n);
        write(d, _anchors.size());
        for (auto &val : _anchors) {
            write(d, val);
        }
        write(d, _stride);
        write(d, _height);
        write(d, _width);
        write(d, _image_height);
        write(d, _image_width);
    }

 public:
    // Network-definition constructor: the feature-map extent is filled in by configurePlugin().
    RpnDecodePlugin(int top_n, std::vector<float> const& anchors, float stride, size_t image_height, size_t image_width)
        :  _top_n(top_n), _anchors(anchors), _stride(stride), _image_height(image_height), _image_width(image_width) {}

    // Fully specified constructor, used by clone().
    RpnDecodePlugin(int top_n, std::vector<float> const& anchors, float stride,
        size_t height, size_t width, size_t image_height, size_t image_width)
        : _top_n(top_n), _anchors(anchors), _stride(stride),
        _height(height), _width(width), _image_height(image_height), _image_width(image_width) {}

    // Rebuilds a plugin from the bytes produced by serialize().
    RpnDecodePlugin(void const* data, size_t length) {
        this->deserialize(data, length);
    }

    const char *getPluginType() const TRT_NOEXCEPT override {
        return PLUGIN_NAME;
    }

    const char *getPluginVersion() const TRT_NOEXCEPT override {
        return PLUGIN_VERSION;
    }

    int getNbOutputs() const TRT_NOEXCEPT override {
        return 2;
    }

    // Output 0 is {top_n, 1} (scores), output 1 is {top_n, 4} (boxes).
    Dims getOutputDimensions(int index,
        const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override {
        assert(nbInputDims == 2);
        assert(index < this->getNbOutputs());
        return Dims2(_top_n, (index == 1 ? 4 : 1));
    }

    bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override {
        return type == DataType::kFLOAT && format == PluginFormat::kLINEAR;
    }

    int initialize() TRT_NOEXCEPT override { return 0; }

    void terminate() TRT_NOEXCEPT override {}

    // Asks rpnDecode() for its scratch requirement (null workspace = query mode) and caches it.
    size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override {
        if (size < 0) {
            size = rpnDecode(maxBatchSize, nullptr, nullptr, _height, _width, _image_height, _image_width, _stride,
                _anchors, _top_n,
                nullptr, 0, nullptr);
        }
        return size;
    }

    int enqueue(int batchSize,
        const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
        void *workspace, cudaStream_t stream) TRT_NOEXCEPT override {
        return rpnDecode(batchSize, inputs, outputs, _height, _width, _image_height, _image_width, _stride,
            _anchors, _top_n, workspace, getWorkspaceSize(batchSize), stream);
    }

    void destroy() TRT_NOEXCEPT override {
        delete this;
    }

    const char *getPluginNamespace() const TRT_NOEXCEPT override {
        return PLUGIN_NAMESPACE;
    }

    // The namespace is the compile-time PLUGIN_NAMESPACE; the argument is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {
    }

    // IPluginV2Ext Methods
    DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override {
        assert(index < this->getNbOutputs());
        return DataType::kFLOAT;
    }

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
        int nbInputs) const TRT_NOEXCEPT override {
        return false;
    }

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; }

    // Captures the feature-map height/width from dimensions 1 and 2 of the score input.
    void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
        const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
        const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override {
        assert(*inputTypes == nvinfer1::DataType::kFLOAT &&
            floatFormat == nvinfer1::PluginFormat::kLINEAR);
        assert(nbInputs == 2);
        assert(nbOutputs == 2);
        auto const& scores_dims = inputDims[0];
        auto const& boxes_dims = inputDims[1];
        assert(scores_dims.d[1] == boxes_dims.d[1]);
        assert(scores_dims.d[2] == boxes_dims.d[2]);
        _height = scores_dims.d[1];
        _width = scores_dims.d[2];
        // The workspace requirement depends on the extent, so drop any previously cached value.
        size = -1;
    }

    IPluginV2Ext *clone() const TRT_NOEXCEPT override {
        return new RpnDecodePlugin(_top_n, _anchors, _stride, _height, _width, _image_height, _image_width);
    }

 private:
    // Stores `val` at the cursor and advances the cursor by sizeof(T).
    template<typename T> void write(char*& buffer, const T& val) const {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    // Loads a T from the cursor into `val` and advances the cursor by sizeof(T).
    template<typename T> void read(const char*& buffer, T& val) {
        val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
    }
};

// Creator for RpnDecodePlugin. It reports the plugin's name/version/namespace and rebuilds a
// plugin instance from its serialized bytes; creation from a field collection is not offered.
class RpnDecodePluginCreator : public IPluginCreator {
 public:
    RpnDecodePluginCreator() {}

    // --- identification ---
    const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; }
    const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; }
    const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; }

    // The namespace is the compile-time PLUGIN_NAMESPACE; the argument is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {}

    // --- construction ---
    // Rebuilds a plugin from the bytes produced by RpnDecodePlugin::serialize().
    IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override {
        IPluginV2 *plugin = new RpnDecodePlugin(serialData, serialLength);
        return plugin;
    }

    // No plugin fields are published and field-based creation always yields nullptr.
    const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; }
    IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override { return nullptr; }
};

REGISTER_TENSORRT_PLUGIN(RpnDecodePluginCreator);

}  // namespace nvinfer1

#undef PLUGIN_NAME
#undef PLUGIN_VERSION
#undef PLUGIN_NAMESPACE


================================================
FILE: rcnn/RpnNms.cu
================================================
#include <cuda.h>
#include <thrust/device_ptr.h>
#include <thrust/gather.h>

#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <cstdint>
#include <vector>
#include <cmath>

#include "RpnNmsPlugin.h"
#include "./cuda_utils.h"
#include "macros.h"

#ifdef CUDA_11
#include <cub/device/device_radix_sort.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#else
#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
#include <thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh>
namespace cub = thrust::cuda_cub::cub;
#endif

namespace nvinfer1 {

    // Greedy non-maximum suppression over detections already sorted by descending score.
    //
    // scores  : sorted scores, indexed by rank; suppressed entries are overwritten with -FLT_MAX
    // indices : indices[rank] is the position of that detection inside `boxes`
    // boxes   : XYXY boxes in their original (unsorted) order
    //
    // The outer loop is serialized with __syncthreads(), which only orders the threads of one
    // block. Every block therefore strides over *all* detections on its own instead of relying
    // on results produced by other blocks. Launch with a single block; additional blocks would
    // only repeat the same work.
    __global__ void rpn_nms_kernel(
        const float threshold, const int num_detections,
        const int *indices, float *scores, const float4 *boxes) {
        // Go through detections by descending score
        for (int m = 0; m < num_detections; m++) {
            // scores[m] is only written during iterations < m, each of which ended with a
            // barrier, so this test evaluates identically for every thread of the block.
            if (scores[m] > -FLT_MAX) {
                const float4 mbox = boxes[indices[m]];
                const float marea = (mbox.z - mbox.x) * (mbox.w - mbox.y);

                // Only lower ranked detections (i > m) can be suppressed by m.
                for (int i = m + 1 + threadIdx.x; i < num_detections; i += blockDim.x) {
                    const float4 ibox = boxes[indices[i]];
                    float x1 = max(ibox.x, mbox.x);
                    float y1 = max(ibox.y, mbox.y);
                    float x2 = min(ibox.z, mbox.z);
                    float y2 = min(ibox.w, mbox.w);
                    float w = max(0.0f, x2 - x1);
                    float h = max(0.0f, y2 - y1);
                    float iarea = (ibox.z - ibox.x) * (ibox.w - ibox.y);
                    float inter = w * h;
                    float overlap = inter / (iarea + marea - inter);
                    if (overlap > threshold) {
                        scores[i] = -FLT_MAX;
                    }
                }
            }

            // Sync discarded detections
            __syncthreads();
        }
    }

    // Runs NMS on `pre_nms_topk` scored boxes per image and writes the `post_nms_topk` best
    // surviving boxes.
    //
    // Two modes, selected cub style by `workspace` / `workspace_size`:
    //   * query   (workspace == nullptr or workspace_size == 0): nothing is launched and the
    //             number of scratch bytes required is returned;
    //   * execute (otherwise): all work is enqueued on `stream` and 0 is returned.
    //
    // inputs[0]  : pre_nms_topk scores per image
    // inputs[1]  : pre_nms_topk float4 boxes per image (XYXY)
    // outputs[0] : post_nms_topk float4 boxes per image; only the first
    //              min(post_nms_topk, pre_nms_topk) entries are written
    int rpnNms(int batch_size,
        const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
        size_t pre_nms_topk, int post_nms_topk, float nms_thresh,
        void *workspace, size_t workspace_size, cudaStream_t stream) {
        if (!workspace || !workspace_size) {
            // Return required scratch space size cub style
            workspace_size = get_size_aligned<int>(pre_nms_topk);    // indices
            workspace_size += get_size_aligned<int>(pre_nms_topk);   // indices_sorted
            workspace_size += get_size_aligned<float>(pre_nms_topk);  // scores
            workspace_size += get_size_aligned<float>(pre_nms_topk);  // scores_sorted

            size_t temp_size_sort = 0;
            cub::DeviceRadixSort::SortPairsDescending(
                static_cast<void*>(nullptr), temp_size_sort,
                static_cast<float*>(nullptr),
                static_cast<float*>(nullptr),
                static_cast<int*>(nullptr),
                static_cast<int*>(nullptr), pre_nms_topk);
            workspace_size += temp_size_sort;

            return workspace_size;
        }

        auto on_stream = thrust::cuda::par.on(stream);

        auto indices = get_next_ptr<int>(pre_nms_topk, workspace, workspace_size);
        auto indices_sorted = get_next_ptr<int>(pre_nms_topk, workspace, workspace_size);
        auto scores = get_next_ptr<float>(pre_nms_topk, workspace, workspace_size);
        auto scores_sorted = get_next_ptr<float>(pre_nms_topk, workspace, workspace_size);

        // Identity permutation 0..pre_nms_topk-1, used as the payload of the first sort.
        std::vector<int> indices_h(pre_nms_topk);
        for (size_t i = 0; i < pre_nms_topk; i++)
            indices_h[i] = static_cast<int>(i);

        for (int batch = 0; batch < batch_size; batch++) {
            auto in_scores = static_cast<const float *>(inputs[0]) + batch * pre_nms_topk;
            auto in_boxes = static_cast<const float4 *>(inputs[1]) + batch * pre_nms_topk;

            auto out_boxes = static_cast<float4 *>(outputs[0]) + batch * post_nms_topk;

            // The re-sort further down writes its result into `indices`, so the identity
            // permutation has to be restored for every image, not just the first one.
            cudaMemcpyAsync(indices, indices_h.data(), pre_nms_topk * sizeof * indices, cudaMemcpyHostToDevice, stream);

            int num_detections = pre_nms_topk;
            cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
                in_scores, scores_sorted, indices, indices_sorted, num_detections, 0,
                sizeof(*scores_sorted) * 8, stream);

            // Launch actual NMS kernel - exactly 1 block, each thread striding over the
            // detections. A single block is required because the kernel orders its suppression
            // rounds with __syncthreads(), which does not synchronize across blocks.
            // TODO: different devices have different max threads per block
            const int max_threads = 1024;
            rpn_nms_kernel << <1, max_threads, 0, stream >> > (nms_thresh, num_detections,
                indices_sorted, scores_sorted, in_boxes);

            // Re-sort with updated scores
            cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
                scores_sorted, scores, indices_sorted, indices, num_detections, 0, sizeof(*scores_sorted) * 8, stream);

            // Gather the boxes of the best remaining detections
            num_detections = min(post_nms_topk, num_detections);
            thrust::gather(on_stream, indices, indices + num_detections, in_boxes, out_boxes);
        }

        return 0;
    }
}  // namespace nvinfer1


================================================
FILE: rcnn/RpnNmsPlugin.h
================================================
#pragma once

#include <NvInfer.h>

#include <vector>
#include <cassert>
#include "macros.h"

using namespace nvinfer1;

#define PLUGIN_NAME "RpnNms"
#define PLUGIN_VERSION "1"
#define PLUGIN_NAMESPACE ""

namespace nvinfer1 {

// RPN non-maximum suppression, implemented in RpnNms.cu.
// When `workspace` is null or `workspace_size` is 0 nothing is executed and the required
// workspace size in bytes is returned; otherwise the NMS is enqueued on `stream` and 0 is
// returned.
int rpnNms(int batchSize,
    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
    size_t pre_nms_topk, int post_nms_topk, float nms_thresh,
    void *workspace, size_t workspace_size, cudaStream_t stream);

/*
    input1: scores{C, 1} C->pre_nms_topk
    input2: boxes{C, 4} C->pre_nms_topk format:XYXY
    output1: boxes{C, 4} C->post_nms_topk format:XYXY
    Description: implement rpn nms
*/
// TensorRT plugin wrapping rpnNms(): consumes `_pre_nms_topk` scored boxes and emits
// `_post_nms_topk` boxes in XYXY format.
class RpnNmsPlugin : public IPluginV2Ext {
    float _nms_thresh;
    int _post_nms_topk;

    size_t _pre_nms_topk = 1;  // replaced by configurePlugin() / deserialization
    mutable int size = -1;     // cached workspace size in bytes; negative = not computed yet

 protected:
    // Restores the members in the order serialize() wrote them.
    void deserialize(void const* data, size_t length) {
        const char* cursor = static_cast<const char*>(data);
        read(cursor, _nms_thresh);
        read(cursor, _post_nms_topk);
        read(cursor, _pre_nms_topk);
    }

    // Number of bytes written by serialize().
    size_t getSerializationSize() const TRT_NOEXCEPT override {
        size_t total = sizeof(_nms_thresh);
        total += sizeof(_post_nms_topk);
        total += sizeof(_pre_nms_topk);
        return total;
    }

    // Layout: nms_thresh, post_nms_topk, pre_nms_topk.
    void serialize(void *buffer) const TRT_NOEXCEPT override {
        char* cursor = static_cast<char*>(buffer);
        write(cursor, _nms_thresh);
        write(cursor, _post_nms_topk);
        write(cursor, _pre_nms_topk);
    }

 public:
    // Network-definition constructor; the candidate count is learned in configurePlugin().
    RpnNmsPlugin(float nms_thresh, int post_nms_topk)
        : _nms_thresh(nms_thresh), _post_nms_topk(post_nms_topk) {
        assert(nms_thresh > 0);
        assert(post_nms_topk > 0);
    }

    // Fully specified constructor, used by clone().
    RpnNmsPlugin(float nms_thresh, int post_nms_topk, size_t pre_nms_topk)
        : _nms_thresh(nms_thresh), _post_nms_topk(post_nms_topk), _pre_nms_topk(pre_nms_topk) {
        assert(nms_thresh > 0);
        assert(post_nms_topk > 0);
        assert(pre_nms_topk > 0);
    }

    // Rebuilds a plugin from the bytes produced by serialize().
    RpnNmsPlugin(void const* data, size_t length) {
        this->deserialize(data, length);
    }

    const char *getPluginType() const TRT_NOEXCEPT override { return PLUGIN_NAME; }

    const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; }

    const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; }

    // The namespace is the compile-time PLUGIN_NAMESPACE; the argument is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {}

    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    // The single output is {post_nms_topk, 4}.
    Dims getOutputDimensions(int index,
        const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override {
        assert(nbInputDims == 2);
        assert(index < this->getNbOutputs());
        return Dims2(_post_nms_topk, 4);
    }

    bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override {
        const bool is_float = (type == DataType::kFLOAT);
        return is_float && format == PluginFormat::kLINEAR;
    }

    int initialize() TRT_NOEXCEPT override { return 0; }

    void terminate() TRT_NOEXCEPT override {}

    // Asks rpnNms() for its scratch requirement (null workspace = query mode) and caches it.
    size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override {
        if (size >= 0) {
            return size;
        }
        size = rpnNms(maxBatchSize, nullptr, nullptr, _pre_nms_topk,
            _post_nms_topk, _nms_thresh,
            nullptr, 0, nullptr);
        return size;
    }

    int enqueue(int batchSize,
        const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
        void *workspace, cudaStream_t stream) TRT_NOEXCEPT override {
        const size_t workspace_bytes = getWorkspaceSize(batchSize);
        return rpnNms(batchSize, inputs, outputs, _pre_nms_topk,
            _post_nms_topk, _nms_thresh,
            workspace, workspace_bytes, stream);
    }

    void destroy() TRT_NOEXCEPT override {
        delete this;
    }

    // IPluginV2Ext Methods
    DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override {
        assert(index < 1);
        return DataType::kFLOAT;
    }

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
        int nbInputs) const TRT_NOEXCEPT override {
        return false;
    }

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; }

    // Takes the number of candidates from dimension 0 of the score input.
    void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
        const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
        const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override {
        assert(*inputTypes == nvinfer1::DataType::kFLOAT &&
            floatFormat == nvinfer1::PluginFormat::kLINEAR);
        assert(nbInputs == 2);
        const Dims& scores_dims = inputDims[0];
        const Dims& boxes_dims = inputDims[1];
        assert(scores_dims.d[0] == boxes_dims.d[0]);
        _pre_nms_topk = scores_dims.d[0];
    }

    IPluginV2Ext *clone() const TRT_NOEXCEPT override {
        return new RpnNmsPlugin(_nms_thresh, _post_nms_topk, _pre_nms_topk);
    }

 private:
    // Stores `val` at the cursor and advances the cursor by sizeof(T).
    template<typename T> void write(char*& buffer, const T& val) const {
        T* slot = reinterpret_cast<T*>(buffer);
        *slot = val;
        buffer += sizeof(T);
    }

    // Loads a T from the cursor into `val` and advances the cursor by sizeof(T).
    template<typename T> void read(const char*& buffer, T& val) {
        const T* slot = reinterpret_cast<const T*>(buffer);
        val = *slot;
        buffer += sizeof(T);
    }
};

// Creator for RpnNmsPlugin. It reports the plugin's name/version/namespace and rebuilds a
// plugin instance from its serialized bytes; creation from a field collection is not offered.
class RpnNmsPluginCreator : public IPluginCreator {
 public:
    RpnNmsPluginCreator() {}

    // --- identification ---
    const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; }
    const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; }
    const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; }

    // The namespace is the compile-time PLUGIN_NAMESPACE; the argument is ignored.
    void setPluginNamespace(const char *N) TRT_NOEXCEPT override {}

    // --- construction ---
    // Rebuilds a plugin from the bytes produced by RpnNmsPlugin::serialize().
    IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override {
        IPluginV2 *plugin = new RpnNmsPlugin(serialData, serialLength);
        return plugin;
    }

    // No plugin fields are published and field-based creation always yields nullptr.
    const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; }
    IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override { return nullptr; }
};

REGISTER_TENSORRT_PLUGIN(RpnNmsPluginCreator);

}  // namespace nvinfer1

#undef PLUGIN_NAME
#undef PLUGIN_VERSION
#undef PLUGIN_NAMESPACE


================================================
FILE: rcnn/backbone.hpp
================================================
#pragma once
#include <vector>
#include <map>
#include <string>
#include "common.hpp"

/* When stride > 1, selects whether the stride is applied in the first 1x1 convolution (true)
or in the bottleneck 3x3 convolution (false). Set to false when using a backbone from torchvision. */
#define STRIDE_IN_1X1 true

// Supported ResNet depths. R18/R34 are built from BasicBlock, the others from BottleneckBlock
// (see MakeStage).
enum RESNETTYPE {
    R18  = 0,
    R34  = 1,
    R50  = 2,
    R101 = 3,
    R152 = 4
};

// Number of residual blocks in each of the four stages, keyed by ResNet depth.
// BuildResNet indexes the vector with the stage counter (entry 0 goes to "backbone.res2").
const std::map<RESNETTYPE, std::vector<int>> num_blocks_per_stage = {
    {R18, {2, 2, 2, 2}},
    {R34, {3, 4, 6, 3}},
    {R50, {3, 4, 6, 3}},
    {R101, {3, 4, 23, 3}},
    {R152, {3, 8, 36, 3}}
};

// Builds the network stem: 7x7 stride-2 convolution -> ReLU -> 3x3 stride-2 max pooling.
//
// network      : network definition the layers are added to
// weightMap    : weights looked up as "<lname>.conv1.weight" / "<lname>.conv1.bias"
// lname        : name prefix of the stem inside weightMap
// input        : input tensor
// out_channels : number of output maps of the convolution
// group_num    : number of convolution groups
//
// Returns the pooling layer; its output 0 is the stem output.
ILayer* BasicStem(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
const std::string& lname, ITensor& input,
int out_channels,
int group_num = 1) {
    // conv1
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, out_channels, DimsHW{ 7, 7 },
    weightMap[lname + ".conv1.weight"],
    weightMap[lname + ".conv1.bias"]);
    assert(conv1);
    conv1->setStrideNd(DimsHW{ 2, 2 });
    conv1->setPaddingNd(DimsHW{ 3, 3 });
    conv1->setNbGroups(group_num);

    auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    assert(r1);

    auto max_pool2d = network->addPoolingNd(*r1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 });
    // Check the layer before configuring it, like every other layer in this file.
    assert(max_pool2d);
    max_pool2d->setStrideNd(DimsHW{ 2, 2 });
    max_pool2d->setPaddingNd(DimsHW{ 1, 1 });
    return max_pool2d;
}

// Builds one basic residual block: 3x3 conv (stride) -> ReLU -> 3x3 conv, summed with the
// shortcut and followed by ReLU.
//
// The shortcut is a 1x1 convolution (with the block stride) when in_channels differs from
// out_channels, otherwise the input tensor itself.
// Weights are looked up under "<lname>.conv1", "<lname>.conv2" and "<lname>.shortcut".
// Returns the output tensor of the final ReLU.
ITensor* BasicBlock(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& input,
int in_channels,
int out_channels,
int stride = 1) {
    const DimsHW kernel_3x3{ 3, 3 };
    const DimsHW pad_1{ 1, 1 };

    // first 3x3 convolution carries the stride
    IConvolutionLayer* first_conv = network->addConvolutionNd(input, out_channels, kernel_3x3,
        weightMap[lname + ".conv1.weight"],
        weightMap[lname + ".conv1.bias"]);
    assert(first_conv);
    first_conv->setStrideNd(DimsHW{ stride, stride });
    first_conv->setPaddingNd(pad_1);

    auto first_relu = network->addActivation(*first_conv->getOutput(0), ActivationType::kRELU);
    assert(first_relu);

    // second 3x3 convolution, always stride 1
    IConvolutionLayer* second_conv = network->addConvolutionNd(*first_relu->getOutput(0), out_channels, kernel_3x3,
        weightMap[lname + ".conv2.weight"],
        weightMap[lname + ".conv2.bias"]);
    assert(second_conv);
    second_conv->setStrideNd(DimsHW{ 1, 1 });
    second_conv->setPaddingNd(pad_1);

    // shortcut: identity unless the channel count changes
    ITensor* residual = &input;
    if (in_channels != out_channels) {
        auto projection = network->addConvolutionNd(input, out_channels, DimsHW{ 1, 1 },
            weightMap[lname + ".shortcut.weight"],
            weightMap[lname + ".shortcut.bias"]);
        assert(projection);
        projection->setStrideNd(DimsHW{ stride, stride });
        residual = projection->getOutput(0);
    }

    // residual sum followed by the output activation
    auto sum = network->addElementWise(*second_conv->getOutput(0), *residual, ElementWiseOperation::kSUM);
    assert(sum);

    auto out_relu = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(out_relu);

    return out_relu->getOutput(0);
}

// Builds one bottleneck residual block: 1x1 conv -> ReLU -> 3x3 conv -> ReLU -> 1x1 conv,
// summed with the shortcut and followed by ReLU.
//
// STRIDE_IN_1X1 decides whether `stride` is applied by the first 1x1 or by the 3x3 convolution.
// `dilation` is applied to the 3x3 convolution, whose padding is scaled accordingly.
// `group_num` is set on all four convolutions (conv1, conv2, conv3 and the shortcut).
// The shortcut is a strided 1x1 convolution when in_channels differs from out_channels,
// otherwise the input tensor itself.
// Weights are looked up under "<lname>.conv1", ".conv2", ".conv3" and ".shortcut".
// Returns the output tensor of the final ReLU.
ITensor* BottleneckBlock(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& input,
int in_channels,
int bottleneck_channels,
int out_channels,
int stride = 1,
int dilation = 1,
int group_num = 1) {
    // Decide which convolution carries the stride.
    int stride_1x1 = 1;
    int stride_3x3 = stride;
    if (STRIDE_IN_1X1) {
        stride_1x1 = stride;
        stride_3x3 = 1;
    }
    const DimsHW kernel_1x1{ 1, 1 };

    // 1x1 reduction
    IConvolutionLayer* reduce = network->addConvolutionNd(input, bottleneck_channels, kernel_1x1,
        weightMap[lname + ".conv1.weight"],
        weightMap[lname + ".conv1.bias"]);
    assert(reduce);
    reduce->setStrideNd(DimsHW{ stride_1x1, stride_1x1 });
    reduce->setNbGroups(group_num);

    auto reduce_relu = network->addActivation(*reduce->getOutput(0), ActivationType::kRELU);
    assert(reduce_relu);

    // 3x3 (optionally dilated) convolution
    IConvolutionLayer* spatial = network->addConvolutionNd(*reduce_relu->getOutput(0), bottleneck_channels,
        DimsHW{ 3, 3 },
        weightMap[lname + ".conv2.weight"],
        weightMap[lname + ".conv2.bias"]);
    assert(spatial);
    spatial->setStrideNd(DimsHW{ stride_3x3, stride_3x3 });
    spatial->setPaddingNd(DimsHW{ 1 * dilation, 1 * dilation });
    spatial->setDilationNd(DimsHW{ dilation, dilation });
    spatial->setNbGroups(group_num);

    auto spatial_relu = network->addActivation(*spatial->getOutput(0), ActivationType::kRELU);
    assert(spatial_relu);

    // 1x1 expansion
    IConvolutionLayer* expand = network->addConvolutionNd(*spatial_relu->getOutput(0), out_channels, kernel_1x1,
        weightMap[lname + ".conv3.weight"],
        weightMap[lname + ".conv3.bias"]);
    assert(expand);
    expand->setStrideNd(DimsHW{ 1, 1 });
    expand->setNbGroups(group_num);

    // shortcut: identity unless the channel count changes
    ITensor* residual = &input;
    if (in_channels != out_channels) {
        auto projection = network->addConvolutionNd(input, out_channels, kernel_1x1,
            weightMap[lname + ".shortcut.weight"],
            weightMap[lname + ".shortcut.bias"]);
        assert(projection);
        projection->setStrideNd(DimsHW{ stride, stride });
        projection->setNbGroups(group_num);
        residual = projection->getOutput(0);
    }

    // residual sum followed by the output activation
    auto sum = network->addElementWise(*expand->getOutput(0), *residual, ElementWiseOperation::kSUM);
    assert(sum);

    auto out_relu = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(out_relu);

    return out_relu->getOutput(0);
}

// Chains `stage` residual blocks named "<lname>.0", "<lname>.1", ... and returns the output
// tensor of the last one (or `input` itself when `stage` is 0).
//
// R18/R34 use BasicBlock, every other type uses BottleneckBlock. Only the first block gets
// `first_stride` and `in_channels`; the following blocks use stride 1 and take out_channels
// as their input width.
ITensor* MakeStage(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
const std::string& lname,
ITensor& input,
int stage,
RESNETTYPE resnet_type,
int in_channels,
int bottleneck_channels,
int out_channels,
int first_stride = 1,
int dilation = 1) {
    const bool use_basic_block = (resnet_type == R18 || resnet_type == R34);

    ITensor* current = &input;
    int block_in_channels = in_channels;
    for (int block = 0; block < stage; ++block) {
        const std::string block_name = lname + "." + std::to_string(block);
        const int block_stride = (block == 0) ? first_stride : 1;

        if (use_basic_block) {
            current = BasicBlock(network, weightMap, block_name, *current,
                block_in_channels, out_channels, block_stride);
        } else {
            current = BottleneckBlock(network, weightMap, block_name, *current,
                block_in_channels, bottleneck_channels, out_channels, block_stride, dilation);
        }

        // every block after the first consumes the stage's output width
        block_in_channels = out_channels;
    }
    return current;
}

// Builds the ResNet trunk: the stem ("backbone.stem") followed by three stages named
// "backbone.res2", "backbone.res3" and "backbone.res4". Returns the output of the last stage.
//
// Channel widths start at (stem_out_channels -> res2_out_channels) and both the bottleneck
// and the output width double after every stage. res2 uses stride 1, later stages stride 2.
//
// NOTE(review): the loop stops after three stages, so the `== 3` (res5) branches below never
// fire and res5_dilation only takes part in the assertions. Presumably res5 is built
// elsewhere - confirm against the callers.
ITensor* BuildResNet(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input,
RESNETTYPE resnet_type,
int stem_out_channels,
int bottleneck_channels,
int res2_out_channels,
int res5_dilation = 1) {
    assert(res5_dilation == 1 || res5_dilation == 2);  // "res5_dilation must be 1 or 2"
    const bool is_basic = (resnet_type == R18 || resnet_type == R34);
    if (is_basic) {
        assert(res2_out_channels == 64);  // "res2_out_channels must be 64 for R18/R34"
        assert(res5_dilation == 1);  // "res5_dilation must be 1 for R18/R34"
    }

    // stem
    ILayer* stem = BasicStem(network, weightMap, "backbone.stem", input, stem_out_channels);
    ITensor* features = stem->getOutput(0);

    // running channel widths, updated after each stage
    int stage_in = stem_out_channels;
    int stage_bottleneck = bottleneck_channels;
    int stage_out = res2_out_channels;

    // residual stages
    for (int stage_idx = 0; stage_idx < 3; ++stage_idx) {
        int dilation = 1;
        if (stage_idx == 3) {
            dilation = res5_dilation;
        }

        int first_stride = 2;
        if (stage_idx == 0 || (stage_idx == 3 && dilation == 2)) {
            first_stride = 1;
        }

        const std::string stage_name = "backbone.res" + std::to_string(stage_idx + 2);
        const int num_blocks = num_blocks_per_stage.at(resnet_type)[stage_idx];
        features = MakeStage(network, weightMap,
            stage_name, *features,
            num_blocks, resnet_type,
            stage_in, stage_bottleneck, stage_out,
            first_stride, dilation);

        stage_in = stage_out;
        stage_bottleneck *= 2;
        stage_out *= 2;
    }
    return features;
}


================================================
FILE: rcnn/calibrator.hpp
================================================
#pragma once

#include "NvInfer.h"
#include <string>
#include <vector>
#include <iostream>
#include <iterator>
#include <fstream>
#include <algorithm>
#include "./cuda_utils.h"
#include "common.hpp"
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
 public:
    // batchsize        - number of images delivered per getBatch() call
    // input_w/input_h  - size every image is letterboxed to before upload
    // img_dir          - directory that is listed for calibration images
    // calib_table_name - file the calibration cache is read from / written to
    // input_blob_name  - expected name of binding 0; the pointer is stored as-is
    //                    (not copied), so it must outlive this object
    // read_cache       - when false, readCalibrationCache() never loads the file
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h,
    const char* img_dir, const char* calib_table_name,
    const char* input_blob_name, bool read_cache = true);

    // Frees the device input buffer.
    virtual ~Int8EntropyCalibrator2();
    // Returns the batch size given to the constructor.
    int getBatchSize() const  TRT_NOEXCEPT override;
    // Uploads the next batch of images and stores the device pointer in bindings[0];
    // returns false when fewer than one full batch of files remains or an image fails.
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    // Returns the cached calibration table (or nullptr with length 0 when none is loaded).
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    // Writes the calibration table to calib_table_name_.
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

 private:
    int batchsize_;                       // images per batch
    int input_w_;                         // preprocessed image width
    int input_h_;                         // preprocessed image height
    int img_idx_;                         // index in img_files_ of the next image to load
    std::string img_dir_;                 // prefix prepended to each file name
    std::vector<std::string> img_files_;  // file names (without directory) found in img_dir_
    size_t input_count_;                  // number of floats in one batch: 3 * w * h * batchsize
    std::string calib_table_name_;        // calibration cache file path
    const char* input_blob_name_;         // non-owning pointer to the expected input binding name
    bool read_cache_;                     // whether an existing cache file may be reused
    void* device_input_;                  // device buffer holding one batch (input_count_ floats)
    std::vector<char> calib_cache_;       // bytes of the cache file, kept alive for the returned pointer
};

// Sets up the calibrator: records the configuration, allocates a device buffer
// large enough for one batch of 3-channel float images, and lists the files in
// img_dir that will be fed to getBatch().
//
// input_blob_name is stored as a raw pointer (not copied); it must stay valid
// for as long as getBatch() may be called.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize,
int input_w, int input_h, const char* img_dir,
const char* calib_table_name, const char* input_blob_name,
bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)
    , img_dir_(img_dir)
    // Multiply in size_t so large inputs cannot overflow int arithmetic.
    , input_count_(static_cast<size_t>(3) * input_w * input_h * batchsize)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache)
    // Never leave the pointer indeterminate, even if the allocation below fails.
    , device_input_(nullptr) {
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));

    // A directory that cannot be opened leaves img_files_ empty, which makes
    // getBatch() return false immediately; report it instead of failing silently.
    if (read_files_in_dir(img_dir, img_files_) != 0) {
        std::cerr << "Warning: cannot open calibration image directory: " << img_dir << std::endl;
    }
}

// Releases the device buffer that the constructor obtained with cudaMalloc.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the number of images per calibration batch, as passed to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next `batchsize_` images, letterboxes them to input_w_ x input_h_,
// converts the interleaved 8-bit pixels to float and uploads the batch to the
// device buffer, whose address is stored in bindings[0].
//
// Returns false when fewer than a full batch of files remains, or when an
// image cannot be read; returns true once the batch has been uploaded.
// names[0] is asserted to equal the input blob name; nbBindings is not used.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    if (img_idx_ + batchsize_ > static_cast<int>(img_files_.size())) {
        return false;
    }

    // img_files_ holds bare file names, so make sure the directory prefix ends
    // with a separator before concatenating.
    std::string dir_prefix = img_dir_;
    if (!dir_prefix.empty() && dir_prefix.back() != '/') {
        dir_prefix += '/';
    }

    const int img_elems = input_w_ * input_h_ * 3;
    std::vector<float> input_imgs_(input_count_, 0);
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        cv::Mat temp = cv::imread(dir_prefix + img_files_[i]);

        // cv::imread signals failure with an empty matrix. Check it BEFORE
        // preprocessing: resizing an empty image would raise inside this
        // noexcept function instead of reaching the error path.
        if (temp.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }

        int X_LEFT_PAD = 0;
        int X_RIGHT_PAD = 0;
        int Y_TOP_PAD = 0;
        int Y_BOTTOM_PAD = 0;
        temp = preprocessImg(temp, input_w_, input_h_, X_LEFT_PAD, X_RIGHT_PAD, Y_TOP_PAD, Y_BOTTOM_PAD);

        // Defensive: never dereference the pixel pointer of an empty result.
        if (temp.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }

        // Copy the interleaved bytes of this image into its slot of the batch.
        float* dst = input_imgs_.data() + (i - img_idx_) * img_elems;
        for (int ind = 0; ind < img_elems; ind++)
            dst[ind] = static_cast<float>(*(temp.data + ind));
    }
    img_idx_ += batchsize_;

    CUDA_CHECK(cudaMemcpy(device_input_, input_imgs_.data(), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Reloads the calibration table from calib_table_name_ into calib_cache_.
// The file is only read when read_cache_ is set and the stream opened cleanly.
// On return `length` holds the number of cached bytes; the result points at
// them, or is nullptr when nothing was loaded.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length)  TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream table_file(calib_table_name_, std::ios::binary);
    // Keep whitespace bytes: the table must be read verbatim.
    table_file >> std::noskipws;

    const bool may_load = read_cache_ && table_file.good();
    if (may_load) {
        std::istream_iterator<char> cursor(table_file);
        const std::istream_iterator<char> end_of_file;
        for (; cursor != end_of_file; ++cursor) {
            calib_cache_.push_back(*cursor);
        }
    }

    length = calib_cache_.size();
    if (length == 0) {
        return nullptr;
    }
    return calib_cache_.data();
}

// Stores `length` bytes starting at `cache` in the file calib_table_name_
// (opened in binary mode, replacing previous contents) after logging the size.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length)  TRT_NOEXCEPT {
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    const char* cache_bytes = static_cast<const char*>(cache);
    std::ofstream table_file(calib_table_name_, std::ios::binary);
    table_file.write(cache_bytes, length);
}


================================================
FILE: rcnn/common.hpp
================================================
#pragma once

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <assert.h>
#include <dirent.h>

#include <fstream>
#include <sstream>
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>

#include <opencv2/opencv.hpp>
#include "./logging.h"
#include "./cuda_utils.h"

// Logger instance shared by the code that includes this header. It is default
// constructed, so it reports warnings and more severe messages. Being `static`
// at namespace scope, every translation unit including this header gets its own copy.
static Logger gLogger;

// Makes TensorRT names (ITensor, Weights, DataType, ...) usable unqualified in
// this header and in every file that includes it.
using namespace nvinfer1;

// Parses a text ".wts" weight file into weightMap.
//
// File layout read here: a decimal blob count, then for each blob a name, a
// decimal element count and that many hexadecimal 32-bit words. Every blob is
// stored as DataType::kFLOAT with its raw bit patterns.
//
// The value buffers are allocated with malloc and handed to the map without an
// owner; releasing them is left to the caller.
void loadWeights(const std::string file, std::map<std::string, Weights>& weightMap) {
    std::cout << "Loading weights: " << file << std::endl;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. Size the buffer by the element type: sizeof(val) would be
        // the size of the pointer and allocate the wrong number of bytes.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        for (uint32_t x = 0, y = size; x < y; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }
}

// Appends the name of every entry of directory `p_dir_name` (except "." and
// "..") to `file_names`. Only the bare entry name is stored, without the
// directory prefix. Returns 0 on success and -1 when the directory cannot be
// opened, in which case `file_names` is left untouched.
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent *entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const char *entry_name = entry->d_name;
        const bool is_self_or_parent = (strcmp(entry_name, ".") == 0) || (strcmp(entry_name, "..") == 0);
        if (is_self_or_parent) {
            continue;
        }
        file_names.emplace_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}

// Letterboxes `img` into an input_w x input_h 3-channel 8-bit canvas: the image
// is resized with bilinear interpolation so that it fits while keeping its
// aspect ratio, then pasted onto a canvas filled with (128, 128, 128).
// The four *_PAD outputs receive the padding on each side; the resized image is
// placed at (X_LEFT_PAD, Y_TOP_PAD). `img` itself is not modified.
static inline cv::Mat preprocessImg(cv::Mat& img, int input_w, int input_h, int& X_LEFT_PAD, int& X_RIGHT_PAD, int& Y_TOP_PAD, int& Y_BOTTOM_PAD) {
    const float scale_w = input_w / (img.cols*1.0);
    const float scale_h = input_h / (img.rows*1.0);

    // Start from "fills the whole canvas" and shrink the dimension limited by
    // the smaller scale factor; the leftover is split as padding.
    int new_w = input_w;
    int new_h = input_h;
    float pad_x = 0.0;
    float pad_y = 0.0;
    if (scale_h > scale_w) {
        new_h = scale_w * img.rows;
        pad_y = (input_h - new_h) / 2.f;
    } else {
        new_w = scale_h * img.cols;
        pad_x = (input_w - new_w) / 2.f;
    }

    // The -0.1 / +0.1 nudges make an odd leftover split as floor/ceil instead
    // of rounding both halves the same way.
    X_LEFT_PAD = static_cast<int>(round(pad_x - 0.1));
    X_RIGHT_PAD = static_cast<int>(round(pad_x + 0.1));
    Y_TOP_PAD = static_cast<int>(round(pad_y - 0.1));
    Y_BOTTOM_PAD = static_cast<int>(round(pad_y + 0.1));

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect paste_area(X_LEFT_PAD, Y_TOP_PAD, resized.cols, resized.rows);
    resized.copyTo(canvas(paste_area));

    return canvas;
}

================================================
FILE: rcnn/cuda_utils.h
================================================
#pragma once

#include <cuda_runtime_api.h>
#include <stdexcept>
#include <cstdint>

// Byte granularity that buffer sizes are rounded up to.
#define CUDA_ALIGN 256

// Returns the size in bytes of `num_elem` objects of type T, rounded up to the
// next multiple of CUDA_ALIGN (a size that is already a multiple is unchanged).
template <typename T>
inline size_t get_size_aligned(size_t num_elem) {
    const size_t raw_bytes = num_elem * sizeof(T);
    const size_t remainder = raw_bytes % CUDA_ALIGN;
    if (remainder == 0) {
        return raw_bytes;
    }
    return raw_bytes + (CUDA_ALIGN - remainder);
}

// Carves an aligned-size chunk for `num_elem` objects of type T off the front
// of `workspace`.
//
// Returns the current workspace pointer as T*, then advances `workspace` past
// the chunk and reduces `workspace_size` by the chunk's aligned size.
// Throws std::runtime_error when the remaining workspace is too small; in that
// case neither reference argument is modified.
template <typename T>
inline T *get_next_ptr(size_t num_elem, void *&workspace, size_t &workspace_size) {
    const size_t chunk_bytes = get_size_aligned<T>(num_elem);
    if (workspace_size < chunk_bytes) {
        throw std::runtime_error("Workspace is too small!");
    }

    T *const chunk_start = reinterpret_cast<T *>(workspace);
    workspace = static_cast<char *>(workspace) + chunk_bytes;
    workspace_size -= chunk_bytes;
    return chunk_start;
}

// CUDA_CHECK(call): evaluates `call` once and, if the result is not
// cudaSuccess, prints the numeric error code with the file and line to
// std::cerr and then trips assert(0).
//
// Notes:
//  - The abort relies on assert, so when NDEBUG is defined the error is only
//    printed and execution continues.
//  - The macro expands to a brace block rather than a single statement.
//  - It uses std::cerr and assert, which this header does not include itself;
//    the including file must provide <iostream> and <cassert>.
//  - Defined only if no CUDA_CHECK exists yet, so an earlier definition wins.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
            assert(0);\
        }\
    }
#endif  // CUDA_CHECK


================================================
FILE: rcnn/gen_wts.py
================================================
from detectron2.layers import Conv2d
from torch import nn
import torch
import numpy as np
import struct
def fuse_conv_and_bn(conv):
    """Fold the normalization attached to ``conv`` into a plain ``nn.Conv2d``.

    See https://tehnokv.com/posts/fusing-batchnorm-and-conv/ for the derivation.

    Args:
        conv: a convolution module exposing ``in_channels``, ``out_channels``,
            ``kernel_size``, ``stride``, ``padding``, ``dilation``, ``groups``,
            ``weight`` and ``bias``, plus a ``norm`` attribute that provides
            ``weight``, ``bias``, ``running_mean``, ``running_var`` and ``eps``.

    Returns:
        A new ``nn.Conv2d`` (with bias, gradients disabled, on the same device
        as ``conv.weight``) whose output equals ``conv.norm(conv(x))`` when the
        normalization uses its running statistics.
    """
    bn = conv.norm
    # init -- every geometric setting of the source conv is carried over,
    # including dilation; otherwise the fused layer of a dilated convolution
    # would compute a different operation (and output size) than the original.
    fusedconv = nn.Conv2d(conv.in_channels,
                          conv.out_channels,
                          kernel_size=conv.kernel_size,
                          stride=conv.stride,
                          padding=conv.padding,
                          dilation=conv.dilation,
                          groups=conv.groups,
                          bias=True).requires_grad_(False).to(conv.weight.device)

    # prepare filters: scale each output channel by gamma / sqrt(var + eps)
    w_conv = conv.weight.clone().view(conv.out_channels, -1)
    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
    fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size()))

    # prepare spatial bias: scaled conv bias (zero if absent) plus the BN shift
    b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
    fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)

    return fusedconv

def fuse_bn(model):
    """Recursively replace every normalized ``Conv2d`` child by its fused form.

    Walks the direct children of ``model``. A child that is a ``Conv2d`` with a
    non-``None`` ``norm`` is swapped in place (via ``setattr``) for the result
    of ``fuse_conv_and_bn``; any other child is descended into.
    """
    for name, module in model.named_children():
        needs_fusing = isinstance(module, Conv2d) and module.norm is not None
        if not needs_fusing:
            fuse_bn(module)
            continue
        setattr(model, name, fuse_conv_and_bn(module))

def gen_wts(model, filename):
    """Dump ``model.state_dict()`` to the text file ``./<filename>.wts``.

    File layout: the first line is the number of entries; every following line
    is ``<key> <element count> `` followed, for each element, by a space and
    the big-endian IEEE-754 float32 encoding of the value as 8 hex digits.

    Args:
        model: module whose ``state_dict()`` is exported.
        filename: output name without directory or extension.
    """
    state = model.state_dict()
    # The context manager closes the file even if a tensor conversion raises.
    with open('./' + filename + '.wts', 'w') as f:
        f.write('{}\n'.format(len(state)))
        for k, v in state.items():
            vr = v.reshape(-1).cpu().numpy()
            f.write('{} {} '.format(k, len(vr)))
            for vv in vr:
                f.write(' ')
                f.write(struct.pack('>f', float(vv)).hex())
            f.write('\n')

# construct model
# NOTE: these statements run at import time; the module has no __main__ guard.
from detectron2.config import get_cfg
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
cfg = get_cfg()
# Model definition and checkpoint are read from paths relative to the working directory.
cfg.merge_from_file('./configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml')
cfg.MODEL.WEIGHTS = './model_final_721ade.pkl'
# Build and load on the CPU.
cfg.MODEL.DEVICE = 'cpu'
model = build_model(cfg)
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
# Switch to eval mode before folding the normalization layers into the convolutions.
model.eval()
fuse_bn(model)
# Writes ./faster.wts
gen_wts(model, 'faster')

# test data
# from detectron2.data.detection_utils import read_image
# from detectron2.data import transforms as T
# import cv2
# original_image = cv2.imread('./demo.jpg')
# original_image = original_image.astype('float32')

# transform_gen = T.ResizeShortestEdge(
#             [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
#         )
# height, width = original_image.shape[:2]

# image = transform_gen.get_transform(original_image).apply_image(original_image)
# image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))

# # model test
# inputs = {"image": image, "height": height, "width": width}

# with torch.no_grad():
#     predictions = model([inputs])[0]
# print (predictions)


================================================
FILE: rcnn/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include "macros.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//! String buffer that collects formatted text and, when synchronized, forwards
//! it to a target stream preceded by a fixed prefix. A timestamp is printed
//! first (to std::cout). Nothing is emitted while logging is disabled.
class LogStreamConsumerBuffer : public std::stringbuf {
 public:
    //! \param stream    stream that receives the prefixed messages (held by reference)
    //! \param prefix    text written in front of every message
    //! \param shouldLog whether messages are emitted at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog) {}

    //! Creates a buffer with the same target stream, prefix and logging flag as
    //! \p other. All three must be carried over: leaving mShouldLog
    //! uninitialized would make every later putOutput() read an indeterminate
    //! value. Text still pending in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync() {
        putOutput();
        return 0;
    }

    //! Emits the buffered text (if logging is enabled) and empties the buffer.
    //! When logging is disabled the buffered text is left in place.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp; note that it always goes to std::cout, even
            // when the message itself is written to another stream
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables emission of subsequent output.
    void setShouldLog(bool shouldLog) {
        mShouldLog = shouldLog;
    }

 private:
    std::ostream& mOutput;  //!< destination of the prefixed messages
    std::string mPrefix;    //!< text inserted before every message
    bool mShouldLog;        //!< false suppresses all output
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
 public:
    // Forwards all three arguments to the LogStreamConsumerBuffer member.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog) {}

 protected:
    // Owned here so that a derived class can hand its address to a std::ostream
    // base that is constructed after this base class.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
 public:
    //! \brief Builds a stream for messages of level \p severity.
    //!  Output is produced only when \p severity compares less than or equal to
    //!  \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity),
                                isReportable(severity, reportableSeverity))
        , std::ostream(&mBuffer)  // attach the already-constructed buffer to the stream
        , mShouldLog(isReportable(severity, reportableSeverity))
        , mSeverity(severity) {}

    //! \brief Creates a new consumer with the severity and logging flag of \p other.
    //!  A fresh buffer is constructed; pending text of \p other is not taken over.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer)  // attach the already-constructed buffer to the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this stream emits output for the new threshold.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = isReportable(mSeverity, reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

 private:
    //! True when a message of \p severity passes the \p reportableSeverity threshold.
    static bool isReportable(Severity severity, Severity reportableSeverity) {
        return severity <= reportableSeverity;
    }

    //! kINFO and everything comparing above it goes to std::cout, the rest to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Short tag placed in front of each message; unknown values trip an assertion.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;     //!< cached result of the severity comparison
    Severity mSeverity;  //!< level of the messages written through this stream
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
 public:
    //! \brief Creates a logger with the given reporting threshold (kWARNING by default).
    explicit Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief State of a test as reported by the reportTest*() helpers
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Returns this object viewed through the nvinfer1::ILogger interface
    //!
    nvinfer1::ILogger& getTRTLogger() {
        return *this;
    }

    //!
    //! \brief nvinfer1::ILogger::log() implementation
    //!
    //! Writes "[TRT] " followed by \p msg and a newline through a
    //! LogStreamConsumer, which applies the severity filter and prefix.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer consumer(mReportableSeverity, severity);
        consumer << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Changes the reporting threshold used for subsequent log() calls
    //!
    void setReportableSeverity(Severity severity) {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Handle describing one test: its name, command line and started flag
    //!
    //! Instances can only be created by Logger (see defineTest()) and are
    //! passed to reportTestStart() / reportTestEnd().
    //!
    class TestAtom {
     public:
        TestAtom(TestAtom&&) = default;

     private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline) {}

        bool mStarted;         //!< set by reportTestStart()
        std::string mName;     //!< test name printed in result lines
        std::string mCmdline;  //!< command line printed in result lines
    };

    //!
    //! \brief Creates a not-yet-started TestAtom
    //!
    //! \param[in] name The name of the test
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Overload that builds the command line by joining argv with single spaces
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        const std::string joined = genCmdlineString(argc, argv);
        return defineTest(name, joined);
    }

    //!
    //! \brief Prints the RUNNING line for the test and marks it as started
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Prints the final result line for a started test
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result One of TestResult::kPASSED, kFAILED or kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports PASSED and returns EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports FAILED and returns EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports WAIVED and returns EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() or reportFail() depending on \p pass
    static int reportTest(const TestAtom& testAtom, bool pass) {
        if (pass) {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current reporting threshold
    Severity getReportableSeverity() const {
        return mReportableSeverity;
    }

 private:
    //!
    //! \brief Tag string for a severity; unknown values trip an assertion and yield ""
    //!
    static const char* severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    //!
    //! \brief Upper-case word for a test result; unknown values trip an assertion and yield ""
    //!
    static const char* testResultString(TestResult result) {
        const char* word = "";
        switch (result) {
        case TestResult::kRUNNING: word = "RUNNING"; break;
        case TestResult::kPASSED: word = "PASSED"; break;
        case TestResult::kFAILED: word = "FAILED"; break;
        case TestResult::kWAIVED: word = "WAIVED"; break;
        default: assert(0); break;
        }
        return word;
    }

    //!
    //! \brief std::cout for kINFO and everything comparing above it, std::cerr otherwise
    //!
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief Prints "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
            << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief Joins argv[0..argc) into one string separated by single spaces
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream joined;
        const char* separator = "";
        for (int idx = 0; idx < argc; ++idx) {
            joined << separator << argv[idx];
            separator = " ";
        }
        return joined.str();
    }

    Severity mReportableSeverity;  //!< threshold handed to every LogStreamConsumer
};

namespace {

//!
//! \brief Returns a stream that logs at severity kVERBOSE, filtered by the
//!        logger's current reporting threshold.
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Returns a stream that logs at severity kINFO, filtered by the
//!        logger's current reporting threshold.
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Returns a stream that logs at severity kWARNING, filtered by the
//!        logger's current reporting threshold.
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Returns a stream that logs at severity kERROR, filtered by the
//!        logger's current reporting threshold.
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Returns a stream that logs at severity kINTERNAL_ERROR ("fatal"),
//!        filtered by the logger's current reporting threshold.
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: rcnn/macros.h
================================================
#pragma once

#include <NvInfer.h>
#include <cuda.h>

// CUDA_11 is defined when CUDA_VERSION (from <cuda.h>) is at least 11000.
#if CUDA_VERSION >=11000
#define CUDA_11
#endif

// Compatibility qualifiers for overriding TensorRT virtual functions:
// with TensorRT major version 8 or newer they expand to `noexcept` / `const`,
// with older versions they expand to nothing.
// NOTE(review): TRT_CONST_ENQUEUE is presumably meant for plugin enqueue()
// overrides; it is not used in the code visible here — confirm at its call sites.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: rcnn/rcnn.cpp
================================================
#include <iostream>
#include <opencv2/opencv.hpp>
#include "backbone.hpp"
#include "RpnDecodePlugin.h"
#include "RpnNmsPlugin.h"
#include "RoiAlignPlugin.h"
#include "PredictorDecodePlugin.h"
#include "BatchedNmsPlugin.h"
#include "MaskRcnnInferencePlugin.h"
#include "calibrator.hpp"

// Build-time configuration.
// NOTE(review): DEVICE is presumably the CUDA device index and BATCH_SIZE the
// engine batch size; their uses are outside the code visible here — confirm.
#define DEVICE 0
#define BATCH_SIZE 1
// ResNet variant used for the backbone; also selects RES2_OUT_CHANNELS below.
#define BACKBONE_RESNETTYPE R50
// data
// Per-channel values subtracted from / divided into the input in DataPreprocess().
static const std::vector<float> PIXEL_MEAN = { 103.53, 116.28, 123.675 };
static const std::vector<float> PIXEL_STD = {1.0, 1.0, 1.0};
static constexpr float MIN_SIZE = 800.0;
static constexpr float MAX_SIZE = 1333.0;
static constexpr int NUM_CLASSES = 80;
static int INPUT_H;  // size of model input
static int INPUT_W;
static constexpr int INPUT_H_ = 480;  // size of original image, you can change it to arbitrary size
static constexpr int INPUT_W_ = 640;
static int X_LEFT_PAD;  // pad in preprocessImg
static int X_RIGHT_PAD;
static int Y_TOP_PAD;
static int Y_BOTTOM_PAD;
static int h_ori;  // used when h_ori is not equal to INPUT_H_
static int w_ori;
// backbone
// 64 output channels for the basic-block variants (R18/R34), 256 otherwise.
static const int RES2_OUT_CHANNELS = (BACKBONE_RESNETTYPE == R18 ||
BACKBONE_RESNETTYPE == R34) ? 64 : 256;
// rpn
// Anchor sizes and aspect ratios; their product gives the anchor count per location in RPN().
static const std::vector<float> ANCHOR_SIZES = { 32, 64, 128, 256, 512 };
static const std::vector<float> ASPECT_RATIOS = { 0.5, 1.0, 2.0 };
static constexpr int PRE_NMS_TOP_K_TEST = 6000;
static constexpr float RPN_NMS_THRESH = 0.7;
static constexpr int POST_NMS_TOPK = 1000;
// roialign
static constexpr int STRIDES = 16;
static constexpr int SAMPLING_RATIO = 0;
static constexpr int POOLER_RESOLUTION = 14;
// roihead
static constexpr float NMS_THRESH_TEST = 0.5;
static constexpr int DETECTIONS_PER_IMAGE = 100;
static constexpr float SCORE_THRESH = 0.6;
static const std::vector<float> BBOX_REG_WEIGHTS = { 10.0, 10.0, 5.0, 5.0 };
static bool MASK_ON = false;

// Names of the network input and output tensors.
static const char* INPUT_NODE_NAME = "images";
static const std::vector<std::string> OUTPUT_NAMES = { "scores", "boxes",
"labels", "masks" };

// NMS method used in the second stage:
// 0: original nms
// 1: soft-nms (linear)
// 2: soft-nms (gaussian)
static int NMS_METHOD = 1;
// The set of accepted NMS_METHOD values.
static std::vector<int> NMS_METHOD_VEC = {0, 1, 2};

// Builds zero-centred anchor boxes for every (size, aspect ratio) pair.
//
// For a size s and ratio r the box has area s*s, width sqrt(s*s / r) and
// height r * width. Each box contributes four floats to the result, in the
// order (-w/2, -h/2, w/2, h/2); boxes are emitted size-major, ratio-minor.
std::vector<float> GenerateAnchors(const std::vector<float>& anchor_sizes,
const std::vector<float>& aspect_ratios) {
    std::vector<float> anchors;
    for (size_t size_idx = 0; size_idx < anchor_sizes.size(); ++size_idx) {
        const float side = anchor_sizes[size_idx];
        const float area = side * side;
        for (size_t ratio_idx = 0; ratio_idx < aspect_ratios.size(); ++ratio_idx) {
            const float ratio = aspect_ratios[ratio_idx];
            const float w = sqrt(area / ratio);
            const float h = ratio * w;
            const float half_w = w / 2.0;
            const float half_h = h / 2.0;
            anchors.push_back(-half_w);
            anchors.push_back(-half_h);
            anchors.push_back(half_w);
            anchors.push_back(half_h);
        }
    }
    return anchors;
}

// In-graph preprocessing: transpose the HWC input to CHW, then normalise every channel
// as (x - PIXEL_MEAN) / PIXEL_STD. Returns the output tensor of the division layer.
ITensor* DataPreprocess(INetworkDefinition *network, ITensor& input) {
    // HWC -> CHW
    auto to_chw = network->addShuffle(input);
    assert(to_chw);
    to_chw->setFirstTranspose(Permutation{ 2, 0, 1 });

    // subtract the per-channel mean (one value per channel, broadcast over H and W)
    auto mean_const = network->addConstant(Dims3{ 3, 1, 1 },
                                           Weights{ DataType::kFLOAT, PIXEL_MEAN.data(), 3 });
    assert(mean_const);
    ITensor* chw = to_chw->getOutput(0);
    auto centered = network->addElementWise(*chw, *mean_const->getOutput(0), ElementWiseOperation::kSUB);
    assert(centered);

    // divide by the per-channel standard deviation
    auto std_const = network->addConstant(Dims3{ 3, 1, 1 },
                                          Weights{ DataType::kFLOAT, PIXEL_STD.data(), 3 });
    assert(std_const);
    auto normalized = network->addElementWise(*centered->getOutput(0), *std_const->getOutput(0),
                                              ElementWiseOperation::kDIV);
    assert(normalized);

    ITensor* result = normalized->getOutput(0);
    return result;
}

// Region proposal network.
//
// Applies a shared 3x3 conv + ReLU to `features`, then two 1x1 conv heads (objectness
// logits with one channel per anchor, anchor deltas with four channels per anchor), and
// feeds both into RpnDecodePlugin followed by RpnNmsPlugin.
//
// network   - network definition the layers are added to
// weightMap - weights keyed by "proposal_generator.rpn_head.*"
// features  - backbone feature map; d[0] is used as the conv channel count
// Returns output 0 of the RPN NMS plugin layer.
ITensor* RPN(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap, ITensor& features) {
    int num_anchors = ANCHOR_SIZES.size() * ASPECT_RATIOS.size();
    int box_dim = 4;

    // rpn head conv
    auto rpn_head_conv = network->addConvolutionNd(features, features.getDimensions().d[0], DimsHW{ 3, 3 },
    weightMap["proposal_generator.rpn_head.conv.weight"],
    weightMap["proposal_generator.rpn_head.conv.bias"]);
    assert(rpn_head_conv);
    rpn_head_conv->setStrideNd(DimsHW{ 1, 1 });
    rpn_head_conv->setPaddingNd(DimsHW{ 1, 1 });
    auto rpn_head_relu = network->addActivation(*rpn_head_conv->getOutput(0), ActivationType::kRELU);
    assert(rpn_head_relu);

    // objectness logits
    auto rpn_head_logits = network->addConvolutionNd(*rpn_head_relu->getOutput(0), num_anchors, DimsHW{ 1, 1 },
    weightMap["proposal_generator.rpn_head.objectness_logits.weight"],
    weightMap["proposal_generator.rpn_head.objectness_logits.bias"]);
    assert(rpn_head_logits);
    rpn_head_logits->setStrideNd(DimsHW{ 1, 1 });

    // anchor deltas
    auto rpn_head_deltas = network->addConvolutionNd(*rpn_head_relu->getOutput(0), num_anchors * box_dim,
    DimsHW{ 1, 1 },
    weightMap["proposal_generator.rpn_head.anchor_deltas.weight"],
    weightMap["proposal_generator.rpn_head.anchor_deltas.bias"]);
    assert(rpn_head_deltas);
    rpn_head_deltas->setStrideNd(DimsHW{ 1, 1 });

    // decode: inputs are {objectness logits, anchor deltas}
    auto anchors = GenerateAnchors(ANCHOR_SIZES, ASPECT_RATIOS);
    auto rpnDecodePlugin = RpnDecodePlugin(PRE_NMS_TOP_K_TEST, anchors, STRIDES, INPUT_H, INPUT_W);
    std::vector<ITensor*> faster_decode_inputs = { rpn_head_logits->getOutput(0), rpn_head_deltas->getOutput(0) };
    auto rpnDecodeLayer = network->addPluginV2(faster_decode_inputs.data(), faster_decode_inputs.size(),
    rpnDecodePlugin);
    // A null plugin layer would otherwise be dereferenced on the next line.
    assert(rpnDecodeLayer);

    std::vector<ITensor*> nms_input = { rpnDecodeLayer->getOutput(0), rpnDecodeLayer->getOutput(1) };

    // nms
    auto nmsPlugin = RpnNmsPlugin(RPN_NMS_THRESH, POST_NMS_TOPK);
    auto nmsLayer = network->addPluginV2(nms_input.data(), nms_input.size(), nmsPlugin);
    assert(nmsLayer);
    return nmsLayer->getOutput(0);
}

// RoI feature extraction shared by the box head and the mask head: RoiAlignPlugin over
// {proposals, features} followed by the "roi_heads.res5" stage.
//
// num_proposals is forwarded to RoiAlignPlugin together with the channel count
// features->getDimensions().d[0]. Returns the tensor produced by MakeStage.
ITensor* SharedRoiTransform(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
ITensor* proposals, ITensor* features, int num_proposals) {
    const float spatial_scale = 1 / static_cast<float>(STRIDES);
    const int feature_channels = features->getDimensions().d[0];

    auto roiAlignPlugin = RoiAlignPlugin(POOLER_RESOLUTION, spatial_scale,
                                         SAMPLING_RATIO, num_proposals, feature_channels);
    std::vector<ITensor*> roi_inputs = { proposals, features };
    auto roiAlignLayer = network->addPluginV2(roi_inputs.data(), roi_inputs.size(), roiAlignPlugin);
    ITensor* pooled = roiAlignLayer->getOutput(0);

    // res5
    /* Mirrors https://github.com/facebookresearch/detectron2/
    blob/9246ebc3af1c023cfbdae77e5d976edbcf9a2933/detectron2/modeling/roi_heads/roi_heads.py#L430:
    the stage is built from bottleneck blocks, hence R50 is passed regardless of the backbone. */
    const int in_channels = pooled->getDimensions().d[1];
    const int out_channels = RES2_OUT_CHANNELS * 8;
    return MakeStage(network, weightMap, "roi_heads.res5",
                     *pooled, 3, R50,
                     in_channels,
                     512, out_channels, 2);
}

// Box head: RoI features -> average over axes 2 and 3 -> class scores and box deltas ->
// PredictorDecodePlugin -> BatchedNmsPlugin.
//
// Appends the three outputs of the batched NMS layer to `instances`, in output order
// 0, 1, 2 (the caller names them "scores", "boxes", "labels").
void BoxHead(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor* proposals,
    ITensor* features, std::vector<ITensor*>& instances) {

    ITensor* roi_features = SharedRoiTransform(network, weightMap, proposals, features, POST_NMS_TOPK);

    // Reduce-axes bitmask with bits 2 and 3 set (== 12); keepDimensions stays true.
    const unsigned int reduce_axes = (1U << 2) | (1U << 3);
    auto pooled = network->addReduce(*roi_features, ReduceOperation::kAVG, reduce_axes, true);
    ITensor* pooled_out = pooled->getOutput(0);

    // class scores: NUM_CLASSES + 1 logits, softmax, then drop the last class along axis 1
    auto cls_logits = network->addFullyConnected(*pooled_out, NUM_CLASSES + 1,
                                                 weightMap["roi_heads.box_predictor.cls_score.weight"],
                                                 weightMap["roi_heads.box_predictor.cls_score.bias"]);
    auto cls_probs = network->addSoftMax(*cls_logits->getOutput(0));
    ITensor* probs_out = cls_probs->getOutput(0);
    auto probs_dim = probs_out->getDimensions();

    const Dims4 slice_start{ 0, 0, 0, 0 };
    const Dims4 slice_size{ probs_dim.d[0], probs_dim.d[1] - 1, 1, 1 };
    const Dims4 slice_stride{ 1, 1, 1, 1 };
    auto fg_scores = network->addSlice(*probs_out, slice_start, slice_size, slice_stride);

    // per-class box regression deltas
    auto box_deltas = network->addFullyConnected(*pooled_out, NUM_CLASSES * 4,
                                                 weightMap["roi_heads.box_predictor.bbox_pred.weight"],
                                                 weightMap["roi_heads.box_predictor.bbox_pred.bias"]);

    // decode
    auto predictorDecodePlugin = PredictorDecodePlugin(probs_dim.d[0], INPUT_H, INPUT_W, BBOX_REG_WEIGHTS);
    std::vector<ITensor*> decode_inputs = { fg_scores->getOutput(0), box_deltas->getOutput(0), proposals };
    auto decode_layer = network->addPluginV2(decode_inputs.data(), decode_inputs.size(), predictorDecodePlugin);

    // nms
    auto batchedNmsPlugin = BatchedNmsPlugin(NMS_METHOD, NMS_THRESH_TEST, DETECTIONS_PER_IMAGE);
    std::vector<ITensor*> nms_inputs;
    for (int k = 0; k < 3; ++k) {
        nms_inputs.push_back(decode_layer->getOutput(k));
    }
    auto nms_layer = network->addPluginV2(nms_inputs.data(), nms_inputs.size(), batchedNmsPlugin);

    // instances
    for (int k = 0; k < 3; ++k) {
        instances.push_back(nms_layer->getOutput(k));
    }
}

// Mask head: RoI features for instances[1] -> 2x2 stride-2 deconvolution -> ReLU ->
// 1x1 conv with NUM_CLASSES output channels.
//
// With a single class the logits go straight through a sigmoid; otherwise
// MaskRcnnInferencePlugin is applied to {instances[2], logits}. The resulting mask tensor
// is appended to `instances`.
void MaskHead(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
    ITensor* features, std::vector<ITensor*>& instances, int out_channels = 256) {

    ITensor* roi_features = SharedRoiTransform(network, weightMap, instances[1], features, DETECTIONS_PER_IMAGE);

    // mask_fcn
    auto upsample = network->addDeconvolutionNd(*roi_features, out_channels, DimsHW{ 2, 2 },
                                                weightMap["roi_heads.mask_head.deconv.weight"],
                                                weightMap["roi_heads.mask_head.deconv.bias"]);
    upsample->setStrideNd(DimsHW{ 2, 2 });
    auto upsample_relu = network->addActivation(*upsample->getOutput(0), ActivationType::kRELU);
    assert(upsample_relu);

    auto mask_logits = network->addConvolutionNd(*upsample_relu->getOutput(0), NUM_CLASSES, DimsHW{ 1, 1 },
                                                 weightMap["roi_heads.mask_head.predictor.weight"],
                                                 weightMap["roi_heads.mask_head.predictor.bias"]);
    mask_logits->setStrideNd(DimsHW{ 1, 1 });
    ITensor* logits_out = mask_logits->getOutput(0);

    ITensor* masks = nullptr;
    if (NUM_CLASSES != 1) {
        auto maskRcnnInferencePlugin = MaskRcnnInferencePlugin(DETECTIONS_PER_IMAGE, POOLER_RESOLUTION);
        std::vector<ITensor*> plugin_inputs = { instances[2], logits_out };
        auto plugin_layer = network->addPluginV2(plugin_inputs.data(), plugin_inputs.size(),
                                                 maskRcnnInferencePlugin);
        masks = plugin_layer->getOutput(0);
    } else {
        auto sigmoid = network->addActivation(*logits_out, ActivationType::kSIGMOID);
        masks = sigmoid->getOutput(0);
    }
    instances.push_back(masks);
}

// Second stage: always runs the box head and, when MASK_ON is set, the mask head.
// Returns the collected output tensors: three from BoxHead, plus one more from MaskHead
// when masks are enabled.
std::vector<ITensor*> ROIHeads(INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
ITensor* proposals, ITensor* features) {
    std::vector<ITensor*> outputs;

    // box head
    BoxHead(network, weightMap, proposals, features, outputs);

    // mask head is optional
    if (!MASK_ON) {
        return outputs;
    }
    MaskHead(network, weightMap, features, outputs);
    return outputs;
}

// Build the R-CNN network (preprocess -> ResNet backbone -> RPN -> ROI heads) and compile
// it into a TensorRT engine.
//
// maxBatchSize     - forwarded to IBuilder::setMaxBatchSize
// wtsfile          - weight file read with loadWeights()
// builder, config  - owned by the caller; this function only configures them
// dt               - data type of the network input tensor
// quantizationType - "fp32", "fp16" or "int8"
//
// Returns the built engine, or nullptr when the build fails. Throws a const char* when
// quantizationType is not one of the supported values.
ICudaEngine* createEngine_rcnn(unsigned int maxBatchSize,
    const std::string& wtsfile, IBuilder* builder, IBuilderConfig* config, DataType dt,
    const std::string& quantizationType) {
    /*
    description: after fuse bn
    */
    // Reject an unsupported precision before anything is allocated, so the throw below
    // cannot leak the network definition or the loaded weights.
    if (quantizationType != "fp32" && quantizationType != "fp16" && quantizationType != "int8") {
        throw("does not support model type");
    }

    INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    // Create input tensor of shape {INPUT_H, INPUT_W, 3} with name INPUT_NODE_NAME
    ITensor* data = network->addInput(INPUT_NODE_NAME, dt, Dims3{ INPUT_H, INPUT_W, 3 });
    assert(data);

    // preprocess
    data = DataPreprocess(network, *data);
    std::map<std::string, Weights> weightMap;
    loadWeights(wtsfile, weightMap);

    // backbone
    ITensor* features = BuildResNet(network, weightMap, *data, BACKBONE_RESNETTYPE, 64, 64, RES2_OUT_CHANNELS);

    auto proposals = RPN(network, weightMap, *features);
    auto results = ROIHeads(network, weightMap, proposals, features);

    // build output: results[i] is exported under OUTPUT_NAMES[i]
    assert(results.size() <= OUTPUT_NAMES.size());
    for (size_t i = 0; i < results.size(); i++) {
        network->markOutput(*results[i]);
        results[i]->setName(OUTPUT_NAMES[i].c_str());
    }

    // build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1ULL << 30);
    if (quantizationType == "fp16") {
        config->setFlag(BuilderFlag::kFP16);
    } else if (quantizationType == "int8") {
        std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
        assert(builder->platformHasFastInt8());
        config->setFlag(BuilderFlag::kINT8);
        // NOTE(review): the calibrator is never freed; `config` keeps a pointer to it after
        // this function returns, so it is intentionally left alive here.
        Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/",
        "int8calib.table", INPUT_NODE_NAME);
        config->setInt8Calibrator(calibrator);
    }
    // "fp32" needs no extra builder flags.

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        // Do not claim success for a failed build; the caller checks the returned pointer.
        std::cerr << "Build engine failed!" << std::endl;
    }

    // destroy network
    network->destroy();

    // Release host memory
    // NOTE(review): assumes loadWeights() allocates each buffer with new[] - confirm.
    for (auto& mem : weightMap) {
        delete[] mem.second.values;
    }
    return engine;
}

// Build the engine for `wtsfile` and serialize it into *modelStream.
//
// maxBatchSize     - forwarded to createEngine_rcnn
// modelStream      - receives the serialized engine; the caller must destroy() it
// wtsfile          - weight file to load
// quantizationType - "fp32" (default), "fp16" or "int8"
void BuildRcnnModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& wtsfile,
const std::string& quantizationType = "fp32") {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    ICudaEngine* engine = createEngine_rcnn(maxBatchSize,
        wtsfile, builder, config, DataType::kFLOAT, quantizationType);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down; the builder config was previously leaked.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Run one batch through the engine on `stream` and copy the results back to the host.
//
// buffers - device pointers: [0] input, [1] scores, [2] boxes, [3] labels, [4] masks
//           (index 4 is only read when MASK_ON is set)
// input   - host input, BATCH_SIZE * INPUT_H * INPUT_W * 3 floats
// output  - host destinations in the same order as buffers[1..]
//
// The call blocks until the stream has finished. If enqueue fails, an error is printed
// and the host output buffers are left untouched.
void doInference(IExecutionContext& context, cudaStream_t& stream, std::vector<void*>& buffers,
std::vector<float>& input, std::vector<float*>& output) {
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input.data(), BATCH_SIZE * INPUT_H * INPUT_W * 3 * sizeof(float),
    cudaMemcpyHostToDevice, stream));

    if (!context.enqueue(BATCH_SIZE, buffers.data(), stream, nullptr)) {
        std::cerr << "doInference: enqueue failed, outputs were not updated" << std::endl;
        // Still drain the pending input copy before returning to the caller.
        CUDA_CHECK(cudaStreamSynchronize(stream));
        return;
    }

    CUDA_CHECK(cudaMemcpyAsync(output[0], buffers[1], BATCH_SIZE * DETECTIONS_PER_IMAGE * sizeof(float),
    cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(output[1], buffers[2], BATCH_SIZE * DETECTIONS_PER_IMAGE * 4 * sizeof(float),
    cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(output[2], buffers[3], BATCH_SIZE * DETECTIONS_PER_IMAGE * sizeof(float),
    cudaMemcpyDeviceToHost, stream));
    // Braces keep the macro expansion under the condition even if CUDA_CHECK expands to
    // more than one statement.
    if (MASK_ON) {
        CUDA_CHECK(cudaMemcpyAsync(output[3], buffers[4],
        BATCH_SIZE * DETECTIONS_PER_IMAGE * POOLER_RESOLUTION * POOLER_RESOLUTION * sizeof(float),
        cudaMemcpyDeviceToHost, stream));
    }

    // Checked like every other CUDA call here: asynchronous errors surface at this point.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Derive the model input size (INPUT_H, INPUT_W) from the original image size
// (INPUT_H_, INPUT_W_): scale so the shorter side equals MIN_SIZE, then, if the longer
// side would exceed MAX_SIZE, scale both sides down so the longer side equals MAX_SIZE.
// Both results are rounded to the nearest integer.
void calculateSize() {
    const int short_side = std::min(INPUT_H_, INPUT_W_);
    float ratio = MIN_SIZE / static_cast<float>(short_side);

    float newh = 0;
    float neww = 0;
    const bool height_is_shorter = INPUT_H_ < INPUT_W_;
    if (!height_is_shorter) {
        newh = ratio * INPUT_H_;
        neww = MIN_SIZE;
    } else {
        newh = MIN_SIZE;
        neww = ratio * INPUT_W_;
    }

    const float long_side = std::max(newh, neww);
    if (long_side > MAX_SIZE) {
        ratio = MAX_SIZE / static_cast<float>(long_side);
        newh *= ratio;
        neww *= ratio;
    }

    INPUT_H = static_cast<int>(newh + 0.5);
    INPUT_W = static_cast<int>(neww + 0.5);
}


// Parse the command line.
//   -s <wts> <engine> [m]    fills wtsFile and engineFile
//   -d <engine> <imgDir> [m] fills engineFile and imgDir
// A fifth argument equal to "m" switches MASK_ON on. Returns false when fewer than four
// arguments are given or the mode flag is neither "-s" nor "-d"; the outputs and MASK_ON
// are left untouched in that case.
bool parse_args(int argc, char** argv, std::string& wtsFile, std::string& engineFile, std::string& imgDir) {
    if (argc < 4) {
        return false;
    }

    const std::string mode(argv[1]);
    const bool serialize = (mode == "-s");
    const bool deserialize = !serialize && (mode == "-d");
    if (!serialize && !deserialize) {
        return false;
    }

    if (serialize) {
        wtsFile = argv[2];
        engineFile = argv[3];
    } else {
        engineFile = argv[2];
        imgDir = argv[3];
    }

    const bool mask_requested = argc >= 5 && std::string(argv[4]) == "m";
    if (mask_requested) {
        MASK_ON = true;
    }
    return true;
}

int main(int argc, char** argv) {
    
    int flag = 0;
    for (int &item : NMS_METHOD_VEC) {
        if (item == NMS_METHOD) {
            flag = 1;
            printf("The nms method %d is applied.\n", NMS_METHOD);
            break;
        }
    }
    if (flag == 0) {
        printf("[WARNING] The nms_method %d is not supported, please choose from [0, 1, 2].\n", NMS_METHOD);
        printf("[WARNING] To make the nms robust, the default nms method 0 is applied.\n");
        NMS_METHOD = 0;
    }

    // calculate size
    calculateSize();

    cudaSetDevice(DEVICE);

    std::string wtsFile = "";
    std::string engineFile = "";

    std::string imgDir;
    if (!parse_args(argc, argv, wtsFile, engineFile, imgDir)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./rcnn -s [.wts] [.engine] [m] // serialize model to plan file" << std::endl;
        std::cerr << "./rcnn -d [.engine] ../samples [m]  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    if (!wtsFile.empty()) {
        IHostMemory* modelStream{ nullptr };
        BuildRcnnModel(BATCH_SIZE, &modelStream, wtsFile, "fp32");
        assert(modelStream != nullptr);
        std::ofstream p(engineFile, std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }

    // deserialize the .engine and run inference
    std::ifstream file(engineFile, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engineFile << " error!" << std::endl;
        return -1;
    }

    std::string trtModelStream;
    size_t modelSize{ 0 };
    file.seekg(0, file.end);
    modelSize = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream.resize(modelSize);
    assert(!trtModelStream.empty());
    file.read(const_cast<char*>(trtModelStream.c_str()), modelSize);
    file.close();

    // build engine
    std::cout << "build engine" << std::endl;
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream.c_str(), modelSize);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    runtime->destroy();

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // prepare input file
    std::vector<std::string> fileList;
    if (read_files_in_dir(imgDir.c_str(), fileList) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // prepare input data
    std::vector<float> data(BATCH_SIZE * INPUT_H * INPUT_W * 3, 0);
    void *data_d, *scores_d, *boxes_d, *classes_d, *masks_d;
    CUDA_CHECK(cudaMalloc(&data_d, BATCH_SIZE * INPUT_H * INPUT_W * 3 * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&scores_d, BATCH_SIZE * DETECTIONS_PER_IMAGE * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&boxes_d, BATCH_SIZE * DETECTIONS_PER_IMAGE * 4 * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&classes_d, BATCH_SIZE * DETECTIONS_PER_IMAGE * sizeof(float)));

    std::vector<float> scores_h(BATCH_SIZE * DETECTIONS_PER_IMAGE);
    std::vector<float> boxes_h(BATCH_SIZE * DETECTIONS_PER_IMAGE * 4);
    std::vector<float> classes_h(BATCH_SIZE * DETECTIONS_PER_IMAGE);
    std::vector<float> masks_h;

    std::vector<void*> buffers = { data_d, scores_d, boxes_d, classes_d };
    std::vector<float*> outputs = {scores_h.data(), boxes_h.data(), classes_h.data()};

    if (MASK_ON) {
        CUDA_CHECK(cudaMalloc(&masks_d,
        BATCH_SIZE * DETECTIONS_PER_IMAGE * POOLER_RESOLUTION * POOLER_RESOLUTION * sizeof(float)));
        masks_h.resize(BATCH_SIZE * DETECTIONS_PER_IMAGE * POOLER_RESOLUTION * POOLER_RESOLUTION);
        buffers.push_back(masks_d);
        outputs.push_back(masks_h.data());
    }

    int fcount = 0;
    int fileLen = fileList.size();
    for (int f = 0; f < fileLen; f++) {
        fcount++;
        if (fcount < BATCH_SIZE && f + 1 != fileLen) continue;

        for (int b = 0; b < fcount; b++) {
            cv::Mat img = cv::imread(imgDir + "/" + fileList[f - fcount + 1 + b]);
            h_ori = img.rows;
            w_ori = img.cols;
            img = preprocessImg(img, INPUT_W, INPUT_H, X_LEFT_PAD, X_RIGHT_PAD, Y_TOP_PAD, Y_BOTTOM_PAD);

            if (img.empty()) continue;
            for (int i = 0; i < INPUT_H * INPUT_W * 3; i++)
                data[b*INPUT_H * INPUT_W * 3 + i] = static_cast<float>(*(img.data + i));
        }

        // Run inference
        auto start = std::chrono::system_clock::now();

        doInference(*context, stream, buffers, data, outputs);

        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        float h_ratio = static_cast<float>(h_ori) / (INPUT_H - (Y_TOP_PAD + Y_BOTTOM_PAD));  // ratio of original image size to model input size
        float w_ratio = static_cast<float>(w_ori) / (INPUT_W - (X_LEFT_PAD + X_RIGHT_PAD));

        for (int b = 0; b < fcount; b++) {
            cv::Mat img = cv::imread(imgDir + "/" + fileList[f - fcount + 1 + b]);
            for (int i = 0; i < DETECTIONS_PER_IMAGE; i++) {
                if (scores_h[b * DETECTIONS_PER_IMAGE + i] > SCORE_THRESH) {
                    float x1 = (boxes_h[b * DETECTIONS_PER_IMAGE * 4 + i * 4 + 0] - X_LEFT_PAD) * w_ratio;
                    float y1 = (boxes_h[b * DETECTIONS_PER_IMAGE * 4 + i * 4 + 1] - Y_TOP_PAD) * h_ratio;
                    float x2 = (boxes_h[b * DETECTIONS_PER_IMAGE * 4 + i * 4 + 2] - X_LEFT_PAD) * w_ratio;
                    float y2 = (boxes_h[b * DETECTIONS_PER_IMAGE * 4 + i * 4 + 3] - Y_TOP_PAD) * h_ratio;
                    int label = classes_h[b * DETECTIONS_PER_IMAGE + i];
                    float score = scores_h[b * DETECTIONS_PER_IMAGE + i];
                    printf("boxes:[%.6f, %.6f, %.6f, %.6f] scores: %.4f label: %d \n", x1, y1, x2, y2, score, label);
                    cv::Rect r(x1, y1, x2 - x1, y2 - y1);
                    cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
                    cv::putText(img, std::to_string(label), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
                    cv::Scalar(0xFF, 0xFF, 0xFF), 2);


                    if (MASK_ON) {
                        cv::Mat maskPart = cv::Mat::zeros(cv::Size(POOLER_RESOLUTION, POOLER_RESOLUTION), CV_32FC1);
                        memcpy(maskPart.data,
                          &masks_h[b * DETECTIONS_PER_IMAGE * POOLER_RESOLUTION * POOLER_RESOLUTION +
                          i * POOLER_RESOLUTION * POOLER_RESOLUTION],
                          POOLER_RESOLUTION * POOLER_RESOLUTION * sizeof(float));

                        cv::Rect r(cv::Point(floor(x1) - 1 < 0 ? 0 : floor(x1) - 1,
                                             floor(y1) - 1 < 0 ? 0 : floor(y1) - 1),
                                   cv::Point(ceil(x2) + 1 > w_ori ? w_ori : ceil(x2) + 1,
                                             ceil(y2) + 1 > h_ori ? h_ori : ceil(y2) + 1));
                        cv::resize(maskPart, maskPart, cv::Size(r.width, r.height));
                        cv::Mat curMask = cv::Mat::zeros(cv::Size(w_ori, h_ori), CV_8UC1);
                        cv::threshold(maskPart, maskPart, 0.5, 255, cv::THRESH_BINARY);
                        curMask(r) += maskPart;
                        std::vector<std::vector<cv::Point>> contours;
                        cv::findContours(curMask, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_NONE);
                        for (int c = 0; c < contours.size(); c++)
                            cv::drawContours(img, contours, c, cv::Scalar(0, 0, 255));
                    }
                }
            }
            cv::imwrite("_" + fileList[f - fcount + 1 + b], img);
        }
        fcount = 0;
    }

    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(data_d));
    CUDA_CHECK(cudaFree(scores_d));
    CUDA_CHECK(cudaFree(boxes_d));
    CUDA_CHECK(cudaFree(classes_d));
    if (MASK_ON) CUDA_CHECK(cudaFree(masks_d));
    context->destroy();
    engine->destroy();

    return 0;
}


================================================
FILE: real-esrgan/general-x4v3/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.16)
project(real-esrgan)

# Add the project's cmake/ directory to the module search path.
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")

add_definitions(-std=c++17)
add_definitions(-DAPI_EXPORTS)
# option() expects <variable> "<help text>" [value]; without the help string the word
# OFF was being consumed as the description.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime library" OFF)
#set(CMAKE_CXX_STANDARD 11)
# Debug stays the default, but a build type given on the command line is respected.
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Debug)
endif()

#find_package(CUDA REQUIRED)

INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src/include)

# cuda
FIND_PACKAGE(CUDA REQUIRED)
#INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)

# <------------------------TensorRT Related------------------------->
# Replace the two placeholders below with your TensorRT include and lib directories.
include_directories(YOUR_TENSORRT_INCLUDE_DIR) # TensorRT-8.6.1.6/include
link_directories(YOUR_TENSORRT_LIB_DIR) # TensorRT-8.6.1.6/lib

# <------------------------OpenCV Related------------------------->
# opencv
FIND_PACKAGE(OpenCV REQUIRED)
INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})

set(CMAKE_CXX_STANDARD 17)

add_executable(${PROJECT_NAME} main.cpp)

# Plugin library holding the pixel-shuffle CUDA source; linked into the executable below.
cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/src/pixel_shuffle/pixel_shuffle.cu)
target_link_libraries(myplugins nvinfer cudart)


TARGET_LINK_LIBRARIES(${PROJECT_NAME} nvinfer)
TARGET_LINK_LIBRARIES(${PROJECT_NAME} cudart)
TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${OpenCV_LIBS})
TARGET_LINK_LIBRARIES(${PROJECT_NAME} myplugins)


================================================
FILE: real-esrgan/general-x4v3/README.md
================================================
# Real-ESRGAN realesr-general-x4v3 model

## How to Run
0. Replace YOUR_TENSORRT_INCLUDE_DIR and YOUR_TENSORRT_LIB_DIR in CMakeLists.txt with your TensorRT include and lib directories.
1. Generate the .wts file from the PyTorch .pth checkpoint
```
git clone https://github.com/xinntao/Real-ESRGAN.git
cd Real-ESRGAN

# Install basicsr - https://github.com/xinntao/BasicSR
# We use BasicSR for both training and inference
pip install basicsr
# facexlib and gfpgan are for face enhancement
pip install facexlib
pip install gfpgan
pip install -r requirements.txt
python setup.py develop
```
Download realesr-general-x4v3.pth (and realesr-general-wdn-x4v3.pth if needed) from
https://github.com/xinntao/Real-ESRGAN/releases

```
cp {tensorrtx}/real-esrgan/general-x4v3/gen_wts.py {xinntao}/Real-ESRGAN
cd {xinntao}/Real-ESRGAN
python gen_wts.py
// a file 'real-esrgan.wts' will be generated.
```

**Be aware that if you need both realesr-general-x4v3.pth and realesr-general-wdn-x4v3.pth, please write a Python script to average all weights of realesr-general-x4v3.pth and realesr-general-wdn-x4v3.pth (from {xinntao}/Real-ESRGAN), then save it as a .pth file, and use this new file to generate a .wts file.**

2. Build tensorrtx/real-esrgan/general-x4v3 and run

```
cd {tensorrtx}/real-esrgan/general-x4v3/
mkdir build
cd build
cp {xinntao}/Real-ESRGAN/real-esrgan.wts {tensorrtx}/real-esrgan/weights/
cmake ..
make
./real-esrgan your_images_dir
```


================================================
FILE: real-esrgan/general-x4v3/cmake/FindTensorRT.cmake
================================================
# source:
# https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake

# This module defines the following variables:
#
# ::
#
#   TensorRT_INCLUDE_DIRS
#   TensorRT_LIBRARIES
#   TensorRT_FOUND
#
# ::
#
#   TensorRT_VERSION_STRING - version (x.y.z)
#   TensorRT_VERSION_MAJOR  - major version (x)
#   TensorRT_VERSION_MINOR  - minor version (y)
#   TensorRT_VERSION_PATCH  - patch version (z)
#
# Hints
# ^^^^^
# A user may set ``TensorRT_DIR`` to an installation root to tell this module where to look.
#
# Names of the variables holding the search-path arguments, tried in order.
set(_TensorRT_SEARCHES)

# A user-supplied TensorRT_DIR is searched first and exclusively (NO_DEFAULT_PATH).
if(TensorRT_DIR)
    set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_DIR} NO_DEFAULT_PATH)
    list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT)
endif()

# appends some common paths
set(_TensorRT_SEARCH_NORMAL
        PATHS "/usr"
        )
list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL)

# Include dir
foreach(search ${_TensorRT_SEARCHES})
    find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include)
endforeach()

if(NOT TensorRT_LIBRARY)
    foreach(search ${_TensorRT_SEARCHES})
        find_library(TensorRT_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib)
    endforeach()
endif()

# The guard tests the same variable the search fills in (it previously tested
# TensorRT_PARSERS_LIBRARY, which is never set).
if(NOT TensorRT_NVPARSERS_LIBRARY)
    foreach(search ${_TensorRT_SEARCHES})
        find_library(TensorRT_NVPARSERS_LIBRARY NAMES nvparsers ${${search}} PATH_SUFFIXES lib)
    endforeach()
endif()

if(NOT TensorRT_NVONNXPARSER_LIBRARY)
    foreach(search ${_TensorRT_SEARCHES})
        find_library(TensorRT_NVONNXPARSER_LIBRARY NAMES nvonnxparser ${${search}} PATH_SUFFIXES lib)
    endforeach()
endif()

mark_as_advanced(TensorRT_INCLUDE_DIR)

# Extract the version numbers from the NV_TENSORRT_* defines in NvInfer.h.
if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h")
    file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
    file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
    file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$")

    string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}")
    string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}")
    string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}")
    set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}")
endif()

include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING)

if(TensorRT_FOUND)
    set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR})

    if(NOT TensorRT_LIBRARIES)
        set(TensorRT_LIBRARIES ${TensorRT_LIBRARY} ${TensorRT_NVONNXPARSER_LIBRARY} ${TensorRT_NVPARSERS_LIBRARY})
    endif()

    # Imported target wrapping the nvinfer library and the TensorRT include directory.
    if(NOT TARGET TensorRT::TensorRT)
        add_library(TensorRT::TensorRT UNKNOWN IMPORTED)
        set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}")
        set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}")
    endif()
endif()


================================================
FILE: real-esrgan/general-x4v3/gen_wts.py
================================================
import argparse
import os
import struct
from realesrgan import RealESRGANer
from realesrgan.archs.srvgg_arch import SRVGGNetCompact

from basicsr.archs.rrdbnet_arch import RRDBNet
from basicsr.utils.download_util import load_file_from_url


def main():
    """Export the weights of a Real-ESRGAN model to ``real-esrgan.wts``.

    The model architecture is chosen from ``--model_name``, the checkpoint is
    taken from ``--model_path`` or looked up / downloaded into a ``weights``
    folder, and the model is loaded through ``RealESRGANer``.  Every tensor of
    the loaded model's ``state_dict`` is then written as one text line::

        <number of tensors>
        <name> <element count> <hex of big-endian float32> ...

    Nothing is written when ``real-esrgan.wts`` already exists in the current
    directory.  Several options (``--input``, ``--output``, ``--outscale``,
    ``--suffix``, ``--face_enhance``, ``--alpha_upsampler``, ``--ext``) are
    parsed but not used by this function.

    Raises:
        ValueError: if ``--model_name`` is not one of the supported models.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, help='Input image or folder')
    parser.add_argument(
        '-n',
        '--model_name',
        type=str,
        default='realesr-general-x4v3',
        help=('RealESRGAN_x2plus Model names: '
              'realesr-animevideov3 | realesr-general-x4v3'))
    parser.add_argument('-o', '--output', type=str, help='Output folder')
    parser.add_argument(
        '-dn',
        '--denoise_strength',
        type=float,
        default=0.5,
        help=('Denoise strength. 0 for weak denoise (keep noise), 1 for strong denoise ability. '
              'Only used for the realesr-general-x4v3 model'))
    parser.add_argument('-s', '--outscale', type=float, default=4, help='The final upsampling scale of the image')
    parser.add_argument(
        '--model_path', type=str, default=None, help='[Option] Model path. Usually, you do not need to specify it')
    parser.add_argument('--suffix', type=str, default='out', help='Suffix of the restored image')
    parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
    parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
    parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
    parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
    parser.add_argument(
        '--fp32', action='store_true', help='Use fp32 precision during inference. Default: fp16 (half precision).')
    parser.add_argument(
        '--alpha_upsampler',
        type=str,
        default='realesrgan',
        help='The upsampler for the alpha channels. Options: realesrgan | bicubic')
    parser.add_argument(
        '--ext',
        type=str,
        default='auto',
        help='Image extension. Options: auto | jpg | png, auto means using the same extension as inputs')
    parser.add_argument(
        '-g', '--gpu-id', type=int, default=None, help='gpu device to use (default=None) can be 0,1,2 for multi-gpu')

    args = parser.parse_args()

    # determine models according to model names (a trailing ".pth" or similar is dropped)
    args.model_name = args.model_name.split('.')[0]
    if args.model_name == 'RealESRGAN_x4plus':  # x4 RRDBNet model
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
        netscale = 4
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth']
    elif args.model_name == 'RealESRNet_x4plus':  # x4 RRDBNet model
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
        netscale = 4
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.1/RealESRNet_x4plus.pth']
    elif args.model_name == 'RealESRGAN_x4plus_anime_6B':  # x4 RRDBNet model with 6 blocks
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
        netscale = 4
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth']
    elif args.model_name == 'RealESRGAN_x2plus':  # x2 RRDBNet model
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
        netscale = 2
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth']
    elif args.model_name == 'realesr-animevideov3':  # x4 VGG-style model (XS size)
        model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu')
        netscale = 4
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-animevideov3.pth']
    elif args.model_name == 'realesr-general-x4v3':  # x4 VGG-style model (S size)
        model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu')
        netscale = 4
        file_url = [
            'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-wdn-x4v3.pth',
            'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth'
        ]
    else:
        # Without this branch model/netscale/file_url stay unbound and the
        # script fails later with an unrelated UnboundLocalError.
        raise ValueError('Unknown model name: {!r}'.format(args.model_name))

    # determine model paths
    if args.model_path is not None:
        model_path = args.model_path
    else:
        model_path = os.path.join('weights', args.model_name + '.pth')
        if not os.path.isfile(model_path):
            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
            for url in file_url:
                # model_path will be updated; after the loop it refers to the last URL's file
                model_path = load_file_from_url(
                    url=url, model_dir=os.path.join(ROOT_DIR, 'weights'), progress=True, file_name=None)

    # The DNI interpolation between the normal and the "wdn" checkpoint is not
    # used here (dni_weight stays None).  Instead, for realesr-general-x4v3 with
    # a denoise strength other than 1, the path is redirected to a file whose
    # name carries the "-cat" suffix.
    # NOTE(review): presumably that file is prepared beforehand by the user - confirm.
    dni_weight = None
    if args.model_name == 'realesr-general-x4v3' and args.denoise_strength != 1:
        model_path = model_path.replace('realesr-general-x4v3', 'realesr-general-x4v3-cat')

    # restorer
    upsampler = RealESRGANer(
        scale=netscale,
        model_path=model_path,
        dni_weight=dni_weight,
        model=model,
        tile=args.tile,
        tile_pad=args.tile_pad,
        pre_pad=args.pre_pad,
        half=not args.fp32,
        gpu_id=args.gpu_id)

    if os.path.isfile('real-esrgan.wts'):
        print('Already, real-esrgan.wts file exists.')
    else:
        print('making real-esrgan.wts file ...')
        state_dict = upsampler.model.state_dict()
        # The context manager guarantees the file is flushed and closed even
        # if serialising one of the tensors raises.
        with open("real-esrgan.wts", 'w') as f:
            f.write("{}\n".format(len(state_dict.keys())))
            for k, v in state_dict.items():
                print('key: ', k)
                print('value: ', v.shape)
                vr = v.reshape(-1).cpu().numpy()
                f.write("{} {}".format(k, len(vr)))
                for vv in vr:
                    f.write(" ")
                    # each value is stored as the hex text of its big-endian float32 bytes
                    f.write(struct.pack(">f", float(vv)).hex())
                f.write("\n")
        print('Completed real-esrgan.wts file!')


# Run the exporter only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()


================================================
FILE: real-esrgan/general-x4v3/main.cpp
================================================
#include <NvInfer.h>
#include <dirent.h>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <opencv4/opencv2/opencv.hpp>
#include <string>
#include <vector>

#include "config/config.hpp"
#include "cuda_utils.h"
#include "logging/logging.h"
#include "pixel_shuffle/pixel_shuffle.hpp"
#include "preprocess/preprocess.hpp"

// Logger instance handed to the TensorRT builder and runtime created in this file.
static Logger gLogger;

// Bring the TensorRT API names (Weights, DataType, ITensor, ...) into scope unqualified.
using namespace nvinfer1;

// TensorRT weight files have a simple space delimited format:
//   <number of blobs>
//   [name] [size] <data x size in hex>
//
// Reads every blob of the given .wts file and returns a map from blob name to
// a Weights descriptor of type kFLOAT.  The value buffers are allocated with
// malloc() and are owned by the caller, who has to free() them once the
// network has been built.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs (initialised so a failed read cannot leave garbage behind)
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob: one 32-bit word per value.  sizeof(*val) is the element size;
        // sizeof(val) would be the pointer size and over-allocate on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Appends a 3x3 convolution (stride 1, padding 1) followed by a parametric ReLU
// to `network` and returns the PReLU layer.
//
//   network   - network definition that receives the new layers
//   weightMap - weights keyed by name; the convolution reads
//               "body.<index>.weight" / "body.<index>.bias" and the PReLU slope
//               reads "body.<index + 1>.weight"
//   input     - tensor fed into the convolution
//   conv_nb   - number of convolution output channels; the slope constant is
//               shaped {1, conv_nb, 1, 1} so there is one slope per channel
//   index     - position of the convolution inside the "body" sequence
auto* ConvPRelu(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int conv_nb,
                int index) {
    const std::string convPrefix = "body." + std::to_string(index);
    const std::string slopeKey = "body." + std::to_string(index + 1) + ".weight";

    IConvolutionLayer* conv = network->addConvolutionNd(input, conv_nb, DimsHW{3, 3}, weightMap[convPrefix + ".weight"],
                                                        weightMap[convPrefix + ".bias"]);
    assert(conv);
    conv->setName((convPrefix + ".weight").c_str());
    conv->setStrideNd(DimsHW{1, 1});
    conv->setPaddingNd(DimsHW{1, 1});
    auto conv_res = conv->getOutput(0);

    // PReLU slope: one value per output channel of the convolution.  The shape
    // follows conv_nb instead of a hard-coded 64 so both always stay in sync.
    auto slope = network->addConstant(Dims4{1, conv_nb, 1, 1}, weightMap[slopeKey]);
    assert(slope);
    slope->setName(slopeKey.c_str());

    auto prelu = network->addParametricReLU(*conv_res, *slope->getOutput(0));
    assert(prelu);

    return prelu;
}

// Builds the network from the weights in `wts_path`, serialises the resulting
// engine and writes it to "../weights/real-esrgan.engine".
//
// Network layout as constructed below: one conv+PReLU on the input, 32 further
// conv+PReLU blocks, a final 3x3 convolution with 48 output channels, the
// PixelShuffle plugin with upscale factor 4, and an element-wise sum with a
// nearest-neighbour upscaled copy of the input.
//
//   dt       - currently unused; the precision is selected through USE_FP16
//   wts_path - path of the .wts file read by loadWeights()
void build_engine(DataType dt, std::string& wts_path) {
    (void)dt;  // kept for interface compatibility

    std::map<std::string, Weights> weightMap = loadWeights(wts_path);

    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
    assert(builder);
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    assert(config);

    // Flag value 1U is bit 0 of the creation flags, as in the original call.
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1U);
    assert(network);

    auto data = network->addInput(INPUT_BLOB_NAME, nvinfer1::DataType::kFLOAT,
                                  nvinfer1::Dims4{BATCH_SIZE, INPUT_C, INPUT_H, INPUT_W});
    assert(data);

    // first conv + PReLU, followed by the 32 body blocks (weights body.2 ... body.65)
    auto layer = ConvPRelu(network, weightMap, *data, 64, 0);

    for (int i = 0; i < 32; ++i) {
        layer = ConvPRelu(network, weightMap, *layer->getOutput(0), 64, 2 * i + 2);
    }

    auto conv_last = network->addConvolutionNd(*layer->getOutput(0), 48, DimsHW{3, 3}, weightMap["body.66.weight"],
                                               weightMap["body.66.bias"]);
    assert(conv_last);
    conv_last->setName("body.66.weight");
    conv_last->setStrideNd(DimsHW{1, 1});
    conv_last->setPaddingNd(DimsHW{1, 1});
    auto conv_last_res = conv_last->getOutput(0);

    // add pixel shuffle layer by plugin
    IPluginCreator* creator = getPluginRegistry()->getPluginCreator("PixelShufflePlugin", "1");
    assert(creator && "PixelShufflePlugin is not registered");
    std::vector<PluginField> pluginData;
    int upscaleFactor = 4;
    pluginData.emplace_back(PluginField{"upscaleFactor", &upscaleFactor, PluginFieldType::kINT32, 1});
    PluginFieldCollection pluginFCWithData = {static_cast<int>(pluginData.size()), pluginData.data()};
    auto pluginObj = creator->createPlugin("PixelShuffle", &pluginFCWithData);
    assert(pluginObj && "Failed to create the PixelShuffle plugin");
    // NOTE(review): pluginObj is never released in this function - confirm the
    // ownership rules against the TensorRT plugin documentation.

    auto pixelShuffleLayer = network->addPluginV2(&conv_last_res, 1, *pluginObj);
    assert(pixelShuffleLayer);

    // the input "data" interpolate 4x and add to pixelShuffleLayer->getOutput(0)
    auto interpolateLayer = network->addResize(*data);
    assert(interpolateLayer);
    interpolateLayer->setResizeMode(ResizeMode::kNEAREST);
    // One scale per tensor dimension (N, C, H, W); only H and W are enlarged.
    const float scales[] = {1.0f, 1.0f, static_cast<float>(OUT_SCALE), static_cast<float>(OUT_SCALE)};
    // The second argument is the NUMBER of scales, not the scale factor; the
    // original passed OUT_SCALE, which is only correct while OUT_SCALE == 4.
    const int nbScales = static_cast<int>(sizeof(scales) / sizeof(scales[0]));
    interpolateLayer->setScales(scales, nbScales);

    // Add the two tensor as output
    auto addLayer = network->addElementWise(*interpolateLayer->getOutput(0), *pixelShuffleLayer->getOutput(0),
                                            ElementWiseOperation::kSUM);
    assert(addLayer);

    // output
    addLayer->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*addLayer->getOutput(0));

    // fp16
    if (USE_FP16) {
        config->setFlag(BuilderFlag::kFP16);
    }

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Check the result before announcing success.
    assert(serialized_model != nullptr);
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    std::ofstream ofs("../weights/real-esrgan.engine", std::ios::binary);
    if (!ofs) {
        std::cerr << "Unable to open ../weights/real-esrgan.engine for writing." << std::endl;
    } else {
        ofs.write(reinterpret_cast<const char*>(serialized_model->data()), serialized_model->size());
        if (!ofs) {
            std::cerr << "Failed to write ../weights/real-esrgan.engine." << std::endl;
        }
    }

    delete config;
    delete serialized_model;
    delete builder;
}

// Appends the name of every entry of directory `p_dir_name` (except "." and
// "..") to `file_names`.  Only the bare entry names are stored, not full paths.
// Returns 0 on success and -1 when the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const std::string entry_name(entry->d_name);
        if (entry_name == "." || entry_name == "..") {
            continue;
        }
        file_names.push_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}

// Runs one inference on `stream` and copies the result back to the host.
//
//   context - execution context; binding 0 is set to the configured input shape
//   stream  - CUDA stream used for the enqueue and the device-to-host copy
//   buffers - device buffers; buffers[1] is read as the output buffer
//   output  - host destination for BATCH_SIZE * OUTPUT_SIZE floats
//
// The stream is synchronised before returning.  If the enqueue fails, an error
// is reported and `output` is left untouched.
void doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output) {
    context.setBindingDimensions(0, Dims4(BATCH_SIZE, INPUT_C, INPUT_H, INPUT_W));
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueueV2 failed; the output buffer was not updated." << std::endl;
        // Still wait for work queued earlier on this stream so the caller can
        // safely reuse its host buffers.
        CUDA_CHECK(cudaStreamSynchronize(stream));
        return;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], BATCH_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost,
                               stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

int main(int argc, char** argv) {
    std::string img_dir;

    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <image_dir>" << std::endl;
        return -1;
    } else {
        img_dir = argv[1];
    }

    std::string wts_path = "../weights/real-esrgan.wts";
    build_engine(DataType::kFLOAT, wts_path);

    std::string engine_name = "../weights/real-esrgan.engine";
    // deserialize the .engine and run inference
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        return -1;
    }
    char* trtModelStream = nullptr;
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    assert(engine->getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex == 0);
    assert(outputIndex == 1);

    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));

    std::vector<float> data;
    std::vector<float> output;
    //std::vector<float> res;

    //data.resize(BATCH_SIZE * 3 * INPUT_H * INPUT_W);
    data.resize(BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W);
    output.resize(BATCH_SIZE * OUTPUT_SIZE);

    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    for (int index = 0; index < file_names.size(); ++index) {

        auto img = cv::imread(img_dir + "/" + file_names[index]);
        auto begin = std::chrono::high_resolution_clock::now();

        // BATCH_SIZE = 1
        for (int b = 0; b < BATCH_SIZE; b++) {
            int i = 0;
            for (int row = 0; row < INPUT_H; ++row) {
                uchar* uc_pixel = img.data + row * img.step;
                for (int col = 0; col < INPUT_W; ++col) {
                    //    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
                    // BGR2RGB and normalization
                    data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
                    data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
                    data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
                    uc_pixel += 3;
                    ++i;
                }
            }
        }
        CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], data.data(),
                                   BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice,
                                   stream));
        doInference(*context, stream, (void**)buffers, output.data());
        auto end = std::chrono::high_resolution_clock::now();
        std::cout << "Inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count()
                  << " ms" << std::endl;

        int OUTPUT_C = 3;
        int OUTPUT_H = INPUT_H * OUT_SCALE;
        int OUTPUT_W = INPUT_W * OUT_SCALE;

        for (int b = 0; b < BATCH_SIZE; b++) {
            cv::Mat img_res(OUTPUT_H, OUTPUT_W, CV_8UC3);
            int i = 0;
            for (int row = 0; row < OUTPUT_H; ++row) {
                uchar* uc_pixel = img_res.data + row * img_res.step;
                for (int col = 0; col < OUTPUT_W; ++col) {
                    // RGB2BGR and de_normalization
                    auto r2 = std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i] * 255.0);
                    if (r2 < 0)
                        r2 = 0;
                    if (r2 > 255)
                        r2 = 255;
                    auto g2 = std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i + 1 * OUTPUT_H * OUTPUT_W] *
                                         255.0);
                    if (g2 < 0)
                        g2 = 0;
                    if (g2 > 255)
                        g2 = 255;
                    auto b2 = std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i + 2 * OUTPUT_H * OUTPUT_W] *
                                         255.0);
                    if (b2 < 0)
                        b2 = 0;
                    if (b2 > 255)
                        b2 = 255;

                    uc_pixel[0] = static_cast<uchar>(b2);  // B
                    uc_pixel[1] = static_cast<uchar>(g2);  // G
                    uc_pixel[2] = static_cast<uchar>(r2);  // R

                    // uc_pixel[0] = static_cast<uchar>(std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i + 2 * OUTPUT_H * OUTPUT_W] * 255.0)); // B
                    // uc_pixel[1] = static_cast<uchar>(std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i + 1 * OUTPUT_H * OUTPUT_W] * 255.0)); // G
                    // uc_pixel[2] = static_cast<uchar>(std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i] * 255.0)); // R
                    uc_pixel += 3;
                    ++i;
                }
            }
            cv::imwrite("_" + file_names[index] + ".jpg", img_res);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(buffers[0]));
    CUDA_CHECK(cudaFree(buffers[1]));
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
}


================================================
FILE: real-esrgan/general-x4v3/src/include/config/config.hpp
================================================
#ifndef REAL_ESRGAN_TRT_CONFIG_HPP
#define REAL_ESRGAN_TRT_CONFIG_HPP

#include <string>

// Names of the network's input and output tensors.  The pointers themselves
// are const and have internal linkage, so this header can be included from
// several translation units without violating the one-definition rule.
static const char* const INPUT_BLOB_NAME = "input_0";
static const char* const OUTPUT_BLOB_NAME = "output_0";

// Build the engine with the FP16 builder flag when true.
const bool USE_FP16 = false;

// Fixed network input shape (N, C, H, W) and the upscaling factor.
static const int BATCH_SIZE = 1;
static const int INPUT_C = 3;
static const int INPUT_H = 450;
static const int INPUT_W = 300;
static const int OUT_SCALE = 4;

// Number of floats in ONE output sample: INPUT_C x (INPUT_H * OUT_SCALE) x (INPUT_W * OUT_SCALE).
// This equals the former literal 48 * 450 * 300.  Users of this constant
// multiply by BATCH_SIZE themselves, so the batch size is deliberately not
// folded in here.
static const int OUTPUT_SIZE = INPUT_C * (INPUT_H * OUT_SCALE) * (INPUT_W * OUT_SCALE);
#endif  //REAL_ESRGAN_TRT_CONFIG_HPP


================================================
FILE: real-esrgan/general-x4v3/src/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>
#include <stdint.h>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

// CUDA_CHECK(call) evaluates a CUDA runtime call once.  When the call does not
// return cudaSuccess it prints the numeric error code, its description and the
// source location to std::cerr and terminates the program: through assert(0)
// in debug builds and through std::abort() when NDEBUG disables assertions, so
// a failed call is never silently ignored.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)                                                                         \
    {                                                                                               \
        cudaError_t error_code = (callstr);                                                         \
        if (error_code != cudaSuccess) {                                                            \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code)      \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                       \
            assert(0);                                                                              \
            std::abort();                                                                           \
        }                                                                                           \
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: real-esrgan/general-x4v3/src/include/logging/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"

using Severity = nvinfer1::ILogger::Severity;

//! String buffer that collects a log message and, when flushed, writes it to
//! the wrapped output stream as "[MM/DD/YYYY-HH:MM:SS] <prefix><message>".
//! Nothing is written while logging is disabled for the buffer.
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted messages
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog whether flushed messages are written at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Copies the target stream, prefix and logging flag of \p other.  All three
    //! members are initialised here; leaving mShouldLog uninitialised would make
    //! every later read of it undefined behaviour.  Pending text of \p other is
    //! not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the output stream, then
    //! empties the buffer and flushes the stream.  Does nothing (and keeps the
    //! buffered text) while logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp; it goes to the same stream as the message so that
            // messages routed to std::cerr are not split across two streams
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            // zero-pad the date/time fields and restore the caller's fill character afterwards
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            mOutput.fill(previousFill);
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables writing of flushed messages.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Helper base whose only job is to own the LogStreamConsumerBuffer, so that the buffer is constructed
//!  before the std::ostream base of LogStreamConsumer
//!
class LogStreamConsumerBase {
   protected:
    LogStreamConsumerBuffer mBuffer;

   public:
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog} {}
};

//!
//! \class LogStreamConsumer
//! \brief Stream-style front end for emitting a log message of one fixed severity.
//!  The base classes must stay in the order LogStreamConsumerBase, std::ostream: the first one constructs the
//!  LogStreamConsumerBuffer member, whose address is then handed to std::ostream. Swapping them would pass the
//!  address of a not yet constructed buffer to std::ostream.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  Messages are emitted only when \p severity <= \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the buffer owned by the first base class
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a fresh consumer with the severity and logging flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the buffer owned by the first base class
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! \brief Selects std::cout for kINFO and above in enum order, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter tag.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //!
    //! \brief Construct a logger with the given reporting threshold (kWARNING unless specified)
    //!
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief States a test can be reported in
    //!
    enum class TestResult {
        kRUNNING,  //!< Test is in progress
        kPASSED,   //!< Test completed and passed
        kFAILED,   //!< Test completed and failed
        kWAIVED    //!< Test was waived
    };

    //!
    //! \brief Access this object through its nvinfer1::ILogger interface
    //! \return A reference to this Logger, typed as nvinfer1::ILogger
    //!
    //! TODO Once every sample registers its logger through this accessor, the inheritance
    //! from ILogger can be dropped.
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief nvinfer1::ILogger::log() override: forwards the message, tagged "[TRT] ", to a LogStreamConsumer
    //!
    //! Samples are not expected to call this directly; it is slated for removal together with
    //! the inheritance from nvinfer1::ILogger.
    //!
    void log(Severity severity, const char* msg) noexcept override {
        LogStreamConsumer sink(mReportableSeverity, severity);
        sink << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Change the verbosity threshold
    //!
    //! \param severity Only messages of this severity level or higher will be emitted.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque per-test record used when printing test results
    //!
    //! Instances can only be created by Logger (the constructor is private and Logger is a
    //! friend); obtain one from Logger::defineTest() and pass it to
    //! Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;         // set once reportTestStart() has been called for this atom
        std::string mName;     // test name printed in result lines
        std::string mCmdline;  // command line printed in result lines
    };

    //!
    //! \brief Create the handle for a test
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom for use with Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() that builds the command line from (argc, argv)
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a not-yet-started TestAtom for use with Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Print the RUNNING line for a test and mark its handle as started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // The result line is emitted before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Print the final result line for a test.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The outcome: one of TestResult::kPASSED, TestResult::kFAILED,
    //!                   TestResult::kWAIVED (kRUNNING is rejected by assertion)
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Report the test as passed. \return EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Report the test as failed. \return EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Report the test as waived. \return EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Report pass or fail depending on \p pass and return the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        if (pass) {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Current verbosity threshold.
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief Tag ("[F] ", "[E] ", ...) used to prefix a log message of the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    //!
    //! \brief Word printed in a test result line for the given result
    //!
    static const char* testResultString(TestResult result) {
        const char* word = "";
        switch (result) {
            case TestResult::kRUNNING:
                word = "RUNNING";
                break;
            case TestResult::kPASSED:
                word = "PASSED";
                break;
            case TestResult::kFAILED:
                word = "FAILED";
                break;
            case TestResult::kWAIVED:
                word = "WAIVED";
                break;
            default:
                assert(0);
                break;
        }
        return word;
    }

    //!
    //! \brief Stream for a severity: std::cout when severity >= kINFO, std::cerr otherwise
    //!
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief Emit one "&&&& <RESULT> <name> # <cmdline>" line on the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief Join the argv entries with single spaces into one command-line string
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream joined;
        const char* separator = "";
        for (int idx = 0; idx < argc; ++idx) {
            joined << separator << argv[idx];
            separator = " ";
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief Create a LogStreamConsumer that logs at severity kVERBOSE, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Create a LogStreamConsumer that logs at severity kINFO, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Create a LogStreamConsumer that logs at severity kWARNING, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Create a LogStreamConsumer that logs at severity kERROR, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Create a LogStreamConsumer that logs at severity kINTERNAL_ERROR ("fatal" severity),
//!        filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: real-esrgan/general-x4v3/src/include/pixel_shuffle/pixel_shuffle.hpp
================================================
#ifndef REAL_ESRGAN_TRT_PIXEL_SHUFFLE_HPP
#define REAL_ESRGAN_TRT_PIXEL_SHUFFLE_HPP

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>
#include "NvInfer.h"

class PixelShufflePlugin : public nvinfer1::IPluginV2DynamicExt {
   public:
    PixelShufflePlugin(int upscaleFactor) : mUpscaleFactor(upscaleFactor) {}

    PixelShufflePlugin(const void* data, size_t length) { memcpy(&mUpscaleFactor, data, sizeof(mUpscaleFactor)); }

    const char* getPluginType() const noexcept override { return "PixelShufflePlugin"; }

    const char* getPluginVersion() const noexcept override { return "1"; }

    int getNbOutputs() const noexcept override { return 1; }

    // nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept override
    // {
    //     assert(outputIndex == 0);
    //     auto* in = &inputs[0];
    //     nvinfer1::DimsExprs outputDims = *in;
    //     int channels = in->d[0];
    //     int height = in->d[1];
    //     int width = in->d[2];
    //     int upscaleFactor = mUpscaleFactor;
    //     outputDims.d[0] = exprBuilder.constant(channels / (upscaleFactor * upscaleFactor));
    //     outputDims.d[1] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, {height, exprBuilder.constant(upscaleFactor)});
    //     outputDims.d[2] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, {width, exprBuilder.constant(upscaleFactor)});
    //     return outputDims;
    // }
    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override {
        // assert(nbInputs == 1);
        auto inDims = inputs[0];
        // assert(inDims.nbDims == 4);
        int c = inDims.d[1]->getConstantValue() / (mUpscaleFactor * mUpscaleFactor);
        int h = inDims.d[2]->getConstantValue() * mUpscaleFactor;
        int w = inDims.d[3]->getConstantValue() * mUpscaleFactor;
        nvinfer1::DimsExprs outDims;
        outDims.nbDims = 4;
        outDims.d[0] = inDims.d[0];
        outDims.d[1] = exprBuilder.constant(c);
        outDims.d[2] = exprBuilder.constant(h);
        outDims.d[3] = exprBuilder.constant(w);
        return outDims;
    }

    bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) noexcept override {
        return inOut[pos].format == nvinfer1::TensorFormat::kLINEAR && inOut[pos].type == nvinfer1::DataType::kFLOAT;
    }

    nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                         int nbInputs) const noexcept override {

        return inputTypes[0];
    }

    // bool canBroadcastInputAcrossBatch(int inputIndex) const noexcept override
    // {
    //     return false;
    // }

    void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
                         const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) noexcept override {}

    // void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept override
    // {
    //     // Optionally configure plugin if necessary
    // }

    size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
                            const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const noexcept override {
        return 0;
    }

    size_t getSerializationSize() const noexcept override { return sizeof(mUpscaleFactor); }

    void serialize(void* buffer) const noexcept override { memcpy(buffer, &mUpscaleFactor, sizeof(mUpscaleFactor)); }

    void destroy() noexcept override {
        // delete this;
    }

    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override { return new PixelShufflePlugin(mUpscaleFactor); }

    void setPluginNamespace(const char* pluginNamespace) noexcept override { mNamespace = pluginNamespace; }

    const char* getPluginNamespace() const noexcept override { return mNamespace.c_str(); }

    int initialize() noexcept override { return 0; }

    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
                    void const* const* inputs, void* const* outputs, void* workspace,
                    cudaStream_t stream) noexcept override;

    void terminate() noexcept override {}

   private:
    int mUpscaleFactor;
    std::string mNamespace;
};

//! \brief Plugin creator for PixelShufflePlugin ("PixelShufflePlugin", version "1").
//!
//! Advertises a single INT32 field, "upscaleFactor".
class PixelShufflePluginCreator : public nvinfer1::IPluginCreator {
   public:
    //! \brief (Re)populate the shared field list and point the shared field collection at it.
    PixelShufflePluginCreator() {
        mPluginAttributes.clear();
        mPluginAttributes.emplace_back(
                nvinfer1::PluginField("upscaleFactor", nullptr, nvinfer1::PluginFieldType::kINT32, 1));
        mFC.nbFields = static_cast<int32_t>(mPluginAttributes.size());
        mFC.fields = mPluginAttributes.data();
    }

    ~PixelShufflePluginCreator() override = default;

    const char* getPluginName() const noexcept override { return "PixelShufflePlugin"; }

    const char* getPluginVersion() const noexcept override { return "1"; }

    const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { return &mFC; }

    //! \brief Build a plugin from a field collection.
    //!
    //! \param name Unused.
    //! \param fc   Fields supplied by the caller; the "upscaleFactor" entry is read as an int.
    //!
    //! \return A heap-allocated plugin carrying this creator's namespace, or nullptr when \p fc
    //!         is missing or does not provide a positive "upscaleFactor" (a non-positive factor
    //!         would make the plugin divide by zero).
    nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) noexcept override {
        int upscaleFactor = 0;
        if (fc != nullptr && fc->fields != nullptr) {
            for (int i = 0; i < fc->nbFields; ++i) {
                const nvinfer1::PluginField& field = fc->fields[i];
                // Skip entries without a name or payload instead of dereferencing null.
                if (field.name == nullptr || field.data == nullptr) {
                    continue;
                }
                if (strcmp(field.name, "upscaleFactor") == 0) {
                    upscaleFactor = *static_cast<const int*>(field.data);
                }
            }
        }
        if (upscaleFactor <= 0) {
            return nullptr;
        }
        auto* plugin = new PixelShufflePlugin(upscaleFactor);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

    //! \brief Rebuild a plugin from serialized bytes and give it this creator's namespace.
    nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData,
                                           size_t serialLength) noexcept override {
        auto* plugin = new PixelShufflePlugin(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

    //! \brief Store the namespace; a null pointer is treated as the empty namespace.
    void setPluginNamespace(const char* pluginNamespace) noexcept override {
        mNamespace = (pluginNamespace != nullptr) ? pluginNamespace : "";
    }

    const char* getPluginNamespace() const noexcept override { return mNamespace.c_str(); }

   private:
    static nvinfer1::PluginFieldCollection mFC;                   // shared by all creator instances
    static std::vector<nvinfer1::PluginField> mPluginAttributes;  // backing storage for mFC.fields
    std::string mNamespace;
};

// Out-of-class definitions of the creator's static members. mFC starts out empty and is filled
// in by the PixelShufflePluginCreator constructor; mPluginAttributes is seeded with the single
// "upscaleFactor" INT32 field (the constructor clears and re-adds it).
// NOTE(review): these are non-inline definitions inside a header, so every translation unit
// that includes this header defines them again -- confirm it is only included from one.
nvinfer1::PluginFieldCollection PixelShufflePluginCreator::mFC{};
std::vector<nvinfer1::PluginField> PixelShufflePluginCreator::mPluginAttributes{
        nvinfer1::PluginField{"upscaleFactor", nullptr, nvinfer1::PluginFieldType::kINT32, 1}};

// Placed after the static member definitions above, which the creator's constructor uses.
// NOTE(review): presumably registers the creator with TensorRT's plugin registry during static
// initialization; the macro is defined outside this file -- confirm.
REGISTER_TENSORRT_PLUGIN(PixelShufflePluginCreator);

#endif  //REAL_ESRGAN_TRT_PIXEL_SHUFFLE_HPP


================================================
FILE: real-esrgan/general-x4v3/src/include/preprocess/preprocess.hpp
================================================
#ifndef REAL_ESRGAN_TRT_PREPROCESS_HPP
#define REAL_ESRGAN_TRT_PREPROCESS_HPP

// Plain aggregate of four integer extents.
// NOTE(review): the names suggest batch (N), channels (C), height (H) and width (W) -- confirm
// against the code that fills this struct.
struct PreprocessStruct {
    int N, C, H, W;
};

#endif  //REAL_ESRGAN_TRT_PREPROCESS_HPP


================================================
FILE: real-esrgan/general-x4v3/src/pixel_shuffle/pixel_shuffle.cpp
================================================
// PixelShufflePlugin.cpp
//
// #include "pixel_shuffle/pixel_shuffle.hpp"
// #include <cstring>
// #include <cassert>
//
// PixelShufflePlugin::PixelShufflePlugin(int upscaleFactor)
//         : mUpscaleFactor(upscaleFactor) {
//     // Initialize other members
// }
//
// PixelShufflePlugin::PixelShufflePlugin(const void* data, size_t length) {
//     // Deserialize data to initialize members
//     const char* d = static_cast<const char*>(data);
//     mUpscaleFactor = *reinterpret_cast<const int*>(d);
//     d += sizeof(int);
//     mInputVolume = *reinterpret_cast<const size_t*>(d);
//     d += sizeof(size_t);
//     mOutputVolume = *reinterpret_cast<const size_t*>(d);
// }
//
// int PixelShufflePlugin::getNbOutputs() const {
//     return 1;
// }
//
// nvinfer1::Dims PixelShufflePlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) {
//     assert(index == 0);
//     assert(nbInputDims == 1);
//     int c = inputs[0].d[0];
//     int h = inputs[0].d[1];
//     int w = inputs[0].d[2];
//     int upscaleFactor = mUpscaleFactor;
//
//     assert(c % (upscaleFactor * upscaleFactor) == 0);
//     int newC = c / (upscaleFactor * upscaleFactor);
//     int newH = h * upscaleFactor;
//     int newW = w * upscaleFactor;
//
//     return nvinfer1::Dims3(newC, newH, newW);
// }
//
// int PixelShufflePlugin::initialize() {
//     return 0;
// }
//
// void PixelShufflePlugin::terminate() {
//     // Clean up
// }
//
// size_t PixelShufflePlugin::getWorkspaceSize(int maxBatchSize) const {
//     return 0;
// }
//
// int PixelShufflePlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) {
//     // Launch CUDA kernel for PixelShuffle
//     // Assume inputs[0] and outputs[0] are pointers to device memory
//     const float* input = static_cast<const float*>(inputs[0]);
//     float* output = static_cast<float*>(outputs[0]);
//
//     int c = mInputVolume / (mUpscaleFactor * mUpscaleFactor);
//     int h = mOutputVolume / (c * mUpscaleFactor);
//     int w = h; // Assuming square input for simplicity
//     int upscaleFactor = mUpscaleFactor;
//
//     // Launch CUDA kernel (to be implemented)
//     // pixelShuffleKernel(input, output, c, h, w, upscaleFactor, stream);
//
//     return 0;
// }
//
// size_t PixelShufflePlugin::getSerializationSize() const {
//     return sizeof(int) + sizeof(size_t) * 2;
// }
//
// void PixelShufflePlugin::serialize(void* buffer) const {
//     char* d = static_cast<char*>(buffer);
//     *reinterpret_cast<int*>(d) = mUpscaleFactor;
//     d += sizeof(int);
//     *reinterpret_cast<size_t*>(d) = mInputVolume;
//     d += sizeof(size_t);
//     *reinterpret_cast<size_t*>(d) = mOutputVolume;
// }
//
// void PixelShufflePlugin::destroy() {
//     delete this;
// }
//
// const char* PixelShufflePlugin::getPluginType() const {
//     return "PixelShufflePlugin";
// }
//
// const char* PixelShufflePlugin::getPluginVersion() const {
//     return "1";
// }
//
// void PixelShufflePlugin::setPluginNamespace(const char* pluginNamespace) {
//     mPluginNamespace = pluginNamespace;
// }
//
// const char* PixelShufflePlugin::getPluginNamespace() const {
//     return mPluginNamespace;
// }
//
// nvinfer1::IPluginV2IOExt* PixelShufflePlugin::clone() const {
//     return new PixelShufflePlugin(mUpscaleFactor);
// }
//
// bool PixelShufflePlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const {
//     return inOut[pos].format == nvinfer1::TensorFormat::kLINEAR && inOut[pos].type == nvinfer1::DataType::kFLOAT;
// }
//
// void PixelShufflePlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {
//     // Configure the plugin based on the input and output descriptions
//     mInputVolume = in[0].desc.volume();
//     mOutputVolume = out[0].desc.volume();
// }
//
// nvinfer1::DataType PixelShufflePlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const {
//     return inputTypes[0];
// }
//
// bool PixelShufflePlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const {
//     return false;
// }
//
// bool PixelShufflePlugin::canBroadcastInputAcrossBatch(int inputIndex) const {
//     return false;
// }


================================================
FILE: real-esrgan/general-x4v3/src/pixel_shuffle/pixel_shuffle.cu
================================================
#include <cuda_runtime.h>
#include <string>
#include "pixel_shuffle/pixel_shuffle.hpp"

// PixelShuffle kernel: each thread produces one output element.
//
// The global thread index (x dimension only) is decoded as a flat (b, out_c, out_h, out_w)
// index into an output of shape (batchSize, channels / r^2, height * r, width * r), with
// r = upscaleFactor. The value is read from the input, indexed as
// (batchSize, channels, height, width), at
//   in_c = out_c * r * r + (out_h % r) * r + (out_w % r), in_h = out_h / r, in_w = out_w / r.
// Threads whose index is past the last output element do nothing.
__global__ void PixelShuffleKernel(const float* input, float* output, int batchSize, int channels, int height,
                                   int width, int upscaleFactor) {
    const int r = upscaleFactor;
    const int outH = height * r;
    const int outW = width * r;
    const int outC = channels / (r * r);

    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < batchSize * outC * outH * outW) {
        // Decode the flat output index.
        const int ow = tid % outW;
        const int oh = (tid / outW) % outH;
        const int oc = (tid / outW / outH) % outC;
        const int n = tid / (outW * outH * outC);

        // Locate the matching input element.
        const int srcC = oc * r * r + (oh % r) * r + (ow % r);
        const int srcH = oh / r;
        const int srcW = ow / r;
        const int srcIdx = ((n * channels + srcC) * height + srcH) * width + srcW;

        output[tid] = input[srcIdx];
    }
}

// Launches PixelShuffleKernel for the tensors described by inputDesc[0].
//
// inputDesc[0].dims is read as (batch, channels, height, width); inputs[0] / outputs[0] are
// used as float buffers. The kernel is enqueued on `stream`, the stream supplied by the caller,
// so that it is ordered with the rest of the work on that stream.
//
// Returns 0 on success (including the case where there is nothing to compute) and 1 when the
// upscale factor is not positive or the launch reported an error.
int32_t PixelShufflePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc,
                                    nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs,
                                    void* const* outputs, void* workspace, cudaStream_t stream) noexcept {
    const float* input = static_cast<const float*>(inputs[0]);
    float* output = static_cast<float*>(outputs[0]);

    int batchSize = inputDesc[0].dims.d[0];
    int channels = inputDesc[0].dims.d[1];
    int height = inputDesc[0].dims.d[2];
    int width = inputDesc[0].dims.d[3];
    int upscaleFactor = mUpscaleFactor;

    // A non-positive factor would divide by zero below (and inside the kernel).
    if (upscaleFactor <= 0) {
        return 1;
    }

    int outChannels = channels / (upscaleFactor * upscaleFactor);
    int outHeight = height * upscaleFactor;
    int outWidth = width * upscaleFactor;

    int numElements = batchSize * outChannels * outHeight * outWidth;
    // A grid with zero blocks is an invalid launch configuration; an empty tensor needs no work.
    if (numElements <= 0) {
        return 0;
    }

    const int threadsPerBlock = 256;
    const int numBlocks = (numElements + threadsPerBlock - 1) / threadsPerBlock;

    // Fourth launch parameter: run on the caller's stream rather than the default stream.
    PixelShuffleKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(input, output, batchSize, channels, height, width,
                                                                  upscaleFactor);
    return cudaGetLastError() != cudaSuccess;
}


================================================
FILE: real-esrgan/x4plus/CMakeLists.txt
================================================
# Build script for the Real-ESRGAN x4plus sample: a shared CUDA library (myplugins) with the
# pre/post-processing kernels, and the real-esrgan executable that links against it.

# NOTE(review): recent CMake releases reject minimum versions this old; raising it changes
# policy defaults, so it is left as-is here -- confirm which CMake versions must be supported.
cmake_minimum_required(VERSION 2.6)

project(real-esrgan)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
# option() is (<variable> "<help text>" [value]); without a help string the OFF was being
# parsed as the help text. The default value stays OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

if(WIN32)
  enable_language(CUDA)
endif(WIN32)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -g -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
cuda_add_library(myplugins SHARED preprocess.cu postprocess.cu)
target_link_libraries(myplugins nvinfer cudart)

# The executable links against ${OpenCV_LIBS}, so fail at configure time when OpenCV is
# missing instead of later during compilation or linking.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

cuda_add_executable(real-esrgan real-esrgan.cpp)

target_link_libraries(real-esrgan nvinfer)
target_link_libraries(real-esrgan cudart)
target_link_libraries(real-esrgan myplugins)
target_link_libraries(real-esrgan ${OpenCV_LIBS})

if(UNIX)
  add_definitions(-O2 -pthread)
endif(UNIX)


================================================
FILE: real-esrgan/x4plus/README.md
================================================
# Real-ESRGAN
The PyTorch implementation is [real-esrgan](https://github.com/xinntao/Real-ESRGAN).

<p align="center">
<img src="https://user-images.githubusercontent.com/40158321/170728105-0a1429e8-d117-4844-9c4b-a2d9db4a4ada.png">
</p>

## Config
- Input shape (**INPUT_H**, **INPUT_W**, **INPUT_C**) is defined in real-esrgan.cpp
- GPU id (**DEVICE**) can be selected by the macro in real-esrgan.cpp
- **BATCH_SIZE** can be selected by the macro in real-esrgan.cpp
- FP16/FP32 can be selected by **PRECISION_MODE** in real-esrgan.cpp
- The example result can be visualized by setting **VISUALIZATION**.

## How to Run, real-esrgan as example

0. prepare test image  
- download : [OST_009.png](https://drive.google.com/file/d/1KAyAiQ8qHc5jSBkk2Uft2LfIhzi9XSyH/view?usp=sharing)   

```
cd {tensorrtx}/real-esrgan/
mkdir samples
cp ~/Download/OST_009.png {tensorrtx}/real-esrgan/samples
```

1. generate .wts from the PyTorch .pth checkpoint, or download .wts from the model zoo

```
git clone https://github.com/xinntao/Real-ESRGAN.git
cd Real-ESRGAN
pip install basicsr
pip install facexlib
pip install gfpgan
pip install -r requirements.txt
python setup.py develop

// download https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth
cp ~/RealESRGAN_x4plus.pth {xinntao}/Real-ESRGAN/experiments/pretrained_models

cp {tensorrtx}/real-esrgan/gen_wts.py {xinntao}/Real-ESRGAN
cd {xinntao}/Real-ESRGAN
python gen_wts.py
// a file 'real-esrgan.wts' will be generated.
```

2. build tensorrtx/real-esrgan and run

```
cd {tensorrtx}/real-esrgan/
mkdir build
cd build
cp {xinntao}/Real-ESRGAN/real-esrgan.wts {tensorrtx}/real-esrgan/build
cmake ..
make
sudo ./real-esrgan -s [.wts] [.engine]   // serialize model to plan file
sudo ./real-esrgan -d [.engine] [image folder]  // deserialize and run inference, the images in [image folder] will be processed.
// For example
// sudo ./real-esrgan -s ./real-esrgan.wts ./real-esrgan_f32.engine
// sudo ./real-esrgan -d ./real-esrgan_f32.engine ../samples

```

3. check the generated images, for example `_OST_009.png`


================================================
FILE: real-esrgan/x4plus/common.hpp
================================================
#ifndef REAL_ESRGAN_COMMON_H_
#define REAL_ESRGAN_COMMON_H_

#include <fstream>
#include <map>
#include <sstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"

using namespace nvinfer1;

// TensorRT weight files have a simple whitespace-delimited format:
//   <number of blobs>
//   <name> <size (decimal)> <size values, each a 32-bit word in hex>    (repeated per blob)
//
// Reads the file at `file` and returns a map from blob name to a Weights entry of type kFLOAT
// whose `values` points at a malloc'ed buffer of `size` 32-bit words. This function never
// frees those buffers; releasing them is left to the caller.
//
// Failure to open the file, a non-positive blob count, or a failed allocation is reported
// through assert(), so these checks disappear when assertions are compiled out.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. Size the buffer by the element type: sizeof(val) would be the size of the
        // pointer, which over-allocates on platforms where pointers are wider than 32 bits.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((size == 0 || val != nullptr) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds one residual dense block to `network` and returns its output tensor.
//
// Four 3x3 convolutions (32 output maps, stride 1, padding 1), each followed by a leaky ReLU
// with alpha 0.2, are chained so that every stage consumes the concatenation (axis 0) of the
// block input and all earlier stage outputs. A fifth 3x3 convolution (64 output maps, no
// activation) consumes the concatenation of all five tensors. Its output is multiplied by 0.2
// through a uniform scale layer and added element-wise to the block input `x`.
//
// Weights are looked up in `weightMap` under "<lname>.conv<k>.weight" / "<lname>.conv<k>.bias".
// The three one-element scale/shift/power buffers are malloc'ed here and not freed here.
ITensor* residualDenseBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor* x, std::string lname)
{
    const DimsHW kernel{ 3, 3 };
    const DimsHW unit{ 1, 1 };

    // dense[0] is the block input; the activated output of each stage is appended after it.
    std::vector<ITensor*> dense;
    dense.reserve(5);
    dense.push_back(x);

    ITensor* stageInput = x;
    for (int stage = 1; stage <= 4; ++stage)
    {
        const std::string key = lname + ".conv" + std::to_string(stage);
        IConvolutionLayer* conv = network->addConvolutionNd(*stageInput, 32, kernel, weightMap[key + ".weight"], weightMap[key + ".bias"]);
        conv->setStrideNd(unit);
        conv->setPaddingNd(unit);
        IActivationLayer* lrelu = network->addActivation(*conv->getOutput(0), ActivationType::kLEAKY_RELU);
        lrelu->setAlpha(0.2);
        dense.push_back(lrelu->getOutput(0));

        // Input of the next convolution: everything produced so far, joined along axis 0.
        IConcatenationLayer* joined = network->addConcatenation(dense.data(), static_cast<int>(dense.size()));
        joined->setAxis(0);
        stageInput = joined->getOutput(0);
    }

    IConvolutionLayer* fuse = network->addConvolutionNd(*stageInput, 64, kernel, weightMap[lname + ".conv5.weight"], weightMap[lname + ".conv5.bias"]);
    fuse->setStrideNd(unit);
    fuse->setPaddingNd(unit);
    ITensor* fused = fuse->getOutput(0);

    // One-element FP32 weight backed by a malloc'ed float.
    auto scalar = [](float value) -> Weights {
        float* slot = reinterpret_cast<float*>(malloc(sizeof(float)));
        *slot = value;
        return Weights{ DataType::kFLOAT, slot, 1 };
    };
    const Weights scale = scalar(0.2);
    const Weights shift = scalar(0.0);
    const Weights power = scalar(1.0);

    IScaleLayer* damped = network->addScale(*fused, ScaleMode::kUNIFORM, shift, scale, power);
    IElementWiseLayer* residual = network->addElementWise(*damped->getOutput(0), *x, ElementWiseOperation::kSUM);
    return residual->getOutput(0);
}

// Adds a residual-in-residual dense block to `network` and returns its output tensor.
//
// Three residualDenseBlock stages ("<lname>.rdb1" .. "<lname>.rdb3") are applied in sequence
// starting from `x`; the result is multiplied by 0.2 through a uniform scale layer and added
// element-wise to `x`. The three one-element scale/shift/power buffers are malloc'ed here and
// not freed here.
ITensor* RRDB(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor* x, std::string lname)
{
    ITensor* trunk = x;
    for (int block = 1; block <= 3; ++block)
    {
        trunk = residualDenseBlock(network, weightMap, trunk, lname + ".rdb" + std::to_string(block));
    }

    // One-element FP32 weight backed by a malloc'ed float.
    auto scalar = [](float value) -> Weights {
        float* slot = reinterpret_cast<float*>(malloc(sizeof(float)));
        *slot = value;
        return Weights{ DataType::kFLOAT, slot, 1 };
    };
    const Weights scale = scalar(0.2);
    const Weights shift = scalar(0.0);
    const Weights power = scalar(1.0);

    IScaleLayer* damped = network->addScale(*trunk, ScaleMode::kUNIFORM, shift, scale, power);
    IElementWiseLayer* residual = network->addElementWise(*damped->getOutput(0), *x, ElementWiseOperation::kSUM);
    return residual->getOutput(0);
}


#endif

================================================
FILE: real-esrgan/x4plus/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>
#include <stdint.h>
#include <cstdio>
#include <vector>
#include <iostream>

#ifndef CUDA_CHECK
// CUDA_CHECK(call): evaluates the CUDA runtime call once and, when the result is not
// cudaSuccess, prints the numeric error code, its description from cudaGetErrorString(),
// and the source location to std::cerr on a line of its own, then trips assert(0).
//
// Notes:
//  - assert() must be visible where the macro is expanded (include <cassert> there);
//    when NDEBUG is defined the assert is compiled out and execution continues after
//    the message has been printed.
//  - The macro expands to a braced block, so it can be used with or without a trailing ';'.
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = (callstr);\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) << ") at "\
                      << __FILE__ << ":" << __LINE__ << std::endl;\
            assert(0);\
        }\
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: real-esrgan/x4plus/gen_wts.py
================================================
import argparse
import os
import struct
from basicsr.archs.rrdbnet_arch import RRDBNet
from realesrgan import RealESRGANer
from realesrgan.archs.srvgg_arch import SRVGGNetCompact

def main():
    """Export the weights of a Real-ESRGAN model to ``real-esrgan.wts``.

    The model architecture is chosen from ``--model_name``, the matching ``.pth``
    checkpoint is looked up under ``experiments/pretrained_models`` and then
    ``realesrgan/weights``, and it is loaded through ``RealESRGANer``. Every entry of
    the resulting ``state_dict`` is written as one text line:
    ``<key> <count> <hex> <hex> ...`` where each value is a big-endian float32 in hex.
    The first line of the file holds the number of entries.

    Nothing is written when ``real-esrgan.wts`` already exists.

    Raises:
        ValueError: if the model name is not recognised or its checkpoint is not found.
    """
    parser = argparse.ArgumentParser()
    # Several options below are not used by the export itself; they are kept so that
    # existing command lines keep working.
    parser.add_argument('-i', '--input', type=str, default='inputs', help='Input image or folder')
    parser.add_argument(
        '-n',
        '--model_name',
        type=str,
        default='RealESRGAN_x4plus',
        help=('Model names: RealESRGAN_x4plus | RealESRNet_x4plus | RealESRGAN_x4plus_anime_6B | RealESRGAN_x2plus | '
              'realesr-animevideov3'))
    parser.add_argument('-o', '--output', type=str, default='results', help='Output folder')
    parser.add_argument('-s', '--outscale', type=float, default=4, help='The final upsampling scale of the image')
    parser.add_argument('--suffix', type=str, default='out', help='Suffix of the restored image')
    parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
    parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
    parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
    parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
    parser.add_argument(
        '--fp32',
        action='store_true',
        help='Accepted for command-line compatibility. Weights are always exported in fp32.')
    parser.add_argument(
        '--alpha_upsampler',
        type=str,
        default='realesrgan',
        help='The upsampler for the alpha channels. Options: realesrgan | bicubic')
    parser.add_argument(
        '--ext',
        type=str,
        default='auto',
        help='Image extension. Options: auto | jpg | png, auto means using the same extension as inputs')
    args = parser.parse_args()

    # determine models according to model names
    args.model_name = args.model_name.split('.')[0]
    if args.model_name in ['RealESRGAN_x4plus', 'RealESRNet_x4plus']:  # x4 RRDBNet model
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
        netscale = 4
    elif args.model_name in ['RealESRGAN_x4plus_anime_6B']:  # x4 RRDBNet model with 6 blocks
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
        netscale = 4
    elif args.model_name in ['RealESRGAN_x2plus']:  # x2 RRDBNet model
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
        netscale = 2
    elif args.model_name in ['realesr-animevideov3']:  # x4 VGG-style model (XS size)
        model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu')
        netscale = 4
    else:
        # Without this branch an unknown name would only fail later with an unbound local.
        raise ValueError(f'Unknown model name: {args.model_name}')

    # determine model paths
    model_path = os.path.join('experiments/pretrained_models', args.model_name + '.pth')
    if not os.path.isfile(model_path):
        model_path = os.path.join('realesrgan/weights', args.model_name + '.pth')
    if not os.path.isfile(model_path):
        raise ValueError(f'Model {args.model_name} does not exist.')

    # restorer
    # half=False: keep the loaded weights in full precision regardless of --fp32.
    # Previously the flag was passed straight through as `half`, so asking for fp32
    # was the one case that requested half precision.
    upsampler = RealESRGANer(
        scale=netscale,
        model_path=model_path,
        model=model,
        tile=args.tile,
        tile_pad=args.tile_pad,
        pre_pad=args.pre_pad,
        half=False)

    if os.path.isfile('real-esrgan.wts'):
        print('Already, real-esrgan.wts file exists.')
    else:
        print('making real-esrgan.wts file ...')
        state_dict = upsampler.model.state_dict()
        # The context manager guarantees the file is flushed and closed, even on error.
        with open("real-esrgan.wts", 'w') as f:
            f.write("{}\n".format(len(state_dict)))
            for k, v in state_dict.items():
                print('key: ', k)
                print('value: ', v.shape)
                vr = v.reshape(-1).cpu().numpy()
                f.write("{} {}".format(k, len(vr)))
                for vv in vr:
                    f.write(" ")
                    # big-endian float32, hex encoded
                    f.write(struct.pack(">f", float(vv)).hex())
                f.write("\n")
        print('Completed real-esrgan.wts file!')

if __name__ == '__main__':
    main()


================================================
FILE: real-esrgan/x4plus/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//! Stream buffer that collects formatted text and, on every sync (flush), forwards it to
//! a target std::ostream preceded by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix.
//! When logging is disabled the collected text is discarded instead.
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    destination stream for emitted messages
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, flushed text is dropped instead of written
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Creates a buffer with the same destination, prefix and enabled state as \p other.
    //! Text still pending in \p other is not transferred; it stays with \p other.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if they differ there is unflushed text left, so emit it before the buffer goes away
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    //! Called when the owning stream is flushed: emits the pending text.
    //! \return always 0 (success)
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes timestamp + prefix + pending text to the destination stream (when logging is
    //! enabled) and empties the buffer.
    void putOutput()
    {
        if (mShouldLog)
        {
            // The timestamp is formatted in a local stream so that the width/fill
            // manipulators do not leak into the destination (or any global) stream,
            // and so that timestamp and message always go to the same destination.
            std::ostringstream stamp;
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                stamp << "[";
                stamp << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                stamp << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            mOutput << stamp.str() << mPrefix << str();
            // flush the stream
            mOutput.flush();
        }
        // Always empty the buffer: text flushed while logging is disabled must not
        // resurface in a later message once logging is enabled again.
        str("");
    }

    //! Enables or disables forwarding of flushed text.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Helper base that owns the LogStreamConsumerBuffer, so the buffer is constructed
//!        before the std::ostream base of LogStreamConsumer is handed its address.
//!
class LogStreamConsumerBase
{
protected:
    //! Buffer shared with the derived stream class.
    LogStreamConsumerBuffer mBuffer;

public:
    //! Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for one severity level, allowing `<<` syntax for log messages.
//!  The base class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  LogStreamConsumerBase constructs the LogStreamConsumerBuffer member, and only afterwards
//!  is the address of that buffer passed to std::ostream. Swapping the bases would hand
//!  std::ostream the address of a buffer that has not been constructed yet.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a stream that logs at level \p severity.
    //!  Output is enabled only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the stream to the buffer built by the base
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the same severity and enabled state as \p other.
    //!  A fresh buffer is created from those settings; nothing else is taken from \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the buffer built by the base
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this stream emits output for a new threshold.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! Chooses std::cout for kINFO and above (numerically), std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed one-letter tag; asserts on an unknown value.
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Central logging facility for TensorRT tools and samples
//!
//! \details Gives samples one place through which console output is routed. Two kinds of
//! messages are handled:
//!
//! - Debugging messages carrying a severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! Routing everything through this class, rather than writing to stdout/stderr directly,
//! keeps verbosity control and output formatting in a single location.
//!
//! A possible future extension is writing test results to a file in a standard format
//! (for example, JUnit XML) together with extra metadata such as test duration.
//!
//! TODO: For backwards compatibility with existing samples, this class derives directly from
//! nvinfer1::ILogger. That is not ideal, because messages produced by the TensorRT library and
//! messages produced by the sample are not cleanly separated.
//!
//! Once every sample obtains the ILogger through Logger::getTRTLogger(), the inheritance can be
//! dropped and the nvinfer1::ILogger implementation turned into a member of Logger.

class Logger : public nvinfer1::ILogger
{
public:
    //! \brief Creates a logger that reports messages up to \p severity (kWARNING by default).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief State of a test as reported through this logger
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible accessor for the nvinfer1::ILogger belonging to this Logger
    //! \return This object, viewed as nvinfer1::ILogger
    //!
    //! TODO Once all samples register the logger with TensorRT through this method,
    //! the inheritance of Logger from ILogger can be removed
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return static_cast<nvinfer1::ILogger&>(*this);
    }

    //!
    //! \brief Implements the nvinfer1::ILogger::log() virtual method
    //!
    //! Samples should not call this directly; it is expected to disappear together with the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        LogStreamConsumer stream(mReportableSeverity, severity);
        stream << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Sets the verbosity threshold
    //!
    //! \param severity Messages are emitted only when their severity is at this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle carrying the logging information of one test
    //!
    //! A sample obtains a TestAtom from Logger::defineTest() and then passes it to
    //! Logger::reportTest{Start,End}(). Its contents are only accessible to Logger.
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< set once reportTestStart() has run for this atom
        std::string mName;    //!< test name printed in result lines
        std::string mCmdline; //!< command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a not-yet-started TestAtom usable with Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Convenience overload of defineTest() taking the raw command-line argument array
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom usable with Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // The RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed. \return EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed. \return EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived. \return EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        if (pass)
        {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current verbosity threshold.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns the bracketed tag used to prefix a log message of the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    //!
    //! \brief returns the word printed in a test result line for the given result
    //!
    static const char* testResultString(TestResult result)
    {
        const char* text = "";
        switch (result)
        {
        case TestResult::kRUNNING: text = "RUNNING"; break;
        case TestResult::kPASSED: text = "PASSED"; break;
        case TestResult::kFAILED: text = "FAILED"; break;
        case TestResult::kWAIVED: text = "WAIVED"; break;
        default: assert(0); break;
        }
        return text;
    }

    //!
    //! \brief returns the output stream (cout or cerr) used for the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief prints one "&&&& <RESULT> <name> # <cmdline>" line for the given test
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief joins the (argc, argv) values into one space-separated command line string
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream joined;
        for (int i = 0; i < argc; ++i)
        {
            if (i != 0)
            {
                joined << " ";
            }
            joined << argv[i];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief creates a LogStreamConsumer for messages of severity kVERBOSE
//!
//! The consumer uses the logger's current reportable severity as its threshold.
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: real-esrgan/x4plus/macros.h
================================================
// Portability macros: symbol export/import decoration (API) and TensorRT-version dependent
// qualifiers (TRT_NOEXCEPT, TRT_CONST_ENQUEUE).
#ifndef __MACROS_H
#define __MACROS_H

// API: decoration for exported symbols.
//  - API_EXPORTS defined:     dllexport on MSVC, default visibility elsewhere.
//  - API_EXPORTS not defined: dllimport on MSVC, empty elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` when NV_TENSORRT_MAJOR >= 8
// and to nothing otherwise.
// NOTE(review): NV_TENSORRT_MAJOR is not defined in this header; if no earlier include has
// defined it, the preprocessor treats it as 0 and the empty variants are selected. Presumably
// a TensorRT header is expected to be included before this file - confirm at include sites.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: real-esrgan/x4plus/postprocess.cu
================================================
#include "cuda_utils.h"

using namespace std;

// postprocess (NCHW->NHWC, RGB->BGR, *255, ROUND, uint8)
//
// One thread produces one output byte. `index` is decomposed as an interleaved
// (batch, row, column, channel) position in `output`; the matching element is read
// from the planar `input` with the channel order reversed (2 - c_idx, i.e. this
// assumes channel == 3), scaled by 255, clamped to [0, 255] and rounded to nearest.
//
// output       : destination bytes, thread_count elements
// input        : source floats, planar layout, thread_count elements
// batchSize    : not used for indexing (the batch index is derived from `index`)
// height/width/channel : image geometry
// thread_count : total number of output elements; threads past it do nothing
__global__ void postprocess_kernel(uint8_t* output, float* input,
    const int batchSize, const int height, const int width, const int channel,
    const int thread_count)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index >= thread_count) return;

    const int c_idx = index % channel;
    int idx = index / channel;
    const int w_idx = idx % width;
    idx /= width;
    const int h_idx = idx % height;
    const int b_idx = idx / height;

    int g_idx = b_idx * height * width * channel + (2 - c_idx)* height * width + h_idx * width + w_idx;
    float tt = input[g_idx] * 255.f;
    // Clamp on both sides before converting: a negative (or NaN) value must not reach
    // the float -> uint8_t conversion. fmaxf/fminf return the non-NaN operand, so NaN maps to 0.
    tt = fminf(fmaxf(tt, 0.f), 255.f);
    // Round to nearest instead of truncating, as the header comment promises.
    output[index] = static_cast<uint8_t>(rintf(tt));
}

// Launches postprocess_kernel on `stream` with one thread per output element
// (batchSize * height * width * channel in total), using 512 threads per block.
// The launch is asynchronous; no error checking is done here.
void postprocess(uint8_t* output, float*input, int batchSize, int height, int width, int channel, cudaStream_t stream)
{
    const int threadsPerBlock = 512;
    const int totalElements = batchSize * height * width * channel;
    // ceil(totalElements / threadsPerBlock) for positive counts
    const int blockCount = (totalElements - 1) / threadsPerBlock + 1;

    postprocess_kernel<<<blockCount, threadsPerBlock, 0, stream>>>(
        output, input, batchSize, height, width, channel, totalElements);
}


#include "postprocess.hpp"

namespace nvinfer1
{
    // Runs the post-processing kernel for this plugin: the first input is treated as
    // float data, the first output as uint8_t data, and the image geometry comes from
    // the stored Postprocess parameters. `workspace` is unused. Always returns 0.
    int PostprocessPluginV2::enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept
    {
        float* src = static_cast<float*>(const_cast<void*>(inputs[0]));
        uint8_t* dst = static_cast<uint8_t*>(outputs[0]);

        postprocess(dst, src, batchSize, mPostprocess.H, mPostprocess.W, mPostprocess.C, stream);

        return 0;
    }
}

================================================
FILE: real-esrgan/x4plus/postprocess.hpp
================================================
#pragma once
#include <NvInfer.h>
#include <fstream>
#include "macros.h"
#include <assert.h>

// Parameters of the post-processing plugin.
// The struct is written to and read from the serialized plugin as raw bytes, so its
// field order and types define the serialized layout.
struct Postprocess {
    int N;  // not read by the plugin code in this header; presumably the batch size - TODO confirm
    int C;  // channel count (last output dimension, passed to the kernel as `channel`)
    int H;  // image height (first output dimension)
    int W;  // image width (second output dimension)
};

namespace nvinfer1
{
    class PostprocessPluginV2 : public IPluginV2IOExt
    {
    public:
        PostprocessPluginV2(const Postprocess& arg)
        {
            mPostprocess = arg;
        }

        PostprocessPluginV2(const void* data, size_t length)
        {
            const char* d = static_cast<const char*>(data);
            const char* const a = d;
            mPostprocess = read<Postprocess>(d);
            assert(d == a + length);
        }
        PostprocessPluginV2() = delete;

        virtual ~PostprocessPluginV2() {}

    public:
        int getNbOutputs() const noexcept override
        {
            return 1;
        }

        Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) noexcept override
        {
            return Dims3(mPostprocess.H, mPostprocess.W, mPostprocess.C);
        }

        int initialize() noexcept override
        {
            return 0;
        }

        void terminate() noexcept override
        {
        }

        size_t getWorkspaceSize(int maxBatchSize) const noexcept override
        {
            return 0;
        }

        int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;

        size_t getSerializationSize() const noexcept override
        {
            size_t serializationSize = 0;
            serializationSize += sizeof(mPostprocess);
            return serializationSize;
        }

        void serialize(void* buffer) const noexcept override
        {
            char* d = static_cast<char*>(buffer);
            const char* const a = d;
            write(d, mPostprocess);
            assert(d == a + getSerializationSize());
        }

        void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) noexcept override
        {
        }

        //! The combination of kLINEAR + kINT8/kHALF/kFLOAT is supported.
        bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const noexcept override
        {
            assert(nbInputs == 1 && nbOutputs == 1 && pos < nbInputs + nbOutputs);
            bool condition = inOut[pos].format == TensorFormat::kLINEAR;
            condition &= inOut[pos].type != DataType::kINT32;
            condition &= inOut[pos].type == inOut[0].type;
            return condition;
        }

        DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const noexcept override
        {
            assert(inputTypes && nbInputs == 1);
            return DataType::kFLOAT; //
        }

        const char* getPluginType() const noexcept override
        {
            return "postprocess";
        }

        const char* getPluginVersion() const noexcept override
        {
            return "1";
        }

        void destroy() noexcept override
        {
            delete this;
        }

        IPluginV2Ext* clone() const noexcept override
        {
            PostprocessPluginV2* plugin = new PostprocessPluginV2(*this);
            return plugin;
        }

        void setPluginNamespace(const char* libNamespace) noexcept override
        {
            mNamespace = libNamespace;
        }

        const char* getPluginNamespace() const noexcept override
        {
            return mNamespace.data();
        }

        bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept override
        {
            return false;
        }

        bool canBroadcastInputAcrossBatch(int inputIndex) const noexcept override
        {
            return false;
        }

    private:
        template <typename T>
        void write(char*& buffer, const T& val) const
        {
            *reinterpret_cast<T*>(buffer) = val;
            buffer += sizeof(T);
        }

        template <typename T>
        T read(const char*& buffer) const
        {
            T val = *reinterpret_cast<const T*>(buffer);
            buffer += sizeof(T);
            return val;
        }

    private:
        Postprocess mPostprocess;
        std::string mNamespace;
    };

    //! Factory registered with TensorRT for the "postprocess" plugin, version "1".
    class PostprocessPluginV2Creator : public IPluginCreator
    {
    public:
        const char* getPluginName() const noexcept override
        {
            return "postprocess";
        }

        const char* getPluginVersion() const noexcept override
        {
            return "1";
        }

        //! No plugin fields are advertised.
        const PluginFieldCollection* getFieldNames() noexcept override
        {
            return nullptr;
        }

        //! Creates a plugin and remembers \p name.
        //! The \p fc pointer is reinterpreted as a pointer to a Postprocess struct, so
        //! callers must pass the address of a Postprocess, not a real field collection.
        IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override
        {
            const Postprocess* params = reinterpret_cast<const Postprocess*>(fc);
            PostprocessPluginV2* created = new PostprocessPluginV2(*params);
            mPluginName = name;
            return created;
        }

        //! Rebuilds a plugin from its serialized bytes and remembers \p name.
        IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override
        {
            PostprocessPluginV2* restored = new PostprocessPluginV2(serialData, serialLength);
            mPluginName = name;
            return restored;
        }

        void setPluginNamespace(const char* libNamespace) noexcept override
        {
            mNamespace = libNamespace;
        }

        const char* getPluginNamespace() const noexcept override
        {
            return mNamespace.c_str();
        }

    private:
        std::string mNamespace;
        std::string mPluginName;
    };
    REGISTER_TENSORRT_PLUGIN(PostprocessPluginV2Creator);
};


================================================
FILE: real-esrgan/x4plus/preprocess.cu
================================================
#include "cuda_utils.h"

using namespace std;

// preprocess (NHWC->NCHW, BGR->RGB, [0, 255]->[0, 1](Normalize))
//
// One thread produces one output float. The thread's flat id is decomposed as a planar
// (batch, channel, row, column) position in `output`; the matching byte is read from the
// interleaved `input` with the channel order reversed (2 - c_idx, i.e. this assumes
// channel == 3) and divided by 255. `batchSize` is not used for indexing. Threads whose
// id is >= thread_count do nothing.
__global__ void preprocess_kernel(float* output, uint8_t* input,
    const int batchSize, const int height, const int width, const int channel,
    const int thread_count)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= thread_count)
    {
        return;
    }

    // Peel the planar coordinates off the flat id, innermost dimension first.
    const int w_idx = tid % width;
    const int row = tid / width;
    const int h_idx = row % height;
    const int plane = row / height;
    const int c_idx = plane % channel;
    const int b_idx = plane / channel;

    // Position of the same pixel in the interleaved source, with the channel flipped.
    const int src = ((b_idx * height + h_idx) * width + w_idx) * channel + 2 - c_idx;

    output[tid] = input[src] / 255.f;
}

// Launches preprocess_kernel on `stream` with one thread per output element
// (batchSize * height * width * channel in total), using 512 threads per block.
// The launch is asynchronous; no error checking is done here.
void preprocess(float* output, uint8_t*input, int batchSize, int height, int width, int channel, cudaStream_t stream)
{
    const int threadsPerBlock = 512;
    const int totalElements = batchSize * height * width * channel;
    // ceil(totalElements / threadsPerBlock) for positive counts
    const int blockCount = (totalElements - 1) / threadsPerBlock + 1;

    preprocess_kernel<<<blockCount, threadsPerBlock, 0, stream>>>(
        output, input, batchSize, height, width, channel, totalElements);
}

#include "preprocess.hpp"

namespace nvinfer1
{
    // Runs the pre-processing kernel for this plugin: the first input is treated as
    // uint8_t data, the first output as float data, and the image geometry comes from
    // the stored Preprocess parameters. `workspace` is unused. Always returns 0.
    int PreprocessPluginV2::enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept
    {
        uint8_t* src = static_cast<uint8_t*>(const_cast<void*>(inputs[0]));
        float* dst = static_cast<float*>(outputs[0]);

        preprocess(dst, src, batchSize, mPreprocess.H, mPreprocess.W, mPreprocess.C, stream);

        return 0;
    }
}

================================================
FILE: real-esrgan/x4plus/preprocess.hpp
================================================
#pragma once
#include <NvInfer.h>
#include <fstream>
#include "macros.h"
#include <assert.h>

// Parameters of the pre-processing plugin.
// The struct is written to and read from the serialized plugin as raw bytes, so its
// field order and types define the serialized layout.
struct Preprocess {
    int N;  // not read by the plugin code visible here; presumably the batch size - TODO confirm
    int C;  // channel count (first output dimension, passed to the kernel as `channel`)
    int H;  // image height (second output dimension)
    int W;  // image width (third output dimension)
};

namespace nvinfer1
{
    //! TensorRT plugin wrapping the preprocess kernel: enqueue() reads the
    //! input binding as interleaved 8-bit data and writes planar 32-bit floats.
    //! Its whole configuration is one Preprocess struct, serialized as raw bytes.
    class PreprocessPluginV2 : public IPluginV2IOExt
    {
    public:
        //! Builds the plugin from an in-memory configuration.
        PreprocessPluginV2(const Preprocess& arg)
        {
            mPreprocess = arg;
        }

        //! Rebuilds the plugin from the bytes written by serialize().
        //! `length` must equal getSerializationSize() (checked with assert).
        PreprocessPluginV2(const void* data, size_t length)
        {
            const char* d = static_cast<const char*>(data);
            const char* const a = d;
            mPreprocess = read<Preprocess>(d);
            assert(d == a + length);
        }
        PreprocessPluginV2() = delete;

        virtual ~PreprocessPluginV2() {}

    public:
        int getNbOutputs() const noexcept override
        {
            return 1;
        }

        //! The output shape is always (C, H, W) from the stored configuration;
        //! the input dimensions are ignored.
        Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) noexcept override
        {
            return Dims3(mPreprocess.C, mPreprocess.H, mPreprocess.W);
        }

        int initialize() noexcept override
        {
            return 0;
        }

        void terminate() noexcept override
        {
        }

        //! No scratch memory is needed.
        size_t getWorkspaceSize(int maxBatchSize) const noexcept override
        {
            return 0;
        }

        int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;

        size_t getSerializationSize() const noexcept override
        {
            size_t serializationSize = 0;
            serializationSize += sizeof(mPreprocess);
            return serializationSize;
        }

        //! Writes the configuration struct into `buffer` as raw bytes.
        void serialize(void* buffer) const noexcept override
        {
            char* d = static_cast<char*>(buffer);
            const char* const a = d;
            write(d, mPreprocess);
            assert(d == a + getSerializationSize());
        }

        void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) noexcept override
        {
        }

        //! Only kLINEAR + kFLOAT is accepted, for the input and the output alike.
        //! enqueue() always reads raw bytes and writes 32-bit floats regardless of
        //! the negotiated type, and getOutputDataType() reports kFLOAT, so
        //! advertising kHALF/kINT8 would let a reduced-precision build select a
        //! tensor type the kernel cannot honour.
        bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const noexcept override
        {
            assert(nbInputs == 1 && nbOutputs == 1 && pos < nbInputs + nbOutputs);
            bool condition = inOut[pos].format == TensorFormat::kLINEAR;
            condition &= inOut[pos].type == DataType::kFLOAT;
            return condition;
        }

        //! The single output is always 32-bit float.
        DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const noexcept override
        {
            assert(inputTypes && nbInputs == 1);
            return DataType::kFLOAT;
        }

        const char* getPluginType() const noexcept override
        {
            return "preprocess";
        }

        const char* getPluginVersion() const noexcept override
        {
            return "1";
        }

        void destroy() noexcept override
        {
            delete this;
        }

        //! Copy-constructs a new plugin (configuration and namespace included).
        IPluginV2Ext* clone() const noexcept override
        {
            PreprocessPluginV2* plugin = new PreprocessPluginV2(*this);
            return plugin;
        }

        //! Stores the namespace; a null pointer is treated as the empty string
        //! because assigning a null const char* to std::string is undefined.
        void setPluginNamespace(const char* libNamespace) noexcept override
        {
            mNamespace = (libNamespace != nullptr) ? libNamespace : "";
        }

        const char* getPluginNamespace() const noexcept override
        {
            return mNamespace.data();
        }

        bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept override
        {
            return false;
        }

        bool canBroadcastInputAcrossBatch(int inputIndex) const noexcept override
        {
            return false;
        }

    private:
        //! Copies `val` into the buffer and advances the cursor past it.
        template <typename T>
        void write(char*& buffer, const T& val) const
        {
            *reinterpret_cast<T*>(buffer) = val;
            buffer += sizeof(T);
        }

        //! Reads a T from the buffer and advances the cursor past it.
        template <typename T>
        T read(const char*& buffer) const
        {
            T val = *reinterpret_cast<const T*>(buffer);
            buffer += sizeof(T);
            return val;
        }

    private:
        Preprocess mPreprocess;   // plugin configuration (N, C, H, W)
        std::string mNamespace;   // plugin namespace set through setPluginNamespace()
    };

    //! Factory registered with TensorRT for the "preprocess" plugin, version "1".
    class PreprocessPluginV2Creator : public IPluginCreator
    {
    public:
        const char* getPluginName() const noexcept override
        {
            return "preprocess";
        }

        const char* getPluginVersion() const noexcept override
        {
            return "1";
        }

        //! This creator does not describe any plugin fields.
        const PluginFieldCollection* getFieldNames() noexcept override
        {
            return nullptr;
        }

        //! Creates a plugin from `fc`, which this project uses to smuggle a
        //! pointer to a Preprocess struct (see build_engine in real-esrgan.cpp);
        //! it is NOT a real PluginFieldCollection.
        //! Returns nullptr when `fc` is null instead of dereferencing it.
        IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override
        {
            if (fc == nullptr)
            {
                return nullptr;
            }
            PreprocessPluginV2* plugin = new PreprocessPluginV2(*(Preprocess*)fc);
            // Assigning a null const char* to std::string is undefined; store "" instead.
            mPluginName = (name != nullptr) ? name : "";
            return plugin;
        }

        //! Recreates a plugin from the bytes produced by PreprocessPluginV2::serialize().
        IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override
        {
            auto plugin = new PreprocessPluginV2(serialData, serialLength);
            mPluginName = (name != nullptr) ? name : "";
            return plugin;
        }

        //! Stores the namespace; a null pointer is treated as the empty string.
        void setPluginNamespace(const char* libNamespace) noexcept override
        {
            mNamespace = (libNamespace != nullptr) ? libNamespace : "";
        }

        const char* getPluginNamespace() const noexcept override
        {
            return mNamespace.c_str();
        }

    private:
        std::string mNamespace;    // namespace set through setPluginNamespace()
        std::string mPluginName;   // name passed to the most recent create/deserialize call
    };
    REGISTER_TENSORRT_PLUGIN(PreprocessPluginV2Creator);
};


================================================
FILE: real-esrgan/x4plus/real-esrgan.cpp
================================================
#include "cuda_utils.h"
#include "common.hpp"
#include "preprocess.hpp"// preprocess plugin 
#include "postprocess.hpp"// postprocess plugin 
#include "logging.h"
#include "utils.h"
#include <unistd.h>//access()

#define DEVICE 0 // GPU id; NOTE(review): not referenced anywhere else in this file
#define BATCH_SIZE 1 // batch size used for engine building, buffer sizing and inference

// stuff we know about the network and the input/output blobs
static const int PRECISION_MODE = 32; // fp32 : 32, fp16 : 16
static const bool VISUALIZATION = true; // when true, main() shows the result in an OpenCV window
static const int INPUT_H = 640; // input image height
static const int INPUT_W = 448; // input image width
static const int INPUT_C = 3; // input image channels
static const int OUT_SCALE = 4; // output height/width are the input's multiplied by this factor
static const int OUTPUT_SIZE = INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE; // output elements per image
const char* INPUT_BLOB_NAME = "data"; // name of the network input tensor
const char* OUTPUT_BLOB_NAME = "prob"; // name of the network output tensor
static Logger gLogger; // logger passed to createInferBuilder / createInferRuntime

// Create the engine using only the network-definition API and not any parser.
//
// The network is: preprocess plugin -> conv_first -> 23 RRDB blocks -> conv_body
// (+ skip connection) -> two x2 nearest-neighbour upsample stages -> conv_hr ->
// conv_last -> postprocess plugin.
//
// maxBatchSize : maximum batch size configured on the builder and stored in the plugins
// builder      : TensorRT builder used to create and build the network
// config       : builder configuration (workspace size and precision flag are set here)
// dt           : data type declared for the input tensor
// wts_name     : path of the weight file handed to loadWeights()
//
// Returns the built engine, or nullptr when TensorRT fails to build it.
// The host copies of the weights are freed before returning.
ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {INPUT_H, INPUT_W, INPUT_C} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ INPUT_H, INPUT_W, INPUT_C });
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    // Custom preprocess (NHWC->NCHW, BGR->RGB, [0, 255]->[0, 1](Normalize))
    // maxBatchSize is unsigned while Preprocess holds ints: cast explicitly, because a
    // narrowing conversion inside a braced initializer is ill-formed.
    Preprocess preprocess{ static_cast<int>(maxBatchSize), INPUT_C, INPUT_H, INPUT_W };
    IPluginCreator* preprocess_creator = getPluginRegistry()->getPluginCreator("preprocess", "1");
    assert(preprocess_creator != nullptr);
    // The creator interprets the "field collection" pointer as a Preprocess struct.
    IPluginV2 *preprocess_plugin = preprocess_creator->createPlugin("preprocess_plugin", (PluginFieldCollection*)&preprocess);
    assert(preprocess_plugin != nullptr);
    IPluginV2Layer* preprocess_layer = network->addPluginV2(&data, 1, *preprocess_plugin);
    preprocess_layer->setName("preprocess_layer");
    ITensor* prep = preprocess_layer->getOutput(0);

    // conv_first
    IConvolutionLayer* conv_first = network->addConvolutionNd(*prep, 64, DimsHW{ 3, 3 }, weightMap["conv_first.weight"], weightMap["conv_first.bias"]);
    conv_first->setStrideNd(DimsHW{ 1, 1 });
    conv_first->setPaddingNd(DimsHW{ 1, 1 });
    conv_first->setName("conv_first");
    ITensor* feat = conv_first->getOutput(0);

    // conv_body: body.0 ... body.22 chained one after another
    ITensor* body_feat = RRDB(network, weightMap, feat, "body.0");
    for (int idx = 1; idx < 23; idx++) {
        body_feat = RRDB(network, weightMap, body_feat, "body." + std::to_string(idx));
    }

    IConvolutionLayer* conv_body = network->addConvolutionNd(*body_feat, 64, DimsHW{ 3, 3 }, weightMap["conv_body.weight"], weightMap["conv_body.bias"]);
    conv_body->setStrideNd(DimsHW{ 1, 1 });
    conv_body->setPaddingNd(DimsHW{ 1, 1 });
    // Skip connection around the whole body.
    IElementWiseLayer* ew1 = network->addElementWise(*feat, *conv_body->getOutput(0), ElementWiseOperation::kSUM);
    feat = ew1->getOutput(0);

    // upsample (x2 on the last two dimensions, nearest neighbour)
    IResizeLayer* interpolate_nearest = network->addResize(*feat);
    float scales1[] = { 1, 2, 2 };
    interpolate_nearest->setScales(scales1, 3);
    interpolate_nearest->setResizeMode(ResizeMode::kNEAREST);

    IConvolutionLayer* conv_up1 = network->addConvolutionNd(*interpolate_nearest->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["conv_up1.weight"], weightMap["conv_up1.bias"]);
    conv_up1->setStrideNd(DimsHW{ 1, 1 });
    conv_up1->setPaddingNd(DimsHW{ 1, 1 });
    IActivationLayer* leaky_relu_1 = network->addActivation(*conv_up1->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky_relu_1->setAlpha(0.2);

    // second x2 upsample stage
    IResizeLayer* interpolate_nearest2 = network->addResize(*leaky_relu_1->getOutput(0));
    float scales2[] = { 1, 2, 2 };
    interpolate_nearest2->setScales(scales2, 3);
    interpolate_nearest2->setResizeMode(ResizeMode::kNEAREST);
    IConvolutionLayer* conv_up2 = network->addConvolutionNd(*interpolate_nearest2->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["conv_up2.weight"], weightMap["conv_up2.bias"]);
    conv_up2->setStrideNd(DimsHW{ 1, 1 });
    conv_up2->setPaddingNd(DimsHW{ 1, 1 });
    IActivationLayer* leaky_relu_2 = network->addActivation(*conv_up2->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky_relu_2->setAlpha(0.2);

    IConvolutionLayer* conv_hr = network->addConvolutionNd(*leaky_relu_2->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["conv_hr.weight"], weightMap["conv_hr.bias"]);
    conv_hr->setStrideNd(DimsHW{ 1, 1 });
    conv_hr->setPaddingNd(DimsHW{ 1, 1 });
    IActivationLayer* leaky_relu_hr = network->addActivation(*conv_hr->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky_relu_hr->setAlpha(0.2);
    IConvolutionLayer* conv_last = network->addConvolutionNd(*leaky_relu_hr->getOutput(0), 3, DimsHW{ 3, 3 }, weightMap["conv_last.weight"], weightMap["conv_last.bias"]);
    conv_last->setStrideNd(DimsHW{ 1, 1 });
    conv_last->setPaddingNd(DimsHW{ 1, 1 });
    ITensor* out = conv_last->getOutput(0);

    // Custom postprocess (RGB -> BGR, NCHW->NHWC, *255, ROUND, uint8)
    Postprocess postprocess{ maxBatchSize, out->getDimensions().d[0], out->getDimensions().d[1], out->getDimensions().d[2] };
    IPluginCreator* postprocess_creator = getPluginRegistry()->getPluginCreator("postprocess", "1");
    assert(postprocess_creator != nullptr);
    IPluginV2 *postprocess_plugin = postprocess_creator->createPlugin("postprocess_plugin", (PluginFieldCollection*)&postprocess);
    assert(postprocess_plugin != nullptr);
    IPluginV2Layer* postprocess_layer = network->addPluginV2(&out, 1, *postprocess_plugin);
    postprocess_layer->setName("postprocess_layer");

    ITensor* final_tensor = postprocess_layer->getOutput(0);
    final_tensor->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*final_tensor);

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB

    if (PRECISION_MODE == 16) {
        std::cout << "==== precision f16 ====" << std::endl << std::endl;
        config->setFlag(BuilderFlag::kFP16);
    }
    else {
        std::cout << "==== precision f32 ====" << std::endl << std::endl;
    }

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Report the real outcome; the caller still checks the returned pointer.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    }
    else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    delete network;

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, std::string& wts_name) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine *engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);

    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    delete engine;
    delete builder;
    delete config;
}

// Runs one batch through the engine and copies the result back to the host.
//
// context   : execution context of the deserialized engine
// stream    : CUDA stream on which inference and the copy are queued
// buffers   : device bindings; buffers[1] is read as the output binding
// output    : host buffer receiving batchSize * OUTPUT_SIZE bytes
// batchSize : number of images in the batch
//
// Blocks until the stream has finished. A failed enqueue is reported on
// stderr, and CUDA errors go through CUDA_CHECK like everywhere else in this file.
void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, uint8_t* output, int batchSize) {
    // infer on the batch asynchronously, and DMA output back to host
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed!" << std::endl;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Parses the command line.
//   -s <wts> <engine>     : fills `wts` and `engine`
//   -d <engine> <img_dir> : fills `engine` and `img_dir`
// Returns false for any other flag or argument count; outputs are untouched then.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir) {
    // Both modes take exactly: program, flag, path, path.
    if (argc != 4) {
        return false;
    }

    const std::string mode(argv[1]);
    if (mode == "-s") {
        wts = argv[2];
        engine = argv[3];
        return true;
    }
    if (mode == "-d") {
        engine = argv[2];
        img_dir = argv[3];
        return true;
    }
    return false;
}

// ./real-esrgan -s ./real-esrgan.wts ./real-esrgan_f32.engine
// ./real-esrgan -d ./real-esrgan_f32.engine ../samples

int main(int argc, char** argv) {
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    if (!parse_args(argc, argv, wts_name, engine_name, img_dir)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./real-esrgan -s [.wts] [.engine] // serialize model to plan file" << std::endl;
        std::cerr << "./real-esrgan -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    if (!wts_name.empty()) {
        IHostMemory* modelStream{ nullptr };
        APIToModel(BATCH_SIZE, &modelStream, wts_name);
        assert(modelStream != nullptr);
        std::ofstream p(engine_name, std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        delete modelStream;
        return 0;
    }

    // deserialize the .engine and run inference
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        return -1;
    }
    char *trtModelStream = nullptr;
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    assert(engine->getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex == 0);
    assert(outputIndex == 1);

    // Create GPU buffers on device	
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(uint8_t)));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(uint8_t)));

    std::vector<uint8_t> input(BATCH_SIZE * INPUT_H * INPUT_W * INPUT_C);
    std::vector<uint8_t> outputs(BATCH_SIZE * OUTPUT_SIZE);

    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    std::vector<cv::Mat> imgs_buffer(BATCH_SIZE);
    for (int f = 0; f < (int)file_names.size(); f++) {

        for (int b = 0; b < BATCH_SIZE; b++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[f]);
            if (img.empty()) continue;
            memcpy(input.data() + b * INPUT_H * INPUT_W * INPUT_C, img.data, INPUT_H * INPUT_W * INPUT_C);
        }

        CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input.data(), BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(uint8_t), cudaMemcpyHostToDevice, stream));

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, stream, (void**)buffers, outputs.data(), BATCH_SIZE);
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    cv::Mat frame = cv::Mat(INPUT_H * OUT_SCALE, INPUT_W * OUT_SCALE, CV_8UC3, outputs.data());
    cv::imwrite("../_" + file_names[0] + ".png", frame);

    if (VISUALIZATION) {
        cv::imshow("result : " + file_names[0], frame);
        cv::waitKey(0);
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
}

================================================
FILE: real-esrgan/x4plus/utils.h
================================================
#ifndef TRTX_REAL_ESRGAN_UTILS_H_
#define TRTX_REAL_ESRGAN_UTILS_H_

#include <dirent.h>
#include <opencv2/opencv.hpp>

// Appends the name of every entry of directory `p_dir_name`, except "." and
// "..", to `file_names`. Only the bare entry names are stored, not full paths.
// Returns 0 on success and -1 when the directory cannot be opened.
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir = opendir(p_dir_name);
    if (!dir) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* name = entry->d_name;
        const bool is_self = strcmp(name, ".") == 0;
        const bool is_parent = strcmp(name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.push_back(std::string(name));
    }

    closedir(dir);
    return 0;
}

#endif  // TRTX_REAL_ESRGAN_UTILS_H_


================================================
FILE: refinedet/CMakeLists.txt
================================================
# Build script for the RefineDet TensorRT demo (executable: refinedet).
# NOTE(review): the TensorRT, OpenCV and libtorch locations below are absolute,
# machine-specific paths; adapt them to the local installation before building.
cmake_minimum_required(VERSION 2.6)

project(refinedet)

add_definitions(-std=c++11)

# NOTE(review): option() is documented as option(<variable> "<help_text>" [value]),
# so OFF sits in the help-text position here -- confirm this is what was intended.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

# tensorrt
include_directories(/data_2/tensorrt/TensorRT-7.0.0.11/include/) #include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/data_2/tensorrt/TensorRT-7.0.0.11/lib/) #link_directories(/usr/lib/x86_64-linux-gnu/)


find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)

# OpenCV is taken from a fixed install location instead of find_package (kept disabled below).
#find_package(OpenCV)
#include_directories(OpenCV_INCLUDE_DIRS)

include_directories(/home/software_install/opencv3.4.6/include)
link_directories(/home/software_install/opencv3.4.6/lib)


# libtorch: CMAKE_PREFIX_PATH lets find_package(Torch) locate the install.
set(CMAKE_PREFIX_PATH "/data_1/torch1.1.0") ###torch1.1.0
find_package(Torch REQUIRED)

include_directories(/data_1/torch1.1.0/include)
link_directories(/data_1/torch1.1.0/lib)


set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")


# The demo is built from the INT8 calibrator and the main program.
add_executable(refinedet ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/refinedet.cpp)
target_link_libraries(refinedet nvinfer)
target_link_libraries(refinedet cudart)
target_link_libraries(refinedet "${TORCH_LIBRARIES}")
target_link_libraries(refinedet opencv_calib3d opencv_core opencv_dnn opencv_imgproc opencv_highgui opencv_imgcodecs caffe2)

# NOTE(review): -O2 is added here while CMAKE_CXX_FLAGS above already carries -Ofast,
# and CMAKE_BUILD_TYPE is Debug -- confirm which optimization level is wanted.
add_definitions(-O2 -pthread)


================================================
FILE: refinedet/README.md
================================================
# RefineDet

For the PyTorch implementation, refer to [luuuyi/RefineDet.PyTorch](https://github.com/luuuyi/RefineDet.PyTorch).

## How to run

```
1. Generate the wts file from the PyTorch model:
python gen_wts_refinedet.py
// a file 'refinedet.wts' will be generated.

2. Build tensorrtx/refinedet and run it, or open the project in CLion (recommended).
All settings live in configure.h:
set your own paths and choose the mode (SERIALIZE or INFER) there.
See the comments in configure.h for details.
mkdir build
cd build
cmake ..
make
```

## Dependencies

```
TensorRT 7.0.0.11
OpenCV >= 3.4
libtorch >= 1.1.0
```

## Features

1. TensorRT network with multiple outputs  
2. L2Norm  
3. Post-processing with libtorch

## More Information

See the README on the [home page](https://github.com/wang-xinyu/tensorrtx).  
[TensorRT tutorials](https://github.com/wang-xinyu/tensorrtx/tree/master/tutorials)  
For more detailed guidance, see [yhl blog](https://www.cnblogs.com/yanghailin/p/14525128.html)


================================================
FILE: refinedet/calibrator.cpp
================================================
#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/dnn/dnn.hpp>
#include "calibrator.h"
#include "cuda_runtime_api.h"
#include "utils.h"

// Stores the calibration settings, allocates the device buffer for one batch
// of 3-channel float input and lists the files found in `img_dir`.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache)
{
    // Number of floats in one batch: 3 channels per pixel.
    const int floats_per_batch = 3 * input_w * input_h * batchsize;
    input_count_ = floats_per_batch;

    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));

    // The return value is not checked; a failed listing leaves img_files_ empty.
    read_files_in_dir(img_dir, img_files_);
}

// Releases the device buffer allocated in the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const
{
    return batchsize_;
}

bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings)
{
    if (img_idx_ + batchsize_ > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> input_imgs_;
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
        if (temp.empty()){
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
//        cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
        input_imgs_.push_back(temp);
    }
    img_idx_ += batchsize_;
    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0, cv::Size(input_w_, input_h_), cv::Scalar(123.0, 117.0, 104.0), true, false);
//    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);

    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Loads the calibration table from disk into calib_cache_.
// `length` receives the number of bytes read. Returns a pointer to the cached
// bytes, or nullptr when caching is disabled, the file cannot be read or it is empty.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length)
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;

    calib_cache_.clear();
    std::ifstream table(calib_table_name_, std::ios::binary);
    if (read_cache_ && table.good())
    {
        // Take every byte of the file verbatim, whitespace included.
        calib_cache_.assign(std::istreambuf_iterator<char>(table), std::istreambuf_iterator<char>());
    }

    length = calib_cache_.size();
    if (length == 0)
    {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes `length` bytes of calibration data to the calibration table file,
// replacing any previous content.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length)
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;

    const char* bytes = static_cast<const char*>(cache);
    std::ofstream table(calib_table_name_, std::ios::binary);
    table.write(bytes, length);
}


================================================
FILE: refinedet/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include "NvInfer.h"
#include <string>
#include <vector>

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
//!  Feeds batches of images from a directory to TensorRT during INT8
//!  calibration and reads/writes the calibration table file.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
    //! \param batchsize        number of images per calibration batch
    //! \param input_w          network input width
    //! \param input_h          network input height
    //! \param img_dir          directory holding the calibration images
    //! \param calib_table_name path of the calibration table file
    //! \param input_blob_name  name of the network input tensor (the pointer is stored, not copied)
    //! \param read_cache       when false, an existing calibration table is ignored
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);

    virtual ~Int8EntropyCalibrator2();
    //! Returns the batch size given to the constructor.
    int getBatchSize() const override;
    //! Uploads the next batch and stores its device pointer in bindings[0];
    //! returns false when no full batch is left or an image cannot be read.
    bool getBatch(void* bindings[], const char* names[], int nbBindings) override;
    //! Returns the cached calibration table (nullptr if there is none) and its size in `length`.
    const void* readCalibrationCache(size_t& length) override;
    //! Writes the calibration table to calib_table_name_.
    void writeCalibrationCache(const void* cache, size_t length) override;

private:
    int batchsize_;                       // images per batch
    int input_w_;                         // network input width
    int input_h_;                         // network input height
    int img_idx_;                         // index of the next image to load from img_files_
    std::string img_dir_;                 // directory holding the calibration images
    std::vector<std::string> img_files_;  // file names found in img_dir_
    size_t input_count_;                  // number of floats in one batch (3 * w * h * batchsize)
    std::string calib_table_name_;        // path of the calibration table file
    const char* input_blob_name_;         // name of the input tensor (not owned)
    bool read_cache_;                     // whether readCalibrationCache may load the table
    void* device_input_;                  // device buffer holding one batch
    std::vector<char> calib_cache_;       // bytes of the calibration table last read
};

#endif // ENTROPY_CALIBRATOR_H


================================================
FILE: refinedet/configure.h
================================================

#define USE_FP32  // precision selector: USE_INT8, USE_FP16 or USE_FP32

const int num_class = 25; // number of classes including the background class (object classes + 1)

// Run mode, selected by the #define below:
//   SERIALIZE: build the engine and serialize it; path_wts and path_save_engine must be set.
//   INFER:     run inference with an existing engine; path_engine must be set.
#define INFER    // SERIALIZE or INFER

const std::string path_engine = "/data_2//cmake-build-debug/refinedet_0312-now.engine";
const std::string path_wts = "/data_1/refinedet/pytorch_refinedet-master/refinedet0312.wts";
const std::string path_save_engine = "./refinedet_0312-now.engine";

// Folder containing the pictures to run detection on
const char *p_dir_name = "/data_1/img/";

const float TH = 0.2;  // confidence threshold
const int T_show = 1; // 1: show the detections      0: generate TXT files for map testing (presumably mAP)
// Where the generated TXT files are saved when testing the map
std::string save_path_txt = "/data_1/txt/";

#define DEVICE 0  // GPU id

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 320;
static const int INPUT_W = 320;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME_arm_loc = "arm_loc";
const char* OUTPUT_BLOB_NAME_arm_conf = "arm_conf";
const char* OUTPUT_BLOB_NAME_odm_loc = "odm_loc";
const char* OUTPUT_BLOB_NAME_odm_conf = "odm_conf";

// Class index -> display name. There are 25 entries, the same count as
// num_class, and index 0 is the background class.
// NOTE(review): the two-letter names look like placeholders -- replace them
// with the real class names of the trained model.
std::string label_map[] =
        {
                "background",
                "aa",
                "bb",
                "cc",
                "dd",
                "ee",
                "ff",
                "gg",
                "hh",
                "ii",
                "jj",
                "kk",
                "ll",
                "mm",
                "nn",
                "oo",
                "pp",
                "qq",
                "rr",
                "ss",
                "tt",
                "uu",
                "vv",
                "ww",
                "xx"
        };

================================================
FILE: refinedet/gen_wts_refinedet.py
================================================
import torch
import torch.nn as nn
import struct
from models.refinedet import build_refinedet


# Export the weights of a trained RefineDet model to the plain-text .wts format:
#   line 1:   number of tensors
#   per line: <name> <element count> followed by one big-endian float32 hex word per element
num_classes = 25
path_model = "/data_2/project_2021/pytorch_refinedet/2021/20210308.pth"
path_save_wts = "./refinedet0312.wts"
input_size = 320

net = build_refinedet('test', input_size, num_classes)  # initialize net
# map_location='cpu' lets a checkpoint saved on a GPU be loaded on a CPU-only machine;
# load_state_dict copies the values into the existing parameters either way.
net.load_state_dict(torch.load(path_model, map_location='cpu'))
net.eval()

# Build the state dict once instead of once for the count and once for the loop.
state = net.state_dict()

# The context manager guarantees the file is flushed and closed, also on errors.
with open(path_save_wts, 'w') as f:
    f.write('{}\n'.format(len(state.keys())))
    for k, v in state.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

print("success generate wts!")

================================================
FILE: refinedet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, forwards its contents to an
//!  output stream, preceded by a timestamp and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the log lines
    //! \param prefix    text inserted between the timestamp and each message
    //! \param shouldLog when false, putOutput() does nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Takes over the target stream, prefix and logging flag of `other`.
    //! All three members are initialized: leaving mShouldLog indeterminate would
    //! make the read in putOutput() undefined behaviour. Buffered text is not
    //! transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer contents>" to the target
    //! stream, empties the buffer and flushes the stream. Does nothing (and
    //! keeps the buffered text) when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // Format the timestamp in a scratch stream: it then goes to the same
            // stream as the message, and the sticky fill character does not leak
            // into std::cout or the target stream.
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::ostringstream stamp;
            if (tm_local != nullptr)
            {
                stamp << "[";
                stamp << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                stamp << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;  // stream receiving the log lines
    std::string mPrefix;    // text placed before every message
    bool mShouldLog;        // whether putOutput() writes anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
public:
    //! \brief Constructs the owned buffer by forwarding all three arguments, unchanged and in order,
    //!  to the LogStreamConsumerBuffer constructor.
    //! \param stream    forwarded to the buffer (presumably the stream the buffer writes to)
    //! \param prefix    forwarded to the buffer (presumably the text prepended to each record)
    //! \param shouldLog forwarded to the buffer (presumably the initial enable flag)
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    // Protected so that a derived class can pass the buffer's address on (LogStreamConsumer hands it to std::ostream).
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Builds a consumer for messages of level \p severity.
    //!  Output is enabled only when \p severity compares less than or equal to \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // mBuffer belongs to the first base, so it is constructed before this line runs
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Creates a new consumer with the same severity and enable flag as \p other.
    //!  A fresh buffer is constructed; nothing buffered in \p other is carried over.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream interface to the newly built buffer
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer emits output for the given threshold
    //!  and pushes the result down to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! \brief Selects the destination stream: std::cout for kINFO and anything comparing greater,
    //!  std::cerr for everything else.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter tag; an unknown value asserts and yields "".
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Constructs a logger with the given reportable severity (kWARNING when omitted)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity The severity of the message; it is filtered against the current reportable severity
    //! \param msg The message text. A null pointer is logged as an empty message.
    //!
    void log(Severity severity, const char* msg) override
    {
        // Stream the C string directly (no temporary std::string) and substitute "" for a null pointer,
        // because constructing a std::string from a null pointer is undefined behaviour.
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << (msg != nullptr ? msg : "") << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        // Only Logger (a friend) can create a TestAtom; see Logger::defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set to true by Logger::reportTestStart()
        std::string mName;    // test name printed in result lines
        std::string mCmdline; // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Check the precondition before emitting anything, in the same order as reportTestEnd().
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently used for filtering messages
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Writes one line "&&&& <RESULT> <name> # <cmdline>" to the stream selected for kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};


namespace
{

//!
//! \brief builds a LogStreamConsumer for kVERBOSE messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief builds a LogStreamConsumer for kINFO messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief builds a LogStreamConsumer for kWARNING messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief builds a LogStreamConsumer for kERROR messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief builds a LogStreamConsumer for kINTERNAL_ERROR ("fatal") messages, filtered by the logger's
//!        reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: refinedet/refinedet.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "utils.h"
#include "logging.h"
#include "calibrator.h"
#include "configure.h"

#include <torch/script.h> // One-stop header.
#include "torch/torch.h"
#include "torch/jit.h"

// Makes the nvinfer1 names used throughout this file (Weights, DimsHW, ITensor, ...) available unqualified.
using namespace nvinfer1;
// File-local, default-constructed logger; Logger is presumably the class declared in logging.h included above.
static Logger gLogger;

// Adjusts the rectangle r in place against the size of m, using (m.cols - 1) and (m.rows - 1) as the far limits:
//  - an origin coordinate that is negative, or at/after its limit, is reset to 0;
//  - a non-positive width/height becomes 1;
//  - a width/height reaching past its limit is shortened to end at that limit.
void RoiCorrect(const cv::Mat &m, cv::Rect &r)
{
    const int limit_x = m.cols - 1;
    const int limit_y = m.rows - 1;

    if (r.x < 0 || r.x >= limit_x) r.x = 0;
    if (r.y < 0 || r.y >= limit_y) r.y = 0;

    if (r.width <= 0) r.width = 1;
    if (r.height <= 0) r.height = 1;

    const int right = r.x + r.width;
    if (right > limit_x) r.width = limit_x - r.x;

    const int bottom = r.y + r.height;
    if (bottom > limit_y) r.height = limit_y - r.y;
}

// Loads a whitespace-delimited weight file into a name -> Weights map. As read by this function the layout is:
//   [count]
//   [name] [size] <size values in hex>      (repeated count times)
// Each value is stored as the 32-bit pattern that was read and the blob is labelled DataType::kFLOAT.
// The value buffers are allocated with malloc() and are not freed here; the returned Weights point into them.
// An unopenable file or a non-positive count trips an assert (and yields an empty map when asserts are
// compiled out).
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs; initialised so that a failed read cannot leave it indeterminate
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // "> 0" keeps the loop finite for a negative count when asserts are disabled
    while (count-- > 0)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. The element size is sizeof(uint32_t); sizeof(val) would be the size of the pointer
        // and over-allocate on platforms where pointers are wider than 32 bits.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Unable to allocate weight buffer.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer computed from batch-norm parameters stored in weightMap under
// "<lname>.weight", "<lname>.bias", "<lname>.running_mean" and "<lname>.running_var":
//   scale = weight / sqrt(running_var + eps)
//   shift = bias - running_mean * weight / sqrt(running_var + eps)
//   power = 1
// The three generated arrays are malloc()'ed and registered in weightMap as "<lname>.scale", "<lname>.shift"
// and "<lname>.power". Returns the created layer (asserted non-null).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float *gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float *beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float *mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float *var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    // the channel count is taken from the running_var blob
    int len = weightMap[lname + ".running_var"].count;

    float *scale_vals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *shift_vals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *power_vals = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    for (int ch = 0; ch < len; ++ch) {
        // auto keeps whatever type the visible sqrt overload returns, exactly as in an inline call
        const auto denom = sqrt(var[ch] + eps);
        scale_vals[ch] = gamma[ch] / denom;
        shift_vals[ch] = beta[ch] - mean[ch] * gamma[ch] / denom;
        power_vals[ch] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scale_vals, len};
    Weights shift{DataType::kFLOAT, shift_vals, len};
    Weights power{DataType::kFLOAT, power_vals, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn_layer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    return bn_layer;
}

// Adds a square convolution followed by ReLU.
// Weights are looked up in weightMap under "<pre_name><linx>.weight" and "<pre_name><linx>.bias"; a missing key
// is only reported on stdout (the subsequent lookup then inserts a default-constructed entry).
//   outch    number of output channels
//   ksize    kernel height and width
//   s, p     stride and padding, applied to both spatial dimensions
//   b_dilate when true, a dilation of 3x3 is set on the convolution
// Returns the activation layer.
// Example: convRelu(network, weightMap, *data, 64, 3, 1, 1, 0);
ILayer* convRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, int ksize, int s, int p,
        int linx, const std::string pre_name = "vgg.", bool b_dilate = false) {
    // Build each key once instead of re-concatenating it for every use.
    const std::string key_prefix = pre_name + std::to_string(linx);
    const std::string weight_key = key_prefix + ".weight";
    const std::string bias_key = key_prefix + ".bias";

    if (weightMap.count(weight_key) == 0)
        std::cout << "no key: " << weight_key << std::endl;

    if (weightMap.count(bias_key) == 0)
        std::cout << "no key: " << bias_key << std::endl;

    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[weight_key], weightMap[bias_key]);
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{p, p});
    if (b_dilate)
    {
        // Use the Nd setter like the stride/padding calls above; the non-Nd setDilation() is the legacy API.
        conv1->setDilationNd(DimsHW{3, 3});
    }

    auto lr = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    assert(lr);

    return lr;
}

// Adds a square convolution (kernel ksize, stride s, padding p, outch output channels) followed by ReLU.
// The weights are taken from weightMap under the explicit keys weight_name and bias_name; a missing key is only
// reported on stdout. Returns the activation layer.
ILayer* convRelu_extras(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, int ksize, int s, int p, const std::string weight_name, const std::string bias_name){
    const bool weight_missing = (weightMap.find(weight_name) == weightMap.end());
    if (weight_missing) {
        std::cout << "no key: " << weight_name << std::endl;
    }

    const bool bias_missing = (weightMap.find(bias_name) == weightMap.end());
    if (bias_missing) {
        std::cout << "no key: " << bias_name << std::endl;
    }

    const DimsHW kernel{ksize, ksize};
    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, kernel, weightMap[weight_name], weightMap[bias_name]);
    assert(conv);

    const DimsHW stride{s, s};
    const DimsHW padding{p, p};
    conv->setStrideNd(stride);
    conv->setPaddingNd(padding);

    return network->addActivation(*conv->getOutput(0), ActivationType::kRELU);
}

// Adds conv -> ReLU -> conv using weights "tcb0.<indx_0>.{weight,bias}" for the first convolution and
// "tcb0.<indx_1>.{weight,bias}" for the second. The first convolution uses outch/ksize/s/p; the second is fixed
// at 256 output channels, 3x3 kernel, stride 1, padding 1. Missing keys are only reported on stdout.
// Returns the second convolution layer.
IConvolutionLayer* convReluconv_tcb0(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, int ksize, int s, int p, int indx_0, int indx_1){
    const std::string first = "tcb0." + std::to_string(indx_0);
    const std::string second = "tcb0." + std::to_string(indx_1);

    // Checked (and reported) in this order: first weight, first bias, second weight, second bias.
    const std::string keys[4] = {first + ".weight", first + ".bias", second + ".weight", second + ".bias"};
    for (const std::string& key : keys) {
        if (weightMap.count(key) == 0)
            std::cout << "no key: " << key << std::endl;
    }

    IConvolutionLayer* head = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[keys[0]], weightMap[keys[1]]);
    assert(head);
    head->setStrideNd(DimsHW{s, s});
    head->setPaddingNd(DimsHW{p, p});

    auto relu = network->addActivation(*head->getOutput(0), ActivationType::kRELU);

    IConvolutionLayer* tail = network->addConvolutionNd(*relu->getOutput(0), 256, DimsHW{3, 3}, weightMap[keys[2]], weightMap[keys[3]]);
    assert(tail);
    tail->setStrideNd(DimsHW{1, 1});
    tail->setPaddingNd(DimsHW{1, 1});

    return tail;
}

// Adds ReLU -> conv -> ReLU. The convolution (outch channels, square kernel ksize, stride s, padding p) takes its
// weights from "tcb2.<indx_0>.weight" and "tcb2.<indx_0>.bias"; a missing key is only reported on stdout.
// Returns the trailing activation layer.
ILayer* ReluconvRelu_tcb2(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, int ksize, int s, int p, int indx_0){
    // The leading activation is added before any weight lookup takes place.
    auto pre_relu = network->addActivation(input, ActivationType::kRELU);

    const std::string stem = "tcb2." + std::to_string(indx_0);
    const std::string weight_key = stem + ".weight";
    const std::string bias_key = stem + ".bias";

    if (weightMap.find(weight_key) == weightMap.end()) {
        std::cout << "no key: " << weight_key << std::endl;
    }
    if (weightMap.find(bias_key) == weightMap.end()) {
        std::cout << "no key: " << bias_key << std::endl;
    }

    IConvolutionLayer* conv = network->addConvolutionNd(*pre_relu->getOutput(0), outch, DimsHW{ksize, ksize}, weightMap[weight_key], weightMap[bias_key]);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});

    return network->addActivation(*conv->getOutput(0), ActivationType::kRELU);
}

// Adds a square convolution (outch channels, kernel ksize, stride s, padding p) using the weights stored under
// weight_name / bias_name, then a shuffle layer whose first transpose is the permutation {1, 2, 0}
// (presumably CHW -> HWC; confirm against the network's tensor layout).
// Missing keys are only reported on stdout. Returns the shuffle layer.
ILayer* conv_permutation(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, int ksize, int s, int p, const std::string weight_name, const std::string bias_name)
{
    if (weightMap.find(weight_name) == weightMap.end()) {
        std::cout << "no key: " << weight_name << std::endl;
    }
    if (weightMap.find(bias_name) == weightMap.end()) {
        std::cout << "no key: " << bias_name << std::endl;
    }

    const DimsHW kernel{ksize, ksize};
    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, kernel, weightMap[weight_name], weightMap[bias_name]);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});

    auto permuted = network->addShuffle(*conv->getOutput(0));
    const Permutation order{1, 2, 0};
    permuted->setFirstTranspose(order);

    return permuted;
}

// Flattens output 0 of each of the four given layers to one dimension (reshape to {-1}) and concatenates the
// four flattened tensors, in argument order. Returns the concatenation layer.
// Per the original author's notes the inputs are e.g. 40x40x12, 20x20x12, 10x10x12 and 5x5x12, giving [25500].
ILayer* cat_4_tensor(INetworkDefinition *network, ILayer*tensor_0, ILayer*tensor_1, ILayer*tensor_2, ILayer*tensor_3)
{
    // 1-D reshape target; -1 lets TensorRT infer the element count
    Dims flat;
    flat.nbDims = 1;
    flat.d[0] = -1;

    ILayer* sources[4] = {tensor_0, tensor_1, tensor_2, tensor_3};
    ITensor* flattened[4];
    for (int i = 0; i < 4; ++i)
    {
        auto reshape = network->addShuffle(*sources[i]->getOutput(0));
        assert(reshape);
        reshape->setReshapeDimensions(flat);
        flattened[i] = reshape->getOutput(0);
    }

    return network->addConcatenation(flattened, 4);
}


// Applies softmax over groups of `ch` consecutive values of a flattened tensor:
//   1. reshape the input to (1, -1, ch)
//   2. softmax with axes bitmask 1<<2, i.e. along the last (size ch) dimension
//   3. reshape the result back to one dimension
// Per the original author's notes the input is one-dimensional, e.g. [12750] -> [1, 6375, 2] for ch == 2.
// Returns the final shuffle layer.
ILayer* reshapeSoftmax(INetworkDefinition *network, ITensor& input, int ch) {
    auto re1 = network->addShuffle(input);
    assert(re1);
    re1->setReshapeDimensions(Dims3(1, -1, ch)); //[1,6375,2];

    // Debug print of the reshaped size. Only the nbDims valid entries are printed; the reshape target has
    // three dimensions, so d[3] carries no meaning.
    Dims dim0 = re1->getOutput(0)->getDimensions();
    std::cout << "debug  re1 dim==";
    for (int i = 0; i < dim0.nbDims; i++) {
        if (i > 0)
            std::cout << " ";
        std::cout << dim0.d[i];
    }
    std::cout << std::endl;

    auto sm = network->addSoftMax(*re1->getOutput(0));
    // check the layer before it is dereferenced
    assert(sm);
    sm->setAxes(1<<2);

    //And then reshape one-dimensional again, and it's the same shape as it came in
    Dims dim_;
    dim_.nbDims=1;
    dim_.d[0]=-1;
    auto re2 = network->addShuffle(*sm->getOutput(0));
    assert(re2);
    re2->setReshapeDimensions(dim_);

    return re2;
}

// Builds an L2-normalisation sub-graph followed by a learned per-channel scale:
//   sq   = input ^ 2                              (uniform scale layer, power 2)
//   sum  = reduce-sum of sq, axes bitmask 1, dimensions kept
//   norm = sum ^ 0.5                              (uniform scale layer, power 0.5)
//   out  = (input / norm) * weightMap[pre_name]   (per-channel scale layer, shift 0, power 1)
// The reference expression quoted by the original author is
//   norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
// Note that no epsilon term is added by the layers built here.
// Two helper arrays (all ones / all zeros, same length as the weight blob) are malloc()'ed and registered in
// weightMap as "<pre_name>.power3" and "<pre_name>.power4". Returns the final scale layer.
IScaleLayer* L2norm(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, const std::string pre_name = "conv4_3_L2Norm.weight")
{
    // Static storage: the Weights below only hold pointers to these constants.
    static const float kZero = 0.0f;
    static const float kOne = 1.0f;
    static const float kTwo = 2.0f;
    static const float kHalf = 0.5f;

    // element-wise square
    Weights sq_shift{DataType::kFLOAT, &kZero, 1};
    Weights sq_scale{DataType::kFLOAT, &kOne, 1};
    Weights sq_power{DataType::kFLOAT, &kTwo, 1};
    IScaleLayer* squared = network->addScale(input, ScaleMode::kUNIFORM, sq_shift, sq_scale, sq_power);
    assert(squared);

    // sum of squares over the axis selected by bitmask 1, keeping the reduced dimension
    IReduceLayer* sum_sq = network->addReduce(*squared->getOutput(0), ReduceOperation::kSUM, 1, true);
    assert(sum_sq);

    // square root of the sum
    Weights rt_shift{DataType::kFLOAT, &kZero, 1};
    Weights rt_scale{DataType::kFLOAT, &kOne, 1};
    Weights rt_power{DataType::kFLOAT, &kHalf, 1};
    IScaleLayer* norm = network->addScale(*sum_sq->getOutput(0), ScaleMode::kUNIFORM, rt_shift, rt_scale, rt_power);
    assert(norm);

    // input divided by its norm
    IElementWiseLayer* normalized = network->addElementWise(input, *norm->getOutput(0), ElementWiseOperation::kDIV);
    assert(normalized);

    // per-channel multiplication by the stored weight: shift = 0, scale = weight, power = 1
    int len = weightMap[pre_name].count;

    float* ones = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; ++i) {
        ones[i] = 1.0f;
    }
    Weights unit_power{DataType::kFLOAT, ones, len};
    weightMap[pre_name + ".power3"] = unit_power;

    float* zeros = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; ++i) {
        zeros[i] = 0.0f;
    }
    Weights zero_shift{DataType::kFLOAT, zeros, len};
    weightMap[pre_name + ".power4"] = zero_shift;

    IScaleLayer* scaled = network->addScale(*normalized->getOutput(0), ScaleMode::kCHANNEL, zero_shift, weightMap[pre_name], unit_power);
    assert(scaled);
    return scaled;
}


// Adds a bias-free square convolution, a batch-norm (via addBatchNorm2d, eps 1e-5) and a leaky ReLU with
// alpha 0.1. Weights are read from "module_list.<linx>.Conv2d.weight" and the "module_list.<linx>.BatchNorm2d"
// entries. Returns the activation layer.
// Example: convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0);
ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, int ksize, int s, int p, int linx) {
    const std::string layer_prefix = "module_list." + std::to_string(linx);

    // the convolution carries no bias of its own
    Weights no_bias{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[layer_prefix + ".Conv2d.weight"], no_bias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), layer_prefix + ".BatchNorm2d", 1e-5);

    auto leaky = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky->setAlpha(0.1);

    return leaky;
}

// Creates the engine using only the TensorRT layer API (no parser).
//
// maxBatchSize : forwarded to IBuilder::setMaxBatchSize.
// builder      : builder used to create the network and build the engine.
// config       : builder config; workspace size and the FP16/INT8 flags are set here.
// dt           : data type of the network input tensor.
// Returns the built engine, or nullptr when buildEngineWithConfig fails.
//
// The weights are loaded from `path_wts`; the host copies are released before
// returning. Four tensors are marked as outputs: arm_loc, arm_conf (softmax),
// odm_loc and odm_conf (softmax).
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights(path_wts);

    // ---- backbone: conv/ReLU groups separated by 2x2 stride-2 max pooling ----
    auto lr0 = convRelu(network, weightMap, *data, 64, 3, 1, 1, 0);
    auto lr1 = convRelu(network, weightMap, *lr0->getOutput(0), 64, 3, 1, 1, 2);
    IPoolingLayer* pool1 = network->addPoolingNd(*lr1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});

    auto lr2 = convRelu(network, weightMap, *pool1->getOutput(0), 128, 3, 1, 1, 5);
    auto lr3 = convRelu(network, weightMap, *lr2->getOutput(0), 128, 3, 1, 1, 7);
    IPoolingLayer* pool2 = network->addPoolingNd(*lr3->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool2);
    pool2->setStrideNd(DimsHW{2, 2});

    auto lr4 = convRelu(network, weightMap, *pool2->getOutput(0), 256, 3, 1, 1, 10);
    auto lr5 = convRelu(network, weightMap, *lr4->getOutput(0), 256, 3, 1, 1, 12);
    auto lr6 = convRelu(network, weightMap, *lr5->getOutput(0), 256, 3, 1, 1, 14);
    IPoolingLayer* pool3 = network->addPoolingNd(*lr6->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool3);
    pool3->setStrideNd(DimsHW{2, 2});

    auto lr7 = convRelu(network, weightMap, *pool3->getOutput(0), 512, 3, 1, 1, 17);
    auto lr8 = convRelu(network, weightMap, *lr7->getOutput(0), 512, 3, 1, 1, 19);
    auto lr9 = convRelu(network, weightMap, *lr8->getOutput(0), 512, 3, 1, 1, 21);
    IPoolingLayer* pool4 = network->addPoolingNd(*lr9->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool4);
    pool4->setStrideNd(DimsHW{2, 2});

    auto lr24 = convRelu(network, weightMap, *pool4->getOutput(0), 512, 3, 1, 1, 24);
    auto lr26 = convRelu(network, weightMap, *lr24->getOutput(0), 512, 3, 1, 1, 26);
    auto lr28 = convRelu(network, weightMap, *lr26->getOutput(0), 512, 3, 1, 1, 28);
    IPoolingLayer* pool5 = network->addPoolingNd(*lr28->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool5);
    pool5->setStrideNd(DimsHW{2, 2});

    auto lr31 = convRelu(network, weightMap, *pool5->getOutput(0), 1024, 3, 1, 3, 31,"vgg.",true);

    // ---- the four source feature maps s_0 .. s_3 ----
    //s_0
    auto out_conv4_3_L2Norm = L2norm(network, weightMap, *lr9->getOutput(0),"conv4_3_L2Norm.weight");
    //s_1
    auto out_conv5_3_L2Norm = L2norm(network, weightMap, *lr28->getOutput(0),"conv5_3_L2Norm.weight");

    //s_2
    auto lr33 = convRelu(network, weightMap, *lr31->getOutput(0), 1024, 1, 1, 0, 33);

    auto extras0 = convRelu_extras(network, weightMap, *lr33->getOutput(0), 256, 1, 1, 0, "extras.0.weight", "extras.0.bias");
    //s_3
    auto extras1 = convRelu_extras(network, weightMap, *extras0->getOutput(0), 512, 3, 2, 1, "extras.1.weight", "extras.1.bias");

    // ---- ARM heads: 12 location and 6 confidence channels per source ----
    auto arm_loc_0 = conv_permutation(network, weightMap, *out_conv4_3_L2Norm->getOutput(0), 12, 3, 1, 1, "arm_loc.0.weight", "arm_loc.0.bias");
    auto arm_loc_1 = conv_permutation(network, weightMap, *out_conv5_3_L2Norm->getOutput(0), 12, 3, 1, 1, "arm_loc.1.weight", "arm_loc.1.bias");
    auto arm_loc_2 = conv_permutation(network, weightMap, *lr33->getOutput(0), 12, 3, 1, 1, "arm_loc.2.weight", "arm_loc.2.bias");
    auto arm_loc_3 = conv_permutation(network, weightMap, *extras1->getOutput(0), 12, 3, 1, 1, "arm_loc.3.weight", "arm_loc.3.bias");

    auto arm_conf_0 = conv_permutation(network, weightMap, *out_conv4_3_L2Norm->getOutput(0), 6, 3, 1, 1, "arm_conf.0.weight", "arm_conf.0.bias");
    auto arm_conf_1 = conv_permutation(network, weightMap, *out_conv5_3_L2Norm->getOutput(0), 6, 3, 1, 1, "arm_conf.1.weight", "arm_conf.1.bias");
    auto arm_conf_2 = conv_permutation(network, weightMap, *lr33->getOutput(0), 6, 3, 1, 1, "arm_conf.2.weight", "arm_conf.2.bias");
    auto arm_conf_3 = conv_permutation(network, weightMap, *extras1->getOutput(0), 6, 3, 1, 1, "arm_conf.3.weight", "arm_conf.3.bias");

    auto arm_loc = cat_4_tensor(network, arm_loc_0, arm_loc_1, arm_loc_2, arm_loc_3);
    auto arm_conf = cat_4_tensor(network, arm_conf_0, arm_conf_1, arm_conf_2, arm_conf_3);

    // ---- top-down path: each level adds the 2x-upsampled coarser level ----
    auto ss_0 = convReluconv_tcb0(network, weightMap, *extras1->getOutput(0),  256, 3, 1, 1, 9, 11);
    auto ss_00 = ReluconvRelu_tcb2(network, weightMap, *ss_0->getOutput(0),  256, 3, 1, 1, 10);
    auto ss_1 = convReluconv_tcb0(network, weightMap, *lr33->getOutput(0),  256, 3, 1, 1, 6, 8);

    IDeconvolutionLayer* tcb1_2 = network->addDeconvolutionNd(*ss_00->getOutput(0), 256, DimsHW{2, 2}, weightMap["tcb1.2.weight"], weightMap["tcb1.2.bias"]);  //nn.ConvTranspose2d(256, 256, 2, 2)
    // Validate the layer before it is dereferenced.
    assert(tcb1_2);
    tcb1_2->setStrideNd(DimsHW{2, 2});
    auto ss_1_add = network->addElementWise(*ss_1->getOutput(0), *tcb1_2->getOutput(0), ElementWiseOperation::kSUM);
    assert(ss_1_add);
    auto ss_11 = ReluconvRelu_tcb2(network, weightMap, *ss_1_add->getOutput(0),  256, 3, 1, 1, 7);

    auto ss_2 = convReluconv_tcb0(network, weightMap, *out_conv5_3_L2Norm->getOutput(0),  256, 3, 1, 1, 3, 5);
    IDeconvolutionLayer* tcb1_1 = network->addDeconvolutionNd(*ss_11->getOutput(0), 256, DimsHW{2, 2}, weightMap["tcb1.1.weight"], weightMap["tcb1.1.bias"]);  //nn.ConvTranspose2d(256, 256, 2, 2)
    assert(tcb1_1);
    tcb1_1->setStrideNd(DimsHW{2, 2});
    auto ss_2_add = network->addElementWise(*ss_2->getOutput(0), *tcb1_1->getOutput(0), ElementWiseOperation::kSUM);
    assert(ss_2_add);
    auto ss_22 = ReluconvRelu_tcb2(network, weightMap, *ss_2_add->getOutput(0),  256, 3, 1, 1, 4);

    auto ss_3 = convReluconv_tcb0(network, weightMap, *out_conv4_3_L2Norm->getOutput(0),  256, 3, 1, 1, 0, 2);
    IDeconvolutionLayer* tcb1_0 = network->addDeconvolutionNd(*ss_22->getOutput(0), 256, DimsHW{2, 2}, weightMap["tcb1.0.weight"], weightMap["tcb1.0.bias"]);  //nn.ConvTranspose2d(256, 256, 2, 2)
    assert(tcb1_0);
    tcb1_0->setStrideNd(DimsHW{2, 2});
    auto ss_3_add = network->addElementWise(*ss_3->getOutput(0), *tcb1_0->getOutput(0), ElementWiseOperation::kSUM);
    assert(ss_3_add);
    auto ss_33 = ReluconvRelu_tcb2(network, weightMap, *ss_3_add->getOutput(0),  256, 3, 1, 1, 1);

    // ---- ODM heads: 12 location and 3 * num_class confidence channels ----
    auto odm_loc_0 = conv_permutation(network, weightMap, *ss_33->getOutput(0), 12, 3, 1, 1, "odm_loc.0.weight", "odm_loc.0.bias");
    auto odm_loc_1 = conv_permutation(network, weightMap, *ss_22->getOutput(0), 12, 3, 1, 1, "odm_loc.1.weight", "odm_loc.1.bias");
    auto odm_loc_2 = conv_permutation(network, weightMap, *ss_11->getOutput(0), 12, 3, 1, 1, "odm_loc.2.weight", "odm_loc.2.bias");
    auto odm_loc_3 = conv_permutation(network, weightMap, *ss_00->getOutput(0), 12, 3, 1, 1, "odm_loc.3.weight", "odm_loc.3.bias");

    auto odm_conf_0 = conv_permutation(network, weightMap, *ss_33->getOutput(0), 3 * num_class, 3, 1, 1, "odm_conf.0.weight", "odm_conf.0.bias");
    auto odm_conf_1 = conv_permutation(network, weightMap, *ss_22->getOutput(0), 3 * num_class, 3, 1, 1, "odm_conf.1.weight", "odm_conf.1.bias");
    auto odm_conf_2 = conv_permutation(network, weightMap, *ss_11->getOutput(0), 3 * num_class, 3, 1, 1, "odm_conf.2.weight", "odm_conf.2.bias");
    auto odm_conf_3 = conv_permutation(network, weightMap, *ss_00->getOutput(0), 3 * num_class, 3, 1, 1, "odm_conf.3.weight", "odm_conf.3.bias");

    auto odm_loc = cat_4_tensor(network, odm_loc_0, odm_loc_1, odm_loc_2, odm_loc_3);
    auto odm_conf = cat_4_tensor(network, odm_conf_0, odm_conf_1, odm_conf_2, odm_conf_3);

    // ---- outputs ----
    // arm_loc: 25500 values
    Dims dim = arm_loc->getOutput(0)->getDimensions();
    std::cout <<"debug  arm_loc dim==" << dim.d[0] << " " << dim.d[1] << " " << dim.d[2] << " " << dim.d[3] << std::endl;
    arm_loc->getOutput(0)->setName(OUTPUT_BLOB_NAME_arm_loc);
    network->markOutput(*arm_loc->getOutput(0));

    // arm_conf: 12750 values, softmax over 2 classes
    auto arm_conf_111 = reshapeSoftmax(network, *arm_conf->getOutput(0), 2);
    Dims dim2 = arm_conf_111->getOutput(0)->getDimensions();
    std::cout <<"debug  arm_conf dim==" << dim2.d[0] << " " << dim2.d[1] << " " << dim2.d[2] << " " << dim2.d[3] << std::endl;
    arm_conf_111->getOutput(0)->setName(OUTPUT_BLOB_NAME_arm_conf);
    network->markOutput(*arm_conf_111->getOutput(0));

    // odm_loc: 25500 values
    Dims dim3 = odm_loc->getOutput(0)->getDimensions();
    std::cout <<"debug  odm_loc dim==" << dim3.d[0] << " " << dim3.d[1] << " " << dim3.d[2] << " " << dim3.d[3] << std::endl;
    odm_loc->getOutput(0)->setName(OUTPUT_BLOB_NAME_odm_loc);
    network->markOutput(*odm_loc->getOutput(0));

    // odm_conf: 159375 values when num_class == 25. The softmax width follows
    // num_class so it stays in step with the 3 * num_class channels produced
    // by the odm_conf convolutions above (it used to be a literal 25).
    // Note: the dimensions printed below are those of the tensor *before* the reshape.
    Dims dim4 = odm_conf->getOutput(0)->getDimensions();
    odm_conf = reshapeSoftmax(network, *odm_conf->getOutput(0), num_class);
    std::cout <<"debug  odm_conf dim==" << dim4.d[0] << " " << dim4.d[1] << " " << dim4.d[2] << " " << dim4.d[3] << std::endl;
    odm_conf->getOutput(0)->setName(OUTPUT_BLOB_NAME_odm_conf);
    network->markOutput(*odm_conf->getOutput(0));

    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB

#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    // NOTE(review): the calibrator is never deleted; it has to stay alive until
    // the build below has finished — confirm ownership before adding a delete.
    Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only report success when an engine was actually produced.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }
    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the network with the TensorRT API and serializes the resulting engine.
//
// maxBatchSize : maximum batch size forwarded to createEngine.
// modelStream  : out-parameter; receives the serialized engine. The caller
//                owns it and must call destroy() on it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down; the builder config was previously leaked.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Generates the prior (anchor) boxes for a 320x320 input.
//
// For each of the four feature maps (40, 20, 10 and 5 cells per side) every
// cell contributes three boxes in (cx, cy, w, h) form, normalized by the image
// size: one square box of side s_k, and two boxes with aspect ratio `ar` and
// 1/ar. The values are clamped to [0, 1].
//
// Returns a CUDA tensor of shape [num_priors, 4] and dtype float64
// (num_priors is 6375 for the settings below).
torch::Tensor PriorBox()
{
    const int image_size = 320;
    const std::vector<int> feature_maps = {40,20,10,5};
    const std::vector<int> steps = {8,16,32,64};
    const std::vector<int> min_sizes = {32,64,128,256};
    const std::vector<int> aspect_ratios = {2,2,2,2};

    std::vector<float> mean;
    for(size_t k=0;k<feature_maps.size();k++)
    {
        const int f = feature_maps[k];
        // These values only depend on the feature-map level, not on the cell.
        const float f_k = image_size * 1.0 / steps[k];
        const float s_k = min_sizes[k] * 1.0 / image_size;
        const float ar = aspect_ratios[k];
        const auto ar_root = sqrt(ar);

        for(int i=0;i<f;i++)
        {
            for(int j=0;j<f;j++)
            {
                const float cx = (j + 0.5) / f_k;
                const float cy = (i + 0.5) / f_k;

                // square box
                mean.push_back(cx);
                mean.push_back(cy);
                mean.push_back(s_k);
                mean.push_back(s_k);

                // wide box
                mean.push_back(cx);
                mean.push_back(cy);
                mean.push_back(s_k * 1.0 * ar_root);
                mean.push_back(s_k * 1.0 / ar_root);

                // tall box
                mean.push_back(cx);
                mean.push_back(cy);
                mean.push_back(s_k * 1.0 / ar_root);
                mean.push_back(s_k * 1.0 * ar_root);
            }
        }
    }

    // Derive the row count from the generated data instead of hard-coding 6375,
    // so that it cannot drift away from the tables above.
    const int64_t m_prior_size = static_cast<int64_t>(mean.size() / 4);

    // from_blob does not take ownership of `mean`; .cuda() copies the values to
    // the device before `mean` goes out of scope.
    torch::Tensor m_prior = torch::from_blob(mean.data(),{m_prior_size,4}).cuda();
    m_prior = m_prior.clamp(0,1);
    return m_prior.toType(torch::kFloat64);
}


// Decodes location regressions against prior boxes.
//
// _loc      : [N, 4] regression values; columns 0-1 are center offsets and
//             columns 2-3 are log-scale size offsets.
// _prior    : [N, 4] prior boxes in (cx, cy, w, h) form.
// b_form_pt : when true the result is converted to corner form
//             (x_min, y_min, x_max, y_max); otherwise it stays (cx, cy, w, h).
// Uses the variances 0.1 (centers) and 0.2 (sizes).
torch::Tensor decode(const torch::Tensor _loc,torch::Tensor _prior,bool b_form_pt = false)
{
    const std::vector<float> variance({0.1,0.2});
    const torch::Tensor xy_cols = torch::tensor({0,1}).cuda().to(torch::kLong);
    const torch::Tensor wh_cols = torch::tensor({2,3}).cuda().to(torch::kLong);

    const torch::Tensor prior_xy = _prior.index_select(1,xy_cols);
    const torch::Tensor prior_wh = _prior.index_select(1,wh_cols);

    // center = prior_center + loc_xy * variance[0] * prior_size
    const torch::Tensor centers = prior_xy + _loc.index_select(1,xy_cols).mul(variance[0]) * prior_wh;
    // size = prior_size * exp(loc_wh * variance[1])
    const torch::Tensor sizes = prior_wh * torch::exp(_loc.index_select(1,wh_cols) * variance[1]);

    if(!b_form_pt)
    {
        return torch::cat({centers,sizes},1);
    }

    // Corner form: top-left = center - size / 2, bottom-right = top-left + size.
    const torch::Tensor top_left = centers - sizes.div(2);
    const torch::Tensor bottom_right = top_left + sizes;
    return torch::cat({top_left,bottom_right},1);
}

// Converts boxes from corner form (x_min, y_min, x_max, y_max) to
// center form (cx, cy, w, h). `retv` is indexed as an [N, 4] tensor and the
// result has the same layout.
torch::Tensor center(torch::Tensor retv)
{
    // Extracts one column as an [N, 1] tensor.
    const auto column = [&retv](int64_t c) { return retv.select(1,c).unsqueeze(1); };

    const torch::Tensor x_min = column(0);
    const torch::Tensor y_min = column(1);
    const torch::Tensor x_max = column(2);
    const torch::Tensor y_max = column(3);

    return torch::cat({(x_min+x_max).div(2),(y_min+y_max).div(2),x_max-x_min,y_max-y_min},1);
}

// Greedy non-maximum suppression.
//
// boxes   : [N, 4] tensor; the columns are read as x1, y1, x2, y2.
// scores  : [N] tensor with one score per box.
// keep    : output; reset to a zero-filled int64 tensor of length N on the
//           device of `scores`. Its first `count` entries receive the indices
//           of the kept boxes, in order of decreasing score.
// count   : output; number of valid entries in `keep`.
// overlap : a remaining box is kept as a candidate only while its IoU with
//           the box just selected is strictly below this value.
// top_k   : only the `top_k` highest-scoring boxes are considered.
// Returns false when `boxes` is empty, true otherwise.
bool nms(const torch::Tensor& boxes, const torch::Tensor& scores, torch::Tensor &keep, int &count,float overlap, int top_k)
{
    count =0;
    keep = torch::zeros({scores.size(0)}).to(torch::kLong).to(scores.device());
    if(0 == boxes.numel())
    {
        return false;
    }

    // Per-box corner coordinates and areas.
    torch::Tensor x1 = boxes.select(1,0).clone();
    torch::Tensor y1 = boxes.select(1,1).clone();
    torch::Tensor x2 = boxes.select(1,2).clone();
    torch::Tensor y2 = boxes.select(1,3).clone();
    torch::Tensor area = (x2-x1)*(y2-y1);
    //    std::cout<<area<<std::endl;

    // Ascending sort (descending flag is 0), so the best score is at the end of idx.
    std::tuple<torch::Tensor,torch::Tensor> sort_ret = torch::sort(scores.unsqueeze(1), 0, 0);
    torch::Tensor v = std::get<0>(sort_ret).squeeze(1).to(scores.device());
    torch::Tensor idx = std::get<1>(sort_ret).squeeze(1).to(scores.device());

    int num_ = idx.size(0);
    if(num_ > top_k) // python: idx = idx[-top_k:]  (keep the top_k best candidates)
    {
        idx = idx.slice(0,num_-top_k,num_).clone();
    }
    torch::Tensor xx1,yy1,xx2,yy2,w,h;
    while(idx.numel() > 0)
    {
        // Select the highest-scoring remaining candidate.
        auto i = idx[-1];
        keep[count] = i;
        count += 1;
        if(1 == idx.size(0))
        {
            break;
        }
        // Drop the selected candidate from the working set.
        idx = idx.slice(0,0,idx.size(0)-1).clone();

        xx1 = x1.index_select(0,idx);
        yy1 = y1.index_select(0,idx);
        xx2 = x2.index_select(0,idx);
        yy2 = y2.index_select(0,idx);

        // Clamp the remaining boxes to box i, giving the intersection rectangle.
        xx1 = xx1.clamp(x1[i].item().toFloat(),INT_MAX*1.0);
        yy1 = yy1.clamp(y1[i].item().toFloat(),INT_MAX*1.0);
        xx2 = xx2.clamp(INT_MIN*1.0,x2[i].item().toFloat());
        yy2 = yy2.clamp(INT_MIN*1.0,y2[i].item().toFloat());

        w = xx2 - xx1;
        h = yy2 - yy1;

        // Non-overlapping boxes produce negative extents; treat them as zero.
        w = w.clamp(0,INT_MAX);
        h = h.clamp(0,INT_MAX);

        torch::Tensor inter = w * h;
        torch::Tensor rem_areas = area.index_select(0,idx);

        // IoU = intersection / (area_a + area_b - intersection)
        torch::Tensor union_ = (rem_areas - inter) + area[i];
        torch::Tensor Iou = inter * 1.0 / union_;
        torch::Tensor index_small = Iou < overlap;
        auto mask_idx = torch::nonzero(index_small).squeeze();
        // python: idx = idx[IoU.le(overlap)]  (note: this code uses a strict '<')
        idx = idx.index_select(0,mask_idx);
    }
    return true;
}

// Runs one image through the engine and post-processes the four outputs.
//
// context    : execution context of the deserialized engine.
// buffers    : device buffers indexed by binding index (1 input + 4 outputs).
// stream     : CUDA stream used for the input copy and the enqueue call.
// input      : host pointer to 3 * INPUT_H * INPUT_W floats.
// detections : output; cleared first, then filled with one 7-element row per
//              detection: {image_id(0), label, score, xmin, ymin, xmax, ymax}.
//              Coordinates are not scaled to pixels here.
//
// Post-processing: ARM outputs refine the priors, ODM outputs are decoded
// against the refined priors, anchors whose ARM object score is not above
// 0.01 are zeroed, and per-class NMS is applied to every class except 0.
void doInference(IExecutionContext& context, void* buffers[], cudaStream_t &stream, float* input, std::vector<std::vector<float>> &detections) {
    auto start_infer = std::chrono::system_clock::now();
    detections.clear();
    int batchSize = 1;
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 5);

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex_arm_loc = engine.getBindingIndex(OUTPUT_BLOB_NAME_arm_loc);
    const int outputIndex_arm_conf = engine.getBindingIndex(OUTPUT_BLOB_NAME_arm_conf);
    const int outputIndex_odm_loc = engine.getBindingIndex(OUTPUT_BLOB_NAME_odm_loc);
    const int outputIndex_odm_conf = engine.getBindingIndex(OUTPUT_BLOB_NAME_odm_conf);

    // DMA input batch data to device and run inference on the batch.
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr))
    {
        // Do not post-process stale buffers when the launch failed.
        std::cerr << "context.enqueue failed" << std::endl;
        return;
    }
    cudaDeviceSynchronize();
    auto end_infer = std::chrono::system_clock::now();
    double during_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_infer - start_infer).count();
    std::cout <<"time consume context.enqueue===" <<  during_time << "ms" << std::endl;

    auto start_houchuli = std::chrono::system_clock::now();
    const int class_count = num_class;
    torch::Tensor m_prior = PriorBox();
    const int m_prior_size = static_cast<int>(m_prior.size(0));

    // The engine outputs live in device memory, so the blobs must be declared
    // as CUDA tensors. Wrapping a device pointer as a CPU tensor and then
    // calling .cuda() asks for a host-to-device copy from a device address.
    // toType() produces an independent float64 copy of each buffer.
    const auto device_float = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    torch::Tensor arm_loc = torch::from_blob(buffers[outputIndex_arm_loc],{m_prior_size,4},device_float).toType(torch::kFloat64).unsqueeze(0);
    torch::Tensor arm_conf = torch::from_blob(buffers[outputIndex_arm_conf],{m_prior_size,2},device_float).toType(torch::kFloat64).unsqueeze(0);
    torch::Tensor odm_loc = torch::from_blob(buffers[outputIndex_odm_loc],{m_prior_size,4},device_float).toType(torch::kFloat64).unsqueeze(0);
    torch::Tensor odm_conf = torch::from_blob(buffers[outputIndex_odm_conf],{m_prior_size,class_count},device_float).toType(torch::kFloat64).unsqueeze(0);

    // Zero the class scores of anchors whose ARM object score is too low.
    float obj_threshed = 0.01;
    torch::Tensor arm_object_conf = arm_conf.squeeze(0).select(1,1);
    torch::Tensor object_index = arm_object_conf > obj_threshed;
    object_index=object_index.unsqueeze(1);

    torch::Tensor object_index_1 = object_index.expand_as(odm_conf.squeeze(0)).toType(torch::kFloat64);
    auto filter_odm_conf = odm_conf.squeeze(0).toType(torch::kFloat64) * object_index_1;
    torch::Tensor conf_preds_ = filter_odm_conf.clone().toType(torch::kFloat64);
    // [class, prior] layout so that conf_preds[i] is the score row of class i.
    torch::Tensor conf_preds = conf_preds_.transpose(1,0).toType(torch::kFloat64);

    // Two-step decoding: priors -> ARM-refined anchors -> final boxes (corner form).
    torch::Tensor default_m = decode(arm_loc[0],m_prior);
    bool b_form_pt = true;
    torch::Tensor decode_boxes_m = decode(odm_loc[0],default_m,b_form_pt);// [num_priors, 4]

    float conf_thresh = 0.01;
    float mask_thresh = 0.01;

    torch::Tensor result_out;
    // Class 0 is skipped.
    for(int i=1;i<class_count;i++)
    {
        torch::Tensor c_mask_m = conf_preds[i] > mask_thresh;
        torch::Tensor nonzero_index = torch::nonzero(c_mask_m);
        torch::Tensor  score_m = torch::index_select(conf_preds[i],0,nonzero_index.squeeze(1));
        torch::Tensor  boxes_m = torch::index_select(decode_boxes_m,0,nonzero_index.squeeze(1));

        torch::Tensor keep;
        int count = 0;
        float overlap = 0.45;
        int top_k=1000;
        nms(boxes_m, score_m, keep, count, overlap, top_k);
        if(0 == count) { continue; }

        keep = keep.slice(0,0,count).clone();
        torch::Tensor score_my = score_m.index_select(0,keep);
        torch::Tensor boxes_my = boxes_m.index_select(0,keep);

        // keep is ordered by decreasing score, so element 0 is the best one.
        if(score_my[0].item().toFloat() < conf_thresh)
        {
            continue;
        }
        torch::Tensor label_tensor = torch::full_like(score_my.unsqueeze(1),i);
        torch::Tensor result_ = torch::cat({boxes_my.toType(torch::kFloat64),score_my.unsqueeze(1).toType(torch::kFloat64),label_tensor.toType(torch::kFloat64)},1);
        if(0 == result_out.numel())
        {
            result_out = result_.clone();
        }else
        {
            result_out = torch::cat({result_out,result_},0);//Splicing by line
        }
    }
    if(0 == result_out.numel()) { std::cout<<"libtorch refinedet obj_small: nothing detect!"<<std::endl; return ;}
    result_out =result_out.cpu();

    // x1,y1,x2,y2,score,id
    auto result_data = result_out.accessor<double, 2>();
    for(int i=0;i<result_data.size(0);i++)
    {
        float score = result_data[i][4];
        float x1 = result_data[i][0];
        float y1 = result_data[i][1];
        float x2 = result_data[i][2];
        float y2 = result_data[i][3];
        int id_label = result_data[i][5];

        std::vector<float> v_detections;
        v_detections.push_back(0); //image_id
        v_detections.push_back(id_label); //label
        v_detections.push_back(score); //score
        v_detections.push_back(x1); //xmin
        v_detections.push_back(y1); //ymin
        v_detections.push_back(x2); //xmax
        v_detections.push_back(y2); //ymax
        detections.push_back(v_detections);
    }
    cudaDeviceSynchronize();
    auto end_houchuli = std::chrono::system_clock::now();
    double during_time_houchuli = std::chrono::duration_cast<std::chrono::milliseconds>(end_houchuli - start_houchuli).count();
    std::cout <<"time consume houchuli===" <<  during_time_houchuli << "ms" << std::endl;
}

// Converts an image into the planar float layout expected by the network.
//
// m_src : source image (1-channel images are expanded to BGR; otherwise the
//         pixel data is read as 3 interleaved 8-bit channels in B, G, R order).
// data  : output buffer of at least 3 * INPUT_H * INPUT_W floats, written as
//         three consecutive planes R, G, B with the means 123, 117 and 104
//         subtracted respectively.
void base_transform(const cv::Mat &m_src,float *data)
{
    cv::Mat image;
    cv::resize(m_src,image,cv::Size(INPUT_W,INPUT_H));
    if(1 == image.channels()) { cv::cvtColor(image,image,CV_GRAY2BGR); }

    const int plane = INPUT_H * INPUT_W;
    for(int i=0;i<INPUT_H;i++)
    {
        const uchar* img_data = image.ptr<uchar>(i); //Get the first address of the row pointer
        for(int j=0;j<INPUT_W;j++)
        {
            // A row holds INPUT_W pixels, so the row stride is INPUT_W.
            // (The previous INPUT_H stride was only correct for square inputs.)
            const int offset = i * INPUT_W + j;
            data[offset] = (float)(img_data[j*3 + 2] * 1.0 - 123.0);
            data[offset + plane] = (float)(img_data[j*3 + 1] * 1.0 - 117.0);
            data[offset + 2 * plane] = (float)(img_data[j*3 + 0] * 1.0 - 104.0);
        }
    }
}

int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

#ifdef SERIALIZE
    IHostMemory* modelStream{nullptr};
    APIToModel(1, &modelStream);
    assert(modelStream != nullptr);
    std::ofstream p(path_save_engine, std::ios::binary);
    if (!p) {
        std::cerr << "could not open plan output file" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    modelStream->destroy();
    return 0;

#elif defined  INFER
    std::ifstream file(path_engine, std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

#else
    std::cerr << "arguments not right!" << std::endl;
    std::cerr << "configure.h should difine SERIALIZE INFER" << std::endl;
    std::cerr << "please check!" << std::endl;
    return -1;
#endif

    std::vector<std::string> file_names;
    if (read_files_in_dir(p_dir_name, file_names) < 0) {
        std::cout << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // prepare input data ---------------------------
    float data[3 * INPUT_H * INPUT_W];

    IRuntime* runtime = createInferRuntime(gLogger);     //400M
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); //777M
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();  //971M
    assert(context != nullptr);
    delete[] trtModelStream;

    const int batchSize = 1;
    const int inputIndex=0;
    const int outputIndex_arm_loc=1;
    const int outputIndex_arm_conf=3;
    const int outputIndex_odm_loc=2;
    const int outputIndex_odm_conf=4;

    //Initialize cuda  memory:  input and 4 output memory
    void* buffers[5];
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[0], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));

    const int OUTPUT_SIZE_arm_loc = 25500; //40*40*12 + 20*20*12 + 10*10*12 + 5*5*12 = 25500   (Fixed value)
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex_arm_loc], batchSize * OUTPUT_SIZE_arm_loc * sizeof(float)));

    const int OUTPUT_SIZE_arm_conf = 12750; //40*40*6 + 20*20*6 + 10*10*6 + 5*5*6 = 12750 (Fixed value)
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex_arm_conf], batchSize * OUTPUT_SIZE_arm_conf * sizeof(float)));

    const int OUTPUT_SIZE_odm_loc = 25500; //40*40*12 + 20*20*12 + 10*10*12 + 5*5*12 = 25500   (Fixed value)
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex_odm_loc], batchSize * OUTPUT_SIZE_odm_loc * sizeof(float)));

    const int OUTPUT_SIZE_odm_conf = 159375; //40*40*(num_class*3) + 20*20**(num_class*3) + 10*10**(num_class*3) + 5*5**(num_class*3) //here num_class=25// =159375
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex_odm_conf], batchSize * OUTPUT_SIZE_odm_conf * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));


    int fcount = 0;
    auto t_0 = std::chrono::steady_clock::now();
    for (auto f: file_names) {
        fcount++;
        std::cout << "\n" << fcount << "  " << f << std::endl;
        std::cout << std::string(p_dir_name) + "/" + f << std::endl;

        auto start_read = std::chrono::system_clock::now();
        cv::Mat img = cv::imread(std::string(p_dir_name) + "/" + f);
        cudaDeviceSynchronize();
        auto end_read = std::chrono::system_clock::now();
        double during_time_read = std::chrono::duration_cast<std::chrono::milliseconds>(end_read - start_read).count();
        std::cout <<"time consume during_time_read===" <<  during_time_read << "ms" << std::endl;

        if (img.empty()) continue;

        auto start_yuchuli = std::chrono::system_clock::now();
        base_transform(img,data);
        cudaDeviceSynchronize();
        auto end_yuchuli = std::chrono::system_clock::now();
        double during_time_yuchuli = std::chrono::duration_cast<std::chrono::milliseconds>(end_yuchuli - start_yuchuli).count();
        std::cout <<"time consume base_transform===" <<  during_time_yuchuli << "ms" << std::endl;

        auto start_doInfer = std::chrono::system_clock::now();
        std::vector<std::vector<float>> detections;
        doInference(*context, buffers, stream, data, detections);
        cudaDeviceSynchronize();
        auto end_doInfer = std::chrono::system_clock::now();
        double during_doinfer = std::chrono::duration_cast<std::chrono::milliseconds>(end_doInfer - start_doInfer).count();
        std::cout <<"time consume doInference===" <<  during_doinfer << "ms" << std::endl;

        /* Print the detection results. */
        for (size_t i = 0; i < detections.size(); ++i)
        {
            const std::vector<float> &d = detections[i];

            CHECK_EQ(d.size(), 7);
            const float score = d[2];

            int label = int(d[1]);
            if (label >= num_class || label < 0)
            {
                std::cout << "label_Error!" << std::endl;
                continue;
            }
            if(score < TH)
            {
                continue;
            }
            cv::Rect r;
            r.x = d[3] * img.cols;
            r.y = d[4] * img.rows;
            r.width = d[5] * img.cols - r.x;
            r.height = d[6] * img.rows - r.y;

            RoiCorrect(img, r);
            if(T_show)
            {
                cv::rectangle(img,r,cv::Scalar(255,0,0),2);
            }
            if (T_show == 0)
            {
                std::string name_1 = f.substr(0,f.size()-4);
                std::string path_txt = save_path_txt + name_1 + ".txt";
                std::ofstream fout(path_txt);
                fout << label_map[label] << " " << score << " " << r.x << " " << r.y << " " << r.x + r.width
                     << " " << r.y + r.height << std::endl; //使用自己的label
            }
        }
        if(T_show)
        {
            cv::namedWindow("show",0);
            cv::imshow("show",img);
            cv::waitKey(0);
        }
    }
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex_arm_loc]));
    CUDA_CHECK(cudaFree(buffers[outputIndex_arm_conf]));

    CUDA_CHECK(cudaFree(buffers[outputIndex_odm_loc]));
    CUDA_CHECK(cudaFree(buffers[outputIndex_odm_conf]));

    cudaDeviceSynchronize();
    auto ttt = std::chrono::duration_cast<std::chrono::milliseconds>
            (std::chrono::steady_clock::now() - t_0).count();
    std::cout << "all consume time="<<ttt <<"ms"<<std::endl;
    std::cout << "-----------end-----------------------"<<std::endl;

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}


================================================
FILE: refinedet/utils.h
================================================
#ifndef __TRT_UTILS_H_
#define __TRT_UTILS_H_

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include <cudnn.h>
#include <dirent.h>
#include <opencv2/opencv.hpp>

#ifndef CUDA_CHECK

// Evaluates a CUDA runtime call; when it does not return cudaSuccess, prints
// the numeric error code, its description and the source location to stderr
// and then triggers assert(0).
// Note: assert is compiled out when NDEBUG is defined, in which case the error
// is only reported and execution continues.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = (callstr);                                                    \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                  \
            assert(0);                                                                         \
        }                                                                                      \
    }

#endif

namespace Tn
{
    // Copies the object representation of `val` into `buffer` and advances
    // `buffer` by sizeof(T).
    // std::memcpy is used instead of a reinterpret_cast store so that the
    // destination does not have to be aligned for T. T is expected to be
    // trivially copyable.
    template<typename T>
    void write(char*& buffer, const T& val)
    {
        std::memcpy(buffer, &val, sizeof(T));
        buffer += sizeof(T);
    }

    // Copies sizeof(T) bytes from `buffer` into `val` and advances `buffer`
    // by sizeof(T). Safe for unaligned sources; T is expected to be trivially
    // copyable.
    template<typename T>
    void read(const char*& buffer, T& val)
    {
        std::memcpy(&val, buffer, sizeof(T));
        buffer += sizeof(T);
    }
}

// Lists the entries of a directory.
//
// p_dir_name : path of the directory to scan.
// file_names : output; the name of every entry except "." and ".." is
//              appended (names only, without the directory prefix).
// Returns 0 on success, or -1 when the directory cannot be opened (in which
// case file_names is left untouched).
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const bool is_self = (strcmp(entry->d_name, ".") == 0);
        const bool is_parent = (strcmp(entry->d_name, "..") == 0);
        if (is_self || is_parent) {
            continue;
        }
        // Store the bare entry name; callers prepend the directory themselves.
        file_names.emplace_back(entry->d_name);
    }

    closedir(dir_handle);
    return 0;
}

#endif


================================================
FILE: repvgg/CMakeLists.txt
================================================
# Build script for the RepVGG TensorRT sample; produces one executable, `repvgg`.
cmake_minimum_required(VERSION 2.6)

project(repvgg)

# Compile as C++11 (also requested through CMAKE_CXX_STANDARD below).
add_definitions(-std=c++11)

# NOTE(review): CMake documents option() as option(<variable> "<help text>" [value]);
# as written, OFF sits in the help-text position - confirm this is intended.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is fixed to Debug.
set(CMAKE_BUILD_TYPE Debug)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# The sample is a single translation unit linked against TensorRT (nvinfer)
# and the CUDA runtime (cudart).
add_executable(repvgg ${PROJECT_SOURCE_DIR}/repvgg.cpp)
target_link_libraries(repvgg nvinfer)
target_link_libraries(repvgg cudart)


# Extra compiler flags: optimisation level 2 and pthread support.
add_definitions(-O2 -pthread)


================================================
FILE: repvgg/README.md
================================================
# RepVGG

RepVGG models from
"RepVGG: Making VGG-style ConvNets Great Again" <https://arxiv.org/pdf/2101.03697.pdf>

For the Pytorch implementation, you can refer to [DingXiaoH/RepVGG](https://github.com/DingXiaoH/RepVGG)

# How to run

1. generate wts file.

```
git clone https://github.com/DingXiaoH/RepVGG.git
cd RepVGG
```

You may convert a trained model into the inference-time structure with

```
python convert.py [weights file of the training-time model to load] [path to save] -a [model name]
```

For example,

```
python convert.py RepVGG-B2-train.pth RepVGG-B2-deploy.pth -a RepVGG-B2
```

Then copy `gen_wts.py` to `RepVGG` and generate .wts file, for example

```
python gen_wts.py -w RepVGG-B2-deploy.pth -s RepVGG-B2.wts
```

2. build and run
```
cd tensorrtx/repvgg

mkdir build

cd build

cmake ..

make

sudo ./repvgg -s RepVGG-B2  // serialize model to plan file i.e. 'RepVGG-B2.engine'
sudo ./repvgg -d RepVGG-B2  // deserialize plan file and run inference
```


================================================
FILE: repvgg/gen_wts.py
================================================
import argparse
import struct

import torch


def main(args):
    """Dump a PyTorch state dict to a plain-text ``.wts`` file.

    Args:
        args: namespace with two attributes:
            ``weight``    - path of the checkpoint passed to ``torch.load``;
                            it must deserialize to a mapping of name -> tensor.
            ``save_path`` - path of the text file to write.

    Output format:
        The first line holds the number of tensors. Every following line is
        ``<name> <element count>`` followed by one big-endian float32 per
        element, each written as 8 hex digits and separated by spaces.
    """
    # map_location="cpu" lets a checkpoint saved from a GPU be converted on a
    # machine without CUDA; the values written below are unaffected.
    state_dict = torch.load(args.weight, map_location="cpu")
    with open(args.save_path, "w") as f:
        f.write("{}\n".format(len(state_dict.keys())))
        for k, v in state_dict.items():
            # Flatten to 1-D so that every element is emitted in storage order.
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {} ".format(k, len(vr)))
            for vv in vr:
                f.write(" ")
                # ">f" = big-endian IEEE-754 single precision.
                f.write(struct.pack(">f", float(vv)).hex())
            f.write("\n")


if __name__ == "__main__":
    # Command-line entry point: both options are mandatory string paths.
    cli = argparse.ArgumentParser()
    for short_flag, long_flag, help_text in (
        ("-w", "--weight", "RepVGG model weight path"),
        ("-s", "--save_path", "generated wts path"),
    ):
        cli.add_argument(short_flag, long_flag, type=str, required=True, help=help_text)
    args = cli.parse_args()
    main(args)

================================================
FILE: repvgg/logging.h
================================================
#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <iostream>

// Logger for TensorRT info/warning/errors
// Minimal nvinfer1::ILogger implementation that prints to std::cerr.
//
// Messages whose severity value is greater than `reportableSeverity` are
// dropped; everything else is written as "<LEVEL>: <msg>".
class Logger : public nvinfer1::ILogger
{
public:
    // Default: report kINFO and everything more severe.
    Logger() : Logger(Severity::kINFO) {}

    Logger(Severity severity) : reportableSeverity(severity) {}

    void log(Severity severity, const char *msg) override
    {
        // suppress messages with severity enum value greater than the reportable
        if (severity > reportableSeverity)
            return;

        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            std::cerr << "INTERNAL_ERROR: ";
            break;
        case Severity::kERROR:
            std::cerr << "ERROR: ";
            break;
        case Severity::kWARNING:
            std::cerr << "WARNING: ";
            break;
        case Severity::kINFO:
            std::cerr << "INFO: ";
            break;
        case Severity::kVERBOSE:
            // Previously fell through to "UNKNOWN: " when a Logger was
            // constructed with Severity::kVERBOSE.
            std::cerr << "VERBOSE: ";
            break;
        default:
            std::cerr << "UNKNOWN: ";
            break;
        }
        std::cerr << msg << std::endl;
    }

    // Threshold used by log(); the constructors always overwrite the default
    // member initializer below.
    Severity reportableSeverity{Severity::kWARNING};
};

#endif // TENSORRT_LOGGING_H


================================================
FILE: repvgg/repvgg.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>
#include <algorithm>

// CHECK(status): evaluates `status` exactly once; when the result is non-zero
// it prints "Cuda failure: <value>" to std::cerr and calls abort().
// Wrapped in do { ... } while (0) so that it behaves like a single statement.
#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
        {                                                      \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)

// Static description of the supported networks and of the input/output blobs.

// Batch size handed to APIToModel() when the engine is built.
#define MAX_BATCH_SIZE 1
// Running block indices (counted from 1 in createEngine(), incremented once per
// block in makeStage()) whose convolution uses the model's group count instead of 1.
const std::vector<int> groupwise_layers{2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26};
// Model name -> group count applied to the blocks listed in groupwise_layers.
const std::map<std::string, int> groupwise_counts = {
    {"RepVGG-A0", 1},
    {"RepVGG-A1", 1},
    {"RepVGG-A2", 1},
    {"RepVGG-B0", 1},
    {"RepVGG-B1", 1},
    {"RepVGG-B1g2", 2},
    {"RepVGG-B1g4", 4},
    {"RepVGG-B2", 1},
    {"RepVGG-B2g2", 2},
    {"RepVGG-B2g4", 4},
    {"RepVGG-B3", 1},
    {"RepVGG-B3g2", 2},
    {"RepVGG-B3g4", 4}};
// Model name -> number of blocks in stage1..stage4 (in that order).
const std::map<std::string, std::vector<int>> num_blocks = {
    {"RepVGG-A0", {2, 4, 14, 1}},
    {"RepVGG-A1", {2, 4, 14, 1}},
    {"RepVGG-A2", {2, 4, 14, 1}},
    {"RepVGG-B0", {4, 6, 16, 1}},
    {"RepVGG-B1", {4, 6, 16, 1}},
    {"RepVGG-B1g2", {4, 6, 16, 1}},
    {"RepVGG-B1g4", {4, 6, 16, 1}},
    {"RepVGG-B2", {4, 6, 16, 1}},
    {"RepVGG-B2g2", {4, 6, 16, 1}},
    {"RepVGG-B2g4", {4, 6, 16, 1}},
    {"RepVGG-B3", {4, 6, 16, 1}},
    {"RepVGG-B3g2", {4, 6, 16, 1}},
    {"RepVGG-B3g4", {4, 6, 16, 1}}};
// Model name -> multipliers applied in createEngine() to the base channel
// counts 64, 128, 256 and 512 of stage1..stage4.
const std::map<std::string, std::vector<float>> width_multiplier = {
    {"RepVGG-A0", {0.75, 0.75, 0.75, 2.5}},
    {"RepVGG-A1", {1, 1, 1, 2.5}},
    {"RepVGG-A2", {1.5, 1.5, 1.5, 2.75}},
    {"RepVGG-B0", {1, 1, 1, 2.5}},
    {"RepVGG-B1", {2, 2, 2, 4}},
    {"RepVGG-B1g2", {2, 2, 2, 4}},
    {"RepVGG-B1g4", {2, 2, 2, 4}},
    {"RepVGG-B2", {2.5, 2.5, 2.5, 5}},
    {"RepVGG-B2g2", {2.5, 2.5, 2.5, 5}},
    {"RepVGG-B2g4", {2.5, 2.5, 2.5, 5}},
    {"RepVGG-B3", {3, 3, 3, 5}},
    {"RepVGG-B3g2", {3, 3, 3, 5}},
    {"RepVGG-B3g4", {3, 3, 3, 5}}};

// The network input is a 3 x INPUT_H x INPUT_W float tensor; the output
// buffer holds OUTPUT_SIZE floats per batch item.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;

// Tensor names used to register and later look up the engine bindings.
const char *INPUT_BLOB_NAME = "data";
const char *OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

static Logger gLogger;

// Load weights from a text .wts file.
// The file starts with the number of blobs, followed by one entry per blob:
//   <name> <count> <count 32-bit words in hex>
// Every blob is returned as a kFLOAT Weights whose `values` buffer is
// allocated with malloc(); the caller owns these buffers and must free() them.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(input && "Malformed blob header in weight file.");
        wt.type = DataType::kFLOAT;

        // Load blob. One 32-bit word per value: the former sizeof(val) was the
        // size of the pointer and allocated twice the needed memory on 64-bit builds.
        uint32_t *val = reinterpret_cast<uint32_t *>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        // A truncated or corrupt file would otherwise leave uninitialised values behind.
        assert(input && "Unexpected end of weight file.");
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }
    std::cout << "Finished Load weights: " << file << std::endl;
    return weightMap;
}

// Adds one deploy-mode RepVGG block to `network`: a 3x3 convolution (padding 1,
// the given stride and group count) followed by ReLU.
//
// The kernel and bias are looked up in `weightMap` under
// lname + "rbr_reparam.weight" and lname + "rbr_reparam.bias".
// `inch` is not used by the body; the signature is kept for the callers.
// Returns the ReLU layer.
IActivationLayer *RepVGGBlock(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, int inch, int outch, int stride, int groups, std::string lname)
{
    IConvolutionLayer *conv = network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + "rbr_reparam.weight"], weightMap[lname + "rbr_reparam.bias"]);
    // Validate before the first dereference (the check used to come after the setters).
    assert(conv);
    conv->setStrideNd(DimsHW{stride, stride});
    conv->setPaddingNd(DimsHW{1, 1});
    conv->setNbGroups(groups);
    IActivationLayer *relu = network->addActivation(*conv->getOutput(0), ActivationType::kRELU);
    assert(relu);
    return relu;
}

// Chains `blocks` RepVGG blocks named lname + "<i>." onto `input`.
//
// The first block uses `stride`, every following block uses stride 1.
// `layer_idx` is the running block index across all stages: a block whose
// index appears in groupwise_layers gets `group_count` groups, all others 1.
// `layer_idx` is incremented once per block.
// Returns the last block's ReLU layer, or nullptr when `blocks` is not positive.
IActivationLayer *makeStage(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, int &layer_idx, const int group_count, ITensor &input, int inch, int outch, int stride, int blocks, std::string lname)
{
    // Initialised so that an empty stage yields nullptr instead of an indeterminate pointer.
    IActivationLayer *layer = nullptr;
    ITensor *current = &input;
    int current_inch = inch;
    for (int i = 0; i < blocks; ++i)
    {
        int group = 1;
        if (std::find(groupwise_layers.begin(), groupwise_layers.end(), layer_idx) != groupwise_layers.end())
            group = group_count;

        // Honour the `stride` argument for the first block (it used to be a
        // hard-coded 2, which is what every caller in this file passes).
        const int block_stride = (i == 0) ? stride : 1;
        layer = RepVGGBlock(network, weightMap, *current, current_inch, outch, block_stride, group, lname + std::to_string(i) + ".");

        current = layer->getOutput(0);
        current_inch = outch;
        layer_idx += 1;
    }
    return layer;
}
// Create the engine using only the network-definition API (no parser).
//
// `netName` selects the entries of num_blocks / width_multiplier /
// groupwise_counts (std::map::at throws std::out_of_range for an unknown name)
// and the weight file "../<netName>.wts".
// Returns the built engine, or nullptr when the build fails; the caller owns it.
ICudaEngine *createEngine(std::string netName, unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt)
{
    const std::vector<int> blocks = num_blocks.at(netName);
    const std::vector<float> widths = width_multiplier.at(netName);
    const int group_count = groupwise_counts.at(netName);
    // Running block index consumed by makeStage(); stage0 is not counted.
    int layer_idx = 1;

    std::map<std::string, Weights> weightMap = loadWeights("../" + netName + ".wts");

    INetworkDefinition *network = builder->createNetworkV2(0U);
    assert(network);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    // The stem is capped at 64 channels even when the width multiplier exceeds 1.
    int in_planes = std::min(64, int(64 * widths[0]));
    auto stage0 = RepVGGBlock(network, weightMap, *data, 3, in_planes, 2, 1, "stage0.");
    assert(stage0);

    auto stage1 = makeStage(network, weightMap, layer_idx, group_count, *stage0->getOutput(0), in_planes, int(64 * widths[0]), 2, blocks[0], "stage1.");
    assert(stage1);
    auto stage2 = makeStage(network, weightMap, layer_idx, group_count, *stage1->getOutput(0), int(64 * widths[0]), int(128 * widths[1]), 2, blocks[1], "stage2.");
    assert(stage2);
    auto stage3 = makeStage(network, weightMap, layer_idx, group_count, *stage2->getOutput(0), int(128 * widths[1]), int(256 * widths[2]), 2, blocks[2], "stage3.");
    assert(stage3);
    auto stage4 = makeStage(network, weightMap, layer_idx, group_count, *stage3->getOutput(0), int(256 * widths[2]), int(512 * widths[3]), 2, blocks[3], "stage4.");
    assert(stage4);

    // 7x7 average pooling with stride 7 and no padding.
    IPoolingLayer *pool = network->addPoolingNd(*stage4->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7});
    // Validate before the first dereference (the check used to come after the setters).
    assert(pool);
    pool->setStrideNd(DimsHW{7, 7});
    pool->setPaddingNd(DimsHW{0, 0});

    IFullyConnectedLayer *linear = network->addFullyConnected(*pool->getOutput(0), 1000, weightMap["linear.weight"], weightMap["linear.bias"]);
    assert(linear);

    linear->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*linear->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);  // 1 MiB of builder scratch memory
    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto &mem : weightMap)
    {
        free((void *)(mem.second.values));
    }
    return engine;
}

// Builds the network called `netName` and stores the serialized engine in
// `*modelStream`. The caller owns the returned IHostMemory and must destroy() it.
void APIToModel(std::string netName, unsigned int maxBatchSize, IHostMemory **modelStream)
{
    // Create builder
    IBuilder *builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig *config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine *engine = createEngine(netName, maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. Objects created by the builder (here: the config)
    // are released before the builder itself.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one inference on `context`.
//
// `input`  - host buffer with batchSize * 3 * INPUT_H * INPUT_W floats.
// `output` - host buffer receiving batchSize * OUTPUT_SIZE floats.
// Device buffers and the CUDA stream are created and released on every call.
// Any CUDA failure, or a failed enqueue, aborts the process.
void doInference(IExecutionContext &context, float *input, float *output, int batchSize)
{
    const ICudaEngine &engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly ICudaEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void *buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than ICudaEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr))
    {
        // Without this check a failed launch would silently return stale output.
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work surface here, so the result must be checked.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char **argv)
{
    if (argc != 3)
    {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./repvgg -s  RepVGG-B1g2 // serialize model to plan file" << std::endl;
        std::cerr << "./repvgg -d  RepVGG-B1g2 // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (std::string(argv[1]) == "-s")
    {
        std::string netName = std::string(argv[2]);
        IHostMemory *modelStream{nullptr};
        APIToModel(netName, MAX_BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p(netName + ".engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char *>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 1;
    }
    else if (std::string(argv[1]) == "-d")
    {
        std::string netName = std::string(argv[2]);
        std::ifstream file(netName + ".engine", std::ios::binary);
        if (file.good())
        {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    }
    else
    {
        return -1;
    }

    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime *runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext *context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++)
    {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[i] << ", ";
    }
    std::cout << std::endl;
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[OUTPUT_SIZE - 10 + i] << ", ";
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: resnet/CMakeLists.txt
================================================
# Build script for the ResNet-family TensorRT samples. Five executables are
# produced: resnet18, resnet34, resnet50, resnext50 and wideresnet50.
cmake_minimum_required(VERSION 2.6)

project(resnet)

# Compile as C++11 (also requested through CMAKE_CXX_STANDARD below).
add_definitions(-std=c++11)

# NOTE(review): CMake documents option() as option(<variable> "<help text>" [value]);
# as written, OFF sits in the help-text position - confirm this is intended.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is fixed to Debug.
set(CMAKE_BUILD_TYPE Debug)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# Each sample is a single translation unit linked against TensorRT (nvinfer)
# and the CUDA runtime (cudart).
add_executable(resnet18 ${PROJECT_SOURCE_DIR}/resnet18.cpp)
target_link_libraries(resnet18 nvinfer)
target_link_libraries(resnet18 cudart)

add_executable(resnet34 ${PROJECT_SOURCE_DIR}/resnet34.cpp)
target_link_libraries(resnet34 nvinfer)
target_link_libraries(resnet34 cudart)

add_executable(resnet50 ${PROJECT_SOURCE_DIR}/resnet50.cpp)
target_link_libraries(resnet50 nvinfer)
target_link_libraries(resnet50 cudart)

# The executable is named resnext50 although the source file is resnext50_32x4d.cpp.
add_executable(resnext50 ${PROJECT_SOURCE_DIR}/resnext50_32x4d.cpp)
target_link_libraries(resnext50 nvinfer)
target_link_libraries(resnext50 cudart)

add_executable(wideresnet50 ${PROJECT_SOURCE_DIR}/wideresnet50.cpp)
target_link_libraries(wideresnet50 nvinfer)
target_link_libraries(wideresnet50 cudart)

# Extra compiler flags: optimisation level 2 and pthread support.
add_definitions(-O2 -pthread)


================================================
FILE: resnet/README.md
================================================
# resnet

ResNet-18 and ResNet-50 model from "Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>

For the Pytorch implementation, you can refer to [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet)

Wide Resnet-50 model from "Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf> . For the Pytorch implementation, you can refer to [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz)

Following tricks are used in this resnet, nothing special, residual connection and batchnorm are used.

- Batchnorm layer, implemented with scale layer.

## TensorRT C++ API

```
// 1a. generate resnet18.wts,resnet34.wts or resnet50.wts from [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet)

// 1b. generate wide_resnet50.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz)

// 2. put resnet18.wts, resnet34.wts or resnet50.wts into tensorrtx/resnet

// 3. build and run

cd tensorrtx/resnet

mkdir build

cd build

cmake ..

make

sudo ./resnet18 -s   // serialize model to plan file i.e. 'resnet18.engine'
sudo ./resnet18 -d   // deserialize plan file and run inference

or
sudo ./resnet34 -s   // serialize model to plan file i.e. 'resnet34.engine'
sudo ./resnet34 -d   // deserialize plan file and run inference

or

sudo ./resnet50 -s   // serialize model to plan file i.e. 'resnet50.engine'
sudo ./resnet50 -d   // deserialize plan file and run inference

or

sudo ./resnext50 -s   // serialize model to plan file i.e. 'resnext50.engine'
sudo ./resnext50 -d   // deserialize plan file and run inference

or

sudo ./wideresnet50 -s   // serialize model to plan file i.e. 'wide_resnet50.engine'
sudo ./wideresnet50 -d   // deserialize plan file and run inference


// 4. see if the output is same as 
- [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet) - for resnet18, resnet34, resnet50, resnext50
- [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz) - for wide_resnet50
```

### TensorRT Python API

```
# 1a. generate resnet50.wts from [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet)
# 1b. generate wide_resnet50.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz)

# 2. put resnet50.wts or wide_resnet50.wts into tensorrtx/resnet

# 3. install Python dependencies (tensorrt/pycuda/numpy)

cd tensorrtx/resnet

python resnet50.py -s   // serialize model to plan file i.e. 'resnet50.engine'
python resnet50.py -d   // deserialize plan file and run inference

or 

python wide_resnet50.py -s   // serialize model to plan file i.e. 'wide_resnet50.engine'
python wide_resnet50.py -d   // deserialize plan file and run inference

# 4. see if the output is same as 
- pytorchx/resnet - for resnet50
- BlueMirrors/torchtrtz - for wide_resnet50
```


================================================
FILE: resnet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized or destroyed, forwards its contents to an output stream,
//!  prefixed with a fixed string. A timestamp is written to std::cout in front of every forwarded chunk.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream Stream receiving the prefix and the buffered text
    //! \param prefix Text inserted in front of every forwarded chunk
    //! \param shouldLog When false, putOutput() does nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, the prefix and the logging flag of \p other.
    //! Previously only the stream was taken over, which left mShouldLog uninitialized
    //! (read later by putOutput()) and dropped the prefix.
    //! Text already buffered in \p other stays with \p other.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] " to std::cout, then the prefix and the buffered text to the
    //! target stream, clears the buffer and flushes the target stream. No-op when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            // NOTE(review): the timestamp always goes to std::cout, even when the target stream is a
            // different one.
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables forwarding of buffered text.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer of a LogStreamConsumer.
//!  Being a base class, it is constructed ahead of the std::ostream base of LogStreamConsumer,
//!  so the buffer exists by the time its address is handed to std::ostream.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }

protected:
    //! Buffer shared with the deriving stream class.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for log messages of one fixed severity, usable with the usual << syntax.
//!  The base classes must stay in the order LogStreamConsumerBase, std::ostream:
//!  LogStreamConsumerBase constructs the LogStreamConsumerBuffer member first, and only then is the
//!  buffer's address given to std::ostream. Swapping the bases would pass the address of a buffer
//!  that has not been constructed yet.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a stream for messages of level \p severity.
    //!  Output is produced only when \p severity is less than or equal to \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the freshly built buffer to this stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the severity and logging flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the freshly built buffer to this stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this stream logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! Severities from kINFO upwards (by enum value) go to std::cout, the others to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Short tag placed in front of every message of the given severity.
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Construct a logger with the given reportable severity
    //!
    //! \param severity Initial value of the reportable severity (defaults to Severity::kWARNING)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Builds a temporary LogStreamConsumer from the current reportable severity and the message
    //! severity, then streams "[TRT] ", the message text and std::endl into it.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity Severity of the message being logged
    //! \param msg The message text (copied into a std::string before being streamed)
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        //! Move construction is the only public way to create a TestAtom.
        TestAtom(TestAtom&&) = default;

    private:
        // Logger is the only class allowed to construct a TestAtom and read its fields.
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set to true by Logger::reportTestStart()
        std::string mName;    //!< Test name printed in the result line
        std::string mCmdline; //!< Command line printed after the test name
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // The RUNNING line is emitted before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //!
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //!
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //!
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on a flag
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] pass true selects reportPass(), false selects reportFail()
    //!
    //! \return the value returned by the selected report function
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Accessor for the current reportable severity
    //!
    //! \return the value last set by the constructor or setReportableSeverity()
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    //! An unrecognised severity trips assert(0) and yields an empty string.
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    //! An unrecognised result trips assert(0) and yields an empty string.
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    //! std::cout is chosen when severity >= Severity::kINFO, std::cerr otherwise.
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Writes one line of the form "&&&& <RESULT> <name> # <cmdline>" to the stream that
    //! severityOstream() selects for Severity::kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with a single space and no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; //!< Threshold passed to every LogStreamConsumer created by log()
};

namespace
{

//!
//! \brief Build a LogStreamConsumer that logs at severity kVERBOSE
//!
//! The consumer is constructed from the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kVERBOSE);
}

//!
//! \brief Build a LogStreamConsumer that logs at severity kINFO
//!
//! The consumer is constructed from the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINFO);
}

//!
//! \brief Build a LogStreamConsumer that logs at severity kWARNING
//!
//! The consumer is constructed from the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kWARNING);
}

//!
//! \brief Build a LogStreamConsumer that logs at severity kERROR
//!
//! The consumer is constructed from the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kERROR);
}

//!
//! \brief Build a LogStreamConsumer that logs at severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! The consumer is constructed from the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: resnet/resnet18.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>

// Evaluate `status` exactly once. If the result is non-zero, print
// "Cuda failure: <value>" to std::cerr and abort() the process.
// The do { ... } while (0) wrapper makes an invocation followed by a semicolon
// behave as a single statement (e.g. inside an unbraced if/else).
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
// INPUT_H / INPUT_W: spatial size of the 3-channel input tensor; used for the
// addInput() dimensions and for sizing the host/device input buffers.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// Number of floats produced per batch item; sizes the host/device output buffers.
static const int OUTPUT_SIZE = 1000;

// Tensor names used both when building the network and when looking up
// binding indices at inference time.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance handed to createInferBuilder() and createInferRuntime().
static Logger gLogger;

// Load weights from a .wts text file into a name -> Weights map.
//
// File layout, as consumed below (whitespace delimited):
//   <count>
//   [name] [size] <data x size in hex>      (repeated <count> times)
//
// Every blob is tagged DataType::kFLOAT. Each blob's buffer comes from
// malloc(); ownership passes to the caller, who must free() every
// Weights::values pointer in the returned map.
//
// Failure to open the file, or a non-positive blob count, is reported through
// assert() only.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs. Initialised so that a failed read can never
    // leave an indeterminate loop counter behind when asserts are compiled out.
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // "> 0" (rather than "!= 0") also terminates on a negative count.
    while (count-- > 0)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The buffer is sized by the element type (sizeof(*val));
        // the previous sizeof(val) measured the pointer itself, which
        // over-allocates whenever pointers are wider than 32 bits.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Append a per-channel scale layer that applies the batch-norm parameters
// stored in weightMap under lname + ".weight" / ".bias" / ".running_mean" /
// ".running_var".
//
// For each channel c:
//   scale[c] = weight[c] / sqrt(running_var[c] + eps)
//   shift[c] = bias[c] - running_mean[c] * weight[c] / sqrt(running_var[c] + eps)
//   power[c] = 1
//
// The three malloc()ed arrays are also registered in weightMap under
// lname + ".scale" / ".shift" / ".power", so whoever frees the map's buffers
// releases them too. Returns the new layer (asserted non-null).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* bnWeight = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* bnBias = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* runMean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* runVar = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    // One buffer per Weights argument of the scale layer.
    float* scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    for (int c = 0; c < len; ++c) {
        scaleVals[c] = bnWeight[c] / sqrt(runVar[c] + eps);
        shiftVals[c] = bnBias[c] - runMean[c] * bnWeight[c] / sqrt(runVar[c] + eps);
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, len};
    Weights shift{DataType::kFLOAT, shiftVals, len};
    Weights power{DataType::kFLOAT, powerVals, len};

    // Keep the buffers reachable through the map so they can be freed later.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Append one residual block to the network and return its final ReLU layer.
//
// Main path : 3x3 conv (given stride, padding 1) -> BN -> ReLU -> 3x3 conv (padding 1) -> BN
// Shortcut  : the block input itself when inch == outch; otherwise a
//             1x1 conv (given stride) followed by BN ("downsample.0" / "downsample.1").
// The two paths are summed element-wise and passed through a ReLU.
//
// lname is the weight-name prefix of the block, e.g. "layer1.0.".
IActivationLayer* basicBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    // --- main path ---
    IConvolutionLayer* convA = network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + "conv1.weight"], noBias);
    assert(convA);
    convA->setStrideNd(DimsHW{stride, stride});
    convA->setPaddingNd(DimsHW{1, 1});

    IScaleLayer* bnA = addBatchNorm2d(network, weightMap, *convA->getOutput(0), lname + "bn1", 1e-5);

    IActivationLayer* reluA = network->addActivation(*bnA->getOutput(0), ActivationType::kRELU);
    assert(reluA);

    IConvolutionLayer* convB = network->addConvolutionNd(*reluA->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], noBias);
    assert(convB);
    convB->setPaddingNd(DimsHW{1, 1});

    IScaleLayer* bnB = addBatchNorm2d(network, weightMap, *convB->getOutput(0), lname + "bn2", 1e-5);

    // --- shortcut path: identity unless the channel count changes ---
    ITensor* shortcut = &input;
    if (inch != outch) {
        IConvolutionLayer* convProj = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], noBias);
        assert(convProj);
        convProj->setStrideNd(DimsHW{stride, stride});
        IScaleLayer* bnProj = addBatchNorm2d(network, weightMap, *convProj->getOutput(0), lname + "downsample.1", 1e-5);
        shortcut = bnProj->getOutput(0);
    }

    // --- merge ---
    IElementWiseLayer* sum = network->addElementWise(*shortcut, *bnB->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* reluOut = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(reluOut);
    return reluOut;
}

// Create the engine using only the API and not any parser.
//
// Builds the ResNet-18 layer graph from the weights in "../resnet18.wts":
// 7x7 stem conv -> BN -> ReLU -> 3x3 max pool -> eight basic blocks ->
// 7x7 average pool -> 1000-way fully connected output named OUTPUT_BLOB_NAME.
//
// maxBatchSize is forwarded to builder->setMaxBatchSize(); dt is the data type
// of the input tensor. The network definition and all host weight buffers are
// released before returning. The engine pointer is returned without a null
// check here.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../resnet18.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // Stem
    IConvolutionLayer* stemConv = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts);
    assert(stemConv);
    stemConv->setStrideNd(DimsHW{2, 2});
    stemConv->setPaddingNd(DimsHW{3, 3});

    IScaleLayer* stemBn = addBatchNorm2d(network, weightMap, *stemConv->getOutput(0), "bn1", 1e-5);

    IActivationLayer* stemRelu = network->addActivation(*stemBn->getOutput(0), ActivationType::kRELU);
    assert(stemRelu);

    IPoolingLayer* stemPool = network->addPoolingNd(*stemRelu->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(stemPool);
    stemPool->setStrideNd(DimsHW{2, 2});
    stemPool->setPaddingNd(DimsHW{1, 1});

    // Residual stages, applied in order; each entry feeds the next one.
    struct BlockSpec
    {
        int inch;
        int outch;
        int stride;
        const char* prefix;
    };
    const BlockSpec blocks[] = {
        {64, 64, 1, "layer1.0."},
        {64, 64, 1, "layer1.1."},
        {64, 128, 2, "layer2.0."},
        {128, 128, 1, "layer2.1."},
        {128, 256, 2, "layer3.0."},
        {256, 256, 1, "layer3.1."},
        {256, 512, 2, "layer4.0."},
        {512, 512, 1, "layer4.1."},
    };
    ITensor* features = stemPool->getOutput(0);
    for (const BlockSpec& spec : blocks)
    {
        IActivationLayer* blockOut = basicBlock(network, weightMap, *features, spec.inch, spec.outch, spec.stride, spec.prefix);
        features = blockOut->getOutput(0);
    }

    // Head
    IPoolingLayer* avgPool = network->addPoolingNd(*features, PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{1, 1});

    IFullyConnectedLayer* fc = network->addFullyConnected(*avgPool->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(fc);

    ITensor* logits = fc->getOutput(0);
    logits->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*logits);

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory (loaded weights plus the batch-norm buffers added to the map)
    for (auto& entry : weightMap)
    {
        free(const_cast<void*>(entry.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

// Run one synchronous inference pass.
//
// input  : host buffer of batchSize * 3 * INPUT_H * INPUT_W floats
// output : host buffer of batchSize * OUTPUT_SIZE floats, filled on return
//
// Device buffers and a CUDA stream are created and released on every call.
// Any failing CUDA call, and a failed enqueue, terminates the process via
// abort() instead of being silently ignored.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Compute byte counts once, in size_t, so the products are not formed in int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr))
    {
        // Previously the result was dropped and stale output would be reported.
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work above surface here, so the status must be checked.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//
//   ./resnet18 -s   build the engine and write it to "resnet18.engine"
//   ./resnet18 -d   load "resnet18.engine", run 100 timed inferences on a
//                   constant input and print the first and last 10 outputs
//
// Returns 0 on success and -1 on a usage or file error.
int main(int argc, char** argv)
{
    const std::string mode = (argc == 2) ? std::string(argv[1]) : std::string();
    if (mode != "-s" && mode != "-d") {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./resnet18 -s   // serialize model to plan file" << std::endl;
        std::cerr << "./resnet18 -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (mode == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("resnet18.engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        const bool written = p.good();
        modelStream->destroy();
        if (!written)
        {
            std::cerr << "could not write plan output file" << std::endl;
            return -1;
        }
        // Successful serialization: report success to the caller (was 1).
        return 0;
    }

    // mode == "-d": read the whole plan file into memory
    {
        std::ifstream file("resnet18.engine", std::ios::binary);
        if (!file.good()) {
            // Without this check a null buffer was handed to the runtime below.
            std::cerr << "could not open resnet18.engine, run ./resnet18 -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        const std::streamoff planBytes = file.tellg();
        if (planBytes <= 0) {
            std::cerr << "resnet18.engine is empty or unreadable" << std::endl;
            return -1;
        }
        size = static_cast<size_t>(planBytes);
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        file.read(trtModelStream, size);
        if (!file) {
            std::cerr << "could not read resnet18.engine" << std::endl;
            delete[] trtModelStream;
            return -1;
        }
        file.close();
    }

    // Dummy input: every element is 1.0 (no image is loaded or preprocessed)
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference 100 times, printing the wall-clock time of each call
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the first 10 and the last 10 output values
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[i] << ", ";
    }
    std::cout << std::endl;
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[OUTPUT_SIZE - 10 + i] << ", ";
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: resnet/resnet34.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>

// Evaluate `status` exactly once. If the result is non-zero, print
// "Cuda failure: <value>" to std::cerr and abort() the process.
// The do { ... } while(0) wrapper makes an invocation followed by a semicolon
// behave as a single statement (e.g. inside an unbraced if/else).
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if(ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while(0)

// stuff we know about the network and the input/output blobs
// INPUT_H / INPUT_W: spatial size of the 3-channel input tensor; used for the
// addInput() dimensions and for sizing the device input buffer.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// Number of floats produced per batch item; sizes the device output buffer.
static const int OUTPUT_SIZE = 1000;

// Tensor names used both when building the network and when looking up
// binding indices at inference time.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance handed to createInferBuilder().
static Logger gLogger;

// Load weights from a .wts text file into a name -> Weights map.
//
// File layout, as consumed below (whitespace delimited):
//   <count>
//   [name] [size] <data x size in hex>      (repeated <count> times)
//
// Every blob is tagged DataType::kFLOAT. Each blob's buffer comes from
// malloc(); ownership passes to the caller, who must free() every
// Weights::values pointer in the returned map.
//
// Failure to open the file, or a non-positive blob count, is reported through
// assert() only.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs. Initialised so that a failed read can never
    // leave an indeterminate loop counter behind when asserts are compiled out.
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file");

    // "> 0" (rather than "!= 0") also terminates on a negative count.
    while (count-- > 0)
    {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The buffer is sized by the element type (sizeof(*val));
        // the previous sizeof(val) measured the pointer itself, which
        // over-allocates whenever pointers are wider than 32 bits.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Append a per-channel scale layer that applies the batch-norm parameters
// stored in weightMap under lname + ".weight" / ".bias" / ".running_mean" /
// ".running_var".
//
// For each channel c:
//   scale[c] = weight[c] / sqrt(running_var[c] + eps)
//   shift[c] = bias[c] - running_mean[c] * weight[c] / sqrt(running_var[c] + eps)
//   power[c] = 1
//
// The three malloc()ed arrays are also registered in weightMap under
// lname + ".scale" / ".shift" / ".power", so whoever frees the map's buffers
// releases them too. Returns the new layer (asserted non-null).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* bnWeight = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* bnBias = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* runMean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* runVar = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    // One buffer per Weights argument of the scale layer.
    float* scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    for (int c = 0; c < len; ++c) {
        scaleVals[c] = bnWeight[c] / sqrt(runVar[c] + eps);
        shiftVals[c] = bnBias[c] - runMean[c] * bnWeight[c] / sqrt(runVar[c] + eps);
        powerVals[c] = 1.0;
    }

    Weights scale{ DataType::kFLOAT, scaleVals, len };
    Weights shift{ DataType::kFLOAT, shiftVals, len };
    Weights power{ DataType::kFLOAT, powerVals, len };

    // Keep the buffers reachable through the map so they can be freed later.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Append one residual block to the network and return its final ReLU layer.
//
// Main path : 3x3 conv (given stride, padding 1) -> BN -> ReLU -> 3x3 conv (padding 1) -> BN
// Shortcut  : the block input itself when inch == outch; otherwise a
//             1x1 conv (given stride) followed by BN ("downsample.0" / "downsample.1").
// The two paths are summed element-wise and passed through a ReLU.
//
// lname is the weight-name prefix of the block, e.g. "layer1.0.".
IActivationLayer* basicBlock(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    Weights noBias{ DataType::kFLOAT, nullptr, 0 };

    // --- main path ---
    IConvolutionLayer* convA = network->addConvolutionNd(input, outch, DimsHW{ 3, 3 }, weightMap[lname + "conv1.weight"], noBias);
    assert(convA);
    convA->setStrideNd(DimsHW{ stride, stride });
    convA->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* bnA = addBatchNorm2d(network, weightMap, *convA->getOutput(0), lname + "bn1", 1e-5);

    IActivationLayer* reluA = network->addActivation(*bnA->getOutput(0), ActivationType::kRELU);
    assert(reluA);

    IConvolutionLayer* convB = network->addConvolutionNd(*reluA->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + "conv2.weight"], noBias);
    assert(convB);
    convB->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* bnB = addBatchNorm2d(network, weightMap, *convB->getOutput(0), lname + "bn2", 1e-5);

    // --- shortcut path: identity unless the channel count changes ---
    ITensor* shortcut = &input;
    if (inch != outch) {
        IConvolutionLayer* convProj = network->addConvolutionNd(input, outch, DimsHW{ 1, 1 }, weightMap[lname + "downsample.0.weight"], noBias);
        assert(convProj);
        convProj->setStrideNd(DimsHW{ stride, stride });
        IScaleLayer* bnProj = addBatchNorm2d(network, weightMap, *convProj->getOutput(0), lname + "downsample.1", 1e-5);
        shortcut = bnProj->getOutput(0);
    }

    // --- merge ---
    IElementWiseLayer* sum = network->addElementWise(*shortcut, *bnB->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* reluOut = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(reluOut);
    return reluOut;
}

// Create the engine using only the API and not any parser.
//
// Builds the ResNet-34 layer graph from the weights in "../resnet34.wts":
// 7x7 stem conv -> BN -> ReLU -> 3x3 max pool -> sixteen basic blocks ->
// 7x7 average pool -> 1000-way fully connected output named OUTPUT_BLOB_NAME.
//
// maxBatchSize is forwarded to builder->setMaxBatchSize(); dt is the data type
// of the input tensor. The network definition and all host weight buffers are
// released before returning. The engine pointer is returned without a null
// check here.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../resnet34.wts");
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    // Stem
    IConvolutionLayer* stemConv = network->addConvolutionNd(*data, 64, DimsHW{ 7, 7 }, weightMap["conv1.weight"], emptywts);
    assert(stemConv);
    stemConv->setStrideNd(DimsHW{ 2, 2 });
    stemConv->setPaddingNd(DimsHW{ 3, 3 });

    IScaleLayer* stemBn = addBatchNorm2d(network, weightMap, *stemConv->getOutput(0), "bn1", 1e-5);
    IActivationLayer* stemRelu = network->addActivation(*stemBn->getOutput(0), ActivationType::kRELU);
    assert(stemRelu);

    IPoolingLayer* stemPool = network->addPoolingNd(*stemRelu->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 });
    assert(stemPool);
    stemPool->setStrideNd(DimsHW{ 2, 2 });
    stemPool->setPaddingNd(DimsHW{ 1, 1 });

    // Residual stages, applied in order; each entry feeds the next one.
    struct BlockSpec
    {
        int inch;
        int outch;
        int stride;
        const char* prefix;
    };
    const BlockSpec blocks[] = {
        {64, 64, 1, "layer1.0."},
        {64, 64, 1, "layer1.1."},
        {64, 64, 1, "layer1.2."},
        {64, 128, 2, "layer2.0."},
        {128, 128, 1, "layer2.1."},
        {128, 128, 1, "layer2.2."},
        {128, 128, 1, "layer2.3."},
        {128, 256, 2, "layer3.0."},
        {256, 256, 1, "layer3.1."},
        {256, 256, 1, "layer3.2."},
        {256, 256, 1, "layer3.3."},
        {256, 256, 1, "layer3.4."},
        {256, 256, 1, "layer3.5."},
        {256, 512, 2, "layer4.0."},
        {512, 512, 1, "layer4.1."},
        {512, 512, 1, "layer4.2."},
    };
    ITensor* features = stemPool->getOutput(0);
    for (const BlockSpec& spec : blocks) {
        IActivationLayer* blockOut = basicBlock(network, weightMap, *features, spec.inch, spec.outch, spec.stride, spec.prefix);
        features = blockOut->getOutput(0);
    }

    // Head
    IPoolingLayer* avgPool = network->addPoolingNd(*features, PoolingType::kAVERAGE, DimsHW{ 7, 7 });
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{ 1, 1 });
    IFullyConnectedLayer* fc = network->addFullyConnected(*avgPool->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(fc);

    ITensor* logits = fc->getOutput(0);
    logits->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*logits);

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory (loaded weights plus the batch-norm buffers added to the map)
    for (auto& entry : weightMap) {
        free(const_cast<void*>(entry.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

/**
 * Runs one synchronous inference: copies `input` to the device, enqueues the
 * engine, copies the result into `output` and waits for the stream to finish.
 *
 * Device buffers and the CUDA stream are created and released on every call.
 * Any CUDA error or a failed enqueue aborts the process.
 *
 * @param context    execution context of an engine with exactly two bindings.
 * @param input      host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats.
 * @param output     host buffer receiving batchSize * OUTPUT_SIZE floats.
 * @param batchSize  number of samples passed to enqueue().
 */
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Guard the indexing of `buffers` below against an unknown tensor name.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    // Compute the byte counts once, in size_t, so the product is not evaluated in int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // Without this check a failed enqueue would silently yield stale output.
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

/**
 * Entry point.
 *   -s  build the engine via APIToModel() and write it to "resnet34.engine"
 *   -d  load "resnet34.engine", run doInference() 100 times on an all-ones
 *       input, print the time of each run and the first/last ten outputs
 *
 * Returns -1 on bad arguments or file errors, 0 after inference.
 */
int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./resnet34 -s   // serialize model to plan file" << std::endl;
        std::cerr << "./resnet34 -d    // desrialize plan file and run inference" << std::endl;
        return -1;
    }

    // Serialized engine bytes read from disk (only used with -d)
    char *trtModelStream{ nullptr };
    size_t size(0);

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{ nullptr };
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("resnet34.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        // NOTE(review): exit status 1 after a successful serialization is kept
        // as-is because scripts may depend on it; confirm whether 0 is intended.
        return 1;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file("resnet34.engine", std::ios::binary);
        if (!file.good()) {
            // Previously this fell through with a null buffer and size 0.
            std::cerr << "could not open plan file resnet34.engine, run with -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        const std::streamoff length = file.tellg();
        if (length <= 0) {
            std::cerr << "plan file resnet34.engine is empty or unreadable" << std::endl;
            return -1;
        }
        size = static_cast<size_t>(length);
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        file.read(trtModelStream, size);
        if (!file) {
            std::cerr << "could not read plan file resnet34.engine" << std::endl;
            delete[] trtModelStream;
            return -1;
        }
        file.close();
    } else {
        return -1;
    }

    // Dummy input: every element set to 1.0 (no real image is loaded)
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) {
        data[i] = 1.0;
    }

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference and report the wall-clock time of each call
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the first ten and the last ten output values
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[i] << ",";
    }
    std::cout << std::endl;
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[OUTPUT_SIZE - 10 + i] << ",";
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: resnet/resnet50.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>

// Evaluates `status` once; if the result is non-zero, prints it to stderr
// prefixed with "Cuda failure: " and aborts the process.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
// Input height/width: the network input is declared as {3, INPUT_H, INPUT_W}.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// Number of floats copied back per sample by doInference().
static const int OUTPUT_SIZE = 1000;

// Tensor names used both when building the network and when looking up
// binding indices at inference time.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance handed to createInferBuilder() and createInferRuntime().
static Logger gLogger;

// Load weights from a text weight file.
// The file starts with the number of blobs, followed by one entry per blob in
// a simple whitespace delimited format:
// [name] [size] <data x size in hex>
// Each hex word is stored as a raw 32-bit value; every blob is tagged kFLOAT.
// The returned map owns the value buffers (allocated with malloc); the caller
// is expected to free() them once they are no longer needed.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. Allocate by element size: the previous sizeof(val) was the
        // size of the pointer and over-allocated on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        // malloc(0) may legitimately return nullptr, hence the size == 0 escape.
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * Adds a per-channel scale layer that applies a folded batch normalization.
 *
 * Reads `<lname>.weight`, `.bias`, `.running_mean` and `.running_var` from
 * weightMap and derives, for every channel c:
 *   scale[c] = weight[c] / sqrt(var[c] + eps)
 *   shift[c] = bias[c] - mean[c] * weight[c] / sqrt(var[c] + eps)
 *   power[c] = 1
 * The three derived buffers are malloc'ed and stored in weightMap under
 * `<lname>.scale`, `.shift` and `.power`, so whoever frees the map frees them.
 *
 * @return the created scale layer (asserted non-null).
 */
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* const gammaVals = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* const betaVals = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* const meanVals = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const Weights& varBlob = weightMap[lname + ".running_var"];
    const float* const varVals = static_cast<const float*>(varBlob.values);
    const int len = varBlob.count;
    std::cout << "len " << len << std::endl;

    // One buffer per scale-layer parameter, each holding `len` floats.
    float* const scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* const shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* const powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    for (int c = 0; c < len; ++c) {
        // `auto` keeps whatever type sqrt() yields here, so the arithmetic is unchanged.
        const auto stddev = sqrt(varVals[c] + eps);
        scaleVals[c] = gammaVals[c] / stddev;
        shiftVals[c] = betaVals[c] - meanVals[c] * gammaVals[c] / stddev;
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, len};
    Weights shift{DataType::kFLOAT, shiftVals, len};
    Weights power{DataType::kFLOAT, powerVals, len};

    // Register the buffers in the map so they share the lifetime of the loaded weights.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* const bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

/**
 * Appends one bottleneck residual block to the network:
 *   1x1 conv -> BN -> ReLU -> 3x3 conv (stride, pad 1) -> BN -> ReLU
 *   -> 1x1 conv (outch * 4 maps) -> BN, summed with a shortcut, then ReLU.
 *
 * The shortcut is the block input itself, unless `stride != 1` or
 * `inch != outch * 4`, in which case it is a strided 1x1 convolution plus BN
 * using the `downsample.0` / `downsample.1` weights.
 *
 * @param inch    channel count of `input`; only used to choose the shortcut.
 * @param outch   channel count of the two inner convolutions.
 * @param stride  stride of the 3x3 convolution and of the projection shortcut.
 * @param lname   weight-name prefix, e.g. "layer1.0.".
 * @return the final ReLU layer of the block.
 */
IActivationLayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};
    const int expanded = outch * 4;
    const float bnEps = 1e-5;

    // 1x1 reduce
    IConvolutionLayer* reduce = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], noBias);
    assert(reduce);
    IScaleLayer* reduceBn = addBatchNorm2d(network, weightMap, *reduce->getOutput(0), lname + "bn1", bnEps);
    IActivationLayer* reduceAct = network->addActivation(*reduceBn->getOutput(0), ActivationType::kRELU);
    assert(reduceAct);

    // 3x3 spatial convolution; this is where the block's stride is applied
    IConvolutionLayer* spatial = network->addConvolutionNd(*reduceAct->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], noBias);
    assert(spatial);
    spatial->setStrideNd(DimsHW{stride, stride});
    spatial->setPaddingNd(DimsHW{1, 1});
    IScaleLayer* spatialBn = addBatchNorm2d(network, weightMap, *spatial->getOutput(0), lname + "bn2", bnEps);
    IActivationLayer* spatialAct = network->addActivation(*spatialBn->getOutput(0), ActivationType::kRELU);
    assert(spatialAct);

    // 1x1 expand
    IConvolutionLayer* expand = network->addConvolutionNd(*spatialAct->getOutput(0), expanded, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], noBias);
    assert(expand);
    IScaleLayer* expandBn = addBatchNorm2d(network, weightMap, *expand->getOutput(0), lname + "bn3", bnEps);

    // Shortcut branch: identity by default, projection when shapes would differ
    ITensor* shortcut = &input;
    if (stride != 1 || inch != expanded) {
        IConvolutionLayer* project = network->addConvolutionNd(input, expanded, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], noBias);
        assert(project);
        project->setStrideNd(DimsHW{stride, stride});

        IScaleLayer* projectBn = addBatchNorm2d(network, weightMap, *project->getOutput(0), lname + "downsample.1", bnEps);
        shortcut = projectBn->getOutput(0);
    }

    IElementWiseLayer* sum = network->addElementWise(*shortcut, *expandBn->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* blockOut = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(blockOut);
    return blockOut;
}

// Create the engine using only the layer API (no parser).
// Loads "../resnet50.wts", defines the stem, the sixteen bottleneck blocks,
// average pooling and the fully connected layer, then builds the engine.
// All host weight buffers are freed before returning; the result may be null
// if the build fails (callers check it).
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Input tensor of shape { 3, INPUT_H, INPUT_W } named INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../resnet50.wts");
    const Weights noBias{DataType::kFLOAT, nullptr, 0};

    // Stem: 7x7 convolution (stride 2, padding 3) -> BN -> ReLU
    IConvolutionLayer* stemConv = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], noBias);
    assert(stemConv);
    stemConv->setStrideNd(DimsHW{2, 2});
    stemConv->setPaddingNd(DimsHW{3, 3});

    IScaleLayer* stemBn = addBatchNorm2d(network, weightMap, *stemConv->getOutput(0), "bn1", 1e-5);

    IActivationLayer* stemAct = network->addActivation(*stemBn->getOutput(0), ActivationType::kRELU);
    assert(stemAct);

    // 3x3 max pooling with stride 2 and padding 1
    IPoolingLayer* stemPool = network->addPoolingNd(*stemAct->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(stemPool);
    stemPool->setStrideNd(DimsHW{2, 2});
    stemPool->setPaddingNd(DimsHW{1, 1});

    // Residual stages, listed in the order they are chained.
    struct BlockSpec {
        int inch;
        int outch;
        int stride;
        const char* prefix;
    };
    static const BlockSpec kBlocks[] = {
        {64, 64, 1, "layer1.0."},
        {256, 64, 1, "layer1.1."},
        {256, 64, 1, "layer1.2."},

        {256, 128, 2, "layer2.0."},
        {512, 128, 1, "layer2.1."},
        {512, 128, 1, "layer2.2."},
        {512, 128, 1, "layer2.3."},

        {512, 256, 2, "layer3.0."},
        {1024, 256, 1, "layer3.1."},
        {1024, 256, 1, "layer3.2."},
        {1024, 256, 1, "layer3.3."},
        {1024, 256, 1, "layer3.4."},
        {1024, 256, 1, "layer3.5."},

        {1024, 512, 2, "layer4.0."},
        {2048, 512, 1, "layer4.1."},
        {2048, 512, 1, "layer4.2."},
    };

    ITensor* features = stemPool->getOutput(0);
    for (const BlockSpec& spec : kBlocks) {
        IActivationLayer* blockOut = bottleneck(network, weightMap, *features, spec.inch, spec.outch, spec.stride, spec.prefix);
        features = blockOut->getOutput(0);
    }

    // 7x7 average pooling followed by the 1000-way fully connected layer
    IPoolingLayer* avgPool = network->addPoolingNd(*features, PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{1, 1});

    IFullyConnectedLayer* classifier = network->addFullyConnected(*avgPool->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(classifier);

    ITensor* logits = classifier->getOutput(0);
    logits->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*logits);

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // The network definition is not needed once the engine is built
    network->destroy();

    // Release host memory held by the weight map (loaded and derived buffers)
    for (auto& entry : weightMap)
    {
        free(const_cast<void*>(entry.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

/**
 * Runs one synchronous inference: copies `input` to the device, enqueues the
 * engine, copies the result into `output` and waits for the stream to finish.
 *
 * Device buffers and the CUDA stream are created and released on every call.
 * Any CUDA error or a failed enqueue aborts the process.
 *
 * @param context    execution context of an engine with exactly two bindings.
 * @param input      host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats.
 * @param output     host buffer receiving batchSize * OUTPUT_SIZE floats.
 * @param batchSize  number of samples passed to enqueue().
 */
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Guard the indexing of `buffers` below against an unknown tensor name.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    // Compute the byte counts once, in size_t, so the product is not evaluated in int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // Without this check a failed enqueue would silently yield stale output.
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

/**
 * Entry point.
 *   -s  build the engine via APIToModel() and write it to "resnet50.engine"
 *   -d  load "resnet50.engine", run doInference() 100 times on an all-ones
 *       input, print the time of each run and the first/last ten outputs
 *
 * Returns -1 on bad arguments or file errors, 0 after inference.
 */
int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./resnet -s   // serialize model to plan file" << std::endl;
        std::cerr << "./resnet -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Serialized engine bytes read from disk (only used with -d)
    char *trtModelStream{nullptr};
    size_t size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("resnet50.engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        // NOTE(review): exit status 1 after a successful serialization is kept
        // as-is because scripts may depend on it; confirm whether 0 is intended.
        return 1;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file("resnet50.engine", std::ios::binary);
        if (!file.good()) {
            // Previously this fell through with a null buffer and size 0.
            std::cerr << "could not open plan file resnet50.engine, run with -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        const std::streamoff length = file.tellg();
        if (length <= 0) {
            std::cerr << "plan file resnet50.engine is empty or unreadable" << std::endl;
            return -1;
        }
        size = static_cast<size_t>(length);
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        file.read(trtModelStream, size);
        if (!file) {
            std::cerr << "could not read plan file resnet50.engine" << std::endl;
            delete[] trtModelStream;
            return -1;
        }
        file.close();
    } else {
        return -1;
    }


    // Dummy input: every element set to 1.0 (no real image is loaded)
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference and report the wall-clock time of each call
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the first ten and the last ten output values
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[i] << ", ";
    }
    std::cout << std::endl;
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[OUTPUT_SIZE - 10 + i] << ", ";
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: resnet/resnet50.py
================================================
import argparse
import os
import struct
import sys

import numpy as np
import pycuda.autoinit  # noqa
import pycuda.driver as cuda
import tensorrt as trt

# Batch size used both when building the engine and when sizing the host input.
BATCH_SIZE = 1
# Input height/width: the network input is declared as (3, INPUT_H, INPUT_W).
INPUT_H = 224
INPUT_W = 224
# Number of outputs of the final fully connected layer / size of the host
# output buffer.
OUTPUT_SIZE = 1000
# Names given to the network input tensor and to the marked output tensor.
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"
# Epsilon passed to addBatchNorm2d for every batch-norm layer.
EPS = 1e-5

# Weight file read by create_engine and engine file written by APIToModel /
# read back in the `-d` branch.
WEIGHT_PATH = "./resnet50.wts"
ENGINE_PATH = "./resnet50.engine"

# Logger shared by the builder and the runtime.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)


def load_weights(file):
    """Parse a text weight file into a ``{name: float32 array}`` dict.

    The first line holds the number of blobs. Every following line is
    ``<name> <count> <hex> <hex> ...`` separated by single spaces, where each
    hex token is decoded as a big-endian 32-bit float.

    Each array has shape ``(count, 1)`` because every decoded value is kept as
    the 1-tuple returned by ``struct.unpack``.

    Raises ``AssertionError`` when the file is missing, when the blob count
    does not match the number of remaining lines, or when a line does not
    contain exactly ``count`` values.
    """
    print(f"Loading weights: {file}")

    assert os.path.exists(file), 'Unable to load weight file.'

    with open(file, "r") as handle:
        rows = [row.strip() for row in handle]

    expected = int(rows[0])
    assert expected == len(rows) - 1

    weight_map = {}
    for row in rows[1:expected + 1]:
        fields = row.split(" ")
        name = fields[0]
        declared = int(fields[1])
        assert declared + 2 == len(fields)
        # hex string -> 4 raw bytes -> big-endian float (kept as a 1-tuple)
        decoded = [
            struct.unpack(">f", bytes.fromhex(token)) for token in fields[2:]
        ]
        weight_map[name] = np.array(decoded, dtype=np.float32)

    return weight_map


def addBatchNorm2d(network, weight_map, input, layer_name, eps):
    """Add a per-channel scale layer that applies a folded batch norm.

    Looks up ``<layer_name>.weight``, ``.bias``, ``.running_mean`` and
    ``.running_var`` in ``weight_map`` and passes to ``network.add_scale``:

        scale = weight / sqrt(running_var + eps)
        shift = -running_mean / sqrt(running_var + eps) * weight + bias

    Returns whatever ``network.add_scale`` returns.
    """
    gamma, beta, mean, var = (weight_map[layer_name + suffix]
                              for suffix in (".weight", ".bias",
                                             ".running_mean", ".running_var"))
    std = np.sqrt(var + eps)

    return network.add_scale(input=input,
                             mode=trt.ScaleMode.CHANNEL,
                             shift=-mean / std * gamma + beta,
                             scale=gamma / std)


def bottleneck(network, weight_map, input, in_channels, out_channels, stride,
               layer_name):
    """Append one bottleneck residual block and return its final ReLU layer.

    Main branch: 1x1 conv -> BN -> ReLU -> 3x3 conv (``stride``, padding 1)
    -> BN -> ReLU -> 1x1 conv with ``out_channels * 4`` maps -> BN.

    Shortcut branch: the block input itself, unless ``stride != 1`` or
    ``in_channels != out_channels * 4``; then it is a strided 1x1 convolution
    plus BN using the ``downsample.0`` / ``downsample.1`` weights.

    ``layer_name`` is the weight-name prefix, e.g. ``"layer1.0."``.
    """
    expanded = out_channels * 4

    def conv_bn(tensor, maps, kernel, conv_key, bn_key, stride_hw=None,
                padding_hw=None):
        # Bias-free convolution followed by its folded batch norm.
        layer = network.add_convolution(input=tensor,
                                        num_output_maps=maps,
                                        kernel_shape=kernel,
                                        kernel=weight_map[layer_name +
                                                          conv_key],
                                        bias=trt.Weights())
        assert layer
        if stride_hw is not None:
            layer.stride = stride_hw
        if padding_hw is not None:
            layer.padding = padding_hw

        norm = addBatchNorm2d(network, weight_map, layer.get_output(0),
                              layer_name + bn_key, EPS)
        assert norm
        return norm

    def relu(layer):
        act = network.add_activation(layer.get_output(0),
                                     type=trt.ActivationType.RELU)
        assert act
        return act

    bn1 = conv_bn(input, out_channels, (1, 1), "conv1.weight", "bn1")
    relu1 = relu(bn1)

    bn2 = conv_bn(relu1.get_output(0), out_channels, (3, 3), "conv2.weight",
                  "bn2", stride_hw=(stride, stride), padding_hw=(1, 1))
    relu2 = relu(bn2)

    bn3 = conv_bn(relu2.get_output(0), expanded, (1, 1), "conv3.weight",
                  "bn3")

    # Identity shortcut unless the spatial size or channel count changes.
    shortcut = input
    if stride != 1 or in_channels != expanded:
        bn4 = conv_bn(input, expanded, (1, 1), "downsample.0.weight",
                      "downsample.1", stride_hw=(stride, stride))
        shortcut = bn4.get_output(0)

    ew1 = network.add_elementwise(shortcut, bn3.get_output(0),
                                  trt.ElementWiseOperation.SUM)
    assert ew1

    return relu(ew1)


def create_engine(maxBatchSize, builder, config, dt):
    """Define the ResNet-50 network layer by layer and build an engine.

    Weights are read from ``WEIGHT_PATH``. The network consists of the stem
    (7x7 conv, BN, ReLU, 3x3 max pool), sixteen bottleneck blocks, a 7x7
    average pool and a fully connected layer with ``OUTPUT_SIZE`` outputs
    whose output tensor is named ``OUTPUT_BLOB_NAME``.

    Returns the result of ``builder.build_engine(network, config)``.
    """
    weight_map = load_weights(WEIGHT_PATH)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    # Stem
    stem_conv = network.add_convolution(input=data,
                                        num_output_maps=64,
                                        kernel_shape=(7, 7),
                                        kernel=weight_map["conv1.weight"],
                                        bias=trt.Weights())
    assert stem_conv
    stem_conv.stride = (2, 2)
    stem_conv.padding = (3, 3)

    stem_bn = addBatchNorm2d(network, weight_map, stem_conv.get_output(0),
                             "bn1", EPS)
    assert stem_bn

    stem_relu = network.add_activation(stem_bn.get_output(0),
                                       type=trt.ActivationType.RELU)
    assert stem_relu

    stem_pool = network.add_pooling(input=stem_relu.get_output(0),
                                    window_size=trt.DimsHW(3, 3),
                                    type=trt.PoolingType.MAX)
    assert stem_pool
    stem_pool.stride = (2, 2)
    stem_pool.padding = (1, 1)

    # (in_channels, out_channels, stride, weight-name prefix), in chain order
    stages = (
        (64, 64, 1, "layer1.0."),
        (256, 64, 1, "layer1.1."),
        (256, 64, 1, "layer1.2."),
        (256, 128, 2, "layer2.0."),
        (512, 128, 1, "layer2.1."),
        (512, 128, 1, "layer2.2."),
        (512, 128, 1, "layer2.3."),
        (512, 256, 2, "layer3.0."),
        (1024, 256, 1, "layer3.1."),
        (1024, 256, 1, "layer3.2."),
        (1024, 256, 1, "layer3.3."),
        (1024, 256, 1, "layer3.4."),
        (1024, 256, 1, "layer3.5."),
        (1024, 512, 2, "layer4.0."),
        (2048, 512, 1, "layer4.1."),
        (2048, 512, 1, "layer4.2."),
    )
    x = stem_pool
    for in_ch, out_ch, block_stride, prefix in stages:
        x = bottleneck(network, weight_map, x.get_output(0), in_ch, out_ch,
                       block_stride, prefix)

    # Classification head
    avg_pool = network.add_pooling(x.get_output(0),
                                   window_size=trt.DimsHW(7, 7),
                                   type=trt.PoolingType.AVERAGE)
    assert avg_pool
    avg_pool.stride = (1, 1)

    classifier = network.add_fully_connected(input=avg_pool.get_output(0),
                                             num_outputs=OUTPUT_SIZE,
                                             kernel=weight_map['fc.weight'],
                                             bias=weight_map['fc.bias'])
    assert classifier

    logits = classifier.get_output(0)
    logits.name = OUTPUT_BLOB_NAME
    network.mark_output(logits)

    # Build engine
    # NOTE(review): the workspace size is set on the builder while the engine
    # is built from `config` -- confirm this is the intended knob.
    builder.max_batch_size = maxBatchSize
    builder.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)

    # Drop the local references to the network definition and the weights.
    del network, weight_map

    return engine


def APIToModel(maxBatchSize):
    """Build the engine with ``create_engine`` and write it to ENGINE_PATH.

    The serialized engine is written in binary mode; the local references to
    the engine and the builder are dropped afterwards.
    """
    trt_builder = trt.Builder(TRT_LOGGER)
    build_config = trt_builder.create_builder_config()

    built_engine = create_engine(maxBatchSize, trt_builder, build_config,
                                 trt.float32)
    assert built_engine

    with open(ENGINE_PATH, "wb") as plan_file:
        plan_file.write(built_engine.serialize())

    del built_engine, trt_builder


def doInference(context, host_in, host_out, batchSize):
    """Run one inference and block until the result is in ``host_out``.

    Args:
        context: execution context whose engine has exactly two bindings.
        host_in: host array copied to the device as the network input.
        host_out: host array that receives the network output.
        batchSize: batch size passed to ``context.execute_async``.

    Device buffers are sized from ``host_in.nbytes`` / ``host_out.nbytes`` and
    a new stream is created on every call.
    """
    engine = context.engine
    assert engine.num_bindings == 2

    device_in = cuda.mem_alloc(host_in.nbytes)
    device_out = cuda.mem_alloc(host_out.nbytes)
    # NOTE(review): assumes the input is binding 0 and the output binding 1 --
    # confirm against the engine's binding order.
    bindings = [int(device_in), int(device_out)]
    stream = cuda.Stream()

    cuda.memcpy_htod_async(device_in, host_in, stream)
    # Forward the requested batch size; it was previously ignored, so the
    # context always ran with its default batch size.
    context.execute_async(batch_size=batchSize,
                          bindings=bindings,
                          stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_out, device_out, stream)
    stream.synchronize()


if __name__ == '__main__':
    # Command line: exactly one of -s (serialize) or -d (deserialize + infer).
    parser = argparse.ArgumentParser()
    for flag in ("-s", "-d"):
        parser.add_argument(flag, action='store_true')
    args = parser.parse_args()

    # Both flags set or both missing is a usage error.
    if args.s == args.d:
        print(
            "arguments not right!\n"
            "python resnet50.py -s   # serialize model to plan file\n"
            "python resnet50.py -d   # deserialize plan file and run inference"
        )
        sys.exit()

    if not args.s:
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime

        # Load the previously serialized engine from disk.
        with open(ENGINE_PATH, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        # All-ones dummy input staged in page-locked host memory.
        input_len = BATCH_SIZE * 3 * INPUT_H * INPUT_W
        data = np.ones((input_len), dtype=np.float32)
        host_in = cuda.pagelocked_empty(input_len, dtype=np.float32)
        np.copyto(host_in, data.ravel())
        host_out = cuda.pagelocked_empty(OUTPUT_SIZE, dtype=np.float32)

        doInference(context, host_in, host_out, BATCH_SIZE)

        # Show the first and last ten outputs.
        print(f'Output: \n{host_out[:10]}\n{host_out[-10:]}')
    else:
        APIToModel(BATCH_SIZE)


================================================
FILE: resnet/resnext50_32x4d.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>

// Evaluates `status` once; if the result is non-zero, prints it to stderr
// prefixed with "Cuda failure: " and aborts the process.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
// Input height/width: the network input is declared as {3, INPUT_H, INPUT_W}.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// NOTE(review): presumably the number of output floats per sample, as in the
// sibling ResNet samples -- its use is outside this excerpt.
static const int OUTPUT_SIZE = 1000;

// Name given to the network input tensor, and (presumably) to the output tensor.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance for the TensorRT builder/runtime.
static Logger gLogger;

// Load weights from a text weight file.
// The file starts with the number of blobs, followed by one entry per blob in
// a simple whitespace delimited format:
// [name] [size] <data x size in hex>
// Each hex word is stored as a raw 32-bit value; every blob is tagged kFLOAT.
// The returned map owns the value buffers (allocated with malloc); the caller
// is expected to free() them once they are no longer needed.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. Allocate by element size: the previous sizeof(val) was the
        // size of the pointer and over-allocated on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        // malloc(0) may legitimately return nullptr, hence the size == 0 escape.
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * Adds a per-channel scale layer that applies a folded batch normalization.
 *
 * Reads `<lname>.weight`, `.bias`, `.running_mean` and `.running_var` from
 * weightMap and derives, for every channel c:
 *   scale[c] = weight[c] / sqrt(var[c] + eps)
 *   shift[c] = bias[c] - mean[c] * weight[c] / sqrt(var[c] + eps)
 *   power[c] = 1
 * The three derived buffers are malloc'ed and stored in weightMap under
 * `<lname>.scale`, `.shift` and `.power`.
 *
 * @return the created scale layer (asserted non-null).
 */
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* const gammaVals = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* const betaVals = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* const meanVals = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const Weights& varBlob = weightMap[lname + ".running_var"];
    const float* const varVals = static_cast<const float*>(varBlob.values);
    const int len = varBlob.count;
    std::cout << "len " << len << std::endl;

    // One buffer per scale-layer parameter, each holding `len` floats.
    float* const scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* const shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* const powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    for (int c = 0; c < len; ++c) {
        // `auto` keeps whatever type sqrt() yields here, so the arithmetic is unchanged.
        const auto stddev = sqrt(varVals[c] + eps);
        scaleVals[c] = gammaVals[c] / stddev;
        shiftVals[c] = betaVals[c] - meanVals[c] * gammaVals[c] / stddev;
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, len};
    Weights shift{DataType::kFLOAT, shiftVals, len};
    Weights power{DataType::kFLOAT, powerVals, len};

    // Register the buffers in the map alongside the loaded weights.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* const bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

/**
 * Appends one grouped bottleneck residual block to the network:
 *   1x1 conv (width maps) -> BN -> ReLU
 *   -> 3x3 conv (width maps, 32 groups, stride, pad 1) -> BN -> ReLU
 *   -> 1x1 conv (outch * 4 maps) -> BN, summed with a shortcut, then ReLU,
 * where width = outch * 4 / 64 * 32 (integer arithmetic).
 *
 * The shortcut is the block input itself, unless `stride != 1` or
 * `inch != outch * 4`, in which case it is a strided 1x1 convolution plus BN
 * using the `downsample.0` / `downsample.1` weights.
 *
 * @param inch    channel count of `input`; only used to choose the shortcut.
 * @param outch   base channel count; the block outputs outch * 4 channels.
 * @param stride  stride of the 3x3 convolution and of the projection shortcut.
 * @param lname   weight-name prefix, e.g. "layer1.0.".
 * @return the final ReLU layer of the block.
 */
IActivationLayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};
    const int cardinality = 32;
    const int expanded = outch * 4;
    const int groupWidth = expanded / 64 * 32;
    const float bnEps = 1e-5;

    // 1x1 reduce to the grouped width
    IConvolutionLayer* reduce = network->addConvolutionNd(input, groupWidth, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], noBias);
    assert(reduce);
    IScaleLayer* reduceBn = addBatchNorm2d(network, weightMap, *reduce->getOutput(0), lname + "bn1", bnEps);
    IActivationLayer* reduceAct = network->addActivation(*reduceBn->getOutput(0), ActivationType::kRELU);
    assert(reduceAct);

    // 3x3 grouped convolution; this is where the block's stride is applied
    IConvolutionLayer* grouped = network->addConvolutionNd(*reduceAct->getOutput(0), groupWidth, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], noBias);
    assert(grouped);
    grouped->setStrideNd(DimsHW{stride, stride});
    grouped->setPaddingNd(DimsHW{1, 1});
    grouped->setNbGroups(cardinality);
    IScaleLayer* groupedBn = addBatchNorm2d(network, weightMap, *grouped->getOutput(0), lname + "bn2", bnEps);
    IActivationLayer* groupedAct = network->addActivation(*groupedBn->getOutput(0), ActivationType::kRELU);
    assert(groupedAct);

    // 1x1 expand
    IConvolutionLayer* expand = network->addConvolutionNd(*groupedAct->getOutput(0), expanded, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], noBias);
    assert(expand);
    IScaleLayer* expandBn = addBatchNorm2d(network, weightMap, *expand->getOutput(0), lname + "bn3", bnEps);

    // Shortcut branch: identity by default, projection when shapes would differ
    ITensor* shortcut = &input;
    if (stride != 1 || inch != expanded) {
        IConvolutionLayer* project = network->addConvolutionNd(input, expanded, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], noBias);
        assert(project);
        project->setStrideNd(DimsHW{stride, stride});

        IScaleLayer* projectBn = addBatchNorm2d(network, weightMap, *project->getOutput(0), lname + "downsample.1", bnEps);
        shortcut = projectBn->getOutput(0);
    }

    IElementWiseLayer* sum = network->addElementWise(*shortcut, *expandBn->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* blockOut = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(blockOut);
    return blockOut;
}

// Create the engine using only the API and not any parser.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    // One row per residual block, in network order:
    // {input channels, base width, stride, weight-name prefix}.
    struct BlockSpec {
        int inch;
        int outch;
        int stride;
        const char* lname;
    };
    static const BlockSpec kBlocks[] = {
        {64, 64, 1, "layer1.0."},
        {256, 64, 1, "layer1.1."},
        {256, 64, 1, "layer1.2."},

        {256, 128, 2, "layer2.0."},
        {512, 128, 1, "layer2.1."},
        {512, 128, 1, "layer2.2."},
        {512, 128, 1, "layer2.3."},

        {512, 256, 2, "layer3.0."},
        {1024, 256, 1, "layer3.1."},
        {1024, 256, 1, "layer3.2."},
        {1024, 256, 1, "layer3.3."},
        {1024, 256, 1, "layer3.4."},
        {1024, 256, 1, "layer3.5."},

        {1024, 512, 2, "layer4.0."},
        {2048, 512, 1, "layer4.1."},
        {2048, 512, 1, "layer4.2."},
    };

    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Network input: a { 3, INPUT_H, INPUT_W } tensor named INPUT_BLOB_NAME.
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../resnext50.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // Stem: 7x7 stride-2 convolution (no bias) -> batch norm -> ReLU -> 3x3 stride-2 max pool.
    IConvolutionLayer* stemConv = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts);
    assert(stemConv);
    stemConv->setStrideNd(DimsHW{2, 2});
    stemConv->setPaddingNd(DimsHW{3, 3});

    IScaleLayer* stemBn = addBatchNorm2d(network, weightMap, *stemConv->getOutput(0), "bn1", 1e-5);

    IActivationLayer* stemRelu = network->addActivation(*stemBn->getOutput(0), ActivationType::kRELU);
    assert(stemRelu);

    IPoolingLayer* stemPool = network->addPoolingNd(*stemRelu->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(stemPool);
    stemPool->setStrideNd(DimsHW{2, 2});
    stemPool->setPaddingNd(DimsHW{1, 1});

    // Chain the residual blocks, feeding each block the previous block's output.
    ITensor* features = stemPool->getOutput(0);
    for (const BlockSpec& spec : kBlocks) {
        IActivationLayer* block = bottleneck(network, weightMap, *features, spec.inch, spec.outch, spec.stride, spec.lname);
        features = block->getOutput(0);
    }

    // Head: 7x7 average pool followed by a 1000-way fully connected layer.
    IPoolingLayer* headPool = network->addPoolingNd(*features, PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(headPool);
    headPool->setStrideNd(DimsHW{1, 1});

    IFullyConnectedLayer* classifier = network->addFullyConnected(*headPool->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(classifier);

    ITensor* logits = classifier->getOutput(0);
    logits->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*logits);

    // Configure the builder and build the engine.
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // The network definition is no longer needed once the build call has returned.
    network->destroy();

    // Free every host buffer held in the weight map (loaded weights and the
    // scale/shift/power arrays that addBatchNorm2d stored there).
    for (auto& entry : weightMap)
    {
        free((void*) (entry.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

// Runs one inference on `batchSize` inputs.
//
// context   - execution context of a two-binding engine (one input, one output).
// input     - host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats.
// output    - host buffer receiving batchSize * OUTPUT_SIZE floats.
// batchSize - number of samples passed to enqueue().
//
// Device buffers and the stream are created and released on every call.
// Any CUDA error or a failed enqueue aborts the process.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Compute byte counts once, in size_t, so the product is not evaluated in int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    // enqueue() reports failure through its return value; without this check a
    // failed launch would silently leave `output` with stale data.
    if (!context.enqueue(batchSize, buffers, stream, nullptr))
    {
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Asynchronous execution errors surface here, so this status must be checked too.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//   ./resnext -s   builds the engine and writes it to "resnext50.engine"
//   ./resnext -d   loads "resnext50.engine", runs 100 timed inferences on a
//                  constant input and prints the first and last 10 outputs
// Returns 0 on success and -1 on bad arguments or file errors.
int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./resnext -s   // serialize model to plan file" << std::endl;
        std::cerr << "./resnext -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Serialized engine bytes read back from disk in -d mode.
    char *trtModelStream{nullptr};
    size_t size{0};

    const std::string mode(argv[1]);
    if (mode == "-s") {
        // create a model using the API directly and serialize it to a stream
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("resnext50.engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        const bool written = p.good();
        modelStream->destroy();
        if (!written)
        {
            std::cerr << "could not write plan output file" << std::endl;
            return -1;
        }
        // Serialization succeeded: report success with exit status 0.
        return 0;
    } else if (mode == "-d") {
        std::ifstream file("resnext50.engine", std::ios::binary);
        if (!file.good()) {
            // Without the plan file there is nothing to deserialize; stop here
            // instead of handing a null buffer to the runtime.
            std::cerr << "could not open plan file resnext50.engine, run ./resnext -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        return -1;
    }

    // Dummy input: every element is 1.0 (no image is loaded or preprocessed here).
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference 100 times, printing the wall-clock time of each call.
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the first 10 and the last 10 output values.
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[i] << ", ";
    }
    std::cout << std::endl;
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[OUTPUT_SIZE - 10 + i] << ", ";
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: resnet/wide_resnet50.py
================================================
import os
import sys
import struct
import argparse

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

# Batch size passed to APIToModel() and used to size the host input buffer.
BATCH_SIZE = 1
# Height and width of the network input; the input tensor is (3, INPUT_H, INPUT_W).
INPUT_H = 224
INPUT_W = 224
# Number of outputs of the final fully connected layer.
OUTPUT_SIZE = 1000
# NOTE(review): BS is not referenced anywhere else in this file.
BS = 1
# Names given to the network's input and output tensors.
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"
# Epsilon added to the running variance when folding batch norm into a scale layer.
EPS = 1e-5

# Weight file read by create_engine() and engine file written/read by this script.
WEIGHT_PATH = "./wide_resnet50.wts"
ENGINE_PATH = "./wide_resnet50.engine"

# Logger shared by the TensorRT builder and runtime.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)


def load_weights(file):
    """Parse a text weight file into a dict of float32 arrays.

    The first line holds the number of entries. Every following line is
    ``<name> <count> <hex> <hex> ...`` with single-space separators, where each
    hex token is a big-endian 32-bit float.

    Args:
        file: path of the weight file.

    Returns:
        dict mapping each name to a ``np.float32`` array. Each decoded value is
        kept as a 1-tuple, so a non-empty entry has shape ``(count, 1)``.

    Raises:
        AssertionError: if the file is missing, or an entry count does not
            match the number of lines / tokens actually present.
    """
    print(f"Loading weights: {file}")

    assert os.path.exists(file), 'Unable to load weight file.'

    with open(file, "r") as handle:
        rows = [raw.strip() for raw in handle]

    expected = int(rows[0])
    assert expected == len(rows) - 1

    weight_map = {}
    for row in rows[1:expected + 1]:
        tokens = row.split(" ")
        name, declared = tokens[0], int(tokens[1])
        assert declared + 2 == len(tokens)
        # hex string -> 4 raw bytes -> big-endian float (struct.unpack yields a 1-tuple)
        decoded = [struct.unpack(">f", bytes.fromhex(tok)) for tok in tokens[2:]]
        weight_map[name] = np.array(decoded, dtype=np.float32)

    return weight_map


def addBatchNorm2d(network, weight_map, inputs, layer_name, eps):
    """Add a batch-norm equivalent as a single per-channel scale layer.

    Reads ``<layer_name>.weight``, ``.bias``, ``.running_mean`` and
    ``.running_var`` from ``weight_map`` and folds them into
    ``scale = gamma / sqrt(var + eps)`` and
    ``shift = beta - mean * gamma / sqrt(var + eps)``.

    Args:
        network: network definition the layer is added to.
        weight_map: dict of arrays keyed by parameter name.
        inputs: input tensor of the new layer.
        layer_name: key prefix of the batch-norm parameters.
        eps: value added to the running variance before the square root.

    Returns:
        The layer returned by ``network.add_scale``.
    """
    suffixes = (".weight", ".bias", ".running_mean", ".running_var")
    gamma, beta, mean, running_var = (weight_map[layer_name + s] for s in suffixes)
    print(f"{layer_name} {len(running_var)}")

    std = np.sqrt(running_var + eps)
    folded_scale = gamma / std
    folded_shift = -mean / std * gamma + beta

    return network.add_scale(input=inputs,
                             mode=trt.ScaleMode.CHANNEL,
                             shift=folded_shift,
                             scale=folded_scale)


def bottleneck(network, weight_map, input, in_channels, out_channels, stride, layer_name):
    """Add one residual bottleneck block and return its final ReLU layer.

    Main path: 1x1 conv -> BN -> ReLU -> 3x3 conv (given stride, padding 1)
    -> BN -> ReLU -> 1x1 conv to ``2 * out_channels`` -> BN.
    Shortcut: the block input itself, or a strided 1x1 conv + BN projection
    when ``stride != 1`` or ``in_channels != 2 * out_channels``.
    The two paths are summed and passed through a ReLU.

    Args:
        network: network definition the layers are added to.
        weight_map: dict of arrays keyed by parameter name.
        input: input tensor of the block.
        in_channels: channel count of ``input``.
        out_channels: width of the first two convolutions.
        stride: stride of the 3x3 convolution and of the projection shortcut.
        layer_name: key prefix of this block's weights (e.g. ``"layer1.0."``).

    Returns:
        The activation layer that ends the block.
    """
    # All convolutions in the block are bias-free.
    no_bias = trt.Weights()
    expanded = out_channels * 2

    def conv(tensor, maps, ksize, key):
        layer = network.add_convolution(input=tensor,
                                        num_output_maps=maps,
                                        kernel_shape=ksize,
                                        kernel=weight_map[layer_name + key],
                                        bias=no_bias)
        assert layer
        return layer

    def batch_norm(tensor, key):
        layer = addBatchNorm2d(network, weight_map, tensor, layer_name + key, EPS)
        assert layer
        return layer

    def relu(tensor):
        layer = network.add_activation(tensor, type=trt.ActivationType.RELU)
        assert layer
        return layer

    # Main path.
    reduce_conv = conv(input, out_channels, (1, 1), "conv1.weight")
    reduce_act = relu(batch_norm(reduce_conv.get_output(0), "bn1").get_output(0))

    spatial_conv = conv(reduce_act.get_output(0), out_channels, (3, 3), "conv2.weight")
    spatial_conv.stride = (stride, stride)
    spatial_conv.padding = (1, 1)
    spatial_act = relu(batch_norm(spatial_conv.get_output(0), "bn2").get_output(0))

    expand_conv = conv(spatial_act.get_output(0), expanded, (1, 1), "conv3.weight")
    expand_bn = batch_norm(expand_conv.get_output(0), "bn3")

    # Shortcut path: project only when the shapes of the two paths would differ.
    if stride == 1 and in_channels == expanded:
        shortcut = input
    else:
        proj_conv = conv(input, expanded, (1, 1), "downsample.0.weight")
        proj_conv.stride = (stride, stride)
        shortcut = batch_norm(proj_conv.get_output(0), "downsample.1").get_output(0)

    merged = network.add_elementwise(shortcut, expand_bn.get_output(0),
                                     trt.ElementWiseOperation.SUM)
    assert merged

    return relu(merged.get_output(0))


def create_engine(maxBatchSize, builder, config, dt):
    """Define the network layer by layer and build an engine from it.

    Layout: 7x7 stride-2 conv -> BN -> ReLU -> 3x3 stride-2 max pool ->
    16 bottleneck blocks (3 + 4 + 6 + 3) -> 7x7 average pool ->
    fully connected layer with OUTPUT_SIZE outputs.

    Args:
        maxBatchSize: value assigned to ``builder.max_batch_size``.
        builder: TensorRT builder used to create the network and the engine.
        config: builder config passed to ``build_engine``.
        dt: data type of the input tensor.

    Returns:
        The object returned by ``builder.build_engine``.
    """
    weight_map = load_weights(WEIGHT_PATH)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    # empty weights for bias
    emptywts = trt.Weights()

    conv1 = network.add_convolution(input=data,
                                    num_output_maps=64,
                                    kernel_shape=(7, 7),
                                    kernel=weight_map["conv1.weight"],
                                    bias=emptywts)
    assert conv1
    conv1.stride = (2, 2)
    conv1.padding = (3, 3)

    bn1 = addBatchNorm2d(network, weight_map, conv1.get_output(0), "bn1", EPS)
    assert bn1

    relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU)
    assert relu1

    pool1 = network.add_pooling(input=relu1.get_output(0),
                                window_size=trt.DimsHW(3, 3),
                                type=trt.PoolingType.MAX)
    assert pool1
    pool1.stride = (2, 2)
    pool1.padding = (1, 1)

    x = bottleneck(network, weight_map, pool1.get_output(0), 64, 128, 1, "layer1.0.")
    x = bottleneck(network, weight_map, x.get_output(0), 256, 128, 1, "layer1.1.")
    x = bottleneck(network, weight_map, x.get_output(0), 256, 128, 1, "layer1.2.")

    x = bottleneck(network, weight_map, x.get_output(0), 256, 256, 2, "layer2.0.")
    x = bottleneck(network, weight_map, x.get_output(0), 512, 256, 1, "layer2.1.")
    x = bottleneck(network, weight_map, x.get_output(0), 512, 256, 1, "layer2.2.")
    x = bottleneck(network, weight_map, x.get_output(0), 512, 256, 1, "layer2.3.")

    x = bottleneck(network, weight_map, x.get_output(0), 512, 512, 2, "layer3.0.")
    x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.1.")
    x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.2.")
    x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.3.")
    x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.4.")
    x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.5.")

    x = bottleneck(network, weight_map, x.get_output(0), 1024, 1024, 2, "layer4.0.")
    x = bottleneck(network, weight_map, x.get_output(0), 2048, 1024, 1, "layer4.1.")
    x = bottleneck(network, weight_map, x.get_output(0), 2048, 1024, 1, "layer4.2.")

    pool2 = network.add_pooling(x.get_output(0),
                                window_size=trt.DimsHW(7, 7),
                                type=trt.PoolingType.AVERAGE)
    assert pool2
    pool2.stride = (1, 1)

    fc1 = network.add_fully_connected(input=pool2.get_output(0),
                                      num_outputs=OUTPUT_SIZE,
                                      kernel=weight_map['fc.weight'],
                                      bias=weight_map['fc.bias'])
    assert fc1

    fc1.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(fc1.get_output(0))

    # Build engine
    builder.max_batch_size = maxBatchSize
    # The engine is built from `config`, so the workspace limit has to be set
    # on the config object; the C++ version of this network does the same with
    # config->setMaxWorkspaceSize(1 << 20).
    config.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)
    print("build out")
    del network
    del weight_map

    return engine


def APIToModel(maxBatchSize):
    """Build the engine and write its serialized form to ENGINE_PATH.

    Args:
        maxBatchSize: maximum batch size forwarded to create_engine().
    """
    trt_builder = trt.Builder(TRT_LOGGER)
    build_config = trt_builder.create_builder_config()

    built = create_engine(maxBatchSize, trt_builder, build_config, trt.float32)
    assert built

    with open(ENGINE_PATH, "wb") as plan_file:
        plan = built.serialize()
        plan_file.write(plan)

    # Drop the local references to the engine and builder.
    del built
    del trt_builder


def doInference(context, host_in, host_out, batchSize):
    """Run one inference and copy the result into ``host_out``.

    Args:
        context: execution context of an engine with exactly two bindings.
        host_in: host array holding the input data; its ``nbytes`` sizes the
            device input buffer.
        host_out: host array that receives the output; its ``nbytes`` sizes
            the device output buffer.
        batchSize: batch size forwarded to ``execute_async``.
    """
    engine = context.engine
    assert engine.num_bindings == 2

    device_in = cuda.mem_alloc(host_in.nbytes)
    device_out = cuda.mem_alloc(host_out.nbytes)
    # NOTE(review): this ordering assumes binding 0 is the input and binding 1
    # the output; the indices are not looked up by name here.
    bindings = [int(device_in), int(device_out)]
    stream = cuda.Stream()

    cuda.memcpy_htod_async(device_in, host_in, stream)
    # Forward the requested batch size instead of relying on execute_async's
    # default, which would ignore the batchSize argument of this function.
    context.execute_async(batch_size=batchSize,
                          bindings=bindings,
                          stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_out, device_out, stream)
    stream.synchronize()


if __name__ == '__main__':
    # Command line: exactly one of -s (serialize) or -d (deserialize + infer).
    cli = argparse.ArgumentParser()
    cli.add_argument("-s", action='store_true')
    cli.add_argument("-d", action='store_true')
    opts = cli.parse_args()

    # Both flags set, or neither: print the usage text and stop.
    if opts.s == opts.d:
        print(
            "arguments not right!\n"
            "python wide_resnet50.py -s   # serialize model to plan file\n"
            "python wide_resnet50.py -d   # deserialize plan file and run inference"
        )
        sys.exit()

    if opts.d:
        trt_runtime = trt.Runtime(TRT_LOGGER)
        assert trt_runtime

        with open(ENGINE_PATH, "rb") as plan_file:
            loaded_engine = trt_runtime.deserialize_cuda_engine(plan_file.read())
        assert loaded_engine

        exec_context = loaded_engine.create_execution_context()
        assert exec_context

        # Constant all-ones input copied into a page-locked host buffer.
        n_inputs = BATCH_SIZE * 3 * INPUT_H * INPUT_W
        ones = np.ones(n_inputs, dtype=np.float32)
        pinned_in = cuda.pagelocked_empty(n_inputs, dtype=np.float32)
        np.copyto(pinned_in, ones.ravel())
        pinned_out = cuda.pagelocked_empty(OUTPUT_SIZE, dtype=np.float32)

        doInference(exec_context, pinned_in, pinned_out, BATCH_SIZE)

        # Show the first and last ten output values.
        print(f'Output: \n{pinned_out[:10]}\n{pinned_out[-10:]}')
    else:
        APIToModel(BATCH_SIZE)


================================================
FILE: resnet/wideresnet50.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>

// Evaluates `status` once; if the result is non-zero, prints it to stderr as a
// "Cuda failure" and aborts the process. Wrapped in do/while(0) so the macro
// behaves as a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
// Input tensor is { 3, INPUT_H, INPUT_W }; OUTPUT_SIZE floats are read back per sample.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;

// Tensor names used when defining the network and when looking up binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance handed to createInferBuilder() and createInferRuntime().
static Logger gLogger;

// Load weights from a text weight file.
// The first token of the file is the number of blobs; each blob then follows
// in a simple whitespace-delimited format:
// [name] [size] <data x size in hex>
// Reads the weight file `file` and returns a map from blob name to Weights.
//
// Expected layout: a blob count, then for every blob its name, its element
// count (decimal) and that many 32-bit values written in hex.
//
// Every returned Weights has type kFLOAT and points at a malloc'ed buffer;
// the caller owns these buffers and must free() them.
// Asserts that the file can be opened and that the blob count is positive.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The buffer holds `size` 32-bit values, so it is sized by
        // the element type rather than by the size of the pointer.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a batch-norm equivalent as one per-channel scale layer.
//
// Reads "<lname>.weight", ".bias", ".running_mean" and ".running_var" from
// weightMap and folds them into
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The three malloc'ed arrays are stored back into weightMap under
// "<lname>.scale", ".shift" and ".power".
// Returns the new scale layer (asserted non-null).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float *gamma = (float*)weightMap[lname + ".weight"].values;
    float *beta = (float*)weightMap[lname + ".bias"].values;
    float *mean = (float*)weightMap[lname + ".running_mean"].values;
    float *var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    const size_t bytes = sizeof(float) * len;
    float *scaleVals = reinterpret_cast<float*>(malloc(bytes));
    float *shiftVals = reinterpret_cast<float*>(malloc(bytes));
    float *powerVals = reinterpret_cast<float*>(malloc(bytes));

    // Fill all three per-channel arrays in a single pass.
    for (int c = 0; c < len; ++c) {
        scaleVals[c] = gamma[c] / sqrt(var[c] + eps);
        shiftVals[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, len};
    Weights shift{DataType::kFLOAT, shiftVals, len};
    Weights power{DataType::kFLOAT, powerVals, len};

    // Keep the buffers reachable through the weight map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Adds one residual bottleneck block and returns its closing ReLU layer.
//
// Main path: 1x1 conv (outch) -> BN -> ReLU -> 3x3 conv (outch, given stride,
// padding 1) -> BN -> ReLU -> 1x1 conv (outch * 2) -> BN.
// Shortcut: the block input, or a strided 1x1 conv + BN projection when
// stride != 1 or inch != outch * 2.
// Weights are looked up in weightMap under the prefix `lname`.
IActivationLayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    // Every convolution in the block is bias-free.
    Weights noBias{DataType::kFLOAT, nullptr, 0};
    const int expanded = outch * 2;

    // 1x1 reduction.
    IConvolutionLayer* reduceConv = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], noBias);
    assert(reduceConv);
    IScaleLayer* reduceBn = addBatchNorm2d(network, weightMap, *reduceConv->getOutput(0), lname + "bn1", 1e-5);
    IActivationLayer* reduceAct = network->addActivation(*reduceBn->getOutput(0), ActivationType::kRELU);
    assert(reduceAct);

    // 3x3 spatial convolution; this is where the block's stride is applied.
    IConvolutionLayer* spatialConv = network->addConvolutionNd(*reduceAct->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], noBias);
    assert(spatialConv);
    spatialConv->setStrideNd(DimsHW{stride, stride});
    spatialConv->setPaddingNd(DimsHW{1, 1});
    IScaleLayer* spatialBn = addBatchNorm2d(network, weightMap, *spatialConv->getOutput(0), lname + "bn2", 1e-5);
    IActivationLayer* spatialAct = network->addActivation(*spatialBn->getOutput(0), ActivationType::kRELU);
    assert(spatialAct);

    // 1x1 expansion.
    IConvolutionLayer* expandConv = network->addConvolutionNd(*spatialAct->getOutput(0), expanded, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], noBias);
    assert(expandConv);
    IScaleLayer* expandBn = addBatchNorm2d(network, weightMap, *expandConv->getOutput(0), lname + "bn3", 1e-5);

    // Shortcut: identity unless the stride or channel count requires a projection.
    ITensor* shortcut = &input;
    if (stride != 1 || inch != expanded) {
        IConvolutionLayer* projConv = network->addConvolutionNd(input, expanded, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], noBias);
        assert(projConv);
        projConv->setStrideNd(DimsHW{stride, stride});

        IScaleLayer* projBn = addBatchNorm2d(network, weightMap, *projConv->getOutput(0), lname + "downsample.1", 1e-5);
        shortcut = projBn->getOutput(0);
    }

    IElementWiseLayer* merged = network->addElementWise(*shortcut, *expandBn->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* blockOut = network->addActivation(*merged->getOutput(0), ActivationType::kRELU);
    assert(blockOut);
    return blockOut;
}

// Create the engine using only the API and not any parser.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    // One row per residual block, in network order:
    // {input channels, base width, stride, weight-name prefix}.
    struct BlockSpec {
        int inch;
        int outch;
        int stride;
        const char* lname;
    };
    static const BlockSpec kBlocks[] = {
        {64, 128, 1, "layer1.0."},
        {256, 128, 1, "layer1.1."},
        {256, 128, 1, "layer1.2."},

        {256, 256, 2, "layer2.0."},
        {512, 256, 1, "layer2.1."},
        {512, 256, 1, "layer2.2."},
        {512, 256, 1, "layer2.3."},

        {512, 512, 2, "layer3.0."},
        {1024, 512, 1, "layer3.1."},
        {1024, 512, 1, "layer3.2."},
        {1024, 512, 1, "layer3.3."},
        {1024, 512, 1, "layer3.4."},
        {1024, 512, 1, "layer3.5."},

        {1024, 1024, 2, "layer4.0."},
        {2048, 1024, 1, "layer4.1."},
        {2048, 1024, 1, "layer4.2."},
    };

    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Network input: a { 3, INPUT_H, INPUT_W } tensor named INPUT_BLOB_NAME.
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../wideresnet50.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // Stem: 7x7 stride-2 convolution (no bias), padding 3.
    IConvolutionLayer* stemConv = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts);
    assert(stemConv);
    stemConv->setStrideNd(DimsHW{2, 2});
    stemConv->setPaddingNd(DimsHW{3, 3});

    IScaleLayer* stemBn = addBatchNorm2d(network, weightMap, *stemConv->getOutput(0), "bn1", 1e-5);

    // ReLU activation after the stem batch norm.
    IActivationLayer* stemRelu = network->addActivation(*stemBn->getOutput(0), ActivationType::kRELU);
    assert(stemRelu);

    // Max pooling with a 3x3 window, stride 2x2 and padding 1x1.
    IPoolingLayer* stemPool = network->addPoolingNd(*stemRelu->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(stemPool);
    stemPool->setStrideNd(DimsHW{2, 2});
    stemPool->setPaddingNd(DimsHW{1, 1});

    // Chain the residual blocks, feeding each block the previous block's output.
    ITensor* features = stemPool->getOutput(0);
    for (const BlockSpec& spec : kBlocks) {
        IActivationLayer* block = bottleneck(network, weightMap, *features, spec.inch, spec.outch, spec.stride, spec.lname);
        features = block->getOutput(0);
    }

    // Head: 7x7 average pool followed by a 1000-way fully connected layer.
    IPoolingLayer* headPool = network->addPoolingNd(*features, PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(headPool);
    headPool->setStrideNd(DimsHW{1, 1});

    IFullyConnectedLayer* classifier = network->addFullyConnected(*headPool->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(classifier);

    ITensor* logits = classifier->getOutput(0);
    logits->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*logits);

    // Configure the builder and build the engine.
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // The network definition is no longer needed once the build call has returned.
    network->destroy();

    // Free every host buffer held in the weight map (loaded weights and the
    // scale/shift/power arrays that addBatchNorm2d stored there).
    for (auto& entry : weightMap)
    {
        free((void*) (entry.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

// Runs one inference on `batchSize` inputs.
//
// context   - execution context of a two-binding engine (one input, one output).
// input     - host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats.
// output    - host buffer receiving batchSize * OUTPUT_SIZE floats.
// batchSize - number of samples passed to enqueue().
//
// Device buffers and the stream are created and released on every call.
// Any CUDA error or a failed enqueue aborts the process.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Compute byte counts once, in size_t, so the product is not evaluated in int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    // enqueue() reports failure through its return value; without this check a
    // failed launch would silently leave `output` with stale data.
    if (!context.enqueue(batchSize, buffers, stream, nullptr))
    {
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Asynchronous execution errors surface here, so this status must be checked too.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//   ./wideresnet -s   builds the network, writes the serialized engine to
//                     "wideresnet50.engine" and exits.
//   ./wideresnet -d   loads "wideresnet50.engine", runs 100 timed inferences on
//                     a constant input and prints the first and last 10 outputs.
// Returns -1 on a usage or file error and 0 after a successful inference run.
int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./wideresnet -s   // serialize model to plan file" << std::endl;
        std::cerr << "./wideresnet -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    const std::string mode(argv[1]);
    if (mode == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("wideresnet50.engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        // NOTE(review): exits with status 1 although serialization succeeded;
        // kept unchanged because existing scripts may rely on it.
        return 1;
    } else if (mode == "-d") {
        std::ifstream file("wideresnet50.engine", std::ios::binary);
        if (!file.good()) {
            // Without this guard a null buffer of size 0 would be handed to
            // deserializeCudaEngine below.
            std::cerr << "could not open plan file wideresnet50.engine, run with -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        return -1;
    }


    // Dummy input: every element is 1.0 (no real image is loaded here).
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    // The engine has been deserialized, so the host copy of the plan can be released.
    delete[] trtModelStream;

    // Run inference 100 times and print the wall-clock time of each run
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the first ten and the last ten output values
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[i] << ", ";
    }
    std::cout << std::endl;
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[OUTPUT_SIZE - 10 + i] << ", ";
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: retinaface/CMakeLists.txt
================================================
# Build script for the RetinaFace TensorRT sample:
#   - decodeplugin : shared library with the CUDA decode plugin (decode.cu)
#   - retina_r50   : ResNet-50 backbone executable
#   - retina_mnet  : MobileNet backbone executable
cmake_minimum_required(VERSION 2.6)

project(retinaface)

add_definitions(-std=c++11)

# option() takes <variable> <help text> [value]; the help string was missing,
# so "OFF" was being used as the description. The default stays OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

# CUDA headers and libraries live in a target-specific directory on aarch64.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message("embed_platform on")
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
    message("embed_platform off")
    include_directories(/usr/local/cuda/include)
    link_directories(/usr/local/cuda/lib64)
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

cuda_add_library(decodeplugin SHARED ${PROJECT_SOURCE_DIR}/decode.cu)
target_link_libraries(decodeplugin nvinfer cudart)

# Both executables include OpenCV headers, so fail at configure time when
# OpenCV is missing instead of later with an obscure compile error.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(retina_r50 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/retina_r50.cpp)
target_link_libraries(retina_r50 nvinfer)
target_link_libraries(retina_r50 cudart)
target_link_libraries(retina_r50 decodeplugin)
target_link_libraries(retina_r50 ${OpenCV_LIBRARIES})

add_executable(retina_mnet ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/retina_mnet.cpp)
target_link_libraries(retina_mnet nvinfer)
target_link_libraries(retina_mnet cudart)
target_link_libraries(retina_mnet decodeplugin)
target_link_libraries(retina_mnet ${OpenCV_LIBRARIES})

add_definitions(-O2 -pthread)


================================================
FILE: retinaface/README.md
================================================
# RetinaFace

The PyTorch implementation is [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface). I forked it into
[wang-xinyu/Pytorch_Retinaface](https://github.com/wang-xinyu/Pytorch_Retinaface) and added `genwts.py`.

This branch is using TensorRT 7 API, branch [trt4->retinaface](https://github.com/wang-xinyu/tensorrtx/tree/trt4/retinaface) is using TensorRT 4.

## Config

- Input shape `INPUT_H`, `INPUT_W` defined in `decode.h`
- INT8/FP16/FP32 can be selected by the macro `USE_FP16` or `USE_INT8` or `USE_FP32` in `retina_r50.cpp`
- GPU id can be selected by the macro `DEVICE` in `retina_r50.cpp`
- Batchsize can be selected by the macro `BATCHSIZE` in `retina_r50.cpp`

## Run

The following describes how to run `retina_r50`. `retina_mnet` works almost the same way: generate `retinaface.wts` from `mobilenet0.25_Final.pth` and run `retina_mnet` instead.

1. generate retinaface.wts from pytorch implementation https://github.com/wang-xinyu/Pytorch_Retinaface

```
git clone https://github.com/wang-xinyu/Pytorch_Retinaface.git
// download its weights 'Resnet50_Final.pth', put it in Pytorch_Retinaface/weights
cd Pytorch_Retinaface
python detect.py --save_model
python genwts.py
// a file 'retinaface.wts' will be generated.
```

2. put retinaface.wts into tensorrtx/retinaface, build and run

```
git clone https://github.com/wang-xinyu/tensorrtx.git
cd tensorrtx/retinaface
// put retinaface.wts here
mkdir build
cd build
cmake ..
make
sudo ./retina_r50 -s  // build and serialize model to file i.e. 'retina_r50.engine'
wget https://github.com/Tencent/FaceDetection-DSFD/raw/master/data/worlds-largest-selfie.jpg
sudo ./retina_r50 -d  // deserialize model file and run inference.
```

3. Check the generated result images, e.g. `0_result.jpg`.

4. we also provide a python wrapper

```
// install python-tensorrt, pycuda, etc.
// ensure the retina_r50.engine and libdecodeplugin.so have been built
python retinaface_trt.py
```

# INT8 Quantization

1. Prepare calibration images. You can randomly select about 1000 images from your training set. For widerface, you can also download my calibration images `widerface_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) (pwd: a9wh).

2. unzip it in retinaface/build

3. set the macro `USE_INT8` in retina_r50.cpp and make

4. serialize the model and test

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78901890-9077fb80-7aab-11ea-94f1-237f51fcc347.jpg">
</p>

## More Information

Check the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: retinaface/calibrator.cpp
================================================
#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/dnn/dnn.hpp>
#include "calibrator.h"
#include "cuda_runtime_api.h"
#include "common.hpp"

// Creates the calibrator.
//
// batchsize        : number of images delivered per getBatch() call.
// input_w, input_h : network input size; each image is stored as 3 * w * h floats.
// img_dir          : directory that holds the calibration images.
// calib_table_name : path of the calibration cache file.
// input_blob_name  : name of the network input; the pointer is stored, not copied.
// read_cache       : when false, readCalibrationCache() never loads the cache file.
//
// Allocates one device buffer large enough for a full batch and collects the
// file names found in img_dir.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)
    , img_dir_(img_dir)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache)
{
    input_count_ = 3 * input_w * input_h * batchsize;
    CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    // A missing directory leaves img_files_ empty, so getBatch() would return
    // false right away; report the cause instead of failing silently.
    if (read_files_in_dir(img_dir, img_files_) != 0) {
        std::cerr << "Int8EntropyCalibrator2: cannot list calibration image directory: " << img_dir << std::endl;
    }
}

// Releases the device buffer allocated in the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
    CHECK(cudaFree(device_input_));
}

// Returns the batch size passed to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT
{
    return batchsize_;
}

// Loads the next batchsize_ images, preprocesses them and uploads them to the
// device buffer.
//
// bindings[0] is set to the device buffer; names[0] is asserted to equal the
// input blob name given to the constructor.
// Returns false when fewer than batchsize_ images remain or when an image
// cannot be read; returns true otherwise.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT
{
    if (img_idx_ + batchsize_ > (int)img_files_.size()) {
        return false;
    }

    // img_files_ holds bare file names. Add a '/' only when img_dir_ does not
    // already end with one, so both "dir" and "dir/" work.
    std::string dir_prefix = img_dir_;
    if (!dir_prefix.empty() && dir_prefix.back() != '/') {
        dir_prefix += '/';
    }

    std::vector<cv::Mat> input_imgs_;
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        cv::Mat temp = cv::imread(dir_prefix + img_files_[i]);
        if (temp.empty()){
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
        input_imgs_.push_back(pr_img);
    }
    img_idx_ += batchsize_;
    // Scale factor 1.0, per-channel mean (104, 117, 123) subtracted, no
    // channel swap and no cropping.
    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0, cv::Size(input_w_, input_h_), cv::Scalar(104, 117, 123), false, false);

    CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Loads the calibration cache file into calib_cache_.
//
// `length` receives the number of bytes loaded. Returns a pointer to the cached
// bytes, or nullptr when caching is disabled, the file cannot be opened, or it
// is empty.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream cache_file(calib_table_name_, std::ios::binary);
    const bool usable = read_cache_ && cache_file.good();
    if (usable)
    {
        // Raw byte-wise read straight from the stream buffer (no whitespace skipping).
        calib_cache_.assign(std::istreambuf_iterator<char>(cache_file), std::istreambuf_iterator<char>());
    }

    length = calib_cache_.size();
    if (length == 0)
    {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes `length` bytes starting at `cache` to the calibration cache file,
// replacing any previous content.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;

    const char* bytes = reinterpret_cast<const char*>(cache);
    std::ofstream cache_file(calib_table_name_, std::ios::binary);
    cache_file.write(bytes, length);
}


================================================
FILE: retinaface/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include "NvInfer.h"
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
    // batchsize: images per calibration batch; input_w/input_h: network input size;
    // img_dir: directory with calibration images; calib_table_name: cache file path;
    // input_blob_name: name of the network input (pointer is stored, not copied);
    // read_cache: when false the cache file is never loaded.
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);

    virtual ~Int8EntropyCalibrator2();
    // Returns the batch size given to the constructor.
    int getBatchSize() const TRT_NOEXCEPT override;
    // Uploads the next batch of images; returns false when no full batch is left.
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    // Returns the cached calibration table, or nullptr when none is available.
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    // Stores the calibration table in the cache file.
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

private:
    int batchsize_;                      // images per batch
    int input_w_;                        // network input width
    int input_h_;                        // network input height
    int img_idx_;                        // index of the next image to load from img_files_
    std::string img_dir_;                // directory containing the calibration images
    std::vector<std::string> img_files_; // file names found in img_dir_
    size_t input_count_;                 // floats per batch: 3 * input_w * input_h * batchsize
    std::string calib_table_name_;       // path of the calibration cache file
    const char* input_blob_name_;        // expected name of the input binding (not owned)
    bool read_cache_;                    // whether readCalibrationCache() may load the file
    void* device_input_;                 // device buffer holding one batch
    std::vector<char> calib_cache_;      // bytes returned by readCalibrationCache()
};

#endif // ENTROPY_CALIBRATOR_H


================================================
FILE: retinaface/common.hpp
================================================
#ifndef RETINAFACE_COMMON_H_
#define RETINAFACE_COMMON_H_
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "decode.h"

using namespace nvinfer1;

// Evaluates `status` once; if the result is non-zero, prints it to std::cerr
// as "Cuda failure: <value>" and aborts the process.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// Letterboxes `img` into an input_w x input_h canvas.
//
// The image is resized with its aspect ratio preserved so that it fits inside
// the canvas, then centred on a canvas filled with (128, 128, 128).
// Returns the new CV_8UC3 canvas; `img` itself is not modified.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);

    // The smaller scale decides which side touches the canvas border.
    const bool width_limited = scale_h > scale_w;
    const int new_w = width_limited ? input_w : static_cast<int>(scale_h * img.cols);
    const int new_h = width_limited ? static_cast<int>(scale_w * img.rows) : input_h;
    const int off_x = width_limited ? 0 : (input_w - new_w) / 2;
    const int off_y = width_limited ? (input_h - new_h) / 2 : 0;

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(off_x, off_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Appends the name of every entry in directory `p_dir_name` to `file_names`,
// skipping "." and "..". Only the bare entry names are stored, without the
// directory prefix.
// Returns 0 on success, or -1 when the directory cannot be opened.
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir = opendir(p_dir_name);
    if (!dir) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* entry_name = entry->d_name;
        const bool is_self = strcmp(entry_name, ".") == 0;
        const bool is_parent = strcmp(entry_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.push_back(std::string(entry_name));
    }

    closedir(dir);
    return 0;
}

// Maps a detection from letterboxed network coordinates back to the
// coordinates of the original image `img`.
//
// bbox holds x1, y1, x2, y2 in network-input coordinates and is read only.
// lmk holds five (x, y) landmark pairs and is rewritten in place.
// Returns the box as a cv::Rect in original-image coordinates.
static inline cv::Rect get_rect_adapt_landmark(cv::Mat& img, int input_w, int input_h, float bbox[4], float lmk[10]) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);

    int left, right, top, bottom;
    if (scale_h > scale_w) {
        // Image fills the full width; padding was added above and below.
        left = bbox[0] / scale_w;
        right = bbox[2] / scale_w;
        top = (bbox[1] - (input_h - scale_w * img.rows) / 2) / scale_w;
        bottom = (bbox[3] - (input_h - scale_w * img.rows) / 2) / scale_w;
        for (int p = 0; p < 5; ++p) {
            const int xi = 2 * p;
            const int yi = xi + 1;
            lmk[xi] /= scale_w;
            lmk[yi] = (lmk[yi] - (input_h - scale_w * img.rows) / 2) / scale_w;
        }
    } else {
        // Image fills the full height; padding was added left and right.
        left = (bbox[0] - (input_w - scale_h * img.cols) / 2) / scale_h;
        right = (bbox[2] - (input_w - scale_h * img.cols) / 2) / scale_h;
        top = bbox[1] / scale_h;
        bottom = bbox[3] / scale_h;
        for (int p = 0; p < 5; ++p) {
            const int xi = 2 * p;
            const int yi = xi + 1;
            lmk[xi] = (lmk[xi] - (input_w - scale_h * img.cols) / 2) / scale_h;
            lmk[yi] /= scale_h;
        }
    }
    return cv::Rect(left, top, right - left, bottom - top);
}

// Intersection-over-union of two boxes given as x1, y1, x2, y2.
// Returns 0 when the boxes do not overlap; a small epsilon in the denominator
// avoids division by zero.
static float iou(float lbox[4], float rbox[4]) {
    const float left = std::max(lbox[0], rbox[0]);
    const float right = std::min(lbox[2], rbox[2]);
    const float top = std::max(lbox[1], rbox[1]);
    const float bottom = std::min(lbox[3], rbox[3]);

    if (top > bottom || left > right) {
        return 0.0f;
    }

    const float overlap = (right - left) * (bottom - top);
    return overlap / ((lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - overlap + 0.000001f);
}

// Sort predicate: orders detections by descending class confidence.
static bool cmp(const decodeplugin::Detection& a, const decodeplugin::Detection& b) {
    const float conf_a = a.class_confidence;
    const float conf_b = b.class_confidence;
    return conf_a > conf_b;
}

// Non-maximum suppression over the raw plugin output.
//
// output[0] is the number of records; each record is 15 floats starting at
// output[1], copied bytewise into a Detection. Records whose fifth float
// (the confidence) is <= 0.1 are dropped. The rest are sorted by descending
// confidence and appended to `res`, skipping every box whose IoU with an
// already kept box exceeds nms_thresh.
static inline void nms(std::vector<decodeplugin::Detection>& res, float *output, float nms_thresh = 0.4) {
    std::vector<decodeplugin::Detection> candidates;
    for (int i = 0; i < output[0]; i++) {
        float* record = &output[15 * i + 1];
        if (record[4] <= 0.1) continue;
        decodeplugin::Detection det;
        memcpy(&det, record, sizeof(decodeplugin::Detection));
        candidates.push_back(det);
    }
    std::sort(candidates.begin(), candidates.end(), cmp);

    // Mark suppressed candidates instead of erasing them from the vector.
    std::vector<bool> suppressed(candidates.size(), false);
    for (size_t m = 0; m < candidates.size(); ++m) {
        if (suppressed[m]) continue;
        decodeplugin::Detection& kept = candidates[m];
        res.push_back(kept);
        for (size_t n = m + 1; n < candidates.size(); ++n) {
            if (suppressed[n]) continue;
            if (iou(kept.bbox, candidates[n].bbox) > nms_thresh) {
                suppressed[n] = true;
            }
        }
    }
}

// Load weights from a .wts text file.
// The file starts with the number of blobs, followed by one entry per blob:
//   <name> <size in decimal> <size hexadecimal 32-bit words>
// Every blob is returned as a kFLOAT Weights whose `values` buffer is allocated
// with malloc and is not freed by this function.
static inline std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and size of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. Allocate by element size: sizeof(val) was the size of the
        // pointer and over-allocated every blob on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Returns the weights stored under `key`.
// When the key is missing, prints an error to std::cerr and terminates the
// process with exit(-1).
static inline Weights getWeights(std::map<std::string, Weights>& weightMap, std::string key) {
    std::map<std::string, Weights>::iterator found = weightMap.find(key);
    if (found == weightMap.end()) {
        std::cerr << key << " not existed in weight map, fatal error!!!" << std::endl;
        exit(-1);
    }
    return found->second;
}

// Adds an inference-time batch normalization to `network` as a per-channel
// scale layer.
//
// Reads `<lname>.weight`, `.bias`, `.running_mean` and `.running_var` from
// weightMap and folds them into
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The three buffers are malloc'ed and stored in weightMap under
// `<lname>.scale`, `.shift` and `.power`. Returns the new layer.
static inline IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float *gamma = (float*)weightMap[lname + ".weight"].values;
    float *beta = (float*)weightMap[lname + ".bias"].values;
    float *mean = (float*)weightMap[lname + ".running_mean"].values;
    float *var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    const size_t bytes = sizeof(float) * len;
    float *scale_vals = reinterpret_cast<float*>(malloc(bytes));
    float *shift_vals = reinterpret_cast<float*>(malloc(bytes));
    float *power_vals = reinterpret_cast<float*>(malloc(bytes));

    // Fill all three per-channel arrays in one pass.
    for (int c = 0; c < len; ++c) {
        scale_vals[c] = gamma[c] / sqrt(var[c] + eps);
        shift_vals[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        power_vals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scale_vals, len};
    Weights shift{DataType::kFLOAT, shift_vals, len};
    Weights power{DataType::kFLOAT, power_vals, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn_layer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    return bn_layer;
}

#endif


================================================
FILE: retinaface/decode.cu
================================================
#include "decode.h"
#include "stdio.h"

namespace nvinfer1
{
    // Default constructor; the body is intentionally empty.
    DecodePlugin::DecodePlugin()
    {
    }

    // Destructor; nothing is released here.
    DecodePlugin::~DecodePlugin()
    {
    }

    // Creates the plugin at runtime from a serialized byte stream.
    // `data` and `length` are not read by this constructor.
    DecodePlugin::DecodePlugin(const void* data, size_t length)
    {
    }

    // Writes nothing to `buffer`.
    void DecodePlugin::serialize(void* buffer) const TRT_NOEXCEPT
    {
    }

    // Reports a serialized size of 0 bytes.
    size_t DecodePlugin::getSerializationSize() const TRT_NOEXCEPT
    {
        return 0;
    }

    // Performs no work and always returns 0.
    int DecodePlugin::initialize() TRT_NOEXCEPT
    { 
        return 0;
    }

    // Returns the output shape (N, 1, 1), where N is one leading float for the
    // detection count plus room for two Detection records per grid cell at
    // strides 8, 16 and 32. The arguments are not used.
    Dims DecodePlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT
    {
        const int strides[3] = {8, 16, 32};

        int totalCount = 0;
        for (int level = 0; level < 3; ++level) {
            const int stride = strides[level];
            totalCount += decodeplugin::INPUT_H / stride * decodeplugin::INPUT_W / stride * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
        }

        return Dims3(totalCount + 1, 1, 1);
    }

    // Sets the plugin namespace. Only the pointer is stored, the string is not copied.
    void DecodePlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT
    {
        mPluginNamespace = pluginNamespace;
    }

    // Returns the pointer last passed to setPluginNamespace().
    const char* DecodePlugin::getPluginNamespace() const TRT_NOEXCEPT
    {
        return mPluginNamespace;
    }

    // Returns the DataType of the plugin output at the requested index.
    // The result is always kFLOAT; the arguments are not used.
    DataType DecodePlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT
    {
        return DataType::kFLOAT;
    }

    // Returns true if the output tensor is broadcast across a batch.
    // This plugin always returns false.
    bool DecodePlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT
    {
        return false;
    }

    // Returns true if the plugin can use an input that is broadcast across the
    // batch without replication. This plugin always returns false.
    bool DecodePlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT
    {
        return false;
    }

    // Configuration hook; this plugin ignores the tensor descriptions.
    void DecodePlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT
    {
    }

    // Attaches the plugin object to an execution context and grants the plugin
    // access to some context resources. This plugin uses none of them.
    void DecodePlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT
    {
    }

    // Detaches the plugin object from its execution context; nothing to undo here.
    void DecodePlugin::detachFromContext() TRT_NOEXCEPT {}

    // Returns the plugin type name, "Decode_TRT".
    const char* DecodePlugin::getPluginType() const TRT_NOEXCEPT
    {
        return "Decode_TRT";
    }

    // Returns the plugin version string, "1".
    const char* DecodePlugin::getPluginVersion() const TRT_NOEXCEPT
    {
        return "1";
    }

    // Deletes this plugin object; the object must not be used afterwards.
    void DecodePlugin::destroy() TRT_NOEXCEPT
    {
        delete this;
    }

    // Clones the plugin: returns a new heap-allocated DecodePlugin that shares
    // this plugin's namespace pointer.
    IPluginV2IOExt* DecodePlugin::clone() const TRT_NOEXCEPT
    {
        DecodePlugin *p = new DecodePlugin();
        p->setPluginNamespace(mPluginNamespace);
        return p;
    }

    // Logistic sigmoid 1 / (1 + exp(-data)); evaluated in double precision and
    // converted to float on return.
    __device__ float Logist(float data)
    {
        const double denominator = 1. + expf(-data);
        return 1. / denominator;
    }

    // CUDA kernel: decodes one feature-map level into Detection records.
    //
    // Each thread handles one grid cell of one image (idx in [0, num_elem)) and
    // evaluates the two anchors (k = 0, 1) of that cell.
    //
    // input       : head outputs of this level; per image the kernel reads 2*4
    //               box planes, then 2*2 class planes, then 2*10 landmark planes,
    //               each plane holding total_grid floats.
    // output      : per image, one float used as a detection counter followed by
    //               packed Detection records; images are output_elem floats apart.
    // num_elem    : number of (image, cell) pairs to process.
    // step        : stride of this level; the grid is INPUT_H/step x INPUT_W/step.
    // anchor      : base anchor size; anchor k uses anchor * (k + 1), divided by
    //               INPUT_W / INPUT_H to obtain the normalized prior size.
    // output_elem : number of floats between consecutive images in `output`.
    __global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor, int output_elem) {

        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= num_elem) return;

        int h = decodeplugin::INPUT_H / step;
        int w = decodeplugin::INPUT_W / step;
        int total_grid = h * w;
        // Split the flat index into the image index and the cell index inside that image.
        int bn_idx = idx / total_grid;
        idx = idx - bn_idx * total_grid;
        int y = idx / w;
        int x = idx % w;
        // Each image owns (4 + 2 + 10) * 2 planes of total_grid floats.
        const float* cur_input = input + bn_idx * (4 + 2 + 10) * 2 * total_grid;
        const float *bbox_reg = &cur_input[0];
        const float *cls_reg = &cur_input[2 * 4 * total_grid];
        const float *lmk_reg = &cur_input[2 * 4 * total_grid + 2 * 2 * total_grid];

        for (int k = 0; k < 2; ++k) {
            // Two-class softmax; conf2 becomes the probability of the second class.
            float conf1 = cls_reg[idx + k * total_grid * 2];
            float conf2 = cls_reg[idx + k * total_grid * 2 + total_grid];
            conf2 = expf(conf2) / (expf(conf1) + expf(conf2));
            if (conf2 <= 0.02) continue;

            // Reserve the next free record slot of this image; the counter is stored as a float.
            float *res_count = output + bn_idx * output_elem;
            int count = (int)atomicAdd(res_count, 1);
            char* data = (char *)res_count + sizeof(float) + count * sizeof(decodeplugin::Detection);
            decodeplugin::Detection* det = (decodeplugin::Detection*)(data);

            // Prior box: normalized cell centre and normalized anchor width/height.
            float prior[4];
            prior[0] = ((float)x + 0.5) / w;
            prior[1] = ((float)y + 0.5) / h;
            prior[2] = (float)anchor * (k + 1) / decodeplugin::INPUT_W;
            prior[3] = (float)anchor * (k + 1) / decodeplugin::INPUT_H;

            //Location
            // Decode centre and size (offset factor 0.1, log-size factor 0.2) ...
            det->bbox[0] = prior[0] + bbox_reg[idx + k * total_grid * 4] * 0.1 * prior[2];
            det->bbox[1] = prior[1] + bbox_reg[idx + k * total_grid * 4 + total_grid] * 0.1 * prior[3];
            det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * total_grid * 4 + total_grid * 2] * 0.2);
            det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * total_grid * 4 + total_grid * 3] * 0.2);
            // ... convert centre/size to the two corners x1, y1, x2, y2 ...
            det->bbox[0] -= det->bbox[2] / 2;
            det->bbox[1] -= det->bbox[3] / 2;
            det->bbox[2] += det->bbox[0];
            det->bbox[3] += det->bbox[1];
            // ... and scale from normalized to network-input coordinates.
            det->bbox[0] *= decodeplugin::INPUT_W;
            det->bbox[1] *= decodeplugin::INPUT_H;
            det->bbox[2] *= decodeplugin::INPUT_W;
            det->bbox[3] *= decodeplugin::INPUT_H;
            det->class_confidence = conf2;
            // Five landmarks, each decoded as an (x, y) offset from the prior centre.
            for (int i = 0; i < 10; i += 2) {
                det->landmark[i] = prior[0] + lmk_reg[idx + k * total_grid * 10 + total_grid * i] * 0.1 * prior[2];
                det->landmark[i+1] = prior[1] + lmk_reg[idx + k * total_grid * 10 + total_grid * (i + 1)] * 0.1 * prior[3];
                det->landmark[i] *= decodeplugin::INPUT_W;
                det->landmark[i+1] *= decodeplugin::INPUT_H;
            }
        }
    }

    // Launches the decode kernel for the three feature-map levels.
    //
    // inputs    : three device pointers, one per level (strides 8, 16, 32).
    // output    : device buffer; per image it holds one counter float followed
    //             by the Detection records.
    // stream    : CUDA stream used for the memsets and the kernel launches.
    // batchSize : number of images in the batch.
    void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize)
    {
        // Floats per image: the counter plus capacity for every anchor of every level.
        int floats_per_image = 1;
        floats_per_image += decodeplugin::INPUT_H / 8 * decodeplugin::INPUT_W / 8 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
        floats_per_image += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
        floats_per_image += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);

        // Reset the detection counter of every image.
        for (int image = 0; image < batchSize; ++image) {
            float* counter = output + image * floats_per_image;
            cudaMemsetAsync(counter, 0, sizeof(float), stream);
        }

        // The stride doubles and the base anchor quadruples from level to level.
        int stride = 8;
        int anchor = 16;
        for (unsigned int level = 0; level < 3; ++level, stride *= 2, anchor *= 4)
        {
            const int cells = batchSize * decodeplugin::INPUT_H / stride * decodeplugin::INPUT_W / stride;
            int threads = thread_count_;
            if (cells < thread_count_) {
                threads = cells;
            }
            const int blocks = (cells + threads - 1) / threads;
            CalDetection<<< blocks, threads, 0, stream>>>
                (inputs[level], output, cells, stride, anchor, floats_per_image);
        }
    }

    // Queues the decode work on `stream` and returns 0.
    // `inputs` is treated as an array of float device pointers and outputs[0]
    // as the float output buffer; `workspace` is not used.
    int DecodePlugin::enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT
    {
        //GPU
        //CUDA_CHECK(cudaStreamSynchronize(stream));
        forwardGpu((const float *const *)inputs, (float *)outputs[0], stream, batchSize);
        return 0;
    };

    // Definitions of the creator's static members: the field collection handed
    // out by getFieldNames() and the attribute list backing it.
    PluginFieldCollection DecodePluginCreator::mFC{};
    std::vector<PluginField> DecodePluginCreator::mPluginAttributes;

    // Resets the shared attribute list to empty and points the field collection
    // at it, so the plugin advertises no creation fields.
    DecodePluginCreator::DecodePluginCreator()
    {
        mPluginAttributes.clear();

        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    // Returns the plugin name, "Decode_TRT" (same string as DecodePlugin::getPluginType()).
    const char* DecodePluginCreator::getPluginName() const TRT_NOEXCEPT
    {
        return "Decode_TRT";
    }

    // Returns the plugin version string, "1".
    const char* DecodePluginCreator::getPluginVersion() const TRT_NOEXCEPT
    {
        return "1";
    }

    // Returns the static field collection shared by all creator instances.
    const PluginFieldCollection* DecodePluginCreator::getFieldNames() TRT_NOEXCEPT
    {
        return &mFC;
    }

    // Creates a new DecodePlugin in the creator's namespace.
    // `name` and `fc` are not used. The plugin stores the pointer returned by
    // mNamespace.c_str(), so it refers to the creator's string.
    IPluginV2IOExt* DecodePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT
    {
        DecodePlugin* obj = new DecodePlugin();
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    // Recreates a DecodePlugin from serialized data and assigns it the
    // creator's namespace. `name` is not used.
    IPluginV2IOExt* DecodePluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT
    {
        // The returned object is heap-allocated; it is released through
        // DecodePlugin::destroy().
        DecodePlugin* obj = new DecodePlugin(serialData, serialLength);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

}


================================================
FILE: retinaface/decode.h
================================================
#ifndef _DECODE_CU_H
#define _DECODE_CU_H

#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"

// Shared data layout and input size used by the decode plugin and the host code.
namespace decodeplugin
{
    // One decoded face: 15 consecutive floats. The field order matters because
    // records are written by the CUDA kernel and copied bytewise on the host.
    struct alignas(float) Detection{
        float bbox[4];  //x1 y1 x2 y2
        float class_confidence;  // confidence score of the detection
        float landmark[10];      // five (x, y) landmark pairs
    };
    // Network input size.
    static const int INPUT_H = 480;
    static const int INPUT_W = 640;
}

namespace nvinfer1
{
    // TensorRT plugin that decodes the detection head outputs into a packed
    // list of decodeplugin::Detection records. It has one output and accepts
    // only linear-format float tensors.
    class DecodePlugin: public IPluginV2IOExt
    {
        public:
            DecodePlugin();
            // Deserializing constructor (see DecodePluginCreator::deserializePlugin).
            DecodePlugin(const void* data, size_t length);

            ~DecodePlugin();

            // The plugin produces a single output tensor.
            int getNbOutputs() const TRT_NOEXCEPT override
            {
                return 1;
            }

            Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

            int initialize() TRT_NOEXCEPT override;

            virtual void terminate() TRT_NOEXCEPT override {};

            // No scratch memory is requested.
            virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0;}

            virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

            virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

            virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

            // Every input and output must be a linear-format float tensor.
            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const TRT_NOEXCEPT override;

            const char* getPluginVersion() const TRT_NOEXCEPT override;

            void destroy() TRT_NOEXCEPT override;

            IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

            void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

            const char* getPluginNamespace() const TRT_NOEXCEPT override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;

            bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;

            bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

            void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;

            void detachFromContext() TRT_NOEXCEPT override;

            // Not assigned by any constructor; initialized so reads are well defined.
            int input_size_ = 0;
        private:
            // Launches the decode kernels for all feature-map levels on `stream`.
            void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1);
            // Upper bound on the threads per block used for kernel launches.
            int thread_count_ = 256;
            // Non-owning pointer set by setPluginNamespace(). Defaults to an empty
            // string so clone() and getPluginNamespace() never read an
            // indeterminate pointer when no namespace was set.
            const char* mPluginNamespace = "";
    };

    // Factory that lets TensorRT create and deserialize DecodePlugin instances.
    class DecodePluginCreator : public IPluginCreator
    {
        public:
            DecodePluginCreator();

            ~DecodePluginCreator() override = default;

            const char* getPluginName() const TRT_NOEXCEPT override;

            const char* getPluginVersion() const TRT_NOEXCEPT override;

            const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

            IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;

            IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;

            // Stores a copy of the namespace string.
            void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override
            {
                mNamespace = libNamespace;
            }

            // Returns the stored namespace; the pointer stays valid until the
            // namespace is changed or the creator is destroyed.
            const char* getPluginNamespace() const TRT_NOEXCEPT override
            {
                return mNamespace.c_str();
            }

        private:
            std::string mNamespace;                            // namespace given to created plugins
            static PluginFieldCollection mFC;                  // returned by getFieldNames()
            static std::vector<PluginField> mPluginAttributes; // storage referenced by mFC
    };
    // Registers DecodePluginCreator with the TensorRT plugin registry.
    REGISTER_TENSORRT_PLUGIN(DecodePluginCreator);
};

#endif 


================================================
FILE: retinaface/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"


using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its accumulated text to an output stream.
//!
//! Text written through an std::ostream attached to this buffer is collected until the buffer is
//! synchronized (e.g. by std::flush / std::endl) or destroyed. At that point, if logging is enabled,
//! a "[MM/DD/YYYY-HH:MM:SS] " timestamp, the configured prefix and the collected text are written to
//! the destination stream, which is then flushed.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //!
    //! \param stream    Destination stream; only a reference is kept, so it must outlive this buffer.
    //! \param prefix    Text emitted in front of every flushed message (after the timestamp).
    //! \param shouldLog When false, putOutput() writes nothing.
    //!
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //!
    //! \brief Creates a buffer with the same destination, prefix and enable flag as \p other.
    //!
    //! All three members are carried over; leaving mShouldLog uninitialized would make every later
    //! putOutput() read an indeterminate value. Pending text in \p other is not transferred.
    //!
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the two differ there is text that has not been emitted yet, so emit it now
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    //!
    //! \brief Synchronizes the stream buffer by emitting the buffered text.
    //! \return Always 0 (success).
    //!
    int sync() override
    {
        putOutput();
        return 0;
    }

    //!
    //! \brief Writes timestamp, prefix and buffered text to the destination stream, then clears the buffer.
    //!
    //! Does nothing (and keeps the buffered text) when logging is disabled.
    //!
    void putOutput()
    {
        if (mShouldLog)
        {
            // Prepend the timestamp on the same stream as the message so that the two cannot be
            // separated when the destination is not std::cout.
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            // std::localtime may return a null pointer on failure; in that case skip the timestamp.
            if (tm_local != nullptr)
            {
                // std::put_time leaves the destination stream's fill character and width untouched.
                mOutput << "[" << std::put_time(tm_local, "%m/%d/%Y-%H:%M:%S") << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents preceded by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! \brief Enables or disables emission of buffered text.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; //!< Destination stream (not owned)
    std::string mPrefix;   //!< Text placed in front of each message
    bool mShouldLog;       //!< Whether putOutput() emits anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
public:
    //! \brief Constructs the contained buffer, passing all three arguments through unchanged.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }

protected:
    //! Buffer made available to derived classes.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Builds a stream that logs at \p severity.
    //!  The message is emitted only when \p severity compares less than or equal to
    //!  \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // the base-class buffer is already constructed at this point
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Creates a fresh consumer with the same severity and enable flag as \p other.
    //!  A new buffer is constructed from those settings; \p other's buffer is not reused.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // the base-class buffer is already constructed at this point
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! \brief Picks std::cout for kINFO and above (by enum value), std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter tag.
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //! \brief Creates a logger whose reportable severity defaults to kWARNING.
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief State of a test as reported through reportTestStart()/reportTestEnd()
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Accessor for the nvinfer1::ILogger view of this object
    //! \return A reference to this Logger, typed as nvinfer1::ILogger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief nvinfer1::ILogger::log() override: writes "[TRT] " followed by the message and a newline
    //!
    //! The message goes through a LogStreamConsumer built from the current reportable severity and
    //! the message severity, which decides whether anything is actually emitted.
    //! Samples should not call this directly; it is meant to go away together with the inheritance
    //! from nvinfer1::ILogger.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        LogStreamConsumer consumer(mReportableSeverity, severity);
        consumer << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Sets the reportable severity used for subsequently logged messages
    //!
    //! \param severity The new reportable severity threshold.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque per-test record: a started flag, the test name and its command line
    //!
    //! Instances can only be created by Logger (the constructor is private and Logger is a friend);
    //! obtain one from Logger::defineTest() and pass it to Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Creates the handle for a test that has not started yet
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        const bool alreadyStarted = false;
        return TestAtom(alreadyStarted, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() taking the raw (argc, argv) pair; the arguments are joined
    //!        with single spaces to form the command line
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Prints the RUNNING line for a test and marks it as started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Prints the final result line for a test.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED for the test and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED for the test and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED for the test and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        if (pass)
        {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current reportable severity.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief Maps a severity to its bracketed one-letter tag
    //!
    static const char* severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    //!
    //! \brief Maps a test result to the word printed in the result line
    //!
    static const char* testResultString(TestResult result)
    {
        const char* word = "";
        switch (result)
        {
        case TestResult::kRUNNING: word = "RUNNING"; break;
        case TestResult::kPASSED: word = "PASSED"; break;
        case TestResult::kFAILED: word = "FAILED"; break;
        case TestResult::kWAIVED: word = "WAIVED"; break;
        default: assert(0); break;
        }
        return word;
    }

    //!
    //! \brief Picks std::cout for kINFO and above (by enum value), std::cerr otherwise
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief Prints one "&&&& <RESULT> <name> # <cmdline>" line on the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief Joins the argv entries with single spaces (empty string when argc is 0 or less)
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream joined;
        const char* separator = "";
        for (int idx = 0; idx < argc; ++idx)
        {
            joined << separator << argv[idx];
            separator = " ";
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief Returns a LogStreamConsumer for messages of severity kVERBOSE, configured with the
//!        logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Returns a LogStreamConsumer for messages of severity kINFO, configured with the
//!        logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Returns a LogStreamConsumer for messages of severity kWARNING, configured with the
//!        logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Returns a LogStreamConsumer for messages of severity kERROR, configured with the
//!        logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Returns a LogStreamConsumer for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        configured with the logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: retinaface/macros.h
================================================
// Compatibility macros selected by the TensorRT major version.
#ifndef __MACROS_H
#define __MACROS_H

// NV_TENSORRT_MAJOR is not defined in this header. If it is still undefined when this line is
// reached, the preprocessor evaluates it as 0 and the empty (pre-8) definitions are selected.
// NOTE(review): presumably a TensorRT header that defines it must be included before this file --
// confirm the include order at every use site.
#if NV_TENSORRT_MAJOR >= 8
// TensorRT 8 and later: TRT_NOEXCEPT expands to `noexcept`, TRT_CONST_ENQUEUE to `const`.
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
// Earlier versions: both macros expand to nothing.
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: retinaface/retina_mnet.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"
#include "calibrator.h"

// ---- Build / run configuration ----
// Precision selection: createEngine() sets BuilderFlag::kFP16 when USE_FP16 is defined and takes
// the INT8 calibration path when USE_INT8 is defined instead.
#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
// Passed to cudaSetDevice() in main().
#define DEVICE 0  // GPU id
// Batch size handed to APIToModel() and used to size the host input array in main().
#define BATCH_SIZE 1
// NOTE(review): the two thresholds below are not referenced in the part of the file reviewed
// here; presumably they are used by detection post-processing -- confirm.
#define CONF_THRESH 0.75
#define IOU_THRESH 0.4

// Network input size and the names / sizes of the input and output blobs.
static const int INPUT_H = decodeplugin::INPUT_H;  // H and W must be divisible by 32.
static const int INPUT_W = decodeplugin::INPUT_W;;
// Number of floats per batch item in the output blob: the summed cell counts of the /8, /16 and
// /32 feature maps, times 2 * 15, plus 1.
// NOTE(review): presumably 2 anchors per cell, 15 floats per detection and one leading count
// value -- confirm against the decode plugin.
static const int OUTPUT_SIZE = (INPUT_H / 8 * INPUT_W / 8 + INPUT_H / 16 * INPUT_W / 16 + INPUT_H / 32 * INPUT_W / 32) * 2  * 15 + 1;
// Tensor names used both when building the network and when looking up binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

// Logger instance passed to createInferBuilder().
static Logger gLogger;

// Adds a 3x3 convolution (stride s, padding 1, empty bias weights) followed by batch
// normalization and a leaky ReLU with slope `leaky`.
//   network   : network definition the layers are appended to
//   weightMap : weights looked up under lname + ".0.weight" (conv) and lname + ".1" (batch norm)
//   input     : input tensor
//   oup       : number of output feature maps
// Returns the activation layer. Every layer pointer is asserted before it is dereferenced.
ILayer* conv_bn(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int oup, int s = 1, float leaky = 0.1) {
    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, oup, DimsHW{3, 3}, getWeights(weightMap, lname + ".0.weight"), emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{1, 1});
    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".1", 1e-5);
    assert(bn1);
    auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
    // Check the pointer before configuring the layer, not after.
    assert(lr);
    lr->setAlpha(leaky);
    return lr;
}

// Adds a 3x3 convolution (stride s, padding 1, empty bias weights) followed by batch
// normalization, with no activation. Returns the batch-norm layer.
ILayer* conv_bn_no_relu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int oup, int s = 1) {
    const std::string convWeightKey = lname + ".0.weight";
    const std::string bnPrefix = lname + ".1";
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv = network->addConvolutionNd(input, oup, DimsHW{3, 3}, getWeights(weightMap, convWeightKey), noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{1, 1});

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), bnPrefix, 1e-5);
    return bn;
}

// Adds a 1x1 convolution (stride s, no padding, empty bias weights) followed by batch
// normalization and a leaky ReLU with slope `leaky`.
//   weightMap : weights looked up under lname + ".0.weight" (conv) and lname + ".1" (batch norm)
//   oup       : number of output feature maps
// Returns the activation layer. Every layer pointer is asserted before it is dereferenced.
ILayer* conv_bn1X1(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int oup, int s = 1, float leaky = 0.1) {
    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, oup, DimsHW{1, 1}, getWeights(weightMap, lname + ".0.weight"), emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{0, 0});
    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".1", 1e-5);
    assert(bn1);
    auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
    // Check the pointer before configuring the layer, not after.
    assert(lr);
    lr->setAlpha(leaky);
    return lr;
}

// Adds a depthwise-separable convolution block:
//   1. 3x3 convolution with `inp` output maps and `inp` groups (stride s, padding 1),
//      batch norm (lname + ".1"), leaky ReLU
//   2. 1x1 convolution with `oup` output maps, batch norm (lname + ".4"), leaky ReLU
// Convolution weights are looked up under lname + ".0.weight" and lname + ".3.weight"; both
// convolutions are given empty bias weights.
// Returns the second activation layer. Every layer pointer is asserted before it is dereferenced.
ILayer* conv_dw(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int inp, int oup, int s = 1, float leaky = 0.1) {
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // Depthwise stage.
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, inp, DimsHW{3, 3}, getWeights(weightMap, lname + ".0.weight"), emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{1, 1});
    conv1->setNbGroups(inp);
    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".1", 1e-5);
    assert(bn1);
    auto lr1 = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
    // Check the pointer before configuring the layer, not after.
    assert(lr1);
    lr1->setAlpha(leaky);

    // Pointwise stage.
    IConvolutionLayer* conv2 = network->addConvolutionNd(*lr1->getOutput(0), oup, DimsHW{1, 1}, getWeights(weightMap, lname + ".3.weight"), emptywts);
    assert(conv2);
    IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".4", 1e-5);
    assert(bn2);
    auto lr2 = network->addActivation(*bn2->getOutput(0), ActivationType::kLEAKY_RELU);
    assert(lr2);
    lr2->setAlpha(leaky);
    return lr2;
}

// Builds a three-branch block on `input` and returns the ReLU applied to the concatenated
// branch outputs:
//   - branch A: one conv+bn with oup / 2 maps
//   - branch B: conv+bn+leaky-relu, then conv+bn, oup / 4 maps each
//   - branch C: shares branch B's first layer, then conv+bn+leaky-relu and conv+bn, oup / 4 maps each
// Weight keys are lname plus the suffixes spelled out below (note the lowercase "x" in the last one).
IActivationLayer* ssh(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int oup) {
    const int halfMaps = oup / 2;
    const int quarterMaps = oup / 4;

    ILayer* branchA = conv_bn_no_relu(network, weightMap, input, lname + ".conv3X3", halfMaps);

    ILayer* sharedStem = conv_bn(network, weightMap, input, lname + ".conv5X5_1", quarterMaps);
    ILayer* branchB = conv_bn_no_relu(network, weightMap, *sharedStem->getOutput(0), lname + ".conv5X5_2", quarterMaps);

    ILayer* branchCMid = conv_bn(network, weightMap, *sharedStem->getOutput(0), lname + ".conv7X7_2", quarterMaps);
    ILayer* branchC = conv_bn_no_relu(network, weightMap, *branchCMid->getOutput(0), lname + ".conv7x7_3", quarterMaps);

    ITensor* branchOutputs[] = {branchA->getOutput(0), branchB->getOutput(0), branchC->getOutput(0)};
    auto merged = network->addConcatenation(branchOutputs, 3);

    IActivationLayer* activated = network->addActivation(*merged->getOutput(0), ActivationType::kRELU);
    assert(activated);
    return activated;
}

// Creates the engine using only the network-definition API (no parser).
//
//   maxBatchSize : forwarded to IBuilder::setMaxBatchSize()
//   builder      : builder used to create the network and build the engine
//   config       : builder configuration; workspace size and precision flags are set here
//   dt           : data type of the input tensor
//
// Weights are loaded from "../retinaface.wts" and released before returning.
// Returns the built engine, or nullptr if the build failed (the caller must check).
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../retinaface.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // ------------- backbone mobilenet0.25  ---------------
    // stage 1
    auto x = conv_bn(network, weightMap, *data, "body.stage1.0", 8, 2);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.1", 8, 16);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.2", 16, 32, 2);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.3", 32, 32);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.4", 32, 64, 2);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.5", 64, 64);
    auto stage1 = x;

    // stage 2
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.0", 64, 128, 2);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.1", 128, 128);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.2", 128, 128);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.3", 128, 128);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.4", 128, 128);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.5", 128, 128);
    auto stage2 = x;

    // stage 3
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage3.0", 128, 256, 2);
    x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage3.1", 256, 256);
    auto stage3 = x;

    //Dims d1 = stage1->getOutput(0)->getDimensions();
    //std::cout << d1.d[0] << " " << d1.d[1] << " " << d1.d[2] << std::endl;
    // ------------- FPN ---------------
    auto output1 = conv_bn1X1(network, weightMap, *stage1->getOutput(0), "fpn.output1", 64);
    auto output2 = conv_bn1X1(network, weightMap, *stage2->getOutput(0), "fpn.output2", 64);
    auto output3 = conv_bn1X1(network, weightMap, *stage3->getOutput(0), "fpn.output3", 64);

    // 2x upsampling is expressed as a 64-group, 2x2, stride-2 deconvolution whose weights are all
    // 1.0, so every input value is copied into a 2x2 block of its own channel.
    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 64 * 2 * 2));
    assert(deval);
    for (int i = 0; i < 64 * 2 * 2; i++) {
        deval[i] = 1.0;
    }
    Weights deconvwts{DataType::kFLOAT, deval, 64 * 2 * 2};
    IDeconvolutionLayer* up3 = network->addDeconvolutionNd(*output3->getOutput(0), 64, DimsHW{2, 2}, deconvwts, emptywts);
    assert(up3);
    up3->setStrideNd(DimsHW{2, 2});
    up3->setNbGroups(64);
    // Register the buffer in weightMap so that the release loop at the end frees it (once).
    weightMap["up3"] = deconvwts;

    output2 = network->addElementWise(*output2->getOutput(0), *up3->getOutput(0), ElementWiseOperation::kSUM);
    output2 = conv_bn(network, weightMap, *output2->getOutput(0), "fpn.merge2", 64);

    // The same all-ones weights are reused for the second upsampling step.
    IDeconvolutionLayer* up2 = network->addDeconvolutionNd(*output2->getOutput(0), 64, DimsHW{2, 2}, deconvwts, emptywts);
    assert(up2);
    up2->setStrideNd(DimsHW{2, 2});
    up2->setNbGroups(64);
    output1 = network->addElementWise(*output1->getOutput(0), *up2->getOutput(0), ElementWiseOperation::kSUM);
    output1 = conv_bn(network, weightMap, *output1->getOutput(0), "fpn.merge1", 64);

    // ------------- SSH ---------------
    auto ssh1 = ssh(network, weightMap, *output1->getOutput(0), "ssh1", 64);
    auto ssh2 = ssh(network, weightMap, *output2->getOutput(0), "ssh2", 64);
    auto ssh3 = ssh(network, weightMap, *output3->getOutput(0), "ssh3", 64);

    //// ------------- Head ---------------
    // One 1x1 convolution per pyramid level and per output kind: 2 * 4 maps for boxes,
    // 2 * 2 for class scores and 2 * 10 for landmarks.
    auto bbox_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.0.conv1x1.weight"], weightMap["BboxHead.0.conv1x1.bias"]);
    auto bbox_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.1.conv1x1.weight"], weightMap["BboxHead.1.conv1x1.bias"]);
    auto bbox_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.2.conv1x1.weight"], weightMap["BboxHead.2.conv1x1.bias"]);

    auto cls_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.0.conv1x1.weight"], weightMap["ClassHead.0.conv1x1.bias"]);
    auto cls_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.1.conv1x1.weight"], weightMap["ClassHead.1.conv1x1.bias"]);
    auto cls_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.2.conv1x1.weight"], weightMap["ClassHead.2.conv1x1.bias"]);

    auto lmk_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.0.conv1x1.weight"], weightMap["LandmarkHead.0.conv1x1.bias"]);
    auto lmk_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.1.conv1x1.weight"], weightMap["LandmarkHead.1.conv1x1.bias"]);
    auto lmk_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.2.conv1x1.weight"], weightMap["LandmarkHead.2.conv1x1.bias"]);

    //// ------------- Decode bbox, conf, landmark ---------------
    ITensor* inputTensors1[] = {bbox_head1->getOutput(0), cls_head1->getOutput(0), lmk_head1->getOutput(0)};
    auto cat1 = network->addConcatenation(inputTensors1, 3);
    ITensor* inputTensors2[] = {bbox_head2->getOutput(0), cls_head2->getOutput(0), lmk_head2->getOutput(0)};
    auto cat2 = network->addConcatenation(inputTensors2, 3);
    ITensor* inputTensors3[] = {bbox_head3->getOutput(0), cls_head3->getOutput(0), lmk_head3->getOutput(0)};
    auto cat3 = network->addConcatenation(inputTensors3, 3);

    auto creator = getPluginRegistry()->getPluginCreator("Decode_TRT", "1");
    // The registry returns a null pointer when no matching creator is registered.
    assert(creator);
    // Value-initialize so that the creator never sees an indeterminate field count or pointer.
    PluginFieldCollection pfc{};
    IPluginV2 *pluginObj = creator->createPlugin("decode", &pfc);
    assert(pluginObj);
    ITensor* inputTensors[] = {cat1->getOutput(0), cat2->getOutput(0), cat3->getOutput(0)};
    auto decodelayer = network->addPluginV2(inputTensors, 3, *pluginObj);
    assert(decodelayer);

    decodelayer->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*decodelayer->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << builder->platformHasFastInt8() << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./widerface_calib/", "mnet_int8calib.table", INPUT_BLOB_NAME);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only claim success when an engine was actually produced.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory.
    // NOTE(review): this assumes every buffer stored in weightMap was obtained from malloc(),
    // which holds for `deval` above -- confirm for the buffers created by loadWeights().
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
        mem.second.values = NULL;
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference pass.
//
//   context   : execution context of an engine with exactly two bindings
//               (INPUT_BLOB_NAME and OUTPUT_BLOB_NAME)
//   input     : host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats
//   output    : host buffer receiving batchSize * OUTPUT_SIZE floats
//   batchSize : number of batch items to process
//
// Device buffers and the CUDA stream are created and released inside this call. Every CUDA
// runtime call goes through CHECK, and a failed enqueue() is reported on std::cerr.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Byte counts, computed once and in size_t so the products are not evaluated in int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // Do not swallow the failure silently; the output buffer contents are not meaningful.
        std::cerr << "doInference: enqueue failed" << std::endl;
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv) {
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./retina_mnet -s   // serialize model to plan file" << std::endl;
        std::cerr << "./retina_mnet -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("retina_mnet.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 1;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file("retina_mnet.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        return -1;
    }

    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;

    cv::Mat img = cv::imread("worlds-largest-selfie.jpg");
    cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H);
    //cv::imwrite("preprocessed.jpg", pr_img);

    // For multi-batch, I feed the same image multiple times.
    // If you want to process different images in a batch, you need adapt it.
    for (int b = 0; b < BATCH_SIZE; b++) {
        float *p_data = &data[b * 3 * INPUT_H * INPUT_W];
        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
            p_data[i] = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
            p_data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
            p_data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
        }
    }

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    //ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    // Run inference
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    auto start = std::chrono::system_clock::now();
    doInference(*context, data, prob, BATCH_SIZE);
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl;

    for (int b = 0; b < BATCH_SIZE; b++) {
        std::vector<decodeplugin::Detection> res;
        nms(res, &prob[b * OUTPUT_SIZE], IOU_THRESH);
        std::cout << "number of detections -> " << prob[b * OUTPUT_SIZE] << std::endl;
        std::cout << " -> " << prob[b * OUTPUT_SIZE + 10] << std::endl;
        std::cout << "after nms -> " << res.size() << std::endl;
        cv::Mat tmp = img.clone();
        for (size_t j = 0; j < res.size(); j++) {
            if (res[j].class_confidence < CONF_THRESH) continue;
            cv::Rect r = get_rect_adapt_landmark(tmp, INPUT_W, INPUT_H, res[j].bbox, res[j].landmark);
            cv::rectangle(tmp, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            //cv::putText(tmp, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
            for (int k = 0; k < 10; k += 2) {
                cv::circle(tmp, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
            }
        }
        cv::imwrite(std::to_string(b) + "_result.jpg", tmp);
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << i / 10 << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: retinaface/retina_r50.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"
#include "calibrator.h"

// Build-time switches for this sample.
#define USE_INT8  // set USE_INT8 or USE_FP16 or USE_FP32
#define DEVICE 0  // GPU id
#define BATCH_SIZE 1
#define CONF_THRESH 0.75
#define IOU_THRESH 0.4

// stuff we know about the network and the input/output blobs
static const int INPUT_H = decodeplugin::INPUT_H;  // H and W must be divisible by 32.
static const int INPUT_W = decodeplugin::INPUT_W;
// Sum of the (H/s) x (W/s) grid sizes for s = 8, 16, 32, times 2 * 15, plus one leading float.
// main() prints that leading float as the number of detections; the factors 2 and 15 are
// presumably anchors per cell and floats per detection -- TODO confirm against the decode plugin.
static const int OUTPUT_SIZE = (INPUT_H / 8 * INPUT_W / 8 + INPUT_H / 16 * INPUT_W / 16 + INPUT_H / 32 * INPUT_W / 32) * 2  * 15 + 1;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

static Logger gLogger;

// Adds one three-convolution residual block (1x1 -> 3x3 -> 1x1, each followed by batch norm)
// to `network` and returns the final ReLU layer.
//
// network   : network definition being populated.
// weightMap : weights keyed by name; keys are built as lname + "<layer>.weight".
// input     : tensor feeding the block.
// inch      : channel count of `input`, used only to decide whether the shortcut needs a projection.
// outch     : channel count of the first two convolutions; the block outputs outch * 4 channels.
// stride    : stride of the 3x3 convolution and of the shortcut projection.
// lname     : key prefix, expected to end with '.' (e.g. "body.layer1.0.").
IActivationLayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // Weights are fetched through getWeights(), the same helper conv_bn_relu() uses, instead of
    // map::operator[], which would silently insert an empty entry for a misspelled key.
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, getWeights(weightMap, lname + "conv1.weight"), emptywts);
    assert(conv1);

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5);

    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, getWeights(weightMap, lname + "conv2.weight"), emptywts);
    assert(conv2);
    conv2->setStrideNd(DimsHW{stride, stride});
    conv2->setPaddingNd(DimsHW{1, 1});

    IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);

    IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(relu2);

    IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch * 4, DimsHW{1, 1}, getWeights(weightMap, lname + "conv3.weight"), emptywts);
    assert(conv3);

    IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5);

    IElementWiseLayer* ew1;
    if (stride != 1 || inch != outch * 4) {
        // The shortcut must be projected when the spatial size or the channel count changes.
        IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch * 4, DimsHW{1, 1}, getWeights(weightMap, lname + "downsample.0.weight"), emptywts);
        assert(conv4);
        conv4->setStrideNd(DimsHW{stride, stride});

        IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5);
        ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
    } else {
        ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM);
    }
    assert(ew1);
    IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU);
    assert(relu3);
    return relu3;
}

// Adds a square convolution followed by batch norm, and optionally a ReLU, to `network`.
//
// Weights are looked up under lname + ".0.weight" for the convolution and lname + ".1" for the
// batch norm. Returns the ReLU layer when `userelu` is true, otherwise the batch-norm layer.
ILayer* conv_bn_relu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int kernelsize, int stride, int padding, bool userelu, std::string lname) {
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{kernelsize, kernelsize}, getWeights(weightMap, lname + ".0.weight"), noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{stride, stride});
    conv->setPaddingNd(DimsHW{padding, padding});

    // `tail` always points at the last layer added so far.
    ILayer* tail = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".1", 1e-5);
    if (userelu) {
        IActivationLayer* relu = network->addActivation(*tail->getOutput(0), ActivationType::kRELU);
        assert(relu);
        tail = relu;
    }
    return tail;
}

// Adds a three-branch context block on top of `input` and returns its final ReLU.
//
// Every branch is built from 3x3 conv_bn_relu() units: one unit for the first branch, two
// chained units for the second and three chained units for the third. The second and third
// branches share their first unit. Branch outputs (128 + 64 + 64 channels) are concatenated
// and passed through a ReLU.
IActivationLayer* ssh(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    ILayer* branch3 = conv_bn_relu(network, weightMap, input, 256 / 2, 3, 1, 1, false, lname + ".conv3X3");

    ILayer* sharedStem = conv_bn_relu(network, weightMap, input, 256 / 4, 3, 1, 1, true, lname + ".conv5X5_1");
    ILayer* branch5 = conv_bn_relu(network, weightMap, *sharedStem->getOutput(0), 256 / 4, 3, 1, 1, false, lname + ".conv5X5_2");

    ILayer* stem7 = conv_bn_relu(network, weightMap, *sharedStem->getOutput(0), 256 / 4, 3, 1, 1, true, lname + ".conv7X7_2");
    // NOTE(review): this key uses a lowercase 'x' unlike its siblings; it is kept verbatim because
    // it presumably mirrors the name stored in the weight file -- verify before normalizing.
    ILayer* branch7 = conv_bn_relu(network, weightMap, *stem7->getOutput(0), 256 / 4, 3, 1, 1, false, lname + ".conv7x7_3");

    ITensor* branches[] = {branch3->getOutput(0), branch5->getOutput(0), branch7->getOutput(0)};
    auto merged = network->addConcatenation(branches, 3);

    IActivationLayer* out = network->addActivation(*merged->getOutput(0), ActivationType::kRELU);
    assert(out);
    return out;
}

// Create the engine using only the API and not any parser.
//
// Builds the network (ResNet-50 style backbone, FPN, SSH blocks, detection heads and the
// "Decode_TRT" plugin) from "../retinaface.wts" and returns the built engine.
//
// maxBatchSize : forwarded to IBuilder::setMaxBatchSize.
// builder      : builder used to create the network and the engine.
// config       : builder configuration; precision flags are set here according to
//                USE_FP16 / USE_INT8.
// dt           : data type of the input tensor.
//
// Returns nullptr when the build fails. The caller owns the returned engine.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    // Create input tensor with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../retinaface.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // ------------- backbone resnet50 ---------------
    IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["body.conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{2, 2});
    conv1->setPaddingNd(DimsHW{3, 3});

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "body.bn1", 1e-5);

    // Add activation layer using the ReLU algorithm.
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    // Add max pooling layer with kernel size of 3x3, stride of 2x2 and padding of 1x1.
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setPaddingNd(DimsHW{1, 1});

    IActivationLayer* x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "body.layer1.0.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "body.layer1.1.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "body.layer1.2.");

    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 128, 2, "body.layer2.0.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "body.layer2.1.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "body.layer2.2.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "body.layer2.3.");
    IActivationLayer* layer2 = x;

    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 2, "body.layer3.0.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.1.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.2.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.3.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.4.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.5.");
    IActivationLayer* layer3 = x;

    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 2, "body.layer4.0.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "body.layer4.1.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "body.layer4.2.");
    IActivationLayer* layer4 = x;

    // ------------- FPN ---------------
    auto output1 = conv_bn_relu(network, weightMap, *layer2->getOutput(0), 256, 1, 1, 0, true, "fpn.output1");
    auto output2 = conv_bn_relu(network, weightMap, *layer3->getOutput(0), 256, 1, 1, 0, true, "fpn.output2");
    auto output3 = conv_bn_relu(network, weightMap, *layer4->getOutput(0), 256, 1, 1, 0, true, "fpn.output3");

    // 2x upsampling is expressed as a per-channel (256 groups) 2x2 stride-2 deconvolution
    // whose weights are all ones.
    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
    assert(deval);
    for (int i = 0; i < 256 * 2 * 2; i++) {
        deval[i] = 1.0;
    }
    Weights deconvwts{DataType::kFLOAT, deval, 256 * 2 * 2};
    IDeconvolutionLayer* up3 = network->addDeconvolutionNd(*output3->getOutput(0), 256, DimsHW{2, 2}, deconvwts, emptywts);
    assert(up3);
    up3->setStrideNd(DimsHW{2, 2});
    up3->setNbGroups(256);
    // Register the buffer in the map so the release loop at the end frees it too.
    weightMap["up3"] = deconvwts;

    output2 = network->addElementWise(*output2->getOutput(0), *up3->getOutput(0), ElementWiseOperation::kSUM);
    assert(output2);
    output2 = conv_bn_relu(network, weightMap, *output2->getOutput(0), 256, 3, 1, 1, true, "fpn.merge2");

    IDeconvolutionLayer* up2 = network->addDeconvolutionNd(*output2->getOutput(0), 256, DimsHW{2, 2}, deconvwts, emptywts);
    assert(up2);
    up2->setStrideNd(DimsHW{2, 2});
    up2->setNbGroups(256);
    output1 = network->addElementWise(*output1->getOutput(0), *up2->getOutput(0), ElementWiseOperation::kSUM);
    assert(output1);
    output1 = conv_bn_relu(network, weightMap, *output1->getOutput(0), 256, 3, 1, 1, true, "fpn.merge1");

    // ------------- SSH ---------------
    auto ssh1 = ssh(network, weightMap, *output1->getOutput(0), "ssh1");
    auto ssh2 = ssh(network, weightMap, *output2->getOutput(0), "ssh2");
    auto ssh3 = ssh(network, weightMap, *output3->getOutput(0), "ssh3");

    // ------------- Head ---------------
    auto bbox_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.0.conv1x1.weight"], weightMap["BboxHead.0.conv1x1.bias"]);
    auto bbox_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.1.conv1x1.weight"], weightMap["BboxHead.1.conv1x1.bias"]);
    auto bbox_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.2.conv1x1.weight"], weightMap["BboxHead.2.conv1x1.bias"]);
    assert(bbox_head1 && bbox_head2 && bbox_head3);

    auto cls_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.0.conv1x1.weight"], weightMap["ClassHead.0.conv1x1.bias"]);
    auto cls_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.1.conv1x1.weight"], weightMap["ClassHead.1.conv1x1.bias"]);
    auto cls_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.2.conv1x1.weight"], weightMap["ClassHead.2.conv1x1.bias"]);
    assert(cls_head1 && cls_head2 && cls_head3);

    auto lmk_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.0.conv1x1.weight"], weightMap["LandmarkHead.0.conv1x1.bias"]);
    auto lmk_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.1.conv1x1.weight"], weightMap["LandmarkHead.1.conv1x1.bias"]);
    auto lmk_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.2.conv1x1.weight"], weightMap["LandmarkHead.2.conv1x1.bias"]);
    assert(lmk_head1 && lmk_head2 && lmk_head3);

    // ------------- Decode bbox, conf, landmark ---------------
    ITensor* inputTensors1[] = {bbox_head1->getOutput(0), cls_head1->getOutput(0), lmk_head1->getOutput(0)};
    auto cat1 = network->addConcatenation(inputTensors1, 3);
    ITensor* inputTensors2[] = {bbox_head2->getOutput(0), cls_head2->getOutput(0), lmk_head2->getOutput(0)};
    auto cat2 = network->addConcatenation(inputTensors2, 3);
    ITensor* inputTensors3[] = {bbox_head3->getOutput(0), cls_head3->getOutput(0), lmk_head3->getOutput(0)};
    auto cat3 = network->addConcatenation(inputTensors3, 3);
    assert(cat1 && cat2 && cat3);

    auto creator = getPluginRegistry()->getPluginCreator("Decode_TRT", "1");
    assert(creator != nullptr);
    // Value-initialize so the creator never reads an indeterminate field count or pointer.
    PluginFieldCollection pfc{};
    IPluginV2 *pluginObj = creator->createPlugin("decode", &pfc);
    assert(pluginObj != nullptr);
    ITensor* inputTensors[] = {cat1->getOutput(0), cat2->getOutput(0), cat3->getOutput(0)};
    auto decodelayer = network->addPluginV2(inputTensors, 3, *pluginObj);
    assert(decodelayer);

    decodelayer->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*decodelayer->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << builder->platformHasFastInt8() << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./widerface_calib/", "r50_int8calib.table", INPUT_BLOB_NAME);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only report success when an engine was actually produced.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
        mem.second.values = NULL;
    }

    return engine;
}

// Builds the engine through createEngine() and stores its serialized form in *modelStream.
//
// maxBatchSize : forwarded to createEngine().
// modelStream  : out-parameter; receives the serialized engine, which the caller must destroy.
//
// The builder, its config and the engine are destroyed before returning.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    assert(modelStream != nullptr);

    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one blocking inference pass over `batchSize` images.
//
// context   : execution context whose engine must expose exactly two bindings,
//             named INPUT_BLOB_NAME and OUTPUT_BLOB_NAME.
// input     : host buffer with batchSize * 3 * INPUT_H * INPUT_W floats.
// output    : host buffer that receives batchSize * OUTPUT_SIZE floats.
// batchSize : number of images stored in `input`.
//
// Device buffers and the CUDA stream are allocated and released on every call.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Compute the byte counts once, in size_t, so the products cannot overflow int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // Keep going so the stream and buffers below are still released.
        std::cerr << "doInference: enqueue failed, output is not valid" << std::endl;
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Asynchronous failures surface here, so this call must be checked as well.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv) {
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./retina_r50 -s   // serialize model to plan file" << std::endl;
        std::cerr << "./retina_r50 -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("retina_r50.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 1;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file("retina_r50.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        return -1;
    }

    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;

    cv::Mat img = cv::imread("worlds-largest-selfie.jpg");
    cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H);
    //cv::imwrite("preprocessed.jpg", pr_img);

    // For multi-batch, I feed the same image multiple times.
    // If you want to process different images in a batch, you need adapt it.
    for (int b = 0; b < BATCH_SIZE; b++) {
        float *p_data = &data[b * 3 * INPUT_H * INPUT_W];
        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
            p_data[i] = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
            p_data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
            p_data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
        }
    }

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    //ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    // Run inference
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    for (int cc = 0; cc < 1000; cc++) {
    auto start = std::chrono::system_clock::now();
    doInference(*context, data, prob, BATCH_SIZE);
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl;
    }

    for (int b = 0; b < BATCH_SIZE; b++) {
        std::vector<decodeplugin::Detection> res;
        nms(res, &prob[b * OUTPUT_SIZE], IOU_THRESH);
        std::cout << "number of detections -> " << prob[b * OUTPUT_SIZE] << std::endl;
        std::cout << " -> " << prob[b * OUTPUT_SIZE + 10] << std::endl;
        std::cout << "after nms -> " << res.size() << std::endl;
        cv::Mat tmp = img.clone();
        for (size_t j = 0; j < res.size(); j++) {
            if (res[j].class_confidence < CONF_THRESH) continue;
            cv::Rect r = get_rect_adapt_landmark(tmp, INPUT_W, INPUT_H, res[j].bbox, res[j].landmark);
            cv::rectangle(tmp, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            //cv::putText(tmp, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
            for (int k = 0; k < 10; k += 2) {
                cv::circle(tmp, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
            }
        }
        cv::imwrite(std::to_string(b) + "_result.jpg", tmp);
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << i / 10 << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: retinaface/retinaface_trt.py
================================================
"""
Use TensorRT's Python api to make inferences.
"""
# -*- coding: utf-8 -*
import ctypes
import os
import random
import sys
import threading
import time

import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
import torchvision

# Network input size in pixels. The original note says these are defined in decode.h,
# so they presumably have to match the values the engine was built with -- TODO confirm.
INPUT_H = 480  # defined in decode.h
INPUT_W = 640
# Confidence and IoU thresholds; the code that consumes them is not in this part of the file.
CONF_THRESH = 0.75
IOU_THRESHOLD = 0.4
# Print numpy arrays in full instead of summarizing long ones.
np.set_printoptions(threshold=np.inf)

def plot_one_box(x, landmark, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, its five landmark points and an
                 optional label onto img (img is modified in place).

    param:
        x:        a box like [x1,y1,x2,y2]
        landmark: ten values read as five consecutive (x, y) pairs
        img:      an opencv image object
        color:    color of the rectangle, such as (0,255,0); a random
                  color is picked when it is falsy
        label:    str, drawn at the top-left corner of the box when given
        line_thickness: int; derived from the image size when falsy
    return:
        no return

    """
    # line/font thickness
    if line_thickness:
        thickness = line_thickness
    else:
        thickness = round(0.001 * (img.shape[0] + img.shape[1]) / 2) + 1

    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(
        img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA
    )

    # One fixed color per landmark, in the order the points are stored.
    point_colors = (
        (0, 0, 255),
        (0, 255, 255),
        (255, 0, 255),
        (0, 255, 0),
        (255, 0, 0),
    )
    for idx, point_color in enumerate(point_colors):
        center = (int(landmark[2 * idx]), int(landmark[2 * idx + 1]))
        cv2.circle(img, center, 1, point_color, 4)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(
        label, 0, fontScale=font_scale, thickness=font_thickness
    )[0]
    # Filled background box sitting on top of the rectangle's top-left corner.
    label_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, label_corner, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class Retinaface_trt(object):
    """
    description: A Retinaface class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the engine
                     stored in engine_file_path and allocate one page-locked
                     host buffer plus one device buffer per engine binding.
        param:
            engine_file_path: str, path of the serialized engine file
        """
        # Create a Context on this device,
        self.cfx = cuda.Device(0).make_context()
        cuda_stream = cuda.Stream()
        logger = trt.Logger(trt.Logger.INFO)
        trt_runtime = trt.Runtime(logger)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as engine_file:
            trt_engine = trt_runtime.deserialize_cuda_engine(engine_file.read())
        exec_context = trt_engine.create_execution_context()

        in_host, in_device = [], []
        out_host, out_device = [], []
        binding_ptrs = []

        for name in trt_engine:
            n_elems = trt.volume(trt_engine.get_binding_shape(name)) * trt_engine.max_batch_size
            np_dtype = trt.nptype(trt_engine.get_binding_dtype(name))
            # Allocate host and device buffers
            pinned = cuda.pagelocked_empty(n_elems, np_dtype)
            on_device = cuda.mem_alloc(pinned.nbytes)
            # Device addresses are collected in binding order for the execute call.
            binding_ptrs.append(int(on_device))
            # Route the pair to the input or the output lists.
            if trt_engine.binding_is_input(name):
                host_list, device_list = in_host, in_device
            else:
                host_list, device_list = out_host, out_device
            host_list.append(pinned)
            device_list.append(on_device)

        # Store
        self.stream = cuda_stream
        self.context = exec_context
        self.engine = trt_engine
        self.host_inputs = in_host
        self.cuda_inputs = in_device
        self.host_outputs = out_host
        self.cuda_outputs = out_device
        self.bindings = binding_ptrs

    def infer(self, input_image_path):
        threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.

        self.cfx.push()
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        input_image, image_raw, origin_h, origin_w = self.preprocess_image(
            input_image_path
        )
        a = time.time()
        # Copy input image to host buffer
        np.copyto(host_inputs[0], input_image.ravel())
        # Transfer input data  to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        # Remove any context from the top of the context stack, deactivating it.
        self.cfx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]

        # Do postprocess
        result_boxes, result_scores, result_landmark = self.post_process(
            output, origin_h, origin_w
        )
        b = time.time()-a
        print(b)

        # Draw rectangles and labels on the original image

        # Save image
        for i in range(len(result_boxes)):
            box = result_boxes[i]
            landmark = result_landmark[i]
            plot_one_box(
                box,
                landmark,
                image_raw,
                label="{}:{:.2f}".format( 'Face', result_scores[i]))
        parent, filename = os.path.split(input_image_path)
        save_name = os.path.join(parent, "output_" + filename)

        cv2.imwrite(save_name, image_raw)

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.cfx.pop()

    def preprocess_image(self, input_image_path):
        """
        description: Read an image from image path, resize it while keeping the
                     aspect ratio, pad it to (INPUT_H, INPUT_W), subtract the
                     per-channel mean (104, 117, 123) and transform it to
                     NCHW format.
        param:
            input_image_path: str, image path
        return:
            image:  the processed image, float32, laid out as
                    (1, channels, INPUT_H, INPUT_W)
            image_raw: the original image
            h: original height
            w: original width
        raise:
            IOError: if the image cannot be read or decoded
        """
        image_raw = cv2.imread(input_image_path)
        if image_raw is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise IOError("Failed to read image: {}".format(input_image_path))
        h, w = image_raw.shape[:2]

        # Calculate width and height and paddings
        r_w = INPUT_W / w
        r_h = INPUT_H / h
        if r_h > r_w:
            # Width is the limiting side: fill the width, pad top and bottom.
            tw = INPUT_W
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((INPUT_H - th) / 2)
            ty2 = INPUT_H - th - ty1
        else:
            # Height is the limiting side: fill the height, pad left and right.
            tw = int(r_h * w)
            th = INPUT_H
            tx1 = int((INPUT_W - tw) / 2)
            tx2 = INPUT_W - tw - tx1
            ty1 = ty2 = 0

        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image_raw, (tw, th))
        # Pad the short side with (128,128,128). The colour must be passed by
        # keyword: the positional slot after borderType is `dst`, not `value`.
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)

        # Subtract the per-channel mean (image is still HWC here)
        image -= (104, 117, 123)
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x,landmark):
        """
        description: Map boxes and landmarks from the padded network input
                     back to the coordinates of the original image. Despite
                     the name, no box-format conversion happens here: the
                     letterbox offset is removed and the resize scale undone.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          boxes, one row per box, columns [x1, y1, x2, y2]
            landmark:   landmarks, one row per box, 10 columns of
                        interleaved x/y values; modified in place
        return:
            y:          rescaled boxes (new array/tensor of the same type as x)
            landmark:   the same object that was passed in, rescaled
        """
        if isinstance(x, torch.Tensor):
            y = torch.zeros_like(x)
        else:
            y = np.zeros_like(x)

        r_w = INPUT_W / origin_w
        r_h = INPUT_H / origin_h

        if r_h > r_w:
            # Image was scaled by r_w; only the vertical axis carries padding.
            for col in (0, 2):
                y[:, col] = x[:, col] / r_w
            for col in (1, 3):
                y[:, col] = (x[:, col] - (INPUT_H - r_w * origin_h) / 2) / r_w

            for col in range(0, 10, 2):
                landmark[:, col] = landmark[:, col] / r_w
                landmark[:, col + 1] = (landmark[:, col + 1] - (INPUT_H - r_w * origin_h) / 2) / r_w
        else:
            # Image was scaled by r_h; only the horizontal axis carries padding.
            for col in (0, 2):
                y[:, col] = (x[:, col] - (INPUT_W - r_h * origin_w) / 2) / r_h
            for col in (1, 3):
                y[:, col] = x[:, col] / r_h

            for col in range(0, 10, 2):
                landmark[:, col] = (landmark[:, col] - (INPUT_W - r_h * origin_w) / 2) / r_h
                landmark[:, col + 1] = landmark[:, col + 1] / r_h

        return y, landmark

    def post_process(self, output, origin_h, origin_w):
        """
        description: Turn the raw network output into the final detections.
        param:
            output:     flat array whose first element is the number of
                        detections, followed by 15 values per detection:
                        [x1, y1, x2, y2, conf, landmark_x1, landmark_y1,
                        landmark_x2, landmark_y2, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: boxes kept after NMS, each row is [x1, y1, x2, y2]
            result_scores: score of each kept box
            result_landmark: the 10 landmark values of each kept box
        """
        # The leading element tells how many of the following rows are valid
        detection_count = int(output[0])
        rows = np.reshape(output[1:], (-1, 15))[:detection_count, :]
        # Move the detections to the GPU as a torch Tensor
        pred = torch.Tensor(rows).cuda()

        # Keep only the detections whose score exceeds CONF_THRESH
        scores = pred[:, 4]
        keep = scores > CONF_THRESH
        boxes = pred[:, :4][keep, :]
        landmark = pred[:, 5:15][keep, :]
        scores = scores[keep]

        # Map boxes and landmarks back to the original image coordinates
        boxes, landmark = self.xywh2xyxy(origin_h, origin_w, boxes, landmark)

        # Non-maximum suppression on the rescaled boxes
        kept = torchvision.ops.nms(boxes, scores, iou_threshold=IOU_THRESHOLD).cpu()
        return boxes[kept, :].cpu(), scores[kept].cpu(), landmark[kept].cpu()

class myThread(threading.Thread):
    """Thread that calls ``func(*args)`` when it is started."""

    def __init__(self, func, args):
        super(myThread, self).__init__()
        # Callable to execute and the positional arguments to hand to it
        self.func = func
        self.args = args

    def run(self):
        target, params = self.func, self.args
        target(*params)

if __name__ == "__main__":
    # The custom plugin library has to be loaded first; it must already have
    # been built.
    PLUGIN_LIBRARY = "build/libdecodeplugin.so"
    ctypes.CDLL(PLUGIN_LIBRARY)
    engine_file_path = "build/retina_r50.engine"

    retinaface = Retinaface_trt(engine_file_path)
    input_image_paths = ["zidane.jpg"]
    # Run every image 10 times, each inference on its own (joined) thread
    for _ in range(10):
        for input_image_path in input_image_paths:
            worker = myThread(retinaface.infer, [input_image_path])
            worker.start()
            worker.join()

    # Release the instance once all inferences are done
    retinaface.destroy()


================================================
FILE: retinafaceAntiCov/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.6)

project(retinafaceAntiCov)

add_definitions(-std=c++11)

# option() expects (<variable> "<help text>" [value]); the help text was
# missing, which made "OFF" the help string instead of the value.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

# TensorRT installation used on non-embedded hosts. Override it at configure
# time instead of editing this file, e.g.
#   cmake -DTENSORRT_ROOT=/home/lindsay/TensorRT-7.2.3.4 ..
set(TENSORRT_ROOT "/home/lindsay/TensorRT-8.6.1.6" CACHE PATH "TensorRT installation root")

include_directories(${PROJECT_SOURCE_DIR}/include)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message("embed_platform on")
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
    message("embed_platform off")
    # cuda
    include_directories(/usr/local/cuda/include)
    link_directories(/usr/local/cuda/lib64)

    # tensorrt
    include_directories(${TENSORRT_ROOT}/include)
    link_directories(${TENSORRT_ROOT}/lib)
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Decode plugin (CUDA) built as a shared library
cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/decode.cu)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(retinafaceAntiCov ${PROJECT_SOURCE_DIR}/retinafaceAntiCov.cpp)
target_link_libraries(retinafaceAntiCov nvinfer)
target_link_libraries(retinafaceAntiCov cudart)
target_link_libraries(retinafaceAntiCov myplugins)
target_link_libraries(retinafaceAntiCov ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: retinafaceAntiCov/README.md
================================================
# RetinaFaceAntiCov

The MXNet implementation is [deepinsight/insightface/RetinaFaceAntiCov](https://github.com/deepinsight/insightface/tree/master/RetinaFaceAntiCov).

## Run

```
1. generate retinafaceAntiCov.wts from mxnet implementation.

git clone https://github.com/deepinsight/insightface.git
cd insightface/RetinaFaceAntiCov
// download its weights 'cov2.zip', put it into insightface/RetinaFaceAntiCov, and unzip it
// put tensorrtx/retinafaceAntiCov/gen_wts.py into insightface/RetinaFaceAntiCov
python gen_wts.py
// a file 'retinafaceAntiCov.wts' will be generated.

2. put retinafaceAntiCov.wts into tensorrtx/retinafaceAntiCov, build and run

git clone https://github.com/wang-xinyu/tensorrtx.git
cd tensorrtx/retinafaceAntiCov
// put retinafaceAntiCov.wts here
mkdir build
cd build
cmake ..
make
sudo ./retinafaceAntiCov -s  // build and serialize model to file i.e. 'retinafaceAntiCov.engine'
wget http://www.kaixian.tv/gd/d/file/201611/07/23efff3a26e2385620e719378c654fb1.jpg -O test.jpg
sudo ./retinafaceAntiCov -d  // deserialize model file and run inference.

3. check the generated image 'out.jpg'; an example result is shown below
```

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/84776553-069c5f80-b013-11ea-893c-70a138b843d6.jpg">
</p>

## Config

- Input shape `INPUT_H`, `INPUT_W` defined in `decode.h`
- FP16/FP32 can be selected by the macro `USE_FP16` in `retinafaceAntiCov.cpp`
- GPU id can be selected by the macro `DEVICE` in `retinafaceAntiCov.cpp`

## More Information

See the README on the [home page](https://github.com/wang-xinyu/tensorrtx).


================================================
FILE: retinafaceAntiCov/decode.cu
================================================
#include "decode.h"
#include "stdio.h"

namespace nvinfer1
{
    // The plugin carries no serialized state: the constructors, serialize()
    // and initialize() below are intentionally empty / trivial.
    DecodePlugin::DecodePlugin()
    {
    }

    DecodePlugin::~DecodePlugin()
    {
    }

    // create the plugin at runtime from a byte stream
    // (nothing is read because serialize() writes nothing)
    DecodePlugin::DecodePlugin(const void* data, size_t length)
    {
    }

    void DecodePlugin::serialize(void* buffer) const TRT_NOEXCEPT
    {
    }

    size_t DecodePlugin::getSerializationSize() const TRT_NOEXCEPT
    {
        return 0;
    }

    int DecodePlugin::initialize() TRT_NOEXCEPT
    {
        return 0;
    }

    // Output is one flat buffer: a leading counter followed by room for one
    // Detection per anchor (2 anchors per cell) on the stride 8/16/32 grids.
    Dims DecodePlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT
    {
        //output the result to channel
        int totalCount = 0;
        totalCount += decodeplugin::INPUT_H / 8 * decodeplugin::INPUT_W / 8 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
        totalCount += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
        totalCount += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);

        // +1 for the detection counter stored in front of the records
        return Dims3(totalCount + 1, 1, 1);
    }

    // Set plugin namespace
    // NOTE(review): only the pointer is stored, so the caller's string has to
    // outlive this plugin.
    void DecodePlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT
    {
        mPluginNamespace = pluginNamespace;
    }

    const char* DecodePlugin::getPluginNamespace() const TRT_NOEXCEPT
    {
        return mPluginNamespace;
    }

    // Return the DataType of the plugin output at the requested index
    DataType DecodePlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT
    {
        return DataType::kFLOAT;
    }

    // Return true if output tensor is broadcast across a batch.
    bool DecodePlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT
    {
        return false;
    }

    // Return true if plugin can use input that is broadcast across batch without replication.
    bool DecodePlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT
    {
        return false;
    }

    void DecodePlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT
    {
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    void DecodePlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT
    {
    }

    // Detach the plugin object from its execution context.
    void DecodePlugin::detachFromContext() TRT_NOEXCEPT {}

    // Type and version must match what DecodePluginCreator reports.
    const char* DecodePlugin::getPluginType() const TRT_NOEXCEPT
    {
        return "Decode_TRT";
    }

    const char* DecodePlugin::getPluginVersion() const TRT_NOEXCEPT
    {
        return "1";
    }

    void DecodePlugin::destroy() TRT_NOEXCEPT
    {
        delete this;
    }

    // Clone the plugin
    IPluginV2IOExt* DecodePlugin::clone() const TRT_NOEXCEPT
    {
        DecodePlugin *p = new DecodePlugin();
        p->setPluginNamespace(mPluginNamespace);
        return p;
    }

    // Logistic (sigmoid) function.
    __device__ float Logist(float data){ return 1./(1. + expf(-data)); }

    // Decodes one feature-map level. One thread handles one grid cell and
    // both of its anchors (k = 0, 1).
    //   input    : network output for this level, planar, num_elem values per channel
    //   output   : output[0] is a running detection counter, Detection records follow it
    //   num_elem : number of grid cells of this level
    //   step     : stride of this level in input pixels
    //   anchor   : base anchor size of this level
    __global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor) {

        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= num_elem) return;

        int w = decodeplugin::INPUT_W / step;
        int y = idx / w;
        int x = idx % w;
        // Channel offsets of the individual heads inside the input planes.
        // NOTE(review): the offsets follow the layout produced by the network
        // definition, which is not visible here - confirm before changing.
        const float *cls_reg = &input[2 * num_elem];
        const float *bbox_reg = &input[4 * num_elem];
        const float *lmk_reg = &input[12 * num_elem];
        const float *mask_reg = &input[36 * num_elem];

        for (int k = 0; k < 2; ++k) {
            float conf = cls_reg[idx + k * num_elem];
            if (conf < 0.5) continue;

            // Reserve the next free record by atomically bumping the counter.
            float *res_count = output;
            int count = (int)atomicAdd(res_count, 1);
            char* data = (char *)res_count + sizeof(float) + count * sizeof(decodeplugin::Detection);
            decodeplugin::Detection* det = (decodeplugin::Detection*)(data);

            // Anchor: centre (x, y) and square size; the second anchor is half the first.
            float prior[4];
            prior[0] = 7.5 + (float)(x * step);
            prior[1] = 7.5 + (float)(y * step);
            prior[2] = anchor * 2 / (k + 1);
            prior[3] = prior[2];

            //Location
            // centre + size first ...
            det->bbox[0] = prior[0] + bbox_reg[idx + k * num_elem * 4] * prior[2];
            det->bbox[1] = prior[1] + bbox_reg[idx + k * num_elem * 4 + num_elem] * prior[3];
            det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 2]);
            det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 3]);
            // ... then converted to corner form x1 y1 x2 y2
            det->bbox[0] -= (det->bbox[2] - 1) / 2;
            det->bbox[1] -= (det->bbox[3] - 1) / 2;
            det->bbox[2] += det->bbox[0];
            det->bbox[3] += det->bbox[1];
            det->class_confidence = conf;
            // Five landmarks stored as interleaved x/y pairs
            for (int i = 0; i < 10; i += 2) {
                det->landmark[i] = prior[0] + lmk_reg[idx + k * num_elem * 10 + num_elem * i] * 0.2 * prior[2];
                det->landmark[i+1] = prior[1] + lmk_reg[idx + k * num_elem * 10 + num_elem * (i + 1)] * 0.2 * prior[3];
            }
            det->mask_confidence = mask_reg[idx + k * num_elem];
        }
    }

    // Runs the decode kernel for the three feature-map levels (stride 8, 16, 32).
    // All work is queued on `stream`, the stream handed to enqueue(), so it is
    // ordered with the rest of the network instead of the default stream.
    // batchSize is not used: a single sample is decoded.
    void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize)
    {
        int num_elem = 0;
        int base_step = 8;
        int base_anchor = 16;
        int thread_count;
        // Reset the detection counter kept in output[0]
        cudaMemsetAsync(output, 0, sizeof(float), stream);
        for (unsigned int i = 0; i < 3; ++i)
        {
            num_elem = decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;
            thread_count = (num_elem < thread_count_) ? num_elem : thread_count_;
            CalDetection<<< (num_elem + thread_count - 1) / thread_count, thread_count, 0, stream>>>
                (inputs[i], output, num_elem, base_step, base_anchor);
            base_step *= 2;
            base_anchor *= 4;
        }
    }

    int DecodePlugin::enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT
    {
        //assert(batchSize == 1);
        //GPU
        //CUDA_CHECK(cudaStreamSynchronize(stream));
        forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);

        return 0;
    }

    PluginFieldCollection DecodePluginCreator::mFC{};
    std::vector<PluginField> DecodePluginCreator::mPluginAttributes;

    // The plugin has no attributes, so the field collection is left empty.
    DecodePluginCreator::DecodePluginCreator()
    {
        mPluginAttributes.clear();

        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    const char* DecodePluginCreator::getPluginName() const TRT_NOEXCEPT
    {
        return "Decode_TRT";
    }

    const char* DecodePluginCreator::getPluginVersion() const TRT_NOEXCEPT
    {
        return "1";
    }

    const PluginFieldCollection* DecodePluginCreator::getFieldNames() TRT_NOEXCEPT
    {
        return &mFC;
    }

    // `name` and `fc` are ignored: the plugin takes no parameters.
    IPluginV2IOExt* DecodePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT
    {
        DecodePlugin* obj = new DecodePlugin();
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    IPluginV2IOExt* DecodePluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT
    {
        // This object will be deleted when the network is destroyed, which will
        // call DecodePlugin::destroy()
        DecodePlugin* obj = new DecodePlugin(serialData, serialLength);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

}


================================================
FILE: retinafaceAntiCov/decode.h
================================================
#ifndef _DECODE_CU_H
#define _DECODE_CU_H

#include <string>
#include <vector>
#include <iostream>
#include "NvInfer.h"
#include "macros.h"


// Types and constants shared between the decode plugin and its users.
namespace decodeplugin
{
    // One decoded detection, 16 floats in total. The decode kernel writes
    // these records directly into the plugin output, so the field order
    // defines the output layout and must not be changed.
    struct alignas(float) Detection{
        float bbox[4];  //x1 y1 x2 y2
        float class_confidence;  // class score of the detection
        float landmark[10];  // interleaved x/y pairs
        float mask_confidence;  // mask score read from the mask head
    };
    // Input size used to derive the feature-map grids (INPUT_H/step x INPUT_W/step)
    static const int INPUT_H = 640;
    static const int INPUT_W = 640;

//    std::ostream& operator << (std::ostream& os, const decodeplugin::Detection& det) {
//        for(int i = 0; i < 10; i += 2){
//            os << det.mask_confidence << " ";
//        }
//        return os;
//    }
}


namespace nvinfer1
{
    // TensorRT plugin that decodes the raw detection heads into a flat list of
    // decodeplugin::Detection records preceded by a detection counter.
    // The plugin has one output and only supports linear FP32 tensors.
    class DecodePlugin: public IPluginV2IOExt
    {
        public:
            DecodePlugin();
            // Deserializing constructor; the plugin has no serialized state.
            DecodePlugin(const void* data, size_t length);

            ~DecodePlugin();

            int getNbOutputs() const TRT_NOEXCEPT override
            {
                return 1;
            }

            Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

            int initialize() TRT_NOEXCEPT override;

            virtual void terminate() TRT_NOEXCEPT override {};

            // No scratch memory is needed.
            virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0;}

            virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

            virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

            virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

            // Every input and output must be linear FP32.
            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const TRT_NOEXCEPT override;

            const char* getPluginVersion() const TRT_NOEXCEPT override;

            void destroy() TRT_NOEXCEPT override;

            IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

            void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

            const char* getPluginNamespace() const TRT_NOEXCEPT override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;

            bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;

            bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

            void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;

            void detachFromContext() TRT_NOEXCEPT override;

            // Initialized so that reading it before any assignment is defined.
            int input_size_ = 0;
        private:
            void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1);
            // Upper bound for the number of threads per block of the decode kernel
            int thread_count_ = 256;
            // Non-owning pointer set by setPluginNamespace(). Defaults to the
            // empty namespace so getPluginNamespace() never returns an
            // indeterminate pointer.
            const char* mPluginNamespace = "";
    };

    // Factory registered with TensorRT that creates and deserializes
    // DecodePlugin instances.
    class DecodePluginCreator : public IPluginCreator
    {
        public:
            DecodePluginCreator();

            ~DecodePluginCreator() TRT_NOEXCEPT override = default;

            const char* getPluginName() const TRT_NOEXCEPT override;

            const char* getPluginVersion() const TRT_NOEXCEPT override;

            const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

            IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;

            IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;

            void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override
            {
                // Assigning a null pointer to std::string is undefined
                // behaviour; treat it as the empty namespace instead.
                mNamespace = (libNamespace != nullptr) ? libNamespace : "";
            }

            const char* getPluginNamespace() const TRT_NOEXCEPT override
            {
                return mNamespace.c_str();
            }

        private:
            // Namespace handed to every plugin this creator produces
            std::string mNamespace;
            // Shared, empty field collection: the plugin has no attributes
            static PluginFieldCollection mFC;
            static std::vector<PluginField> mPluginAttributes;
    };
};

#endif 


================================================
FILE: retinafaceAntiCov/gen_wts.py
================================================
import struct
from retinaface_cov import RetinaFaceCoV

# Export every parameter of the RetinaFaceCoV model to a plain-text .wts file:
# the first line holds the number of tensors, then one line per tensor with
# its name, its element count and each value as a big-endian float32 in hex.
gpuid = 0
model = RetinaFaceCoV('./cov2/mnet_cov2', 0, gpuid, 'net3l')

# get_params() is indexed as a pair of name -> array mappings; fetch it once
# and write both mappings with the same loop.
params = model.model.get_params()
param_groups = (params[0], params[1])

# The context manager guarantees the file is flushed and closed.
with open('retinafaceAntiCov.wts', 'w') as f:
    f.write('{}\n'.format(sum(len(group.keys()) for group in param_groups)))
    for group in param_groups:
        for k, v in group.items():
            vr = v.reshape(-1).asnumpy()
            f.write('{} {} '.format(k, len(vr)))
            for vv in vr:
                f.write(' ')
                f.write(struct.pack('>f',float(vv)).hex())
            f.write('\n')


================================================
FILE: retinafaceAntiCov/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"
using Severity = nvinfer1::ILogger::Severity;

//! String buffer that collects a log message and, on sync or destruction,
//! writes it to the target stream as "[MM/DD/YYYY-HH:MM:SS] <prefix><message>".
//! Nothing is written while logging is disabled.
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream the formatted message is written to
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog whether messages are emitted at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    // Carry over the prefix and the logging flag as well; leaving mShouldLog
    // uninitialized made every later read of it undefined behaviour.
    // The buffered text itself is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer() override
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so
            // both stay together when the target is not std::cout
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            // zero-pad the fields, remembering the fill character to restore it
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            mOutput.fill(previousFill);
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer of a LogStreamConsumer. Being a base class, it is
//!  constructed before std::ostream, so the buffer exists by the time its address is handed over.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }

protected:
    //! Buffer that derived classes attach to their std::ostream base.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for a single log severity, usable with the regular C++ `<<` syntax.
//!  The base classes must stay in the order LogStreamConsumerBase, std::ostream:
//!  LogStreamConsumerBase constructs the LogStreamConsumerBuffer member first, and only then is
//!  the buffer's address passed to std::ostream. With the order reversed, std::ostream would
//!  receive the address of a buffer that has not been constructed yet.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  Messages are emitted only when \p severity <= \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the freshly built buffer to the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the severity and logging flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the freshly built buffer to the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! Severities of kINFO and above (by enum value) go to std::cout, the rest to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Short tag written in front of every message of the given severity.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            return "[F] ";
        case Severity::kERROR:
            return "[E] ";
        case Severity::kWARNING:
            return "[W] ";
        case Severity::kINFO:
            return "[I] ";
        case Severity::kVERBOSE:
            return "[V] ";
        default:
            break;
        }
        // Unknown severity value
        assert(0);
        return "";
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Construct a Logger with the given reportable severity
    //!
    //! \param severity The initial reportable severity; Severity::kWARNING when omitted
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        // Logger currently *is* the ILogger, so the object itself is returned.
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity The severity attached to the message
    //! \param msg      The message text; it is copied into a std::string, so it must be a valid C string
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        // A temporary LogStreamConsumer is built from the current reportable severity and the message severity;
        // the message is streamed into it with a "[TRT] " prefix. Whether the line is actually emitted is
        // presumably decided inside LogStreamConsumer (not fully visible here).
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        // Declaring the move constructor suppresses the implicit copy operations, so a TestAtom can be moved
        // (e.g. returned from defineTest()) but not copied.
        TestAtom(TestAtom&&) = default;

    private:
        // Only Logger may construct a TestAtom or read its fields.
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set to true by Logger::reportTestStart()
        std::string mName;    //!< Test name printed in result lines
        std::string mCmdline; //!< Command line printed after the " # " separator in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        // The atom starts in the "not started" state; the name format is not validated here.
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        // Arguments are joined with single spaces (see genCmdlineString()).
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Note: the RUNNING line is printed before the precondition assert below is evaluated.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        // Both preconditions are only checked via assert(), i.e. not at all when NDEBUG is defined.
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS, so the call can be used directly as a process exit code
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE, so the call can be used directly as a process exit code
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS (a waived test does not produce a failing exit code)
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return The value returned by reportPass() when \p pass is true, otherwise that of reportFail()
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Return the severity currently stored as the reporting threshold
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    //! Not called by any method visible in this class. Unknown severities hit assert(0) and yield "".
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    //! Unknown results hit assert(0) and yield "".
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    //! Severities comparing >= kINFO go to std::cout; all others go to std::cerr.
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Writes one line of the form "&&&& <RESULT> <name> # <cmdline>" to the kINFO stream. The line is written
    //! unconditionally; the reportable severity is not consulted (this method is static).
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are concatenated with a single space between them; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; //!< Threshold handed to LogStreamConsumer for every message
};

namespace
{

//!
//! \brief Build a LogStreamConsumer for messages of severity kVERBOSE
//!
//! The consumer is configured with the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Build a LogStreamConsumer for messages of severity kINFO
//!
//! The consumer is configured with the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Build a LogStreamConsumer for messages of severity kWARNING
//!
//! The consumer is configured with the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Build a LogStreamConsumer for messages of severity kERROR
//!
//! The consumer is configured with the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Build a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! The consumer is configured with the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: retinafaceAntiCov/macros.h
================================================
// Compatibility macros that let the same sources build against TensorRT
// releases before and after major version 8.
#ifndef __MACROS_H
#define __MACROS_H

// NV_TENSORRT_MAJOR is not defined in this header. An identifier that is not
// defined evaluates to 0 inside #if, so the #else branch below is selected
// unless a header defining NV_TENSORRT_MAJOR was included before this one.
#if NV_TENSORRT_MAJOR >= 8
// Appended to overrides of TensorRT virtuals (e.g. Logger::log) that must be noexcept.
#define TRT_NOEXCEPT noexcept
// Expands to `const`; presumably for plugin enqueue() overrides -- TODO confirm at the use sites.
#define TRT_CONST_ENQUEUE const
#else
// For major versions below 8 both macros expand to nothing.
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H

================================================
FILE: retinafaceAntiCov/retinafaceAntiCov.cpp
================================================
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <dirent.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include "decode.h"

// Evaluates `status` exactly once and treats any non-zero value as a failure:
// the numeric value is printed to std::cerr after "Cuda failure: " and the
// process is terminated with abort(). Wrapped in do { } while (0) so the macro
// behaves as a single statement (safe inside unbraced if/else).
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

//#define USE_FP16  // uncomment to build the engine with BuilderFlag::kFP16; leave commented out for FP32
#define DEVICE 0  // GPU id passed to cudaSetDevice() in main()
#define BATCH_SIZE 1  // currently, only support BATCH=1

using namespace nvinfer1;

// stuff we know about the network and the input/output blobs
// Network input height/width are taken from the decode plugin so both sides agree.
static const int INPUT_H = decodeplugin::INPUT_H;
static const int INPUT_W = decodeplugin::INPUT_W;
// Number of floats occupied by one decodeplugin::Detection record.
static const int DETECTION_SIZE = sizeof(decodeplugin::Detection) / sizeof(float);
// Output buffer length in floats: one leading float (read by nms() as the
// detection count) followed by DETECTION_SIZE floats per detection slot.
// Slots are counted per feature-map cell at strides 8, 16 and 32, times 2
// (presumably two anchors per cell -- TODO confirm against the decode plugin).
static const int OUTPUT_SIZE = (INPUT_H / 8 * INPUT_W / 8 + INPUT_H / 16 * INPUT_W / 16 + INPUT_H / 32 * INPUT_W / 32) * 2  * DETECTION_SIZE + 1;
// Tensor names used both when building the network and when looking up bindings.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// Logger instance handed to createInferBuilder().
static Logger gLogger;
// Registers the decode plugin creator; createEngine() looks a creator up as "Decode_TRT" version "1".
REGISTER_TENSORRT_PLUGIN(DecodePluginCreator);

// Letterboxes `img` into an INPUT_W x INPUT_H canvas.
//
// The image is scaled by the smaller of the two axis ratios so that it fits
// entirely inside the canvas (aspect ratio preserved, bicubic interpolation),
// then centred; the uncovered border is filled with (128, 128, 128).
//
// img: source image; it is only read. Returns the new INPUT_H x INPUT_W
// CV_8UC3 image.
cv::Mat preprocess_img(cv::Mat& img) {
    const float scale_w = INPUT_W / (img.cols * 1.0);
    const float scale_h = INPUT_H / (img.rows * 1.0);

    // When the width ratio is the smaller one the image spans the full canvas
    // width and is padded top/bottom; otherwise it spans the full height and
    // is padded left/right.
    const bool fit_width = scale_h > scale_w;
    const int new_w = fit_width ? INPUT_W : static_cast<int>(scale_h * img.cols);
    const int new_h = fit_width ? static_cast<int>(scale_w * img.rows) : INPUT_H;
    const int off_x = fit_width ? 0 : (INPUT_W - new_w) / 2;
    const int off_y = fit_width ? (INPUT_H - new_h) / 2 : 0;

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_CUBIC);

    cv::Mat canvas(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(off_x, off_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Maps a box and five landmark points from letterboxed network coordinates
// back to the coordinate system of the original image `img`.
//
// bbox: {left, top, right, bottom} in network input coordinates (read only).
// lmk:  five (x, y) pairs in network input coordinates; rewritten IN PLACE
//       with the mapped coordinates.
// Returns the mapped box as cv::Rect(left, top, width, height); the box edges
// are truncated to int before width/height are computed.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[10]) {
    const float r_w = INPUT_W / (img.cols * 1.0);
    const float r_h = INPUT_H / (img.rows * 1.0);

    // Same decision as the letterbox step: the smaller ratio is the scale that
    // was applied, and the padding sits on the other axis only.
    float scale;
    float pad_x = 0.0f;
    float pad_y = 0.0f;
    if (r_h > r_w) {
        scale = r_w;
        pad_y = (INPUT_H - r_w * img.rows) / 2;
    } else {
        scale = r_h;
        pad_x = (INPUT_W - r_h * img.cols) / 2;
    }

    const int left = (bbox[0] - pad_x) / scale;
    const int right = (bbox[2] - pad_x) / scale;
    const int top = (bbox[1] - pad_y) / scale;
    const int bottom = (bbox[3] - pad_y) / scale;

    for (int p = 0; p < 5; ++p) {
        float& px = lmk[2 * p];
        float& py = lmk[2 * p + 1];
        px = (px - pad_x) / scale;
        py = (py - pad_y) / scale;
    }

    return cv::Rect(left, top, right - left, bottom - top);
}

// Intersection-over-union of two axis-aligned boxes.
//
// Each box is {left, top, right, bottom}. Returns 0 when the boxes do not
// overlap; otherwise intersection area divided by union area, where a small
// constant (1e-6) is added to the denominator so it is never exactly zero.
float iou(float lbox[4], float rbox[4]) {
    const float left = std::max(lbox[0], rbox[0]);
    const float right = std::min(lbox[2], rbox[2]);
    const float top = std::max(lbox[1], rbox[1]);
    const float bottom = std::min(lbox[3], rbox[3]);

    // Empty intersection on either axis.
    if (top > bottom || left > right) {
        return 0.0f;
    }

    const float inter_area = (right - left) * (bottom - top);
    return inter_area / ((lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - inter_area + 0.000001f);
}

bool cmp(decodeplugin::Detection& a, decodeplugin::Detection& b) {
    return a.class_confidence > b.class_confidence;
}

// Greedy non-maximum suppression over the raw decode output.
//
// res:        receives the kept detections, appended in descending
//             class_confidence order.
// output:     raw buffer; output[0] is read as the number of detection
//             records, and record i starts at output[DETECTION_SIZE * i + 1].
// nms_thresh: a candidate is dropped when its IoU with an already kept
//             detection is strictly greater than this value.
//
// Records whose float at offset 4 is <= 0.1 are skipped up front (offset 4 is
// presumably class_confidence, following bbox[4] -- TODO confirm against
// decodeplugin::Detection). At most the 5000 best candidates are considered.
void nms(std::vector<decodeplugin::Detection>& res, float *output, float nms_thresh = 0.4) {
    std::vector<decodeplugin::Detection> dets;
    for (int i = 0; i < output[0]; i++) {
        if (output[DETECTION_SIZE * i + 1 + 4] <= 0.1) continue;
        decodeplugin::Detection det;
        memcpy(&det, &output[DETECTION_SIZE * i + 1], sizeof(decodeplugin::Detection));
        dets.push_back(det);
    }
    std::sort(dets.begin(), dets.end(), cmp);
    if (dets.size() > 5000) dets.erase(dets.begin() + 5000, dets.end());

    for (size_t m = 0; m < dets.size(); ++m) {
        // dets[m] is never moved by the remove_if/erase below (they only touch
        // elements after it), so this reference stays valid.
        auto& item = dets[m];
        res.push_back(item);
        // Drop every later candidate that overlaps the kept one too much.
        // A single erase/remove_if pass keeps the survivors in order and
        // avoids shifting the tail once per removed element.
        auto tail = dets.begin() + (m + 1);
        dets.erase(std::remove_if(tail, dets.end(),
                                  [&](decodeplugin::Detection& other) {
                                      return iou(item.bbox, other.bbox) > nms_thresh;
                                  }),
                   dets.end());
    }
}

// Loads a TensorRT weight (.wts) file into a name -> Weights map.
//
// The file is whitespace delimited and is parsed as:
//   <number of blobs>
//   <name> <size (decimal)> <size 32-bit words in hex>      (once per blob)
//
// Every blob is stored as DataType::kFLOAT with `count` = size. The value
// buffers are allocated with malloc() and ownership passes to the caller
// (createEngine() releases them with free() after the engine is built).
//
// file: path of the weight file.
// Failure to open the file or a non-positive blob count is only caught by
// assert(), i.e. not at all when NDEBUG is defined.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. The buffer holds `size` 32-bit words, so it must be sized
        // by the element type; sizeof(val) would be the size of the pointer
        // and over-allocate on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds an inference-time batch normalisation as a per-channel scale layer.
//
// Reads `<lname>_gamma`, `<lname>_beta`, `<lname>_moving_mean` and
// `<lname>_moving_var` from weightMap and folds them into
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The three new buffers are malloc'ed and stored in weightMap under
// `<lname>.scale`, `<lname>.shift` and `<lname>.power`, so whoever frees the
// map's values also frees them.
//
// Returns the created IScaleLayer (checked with assert only).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + "_gamma"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + "_beta"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + "_moving_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + "_moving_var"].values);
    // Channel count is taken from the moving-variance blob.
    const int len = weightMap[lname + "_moving_var"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    for (int c = 0; c < len; ++c) {
        scval[c] = gamma[c] / sqrt(var[c] + eps);
        shval[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        pval[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Park the buffers in the map so they live until the map is released.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn_layer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    return bn_layer;
}

// Adds a bias-free convolution -> batch norm -> ReLU stack.
//
// num_filters: output channels;  k: square kernel size;  s: stride;
// p: padding;  g: number of groups.
// Convolution weights are read from `<lname>_conv2d_weight`, batch-norm
// parameters from the `<lname>_batchnorm_*` entries (eps = 1e-3).
// Returns the ReLU layer.
ILayer* convBnRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int num_filters, int k, int s, int p, int g, std::string lname) {
    const Weights no_bias{DataType::kFLOAT, nullptr, 0};
    const Weights& kernel = weightMap[lname + "_conv2d_weight"];

    IConvolutionLayer* conv_layer = network->addConvolutionNd(input, num_filters, DimsHW{k, k}, kernel, no_bias);
    assert(conv_layer);
    conv_layer->setStrideNd(DimsHW{s, s});
    conv_layer->setPaddingNd(DimsHW{p, p});
    conv_layer->setNbGroups(g);

    ITensor& conv_out = *conv_layer->getOutput(0);
    IScaleLayer* bn_layer = addBatchNorm2d(network, weightMap, conv_out, lname + "_batchnorm", 1e-3);

    ITensor& bn_out = *bn_layer->getOutput(0);
    IActivationLayer* relu_layer = network->addActivation(bn_out, ActivationType::kRELU);
    assert(relu_layer);
    return relu_layer;
}

// Adds a convolution (with bias) -> batch norm -> ReLU stack.
//
// num_filters: output channels;  k: square kernel size;  s: stride;
// p: padding.
// Convolution parameters are read from `<lname>_weight` / `<lname>_bias`,
// batch-norm parameters from the `<lname>_bn_*` entries (eps = 2e-5).
// Returns the ReLU layer.
ILayer* convBiasBnRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int num_filters, int k, int s, int p, std::string lname) {
    const Weights& kernel = weightMap[lname + "_weight"];
    const Weights& bias = weightMap[lname + "_bias"];

    IConvolutionLayer* conv_layer = network->addConvolutionNd(input, num_filters, DimsHW{k, k}, kernel, bias);
    assert(conv_layer);
    conv_layer->setStrideNd(DimsHW{s, s});
    conv_layer->setPaddingNd(DimsHW{p, p});

    ITensor& conv_out = *conv_layer->getOutput(0);
    IScaleLayer* bn_layer = addBatchNorm2d(network, weightMap, conv_out, lname + "_bn", 2e-5);

    ITensor& bn_out = *bn_layer->getOutput(0);
    IActivationLayer* relu_layer = network->addActivation(bn_out, ActivationType::kRELU);
    assert(relu_layer);
    return relu_layer;
}

// Builds one detection head on top of `input`.
//
// Three branches are computed and concatenated, then passed through ReLU:
//   1. 3x3 conv (32 ch) + BN                                   (`<lname>_conv1`)
//   2. context_conv1 (16 ch, conv+BN+ReLU) -> 3x3 conv + BN     (`<lname>_context_conv2`)
//   3. context_conv1 -> context_conv3_1 (conv+BN+ReLU)
//                    -> 3x3 conv (16 ch) + BN                   (`<lname>_context_conv3_2`)
// All 3x3 convolutions use padding 1; batch norms use eps = 2e-5.
// Returns the final ReLU layer.
ILayer* head(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    // 3x3 conv (pad 1, with bias) followed by batch norm, without activation.
    // Parameter names are `<prefix>_weight`, `<prefix>_bias`, `<prefix>_bn_*`.
    auto conv3x3Bn = [&](ITensor& in, int channels, const std::string& prefix) -> IScaleLayer* {
        auto conv = network->addConvolutionNd(in, channels, DimsHW{3, 3}, weightMap[prefix + "_weight"], weightMap[prefix + "_bias"]);
        assert(conv);
        conv->setPaddingNd(DimsHW{1, 1});
        return addBatchNorm2d(network, weightMap, *conv->getOutput(0), prefix + "_bn", 2e-5);
    };

    auto branch_main = conv3x3Bn(input, 32, lname + "_conv1");

    auto ctx_stem = convBiasBnRelu(network, weightMap, input, 16, 3, 1, 1, lname + "_context_conv1");
    auto branch_ctx2 = conv3x3Bn(*ctx_stem->getOutput(0), 16, lname + "_context_conv2");

    auto ctx3_first = convBiasBnRelu(network, weightMap, *ctx_stem->getOutput(0), 16, 3, 1, 1, lname + "_context_conv3_1");
    auto branch_ctx3 = conv3x3Bn(*ctx3_first->getOutput(0), 16, lname + "_context_conv3_2");

    ITensor* branches[] = {branch_main->getOutput(0), branch_ctx2->getOutput(0), branch_ctx3->getOutput(0)};
    auto merged = network->addConcatenation(branches, 3);
    assert(merged);

    IActivationLayer* activated = network->addActivation(*merged->getOutput(0), ActivationType::kRELU);
    assert(activated);
    return activated;
}

// Applies softmax to `input` after regrouping its channels.
//
// The tensor is reshaped to (c / 2, -1, 0), passed through a softmax layer
// with its default settings, and reshaped back to (c, -1, 0). In the reshape
// dimensions -1 lets TensorRT infer that extent and 0 copies the extent from
// the input tensor.
//
// c: channel count of `input`. Returns the second shuffle (reshape) layer.
ILayer* reshapeSoftmax(INetworkDefinition *network, ITensor& input, int c) {
    const Dims3 grouped_shape(c / 2, -1, 0);
    const Dims3 restored_shape(c, -1, 0);

    auto to_groups = network->addShuffle(input);
    assert(to_groups);
    to_groups->setReshapeDimensions(grouped_shape);

    auto softmax = network->addSoftMax(*to_groups->getOutput(0));
    assert(softmax);

    auto to_original = network->addShuffle(*softmax->getOutput(0));
    assert(to_original);
    to_original->setReshapeDimensions(restored_shape);

    return to_original;
}

// Creates the engine using only the network-definition API (no parser).
//
// Builds the backbone (conv_1 .. conv_final), three lateral/aggregation levels
// (strides 32, 16, 8), and for each level a detection head producing class
// scores, box deltas, landmark deltas and type scores. The three per-stride
// concatenations are fed to the "Decode_TRT" plugin, whose output is marked as
// the network output OUTPUT_BLOB_NAME.
//
// maxBatchSize: forwarded to IBuilder::setMaxBatchSize().
// builder/config: used to create the network and build the engine; not
//                 destroyed here.
// dt: data type of the network input tensor.
//
// Weights are loaded from "../retinafaceAntiCov.wts"; every buffer stored in
// the weight map (loaded or created while building) is freed before returning.
// Returns the built engine, or nullptr when the build fails.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../retinafaceAntiCov.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    auto conv1 = convBnRelu(network, weightMap, *data, 16, 3, 2, 1, 1, "conv_1");
    auto conv2 = convBnRelu(network, weightMap, *conv1->getOutput(0), 32, 1, 1, 0, 1, "conv_2");
    auto conv3dw = convBnRelu(network, weightMap, *conv2->getOutput(0), 32, 3, 2, 1, 32, "conv_3_dw");
    auto conv3 = convBnRelu(network, weightMap, *conv3dw->getOutput(0), 32, 1, 1, 0, 1, "conv_3");
    auto conv4dw = convBnRelu(network, weightMap, *conv3->getOutput(0), 32, 3, 1, 1, 32, "conv_4_dw");
    auto conv4 = convBnRelu(network, weightMap, *conv4dw->getOutput(0), 32, 1, 1, 0, 1, "conv_4");
    auto conv5dw = convBnRelu(network, weightMap, *conv4->getOutput(0), 32, 3, 2, 1, 32, "conv_5_dw");
    auto conv5 = convBnRelu(network, weightMap, *conv5dw->getOutput(0), 64, 1, 1, 0, 1, "conv_5");
    auto conv6dw = convBnRelu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, 64, "conv_6_dw");
    auto conv6 = convBnRelu(network, weightMap, *conv6dw->getOutput(0), 64, 1, 1, 0, 1, "conv_6");
    // conv6 to c1
    auto conv7dw = convBnRelu(network, weightMap, *conv6->getOutput(0), 64, 3, 2, 1, 64, "conv_7_dw");
    auto conv7 = convBnRelu(network, weightMap, *conv7dw->getOutput(0), 128, 1, 1, 0, 1, "conv_7");
    auto conv8dw = convBnRelu(network, weightMap, *conv7->getOutput(0), 128, 3, 1, 1, 128, "conv_8_dw");
    auto conv8 = convBnRelu(network, weightMap, *conv8dw->getOutput(0), 128, 1, 1, 0, 1, "conv_8");
    auto conv9dw = convBnRelu(network, weightMap, *conv8->getOutput(0), 128, 3, 1, 1, 128, "conv_9_dw");
    auto conv9 = convBnRelu(network, weightMap, *conv9dw->getOutput(0), 128, 1, 1, 0, 1, "conv_9");
    auto conv10dw = convBnRelu(network, weightMap, *conv9->getOutput(0), 128, 3, 1, 1, 128, "conv_10_dw");
    auto conv10 = convBnRelu(network, weightMap, *conv10dw->getOutput(0), 128, 1, 1, 0, 1, "conv_10");
    auto conv11dw = convBnRelu(network, weightMap, *conv10->getOutput(0), 128, 3, 1, 1, 128, "conv_11_dw");
    auto conv11 = convBnRelu(network, weightMap, *conv11dw->getOutput(0), 128, 1, 1, 0, 1, "conv_11");
    auto conv12dw = convBnRelu(network, weightMap, *conv11->getOutput(0), 128, 3, 1, 1, 128, "conv_12_dw");
    auto conv12 = convBnRelu(network, weightMap, *conv12dw->getOutput(0), 128, 1, 1, 0, 1, "conv_12");
    // conv12 to c2
    auto conv13dw = convBnRelu(network, weightMap, *conv12->getOutput(0), 128, 3, 2, 1, 128, "conv_13_dw");
    auto conv13 = convBnRelu(network, weightMap, *conv13dw->getOutput(0), 256, 1, 1, 0, 1, "conv_13");
    auto conv14dw = convBnRelu(network, weightMap, *conv13->getOutput(0), 256, 3, 1, 1, 256, "conv_14_dw");
    auto conv14 = convBnRelu(network, weightMap, *conv14dw->getOutput(0), 256, 1, 1, 0, 1, "conv_14");
    auto conv_final = convBnRelu(network, weightMap, *conv14->getOutput(0), 256, 1, 1, 0, 1, "conv_final");
    // convfinal to c3

    // ---- stride 32 level ----
    auto rf_c3_lateral = convBiasBnRelu(network, weightMap, *conv_final->getOutput(0), 64, 1, 1, 0, "rf_c3_lateral");
    auto rf_head_s32 = head(network, weightMap, *rf_c3_lateral->getOutput(0), "rf_head_stride32");
    ILayer *cls_score_s32 = network->addConvolutionNd(*rf_head_s32->getOutput(0), 4, DimsHW{1, 1}, weightMap["face_rpn_cls_score_stride32_weight"], weightMap["face_rpn_cls_score_stride32_bias"]);
    cls_score_s32 = reshapeSoftmax(network, *cls_score_s32->getOutput(0), 4);
    auto bbox_s32 = network->addConvolutionNd(*rf_head_s32->getOutput(0), 8, DimsHW{1, 1}, weightMap["face_rpn_bbox_pred_stride32_weight"], weightMap["face_rpn_bbox_pred_stride32_bias"]);
    auto landmark_s32 = network->addConvolutionNd(*rf_head_s32->getOutput(0), 20, DimsHW{1, 1}, weightMap["face_rpn_landmark_pred_stride32_weight"], weightMap["face_rpn_landmark_pred_stride32_bias"]);
    auto rf_head2_s32 = head(network, weightMap, *rf_c3_lateral->getOutput(0), "rf_head2_stride32");
    ILayer *type_score_s32 = network->addConvolutionNd(*rf_head2_s32->getOutput(0), 6, DimsHW{1, 1}, weightMap["face_rpn_type_score_stride32_weight"], weightMap["face_rpn_type_score_stride32_bias"]);
    type_score_s32 = reshapeSoftmax(network, *type_score_s32->getOutput(0), 6);

    // 2x upsampling implemented as a grouped (one group per channel) 2x2
    // deconvolution with stride 2 and all-ones weights. The same weights are
    // reused for both upsampling layers and are stored in weightMap so they
    // are freed with everything else.
    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 64 * 2 * 2));
    for (int i = 0; i < 64 * 2 * 2; i++) {
        deval[i] = 1.0;
    }
    Weights deconvwts{DataType::kFLOAT, deval, 64 * 2 * 2};
    IDeconvolutionLayer* c3_deconv = network->addDeconvolutionNd(*rf_c3_lateral->getOutput(0), 64, DimsHW{2, 2}, deconvwts, emptywts);
    assert(c3_deconv);
    c3_deconv->setStrideNd(DimsHW{2, 2});
    c3_deconv->setNbGroups(64);
    weightMap["c3_deconv"] = deconvwts;

    // ---- stride 16 level ----
    auto rf_c2_lateral = convBiasBnRelu(network, weightMap, *conv12->getOutput(0), 64, 1, 1, 0, "rf_c2_lateral");
    auto plus0 = network->addElementWise(*c3_deconv->getOutput(0), *rf_c2_lateral->getOutput(0), ElementWiseOperation::kSUM);
    auto rf_c2_aggr = convBiasBnRelu(network, weightMap, *plus0->getOutput(0), 64, 3, 1, 1, "rf_c2_aggr");
    auto rf_head_s16 = head(network, weightMap, *rf_c2_aggr->getOutput(0), "rf_head_stride16");
    ILayer *cls_score_s16 = network->addConvolutionNd(*rf_head_s16->getOutput(0), 4, DimsHW{1, 1}, weightMap["face_rpn_cls_score_stride16_weight"], weightMap["face_rpn_cls_score_stride16_bias"]);
    cls_score_s16 = reshapeSoftmax(network, *cls_score_s16->getOutput(0), 4);
    auto bbox_s16 = network->addConvolutionNd(*rf_head_s16->getOutput(0), 8, DimsHW{1, 1}, weightMap["face_rpn_bbox_pred_stride16_weight"], weightMap["face_rpn_bbox_pred_stride16_bias"]);
    auto landmark_s16 = network->addConvolutionNd(*rf_head_s16->getOutput(0), 20, DimsHW{1, 1}, weightMap["face_rpn_landmark_pred_stride16_weight"], weightMap["face_rpn_landmark_pred_stride16_bias"]);
    auto rf_head2_s16 = head(network, weightMap, *rf_c2_aggr->getOutput(0), "rf_head2_stride16");
    ILayer *type_score_s16 = network->addConvolutionNd(*rf_head2_s16->getOutput(0), 6, DimsHW{1, 1}, weightMap["face_rpn_type_score_stride16_weight"], weightMap["face_rpn_type_score_stride16_bias"]);
    type_score_s16 = reshapeSoftmax(network, *type_score_s16->getOutput(0), 6);

    // ---- stride 8 level ----
    IDeconvolutionLayer* c2_deconv = network->addDeconvolutionNd(*rf_c2_aggr->getOutput(0), 64, DimsHW{2, 2}, deconvwts, emptywts);
    assert(c2_deconv);
    c2_deconv->setStrideNd(DimsHW{2, 2});
    c2_deconv->setNbGroups(64);
    auto rf_c1_red = convBiasBnRelu(network, weightMap, *conv6->getOutput(0), 64, 1, 1, 0, "rf_c1_red_conv");
    auto plus1 = network->addElementWise(*c2_deconv->getOutput(0), *rf_c1_red->getOutput(0), ElementWiseOperation::kSUM);
    auto rf_c1_aggr = convBiasBnRelu(network, weightMap, *plus1->getOutput(0), 64, 3, 1, 1, "rf_c1_aggr");
    auto rf_head_s8 = head(network, weightMap, *rf_c1_aggr->getOutput(0), "rf_head_stride8");
    ILayer *cls_score_s8 = network->addConvolutionNd(*rf_head_s8->getOutput(0), 4, DimsHW{1, 1}, weightMap["face_rpn_cls_score_stride8_weight"], weightMap["face_rpn_cls_score_stride8_bias"]);
    cls_score_s8 = reshapeSoftmax(network, *cls_score_s8->getOutput(0), 4);
    auto bbox_s8 = network->addConvolutionNd(*rf_head_s8->getOutput(0), 8, DimsHW{1, 1}, weightMap["face_rpn_bbox_pred_stride8_weight"], weightMap["face_rpn_bbox_pred_stride8_bias"]);
    auto landmark_s8 = network->addConvolutionNd(*rf_head_s8->getOutput(0), 20, DimsHW{1, 1}, weightMap["face_rpn_landmark_pred_stride8_weight"], weightMap["face_rpn_landmark_pred_stride8_bias"]);
    auto rf_head2_s8 = head(network, weightMap, *rf_c1_aggr->getOutput(0), "rf_head2_stride8");
    ILayer *type_score_s8 = network->addConvolutionNd(*rf_head2_s8->getOutput(0), 6, DimsHW{1, 1}, weightMap["face_rpn_type_score_stride8_weight"], weightMap["face_rpn_type_score_stride8_bias"]);
    type_score_s8 = reshapeSoftmax(network, *type_score_s8->getOutput(0), 6);

    // Per stride: class scores, box deltas, landmark deltas and type scores
    // are concatenated into one tensor for the decode plugin.
    ITensor* inputTensors_s32[] = {cls_score_s32->getOutput(0), bbox_s32->getOutput(0), landmark_s32->getOutput(0), type_score_s32->getOutput(0)};
    auto cat_s32 = network->addConcatenation(inputTensors_s32, 4);
    assert(cat_s32);

    ITensor* inputTensors_s16[] = {cls_score_s16->getOutput(0), bbox_s16->getOutput(0), landmark_s16->getOutput(0), type_score_s16->getOutput(0)};
    auto cat_s16 = network->addConcatenation(inputTensors_s16, 4);
    assert(cat_s16);

    ITensor* inputTensors_s8[] = {cls_score_s8->getOutput(0), bbox_s8->getOutput(0), landmark_s8->getOutput(0), type_score_s8->getOutput(0)};
    auto cat_s8 = network->addConcatenation(inputTensors_s8, 4);
    assert(cat_s8);

    auto creator = getPluginRegistry()->getPluginCreator("Decode_TRT", "1");
    assert(creator && "Decode_TRT plugin creator is not registered");
    // Value-initialise so the creator never sees an indeterminate field count
    // or field pointer (the struct has no default initialisers on older
    // TensorRT releases).
    PluginFieldCollection pfc{};
    IPluginV2 *pluginObj = creator->createPlugin("decode", &pfc);
    assert(pluginObj);
    ITensor* inputTensors[] = {cat_s8->getOutput(0), cat_s16->getOutput(0), cat_s32->getOutput(0)};
    auto decodelayer = network->addPluginV2(inputTensors, 3, *pluginObj);
    assert(decodelayer);

    decodelayer->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*decodelayer->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only claim success when an engine was actually produced.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory (entries created by a failed lookup hold nullptr,
    // which free() accepts).
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the engine through the network-definition API and serializes it.
//
// maxBatchSize: forwarded to createEngine().
// modelStream:  out-parameter; receives the serialized engine. The caller owns
//               the returned IHostMemory and must destroy it.
//
// The builder, the builder config and the engine created here are all
// destroyed before returning.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The config is released too; it was previously
    // leaked.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference.
//
// context:   execution context of an engine with exactly two bindings
//            (INPUT_BLOB_NAME and OUTPUT_BLOB_NAME).
// input:     host buffer of batchSize * 3 * INPUT_H * INPUT_W floats.
// output:    host buffer of batchSize * OUTPUT_SIZE floats; filled on return.
// batchSize: number of images in `input`.
//
// Device buffers and a stream are created for the call and released before
// returning. Any CUDA error, or a failed enqueue, aborts the process.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // A negative index (name not found) would index outside `buffers`.
    assert(inputIndex >= 0 && outputIndex >= 0);

    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    // If the work could not be enqueued the output buffer would hold
    // uninitialised device memory, so fail loudly like CHECK does.
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work above surface here.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Appends the name of every entry in directory `p_dir_name` to `file_names`,
// leaving out the "." and ".." entries. Only the bare entry name is stored
// (the directory path is not prepended).
// Returns 0 on success, or -1 when the directory cannot be opened.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent *entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const std::string entry_name(entry->d_name);
        // Skip the self and parent pseudo-entries.
        if (entry_name == "." || entry_name == "..") {
            continue;
        }
        file_names.push_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}

// Entry point of the retinafaceAntiCov sample.
//
//   -s : build the network through the API and write the serialized engine to
//        "retinafaceAntiCov.engine".
//   -d : load "retinafaceAntiCov.engine", run inference on "test.jpg" and write the
//        annotated result to "out.jpg".
//
// Returns 0 on success and -1 on bad arguments or when a required file cannot be read/written.
int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("retinafaceAntiCov.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            // Release the serialized engine before bailing out.
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 2 && std::string(argv[1]) == "-d") {
        std::ifstream file("retinafaceAntiCov.engine", std::ios::binary);
        if (!file.good()) {
            // Without this check a null/empty buffer would be handed to the deserializer below.
            std::cerr << "could not read plan file retinafaceAntiCov.engine, run with -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./retinafaceAntiCov -s  // serialize model to plan file" << std::endl;
        std::cerr << "./retinafaceAntiCov -d  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // prepare input data ---------------------------
    // Static storage keeps these potentially large buffers off the stack.
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    cv::Mat img = cv::imread("test.jpg");
    if (img.empty()) {
        // imread returns an empty matrix when the file is missing or unreadable.
        std::cerr << "could not read input image test.jpg" << std::endl;
        context->destroy();
        engine->destroy();
        runtime->destroy();
        return -1;
    }
    cv::Mat pr_img = preprocess_img(img);
    // Convert the interleaved BGR image into planar R, G, B floats, applying (x - 127.5) * 0.0078125.
    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
        data[i] = ((float)pr_img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
        data[i + INPUT_H * INPUT_W] = ((float)pr_img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
        data[i + 2 * INPUT_H * INPUT_W] = ((float)pr_img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
    }

    // Run inference
    auto start = std::chrono::system_clock::now();
    doInference(*context, data, prob, BATCH_SIZE);
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    std::vector<decodeplugin::Detection> res;
    nms(res, prob);

    // Draw the box, face/mask confidences and the five landmark points of every detection.
    for (size_t j = 0; j < res.size(); j++) {
        cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].landmark);
        cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
        cv::putText(img, "face: " + std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y + 20), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
        for (int k = 0; k < 10; k += 2) {
            cv::circle(img, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
        }
        cv::putText(img, "mask: " + std::to_string((int)(res[j].mask_confidence * 100)) + "%", cv::Point(r.x, r.y + 40), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0x00, 0x00, 0xFF), 1);
    }
    cv::imwrite("out.jpg", img);

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    return 0;
}


================================================
FILE: scaled-yolov4/CMakeLists.txt
================================================
# Build script for the scaled-yolov4 sample: builds the `myplugins` shared
# library (CUDA plugin sources) and the `yolov4csp` executable.
cmake_minimum_required(VERSION 2.6)

project(yolov4)

# Compile every source as C++11.
add_definitions(-std=c++11)

# NOTE(review): option() is documented as option(<variable> "<help_text>" [value]);
# here "OFF" sits in the help-text position -- confirm this is the intended way
# to turn the static CUDA runtime off.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is pinned to Debug.
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# NOTE(review): -D_MWAITXINTRIN_H_INCLUDED presumably works around a compiler
# header conflict; -Ofast is requested here although the build type is Debug -- confirm.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Shared library containing the YOLO layer and Mish plugin CUDA sources.
cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/mish.cu)
target_link_libraries(myplugins nvinfer cudart)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Sample executable; links TensorRT, the CUDA runtime, the plugin library and OpenCV.
add_executable(yolov4csp ${PROJECT_SOURCE_DIR}/yolov4_csp.cpp)
target_link_libraries(yolov4csp nvinfer)
target_link_libraries(yolov4csp cudart)
target_link_libraries(yolov4csp myplugins)
target_link_libraries(yolov4csp ${OpenCV_LIBS})

# Extra compile flags added at directory scope.
add_definitions(-O2 -pthread)


================================================
FILE: scaled-yolov4/README.md
================================================
# scaled-yolov4

The PyTorch implementation is from the [WongKinYiu/ScaledYOLOv4 yolov4-csp branch](https://github.com/WongKinYiu/ScaledYOLOv4/tree/yolov4-csp). It can load yolov4-csp.cfg and yolov4-csp.weights (from AlexeyAB/darknet).

Note: The yolov4-csp.cfg files for darknet and PyTorch differ slightly. Use the one provided in the repo linked above.

## Config

- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
- Number of classes `CLASS_NUM` defined in yololayer.h
- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4_csp.cpp
- GPU id can be selected by the macro `DEVICE` in yolov4_csp.cpp
- NMS thresh `NMS_THRESH` in yolov4_csp.cpp
- bbox confidence threshold `BBOX_CONF_THRESH` in yolov4_csp.cpp
- `BATCH_SIZE` in yolov4_csp.cpp

## How to run

1. Generate yolov4_csp.wts from the PyTorch implementation using yolov4-csp.cfg and yolov4-csp.weights.

```
git clone https://github.com/wang-xinyu/tensorrtx.git
git clone -b yolov4-csp https://github.com/WongKinYiu/ScaledYOLOv4.git
// download yolov4-csp.weights from https://github.com/WongKinYiu/ScaledYOLOv4/tree/yolov4-csp#yolov4-csp
cp {tensorrtx}/scaled-yolov4/gen_wts.py {ScaledYOLOv4/}
cd {ScaledYOLOv4/}
python gen_wts.py yolov4-csp.weights
// a file 'yolov4_csp.wts' will be generated.
```

2. Put yolov4_csp.wts into {tensorrtx}/scaled-yolov4, then build and run.

```
mv yolov4_csp.wts {tensorrtx}/scaled-yolov4/
cd {tensorrtx}/scaled-yolov4
mkdir build
cd build
cmake ..
make
sudo ./yolov4csp -s                          // serialize model to plan file i.e. 'yolov4csp.engine'
sudo ./yolov4csp -d ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed.
```

3. Check the generated images, `_zidane.jpg` and `_bus.jpg`, which should look as follows.
<p align="center">
<img src= https://user-images.githubusercontent.com/39617050/117172509-824cf980-ade9-11eb-8e4c-27dbe658e355.jpg>
</p>

<p align="center">
<img src= https://user-images.githubusercontent.com/39617050/117172880-dbb52880-ade9-11eb-839a-0814fd46198e.jpg>
</p>


## More Information

See the README on the [home page](https://github.com/wang-xinyu/tensorrtx).


================================================
FILE: scaled-yolov4/common.hpp
================================================
#include <fstream>
#include <map>
#include <sstream>
#include <vector>
#include <opencv2/opencv.hpp>

#include "NvInfer.h"
#include "yololayer.h"
#include "mish.h"


using namespace nvinfer1;

// Resizes `img` to fit inside a Yolo::INPUT_W x Yolo::INPUT_H canvas while keeping its
// aspect ratio, and centers it on a canvas filled with (128, 128, 128).
// Returns the padded CV_8UC3 image; `img` itself is not modified.
cv::Mat preprocess_img(cv::Mat& img) {
    const auto net_w = Yolo::INPUT_W;
    const auto net_h = Yolo::INPUT_H;
    const float scale_w = net_w / (img.cols*1.0);
    const float scale_h = net_h / (img.rows*1.0);

    // Start from "fills the whole canvas" and shrink the dimension limited by the smaller scale.
    int new_w = net_w;
    int new_h = net_h;
    int off_x = 0;
    int off_y = 0;
    if (scale_h > scale_w) {
        // Width is the limiting side: pad top and bottom.
        new_h = scale_w * img.rows;
        off_y = (net_h - new_h) / 2;
    } else {
        // Height is the limiting side: pad left and right.
        new_w = scale_h * img.cols;
        off_x = (net_w - new_w) / 2;
    }

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size());

    cv::Mat canvas(net_h, net_w, CV_8UC3, cv::Scalar(128, 128, 128));
    resized.copyTo(canvas(cv::Rect(off_x, off_y, resized.cols, resized.rows)));
    return canvas;
}

// Maps a box given as (center x, center y, width, height) in network-input coordinates
// back to a cv::Rect in the coordinates of the original image `img`, undoing the
// padding and scaling applied by the aspect-preserving resize.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float r_w = Yolo::INPUT_W / (img.cols * 1.0);
    const float r_h = Yolo::INPUT_H / (img.rows * 1.0);
    const float half_w = bbox[2]/2.f;
    const float half_h = bbox[3]/2.f;

    int left, right, top, bottom;
    float scale;
    if (r_h > r_w) {
        // Image was padded vertically: remove the top padding from the y coordinates.
        const float pad_y = (Yolo::INPUT_H - r_w * img.rows) / 2;
        left = bbox[0] - half_w;
        right = bbox[0] + half_w;
        top = bbox[1] - half_h - pad_y;
        bottom = bbox[1] + half_h - pad_y;
        scale = r_w;
    } else {
        // Image was padded horizontally: remove the left padding from the x coordinates.
        const float pad_x = (Yolo::INPUT_W - r_h * img.cols) / 2;
        left = bbox[0] - half_w - pad_x;
        right = bbox[0] + half_w - pad_x;
        top = bbox[1] - half_h;
        bottom = bbox[1] + half_h;
        scale = r_h;
    }

    // Undo the resize; each edge is truncated to int again after the division.
    left = left / scale;
    right = right / scale;
    top = top / scale;
    bottom = bottom / scale;
    return cv::Rect(left, top, right - left, bottom - top);
}

// Intersection-over-union of two boxes, each given as
// (center x, center y, width, height). Returns 0 when the boxes do not overlap.
float iou(float lbox[4], float rbox[4]) {
    const float left = std::max(lbox[0] - lbox[2]/2.f, rbox[0] - rbox[2]/2.f);
    const float right = std::min(lbox[0] + lbox[2]/2.f, rbox[0] + rbox[2]/2.f);
    const float top = std::max(lbox[1] - lbox[3]/2.f, rbox[1] - rbox[3]/2.f);
    const float bottom = std::min(lbox[1] + lbox[3]/2.f, rbox[1] + rbox[3]/2.f);

    // An inverted interval on either axis means the boxes are disjoint.
    if (top > bottom || left > right) {
        return 0.0f;
    }

    const float inter_area = (right - left) * (bottom - top);
    const float union_area = lbox[2]*lbox[3] + rbox[2]*rbox[3] - inter_area;
    return inter_area / union_area;
}

// Ordering predicate for std::sort: places detections with a higher
// det_confidence first.
bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
    return b.det_confidence < a.det_confidence;
}

// Per-class greedy non-maximum suppression.
//
// `output[0]` holds the number of detections; the detections follow as packed
// Yolo::Detection records. Records whose fifth float is <= conf_thresh are dropped.
// Within each class the survivors are sorted by descending confidence, and a detection
// is appended to `res` unless it overlaps (IoU > nms_thresh) an already kept one.
void nms(std::vector<Yolo::Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) {
    const int det_size = sizeof(Yolo::Detection) / sizeof(float);

    // Bucket the confident detections by class id.
    std::map<float, std::vector<Yolo::Detection>> per_class;
    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
        const float* record = &output[1 + det_size * i];
        if (record[4] <= conf_thresh) continue;
        Yolo::Detection det;
        memcpy(&det, record, det_size * sizeof(float));
        per_class[det.class_id].push_back(det);
    }

    for (auto& bucket : per_class) {
        auto& dets = bucket.second;
        std::sort(dets.begin(), dets.end(), cmp);

        // suppressed[k] is set once dets[k] overlaps a kept, higher-ranked detection.
        std::vector<bool> suppressed(dets.size(), false);
        for (size_t keep = 0; keep < dets.size(); ++keep) {
            if (suppressed[keep]) continue;
            res.push_back(dets[keep]);
            for (size_t other = keep + 1; other < dets.size(); ++other) {
                if (suppressed[other]) continue;
                if (iou(dets[keep].bbox, dets[other].bbox) > nms_thresh) {
                    suppressed[other] = true;
                }
            }
        }
    }
}

// TensorRT weight files have a simple space delimited format:
//   <number of blobs>
//   [name] [size] <data x size in hex>     (one line per blob)
//
// Loads every blob of `file` into a map keyed by blob name. Each value is a
// 32-bit word read as hexadecimal and stored as DataType::kFLOAT data. The
// buffers are allocated with malloc and are owned by the returned map's users.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The allocation is sized by the element type (uint32_t),
        // not by the size of the pointer variable.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer that applies the batch-norm parameters stored under
// `lname` (".weight", ".bias", ".running_mean", ".running_var") to `input`:
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The computed arrays are malloc'ed and also recorded in `weightMap` under
// lname + ".scale" / ".shift" / ".power".
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float *gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float *beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float *mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float *var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;

    const size_t nbytes = sizeof(float) * len;
    float *scval = static_cast<float*>(malloc(nbytes));
    float *shval = static_cast<float*>(malloc(nbytes));
    float *pval = static_cast<float*>(malloc(nbytes));

    // Fill all three coefficient arrays in a single pass over the channels.
    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
        pval[i] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn_layer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    return bn_layer;
}

// Appends a bias-free convolution, a batch-norm scale layer and a Mish plugin layer
// to `network`, using the weights stored under "module_list.<linx>." in `weightMap`.
//   outch - number of output channels, ksize - square kernel size,
//   s - stride, p - padding, linx - index of the module in the weight names.
// Returns the Mish plugin layer.
ILayer* convBnMish(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
    const std::string prefix = "module_list." + std::to_string(linx);

    // Convolution without bias term.
    Weights no_bias{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[prefix + ".Conv2d.weight"], no_bias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), prefix + ".BatchNorm2d", 1e-4);

    // Look up the registered Mish plugin creator and instantiate one plugin per layer.
    auto creator = getPluginRegistry()->getPluginCreator("Mish_TRT", "1");
    const PluginFieldCollection* fields = creator->getFieldNames();
    const std::string plugin_name = "mish" + std::to_string(linx);
    IPluginV2 *plugin = creator->createPlugin(plugin_name.c_str(), fields);

    ITensor* mish_inputs[] = {bn->getOutput(0)};
    auto mish = network->addPluginV2(&mish_inputs[0], 1, *plugin);
    return mish;
}

================================================
FILE: scaled-yolov4/gen_wts.py
================================================
import struct
import sys
from models.models import *
from utils import *

# Build the network from its cfg and load the weights given on the command line
# (PyTorch checkpoint when the path ends with '.pt', darknet format otherwise).
model = Darknet('models/yolov4-csp.cfg', (512, 512))
weights = sys.argv[1]
device = torch_utils.select_device('0')
if weights.endswith('.pt'):  # pytorch format
    checkpoint = torch.load(weights, map_location=device)
    model.load_state_dict(checkpoint['model'])
else:  # darknet format
    load_darknet_weights(model, weights)

# Dump the state dict as text: first line is the number of entries, then one line
# per entry: "<name> <count> " followed by " <hex>" for every value, where each value
# is written as a big-endian float32 in hexadecimal.
state = model.state_dict()
with open('yolov4_csp.wts', 'w') as f:
    f.write('{}\n'.format(len(state)))
    for name, tensor in state.items():
        flat = tensor.reshape(-1).cpu().numpy()
        hex_words = ''.join(' ' + struct.pack('>f', float(value)).hex() for value in flat)
        f.write('{} {} '.format(name, len(flat)) + hex_words + '\n')


================================================
FILE: scaled-yolov4/logging.h
================================================
/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes its contents to a target
//!  stream, prefixed with a timestamp and a fixed prefix string.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    Stream that receives the formatted output.
    //! \param prefix    Text inserted between the timestamp and the message.
    //! \param shouldLog When false, putOutput() writes nothing.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Copies the configuration of \p other; pending buffered text is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer contents>" to the target stream,
    //! clears the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so that both
            // stay together when the target is not std::cout
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                // use '0' padding only for the timestamp and restore the caller's fill afterwards
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // target stream
    std::string mPrefix;   // text written before every message
    bool mShouldLog;       // whether putOutput() emits anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer so that the buffer is constructed
//!  before the std::ostream base of LogStreamConsumer receives its address.
//!
class LogStreamConsumerBase
{
protected:
    //! The buffer that LogStreamConsumer attaches its std::ostream to.
    LogStreamConsumerBuffer mBuffer;

public:
    //! Forwards all arguments to the contained buffer.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Output stream used to log messages of one fixed severity with C++ stream syntax.
//!  The base class order (LogStreamConsumerBase first, std::ostream second) must not be
//!  changed: the buffer held by LogStreamConsumerBase has to be constructed before its
//!  address is handed to std::ostream.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a stream for messages of level \p severity.
    //!  Output is enabled only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the stream to the already constructed buffer
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Creates a new stream with the severity and enabled state of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the already constructed buffer
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this stream emits output for the new threshold.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! Chooses std::cout for severities >= kINFO and std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed one-letter tag.
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger whose reportable severity is \p severity (kWARNING when omitted)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is written through a temporary LogStreamConsumer, prefixed with "[TRT] "; the consumer
    //! decides from mReportableSeverity and \p severity whether anything is emitted.
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        // Only Logger (a friend) can construct a TestAtom; see Logger::defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set to true by Logger::reportTestStart()
        std::string mName;    //!< Test name printed in result lines
        std::string mCmdline; //!< Command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // The RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return the value returned by reportPass() or reportFail()
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently stored in this logger
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints a line of the form "&&&& <RESULT> <test name> # <command line>".
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
            {
                ss << " ";
            }
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; //!< Threshold passed to every LogStreamConsumer created in log()
};

namespace
{

//!
//! \brief Returns a stream that logs at severity kVERBOSE, gated by the logger's reportable severity.
//!
//! Usage: LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Returns a stream that logs at severity kINFO, gated by the logger's reportable severity.
//!
//! Usage: LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Returns a stream that logs at severity kWARNING, gated by the logger's reportable severity.
//!
//! Usage: LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Returns a stream that logs at severity kERROR, gated by the logger's reportable severity.
//!
//! Usage: LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Returns a stream that logs at severity kINTERNAL_ERROR ("fatal"), gated by the
//!  logger's reportable severity.
//!
//! Usage: LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H

================================================
FILE: scaled-yolov4/mish.cu
================================================
#include <cmath>
#include <stdio.h>
#include <cassert>
#include <iostream>
#include "mish.h"

namespace nvinfer1
{
    // Default construction. input_size_ is filled in later by
    // getOutputDimensions() or copied over by clone().
    MishPlugin::MishPlugin()
    {
    }

    MishPlugin::~MishPlugin()
    {
    }

    // Create the plugin at runtime from a byte stream written by serialize().
    // The stream holds exactly one int: input_size_.
    MishPlugin::MishPlugin(const void* data, size_t length)
    {
        assert(length == sizeof(input_size_));
        input_size_ = *reinterpret_cast<const int*>(data);
    }

    // Write input_size_ into buffer; the buffer must hold at least
    // getSerializationSize() bytes.
    void MishPlugin::serialize(void* buffer) const
    {
        *reinterpret_cast<int*>(buffer) = input_size_;
    }

    // Number of bytes serialize() writes.
    size_t MishPlugin::getSerializationSize() const
    {
        return sizeof(input_size_);
    }

    // Nothing to set up; always reports success (0).
    int MishPlugin::initialize()
    {
        return 0;
    }

    // The single output has the same three dimensions as the single input.
    // As a side effect, records the per-sample element count in input_size_.
    Dims MishPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
    {
        assert(nbInputDims == 1);
        assert(index == 0);
        input_size_ = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2];
        // Output dimensions
        return Dims3(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
    }

    // Set plugin namespace. Only the pointer is stored, not a copy of the string.
    void MishPlugin::setPluginNamespace(const char* pluginNamespace)
    {
        mPluginNamespace = pluginNamespace;
    }

    const char* MishPlugin::getPluginNamespace() const
    {
        return mPluginNamespace;
    }

    // Return the DataType of the plugin output at the requested index (always kFLOAT).
    DataType MishPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
    {
        return DataType::kFLOAT;
    }

    // Return true if output tensor is broadcast across a batch (never).
    bool MishPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
    {
        return false;
    }

    // Return true if plugin can use input that is broadcast across batch without replication (never).
    bool MishPlugin::canBroadcastInputAcrossBatch(int inputIndex) const
    {
        return false;
    }

    // No configuration is derived from the tensor descriptions.
    void MishPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
    {
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    // This plugin uses none of the offered resources.
    void MishPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
    {
    }

    // Detach the plugin object from its execution context.
    void MishPlugin::detachFromContext() {}

    // Must match MishPluginCreator::getPluginName().
    const char* MishPlugin::getPluginType() const
    {
        return "Mish_TRT";
    }

    // Must match MishPluginCreator::getPluginVersion().
    const char* MishPlugin::getPluginVersion() const
    {
        return "1";
    }

    // The plugin owns itself; destroy() is how it gets released.
    void MishPlugin::destroy()
    {
        delete this;
    }

    // Clone the plugin: a new heap object carrying the same input_size_ and namespace pointer.
    IPluginV2IOExt* MishPlugin::clone() const
    {
        MishPlugin *p = new MishPlugin();
        p->input_size_ = input_size_;
        p->setPluginNamespace(mPluginNamespace);
        return p;
    }

    // tanh(x) written as 2 / (1 + e^(-2x)) - 1.
    __device__ float tanh_activate_kernel(float x){return (2/(1 + expf(-2*x)) - 1);}

    // softplus(x) = log(e^x + 1), with shortcuts outside [-threshold, threshold]:
    // returns x above the threshold and e^x below the negative threshold.
    __device__ float softplus_kernel(float x, float threshold = 20) {
        if (x > threshold) return x;                // too large
        else if (x < -threshold) return expf(x);    // too small
        return logf(expf(x) + 1);
    }

    // One thread per element: output[i] = input[i] * tanh(softplus(input[i])).
    // Threads whose index is >= num_elem do nothing.
    __global__ void mish_kernel(const float *input, float *output, int num_elem) {

        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= num_elem) return;

        output[idx] = input[idx] * tanh_activate_kernel(softplus_kernel(input[idx]));
    }

    // Launch mish_kernel over input_size_ * batchSize elements of inputs[0].
    // The launch is issued on `stream`, so it is ordered with the other work
    // the caller enqueues on that stream.
    void MishPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
        const int num_elem = input_size_ * batchSize;
        // Nothing to compute; a zero-sized grid would be an invalid launch configuration.
        if (num_elem <= 0) return;

        const int block_size = thread_count_;
        const int grid_size = (num_elem + block_size - 1) / block_size;
        mish_kernel<<<grid_size, block_size, 0, stream>>>(inputs[0], output, num_elem);
    }

    // Enqueue the activation for batchSize samples; inputs[0] and outputs[0]
    // are treated as float buffers. Always returns 0.
    int MishPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
    {
        forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
        return 0;
    }

    PluginFieldCollection MishPluginCreator::mFC{};
    std::vector<PluginField> MishPluginCreator::mPluginAttributes;

    // The plugin takes no creation-time fields, so the field collection is empty.
    MishPluginCreator::MishPluginCreator()
    {
        mPluginAttributes.clear();

        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    const char* MishPluginCreator::getPluginName() const
    {
        return "Mish_TRT";
    }

    const char* MishPluginCreator::getPluginVersion() const
    {
        return "1";
    }

    const PluginFieldCollection* MishPluginCreator::getFieldNames()
    {
        return &mFC;
    }

    // Create a fresh plugin; `name` and `fc` are not used.
    IPluginV2IOExt* MishPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
    {
        MishPlugin* obj = new MishPlugin();
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    // Rebuild a plugin from serialized bytes; `name` is not used.
    IPluginV2IOExt* MishPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
    {
        // This object will be deleted when the network is destroyed, which will
        // call MishPlugin::destroy()
        MishPlugin* obj = new MishPlugin(serialData, serialLength);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

}


================================================
FILE: scaled-yolov4/mish.h
================================================
#ifndef TRTX_MISH_PLUGIN_H
#define TRTX_MISH_PLUGIN_H

#include <string>
#include <vector>
#include "NvInfer.h"

namespace nvinfer1
{
    // TensorRT plugin with one input and one output that accepts only linear
    // FP32 tensors. The member functions declared here are defined in mish.cu.
    class MishPlugin: public IPluginV2IOExt
    {
        public:
            explicit MishPlugin();
            // Rebuilds a plugin from the bytes produced by serialize().
            MishPlugin(const void* data, size_t length);

            ~MishPlugin();

            // The plugin always produces exactly one output tensor.
            int getNbOutputs() const override
            {
                return 1;
            }

            Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;

            int initialize() override;

            // Nothing to release.
            virtual void terminate() override {}

            // No scratch memory is needed.
            virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}

            virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;

            virtual size_t getSerializationSize() const override;

            virtual void serialize(void* buffer) const override;

            // Only linear-format FP32 tensors are accepted at every position.
            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const override;

            const char* getPluginVersion() const override;

            void destroy() override;

            IPluginV2IOExt* clone() const override;

            void setPluginNamespace(const char* pluginNamespace) override;

            const char* getPluginNamespace() const override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

            bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;

            bool canBroadcastInputAcrossBatch(int inputIndex) const override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;

            void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;

            void detachFromContext() override;

            // Element count of one input sample. Zero until set, so that
            // serialize() and clone() never read an indeterminate value.
            int input_size_ = 0;
        private:
            void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1);
            // Threads per block used for the kernel launch.
            int thread_count_ = 256;
            // Non-owning pointer installed by setPluginNamespace(). Starts as an
            // empty string so getPluginNamespace() and clone() are well defined
            // even if no namespace was ever set.
            const char* mPluginNamespace = "";
    };

    // Registry factory for MishPlugin; takes no creation-time fields.
    class MishPluginCreator : public IPluginCreator
    {
        public:
            MishPluginCreator();

            ~MishPluginCreator() override = default;

            const char* getPluginName() const override;

            const char* getPluginVersion() const override;

            const PluginFieldCollection* getFieldNames() override;

            IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;

            IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

            // Stores a copy of the namespace string.
            void setPluginNamespace(const char* libNamespace) override
            {
                mNamespace = libNamespace;
            }

            const char* getPluginNamespace() const override
            {
                return mNamespace.c_str();
            }

        private:
            std::string mNamespace;
            static PluginFieldCollection mFC;
            static std::vector<PluginField> mPluginAttributes;
    };
    REGISTER_TENSORRT_PLUGIN(MishPluginCreator);
}  // namespace nvinfer1

#endif  // TRTX_MISH_PLUGIN_H

================================================
FILE: scaled-yolov4/utils.h
================================================
#ifndef __TRT_UTILS_H_
#define __TRT_UTILS_H_

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>

#include <cudnn.h>

#ifndef CUDA_CHECK

// Evaluates a CUDA runtime call once and, when it does not return cudaSuccess,
// prints the numeric error code with the file and line of the call site to
// std::cerr and then trips assert(0).
//
// Wrapped in do { } while (0) so that `CUDA_CHECK(x);` behaves as a single
// statement, including inside an unbraced if/else. The argument is
// parenthesised so any expression can be passed.
#define CUDA_CHECK(callstr)                                                                    \
    do {                                                                                       \
        cudaError_t error_code = (callstr);                                                    \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__  \
                      << std::endl;                                                            \
            assert(0);                                                                         \
        }                                                                                      \
    } while (0)

#endif

// Helpers for packing and unpacking plain values into a raw byte buffer.
namespace Tn
{
    // Copies the bytes of val to the position buffer points at, then advances
    // buffer by sizeof(T). std::memcpy is used rather than a typed store so
    // the destination does not need to be aligned for T.
    // T is expected to be trivially copyable.
    template<typename T>
    void write(char*& buffer, const T& val)
    {
        std::memcpy(buffer, &val, sizeof(T));
        buffer += sizeof(T);
    }

    // Copies sizeof(T) bytes from the position buffer points at into val,
    // then advances buffer by sizeof(T). std::memcpy is used rather than a
    // typed load so the source does not need to be aligned for T.
    // T is expected to be trivially copyable.
    template<typename T>
    void read(const char*& buffer, T& val)
    {
        std::memcpy(&val, buffer, sizeof(T));
        buffer += sizeof(T);
    }
}

#endif

================================================
FILE: scaled-yolov4/yololayer.cu
================================================
#include <assert.h>
#include "yololayer.h"
#include "utils.h"

using namespace Yolo;

namespace nvinfer1
{
    // Build a plugin from the compile-time configuration in namespace Yolo
    // (CLASS_NUM and the yolo1/yolo2/yolo3 kernel descriptors) and upload each
    // kernel's anchors to its own device buffer.
    YoloLayerPlugin::YoloLayerPlugin()
    {
        mClassCount = CLASS_NUM;
        mYoloKernel.clear();
        mYoloKernel.push_back(yolo1);
        mYoloKernel.push_back(yolo2);
        mYoloKernel.push_back(yolo3);

        mKernelCount = mYoloKernel.size();

        // mAnchor is a pinned host array of device pointers, one per kernel.
        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
        for(int ii = 0; ii < mKernelCount; ii ++)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen));
            const auto& yolo = mYoloKernel[ii];
            CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
        }
    }

    // Release the per-kernel device anchor buffers and the pinned pointer
    // table that both constructors allocate. Return codes are deliberately
    // not checked: a destructor should not abort.
    YoloLayerPlugin::~YoloLayerPlugin()
    {
        for (int ii = 0; ii < mKernelCount; ii++)
        {
            cudaFree(mAnchor[ii]);
        }
        cudaFreeHost(mAnchor);
    }

    // create the plugin at runtime from a byte stream
    // Layout (as written by serialize()): mClassCount, mThreadCount,
    // mKernelCount, then mKernelCount raw YoloKernel structs.
    YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
    {
        using namespace Tn;
        const char *d = reinterpret_cast<const char *>(data), *a = d;
        read(d, mClassCount);
        read(d, mThreadCount);
        read(d, mKernelCount);
        mYoloKernel.resize(mKernelCount);
        auto kernelSize = mKernelCount*sizeof(YoloKernel);
        memcpy(mYoloKernel.data(),d,kernelSize);
        d += kernelSize;

        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
        for(int ii = 0; ii < mKernelCount; ii ++)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen));
            const auto& yolo = mYoloKernel[ii];
            CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
        }

        // Every byte of the stream must have been consumed.
        assert(d == a + length);
    }

    // Write the plugin state into buffer, which must hold at least
    // getSerializationSize() bytes.
    void YoloLayerPlugin::serialize(void* buffer) const
    {
        using namespace Tn;
        char* d = static_cast<char*>(buffer), *a = d;
        write(d, mClassCount);
        write(d, mThreadCount);
        write(d, mKernelCount);
        auto kernelSize = mKernelCount*sizeof(YoloKernel);
        memcpy(d,mYoloKernel.data(),kernelSize);
        d += kernelSize;

        assert(d == a + getSerializationSize());
    }

    // Number of bytes serialize() writes.
    size_t YoloLayerPlugin::getSerializationSize() const
    {
        return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount)  + sizeof(Yolo::YoloKernel) * mYoloKernel.size();
    }

    // Nothing to set up; always reports success (0).
    int YoloLayerPlugin::initialize()
    {
        return 0;
    }

    // The output is a flat float tensor: one leading float holding the
    // detection count, followed by room for MAX_OUTPUT_BBOX_COUNT Detection
    // records. The input dimensions are not consulted.
    Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
    {
        //output the result to channel
        int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);

        return Dims3(totalsize + 1, 1, 1);
    }

    // Set plugin namespace. Only the pointer is stored, not a copy of the string.
    void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace)
    {
        mPluginNamespace = pluginNamespace;
    }

    const char* YoloLayerPlugin::getPluginNamespace() const
    {
        return mPluginNamespace;
    }

    // Return the DataType of the plugin output at the requested index (always kFLOAT).
    DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
    {
        return DataType::kFLOAT;
    }

    // Return true if output tensor is broadcast across a batch (never).
    bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
    {
        return false;
    }

    // Return true if plugin can use input that is broadcast across batch without replication (never).
    bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const
    {
        return false;
    }

    // No configuration is derived from the tensor descriptions.
    void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
    {
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    // This plugin uses none of the offered resources.
    void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
    {
    }

    // Detach the plugin object from its execution context.
    void YoloLayerPlugin::detachFromContext() {}

    // Must match YoloPluginCreator::getPluginName().
    const char* YoloLayerPlugin::getPluginType() const
    {
        return "YoloLayer_TRT";
    }

    // Must match YoloPluginCreator::getPluginVersion().
    const char* YoloLayerPlugin::getPluginVersion() const
    {
        return "1";
    }

    // The plugin owns itself; destroy() is how it gets released.
    void YoloLayerPlugin::destroy()
    {
        delete this;
    }

    // Clone the plugin.
    // The copy is made by round-tripping through serialize() and the
    // deserializing constructor, so it carries this object's class count,
    // thread count and kernel table (rather than the compile-time defaults)
    // and gets its own device anchor buffers.
    IPluginV2IOExt* YoloLayerPlugin::clone() const
    {
        std::vector<char> state(getSerializationSize());
        serialize(state.data());
        YoloLayerPlugin *p = new YoloLayerPlugin(state.data(), state.size());
        p->setPluginNamespace(mPluginNamespace);
        return p;
    }

    // Logistic sigmoid: 1 / (1 + e^(-data)).
    __device__ float Logist(float data){ return 1./(1. + exp(-data)); }

    // One thread per (batch item, grid cell). For each of the CHECK_COUNT
    // anchors of that cell the thread finds the class with the highest
    // sigmoid score, drops the candidate when either that score or the box
    // score is below IGNORE_THRESH, and otherwise appends a Detection to the
    // batch item's output slice.
    //
    // Output slice layout (outputElem floats per batch item): a float
    // counter, incremented atomically, followed by Detection records.
    // Once the counter reaches MAX_OUTPUT_BBOX_COUNT no further records are
    // written, although the counter itself may continue to grow.
    __global__ void CalDetection(const float *input, float *output, int noElements,
            int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT*2],int classes,int outputElem) {

        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= noElements) return;

        int total_grid = yoloWidth * yoloHeight;
        int bnIdx = idx / total_grid;          // batch item
        idx = idx - total_grid*bnIdx;          // cell index inside the grid
        int info_len_i = 5 + classes;          // 4 box values + box score + class scores
        const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);

        for (int k = 0; k < CHECK_COUNT; ++k) {
            int class_id = 0;
            float max_cls_prob = 0.0;
            for (int i = 5; i < info_len_i; ++i) {
                float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
                if (p > max_cls_prob) {
                    max_cls_prob = p;
                    class_id = i - 5;
                }
            }
            float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
            if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue;

            float *res_count = output + bnIdx*outputElem;
            int count = (int)atomicAdd(res_count, 1);
            if (count >= MAX_OUTPUT_BBOX_COUNT) return;
            char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection);
            Detection* det =  (Detection*)(data);

            int row = idx / yoloWidth;
            int col = idx % yoloWidth;

            //Location
            det->bbox[0] = (col + (2 * (Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid]))) - 0.5) * INPUT_W / yoloWidth;
            det->bbox[1] = (row + (2 * (Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid]))) - 0.5) * INPUT_H / yoloHeight;
            det->bbox[2] = (powf(2 * (Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid])), 2)) * anchors[2*k];
            det->bbox[3] = (powf(2 * (Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid])), 2)) * anchors[2*k + 1];
            det->det_confidence = box_prob;
            det->class_id = class_id;
            det->class_confidence = max_cls_prob;
        }
    }

    // Reset each batch item's detection counter, then run CalDetection once
    // per YoloKernel, reading inputs[i] for kernel i. All work is issued on
    // `stream` so it is ordered with whatever else the caller enqueues there.
    void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {

        int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);

        for(int idx = 0 ; idx < batchSize; ++idx) {
            CUDA_CHECK(cudaMemsetAsync(output + idx*outputElem, 0, sizeof(float), stream));
        }
        for (unsigned int i = 0;i< mYoloKernel.size();++i)
        {
            const auto& yolo = mYoloKernel[i];
            const int numElem = yolo.width*yolo.height*batchSize;
            // An empty grid has nothing to scan, and a zero block size would
            // make the block-count division below divide by zero.
            if (numElem <= 0) continue;
            // Shrink the block for small grids without touching mThreadCount,
            // which is part of the serialized state.
            const int threadCount = numElem < mThreadCount ? numElem : mThreadCount;
            const int blockCount = (numElem + threadCount - 1) / threadCount;
            CalDetection<<<blockCount, threadCount, 0, stream>>>
                (inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem);
        }

    }


    // Enqueue detection decoding for batchSize samples; inputs and outputs[0]
    // are treated as float buffers. Always returns 0.
    int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
    {
        forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);

        return 0;
    }

    PluginFieldCollection YoloPluginCreator::mFC{};
    std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

    // The plugin takes no creation-time fields, so the field collection is empty.
    YoloPluginCreator::YoloPluginCreator()
    {
        mPluginAttributes.clear();

        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    const char* YoloPluginCreator::getPluginName() const
    {
        return "YoloLayer_TRT";
    }

    const char* YoloPluginCreator::getPluginVersion() const
    {
        return "1";
    }

    const PluginFieldCollection* YoloPluginCreator::getFieldNames()
    {
        return &mFC;
    }

    // Create a fresh plugin; `name` and `fc` are not used.
    IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
    {
        YoloLayerPlugin* obj = new YoloLayerPlugin();
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    // Rebuild a plugin from serialized bytes; `name` is not used.
    IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
    {
        // This object will be deleted when the network is destroyed, which will
        // call YoloLayerPlugin::destroy()
        YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

}

================================================
FILE: scaled-yolov4/yololayer.h
================================================
#ifndef _YOLO_LAYER_H
#define _YOLO_LAYER_H

#include <iostream>
#include <vector>
#include "NvInfer.h"

// Compile-time configuration and plain data types shared by the YOLO layer
// plugin and the host code that consumes its output.
namespace Yolo
{
    // Number of anchor (width, height) pairs stored per YoloKernel.
    static constexpr int CHECK_COUNT = 3;
    // Candidates whose class score or box score is below this value are
    // skipped by the detection kernel.
    static constexpr float IGNORE_THRESH = 0.1f;
    // Capacity, in Detection records, of one batch item's output slice.
    static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
    // Class count used by a default-constructed YoloLayerPlugin.
    static constexpr int CLASS_NUM = 80;
    // Network input height and width; also used to scale box coordinates.
    static constexpr int INPUT_H = 512;
    static constexpr int INPUT_W = 512;

    // Describes one detection grid: its size and its anchors.
    // NOTE: this struct is written to and read from the serialized plugin
    // state as raw bytes, so its layout must stay stable.
    struct YoloKernel
    {
        int width;
        int height;
        // CHECK_COUNT anchors stored as consecutive (w, h) pairs.
        float anchors[CHECK_COUNT*2];
    };

    // Grid at 1/8 of the input resolution.
    static constexpr YoloKernel yolo1 = {
        INPUT_W / 8,
        INPUT_H / 8,
        {12,16, 19,36, 40,28}
    };
    // Grid at 1/16 of the input resolution.
    static constexpr YoloKernel yolo2 = {
        INPUT_W / 16,
        INPUT_H / 16,
        {36,75, 76,55, 72,146}
    };
    // Grid at 1/32 of the input resolution.
    static constexpr YoloKernel yolo3 = {
        INPUT_W / 32,
        INPUT_H / 32,
        {142,110, 192,243, 459,401}
    };

    // Number of box coordinates per detection.
    static constexpr int LOCATIONS = 4;
    // One detection record as laid out in the plugin's float output buffer;
    // every field is a float so the record can be addressed as floats.
    struct alignas(float) Detection{
        // x, y, w, h
        float bbox[LOCATIONS];
        // Sigmoid of the box (objectness) score.
        float det_confidence;
        // Index of the best-scoring class, stored as a float.
        float class_id;
        // Sigmoid score of that class.
        float class_confidence;
    };
}


namespace nvinfer1
{
    // TensorRT plugin that produces a single linear FP32 output tensor.
    // The member functions declared here are defined in yololayer.cu.
    class YoloLayerPlugin: public IPluginV2IOExt
    {
        public:
            explicit YoloLayerPlugin();
            // Rebuilds a plugin from the bytes produced by serialize().
            YoloLayerPlugin(const void* data, size_t length);

            ~YoloLayerPlugin();

            // The plugin always produces exactly one output tensor.
            int getNbOutputs() const override
            {
                return 1;
            }

            Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;

            int initialize() override;

            // Nothing to release here.
            virtual void terminate() override {}

            // No scratch memory is needed.
            virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}

            virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;

            virtual size_t getSerializationSize() const override;

            virtual void serialize(void* buffer) const override;

            // Only linear-format FP32 tensors are accepted at every position.
            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const override;

            const char* getPluginVersion() const override;

            void destroy() override;

            IPluginV2IOExt* clone() const override;

            void setPluginNamespace(const char* pluginNamespace) override;

            const char* getPluginNamespace() const override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

            bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;

            bool canBroadcastInputAcrossBatch(int inputIndex) const override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;

            void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;

            void detachFromContext() override;

        private:
            void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream, int batchSize = 1);
            // All members carry a default so that no constructor path can
            // leave one of them indeterminate.
            int mClassCount = 0;
            // Number of entries in mYoloKernel and in the mAnchor table.
            int mKernelCount = 0;
            std::vector<Yolo::YoloKernel> mYoloKernel;
            // Upper bound on threads per block for the kernel launch.
            int mThreadCount = 256;
            // Host table of mKernelCount device pointers, one anchor buffer
            // per kernel; allocated by the constructors.
            void** mAnchor = nullptr;
            // Non-owning pointer installed by setPluginNamespace(). Starts as
            // an empty string so getPluginNamespace() and clone() are well
            // defined even if no namespace was ever set.
            const char* mPluginNamespace = "";
    };

    // Registry factory for YoloLayerPlugin; takes no creation-time fields.
    class YoloPluginCreator : public IPluginCreator
    {
        public:
            YoloPluginCreator();

            ~YoloPluginCreator() override = default;

            const char* getPluginName() const override;

            const char* getPluginVersion() const override;

            const PluginFieldCollection* getFieldNames() override;

            IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;

            IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

            // Stores a copy of the namespace string.
            void setPluginNamespace(const char* libNamespace) override
            {
                mNamespace = libNamespace;
            }

            const char* getPluginNamespace() const override
            {
                return mNamespace.c_str();
            }

        private:
            std::string mNamespace;
            static PluginFieldCollection mFC;
            static std::vector<PluginField> mPluginAttributes;
    };
    REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
}  // namespace nvinfer1

#endif 


================================================
FILE: scaled-yolov4/yolov4_csp.cpp
================================================
#include <iostream>
#include <chrono>
#include <dirent.h>

#include "logging.h"
#include "utils.h"
#include "cuda_runtime_api.h"
#include "common.hpp"

// Build/run configuration for this sample.
#define USE_FP16  // comment this out to use FP32 instead
#define DEVICE 0  // GPU id
// NOTE(review): presumably the IoU threshold for non-maximum suppression and
// the minimum confidence for keeping a box; their use is outside this part of
// the file — confirm.
#define NMS_THRESH 0.4
#define BBOX_CONF_THRESH 0.5
// NOTE(review): presumably the batch size used for building and running the
// engine — confirm against the code below.
#define BATCH_SIZE 1

// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
// Number of floats in one Yolo::Detection record.
static const int DETECTION_SIZE = sizeof(Yolo::Detection) / sizeof(float);
// Floats per image in the output blob: one leading count value plus room for
// MAX_OUTPUT_BBOX_COUNT detections. We assume the yolo layer outputs no more
// than MAX_OUTPUT_BBOX_COUNT boxes with conf >= 0.1.
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DETECTION_SIZE + 1;
// Names of the network's input and output tensors.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

// Logger instance for this translation unit.
static Logger gLogger;


// Create the engine using only the API and not any parser.
//
// Builds the whole network layer by layer from the weights in
// "../yolov4_csp.wts", feeds three 1x1 convolution heads into the
// "YoloLayer_TRT" plugin, marks the plugin output as OUTPUT_BLOB_NAME and asks
// `builder` to build an engine.
//
//   maxBatchSize  forwarded to builder->setMaxBatchSize()
//   builder       creates the network and builds the engine
//   config        receives the workspace size and (with USE_FP16) the FP16 flag
//   dt            data type of the network input tensor
//
// Returns whatever buildEngineWithConfig() returns; the pointer is not checked
// here, so the caller has to handle nullptr.
//
// The trailing integer of every convBnMish() call matches the layer index used
// in the variable names; the arguments before it are presumably
// (output channels, kernel size, stride, padding) - confirm against common.hpp.
// Plain assignments such as `auto l7 = l5;` create no new layer: they alias an
// earlier layer so that a second branch can start from its output.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder -> createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network -> addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    // Every buffer stored in weightMap is released with free() at the end of this function.
    std::map<std::string, Weights> weightMap = loadWeights("../yolov4_csp.wts");
    // Empty weights, used as the (absent) bias of the deconvolution layers below.
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // define yolov4 csp layers
    auto l0 = convBnMish(network, weightMap, *data, 32, 3, 1, 1, 0);
    auto l1 = convBnMish(network, weightMap, *l0 -> getOutput(0), 64, 3, 2, 1, 1);
    auto l2 = convBnMish(network, weightMap, *l1 -> getOutput(0), 32, 1, 1, 0, 2);
    auto l3 = convBnMish(network, weightMap, *l2 -> getOutput(0), 64, 3, 1, 1, 3);
    // Residual connection: element-wise sum with the block input.
    auto ew4 = network -> addElementWise(*l3 -> getOutput(0), *l1 -> getOutput(0), ElementWiseOperation::kSUM);
    auto l5 = convBnMish(network, weightMap, *ew4 -> getOutput(0), 128, 3, 2, 1, 5);
    // Two branches start from l5: l6 directly, and l8..l15 with residual sums; they are joined by cat16.
    auto l6 = convBnMish(network, weightMap, *l5 -> getOutput(0), 64, 1, 1, 0, 6);
    auto l7 = l5;
    auto l8 = convBnMish(network, weightMap, *l7 -> getOutput(0), 64, 1, 1, 0, 8);
    auto l9 = convBnMish(network, weightMap, *l8 -> getOutput(0), 64, 1, 1, 0, 9);
    auto l10 = convBnMish(network, weightMap, *l9 -> getOutput(0), 64, 3, 1, 1, 10);
    auto ew11 = network -> addElementWise(*l10 -> getOutput(0), *l8 -> getOutput(0), ElementWiseOperation::kSUM);
    auto l12 = convBnMish(network, weightMap, *ew11 -> getOutput(0), 64, 1, 1, 0, 12);
    auto l13 = convBnMish(network, weightMap, *l12 -> getOutput(0), 64, 3, 1, 1, 13);
    auto ew14 = network -> addElementWise(*l13 -> getOutput(0), *ew11 -> getOutput(0), ElementWiseOperation::kSUM);
    auto l15 = convBnMish(network, weightMap, *ew14 -> getOutput(0), 64, 1, 1, 0, 15);

    ITensor* inputTensors16[] = {l15 -> getOutput(0), l6 -> getOutput(0)};
    auto cat16 = network -> addConcatenation(inputTensors16, 2);

    auto l17 = convBnMish(network, weightMap, *cat16 -> getOutput(0), 128, 1, 1, 0, 17);
    auto l18 = convBnMish(network, weightMap, *l18_input_guard_unused_placeholder_never_emitted, 256, 3, 2, 1, 18);

// Builds the network with the TensorRT API and serializes the resulting engine.
//
//   maxBatchSize  maximum batch size the engine is built for
//   modelStream   out-parameter; receives the serialized engine. Ownership
//                 passes to the caller, who must call destroy() on it.
//
// The builder, its config and the engine are all released before returning.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);

    // create builder config
    IBuilderConfig* config = builder -> createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // serialize the trt engine
    (*modelStream) = engine -> serialize();

    // Close everything down. The config was created by the builder, so it is
    // destroyed before the builder (a factory must outlive what it creates).
    engine -> destroy();
    config -> destroy();
    builder -> destroy();
}

// Runs one synchronous inference pass.
//
//   context    execution context of an engine with exactly two bindings
//              (INPUT_BLOB_NAME and OUTPUT_BLOB_NAME)
//   input      host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats
//   output     host buffer receiving batchSize * OUTPUT_SIZE floats
//   batchSize  number of images in `input`
//
// Device buffers and the CUDA stream are allocated and released on every call.
// Every CUDA call goes through CUDA_CHECK; a failed enqueue is reported on
// stderr (the function returns void, so it cannot be signalled to the caller).
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Byte sizes are computed once and reused for allocation and copies.
    const size_t inputBytes = batchSize * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = batchSize * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // Previously the result was ignored and a failed launch went unnoticed.
        std::cerr << "doInference: enqueue failed, output is not valid" << std::endl;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
}

// Appends the name of every entry of directory `p_dir_name` to `file_names`,
// leaving out the "." and ".." entries. Names are appended in the order
// readdir() reports them and are not prefixed with the directory path.
//
// Returns 0 on success, or -1 (with `file_names` untouched) when the
// directory cannot be opened.
int read_files_in_dir(const char* p_dir_name, std::vector<std::string> &file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const std::string entry_name(entry -> d_name);
        // Skip the self and parent directory entries.
        if (entry_name == "." || entry_name == "..") {
            continue;
        }
        file_names.push_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}

int main(int argc, char** argv){
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("yolov4csp.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 3 && std::string(argv[1]) == "-d") {
        std::ifstream file("yolov4csp.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov4 -s  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov4 -d ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    std::vector<std::string> file_names;
    if (read_files_in_dir(argv[2], file_names) < 0) {
        std::cout << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    int fcount = 0;
    for (int f = 0; f < (int)file_names.size(); f++) {
        fcount++;
        if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue;
        for (int b = 0; b < fcount; b++) {
            cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]);
            if (img.empty()) continue;
            cv::Mat pr_img = preprocess_img(img);
            for (int i = 0; i < INPUT_H * INPUT_W; i++) {
                data[b * 3 * INPUT_H * INPUT_W + i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
                data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
                data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
            }
        }

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, BATCH_SIZE);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            nms(res, &prob[b * OUTPUT_SIZE], BBOX_CONF_THRESH, NMS_THRESH);
        }
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            //std::cout << res.size() << std::endl;
            cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]);
            for (size_t j = 0; j < res.size(); j++) {
                float *p = (float*)&res[j];
                for (size_t k = 0; k < 7; k++) {
                   std::cout << p[k] << ", ";
                }
                std::cout << std::endl;
                cv::Rect r = get_rect(img, res[j].bbox);
                cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
                cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
            }
            cv::imwrite("_" + file_names[f - fcount + 1 + b], img);
        }
        fcount = 0;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    //Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << i / 10 << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}

================================================
FILE: senet/CMakeLists.txt
================================================
# Build script for the SENet (se_resnet50) TensorRT sample.
cmake_minimum_required(VERSION 2.6)

project(senet)

add_definitions(-std=c++11)

# option() takes (<variable> "<help text>" [value]). The previous two-argument
# form used "OFF" as the help text and only defaulted to OFF by accident.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# Single executable, linked against the TensorRT runtime and the CUDA runtime.
add_executable(se_resnet ${PROJECT_SOURCE_DIR}/se_resnet50.cpp)
target_link_libraries(se_resnet nvinfer)
target_link_libraries(se_resnet cudart)

add_definitions(-O2 -pthread)


================================================
FILE: senet/README.md
================================================
# SENet

An implementation of SENet, proposed in the paper "Squeeze-and-Excitation Networks" by Jie Hu, Li Shen, Samuel Albanie, Gang Sun, and Enhua Wu.

[https://arxiv.org/abs/1709.01507](https://arxiv.org/abs/1709.01507)

For the PyTorch implementation, you can refer to [wang-xinyu/senet.pytorch](https://github.com/wang-xinyu/senet.pytorch), which is forked from [moskomule/senet.pytorch](https://github.com/moskomule/senet.pytorch).


```
// 1. generate se_resnet50.wts from [wang-xinyu/senet.pytorch](https://github.com/wang-xinyu/senet.pytorch)

// 2. put se_resnet50.wts into tensorrtx/senet

// 3. build and run

cd tensorrtx/senet

mkdir build

cd build

cmake ..

make

sudo ./se_resnet -s   // serialize model to plan file i.e. 'se_resnet50.engine'

sudo ./se_resnet -d   // deserialize plan file and run inference

// 4. check that the output is the same as that of [wang-xinyu/senet.pytorch]
```


================================================
FILE: senet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents, preceded by a prefix, to an output stream.
//!  Text accumulates in the std::stringbuf base; on sync() or destruction it is written to the
//!  wrapped stream as "<prefix><text>" if logging is enabled. A "[MM/DD/YYYY-HH:MM:SS] "
//!  timestamp is written to std::cout just before each message.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    destination of the prefixed messages (held by reference, must outlive this object)
    //! \param prefix    text inserted in front of every message
    //! \param shouldLog when false, putOutput() does nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Move constructor. Carries over the destination stream, the prefix and the logging flag.
    //! Previously only the stream was taken, leaving mShouldLog uninitialized and mPrefix empty.
    //! Text still pending in \p other is not transferred; \p other flushes it itself.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes the timestamp and the prefixed buffer contents, then clears the buffer and flushes
    //! the destination stream. Does nothing (and keeps the buffer) when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            // NOTE(review): the timestamp goes to std::cout even when mOutput is another stream.
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; //!< destination stream for prefixed messages
    std::string mPrefix;   //!< text written before every message
    bool mShouldLog;       //!< when false, messages are not written
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!  Listed as the first base class of LogStreamConsumer so that the buffer is fully
//!  constructed before its address is handed to the std::ostream base.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }

protected:
    //! The stream buffer derived classes attach their std::ostream to.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Lets callers write a log message with the usual C++ stream insertion syntax.
//!
//!  The base classes must stay in the order LogStreamConsumerBase, std::ostream: the first one constructs
//!  the LogStreamConsumerBuffer member, and only afterwards is the address of that buffer given to
//!  std::ostream. Swapping them would hand std::ostream the address of a not-yet-constructed buffer.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  The message is emitted only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the already-constructed buffer to the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Creates a fresh consumer with the same severity and enable flag as \p other.
    //!  Only those two settings are taken from \p other; a new buffer is constructed.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the already-constructed buffer to the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! \brief Picks std::cout for severities >= kINFO and std::cerr for everything else.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter tag; asserts on an unknown value.
    static std::string severityPrefix(Severity severity)
    {
        std::string tag;
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;    //!< cached result of mSeverity <= reportable severity
    Severity mSeverity; //!< severity of every message written through this consumer
};

//! \class Logger
//!
//! \brief Central logging facility for TensorRT tools and samples
//!
//! \details Samples log to the console through this class instead of writing to stdout/stderr directly, so
//! that verbosity control and output formatting live in a single place. Two kinds of messages are supported:
//!
//! - Debugging messages that carry a severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! A possible future extension is to dump test results to a file in a standard format (for example,
//! JUnit XML) together with extra metadata such as the duration of a test run.
//!
//! TODO: For backwards compatibility with existing samples, this class derives directly from the
//! nvinfer1::ILogger interface. That is problematic because messages coming from the TensorRT library and
//! messages coming from the sample are not cleanly separated.
//!
//! Once all samples obtain the ILogger through Logger::getTRTLogger(), the inheritance can be dropped and the
//! nvinfer1::ILogger implementation can become a member of the Logger object.

class Logger : public nvinfer1::ILogger
{
public:
    //! \brief Creates a logger with the given reportable severity (kWARNING by default).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief State of a test as reported through this logger
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible accessor for the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger (currently the object itself)
    //!
    //! TODO Once every sample registers its logger with TensorRT through this method,
    //! the inheritance of Logger from ILogger can be removed
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Samples should not call this directly; it is expected to disappear once the inheritance from
    //! nvinfer1::ILogger is removed. The message is written with a "[TRT] " tag through a
    //! LogStreamConsumer built from the current reportable severity.
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer consumer(mReportableSeverity, severity);
        consumer << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Sets the reportable severity that controls the verbosity of logging output
    //!
    //! \param severity New threshold; it is handed to LogStreamConsumer, which emits a message only when
    //!                 the message severity compares <= this value.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle carrying the logging information of one test
    //!
    //! A sample obtains a TestAtom from Logger::defineTest() and passes it to
    //! Logger::reportTest{Start,End}(). Only Logger can construct or inspect it.
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< set by reportTestStart()
        std::string mName;    //!< test name printed in result lines
        std::string mCmdline; //!< command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        const bool notStartedYet = false;
        return TestAtom(notStartedYet, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() that takes the command line as an (argc, argv) pair
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // The RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        if (pass)
        {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current reportable severity.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns the bracketed one-letter tag used to prefix a log message of the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    //!
    //! \brief returns the word printed in a test result line for the given result
    //!
    static const char* testResultString(TestResult result)
    {
        const char* word = "";
        switch (result)
        {
        case TestResult::kRUNNING: word = "RUNNING"; break;
        case TestResult::kPASSED: word = "PASSED"; break;
        case TestResult::kFAILED: word = "FAILED"; break;
        case TestResult::kWAIVED: word = "WAIVED"; break;
        default: assert(0); break;
        }
        return word;
    }

    //!
    //! \brief returns std::cout for severities >= kINFO and std::cerr otherwise
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief prints one "&&&& <RESULT> <name> # <cmdline>" line on the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief joins the (argc, argv) values into one string with single spaces between arguments
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream joined;
        const char* separator = "";
        for (int idx = 0; idx < argc; ++idx)
        {
            joined << separator << argv[idx];
            separator = " ";
        }
        return joined.str();
    }

    Severity mReportableSeverity; //!< threshold passed to every LogStreamConsumer created by log()
};

namespace
{

//!
//! \brief builds a LogStreamConsumer that logs at severity kVERBOSE, honouring the logger's threshold
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief builds a LogStreamConsumer that logs at severity kINFO, honouring the logger's threshold
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief builds a LogStreamConsumer that logs at severity kWARNING, honouring the logger's threshold
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief builds a LogStreamConsumer that logs at severity kERROR, honouring the logger's threshold
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief builds a LogStreamConsumer that logs at severity kINTERNAL_ERROR ("fatal" severity),
//!        honouring the logger's threshold
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: senet/se_resnet50.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>

// Evaluates `status` exactly once; if the result is non-zero, prints it to std::cerr as
// "Cuda failure: <value>" and terminates the process with abort().
// The do { ... } while (0) wrapper makes the macro behave like a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
// Input height/width: the network input is declared as { 3, INPUT_H, INPUT_W }.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// Number of floats per sample in the output buffer.
static const int OUTPUT_SIZE = 1000;

// Tensor names used when building the network and when looking up binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance handed to the TensorRT builder and runtime.
static Logger gLogger;

// Load weights from a .wts text file.
// Layout as read by this function (whitespace separated):
//   <number of blobs>
//   <name> <element count, decimal> <element 0, hex> <element 1, hex> ...   (one entry per blob)
// Every element is stored as a 32-bit word and the blob is tagged DataType::kFLOAT.
// The returned Weights point at malloc'ed host memory; the caller owns it and must free() it.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs (initialised so a failed read cannot leave garbage behind)
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // A counted loop (instead of `while (count--)`) cannot run away on a negative count
    // when the assertions are compiled out.
    for (int32_t blob = 0; blob < count; ++blob)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(input && "Malformed weight blob header.");
        wt.type = DataType::kFLOAT;

        // Load blob. Allocate by element size: the previous sizeof(val) was the size of the
        // pointer, which over-allocated every blob on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        assert(input && "Weight file ended before all values were read.");
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer that applies inference-time batch normalisation.
// Reads <lname>.weight / .bias / .running_mean / .running_var from weightMap and computes
//   scale = gamma / sqrt(var + eps),  shift = beta - mean * gamma / sqrt(var + eps),  power = 1.
// The three computed arrays are stored back into weightMap as <lname>.scale / .shift / .power,
// so whoever releases the map's buffers releases these too.
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    // The channel count is taken from the running_var blob.
    int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    for (int ch = 0; ch < len; ++ch) {
        const auto denom = sqrt(var[ch] + eps);
        scval[ch] = gamma[ch] / denom;
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / denom;
        pval[ch] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn);
    return bn;
}

// Builds the squeeze-and-excitation branch for `input` and returns the element-wise product
// of `input` with the computed gate:
//   average pool (w x w window, stride w) -> FC to c/16 -> ReLU -> FC to c -> sigmoid -> input * gate.
// Weights are looked up as <lname>fc.0.{weight,bias} and <lname>fc.2.{weight,bias}.
ILayer* seLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c, int w, std::string lname) {
    IPoolingLayer* squeeze = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW(w, w));
    assert(squeeze);
    squeeze->setStrideNd(DimsHW{w, w});

    const int reduced = c / 16;
    IFullyConnectedLayer* fcDown = network->addFullyConnected(*squeeze->getOutput(0), reduced, weightMap[lname + "fc.0.weight"], weightMap[lname + "fc.0.bias"]);
    IActivationLayer* relu = network->addActivation(*fcDown->getOutput(0), ActivationType::kRELU);

    IFullyConnectedLayer* fcUp = network->addFullyConnected(*relu->getOutput(0), c, weightMap[lname + "fc.2.weight"], weightMap[lname + "fc.2.bias"]);
    IActivationLayer* gate = network->addActivation(*fcUp->getOutput(0), ActivationType::kSIGMOID);

    ILayer* scaled = network->addElementWise(input, *gate->getOutput(0), ElementWiseOperation::kPROD);
    assert(scaled);
    return scaled;
}

// Builds one SE bottleneck block and returns its final ReLU layer:
//   1x1 conv -> BN -> ReLU -> 3x3 conv (stride, pad 1) -> BN -> ReLU -> 1x1 conv (outch * 4) -> BN -> SE,
// summed with the shortcut and passed through ReLU. The shortcut is `input` itself, unless the stride
// is not 1 or inch != outch * 4, in which case it is a strided 1x1 conv + BN ("downsample.0"/"downsample.1").
// `w` is forwarded to seLayer as its pooling window size.
IActivationLayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname, int w) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};
    const int expanded = outch * 4;

    // 1x1 reduce
    IConvolutionLayer* reduce = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], noBias);
    assert(reduce);
    IScaleLayer* reduceBn = addBatchNorm2d(network, weightMap, *reduce->getOutput(0), lname + "bn1", 1e-5);
    IActivationLayer* reduceAct = network->addActivation(*reduceBn->getOutput(0), ActivationType::kRELU);
    assert(reduceAct);

    // 3x3 spatial conv, carries the stride
    IConvolutionLayer* spatial = network->addConvolutionNd(*reduceAct->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], noBias);
    assert(spatial);
    spatial->setStrideNd(DimsHW{stride, stride});
    spatial->setPaddingNd(DimsHW{1, 1});
    IScaleLayer* spatialBn = addBatchNorm2d(network, weightMap, *spatial->getOutput(0), lname + "bn2", 1e-5);
    IActivationLayer* spatialAct = network->addActivation(*spatialBn->getOutput(0), ActivationType::kRELU);
    assert(spatialAct);

    // 1x1 expand
    IConvolutionLayer* expand = network->addConvolutionNd(*spatialAct->getOutput(0), expanded, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], noBias);
    assert(expand);
    IScaleLayer* expandBn = addBatchNorm2d(network, weightMap, *expand->getOutput(0), lname + "bn3", 1e-5);

    // squeeze-and-excitation on the expanded features
    ILayer* se = seLayer(network, weightMap, *expandBn->getOutput(0), expanded, w, lname + "se.");

    // shortcut: identity, or projection when the shape changes
    ITensor* shortcut = &input;
    const bool needsProjection = (stride != 1) || (inch != expanded);
    if (needsProjection) {
        IConvolutionLayer* proj = network->addConvolutionNd(input, expanded, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], noBias);
        assert(proj);
        proj->setStrideNd(DimsHW{stride, stride});

        IScaleLayer* projBn = addBatchNorm2d(network, weightMap, *proj->getOutput(0), lname + "downsample.1", 1e-5);
        shortcut = projBn->getOutput(0);
    }

    IElementWiseLayer* sum = network->addElementWise(*shortcut, *se->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* out = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(out);
    return out;
}

// Creates the engine using only the network-definition API (no parser).
// Weights are read from "../se_resnet50.wts"; all host weight buffers held in the map are
// released before returning. The returned engine may be null if the build failed.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Input tensor of shape { 3, INPUT_H, INPUT_W } named INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../se_resnet50.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // Stem: 7x7/2 conv -> BN -> ReLU -> 3x3/2 max pool
    IConvolutionLayer* stemConv = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts);
    assert(stemConv);
    stemConv->setStrideNd(DimsHW{2, 2});
    stemConv->setPaddingNd(DimsHW{3, 3});

    IScaleLayer* stemBn = addBatchNorm2d(network, weightMap, *stemConv->getOutput(0), "bn1", 1e-5);

    IActivationLayer* stemAct = network->addActivation(*stemBn->getOutput(0), ActivationType::kRELU);
    assert(stemAct);

    IPoolingLayer* stemPool = network->addPoolingNd(*stemAct->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(stemPool);
    stemPool->setStrideNd(DimsHW{2, 2});
    stemPool->setPaddingNd(DimsHW{1, 1});

    // Four stages ("layer1".."layer4") with 3, 4, 6 and 3 bottleneck blocks.
    // Per stage: bottleneck width 64, 128, 256, 512 and SE pooling size 56, 28, 14, 7.
    // The first block of stages 2-4 uses stride 2; every other block uses stride 1.
    const int blocksPerStage[4] = {3, 4, 6, 3};
    ITensor* features = stemPool->getOutput(0);
    int inch = 64;
    for (int stage = 0; stage < 4; ++stage) {
        const int outch = 64 << stage;
        const int w = 56 >> stage;
        for (int block = 0; block < blocksPerStage[stage]; ++block) {
            const int stride = (stage > 0 && block == 0) ? 2 : 1;
            const std::string lname = "layer" + std::to_string(stage + 1) + "." + std::to_string(block) + ".";
            IActivationLayer* act = bottleneck(network, weightMap, *features, inch, outch, stride, lname, w);
            features = act->getOutput(0);
            inch = outch * 4;
        }
    }

    // Head: 7x7 average pool -> fully connected with 1000 outputs
    IPoolingLayer* headPool = network->addPoolingNd(*features, PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(headPool);
    headPool->setStrideNd(DimsHW{1, 1});

    IFullyConnectedLayer* classifier = network->addFullyConnected(*headPool->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(classifier);

    ITensor* logits = classifier->getOutput(0);
    logits->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*logits);

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // The network definition is no longer needed once the engine is built
    network->destroy();

    // Release host memory held by the weight map
    for (std::map<std::string, Weights>::iterator it = weightMap.begin(); it != weightMap.end(); ++it)
    {
        free(const_cast<void*>(it->second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

// Runs one batch through the execution context.
//   input  : host buffer of batchSize * 3 * INPUT_H * INPUT_W floats
//   output : host buffer of batchSize * OUTPUT_SIZE floats, filled on return
// Device buffers and a stream are created and released on every call. Any CUDA error, or a
// failed enqueue, is reported on std::cerr and terminates the process (same policy as CHECK).
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // The indices are used to subscript buffers[2], so make sure both names resolved.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    // Compute the byte counts once, in size_t, so the multiplication is not done in int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr))
    {
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work above surface here, so this call must be checked too.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//   -s : build the network, serialize it to "se_resnet50.engine" and exit (0 on success)
//   -d : load "se_resnet50.engine", run 10 timed inferences on an all-ones input and print the output
// Returns -1 on bad arguments or when the plan file cannot be written/read.
int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./se_resnet -s   // serialize model to plan file" << std::endl;
        std::cerr << "./se_resnet -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // serialized engine bytes (filled in -d mode)
    char *trtModelStream{nullptr};
    size_t size{0};

    const std::string mode(argv[1]);
    if (mode == "-s") {
        // create a model using the API directly and serialize it to a stream
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("se_resnet50.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        if (!p) {
            std::cerr << "could not write plan output file" << std::endl;
            return -1;
        }
        // Successful serialization is a success: exit with 0 (this used to return 1).
        return 0;
    } else if (mode == "-d") {
        std::ifstream file("se_resnet50.engine", std::ios::binary);
        if (!file.good()) {
            // Without this check a missing plan file led to deserializing a null buffer.
            std::cerr << "could not open plan file se_resnet50.engine, run with -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        return -1;
    }

    // Dummy input: every element set to 1.0
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference 10 times and print the wall-clock time of each run
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 10; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the raw output values, ten per line
    std::cout << "\nOutput:\n\n";
    for (int i = 0; i < OUTPUT_SIZE; i++)
    {
        std::cout << prob[i] << ", ";
        // Break after every tenth value (the old `i % 10 == 0` broke after the very first one).
        if ((i + 1) % 10 == 0) std::cout << std::endl;
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: shufflenetv2/CMakeLists.txt
================================================
# 3.17 is the real floor for this script: find_package(CUDAToolkit) was added in CMake 3.17,
# and the FindTensorRT.cmake included below declares the same minimum.
cmake_minimum_required(VERSION 3.17)

project(
  shufflenetv2
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default GPU architectures, used only when the caller did not choose any.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      60
      70
      72
      75
      80
      86
      89)
endif()

# C++17 / CUDA C++17, required (no silent fallback to an older standard).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Emit compile_commands.json for tooling.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

# Define the TensorRT::TensorRT target unless a parent project already provides it.
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
endif()

# Single-source executable: <project name>.cpp
add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cpp)

target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})

target_link_libraries(${PROJECT_NAME} PRIVATE Threads::Threads CUDA::cudart
                                              TensorRT::TensorRT ${OpenCV_LIBS})


================================================
FILE: shufflenetv2/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Keeps every candidate directory that exists and contains all entries of the
# `required_files` list, and stores the resulting list in <var_name> in the
# caller's scope. Stops configuration with FATAL_ERROR when no candidate
# qualifies. Each decision is reported at DEBUG log level.
function(_guess_path var_name required_files)
  set(_accepted "")

  foreach(candidate IN LISTS ARGN)
    if(EXISTS "${candidate}")
      # Look for the first required file that is missing from this candidate.
      set(_complete TRUE)
      foreach(needed IN LISTS required_files)
        if(NOT EXISTS "${candidate}/${needed}")
          set(_complete FALSE)
          message(DEBUG "'${candidate}' missing '${needed}'")
          break()
        endif()
      endforeach()

      if(_complete)
        list(APPEND _accepted "${candidate}")
        message(DEBUG "accept '${candidate}'")
      else()
        message(DEBUG "reject '${candidate}'")
      endif()
    else()
      message(DEBUG "skip non-existing path '${candidate}'")
    endif()
  endforeach()

  if(_accepted STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_accepted}"
      PARENT_SCOPE)
endfunction()

# Imported interface target that carries the TensorRT libraries and include directories.
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# All expansions below are quoted on purpose: an unset/empty TRT_VERSION (cache or environment)
# would otherwise expand to no argument at all and change the shape of the command.
if(NOT "${TRT_VERSION}" STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

# The environment variable, when set, takes precedence over the cache entry.
if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# Leading run of digits = major version; stays empty when TRT_VERSION is empty.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(
      FATAL_ERROR
        "TensorRT_DIR=${TensorRT_DIR} does not exist!"
    )
  endif()

  # Passing the variable name lets if() compare its value; an empty value is simply "not >= 10".
  if(TRT_MAJOR_VERSION GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  # Candidate install locations depend on the target architecture.
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include"
                                "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  set(_modules nvinfer nvinfer_plugin)
  if("${TRT_MAJOR_VERSION}" STREQUAL "")
    # Version unknown: link only the two base libraries instead of failing to configure.
    message(STATUS "TRT_VERSION not set, linking only: ${_modules}")
  elseif(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Resolve every selected module to a library path.
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

# Do not leak helper variables into the including scope.
unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)


================================================
FILE: shufflenetv2/README.md
================================================
# shufflenet v2

ShuffleNetV2 with 0.5x output channels, as described in: [ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design](https://arxiv.org/abs/1807.11164)

The following tricks are used in this demo:

- `torch.chunk` is used in ShuffleNet V2. We implemented `chunk(2, dim=C)` as a TensorRT plugin, which is the simplest plugin in this tensorrtx project, so it is a good place to learn the basic procedure for building a TensorRT plugin.
- A shuffle layer is used: the `channel_shuffle()` in `pytorchx/shufflenet` can be implemented with two shuffle layers in TensorRT.
- The BatchNorm layer is implemented with a scale layer.

## Usage

1. use `gen_wts.py` to generate wts file.

```bash
python3 gen_wts.py
```

2. build C++ code

```bash
pushd tensorrtx/shufflenetv2
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

3. serialize wts model to engine file.

```bash
./build/shufflenetv2 -s
```

4. run inference

```bash
./build/shufflenetv2 -i
```

The inference output looks like:

```bash
...
328us
-5.481, -0.1151, 4.004, -1.47, 1.007, -5.943, -2.311, 1.708, 1.569, 0.3112, 1.589, 0.1816, -2.253, -3.261, -3.269, -0.9116, -2.132, -1.159, -2.108, -0.3869, -4.653,
====
...
prediction result:
Top: 0 idx: 285, logits: 10.44, label: Egyptian cat
Top: 1 idx: 309, logits: 10.19, label: bee
Top: 2 idx: 94, logits: 9.399, label: hummingbird
```


================================================
FILE: shufflenetv2/gen_wts.py
================================================
import struct

import cv2
import numpy as np
import torch
from torchvision.models.shufflenetv2 import (
    shufflenet_v2_x0_5,
    shufflenet_v2_x1_0,
    shufflenet_v2_x1_5,
    shufflenet_v2_x2_0,
)


def read_imagenet_labels() -> dict[int, str]:
    """
    Read the ImageNet-1000 class-index-to-label mapping.

    Each non-empty line of the label file is expected to look like
    ``<index>: '<label>',``. A leading ``{`` / trailing ``}`` (as in a Python
    dict literal), a missing trailing comma or newline, and blank lines are
    tolerated.

    Returns:
        dict[int, str]: class index -> label text (surrounding quotes removed)

    Raises:
        OSError: if the label file cannot be opened
        ValueError: if a non-empty line is not of the form ``<int>: <label>``
    """
    clsid2label = {}
    with open("../assets/imagenet1000_clsidx_to_labels.txt", "r") as f:
        for line in f:
            # drop whitespace, dict braces and the trailing comma in that order
            entry = line.strip().strip("{}").rstrip(",").strip()
            if not entry:
                continue
            # split on the first ": " only, so labels containing ": " survive
            key, label = entry.split(": ", 1)
            label = label.strip()
            # remove one pair of matching quotes instead of slicing fixed offsets
            if len(label) >= 2 and label[0] == label[-1] and label[0] in "'\"":
                label = label[1:-1]
            # first occurrence of an index wins, as before
            clsid2label.setdefault(int(key), label)
    return clsid2label


def preprocess(img: np.ndarray) -> torch.Tensor:
    """
    Preprocess an image the way the ImageNet-pretrained models expect.

    Steps: BGR -> RGB, scale to [0, 1], resize to 224x224 (bilinear),
    normalize with the ImageNet per-channel mean/std, and reorder to `NCHW`
    with a leading batch axis of 1.

    Args:
        img (np.ndarray): input image in BGR channel order, as returned by `cv2.imread`

    Returns:
        torch.Tensor: preprocessed float32 image in `NCHW` layout

    Raises:
        ValueError: if `img` is None (`cv2.imread` returns None for unreadable files)
    """
    if img is None:
        # fail early with a clear message instead of an opaque OpenCV assertion
        raise ValueError("input image is None; cv2.imread returns None when the file cannot be read")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    img = (img - mean) / std
    # HWC -> CHW, then add the batch axis
    img = img.transpose(2, 0, 1)[None, ...]
    return torch.from_numpy(img)


if __name__ == "__main__":
    import os

    # Run each pretrained model once on a sample image (sanity check), then
    # export its state dict to a text `.wts` file.
    labels = read_imagenet_labels()
    image_path = "../assets/cats.jpg"
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising
        raise FileNotFoundError(f"cannot read image: {image_path}")
    img = preprocess(img)

    # NOTE: comment out the model you don't want
    models = [
        ("shufflenet_v2_x0_5", shufflenet_v2_x0_5(pretrained=True)),
        ("shufflenet_v2_x1_0", shufflenet_v2_x1_0(pretrained=True)),
        ("shufflenet_v2_x1_5", shufflenet_v2_x1_5(pretrained=True)),
        ("shufflenet_v2_x2_0", shufflenet_v2_x2_0(pretrained=True)),
    ]

    # the export below fails if the target directory is missing
    output_dir = "../models"
    os.makedirs(output_dir, exist_ok=True)

    for name, model in models:
        model.eval()
        with torch.inference_mode():
            output = model(img)
        print(f"{name} result:")
        for i, batch in enumerate(torch.topk(output, k=3).indices):
            for j, idx in enumerate(batch):
                print(f"\tBatch: {i}, Top: {j}, logits: {output[i][idx]:.4f}, label: {labels[int(idx)]}")
        print(f"{'=' * 32}")

        # .wts layout: first line is the entry count, then one line per tensor:
        #   <key> <element count> <big-endian float32 as hex> ...
        state = model.state_dict()
        with open(f"{output_dir}/{name}.wts", "w") as f:
            f.write("{}\n".format(len(state)))
            for k, v in state.items():
                print("key: ", k)
                print("value: ", v.shape)
                vr = v.reshape(-1).cpu().numpy()
                f.write("{} {}".format(k, len(vr)))
                f.write("".join(" " + struct.pack(">f", float(vv)).hex() for vv in vr))
                f.write("\n")


================================================
FILE: shufflenetv2/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntime.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, on sync, forwards its contents to an output stream, preceded by a
//!  timestamp and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted messages
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, buffered text is discarded instead of being written
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    // Carry over the prefix and the logging flag as well; previously they were left
    // uninitialized, so the moved-to buffer read an indeterminate mShouldLog.
    // Text still pending in `other` is not transferred; `other` emits it on destruction.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // pbase() is the start of the buffered output sequence and pptr() its current position;
        // if they differ there is unflushed text that still has to be emitted
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! \brief Emit "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the target stream and clear the buffer.
    void putOutput() {
        if (mShouldLog) {
            // Format the timestamp in a scratch stream: it has to go to the same stream as the
            // message (not unconditionally to std::cout), and the sticky '0' fill character must
            // not leak into the target stream's formatting state.
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            std::ostringstream stamp;
            stamp << std::setfill('0') << "[";
            stamp << std::setw(2) << 1 + tm_local->tm_mon << "/";
            stamp << std::setw(2) << tm_local->tm_mday << "/";
            stamp << std::setw(4) << 1900 + tm_local->tm_year << "-";
            stamp << std::setw(2) << tm_local->tm_hour << ":";
            stamp << std::setw(2) << tm_local->tm_min << ":";
            stamp << std::setw(2) << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            mOutput << stamp.str() << mPrefix << str();
            // flush the stream
            mOutput.flush();
        }
        // Always empty the buffer: suppressed text must neither accumulate nor show up later
        // when logging is re-enabled.
        str("");
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Helper base that owns the LogStreamConsumerBuffer, so that the buffer is constructed
//!  before the std::ostream base of LogStreamConsumer is handed its address.
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments unchanged to the owned buffer.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog} {}

   protected:
    LogStreamConsumerBuffer mBuffer;  //!< stream buffer shared with the derived std::ostream
};

//!
//! \class LogStreamConsumer
//! \brief Stream-style front end for logging: text inserted with operator<< is collected in the
//!  inherited LogStreamConsumerBuffer.
//!  The base-class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  the buffer member lives in LogStreamConsumerBase and must be fully constructed before its
//!  address is passed to std::ostream. Do not reorder the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  Messages are emitted only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a fresh buffer with the same severity and logging flag as \p other;
    //!  text already buffered in \p other is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! kINFO and anything numerically above it go to std::cout, the rest to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed tag; unknown values assert and yield "".
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Central logging facility for TensorRT tools and samples
//!
//! \details Gives tools and samples one place to print to the console. Two kinds of messages
//! are supported:
//!
//! - Debugging messages tagged with a severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! Routing all output through this class (rather than writing to stdout/stderr directly) keeps
//! verbosity control and output formatting in a single location.
//!
//! Possible future extensions: dumping test results to a file in a standard format
//! (for example, JUnit XML) and attaching metadata such as the duration of a test run.
//!
//! TODO: For backwards compatibility with existing samples, this class derives directly from
//! nvinfer1::ILogger. That is problematic because messages from the TensorRT library and
//! messages from the sample are not cleanly separated.
//!
//! Once all samples obtain the ILogger through Logger::getTRTLogger(), the inheritance can be
//! dropped and the nvinfer1::ILogger implementation made a member of the Logger object.

class Logger : public nvinfer1::ILogger {
   public:
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief State of a test as reported through this logger
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible accessor for the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger (currently the object itself)
    //!
    //! TODO Once every sample registers the logger with TensorRT through this method,
    //! the inheritance of Logger from ILogger can be removed
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Samples should not call this directly; it is expected to disappear together with the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer consumer(mReportableSeverity, severity);
        consumer << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Controls the verbosity of the logging output
    //!
    //! \param severity Messages are emitted when their severity value is <= this level.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle carrying the logging information of one test
    //!
    //! A sample obtains a TestAtom from Logger::defineTest() and passes it to
    //! Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom for use with Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Convenience overload of defineTest() that builds the command line from (argc, argv)
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // the RUNNING line is printed before the precondition is checked
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Report the test as passed; returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Report the test as failed; returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Report the test as waived; returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Report pass or fail depending on \p pass and return the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        if (pass) {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns the bracketed tag used to prefix a log message of the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    //!
    //! \brief returns the word used in a test result line for the given result
    //!
    static const char* testResultString(TestResult result) {
        const char* word = "";
        switch (result) {
            case TestResult::kRUNNING:
                word = "RUNNING";
                break;
            case TestResult::kPASSED:
                word = "PASSED";
                break;
            case TestResult::kFAILED:
                word = "FAILED";
                break;
            case TestResult::kWAIVED:
                word = "WAIVED";
                break;
            default:
                assert(0);
                break;
        }
        return word;
    }

    //!
    //! \brief returns the output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief prints one "&&&& <RESULT> <name> # <cmdline>" line for the given test
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief joins the (argc, argv) values into one space-separated command line string
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream joined;
        for (int idx = 0; idx < argc; ++idx) {
            joined << (idx > 0 ? " " : "") << argv[idx];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief creates a LogStreamConsumer for messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: shufflenetv2/macros.h
================================================
#pragma once
#include <NvInfer.h>

// API: symbol import/export decoration.
// With API_EXPORTS defined, symbols are exported (dllexport on MSVC, default visibility
// elsewhere); without it they are imported on MSVC and left undecorated elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_VERSION: the TensorRT version folded into one integer,
// major * 1000 + minor * 100 + patch * 10 + build (e.g. 8.5.1 build 0 -> 8510).
// NOTE: the fields are not range-limited, so a minor version >= 10 carries into the
// thousands (10.12.0 -> 11200); comparisons against this macro must use the same arithmetic.
#define TRT_VERSION \
    ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD)

// refuse to compile against anything older than TensorRT 7.2.2
#if TRT_VERSION < 7220
#error "TensorRT >= 7.2.2 is required for this demo."
#endif

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` for TensorRT >= 8 and to
// nothing for older versions (presumably to match the signatures of the TensorRT interfaces
// being overridden -- confirm against the TensorRT headers).
#if TRT_VERSION >= 8000
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: shufflenetv2/shufflenetv2.cpp
================================================
#include <NvInfer.h>
#include <chrono>
#include <cmath>
#include <iostream>
#include <map>
#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "logging.h"
#include "utils.h"

/**
 * @brief Hyper-parameters that distinguish the ShuffleNetV2 variants built by createEngine()
 */
struct ShuffleNetV2Params {
    // number of inverted-residual blocks in stage 2, 3 and 4 (indexed with `stage - 2`)
    std::array<int32_t, 3> repeat;
    // output channels of conv1 ([0]), stage 2..4 ([1]..[3]) and conv5 ([4])
    std::array<int32_t, 5> output_chn;
};

/**
 * @brief predefined model variants; pass one of them as the `param` argument of createEngine()
 *        (v2_x0_5 is its default)
 * @param v2_x0_5
 * @param v2_x1_0
 * @param v2_x1_5
 * @param v2_x2_0
 */
[[maybe_unused]] static constexpr ShuffleNetV2Params v2_x0_5 = {{4, 8, 4}, {24, 48, 96, 192, 1024}};
[[maybe_unused]] static constexpr ShuffleNetV2Params v2_x1_0 = {{4, 8, 4}, {24, 116, 232, 464, 1024}};
[[maybe_unused]] static constexpr ShuffleNetV2Params v2_x1_5 = {{4, 8, 4}, {24, 176, 352, 704, 1024}};
[[maybe_unused]] static constexpr ShuffleNetV2Params v2_x2_0 = {{4, 8, 4}, {24, 244, 488, 976, 2048}};

// builder workspace limit in bytes (16 << 20 = 16 MiB)
constexpr const std::size_t WORKSPACE_SIZE = 16 << 20;

// stuff we know about shufflenet-v2
constexpr const int64_t N = 1;  // batch size
constexpr const int32_t INPUT_H = 224;
constexpr const int32_t INPUT_W = 224;
// element count per sample of the input and the output tensor, in the same order as NAMES;
// doInference() multiplies these by the batch size and the element size to get buffer sizes
constexpr const std::array<int32_t, 2> SIZES = {3 * INPUT_H * INPUT_W, 1000};
// tensor names: NAMES[0] is the network input, NAMES[1] the output
constexpr const std::array<const char*, 2> NAMES = {"data", "logits"};
// when true (TRT_VERSION >= 8510) createEngine() declares a uint8 N x H x W x C input and
// prepends addTransformLayer(); otherwise the input has the requested data type in NCHW layout
static constexpr const bool TRT_PREPROCESS = TRT_VERSION >= 8510 ? true : false;
// per-channel normalization constants handed to addTransformLayer()
static constexpr const std::array<const float, 3> mean = {0.485f, 0.456f, 0.406f};
static constexpr const std::array<const float, 3> stdv = {0.229f, 0.224f, 0.225f};

// file locations, relative to the working directory
static constexpr const char* WTS_PATH = "../models/shufflenet_v2_x0_5.wts";
static constexpr const char* ENGINE_PATH = "../models/shufflenet.engine";
static constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt";

using namespace nvinfer1;
// weight name -> TensorRT weights
using WeightMap = std::map<std::string, Weights>;
using M = MatrixOperation;
using NDCF = nvinfer1::NetworkDefinitionCreationFlag;

// logger instance handed to TensorRT (see createInferBuilder in APIToModel)
static Logger gLogger;

/**
 * @brief print the name and dimensions of a layer's first output as "<name>:\t[d0, d1, ..., ]"
 *
 * @param l layer whose output 0 is inspected
 * @return Dims dimensions of that output
 */
Dims debug_shape(const ILayer* l) {
    ITensor* out = l->getOutput(0);
    const Dims shape = out->getDimensions();

    std::cout << out->getName() << ":\t[";
    for (int axis = 0; axis < shape.nbDims; ++axis) {
        std::cout << shape.d[axis] << ", ";
    }
    std::cout << "]\n";

    return shape;
}

/**
 * @brief emulate an inference-time BatchNorm2d with a per-channel scale layer
 *
 * Reads `<lname>.weight`, `<lname>.bias`, `<lname>.running_mean` and `<lname>.running_var`
 * from the weight map and folds them into
 *   scale = gamma / sqrt(var + eps),  shift = beta - mean * gamma / sqrt(var + eps).
 * The malloc'ed scale/shift arrays are stored back into the map under `<lname>.scale`,
 * `<lname>.shift` (plus an empty `<lname>.power`), so whoever releases the map releases them too.
 *
 * @param network network definition
 * @param weightMap weight map (read and extended)
 * @param input input tensor
 * @param lname weight-name prefix of the batch-norm layer
 * @param eps value added to the variance before the square root
 * @return ILayer* the scale layer
 */
ILayer* addBatchNorm2d(INetworkDefinition* network, WeightMap& weightMap, ITensor& input, const std::string& lname,
                       float eps = 1e-3f) {
    const auto* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const auto* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const auto* running_mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const auto* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    auto len = weightMap[lname + ".running_var"].count;
    std::cout << lname << " running_var len: " << len << "\n";

    auto* scval = static_cast<float*>(malloc(sizeof(float) * len));
    auto* shval = static_cast<float*>(malloc(sizeof(float) * len));
    // one pass fills both arrays; the expressions are kept exactly as in the two-loop form
    for (decltype(len) i = 0; i < len; ++i) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
        shval[i] = beta[i] - running_mean[i] * gamma[i] / sqrt(var[i] + eps);
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    static const Weights power{DataType::kFLOAT, nullptr, 0ll};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn_as_scale = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_as_scale);
    return bn_as_scale;
}

/**
 * @brief convolution (no bias) + batch norm, optionally followed by ReLU
 *
 * Layer/weight names are derived from consecutive indices: the convolution uses
 * `<lname>.<start_index>`, the batch norm `<lname>.<start_index + 1>` and the ReLU is named
 * `<lname>.<start_index + 2>.relu`.
 *
 * @param network network definition
 * @param m weight map
 * @param input input tensor
 * @param lname layer name prefix
 * @param ch output channels
 * @param k kernel size (k x k)
 * @param s stride
 * @param p padding
 * @param g groups
 * @param with_relu true to append a ReLU
 * @param start_index index of the convolution inside `lname`
 * @return ILayer* the ReLU layer if with_relu, otherwise the batch-norm (scale) layer
 */
ILayer* CBR(INetworkDefinition* network, WeightMap& m, ITensor& input, const std::string& lname, int ch, int k,
            int s = 1, int p = 0, int g = 1, bool with_relu = true, int start_index = 0) {
    static const Weights emptywts{DataType::kFLOAT, nullptr, 0ll};
    const std::string conv_name = lname + "." + std::to_string(start_index);
    const std::string bn_name = lname + "." + std::to_string(start_index + 1);

    auto* conv = network->addConvolutionNd(input, ch, DimsHW{k, k}, m[conv_name + ".weight"], emptywts);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});
    conv->setNbGroups(g);
    conv->setName(conv_name.c_str());

    auto* bn = addBatchNorm2d(network, m, *conv->getOutput(0), bn_name, 1e-5f);
    bn->setName((bn_name + ".bn").c_str());

    if (!with_relu) {
        return bn;
    }

    auto* relu = network->addActivation(*bn->getOutput(0), ActivationType::kRELU);
    const std::string relu_name = lname + "." + std::to_string(start_index + 2) + ".relu";
    assert(relu);
    relu->setName(relu_name.c_str());
    return relu;
}

/**
 * @brief inverted residual block of ShuffleNetV2, followed by a channel shuffle
 *
 * For stride 1 the input is split in half along the channel axis: the first half is passed
 * through unchanged and the second half goes through branch2. For stride > 1 the full input
 * feeds both branch1 and branch2. The two results are concatenated along axis 1 and shuffled
 * by reshaping to [N, 2, C/2, H, W], swapping axes 1 and 2, and reshaping back.
 *
 * @param net network definition
 * @param m weight map
 * @param input input tensor
 * @param lname layer name
 * @param inch input channels
 * @param outch output channels
 * @param s stride, must be in [1, 3]; the process is aborted otherwise
 * @return ILayer* the second shuffle layer
 */
ILayer* invertedRes(INetworkDefinition* net, WeightMap& m, ITensor& input, const std::string& lname, int inch,
                    int outch, int s) {
    if (s < 1 || s > 3) {
        std::cerr << "stride must be in [1, 3]\n";
        std::abort();
    }
    int32_t bf /* branch features */ = outch / 2;
    ITensor *x1{nullptr}, *x2{nullptr};

    if (s == 1) {
        // channel split: x1 = first half, x2 = second half
        auto d = input.getDimensions();
        Dims4 stride{1, 1, 1, 1};
        Dims4 half{d.d[0], d.d[1] / 2, d.d[2], d.d[3]};
        auto* s1 = net->addSlice(input, Dims4{0, 0, 0, 0}, half, stride);
        auto* s2 = net->addSlice(input, Dims4{0, d.d[1] / 2, 0, 0}, half, stride);
        assert(s1 && s2);
        debug_shape(s2);
        x1 = s1->getOutput(0);
        x2 = s2->getOutput(0);
    } else {
        // s is 2 or 3 here (validated above), so branch1 always exists; the former
        // "else x1 = &input" fallback could never be reached and has been removed
        auto* b1 = CBR(net, m, input, lname + ".branch1", inch, 3, s, 1, inch, false, 0);
        b1 = CBR(net, m, *b1->getOutput(0), lname + ".branch1", inch, 1, 1, 0, 1, true, 2);
        x1 = b1->getOutput(0);
        debug_shape(b1);
        x2 = &input;
    }

    // branch2: 1x1 conv + relu -> 3x3 depthwise conv (stride s) -> 1x1 conv + relu
    auto* b2 = CBR(net, m, *x2, lname + ".branch2", bf, 1, 1, 0, 1, true, 0);
    b2 = CBR(net, m, *b2->getOutput(0), lname + ".branch2", bf, 3, s, 1, bf, false, 3);
    b2 = CBR(net, m, *b2->getOutput(0), lname + ".branch2", bf, 1, 1, 0, 1, true, 5);
    debug_shape(b2);

    std::array<ITensor*, 2> cat_tensors = {x1, b2->getOutput(0)};
    auto* cat = net->addConcatenation(cat_tensors.data(), 2);
    auto cat_name = lname + ".cat";
    assert(cat);
    cat->setName(cat_name.c_str());
    cat->setAxis(1);
    static_cast<void>(debug_shape(cat));

    // channel shuffle, step 1: [N, C, H, W] -> [N, 2, C/2, H, W] -> swap axes 1 and 2
    auto* sf1 = net->addShuffle(*cat->getOutput(0));
    assert(sf1);
    sf1->setName((lname + ".shuffle.1").c_str());
    auto d = cat->getOutput(0)->getDimensions();
    auto dim_sf1 = Dims{5, {d.d[0], 2, d.d[1] / 2, d.d[2], d.d[3]}};
    sf1->setReshapeDimensions(dim_sf1);
    sf1->setSecondTranspose({0, 2, 1, 3, 4});

    // channel shuffle, step 2: flatten back to the original [N, C, H, W]
    auto* sf2 = net->addShuffle(*sf1->getOutput(0));
    assert(sf2);
    sf2->setName((lname + ".shuffle.2").c_str());
    sf2->setReshapeDimensions(d);

    return sf2;
}

/**
 * @brief Build the ShuffleNetV2 network from the weights in WTS_PATH and create an engine
 *
 * @param N max batch size
 * @param runtime runtime used to deserialize the built network (TensorRT >= 8 path)
 * @param builder builder
 * @param config config
 * @param dt data type of the network input; replaced by kUINT8 when TRT_PREPROCESS is enabled
 * @param param the type of model to be built
 * @return ICudaEngine* the engine, or nullptr if building/deserializing failed
 */
ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt,
                          ShuffleNetV2Params param = v2_x0_5) {
    WeightMap m = loadWeights(WTS_PATH);

#if TRT_VERSION >= 11200
    auto flag = 1U << static_cast<int>(NDCF::kSTRONGLY_TYPED);
#elif TRT_VERSION >= 10000
    auto flag = 0U;
#else
    auto flag = 1U << static_cast<int>(NDCF::kEXPLICIT_BATCH);
#endif
    auto* net = builder->createNetworkV2(flag);

    int32_t in_ch = 3;
    ITensor* input{nullptr};
    if constexpr (TRT_PREPROCESS) {
        // for simplicity, resize image on cpu side
        dt = DataType::kUINT8;
        input = net->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, in_ch});
        auto* trans = addTransformLayer(net, *input, true, mean, stdv);
        input = trans->getOutput(0);
    } else {
        input = net->addInput(NAMES[0], dt, Dims4{N, in_ch, INPUT_H, INPUT_W});
    }
    assert(input);

    /** conv1 and maxpool */
    auto* cbr1 = CBR(net, m, *input, "conv1", param.output_chn[0], 3, 2, 1);
    auto* pool1 = net->addPoolingNd(*cbr1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setPaddingNd(DimsHW{1, 1});
    debug_shape(pool1);

    /** stage 2, 3, 4 */
    ILayer* _layer = pool1;
    in_ch = param.output_chn[0];
    for (int stage = 2; stage < 5; ++stage) {
        int32_t out_ch = param.output_chn[stage - 1];
        std::string lname = "stage" + std::to_string(stage);
        std::cout << "================ " << lname << " ================\n";
        // the first block of each stage downsamples (stride 2), the remaining ones keep the size
        _layer = invertedRes(net, m, *_layer->getOutput(0), lname + ".0", in_ch, out_ch, 2);
        debug_shape(_layer);
        for (int j = 1; j < param.repeat[stage - 2]; ++j) {
            _layer = invertedRes(net, m, *_layer->getOutput(0), lname + "." + std::to_string(j), out_ch, out_ch, 1);
        }
        in_ch = out_ch;
    }

    /** conv5, mean and fully connected layer */
    const int32_t fc_in = param.output_chn[4];
    auto* conv5 = CBR(net, m, *_layer->getOutput(0), "conv5", fc_in, 1, 1, 0);
    // 0xc selects axes 2 and 3 (H and W): global average pooling down to [N, C]
    auto* mean = net->addReduce(*conv5->getOutput(0), ReduceOperation::kAVG, 0xc, false);
    mean->setName("global_pool(mean)");
    // The FC input width follows conv5's channel count: it is 2048 for v2_x2_0, so the
    // previously hard-coded 1024 did not match the weight count of that variant.
    auto* fcw = net->addConstant(DimsHW{1000, fc_in}, m["fc.weight"]);
    auto* fcb = net->addConstant(DimsHW{1, 1000}, m["fc.bias"]);
    auto* _fc = net->addMatrixMultiply(*mean->getOutput(0), M::kNONE, *fcw->getOutput(0), M::kTRANSPOSE);
    auto* fc = net->addElementWise(*_fc->getOutput(0), *fcb->getOutput(0), ElementWiseOperation::kSUM);
    fc->getOutput(0)->setName(NAMES[1]);
    debug_shape(fc);

    net->markOutput(*fc->getOutput(0));

#if TRT_VERSION >= 8000
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE);
    ICudaEngine* engine = nullptr;
    IHostMemory* serialized = builder->buildSerializedNetwork(*net, *config);
    if (serialized != nullptr) {
        engine = runtime->deserializeCudaEngine(serialized->data(), serialized->size());
        // the serialized blob is no longer needed once the engine exists (it used to leak)
        delete serialized;
    } else {
        std::cerr << "failed to build the serialized network\n";
    }
    delete net;
#else
    builder->setMaxBatchSize(N);
    config->setMaxWorkspaceSize(WORKSPACE_SIZE);
    ICudaEngine* engine = builder->buildEngineWithConfig(*net, *config);
    net->destroy();
#endif
    std::cout << "build finished\n";

    // Release host memory (loaded weights plus the scale/shift arrays added by addBatchNorm2d)
    for (auto& entry : m) {
        free((void*)(entry.second.values));
    }

    return engine;
}

/**
 * @brief Build the engine and hand back its serialized form
 *
 * Aborts the process with a message if the engine cannot be built.
 *
 * @param N max batch size
 * @param runtime runtime forwarded to createEngine()
 * @param modelStream receives the serialized engine; ownership passes to the caller
 */
void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(N, runtime, builder, config, DataType::kFLOAT);
    // An assert() alone disappears under NDEBUG and would let the null engine be
    // dereferenced below, so check at run time in every build type.
    if (engine == nullptr) {
        std::cerr << "failed to create the engine\n";
        std::abort();
    }

    // Serialize the engine
    (*modelStream) = engine->serialize();

#if TRT_VERSION >= 8000
    delete engine;
    delete config;
    delete builder;
#else
    engine->destroy();
    config->destroy();
    builder->destroy();
#endif
}

/**
 * @brief Run one inference pass and return every output tensor as a host-side float vector.
 *
 * Device buffers are allocated for every I/O tensor, the first one is filled from `input`
 * (host -> device), the network is enqueued on a private stream, and tensors 1..nIO-1 are
 * copied back. All device buffers and the stream are released before returning.
 *
 * @param context   execution context to run
 * @param input     host pointer to the data for I/O tensor 0; must hold at least
 *                  elementSize * batchSize * SIZES[0] bytes
 * @param batchSize batch multiplier applied to every SIZES[i]
 * @return one vector per output tensor (I/O index 1 and up), each with batchSize * SIZES[i] floats
 */
auto doInference(IExecutionContext& context, void* input, int64_t batchSize) -> std::vector<std::vector<float>> {
    ICudaEngine const& engine = context.getEngine();
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    std::vector<void*> buffers;

#if TRT_VERSION >= 8000
    const int32_t nIO = engine.getNbIOTensors();
#else
    const int32_t nIO = engine.getNbBindings();
#endif

    buffers.resize(nIO);
    for (int32_t i = 0; i < nIO; ++i) {
#if TRT_VERSION >= 8000
        auto* tensor_name = engine.getIOTensorName(i);
        const std::size_t elem = getSize(engine.getTensorDataType(tensor_name));
#else
        const int32_t idx = engine.getBindingIndex(NAMES[i]);
        // buffers[] is indexed by i, so the binding order must match NAMES.
        assert(idx == i);
        const std::size_t elem = getSize(engine.getBindingDataType(idx));
#endif
        const std::size_t size = elem * batchSize * SIZES[i];
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
#if TRT_VERSION >= 8000
        context.setTensorAddress(tensor_name, buffers[i]);
#endif
    }

    // The enqueue call must NOT live inside assert(): with NDEBUG the whole expression is
    // removed and inference would silently never run.
#if TRT_VERSION >= 8000
    const bool enqueued = context.enqueueV3(stream);
#else
    const bool enqueued = context.enqueueV2(buffers.data(), stream, nullptr);
#endif
    if (!enqueued) {
        std::cerr << "failed to enqueue inference\n";
        std::abort();
    }

    std::vector<std::vector<float>> prob;
    prob.reserve(nIO > 1 ? static_cast<std::size_t>(nIO - 1) : 0U);
    for (int32_t i = 1; i < nIO; ++i) {
        // Copy straight into the vector that is returned, so the destination of the
        // asynchronous copy stays alive until the stream has been synchronized.
        const std::size_t count = static_cast<std::size_t>(batchSize * SIZES[i]);
        prob.emplace_back(count, std::nanf(""));
        std::vector<float>& host = prob.back();
        CHECK(cudaMemcpyAsync(host.data(), buffers[i], count * sizeof(float), cudaMemcpyDeviceToHost, stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    for (auto& buffer : buffers) {
        CHECK(cudaFree(buffer));
    }
    return prob;
}

int main(int argc, char** argv) {
    checkTrtEnv();
    if (argc != 2) {
        std::cerr << "arguments not right!\n";
        std::cerr << "./shufflenet -s   // serialize model to plan file\n";
        std::cerr << "./shufflenet -d   // deserialize plan file and run inference\n";
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    char* trtModelStream{nullptr};
    std::streamsize size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, runtime, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc);
        if (!p) {
            std::cerr << "could not open plan output file\n";
            return -1;
        }
        if (modelStream->size() > static_cast<std::size_t>(std::numeric_limits<std::streamsize>::max())) {
            std::cerr << "this model is too large to serialize\n";
            return -1;
        }
        const auto* data_ptr = reinterpret_cast<const char*>(modelStream->data());
        auto data_size = static_cast<std::streamsize>(modelStream->size());
        p.write(data_ptr, data_size);
#if TRT_VERSION >= 8000
        delete modelStream;
#else
        modelStream->destroy();
#endif
        return 0;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file(ENGINE_PATH, std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        return -1;
    }

#if TRT_VERSION >= 8000
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
#else
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
#endif
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference
    void* input = nullptr;
    std::vector<float> flat_img;
    cv::Mat img;
    if constexpr (TRT_PREPROCESS) {
        // for simplicity, resize image on cpu side
        img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR);
        cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        input = static_cast<void*>(img.data);
    } else {
        img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR);
        flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W);
        input = flat_img.data();
    }
    for (int i = 0; i < 100; ++i) {
        auto start = std::chrono::system_clock::now();
        auto prob = doInference(*context, input, N);
        auto end = std::chrono::system_clock::now();
        auto period = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        std::cout << period.count() << "us\n";

        for (auto& vector : prob) {
            int idx = 0;
            for (auto& v : vector) {
                std::cout << std::setprecision(4) << v << ", " << std::flush;
                if (++idx > 20) {
                    std::cout << "\n====\n";
                    break;
                }
            }
        }

        if (i == 99) {
            std::cout << "prediction result:\n";
            auto labels = loadImagenetLabelMap(LABELS_PATH);
            int _top = 0;
            for (auto& [idx, logits] : topk(prob[0], 3)) {
                std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits
                          << ", label: " << labels[idx] << "\n";
            }
        }
    }
#if TRT_VERSION >= 8000
    delete context;
    delete engine;
    delete runtime;
#else
    context->destroy();
    engine->destroy();
    runtime->destroy();
#endif

    return 0;
}


================================================
FILE: shufflenetv2/utils.h
================================================
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <numeric>
#include <opencv2/opencv.hpp>
#include <stdexcept>
#include <string>
#include <vector>
#include "macros.h"

using namespace nvinfer1;

// Evaluate a CUDA runtime call exactly once; for any result other than cudaSuccess,
// print the numeric error code to stderr and abort the process.
#define CHECK(status)                                 \
    do {                                              \
        const auto ret = (status);                    \
        if (ret == cudaSuccess) {                     \
            break;                                    \
        }                                             \
        std::cerr << "Cuda failure: " << ret << "\n"; \
        std::abort();                                 \
    } while (0)

static void checkTrtEnv(int device = 0) {
#if TRT_VERSION < 8000
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    CHECK(cudaGetDeviceProperties(&prop, device));
    const int sm = prop.major * 10 + prop.minor;
    if (sm > 86) {
        std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU.";
        std::abort();
    }
#endif
}

/**
 * @brief TensorRT weight files have a simple space delimited format:
 * [type] [size] <data x size in hex>
 * 
 * @param file input weight file path
 * @return std::map<std::string, nvinfer1::Weights> 
 */
static std::map<std::string, nvinfer1::Weights> loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << "\n";
    std::map<std::string, nvinfer1::Weights> weightMap;

    // Open weights file. Checked explicitly (not via assert) so release builds fail loudly too.
    std::ifstream input(file);
    if (!input.is_open()) {
        std::cerr << "Unable to load weight file.\n";
        std::abort();
    }

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    if (!input || count <= 0) {
        std::cerr << "Invalid weight map file.\n";
        std::abort();
    }

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

        // Read name and element count of the blob (the count is decimal, the payload is hex).
        std::string name;
        input >> name >> std::dec >> wt.count;
        if (!input || wt.count < 0) {
            std::cerr << "Invalid weight map file.\n";
            std::abort();
        }

        // Allocate with malloc: the engine-building code releases these buffers with free(),
        // and pairing free() with new[] is undefined behaviour.
        const auto n = static_cast<std::size_t>(wt.count);
        auto* val = static_cast<uint32_t*>(std::malloc(sizeof(uint32_t) * n));
        if (val == nullptr && n != 0) {
            std::cerr << "Out of memory while loading weights.\n";
            std::abort();
        }

        // Load blob: each value is the raw 32-bit pattern of a float written in hex.
        input >> std::hex;
        for (std::size_t x = 0; x < n; ++x) {
            input >> val[x];
        }
        if (!input) {
            std::cerr << "Invalid weight map file.\n";
            std::abort();
        }
        wt.values = val;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * @brief a preprocess function aligning with ImageNet preprocess in torchvision, only support 3-channel image
 * 
 * @param img opencv image with BGR layout
 * @param bgr2rgb whether to convert BGR to RGB
 * @param mean subtract mean
 * @param std divide std
 * @param n batch size
 * @param h resize height
 * @param w resize width
 * @return std::vector<float> contiguous flatten image data in float32 type
 */
static std::vector<float> preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array<const float, 3>& mean,
                                         const std::array<const float, 3>& std, int n, int h, int w) {
    const int channels = img.channels();
    if (channels != 3) {
        std::cerr << "this demo only supports 3 channel input image.\n";
        std::abort();
    }
    const int plane = h * w;                  // elements in one channel plane
    const int image_elems = channels * plane; // elements in one CHW image

    // Note: every step below rewrites the caller's `img` in place.
    if (bgr2rgb) {
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    }
    cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
    img.convertTo(img, CV_32FC3, 1.f / 255);
    img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]);

    std::vector<float> chw(static_cast<std::size_t>(n) * channels * h * w, 0.f);

    // De-interleave HWC pixels into CHW planes; every batch slot gets the same image.
    for (int b = 0; b < n; ++b) {
        const int base = b * image_elems;
        for (int y = 0; y < h; ++y) {
            const cv::Vec3f* row = img.ptr<cv::Vec3f>(y);
            for (int x = 0; x < w; ++x) {
                const int pixel = y * w + x;
                for (int ch = 0; ch < 3; ++ch) {
                    chw[base + ch * plane + pixel] = row[x][ch];
                }
            }
        }
    }
    return chw;
}

/**
 * @brief Return the k largest values of `v` together with their indices, largest first.
 *
 * @param v input scores
 * @param k number of entries wanted; clamped to v.size()
 * @return (index, value) pairs in descending value order; empty when k <= 0 or v is empty
 */
static auto topk(const std::vector<float>& v, int k) -> std::vector<std::pair<int, float>> {
    if (k <= 0 || v.empty())
        return {};
    // Clamp first: using the raw k as an iterator offset is out of range when k > v.size().
    const auto stride = std::min<std::ptrdiff_t>(k, static_cast<std::ptrdiff_t>(v.size()));

    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);

    // Only the first `stride` positions need to be ordered.
    std::partial_sort(idx.begin(), idx.begin() + stride, idx.end(), [&](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(static_cast<std::size_t>(stride));
    for (std::ptrdiff_t i = 0; i < stride; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}

/**
 * @brief Parse a label file whose lines look like `<index>: '<label>',` into an index -> label map.
 *
 * The label may be wrapped in single or double quotes (double quotes are what appear when the
 * label itself contains an apostrophe). Lines without a colon, without a quoted label, or with
 * a non-numeric index are skipped.
 *
 * @param path label file path
 * @return parsed map; empty when the file cannot be opened
 */
static std::map<int, std::string> loadImagenetLabelMap(const std::string& path) {
    std::map<int, std::string> labels;
    std::ifstream in(path);
    if (!in.is_open()) {
        return labels;
    }
    std::string line;
    while (std::getline(in, line)) {
        const auto colon = line.find(':');
        if (colon == std::string::npos) {
            continue;
        }
        // The opening quote decides which character closes the label.
        const auto open_quote = line.find_first_of("'\"", colon);
        if (open_quote == std::string::npos) {
            continue;
        }
        const auto close_quote = line.find(line[open_quote], open_quote + 1);
        if (close_quote == std::string::npos) {
            continue;
        }
        int idx = 0;
        try {
            idx = std::stoi(line.substr(0, colon));
        } catch (const std::logic_error&) {
            // std::invalid_argument / std::out_of_range: not a usable index, skip the line.
            continue;
        }
        labels[idx] = line.substr(open_quote + 1, close_quote - open_quote - 1);
    }
    return labels;
}

/**
 * @brief Append in-network preprocessing layers: optional cast to FP32, NHWC -> NCHW shuffle,
 * optional channel reversal, and a per-channel scale layer built from `mean` / `std`.
 *
 * The scale layer receives shift = -mean/std and scale = 1/(std*255) per channel.
 * NOTE(review): with TensorRT's scale layer computing input*scale + shift this equals
 * (x/255 - mean)/std - confirm against the IScaleLayer documentation.
 *
 * @param network network the layers are added to
 * @param input   input tensor, expected in NHWC order (it is permuted with {0, 3, 1, 2})
 * @param bgr2rgb when true, channels are re-concatenated in reverse order (2, 1, 0)
 * @param mean    per-channel mean
 * @param std     per-channel standard deviation
 * @return the final scale layer
 */
static ILayer* addTransformLayer(INetworkDefinition* network, ITensor& input, bool bgr2rgb,
                                 const std::array<const float, 3>& mean, const std::array<const float, 3>& std) {
    struct ScaleParams {
        std::array<float, 3> shift;
        std::array<float, 3> scale;
    };
    // The weight arrays are parked in a function-local static so the pointers handed to
    // TensorRT stay valid after this function returns.
    // NOTE(review): presumably needed because the weights are read when the engine is built.
    static std::vector<std::unique_ptr<ScaleParams>> gScaleParams;
    auto params = std::make_unique<ScaleParams>();
    params->shift = {-mean[0] / std[0], -mean[1] / std[1], -mean[2] / std[2]};
    params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)};

    static const Weights empty{DataType::kFLOAT, nullptr, 0ll};
    const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll};
    const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll};

    // Moving the unique_ptr does not move the pointed-to arrays, so shift/scale stay valid.
    gScaleParams.emplace_back(std::move(params));

    ITensor* in = &input;
    if (input.getType() != DataType::kFLOAT) {
#if TRT_VERSION >= 8000
        auto* cast = network->addCast(input, DataType::kFLOAT);
        assert(cast);
        cast->setName("Cast to FP32");
        in = cast->getOutput(0);
#else
        auto* identity = network->addIdentity(input);
        assert(identity);
        identity->setName("Convert to FP32");
        identity->setOutputType(0, DataType::kFLOAT);
        in = identity->getOutput(0);
#endif
    }
    // Convert from NHWC to NCHW
    auto* perm = network->addShuffle(*in);
    assert(perm);
    perm->setName("NHWC -> NCHW");
    perm->setFirstTranspose(Permutation{0, 3, 1, 2});

    // Convert from BGR to RGB (optional)
    ITensor* data{nullptr};
    if (bgr2rgb) {
        // Slice out one channel plane of the permuted tensor.
        auto add_slice = [&](int c, const char* name) -> ITensor* {
            auto dims = perm->getOutput(0)->getDimensions();
            Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1};
            Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]};
            auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride);
            // Validate before the first dereference (the check used to come after setName()).
            assert(_slice && _slice->getNbOutputs() == 1);
            _slice->setName(name);
            return _slice->getOutput(0);
        };
        std::array<ITensor*, 3> channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")};
        auto* cat = network->addConcatenation(channels.data(), 3);
        assert(cat);
        cat->setName("RGB");
        cat->setAxis(1);
        data = cat->getOutput(0);
    } else {
        data = perm->getOutput(0);
    }

    // Normalize
    auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty);
    assert(trans);
    trans->setName("mean & std");
#if TRT_VERSION >= 8000
    trans->setChannelAxis(1);
#endif
    return trans;
}

// Byte width of one element of the given TensorRT data type.
// Any type not listed below prints an error and aborts the process.
static size_t getSize(DataType dt) {
    size_t bytes = 0;
    switch (dt) {
#if TRT_VERSION >= 8510
        case DataType::kUINT8:
#endif
        case DataType::kINT8:
            bytes = sizeof(int8_t);
            break;
        case DataType::kFLOAT:
            bytes = sizeof(float);
            break;
        case DataType::kHALF:
            // half precision is stored in 16 bits
            bytes = sizeof(int16_t);
            break;
        case DataType::kINT32:
            bytes = sizeof(int32_t);
            break;
        default:
            std::cerr << "Unsupported data type\n";
            std::abort();
    }
    return bytes;
}


================================================
FILE: squeezenet/CMakeLists.txt
================================================
# find_package(CUDAToolkit) below needs CMake 3.17, and the bundled FindTensorRT.cmake
# declares the same minimum, so the project must not advertise anything older.
cmake_minimum_required(VERSION 3.17)

project(
  squeezenet
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default GPU architectures; override with -DCMAKE_CUDA_ARCHITECTURES=... on the command line.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      60
      70
      72
      75
      80
      86
      89)
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

# Reuse the TensorRT::TensorRT target when a parent project has already created it.
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

# The executable is built from a single source file named after the project.
add_executable(${PROJECT_NAME} "${PROJECT_NAME}.cpp")
target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart
                                             TensorRT::TensorRT ${OpenCV_LIBS})


================================================
FILE: squeezenet/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# TRT_VERSION is seeded from the environment variable of the same name and stored as a
# cache entry. Only its leading number (the major version) is examined further down.
set(TRT_VERSION
    $ENV{TRT_VERSION}
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", etc")

# _guess_path(<out-var> <required-files> <candidate-dir>...)
#
# Keeps every candidate directory that exists and contains all of <required-files>
# (a ;-separated list), and stores the surviving directories in <out-var> in the
# caller's scope. Stops the configure step with FATAL_ERROR when none qualifies.
function(_guess_path var_name required_files)
  set(_accepted "")

  foreach(_candidate IN LISTS ARGN)
    if(NOT EXISTS "${_candidate}")
      message(DEBUG "skip non-existing path '${_candidate}'")
      continue()
    endif()

    # Look for the first required file that is absent from this directory.
    set(_complete TRUE)
    foreach(_needed IN LISTS required_files)
      if(NOT EXISTS "${_candidate}/${_needed}")
        set(_complete FALSE)
        message(DEBUG "'${_candidate}' missing '${_needed}'")
        break()
      endif()
    endforeach()

    if(NOT _complete)
      message(DEBUG "reject '${_candidate}'")
      continue()
    endif()

    list(APPEND _accepted "${_candidate}")
    message(DEBUG "accept '${_candidate}'")
  endforeach()

  if(_accepted STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_accepted}"
      PARENT_SCOPE)
endfunction()

# find TensorRT include folder
# Skipped when the caller already defined TensorRT_INCLUDE_DIR. Candidates are probed for
# NvInfer.h; every matching directory is kept, so the result may be a list.
if(NOT DEFINED TensorRT_INCLUDE_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    _guess_path(
      TensorRT_INCLUDE_DIR "NvInfer.h" "/usr/include/aarch64-linux-gnu"
      "/usr/include" "/usr/local/cuda/targets/aarch64-linux/include")
  else()
    _guess_path(
      TensorRT_INCLUDE_DIR "NvInfer.h"
      "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
      "/usr/include/x86_64-linux-gnu" "/usr/include")
  endif()
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# find TensorRT library folder
# Note the guard differs from the include lookup above: this one tests truthiness
# (NOT TensorRT_LIBRARY_DIR) rather than NOT DEFINED. A directory qualifies only when it
# holds both libnvinfer.so and libnvinfer_plugin.so.
if(NOT TensorRT_LIBRARY_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    _guess_path(
      TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
      "/usr/lib/aarch64-linux-gnu;/usr/lib/aarch64-linux-gnu/tegra" "/usr/lib")
  else()
    _guess_path(
      TensorRT_LIBRARY_DIR
      "libnvinfer.so;libnvinfer_plugin.so"
      "/usr/lib/x86_64-linux-gnu;/usr/local/tensorrt/targets/x86_64-linux-gnu/lib;/usr/lib"
    )
  endif()
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
endif()

# Start from an empty list; it is filled by the find_library() loop below.
set(TensorRT_LIBRARIES)

# process for different TensorRT version
# Only the first run of digits in TRT_VERSION (the major version) is used.
if(DEFINED TRT_VERSION AND NOT TRT_VERSION STREQUAL "")
  string(REGEX MATCH "([0-9]+)" _match ${TRT_VERSION})
  set(TRT_MAJOR_VERSION "${_match}")
  set(_modules nvinfer nvinfer_plugin)
  unset(_match)

  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()
else()
  message(FATAL_ERROR "Please set a environment variable \"TRT_VERSION\"")
endif()

# find and add all modules of TensorRT into list
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

# Report only after the loop: printing this right after the empty set() above
# always produced an empty list.
message(STATUS "Found TensorRT lib: ${TensorRT_LIBRARIES}")

# make the "TensorRT target"
# An imported INTERFACE library carries the link libraries and include directories;
# consumers link against the namespaced alias TensorRT::TensorRT.
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)
target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

# Drop the helper variables so they do not leak into the including project.
unset(TRT_MAJOR_VERSION)
unset(_modules)


================================================
FILE: squeezenet/README.md
================================================
# squeezenet v1.1

SqueezeNet 1.1 model from the official SqueezeNet repo
<https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1>

SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters
than SqueezeNet 1.0, without sacrificing accuracy.

For the PyTorch implementation, refer to [pytorchx/squeezenet](https://github.com/wang-xinyu/pytorchx/tree/master/squeezenet).

## Usage

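1. Use `gen_wts.py` to generate the `.wts` weight file. The script uses paths relative to the current directory: it reads `../assets/cats.jpg` and writes `../models/squeezenet.wts`.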

```bash
python3 gen_wts.py
```

2. build C++ code

```bash
pushd tensorrtx/squeezenet
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

3. Serialize the `.wts` model to an engine file

```bash
./build/squeezenet -s
```

4. run inference

```bash
./build/squeezenet -d
```

output looks like:

```bash
...
====
Execution time: 183us
3.481, 3.901, 4.438, 4.346, 3.3, 6.519, 6.03, 10.89, 10.45, 10.39, 8.874, 5.889, 9.529, 3.703, 5.865, 6.982, 8.894, 7.76, 4.599, 7.89, 4.795,
====
prediction result:
Top: 0 idx: 281, logits: 25.18, label: tabby, tabby cat
Top: 1 idx: 282, logits: 23.2, label: tiger cat
Top: 2 idx: 309, logits: 22.72, label: bee
```


================================================
FILE: squeezenet/gen_wts.py
================================================
import struct

import cv2
import numpy as np
import torch
import torchvision


def read_imagenet_labels() -> dict[int, str]:
    """
    Read the ImageNet-1000 class labels from
    ``../assets/imagenet1000_clsidx_to_labels.txt`` (relative to the current directory).

    Each non-blank line is expected to look like ``<index>: '<label>',``. The label may be
    wrapped in single or double quotes, may itself contain ``": "``, and the trailing comma
    or newline is optional. Blank lines are ignored. When an index appears more than once,
    the first occurrence is kept.

    Returns:
        dict[int, str]: mapping from class index to label text

    Raises:
        ValueError: if the part before the first ``": "`` is not an integer
    """
    clsid2label = {}
    with open("../assets/imagenet1000_clsidx_to_labels.txt", "r") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            # Split on the first separator only, so labels containing ": " stay intact.
            key, _, value = line.partition(": ")
            # Drop list punctuation after the closing quote; also tolerates the braces of
            # a dict-literal style file on its first and last line.
            value = value.rstrip(",} ").strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
                value = value[1:-1]
            clsid2label.setdefault(int(key.lstrip("{ ")), value)
    return clsid2label


def preprocess(img: np.ndarray) -> torch.Tensor:
    """
    Preprocess an image the way the C++ side does for ImageNet models: BGR -> RGB,
    scale to [0, 1], resize to 224x224, normalize per channel, and reorder to NCHW.

    Args:
        img (np.ndarray): input image in BGR channel order, as returned by ``cv2.imread``

    Returns:
        torch.Tensor: preprocessed image in `NCHW` layout with a leading batch axis of 1
    """
    # Convert to RGB and to float32 in [0, 1] before resizing.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
    # Per-channel mean / std applied in RGB order.
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    img = (img - mean) / std
    # HWC -> CHW, then prepend the batch axis.
    img = img.transpose(2, 0, 1)[None, ...]
    return torch.from_numpy(img)


def main():
    """
    Run the pretrained torchvision SqueezeNet 1.1 on ``../assets/cats.jpg``, print its
    top-3 predictions, and dump every ``state_dict`` tensor to ``../models/squeezenet.wts``.

    File format: the first line is the number of tensors; every following line is
    ``<name> <element count>`` followed by each value as the big-endian hex encoding of
    its float32 bit pattern.

    Raises:
        FileNotFoundError: if the sample image cannot be read
    """
    import os

    labels = read_imagenet_labels()

    # NOTE(review): `pretrained=True` is the legacy torchvision argument; newer releases
    # prefer `weights=...`. Kept as-is so older torchvision versions keep working.
    model = torchvision.models.squeezenet1_1(pretrained=True)
    model = model.eval()

    img_path = "../assets/cats.jpg"
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"could not read image: {img_path}")
    img = preprocess(img)

    with torch.inference_mode():
        output = model(img)
        for i, batch in enumerate(torch.topk(output, k=3).indices):
            for j, idx in enumerate(batch):
                print(f"\tBatch: {i}, Top: {j}, logits: {output[i][idx]:.4f}, label: {labels[int(idx)]}")
        print(f"{'=' * 32}")

    wts_path = "../models/squeezenet.wts"
    # The output directory may not exist in a fresh checkout.
    os.makedirs(os.path.dirname(wts_path), exist_ok=True)

    state = model.state_dict()
    # The `with` block closes the file; no explicit close() is needed.
    with open(wts_path, "w") as f:
        f.write("{}\n".format(len(state)))
        for k, v in state.items():
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {} ".format(k, len(vr)))
            print(k, v.shape)
            for vv in vr:
                f.write(" ")
                f.write(struct.pack(">f", float(vv)).hex())
            f.write("\n")


if __name__ == "__main__":
    main()


================================================
FILE: squeezenet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntime.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized or destroyed, writes its contents to a target
//!  stream as "[MM/DD/YYYY-HH:MM:SS] <prefix><text>", provided logging is enabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    // Carry over the prefix and the logging flag as well; initializing only mOutput left
    // mPrefix empty and mShouldLog uninitialized. Text still buffered in `other` stays
    // there and is emitted by other's destructor.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    void putOutput() {
        if (mShouldLog) {
            // Build the timestamp in a local buffer: this sends it to the same stream as the
            // message and leaves the stream's fill character and width untouched.
            char stamp[32] = "";
            const std::time_t timestamp = std::time(nullptr);
            if (const std::tm* tm_local = std::localtime(&timestamp)) {
                std::strftime(stamp, sizeof(stamp), "[%m/%d/%Y-%H:%M:%S] ", tm_local);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << stamp << mPrefix << str();
            // flush the stream
            mOutput.flush();
        }
        // set the buffer to empty, also when logging is disabled, so suppressed messages
        // do not accumulate in memory
        str("");
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
   public:
    // Forward all three arguments unchanged to the owned buffer.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog} {}

   protected:
    // Owned by this base so it is constructed before any base class listed after it.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer that emits messages of the given severity.
    //!  A message is emitted only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a fresh buffer from the source's severity and logging flag.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer logs, and pushes the result to the buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    // Severities comparing >= kINFO go to stdout, all others to stderr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    // Short bracketed tag placed in front of every message.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Construct a logger that reports messages of the given severity or higher (default: warnings).
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity Severity of the message; compared against the reportable severity by LogStreamConsumer.
    //! \param msg      NUL-terminated message text. A null pointer is logged as an empty message instead of
    //!                 being passed to the std::string constructor (which would be undefined behaviour).
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        const char* text = (msg != nullptr) ? msg : "";
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(text) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;         //!< Set once reportTestStart() has been called for this atom
        std::string mName;     //!< Test name printed in result lines
        std::string mCmdline;  //!< Command line printed after the name in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // Check the precondition before emitting anything, so a double start does not
        // print a misleading second "RUNNING" line ahead of the assertion failure.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed.
    //! \return EXIT_SUCCESS, so the call can be used directly as a process exit code
    //!
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed.
    //! \return EXIT_FAILURE, so the call can be used directly as a process exit code
    //!
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived.
    //! \return EXIT_SUCCESS (a waived test is not treated as a failure)
    //!
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass.
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Retrieve the minimum severity this logger currently reports.
    //!
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits a single line of the form "&&&& <RESULT> <name> # <cmdline>".
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  //!< Minimum severity that is emitted
};

namespace {

//!
//! \brief Build a LogStreamConsumer that emits at kVERBOSE severity, filtered by the logger's threshold.
//!
//! Usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Build a LogStreamConsumer that emits at kINFO severity, filtered by the logger's threshold.
//!
//! Usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Build a LogStreamConsumer that emits at kWARNING severity, filtered by the logger's threshold.
//!
//! Usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Build a LogStreamConsumer that emits at kERROR severity, filtered by the logger's threshold.
//!
//! Usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Build a LogStreamConsumer that emits at kINTERNAL_ERROR ("fatal") severity, filtered by the
//!        logger's threshold.
//!
//! Usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: squeezenet/macros.h
================================================
#pragma once

// API: symbol visibility/linkage decoration.
//  - API_EXPORTS defined: export the symbol (__declspec(dllexport) on MSVC, default visibility elsewhere).
//  - API_EXPORTS not defined: import on MSVC (__declspec(dllimport)), no decoration elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_VERSION: a single integer combining the TensorRT version components as
// major * 1000 + minor * 100 + patch * 10 + build, so it can be compared in #if expressions.
// This header does not define the NV_TENSORRT_* macros itself; they must already be defined when
// TRT_VERSION is evaluated (an undefined macro evaluates to 0 inside #if).
// NOTE(review): presumably they come from a TensorRT header such as NvInfer.h being included first - confirm.
// NOTE(review): the components overlap once minor, patch or build reach 10 or more, so this is not a strict
// positional encoding - keep that in mind when adding new thresholds.
#define TRT_VERSION \
    ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD)

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` when TRT_VERSION >= 8000 and to nothing
// otherwise, so one override declaration can be written for both cases.
#if TRT_VERSION >= 8000
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: squeezenet/squeezenet.cpp
================================================
#include <NvInfer.h>
#include <cassert>
#include <chrono>
#include <cmath>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "logging.h"
#include "utils.h"

// stuff we know about squeezenet
static constexpr const int N = 1;  // batch size used for both engine building and inference
static constexpr const int INPUT_H = 224;
static constexpr const int INPUT_W = 224;
// Per-sample element counts of the {input, output} tensors. doInference() multiplies these by the
// batch size, so the batch size must NOT be folded in here (the output used to be N * 1000, which
// over-sized the output buffer to N * N * 1000 elements for N > 1).
static constexpr const int SIZES[] = {3 * INPUT_H * INPUT_W, 1000};
// Tensor names of the {input, output}, in the same order as SIZES.
static constexpr const char* NAMES[] = {"data", "prob"};
// When true, colour conversion / layout change / normalisation are built into the network
// (see addTransformLayer) and the host only resizes the image.
static constexpr const bool TRT_PREPROCESS = TRT_VERSION >= 8510;
// Per-channel normalisation constants, given in RGB order.
static constexpr const float mean[3] = {0.485f, 0.456f, 0.406f};
static constexpr const float stdv[3] = {0.229f, 0.224f, 0.225f};

static constexpr const char* WTS_PATH = "../models/squeezenet.wts";
static constexpr const char* ENGINE_PATH = "../models/squeezenet.engine";
static constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt";

using namespace nvinfer1;
using WeightMap = std::map<std::string, Weights>;

static Logger gLogger;

// Append one "fire" module to the network: a 1x1 squeeze convolution + ReLU whose output feeds two
// parallel expand branches (1x1 and padded 3x3 convolutions, each followed by ReLU); the two branch
// outputs are concatenated. Weights are looked up in `m` under the keys
// `lname + {"squeeze","expand1x1","expand3x3"} + {".weight",".bias"}`.
// Returns the concatenation layer.
ILayer* fire(INetworkDefinition* network, WeightMap& m, ITensor& input, const std::string& lname,
             int32_t squeeze_planes, int32_t e1x1_planes, int32_t e3x3_planes) {
    // Squeeze stage.
    const std::string squeezeKey = lname + "squeeze";
    auto* squeezeConv = network->addConvolutionNd(input, squeeze_planes, DimsHW{1, 1}, m[squeezeKey + ".weight"],
                                                  m[squeezeKey + ".bias"]);
    assert(squeezeConv);
    ITensor* squeezed = network->addActivation(*squeezeConv->getOutput(0), ActivationType::kRELU)->getOutput(0);

    // 1x1 expand branch.
    const std::string expand1Key = lname + "expand1x1";
    auto* expand1Conv = network->addConvolutionNd(*squeezed, e1x1_planes, DimsHW{1, 1}, m[expand1Key + ".weight"],
                                                  m[expand1Key + ".bias"]);
    assert(expand1Conv);
    auto* expand1Relu = network->addActivation(*expand1Conv->getOutput(0), ActivationType::kRELU);
    assert(expand1Relu);

    // 3x3 expand branch; padding of 1 is set on the convolution.
    const std::string expand3Key = lname + "expand3x3";
    auto* expand3Conv = network->addConvolutionNd(*squeezed, e3x3_planes, DimsHW{3, 3}, m[expand3Key + ".weight"],
                                                  m[expand3Key + ".bias"]);
    assert(expand3Conv);
    expand3Conv->setPaddingNd(DimsHW{1, 1});
    auto* expand3Relu = network->addActivation(*expand3Conv->getOutput(0), ActivationType::kRELU);
    assert(expand3Relu);

    // Join both branches.
    ITensor* branches[] = {expand1Relu->getOutput(0), expand3Relu->getOutput(0)};
    auto* joined = network->addConcatenation(branches, 2);
    assert(joined);
    return joined;
}

// Create the engine using only the TensorRT network-definition API (no parser).
//
// N       : batch dimension of the network input
// runtime : used to deserialize the built plan into an engine (TRT_VERSION >= 8000 path only)
// builder : builder that owns the network being defined
// config  : builder configuration; the workspace limit is set here
// dt      : input data type; overridden with an 8-bit type when TRT_PREPROCESS is enabled
//
// Returns the built engine, or nullptr if building/deserializing failed. The caller owns the engine.
// The host copies of the weights loaded from WTS_PATH are freed before returning.
ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    auto weightMap = loadWeights(WTS_PATH);
#if TRT_VERSION >= 10000
    auto* network = builder->createNetworkV2(0);
#else
    auto* network = builder->createNetworkV2(1u << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
#endif

    ITensor* data{nullptr};
    if constexpr (TRT_PREPROCESS) {
        // Same threshold as TRT_PREPROCESS and getSize(): use the unsigned 8-bit input type whenever
        // in-network preprocessing is enabled. (The guard used to be `> 8510`, which disagreed with
        // both of them for TRT_VERSION == 8510.)
#if TRT_VERSION >= 8510
        dt = DataType::kUINT8;
#else
        // Only compiled for versions where TRT_PREPROCESS is false, i.e. never executed.
        dt = DataType::kINT8;
#endif
        // The raw image is fed in NHWC layout; addTransformLayer() converts and normalises it.
        data = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3});
        auto* trans = addTransformLayer(network, *data, true, mean, stdv);
        data = trans->getOutput(0);
    } else {
        data = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W});
    }
    assert(data);

    auto* conv1 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["features.0.weight"],
                                            weightMap["features.0.bias"]);
    assert(conv1);
    conv1->setStrideNd(DimsHW{2, 2});
    auto* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    assert(relu1);
    auto* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);

    auto* cat1 = fire(network, weightMap, *pool1->getOutput(0), "features.3.", 16, 64, 64);
    cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.4.", 16, 64, 64);

    auto* pool2 = network->addPoolingNd(*cat1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool2);
    pool2->setStrideNd(DimsHW{2, 2});
    pool2->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);
    // pool2->setPostPadding(DimsHW{1, 1});

    cat1 = fire(network, weightMap, *pool2->getOutput(0), "features.6.", 32, 128, 128);
    cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.7.", 32, 128, 128);

    // NOTE(review): the post-padding here appears to be what makes the final feature map match the
    // 14x14 average pool below - confirm the spatial size against the reference model.
    auto* pool3 = network->addPoolingNd(*cat1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool3);
    pool3->setStrideNd(DimsHW{2, 2});
    pool3->setPostPadding(DimsHW{1, 1});
    pool3->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);

    cat1 = fire(network, weightMap, *pool3->getOutput(0), "features.9.", 48, 192, 192);
    cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.10.", 48, 192, 192);
    cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.11.", 64, 256, 256);
    cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.12.", 64, 256, 256);

    // classifier
    auto* conv2 = network->addConvolutionNd(*cat1->getOutput(0), 1000, DimsHW{1, 1}, weightMap["classifier.1.weight"],
                                            weightMap["classifier.1.bias"]);
    assert(conv2);
    auto* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU);
    assert(relu2);
    auto* pool4 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kAVERAGE, DimsHW{14, 14});
    assert(pool4);

    pool4->getOutput(0)->setName(NAMES[1]);
    network->markOutput(*pool4->getOutput(0));

    // Build engine
    ICudaEngine* engine{nullptr};
#if TRT_VERSION >= 8000
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE);
    IHostMemory* plan = builder->buildSerializedNetwork(*network, *config);
    if (plan != nullptr) {
        engine = runtime->deserializeCudaEngine(plan->data(), plan->size());
        // The engine does not reference the serialized plan after deserialization; release it
        // (it was previously leaked).
        delete plan;
    } else {
        std::cerr << "failed to build the serialized network" << std::endl;
    }
    delete network;
#else
    builder->setMaxBatchSize(N);
    config->setMaxWorkspaceSize(WORKSPACE_SIZE);
    engine = builder->buildEngineWithConfig(*network, *config);
    network->destroy();
#endif
    std::cout << "build out" << std::endl;

    // Release host memory
    for (auto& entry : weightMap) {
        free((void*)(entry.second.values));
    }

    return engine;
}

void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) {
    // Create builder
    auto* builder = createInferBuilder(gLogger);
    auto* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    auto* engine = createEngine(N, runtime, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

#if TRT_VERSION >= 8000
    delete engine;
    delete config;
    delete builder;
#else
    engine->destroy();
    config->destroy();
    builder->destroy();
#endif
}

// Run one synchronous inference.
//
// context    : execution context of the deserialized engine
// input      : host pointer to batch_size * SIZES[0] input elements of the engine's input data type
// batch_size : number of samples in `input`
//
// Returns one host vector per output tensor (tensor 0 is treated as the input), each holding
// batch_size * SIZES[i] floats. Device buffers and the CUDA stream are created and released per call.
// Throws std::runtime_error if the engine's tensor count does not match SIZES/NAMES; aborts on CUDA
// or enqueue failure.
std::vector<std::vector<float>> doInference(IExecutionContext& context, void* input, int32_t batch_size) {
    const auto& engine = context.getEngine();

    // The name-based tensor API (getNbIOTensors / getIOTensorName / setTensorAddress / enqueueV3)
    // is used from TensorRT 8.5 on; older versions fall back to the binding-index API.
#if TRT_VERSION >= 8500
    const int32_t nIO = engine.getNbIOTensors();
#else
    const int32_t nIO = engine.getNbBindings();
#endif

    // SIZES is indexed by tensor position below; refuse engines with a different tensor count
    // instead of reading past the end of the table.
    constexpr int32_t kExpectedIO = static_cast<int32_t>(sizeof(SIZES) / sizeof(SIZES[0]));
    if (nIO != kExpectedIO) {
        throw std::runtime_error("unexpected number of engine I/O tensors");
    }

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    std::vector<void*> buffers(nIO, nullptr);

    for (int32_t i = 0; i < nIO; ++i) {
#if TRT_VERSION >= 8500
        const auto* tensor_name = engine.getIOTensorName(i);
        const std::size_t size = getSize(engine.getTensorDataType(tensor_name)) * batch_size * SIZES[i];
#else
        const int32_t idx = engine.getBindingIndex(NAMES[i]);
        assert(idx == i);
        const std::size_t size = getSize(engine.getBindingDataType(idx)) * batch_size * SIZES[i];
#endif
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            // Tensor 0 is the network input: upload the host data.
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
#if TRT_VERSION >= 8500
        context.setTensorAddress(tensor_name, buffers[i]);
#endif
    }

    // The enqueue call must not live inside assert(): with NDEBUG the whole expression would be
    // compiled out and no inference would run.
#if TRT_VERSION >= 8500
    const bool enqueued = context.enqueueV3(stream);
#else
    const bool enqueued = context.enqueueV2(buffers.data(), stream, nullptr);
#endif
    if (!enqueued) {
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }

    // Copy straight into the vectors that are returned. reserve() keeps their storage stable, so
    // the asynchronous copies never target memory that is moved or destroyed before the
    // synchronization below.
    // NOTE: outputs are read back as float, matching the engine built by this program.
    std::vector<std::vector<float>> prob;
    prob.reserve(nIO - 1);
    for (int32_t i = 1; i < nIO; ++i) {
        prob.emplace_back(static_cast<std::size_t>(batch_size) * SIZES[i], std::nanf(""));
        auto& host = prob.back();
        CHECK(cudaMemcpyAsync(host.data(), buffers[i], host.size() * sizeof(float), cudaMemcpyDeviceToHost, stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    CHECK(cudaStreamDestroy(stream));
    for (void* buffer : buffers) {
        CHECK(cudaFree(buffer));
    }
    return prob;
}

int main(int argc, char** argv) {
    checkTrtEnv();
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./squeezenet -s   // serialize model to plan file" << std::endl;
        std::cerr << "./squeezenet -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    auto* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    char* trtModelStream{nullptr};
    size_t size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, runtime, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
#if TRT_VERSION >= 8000
        delete modelStream;
#else
        modelStream->destroy();
#endif
        return 0;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file(ENGINE_PATH, std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        return -1;
    }

#if TRT_VERSION >= 8000
    auto* engine = runtime->deserializeCudaEngine(trtModelStream, size);
#else
    auto* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
#endif
    assert(engine != nullptr);
    auto* context = engine->createExecutionContext();
    assert(context != nullptr);

    void* input = nullptr;
    std::vector<float> flat_img;
    cv::Mat img;
    if constexpr (TRT_PREPROCESS) {
        // for simplicity, resize image on cpu side
        img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR);
        cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        input = static_cast<void*>(img.data);
    } else {
        img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR);
        flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W);
        input = flat_img.data();
    }

    for (int32_t i = 0; i < 100; ++i) {
        auto _start = std::chrono::system_clock::now();
        auto prob = doInference(*context, input, N);
        auto _end = std::chrono::system_clock::now();
        auto _time = std::chrono::duration_cast<std::chrono::microseconds>(_end - _start).count();
        std::cout << "Execution time: " << _time << "us" << std::endl;

        for (auto vector : prob) {
            int idx = 0;
            for (auto v : vector) {
                std::cout << std::setprecision(4) << v << ", " << std::flush;
                if (++idx > 20) {
                    std::cout << "\n====" << std::endl;
                    break;
                }
            }
        }

        if (i == 99) {
            std::cout << "prediction result: " << std::endl;
            auto labels = loadImagenetLabelMap(LABELS_PATH);
            int _top = 0;
            for (auto& [idx, logits] : topk(prob[0], 3)) {
                std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits
                          << ", label: " << labels[idx] << std::endl;
            }
        }
    }

    delete[] trtModelStream;
    // Destroy the engine
#if TRT_VERSION >= 8000
    delete context;
    delete engine;
    delete runtime;
#else
    context->destroy();
    engine->destroy();
    runtime->destroy();
#endif
    return 0;
}


================================================
FILE: squeezenet/utils.h
================================================
#pragma once
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <fstream>
#include <iostream>
#include <map>
#include <numeric>
#include <opencv2/opencv.hpp>
#include <stdexcept>
#include <string>
#include <vector>

using namespace nvinfer1;

// Builder workspace limit in bytes: 16 << 20 == 16 MiB.
#define WORKSPACE_SIZE (16 << 20)

// Evaluate a CUDA runtime call once; on any result other than cudaSuccess print the error string,
// the numeric code and the call site to stderr, then abort(). The local uses a reserved-looking
// name so that an argument expression mentioning a variable called `ret` is not shadowed.
#define CHECK(status)                                                                                     \
    do {                                                                                                  \
        auto check_ret_ = (status);                                                                       \
        if (check_ret_ != cudaSuccess) {                                                                  \
            std::cerr << "Cuda failure: " << cudaGetErrorString(check_ret_) << " ("                       \
                      << static_cast<int>(check_ret_) << ") at " << __FILE__ << ":" << __LINE__           \
                      << std::endl;                                                                       \
            abort();                                                                                      \
        }                                                                                                 \
    } while (0)

/**
 * @brief Validate the TensorRT build/runtime environment for this demo.
 *
 * Compilation fails when TRT_VERSION is below 7220. When building against TensorRT < 8, the compute
 * capability of `device` is queried and an exception is raised if it is newer than SM 8.6.
 *
 * @param device CUDA device ordinal to inspect (default 0). Previously this argument was overwritten
 *               by cudaGetDevice() and therefore ignored; it is now the device that is queried.
 * @throws std::runtime_error if TensorRT < 8 is used with a device whose SM version is > 86
 */
static void checkTrtEnv(int device = 0) {
#if TRT_VERSION < 7220
#error "TensorRT >= 7.2.2 is required for this demo."
#endif
#if TRT_VERSION < 8000
    cudaDeviceProp prop{};
    CHECK(cudaGetDeviceProperties(&prop, device));
    // Encode the compute capability as major * 10 + minor, e.g. 8.6 -> 86.
    const int sm = prop.major * 10 + prop.minor;
    if (sm > 86) {
        throw std::runtime_error("TensorRT < 8 does not support SM > 86 on this GPU.");
    }
#else
    (void)device;  // only needed for the TensorRT < 8 compute-capability check
#endif
}

/**
 * @brief TensorRT weight files have a simple space delimited format:
 * [type] [size] <data x size in hex>
 * 
 * @param file input weight file path
 * @return std::map<std::string, nvinfer1::Weights> 
 */
static std::map<std::string, nvinfer1::Weights> loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> wt.count;

        // Load blob
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * wt.count));
        for (uint32_t x = 0; x < wt.count; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * @brief a preprocess function aligning with ImageNet preprocess in torchvision, only support 3-channel image
 *
 * The steps are: optional BGR->RGB swap, resize to (w, h) with bilinear interpolation, scale to
 * [0, 1], subtract `mean` and divide by `std` per channel, then repack from HWC to planar CHW.
 *
 * @note `img` is modified in place: after the call it holds the resized, normalised float image.
 *
 * @param img opencv image with BGR layout
 * @param bgr2rgb whether to convert BGR to RGB
 * @param mean subtract mean
 * @param std divide std
 * @param n batch size
 * @param h resize height
 * @param w resize width
 * @return std::vector<float> contiguous flatten image data in float32 type, n copies of the CHW image
 * @throws std::runtime_error if the image is empty or does not have exactly 3 channels
 */
static std::vector<float> preprocess_img(cv::Mat& img, bool bgr2rgb, const float mean[3], const float std[3], int n,
                                         int h, int w) {
    if (img.empty()) {
        throw std::runtime_error("input image is empty.");
    }
    const int c = img.channels();
    if (c != 3) {
        throw std::runtime_error("this demo only supports 3 channel input image.");
    }
    const std::size_t plane = static_cast<std::size_t>(h) * w;  // elements in one channel plane
    const std::size_t size = c * plane;                         // elements in one image
    if (bgr2rgb) {
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    }
    cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
    img.convertTo(img, CV_32FC3, 1.f / 255);
    img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]);
    std::vector<float> chw(n * size, 0.f);

    // fill all batch with the same input image
    for (int i = 0; i < n; ++i) {
        for (int y = 0; y < h; ++y) {
            for (int x = 0; x < w; ++x) {
                const cv::Vec3f& v = img.at<cv::Vec3f>(y, x);
                // The row stride inside a plane is the image width for every channel (channels 1
                // and 2 used `y * h`, which is only correct for square images).
                const std::size_t offset = i * size + static_cast<std::size_t>(y) * w + x;
                chw[offset] = v[0];
                chw[offset + plane] = v[1];
                chw[offset + 2 * plane] = v[2];
            }
        }
    }
    return chw;
}

/**
 * @brief Select the k largest entries of a score vector.
 *
 * @param v scores
 * @param k number of entries wanted; values <= 0 give an empty result, values above v.size() are clamped
 * @return (index, score) pairs ordered from the highest score downwards
 */
static std::vector<std::pair<int, float>> topk(const std::vector<float>& v, int k) {
    std::vector<std::pair<int, float>> out;
    if (k <= 0) {
        return out;
    }
    const int keep = std::min<int>(k, v.size());

    // Rank positions rather than values so the original indices survive the sort.
    std::vector<int> order(v.size());
    std::iota(order.begin(), order.end(), 0);

    const auto higherScoreFirst = [&v](int lhs, int rhs) { return v[lhs] > v[rhs]; };
    std::partial_sort(order.begin(), order.begin() + keep, order.end(), higherScoreFirst);

    out.reserve(keep);
    for (auto it = order.cbegin(); it != order.cbegin() + keep; ++it) {
        out.emplace_back(*it, v[*it]);
    }
    return out;
}

/**
 * @brief Load a class-index -> label map from a text file holding one `index: 'label'` entry per line
 *        (the layout of a printed Python dict).
 *
 * Parsing is tolerant:
 *  - leading blanks and an opening `{` before the index are skipped;
 *  - the label may be wrapped in single or double quotes and ends at the next quote of the same
 *    kind, so a double-quoted label may contain an apostrophe;
 *  - lines without a colon, a parsable index or a quoted label are skipped.
 *
 * @param path label file path
 * @return std::map<int, std::string> index -> label; empty if the file cannot be opened
 */
static std::map<int, std::string> loadImagenetLabelMap(const std::string& path) {
    std::map<int, std::string> labels;
    std::ifstream in(path);
    if (!in.is_open()) {
        return labels;
    }
    std::string line;
    while (std::getline(in, line)) {
        const auto colon = line.find(':');
        if (colon == std::string::npos) {
            continue;
        }

        // Index: everything before the colon, minus leading blanks / dict brace.
        const auto index_begin = line.find_first_not_of(" \t{");
        if (index_begin == std::string::npos || index_begin >= colon) {
            continue;
        }
        int idx = 0;
        try {
            idx = std::stoi(line.substr(index_begin, colon - index_begin));
        } catch (const std::logic_error&) {
            // std::invalid_argument / std::out_of_range: not an index, ignore the line
            continue;
        }

        // Label: text between the first quote after the colon and the next quote of the same kind.
        const auto open_quote = line.find_first_of("'\"", colon);
        if (open_quote == std::string::npos) {
            continue;
        }
        const auto close_quote = line.find(line[open_quote], open_quote + 1);
        if (close_quote == std::string::npos) {
            continue;
        }
        labels[idx] = line.substr(open_quote + 1, close_quote - open_quote - 1);
    }
    return labels;
}

/**
 * @brief Append in-network image preprocessing: cast to FP32 (if needed), NHWC -> NCHW transpose,
 *        optional BGR -> RGB channel swap, and per-channel normalisation.
 *
 * The normalisation is one channel-wise scale layer computing x * scale + shift with
 * scale = 1 / (255 * std) and shift = -mean / std, i.e. (x / 255 - mean) / std.
 *
 * @param network network to append the layers to
 * @param input   input tensor; the transpose below treats it as NHWC
 * @param bgr2rgb whether to reverse the channel order
 * @param mean    per-channel mean, in the channel order present AFTER the optional swap
 * @param std     per-channel standard deviation, same order as mean
 * @return the scale layer whose output 0 is the preprocessed NCHW tensor
 */
static ILayer* addTransformLayer(INetworkDefinition* network, ITensor& input, bool bgr2rgb, const float mean[3],
                                 const float std[3]) {
    struct ScaleParams {
        std::array<float, 3> shift;
        std::array<float, 3> scale;
    };
    // The Weights structs below only point at these arrays, so the storage is parked in a
    // function-local static container that lives until program exit.
    static std::vector<std::unique_ptr<ScaleParams>> gScaleParams;
    auto params = std::make_unique<ScaleParams>();
    params->shift = {-mean[0] / std[0], -mean[1] / std[1], -mean[2] / std[2]};
    params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)};

    static const Weights empty{DataType::kFLOAT, nullptr, 0ll};
    const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll};
    const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll};

    gScaleParams.emplace_back(std::move(params));

    ITensor* in = &input;
    if (input.getType() != DataType::kFLOAT) {
        // A dedicated cast layer is used on TensorRT 8.6 and newer; older versions convert through
        // an identity layer with an explicit output type. (The guard used to be `>= 8000`.)
#if TRT_VERSION >= 8600
        auto* cast = network->addCast(input, DataType::kFLOAT);
        assert(cast);
        cast->setName("Cast to FP32");
        in = cast->getOutput(0);
#else
        auto* identity = network->addIdentity(input);
        assert(identity);
        identity->setName("Convert to FP32");
        identity->setOutputType(0, DataType::kFLOAT);
        in = identity->getOutput(0);
#endif
    }

    // Convert from NHWC to NCHW
    auto* perm = network->addShuffle(*in);
    assert(perm);
    perm->setName("NHWC -> NCHW");
    perm->setFirstTranspose(Permutation{0, 3, 1, 2});

    // Convert from BGR to RGB (optional)
    ITensor* data{nullptr};
    if (bgr2rgb) {
        // Slice out channel `c` of the NCHW tensor as an N x 1 x H x W tensor.
        auto add_slice = [&](int c, const char* name) -> ITensor* {
            auto dims = perm->getOutput(0)->getDimensions();
            Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1};
            Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]};
            auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride);
            // Validate before the first dereference (the assert used to follow setName()).
            assert(_slice && _slice->getNbOutputs() == 1);
            _slice->setName(name);
            return _slice->getOutput(0);
        };
        // Source channel order is B(0), G(1), R(2); re-concatenate as R, G, B.
        ITensor* channels[] = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")};
        auto* cat = network->addConcatenation(channels, 3);
        assert(cat);
        cat->setName("RGB");
        cat->setAxis(1);
        data = cat->getOutput(0);
    } else {
        data = perm->getOutput(0);
    }

    // Normalize
    auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty);
    assert(trans);
    trans->setName("mean & std");
#if TRT_VERSION >= 8000
    trans->setChannelAxis(1);
#endif
    return trans;
}

/**
 * @brief Size in bytes of one element of the given TensorRT data type.
 *
 * Handles kINT8, kFLOAT, kHALF and kINT32, plus kUINT8 when TRT_VERSION >= 8510.
 *
 * @param dt data type to query
 * @return element size in bytes
 * @throws std::runtime_error for any other data type
 */
static size_t getSize(DataType dt) {
    size_t bytes = 0;
    switch (dt) {
#if TRT_VERSION >= 8510
        case DataType::kUINT8:
#endif
        case DataType::kINT8:
            bytes = sizeof(int8_t);
            break;
        case DataType::kFLOAT:
            bytes = sizeof(float);
            break;
        case DataType::kHALF:
            // half precision is stored in 16 bits
            bytes = sizeof(int16_t);
            break;
        case DataType::kINT32:
            bytes = sizeof(int32_t);
            break;
        default:
            throw std::runtime_error("Unsupported data type");
    }
    return bytes;
}


================================================
FILE: superpoint/CMakeLists.txt
================================================
# Build script for the SuperPoint demo: produces the `supernet` executable from supernet.cpp and
# utils.cpp and links it against TensorRT (nvinfer), the CUDA runtime (cudart) and OpenCV.
cmake_minimum_required(VERSION 2.6)

project(SuperPointNet)

# C++11 is requested three times in this file: here, via CMAKE_CXX_STANDARD, and again in CMAKE_CXX_FLAGS.
add_definitions(-std=c++11)

# NOTE(review): option() takes (<variable> "<help text>" [value]); here OFF sits in the help-text
# position, so no explicit default value is passed - confirm this is what was intended.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is forced to Debug regardless of what is passed on the command line.
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# Note that -Ofast is set here while -O2 is added by add_definitions() at the end of the file.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# OpenCV is looked up without REQUIRED, so a missing installation is not reported at this line.
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(supernet ${PROJECT_SOURCE_DIR}/supernet.cpp ${PROJECT_SOURCE_DIR}/utils.cpp)
target_link_libraries(supernet nvinfer)
target_link_libraries(supernet cudart)
target_link_libraries(supernet ${OpenCV_LIBS})

# NOTE(review): these are compiler options rather than preprocessor definitions, and they repeat
# -pthread / conflict with -Ofast from CMAKE_CXX_FLAGS above - confirm which optimisation level is wanted.
add_definitions(-O2 -pthread)

================================================
FILE: superpoint/README.md
================================================
# SuperPoint

The PyTorch implementation is from [magicleap/SuperPointPretrainedNetwork.](https://github.com/magicleap/SuperPointPretrainedNetwork)

The pretrained models are from [magicleap/SuperPointPretrainedNetwork.](https://github.com/magicleap/SuperPointPretrainedNetwork)


## Config

- FP16/FP32 can be selected by the macro `USE_FP16` in supernet.cpp
- GPU id and batch size can be selected by the macro `DEVICE` & `BATCH_SIZE` in supernet.cpp


## How to Run
1. Generate a .wts file from the baseline PyTorch implementation of the pretrained model. The following example describes how to generate superpoint_v1.wts from the PyTorch implementation of superpoint_v1.
```
git clone https://github.com/xiang-wuu/SuperPointPretrainedNetwork
cd SuperPointPretrainedNetwork
git checkout deploy
// copy tensorrtx/superpoint/gen_wts.py to here(SuperPointPretrainedNetwork)
python gen_wts.py
// a file 'superpoint_v1.wts' will be generated.
// before running gen_wts.py python script make sure you cloned private fork and checkout to deploy branch.
```

2. Put the .wts file into tensorrtx/superpoint, then build and run
```
cd tensorrtx/superpoint
mkdir build
cd build
cmake ..
make
./supernet -s SuperPointPretrainedNetwork/superpoint_v1.wts    // serialize model to plan file i.e. 'supernet.engine'
```

## Run Demo using SuperPointPretrainedNetwork Python Script
The live demo can be run by inferring with either the TensorRT-generated engine file or the pre-trained PyTorch weight file. The `demo_superpoint.py` script is modified to select TensorRT or PyTorch automatically based on the provided input weight file.
```
cd SuperPointPretrainedNetwork
python demo_superpoint.py assets/nyu_snippet.mp4 --cuda --weights_path tensorrtx/superpoint/build/supernet.engine
// provide absolute path to supernet.engine as input weight file 
python demo_superpoint.py assets/nyu_snippet.mp4 --cuda --weights_path superpoint_v1.pth
// execute above command to infer using pytorch pre-trained weight files instead of tensorrt engine file.
```

## Output
As the results below show, there is no significant difference in the inferred output.
<table>
<th>
PyTorch
</th>
<th>
TensorRT
</th>
<tr>
<td>
<img src="https://user-images.githubusercontent.com/107029401/177322379-2782ca66-bcac-4cf6-b6d3-e1b4d4a8e171.gif"/>
</td>
<td>
<img src="https://user-images.githubusercontent.com/107029401/177322387-c945b903-f233-4a43-bfd3-530c46f4f4db.gif"/>
</td>
</tr>
</table>

## TODO
- [ ] Optimizing post-processing using custom TensorRT layer.
- [ ] Benchmark validation for speed accuracy tradeoff with [hpatches](https://github.com/hpatches/hpatches-benchmark) dataset


================================================
FILE: superpoint/gen_wts.py
================================================
import torch
import struct
from model import SuperPointNet

model_name = "superpoint_v1"

# Load the pretrained SuperPoint weights and switch the network to inference mode.
net = SuperPointNet()
net.load_state_dict(torch.load("superpoint_v1.pth"))
net = net.cuda()
net.eval()

# Write the text-based .wts file:
#   line 1:           number of tensors
#   every other line: "<name> <element count> <hex> <hex> ..."
# where each <hex> is one value packed as a big-endian float32 and hex-encoded.
# The `with` block guarantees the file is flushed and closed even if an error
# occurs half-way through the export.
state = net.state_dict()
with open(model_name + ".wts", "w") as f:
    f.write("{}\n".format(len(state)))
    for k, v in state.items():
        # Flatten the tensor and move it to host memory before serializing.
        vr = v.reshape(-1).cpu().numpy()
        f.write("{} {}".format(k, len(vr)))
        for vv in vr:
            f.write(" ")
            f.write(struct.pack(">f", float(vv)).hex())
        f.write("\n")

================================================
FILE: superpoint/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream, prefixed by a timestamp and a
//!  caller-supplied prefix, whenever the buffer is synchronized or destroyed with pending data.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    Stream that receives the formatted log lines.
    //! \param prefix    Text inserted between the timestamp and the message.
    //! \param shouldLog When false, putOutput() emits nothing.
    LogStreamConsumerBuffer(std::ostream &stream, const std::string &prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, the prefix and the logging flag of \p other.
    //! The base std::stringbuf is default-constructed, so text buffered in \p other is not transferred.
    //! (Every member is initialized here; leaving mShouldLog unset would make putOutput() read an
    //! indeterminate value.)
    LogStreamConsumerBuffer(LogStreamConsumerBuffer &&other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // pbase() is the start of the buffered output sequence and pptr() the current write position;
        // they differ only when there is text that has not been emitted yet.
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    //! Called when the owning stream is flushed: emits the buffered text and reports success (0).
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the target stream, clears the buffer
    //! and flushes the stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // Format the timestamp into a local buffer so that no formatting state (fill character,
            // field width) is left behind on the target stream.
            std::time_t timestamp = std::time(nullptr);
            std::tm *tm_local = std::localtime(&timestamp);
            char stamp[32] = "";
            if (tm_local == nullptr ||
                std::strftime(stamp, sizeof(stamp), "[%m/%d/%Y-%H:%M:%S] ", tm_local) == 0)
            {
                // No usable local time: emit the message without a timestamp.
                stamp[0] = '\0';
            }
            // The timestamp goes to the same stream as the message, not unconditionally to std::cout.
            mOutput << stamp << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream &mOutput; // destination stream
    std::string mPrefix;   // text placed in front of every message
    bool mShouldLog;       // whether putOutput() emits anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
//! The class only owns the buffer; it adds no behavior of its own.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream &stream, const std::string &prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    // Buffer that derived classes hand to std::ostream.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!  A message is logged when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream
          ,
          mShouldLog(severity <= reportableSeverity), mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the same severity and logging flag as \p other.
    //!  The buffer is constructed afresh from those values rather than moved from \p other.
    LogStreamConsumer(LogStreamConsumer &&other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream
          ,
          mShouldLog(other.mShouldLog), mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity,
    //!  and propagates the result to the underlying buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! \brief Selects the destination stream: std::cout for severities that compare >= kINFO,
    //!  std::cerr for all others.
    static std::ostream &severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Returns the bracketed tag ("[F] ", "[E] ", "[W] ", "[I] " or "[V] ") for a severity.
    //!  An unknown severity trips an assertion and yields an empty prefix.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            return "[F] ";
        case Severity::kERROR:
            return "[E] ";
        case Severity::kWARNING:
            return "[W] ";
        case Severity::kINFO:
            return "[I] ";
        case Severity::kVERBOSE:
            return "[V] ";
        default:
            assert(0);
            return "";
        }
    }

    bool mShouldLog;    // cached result of mSeverity <= reportable severity
    Severity mSeverity; // severity attached to every message written through this consumer
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger whose reportable severity defaults to Severity::kWARNING
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger &getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is written through a temporary LogStreamConsumer, tagged with "[TRT] " and terminated
    //! with std::endl.
    //!
    void log(Severity severity, const char *msg) noexcept override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom &&) = default;

    private:
        friend class Logger;

        // Only Logger (a friend) can create a TestAtom; see Logger::defineTest().
        TestAtom(bool started, const std::string &name, const std::string &cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline)
        {
        }

        bool mStarted;        // set to true by Logger::reportTestStart()
        std::string mName;    // test name printed in result lines
        std::string mCmdline; // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string &name, const std::string &cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string &name, int argc, char const *const *argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom &testAtom)
    {
        // The RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom &testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom &testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom &testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom &testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return The value returned by reportPass() or reportFail()
    //!
    static int reportTest(const TestAtom &testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently used to filter log messages
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    //! NOTE(review): no call to this helper is visible inside this class - confirm whether it is still needed.
    //!
    static const char *severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            return "[F] ";
        case Severity::kERROR:
            return "[E] ";
        case Severity::kWARNING:
            return "[W] ";
        case Severity::kINFO:
            return "[I] ";
        case Severity::kVERBOSE:
            return "[V] ";
        default:
            assert(0);
            return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char *testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING:
            return "RUNNING";
        case TestResult::kPASSED:
            return "PASSED";
        case TestResult::kFAILED:
            return "FAILED";
        case TestResult::kWAIVED:
            return "WAIVED";
        default:
            assert(0);
            return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream &severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints "&&&& <RESULT> <test name> # <command line>" on the stream chosen for Severity::kINFO.
    //!
    static void reportTestResult(const TestAtom &testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const *const *argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; // threshold passed to every LogStreamConsumer created in log()
};

// Free-function helpers that create a LogStreamConsumer for a fixed severity, using the
// reportable severity currently configured on the given Logger.
namespace
{

    //!
    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
    //!
    //! Example usage:
    //!
    //!     LOG_VERBOSE(logger) << "hello world" << std::endl;
    //!
    inline LogStreamConsumer LOG_VERBOSE(const Logger &logger)
    {
        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
    }

    //!
    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
    //!
    //! Example usage:
    //!
    //!     LOG_INFO(logger) << "hello world" << std::endl;
    //!
    inline LogStreamConsumer LOG_INFO(const Logger &logger)
    {
        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
    }

    //!
    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
    //!
    //! Example usage:
    //!
    //!     LOG_WARN(logger) << "hello world" << std::endl;
    //!
    inline LogStreamConsumer LOG_WARN(const Logger &logger)
    {
        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
    }

    //!
    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
    //!
    //! Example usage:
    //!
    //!     LOG_ERROR(logger) << "hello world" << std::endl;
    //!
    inline LogStreamConsumer LOG_ERROR(const Logger &logger)
    {
        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
    }

    //!
    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
    //!         ("fatal" severity)
    //!
    //! Example usage:
    //!
    //!     LOG_FATAL(logger) << "hello world" << std::endl;
    //!
    inline LogStreamConsumer LOG_FATAL(const Logger &logger)
    {
        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
    }

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: superpoint/supernet.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "utils.h"
#include "cuda_runtime_api.h"
#include "logging.h"

// Build-time configuration of the sample.
//#define USE_FP16  // comment out this if want to use FP32
#define DEVICE 0     // GPU id
#define BATCH_SIZE 1 // currently, only support BATCH=1

// stuff we know about the network and the input/output blobs
// The network input is declared in createEngine() as a single-channel INPUT_H x INPUT_W tensor.
static const int INPUT_H = 120;
static const int INPUT_W = 160;
const char *INPUT_BLOB_NAME = "data";
const char *OUTPUT_BLOB_NAME_1 = "semi"; // name given to the convPb output
const char *OUTPUT_BLOB_NAME_2 = "desc"; // name given to the convDb output

// Logger instance handed to createInferBuilder().
static Logger gLogger;

// create the engine using only the API and not any parser.
ICudaEngine *createEngine(IBuilder *builder, IBuilderConfig *config, std::string path, DataType dt)
{
    INetworkDefinition *network = builder->createNetworkV2(0U);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights(path);

    IConvolutionLayer *conv1a = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["conv1a.weight"], weightMap["conv1a.bias"]);
    assert(conv1a);
    conv1a->setStrideNd(DimsHW{1, 1});
    conv1a->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu1 = network->addActivation(*conv1a->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IConvolutionLayer *conv1b = network->addConvolutionNd(*relu1->getOutput(0), 64, DimsHW{3, 3}, weightMap["conv1b.weight"], weightMap["conv1b.bias"]);
    assert(conv1b);
    conv1b->setStrideNd(DimsHW{1, 1});
    conv1b->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu2 = network->addActivation(*conv1b->getOutput(0), ActivationType::kRELU);
    assert(relu2);

    IPoolingLayer *pool1 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});

    IConvolutionLayer *conv2a = network->addConvolutionNd(*pool1->getOutput(0), 64, DimsHW{3, 3}, weightMap["conv2a.weight"], weightMap["conv2a.bias"]);
    assert(conv2a);
    conv2a->setStrideNd(DimsHW{1, 1});
    conv2a->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu3 = network->addActivation(*conv2a->getOutput(0), ActivationType::kRELU);
    assert(relu3);

    IConvolutionLayer *conv2b = network->addConvolutionNd(*relu3->getOutput(0), 64, DimsHW{3, 3}, weightMap["conv2b.weight"], weightMap["conv2b.bias"]);
    assert(conv2b);
    conv2b->setStrideNd(DimsHW{1, 1});
    conv2b->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu4 = network->addActivation(*conv2b->getOutput(0), ActivationType::kRELU);
    assert(relu4);

    IPoolingLayer *pool2 = network->addPoolingNd(*relu4->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool2);
    pool2->setStrideNd(DimsHW{2, 2});

    IConvolutionLayer *conv3a = network->addConvolutionNd(*pool2->getOutput(0), 128, DimsHW{3, 3}, weightMap["conv3a.weight"], weightMap["conv3a.bias"]);
    assert(conv3a);
    conv3a->setStrideNd(DimsHW{1, 1});
    conv3a->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu44 = network->addActivation(*conv3a->getOutput(0), ActivationType::kRELU);
    assert(relu44);

    IConvolutionLayer *conv3b = network->addConvolutionNd(*relu44->getOutput(0), 128, DimsHW{3, 3}, weightMap["conv3b.weight"], weightMap["conv3b.bias"]);
    assert(conv3b);
    conv3b->setStrideNd(DimsHW{1, 1});
    conv3b->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu5 = network->addActivation(*conv3b->getOutput(0), ActivationType::kRELU);
    assert(relu5);

    IPoolingLayer *pool3 = network->addPoolingNd(*relu5->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool3);
    pool3->setStrideNd(DimsHW{2, 2});

    IConvolutionLayer *conv4a = network->addConvolutionNd(*pool3->getOutput(0), 128, DimsHW{3, 3}, weightMap["conv4a.weight"], weightMap["conv4a.bias"]);
    assert(conv4a);
    conv4a->setStrideNd(DimsHW{1, 1});
    conv4a->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu6 = network->addActivation(*conv4a->getOutput(0), ActivationType::kRELU);
    assert(relu6);

    IConvolutionLayer *conv4b = network->addConvolutionNd(*relu6->getOutput(0), 128, DimsHW{3, 3}, weightMap["conv4b.weight"], weightMap["conv4b.bias"]);
    assert(conv4b);
    conv4b->setStrideNd(DimsHW{1, 1});
    conv4b->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu7 = network->addActivation(*conv4b->getOutput(0), ActivationType::kRELU);
    assert(relu7);

    IConvolutionLayer *convPa = network->addConvolutionNd(*relu7->getOutput(0), 256, DimsHW{3, 3}, weightMap["convPa.weight"], weightMap["convPa.bias"]);
    assert(convPa);
    convPa->setStrideNd(DimsHW{1, 1});
    convPa->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu8 = network->addActivation(*convPa->getOutput(0), ActivationType::kRELU);
    assert(relu8);

    IConvolutionLayer *convPb = network->addConvolutionNd(*relu8->getOutput(0), 65, DimsHW{1, 1}, weightMap["convPb.weight"], weightMap["convPb.bias"]);
    assert(convPb);
    convPb->setStrideNd(DimsHW{1, 1});

    IConvolutionLayer *convDa = network->addConvolutionNd(*relu7->getOutput(0), 256, DimsHW{3, 3}, weightMap["convDa.weight"], weightMap["convDa.bias"]);
    assert(convDa);
    convDa->setStrideNd(DimsHW{1, 1});
    convDa->setPaddingNd(DimsHW{1, 1});
    IActivationLayer *relu9 = network->addActivation(*convDa->getOutput(0), ActivationType::kRELU);
    assert(relu9);

    IConvolutionLayer *convDb = network->addConvolutionNd(*relu9->getOutput(0), 256, DimsHW{1, 1}, weightMap["convDb.weight"], weightMap["convDb.bias"]);
    assert(convDb);
    convDb->setStrideNd(DimsHW{1, 1});

    convPb->getOutput(0)->setName(OUTPUT_BLOB_NAME_1);
    std::cout << "set name out1" << std::endl;
    network->markOutput(*convPb->getOutput(0));

    convDb->getOutput(0)->setName(OUTPUT_BLOB_NAME_2);
    std::cout << "set name out2" << std::endl;
    network->markOutput(*convDb->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(BATCH_SIZE);
    config->setMaxWorkspaceSize(1 << 20);

#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif

    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto &mem : weightMap)
    {
        free((void *)(mem.second.values));
    }

    return engine;
}

// Builds the engine from the .wts file at `path` (using only the API, no parser) and stores the
// serialized plan in *modelStream. The caller owns *modelStream and must destroy() it.
void APIToModel(std::string path, IHostMemory **modelStream)
{
    // Create builder and its configuration
    IBuilder *builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig *config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine *engine = createEngine(builder, config, path, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down, including the builder config that was created above.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

int main(int argc, char **argv)
{
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 3 && std::string(argv[1]) == "-s")
    {
        IHostMemory *modelStream{nullptr};
        APIToModel(std::string(argv[2]), &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("supernet.engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char *>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }
    else
    {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./supernet -s <path_to_.wts_file>  // serialize model to plan file" << std::endl;
        return -1;
    }

    return 0;
}


================================================
FILE: superpoint/utils.cpp
================================================
#include "utils.h"
#include <dirent.h>
#include <string.h>

// Load weights from a .wts file (the text format shared with the TensorRT samples).
// The file is whitespace delimited:
//   <number of blobs>
//   <name> <size> <size 32-bit words in hex>      (one such record per blob)
// Every blob is returned as a kFLOAT Weights entry whose `values` buffer is allocated with
// malloc(); the caller is responsible for free()-ing each buffer.
// The process is aborted with a message on stderr if the file cannot be opened, is malformed
// or truncated, or if memory cannot be allocated.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    if (!input.is_open())
    {
        std::cerr << "Unable to load weight file: " << file << std::endl;
        abort();
    }

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    if (!input || count <= 0)
    {
        std::cerr << "Invalid weight map file: " << file << std::endl;
        abort();
    }

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        if (!input)
        {
            std::cerr << "Truncated or malformed weight file: " << file << std::endl;
            abort();
        }
        wt.type = DataType::kFLOAT;

        // Load blob: one 32-bit word per element (sized by the element type, not by the pointer).
        uint32_t *val = reinterpret_cast<uint32_t *>(malloc(sizeof(uint32_t) * size));
        if (val == nullptr && size > 0)
        {
            std::cerr << "Out of memory while loading weights: " << name << std::endl;
            abort();
        }
        for (uint32_t x = 0; x < size; ++x)
        {
            input >> std::hex >> val[x];
        }
        if (!input)
        {
            std::cerr << "Truncated or malformed weight data for: " << name << std::endl;
            abort();
        }
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Appends the name of every entry of directory `p_dir_name`, except "." and "..", to `file_names`.
// Only the bare entry names are stored; they are not joined with the directory path.
// Returns 0 on success and -1 if the directory cannot be opened (file_names is then untouched).
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names)
{
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle)
    {
        return -1;
    }

    for (struct dirent *entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle))
    {
        const bool is_self = (strcmp(entry->d_name, ".") == 0);
        const bool is_parent = (strcmp(entry->d_name, "..") == 0);
        if (is_self || is_parent)
        {
            continue;
        }
        file_names.emplace_back(entry->d_name);
    }

    closedir(dir_handle);
    return 0;
}

// Splits `str` at any character contained in `delimiters` and appends the resulting non-empty
// pieces to `tokens`. Runs of consecutive delimiters, as well as leading and trailing ones,
// produce no empty tokens. Existing contents of `tokens` are kept.
void tokenize(const std::string &str, std::vector<std::string> &tokens, const std::string &delimiters)
{
    typedef std::string::size_type Index;

    // Start of the first token (npos when the string holds no token at all).
    Index tokenBegin = str.find_first_not_of(delimiters, 0);
    while (tokenBegin != std::string::npos)
    {
        // First delimiter after the token, or npos if the token runs to the end of the string.
        const Index tokenEnd = str.find_first_of(delimiters, tokenBegin);
        tokens.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));

        // Advance to the start of the next token.
        tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
    }
}


================================================
FILE: superpoint/utils.h
================================================
#pragma once

#include <map>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "assert.h"
#include <fstream>
#include <iostream>
#include <memory>
#include <vector>
#include <opencv2/opencv.hpp>


using namespace nvinfer1;

// Evaluates `status` exactly once; if the result is non-zero, reports it and aborts the process.
// The message goes to std::cerr and is terminated with std::endl so that it is flushed before
// abort(), which does not flush buffered output by itself.
#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
        {                                                      \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)


// Appends the names of the entries of directory p_dir_name (excluding "." and "..") to file_names.
// Returns 0 on success, -1 if the directory cannot be opened.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names);
// Reads a .wts weight file and returns its blobs keyed by name; the value buffers are malloc'ed.
std::map<std::string, Weights> loadWeights(const std::string file);
// Splits str at any of the delimiter characters (default ",") and appends the non-empty pieces to tokens.
void tokenize(const std::string &str, std::vector<std::string> &tokens, const std::string &delimiters = ",");

================================================
FILE: swin-transformer/semantic-segmentation/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.4)

project(swintransformer)

# Machine-specific install locations. Adjust these to your OpenCV build
# directory (the one containing OpenCVConfig.cmake) and your TensorRT root.
set(OpenCV_DIR "D:\\opencv\\opencv346\\build")
set(TENSORRT_DIR "D:\\TensorRT-7.0.0.11.Windows10.x86_64.cuda-10.2.cudnn7.6\\TensorRT-7.0.0.11")

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
# option() takes (<variable> "<help text>" [initial value]); the default stays OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -D_MWAITXINTRIN_H_INCLUDED")
if(WIN32)
include_directories(${PROJECT_SOURCE_DIR}/include)
endif(WIN32)


find_package(CUDA REQUIRED)
message(STATUS "    libraries: ${CUDA_LIBRARIES}")
message(STATUS "    include path: ${CUDA_INCLUDE_DIRS}")
include_directories(${CUDA_INCLUDE_DIRS})
# Flags for the FindCUDA cuda_add_* helpers. -gencode takes a single
# "arch=...,code=..." argument.
# NOTE(review): the target below is built through enable_language(CUDA) and
# add_executable(), so these flags are presumably not applied to it; move them
# to CMAKE_CUDA_FLAGS if they are actually wanted.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11;-g;-G;-gencode;arch=compute_75,code=sm_75)
# Enable CMake's built-in CUDA language support. (The original comment was
# mis-encoded; it appears to say that with this line CUDA no longer has to be
# configured by hand in Visual Studio.)
# NOTE(review): enable_language(CUDA) needs a newer CMake than the 3.4 minimum declared above.
enable_language(CUDA)
include_directories(${TENSORRT_DIR}\\include)
link_directories(${TENSORRT_DIR}\\lib)

# file(GLOB SOURCE_FILES "*.cu")
# cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/API.h)
# target_link_libraries(myplugins nvinfer cudart)

# OpenCV package information. All default search locations are disabled, so
# the package is expected to be located through OpenCV_DIR set above.
find_package(OpenCV QUIET
    NO_MODULE
    NO_DEFAULT_PATH
    NO_CMAKE_PATH
    NO_CMAKE_ENVIRONMENT_PATH
    NO_SYSTEM_ENVIRONMENT_PATH
    NO_CMAKE_PACKAGE_REGISTRY
    NO_CMAKE_BUILDS_PATH
    NO_CMAKE_SYSTEM_PATH
    NO_CMAKE_SYSTEM_PACKAGE_REGISTRY
)

message(STATUS "OpenCV library status:")
message(STATUS "    version: ${OpenCV_VERSION}")
message(STATUS "    libraries: ${OpenCV_LIBS}")
message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")

include_directories(${OpenCV_INCLUDE_DIRS})


# Every header, C++ source and CUDA source in this directory is part of the executable.
file(GLOB SOURCE_FILES "*.h" "*.cpp" "*.cu")
add_executable(swintransformer ${SOURCE_FILES})

target_link_libraries(swintransformer nvinfer nvonnxparser)
target_link_libraries(swintransformer cudart)
target_link_libraries(swintransformer ${OpenCV_LIBS})

# if (WIN32)
    # message(STATUS "copy dll......: ${CMAKE_COMMAND} ${TENSORRT_DIR}")
    # add_custom_command(TARGET swintransformer POST_BUILD
        # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/myelin64_1.dll ./${CMAKE_BUILD_TYPE}/myelin64_1.dll
        # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvinfer.dll ./${CMAKE_BUILD_TYPE}/nvinfer.dll
        # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvinfer_plugin.dll ./${CMAKE_BUILD_TYPE}/nvinfer_plugin.dll
        # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvonnxparser.dll ./${CMAKE_BUILD_TYPE}/nvonnxparser.dll
        # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvparsers.dll ./${CMAKE_BUILD_TYPE}/nvparsers.dll
        # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvserialize.dll ./${CMAKE_BUILD_TYPE}/nvserialize.dll
        # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CUDA_TOOLKIT_ROOT_DIR}/bin/cublas64_10.dll ./${CMAKE_BUILD_TYPE}/cublas64_10.dll
    # )
# endif(WIN32)

if(UNIX)
add_definitions(-O2 -pthread)
endif(UNIX)

================================================
FILE: swin-transformer/semantic-segmentation/README.md
================================================
# Swin Transformer - Semantic Segmentation

The Pytorch implementation is [microsoft/Swin-Transformer](https://github.com/microsoft/Swin-Transformer.git).

Only Swin-T is supported; PRs for other backbones are welcome.

## Authors

<a href="https://github.com/wdhao"><img src="https://avatars.githubusercontent.com/u/58798355?v=4&s=48" width="40px;" alt=""/></a>
<a href="https://github.com/wang-xinyu"><img src="https://avatars.githubusercontent.com/u/15235574?s=48&v=4" width="40px;" alt=""/></a> 

## How to Run

1. generate .wts from pytorch with .pt, or download .wts from model zoo

```
git clone https://github.com/microsoft/Swin-Transformer.git
git clone https://github.com/wang-xinyu/tensorrtx.git

python gen_wts.py Swin-Transform.pt
// a file 'Swin-Transform.wts' will be generated.
```

2. build tensorrtx/swin-transformer/semantic-segmentation and run

```
cd {tensorrtx}/swin-transformer/semantic-segmentation/
mkdir build
cd build
cp {microsoft}/Swin-Transformer/Swin-Transform.wts {tensorrtx}/swin-transformer/semantic-segmentation/build
cmake ..
make
sudo ./swintransformer -s [.wts] [.engine]   // serialize model to plan file
sudo ./swintransformer -d [.engine] [image folder]  // deserialize and run inference, the images in [image folder] will be processed.

```

## More Information

See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: swin-transformer/semantic-segmentation/UpsampleKernel.cu
================================================
#include "UpsmapleKernel.h"


/**
 * @brief Compute how many thread blocks are needed to cover a given number of threads.
 *        (Reference: "CUDA by Example", Chinese edition, pp. 46-47.)
 *
 * This is a ceiling division: the result times max_thread_num is the smallest
 * multiple of max_thread_num that is >= total_thread_num.
 *
 * @param total_thread_num total number of CUDA threads wanted for the upsample
 * @param max_thread_num   number of threads placed in each block
 * @return int  number of blocks to launch
 */
int get_kernel_num(int total_thread_num, int max_thread_num)
{
    const int padded_total = total_thread_num + max_thread_num - 1;
    return padded_total / max_thread_num;
}

int get_max_thread_num()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    return prop.maxThreadsPerBlock;
}

// Ratio input_size / output_size as a float: the step in source coordinates
// that corresponds to one step in destination coordinates.
__host__ __forceinline__ float linear_upsampling_compute_scale(int input_size, int output_size)
{
    const float source_extent = float(input_size);
    const float target_extent = float(output_size);
    return source_extent / target_extent;
}

// Map a destination index to a fractional source coordinate using pixel-centre
// alignment: scale * (dst_index + 0.5) - 0.5, with negative results clamped to 0.
// `intput_size` is accepted but not used by the computation.
__device__ __forceinline__ float linear_upsampling_compute_source_index(float scale, int dst_index, int intput_size)
{
    const float centred = scale * (dst_index + 0.5)-0.5;
    if (centred >= 0)
    {
        return centred;
    }
    return 0;
}


// Flatten a (batch, channel, row, column) coordinate into a linear offset.
// `batch_total` and `channel_total` are the element strides of one batch item
// and one channel plane respectively; `width` is the row stride.
__device__ __forceinline__ int get_index(const int batch_idx, const int channel_idx, const int height_idx, const int width_idx, 
                const int batch_total, const int channel_total, const int width)
{
    const int batch_offset = batch_idx * batch_total;
    const int channel_offset = channel_idx * channel_total;
    const int row_offset = height_idx * width;
    return batch_offset + channel_offset + row_offset + width_idx;
}

/**
 * @brief Bilinear resize kernel: each thread handles one output (row, column)
 *        position and loops over every batch item and channel for it.
 *
 * Thread `index` maps to output row index / output_w and column
 * index % output_w, so `n` is expected to be output_h * output_w.
 * Source coordinates come from linear_upsampling_compute_source_index; the
 * "+1" neighbour is dropped at the last input row/column so reads stay inside
 * the input plane.
 *
 * @tparam T        element type of the tensors
 * @param n         number of threads that do work; threads with index >= n exit
 * @param input_b   batch count looped over by every thread
 * @param input_c   channel count (shared by input and output)
 * @param input_h   input height
 * @param input_w   input width
 * @param output_h  output height
 * @param output_w  output width
 * @param rate_h    source/destination ratio along the height
 * @param rate_w    source/destination ratio along the width
 * @param inputs    input tensor, indexed as [batch][channel][row][column]
 * @param outputs   output tensor, indexed as [batch][channel][row][column]
 */


template <typename T>
__global__ void BilinearKernel(
        const int n,
        int input_b,
        int input_c,
        int input_h,
        int input_w,
        int output_h,
        int output_w,
        const float rate_h,
        const float rate_w,
        const T* inputs,
        T* outputs)
{
    const int pixel = threadIdx.x + blockIdx.x * blockDim.x;
    if (pixel >= n)
    {
        return;
    }

    // Output position handled by this thread.
    const int out_x = pixel % output_w;
    const int out_y = pixel / output_w;

    // Vertical source coordinate, its integer row, neighbour step and weights.
    const float src_y = linear_upsampling_compute_source_index(rate_h, out_y, input_h);
    const int y0 = int(src_y);
    const int y_step = (y0 < input_h - 1) ? 1 : 0;
    const float weight_y1 = src_y - y0;
    const float weight_y0 = 1 - weight_y1;

    // Horizontal source coordinate, its integer column, neighbour step and weights.
    const float src_x = linear_upsampling_compute_source_index(rate_w, out_x, input_w);
    const int x0 = int(src_x);
    const int x_step = (x0 < input_w - 1) ? 1 : 0;
    const float weight_x1 = src_x - x0;
    const float weight_x0 = 1 - weight_x1;

    // Element strides of one batch item / one channel plane, for input and output.
    const int in_batch_stride = input_c * input_h * input_w;
    const int in_plane_stride = input_h * input_w;
    const int out_batch_stride = input_c * output_h * output_w;
    const int out_plane_stride = output_h * output_w;

    for (int b = 0; b < input_b; b++)
    {
        for (int c = 0; c < input_c; c++)
        {
            const T top_left = inputs[get_index(b, c, y0, x0, in_batch_stride, in_plane_stride, input_w)];
            const T top_right = inputs[get_index(b, c, y0, x0+x_step, in_batch_stride, in_plane_stride, input_w)];
            const T bottom_left = inputs[get_index(b, c, y0+y_step, x0, in_batch_stride, in_plane_stride, input_w)];
            const T bottom_right = inputs[get_index(b, c, y0+y_step, x0+x_step, in_batch_stride, in_plane_stride, input_w)];

            const T val = weight_y0 * (weight_x0 * top_left
                                    + weight_x1 * top_right)
                                    + weight_y1 * (weight_x0 * bottom_left
                                    + weight_x1 * bottom_right);
            outputs[get_index(b, c, out_y, out_x, out_batch_stride, out_plane_stride, output_w)] = val;
        }
    }
}


/**
 * @brief Launch the bilinear upsample kernel on `stream` for float tensors.
 *
 * @param stream   CUDA stream the kernel is enqueued on
 * @param n        number of output positions per channel plane (output_h * output_w);
 *                 one thread is used per position
 * @param input_b  batch count
 * @param input_c  channel count
 * @param input_h  input height
 * @param input_w  input width
 * @param scale_h  height scale factor; output height is int(input_h * scale_h)
 * @param scale_w  width scale factor; output width is int(input_w * scale_w)
 * @param inputs   device pointer to the float input tensor
 * @param outputs  device pointer to the float output tensor
 * @return int     0 when the launch was accepted, non-zero when the arguments
 *                 describe an empty output or the launch failed
 */
int UpsampleInference(
    cudaStream_t stream,
    int n,
    int input_b,
    int input_c,
    int input_h,
    int input_w,
    float scale_h,
    float scale_w,
    const void* inputs,
    void* outputs)
{
    int output_h = int(input_h * scale_h);
    int output_w = int(input_w * scale_w);

    // An empty output means the plugin was never configured or the scale is
    // invalid; launching would use a zero-sized grid and a division by zero
    // in the rate computation, so report failure instead.
    if (n <= 0 || output_h <= 0 || output_w <= 0)
    {
        return 1;
    }

    int max_threads = get_max_thread_num();
    int kernel_num = get_kernel_num(n, max_threads);
    float rate_h = linear_upsampling_compute_scale(input_h, output_h);
    float rate_w = linear_upsampling_compute_scale(input_w, output_w);

    BilinearKernel<float><<< kernel_num, max_threads, 0, stream>>>(n,input_b,input_c,input_h,input_w,
                                                                                    output_h, output_w, 
                                                                                    rate_h, rate_w,
                                                                                    static_cast<const float*>(inputs),
                                                                                    static_cast<float*>(outputs));

    // A kernel launch has no return value; query the launch status so that
    // configuration errors are propagated to the caller instead of being lost.
    return (cudaGetLastError() == cudaSuccess) ? 0 : 1;
}


================================================
FILE: swin-transformer/semantic-segmentation/UpsamplePlugin.cpp
================================================
#include <iostream>
#include "UpsmapleKernel.h"
#include "UpsamplePlugin.h"

#include <cassert>
#include <cstring>

using namespace nvinfer1;

// Upsample plugin specific constants.
// These are the version/name strings returned by both the plugin and its
// creator (getPluginVersion / getPluginType / getPluginName).
namespace {
    static const char* UPSAMPLE_PLUGIN_VERSION{"1"};
    static const char* UPSAMPLE_PLUGIN_NAME{"UpsamplePlugin"};
}

// Static class fields initialization.
// The creator's constructor appends to mPluginAttributes and points mFC at it,
// so both must be defined before any UpsamplePluginCreator is constructed.
PluginFieldCollection UpsamplePluginCreator::mFC{};
std::vector<PluginField> UpsamplePluginCreator::mPluginAttributes;

// NOTE(review): presumably this macro constructs an UpsamplePluginCreator during
// static initialization and registers it with the plugin registry; if so it must
// stay after the two static member definitions above — confirm against NvInferRuntimeCommon.h.
REGISTER_TENSORRT_PLUGIN(UpsamplePluginCreator);

// Helper function for serializing plugin.
// Copies the object representation of `val` to `buffer` and advances `buffer`
// past it. memcpy is used instead of a typed store so the write is valid for
// any buffer alignment and does not rely on type punning.
template<typename T>
void writeToBuffer(char*& buffer, const T& val)
{
    std::memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

// Helper function for deserializing plugin.
// Reads one T from `buffer` and advances `buffer` past it. memcpy is used
// instead of a typed load so the read is valid for any buffer alignment and
// does not rely on type punning.
template<typename T>
T readFromBuffer(const char*& buffer)
{
    T val;
    std::memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
    return val;
}

// Build an unconfigured plugin from its two scale factors.
// The input shape is marked unknown (-1) and the volume is 0 until
// configureWithFormat() fills them in. mAlignCorners is given a defined value
// so that no member is left uninitialised.
UpsamplePlugin::UpsamplePlugin(const std::string name, float scale_h, float scale_w)
    : mLayerName(name)
    , mAlignCorners(false)
    , mScaleFactor_h(scale_h)
    , mScaleFactor_w(scale_w)
{
    mInputShape.c() = -1;
    mInputShape.h() = -1;
    mInputShape.w() = -1;
    mInputVolume = 0;
}

// Rebuild a plugin from the byte stream produced by serialize().
// Field order must match serialize(): scale_h, scale_w, volume, c, h, w.
UpsamplePlugin::UpsamplePlugin(const std::string name, const void* data, size_t length)
    : mLayerName(name)
    , mAlignCorners(false)
{
    // Validate the size before touching the buffer so a short buffer is caught
    // (in debug builds) before it is over-read rather than after.
    assert(data != nullptr);
    assert(length == getSerializationSize());
    (void)length;  // only used by the assertions above

    const char *d = static_cast<const char *>(data);

    mScaleFactor_h = readFromBuffer<float>(d);
    mScaleFactor_w = readFromBuffer<float>(d);
    mInputVolume = readFromBuffer<size_t>(d);
    mInputShape.c() = readFromBuffer<int>(d);
    mInputShape.h() = readFromBuffer<int>(d);
    mInputShape.w() = readFromBuffer<int>(d);
}

// Plugin type string; the same constant the creator returns from getPluginName().
const char* UpsamplePlugin::getPluginType() const
{
    return UPSAMPLE_PLUGIN_NAME;
}

// Plugin version string; the same constant the creator returns from getPluginVersion().
const char* UpsamplePlugin::getPluginVersion() const
{
    return UPSAMPLE_PLUGIN_VERSION;
}

// The plugin produces exactly one output tensor.
int UpsamplePlugin::getNbOutputs() const
{
    return 1;
}

// Output shape for the single output: channels are unchanged, height and width
// are the input extents multiplied by the scale factors and truncated to int.
// Expects exactly one 3-D (C, H, W) input.
Dims UpsamplePlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
{
    assert(index == 0);
    assert(nbInputDims == 1);

    const Dims& in = inputs[0];
    assert(in.nbDims == 3);

    const int out_h = int(in.d[1]*mScaleFactor_h);
    const int out_w = int(in.d[2]*mScaleFactor_w);
    return nvinfer1::DimsCHW{in.d[0], out_h, out_w};
}

// No resources are acquired here; always reports success (0).
int UpsamplePlugin::initialize()
{
    //printf("UpsamplePlugin::initialize\n");
    return 0;
}


// Run the upsample for one batch: forwards the configured shape, the scale
// factors and the first input / first output buffers to UpsampleInference and
// returns its status unchanged. The workspace pointer is not used.
int UpsamplePlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream)
{
    // One input tensor in, one output tensor out.
    const void* source = inputs[0];
    void* destination = outputs[0];

    return UpsampleInference(stream, mInputVolume,
                             batchSize, mInputShape.c(), mInputShape.h(), mInputShape.w(),
                             mScaleFactor_h, mScaleFactor_w,
                             source, destination);
}

// Number of bytes serialize() writes: two scale factors, the volume and the
// three shape components.
size_t UpsamplePlugin::getSerializationSize() const
{
    const size_t scale_bytes = sizeof(mScaleFactor_h) + sizeof(mScaleFactor_w);
    const size_t volume_bytes = sizeof(mInputVolume);
    const size_t shape_bytes = sizeof(mInputShape.c()) + sizeof(mInputShape.h()) + sizeof(mInputShape.w());
    return scale_bytes + volume_bytes + shape_bytes;
}


// Write the plugin state to `buffer` in the order the deserializing
// constructor reads it back: scale_h, scale_w, volume, c, h, w.
void UpsamplePlugin::serialize(void* buffer) const 
{
    char* const begin = static_cast<char*>(buffer);
    char* cursor = begin;

    writeToBuffer(cursor, mScaleFactor_h);
    writeToBuffer(cursor, mScaleFactor_w);
    writeToBuffer(cursor, mInputVolume);
    writeToBuffer(cursor, mInputShape.c());
    writeToBuffer(cursor, mInputShape.h());
    writeToBuffer(cursor, mInputShape.w());

    // The bytes written must match what getSerializationSize() advertised.
    assert(cursor == begin + getSerializationSize());
}

void UpsamplePlugin::configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, DataType type, PluginFormat format, int)
{
    assert(nbOutputs == 1);
    assert(type == DataType::kFLOAT);
    assert(format == PluginFormat::kNCHW);
    assert(inputs[0].nbDims == 3);

    size_t volume = int(inputs[0].d[1]*mScaleFactor_h) * int(inputs[0].d[2]*mScaleFactor_w);
    mInputVolume = volume;
    mInputShape.c() = inputs[0].d[0];
    mInputShape.h() = inputs[0].d[1];
    mInputShape.w() = inputs[0].d[2];
}

// Only float tensors in NCHW layout are supported.
bool UpsamplePlugin::supportsFormat(DataType type, PluginFormat format) const
{
    const bool is_float = (type == DataType::kFLOAT);
    return is_float && format == PluginFormat::kNCHW;
}

// Nothing to release: initialize() acquires no resources.
void UpsamplePlugin::terminate() {}

// Self-deleting destroy: the object must have been allocated with `new`
// (as clone(), createPlugin() and deserializePlugin() do).
void UpsamplePlugin::destroy() {
    // This gets called when the network containing plugin is destroyed
    delete this;
}

// Create an independent copy of this plugin, including any state recorded by
// configureWithFormat() and the plugin namespace. Without copying that state a
// clone of a configured plugin would run with a zero work size and an unknown
// input shape.
IPluginV2* UpsamplePlugin::clone() const
{
    UpsamplePlugin* copy = new UpsamplePlugin(mLayerName, mScaleFactor_h, mScaleFactor_w);
    copy->mInputShape = mInputShape;
    copy->mInputVolume = mInputVolume;
    copy->mNamespace = mNamespace;
    return copy;
}

// Store the namespace this plugin belongs to. A null pointer is treated as the
// empty namespace, because assigning a null char pointer to std::string is
// undefined behaviour.
void UpsamplePlugin::setPluginNamespace(const char* libNamespace) 
{
    mNamespace = (libNamespace != nullptr) ? libNamespace : "";
}

// Returns the stored namespace; the pointer refers to mNamespace's internal
// buffer and is only valid while mNamespace is not modified or destroyed.
const char* UpsamplePlugin::getPluginNamespace() const
{
    return mNamespace.c_str();
}

// Describe the single attribute this creator understands: "scaleFactor",
// two FLOAT32 values (height scale, width scale).
UpsamplePluginCreator::UpsamplePluginCreator()
{
    // mPluginAttributes is static and shared by every creator instance; reset
    // it so constructing more than one creator does not accumulate duplicate
    // "scaleFactor" entries.
    mPluginAttributes.clear();
    mPluginAttributes.emplace_back(PluginField("scaleFactor", nullptr, PluginFieldType::kFLOAT32, 2));

    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}
// Name of the plugin this creator builds; the same constant UpsamplePlugin::getPluginType() returns.
const char* UpsamplePluginCreator::getPluginName() const
{
    return UPSAMPLE_PLUGIN_NAME;
}

// Version of the plugin this creator builds; the same constant UpsamplePlugin::getPluginVersion() returns.
const char* UpsamplePluginCreator::getPluginVersion() const
{
    return UPSAMPLE_PLUGIN_VERSION;
}

// Returns the static field collection filled in by the constructor
// (one "scaleFactor" entry whose data pointer is null).
const PluginFieldCollection* UpsamplePluginCreator::getFieldNames()
{
    return &mFC;
}

// Build an UpsamplePlugin from a field collection.
//
// The collection is expected to hold one FLOAT32 field named "scaleFactor"
// whose data points at two floats: the height scale followed by the width scale.
// Returns a heap-allocated plugin, or nullptr when the collection is missing
// or carries no usable "scaleFactor" data (rather than dereferencing a null
// pointer or silently building a plugin with zero scale factors).
IPluginV2* UpsamplePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
{
    if (fc == nullptr || fc->fields == nullptr)
    {
        return nullptr;
    }

    float scaleFactor_h = 0.f;
    float scaleFactor_w = 0.f;
    bool haveScale = false;
    const PluginField* fields = fc->fields;

    assert(fc->nbFields == 1);
    for (int i = 0; i < fc->nbFields; i++){
        const PluginField& field = fields[i];
        if (field.name == nullptr || strcmp(field.name, "scaleFactor") != 0) {
            continue;
        }
        assert(field.type == PluginFieldType::kFLOAT32);
        if (field.data == nullptr) {
            // Descriptor-only field (e.g. straight from getFieldNames()): no values to read.
            continue;
        }
        const float* values = static_cast<const float*>(field.data);
        scaleFactor_h = values[0];
        scaleFactor_w = values[1];
        haveScale = true;
    }

    if (!haveScale)
    {
        return nullptr;
    }
    return new UpsamplePlugin(name, scaleFactor_h, scaleFactor_w);
}

// Rebuild a plugin from bytes previously produced by UpsamplePlugin::serialize().
// The returned object is heap-allocated; UpsamplePlugin::destroy() deletes it.
IPluginV2* UpsamplePluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
{
    return new UpsamplePlugin(name, serialData, serialLength);
}

// Store the namespace of this creator. A null pointer is treated as the empty
// namespace, because assigning a null char pointer to std::string is
// undefined behaviour.
void UpsamplePluginCreator::setPluginNamespace(const char* libNamespace) 
{
    mNamespace = (libNamespace != nullptr) ? libNamespace : "";
}

// Returns the stored namespace; the pointer refers to mNamespace's internal
// buffer and is only valid while mNamespace is not modified or destroyed.
const char* UpsamplePluginCreator::getPluginNamespace() const
{
    return mNamespace.c_str();
}


================================================
FILE: swin-transformer/semantic-segmentation/UpsamplePlugin.h
================================================
#ifndef UPSAMPLE_PLUGIN_H
#define UPSAMPLE_PLUGIN_H

#include "NvInferPlugin.h"
#include <string>
#include <vector>


using namespace nvinfer1;

// TensorRT IPluginV2 layer that bilinearly resizes a (C, H, W) float tensor
// by independent height and width scale factors.
class UpsamplePlugin : public IPluginV2
{
public:
    // Build an unconfigured plugin from its two scale factors.
    UpsamplePlugin(const std::string name, float scale_h,float scale_w);

    // Rebuild a plugin from bytes produced by serialize().
    UpsamplePlugin(const std::string name, const void* data, size_t length);

    // It doesn't make sense to make UpsamplePlugin without arguments, so we delete default constructor.
    UpsamplePlugin() = delete;

    // Always one output tensor.
    int getNbOutputs() const override;

    // Output is (C, int(H * scale_h), int(W * scale_w)) for a (C, H, W) input.
    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;

    int initialize() override;

    void terminate() override;

    // No scratch memory is needed.
    size_t getWorkspaceSize(int) const override { return 0; }

    // Launches the upsample kernel on `stream`; returns the kernel wrapper's status.
    int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override;

    size_t getSerializationSize() const override;

    // Writes scale_h, scale_w, volume, c, h, w (in that order) to `buffer`.
    void serialize(void* buffer) const override;

    // Records the input shape and the per-plane work size used by enqueue().
    void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override;

    // Only float data in NCHW layout is supported.
    bool supportsFormat(DataType type, PluginFormat format) const override;

    const char* getPluginType() const override;

    const char* getPluginVersion() const override;

    // Deletes this object.
    void destroy() override;

    nvinfer1::IPluginV2* clone() const override;

    void setPluginNamespace(const char* pluginNamespace) override;

    const char* getPluginNamespace() const override;

private:
    const std::string mLayerName;
    // Not read by any member function in this class; given a default so it is
    // never left uninitialised.
    bool mAlignCorners{false};
    float mScaleFactor_h{0.f};
    float mScaleFactor_w{0.f};
    // Work size passed to the kernel wrapper; configureWithFormat() stores the
    // scaled (output) height * width here. 0 until configured.
    size_t mInputVolume{0};
    // Input (C, H, W); the scale-factor constructor sets all three to -1 until configured.
    DimsCHW mInputShape;
    std::string mNamespace;
};

// Factory registered with TensorRT for UpsamplePlugin: builds plugins from a
// "scaleFactor" field collection or from serialized bytes.
class UpsamplePluginCreator : public IPluginCreator
{
public:
    // Fills the static field description (one "scaleFactor" FLOAT32 field of length 2).
    UpsamplePluginCreator();

    const char* getPluginName() const override;

    const char* getPluginVersion() const override;

    // Returns a pointer to the static field collection mFC.
    const PluginFieldCollection* getFieldNames() override;

    // Creates a plugin from the "scaleFactor" field (height scale, then width scale).
    IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override;

    // Creates a plugin from bytes written by UpsamplePlugin::serialize().
    IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;
    
    void setPluginNamespace(const char* pluginNamespace) override;

    const char* getPluginNamespace() const override;

private:
    // Shared by all creator instances; mFC.fields points into mPluginAttributes.
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
    std::string mNamespace;
};

#endif


================================================
FILE: swin-transformer/semantic-segmentation/UpsmapleKernel.h
================================================
#ifndef UPSAMPLE_KERNEL_H
#define UPSAMPLE_KERNEL_H

#include <iostream>
#include "NvInfer.h"

// Enqueue a bilinear upsample of a float tensor on `stream`.
//
//   n         number of output positions per channel plane
//             (the plugin passes scaled height * scaled width); one CUDA
//             thread is used per position
//   input_b   batch count
//   input_c   channel count
//   input_h   input height
//   input_w   input width
//   scale_h   height scale; output height is int(input_h * scale_h)
//   scale_w   width scale; output width is int(input_w * scale_w)
//   inputs    input buffer, read as const float*
//   outputs   output buffer, written as float*
//
// Returns an int status that UpsamplePlugin::enqueue() forwards to its
// caller; 0 means success.
int UpsampleInference(
    cudaStream_t stream,
    int n,
    int input_b,
    int input_c,
    int input_h,
    int input_w,
    float scale_h,
    float scale_w,
    const void* inputs,
    void* outputs);


#endif


================================================
FILE: swin-transformer/semantic-segmentation/common.hpp
================================================
#ifndef COMMON_HPP
#define COMMON_HPP

#include "layerNorm.h"
#include "NvInfer.h"
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "cuda_runtime_api.h"
#include <assert.h>
#include <map>
#include <fstream>
#include<opencv2/core/core.hpp>
#include<opencv2/imgproc/imgproc.hpp>
#include<opencv2/imgcodecs/imgcodecs.hpp>
#include<opencv2/dnn/dnn.hpp>

using namespace nvinfer1;
// Evaluate `status` once; when the result is non-zero, print it to std::cerr
// (flushed by std::endl) and abort the process. Wrapped in do { } while (0)
// so the macro behaves as a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// Pack a list of images into one 4-D float blob of shape
// [number of images, channels, rows, cols], written to `blob_`.
//
// Per image: resize (or resize-and-centre-crop when `crop` is true) to `size`
// if it differs, convert 8-bit images to CV_32F, subtract `mean_` and divide
// by `std_`. When `swapRB` is true, elements 0 and 2 of both scalars are
// swapped before use and the first and third channel planes are exchanged in
// the blob.
//
//   images_  input images; must not be empty, and all images must end up with
//            the same size and channel count (1, 3 or 4)
//   blob_    output blob, created here as CV_32F
//   size     target size; an empty cv::Size() means "use the size of the
//            first image"
void mblobFromImages(cv::InputArrayOfArrays images_, cv::OutputArray blob_,
    cv::Size size, const cv::Scalar& mean_, const cv::Scalar& std_, bool swapRB, bool crop)
{
    //CV_TRACE_FUNCTION();
    std::vector<cv::Mat> images;
    images_.getMatVector(images);
    CV_Assert(!images.empty());
    for (int i = 0; i < images.size(); i++)
    {
        cv::Size imgSize = images[i].size();
        // `size` is a by-value parameter: once it takes the first image's
        // size, later images are resized to match it.
        if (size == cv::Size())
            size = imgSize;
        if (size != imgSize)
        {
            if (crop)
            {
                // Scale so the image covers the target in both directions,
                // then cut the centred target-sized rectangle out of it.
                float resizeFactor = std::max(size.width / (float)imgSize.width,
                    size.height / (float)imgSize.height);
                resize(images[i], images[i], cv::Size(), resizeFactor, resizeFactor, cv::INTER_LINEAR);
                cv::Rect crop(cv::Point(0.5 * (images[i].cols - size.width),
                    0.5 * (images[i].rows - size.height)),
                    size);
                images[i] = images[i](crop);
            }
            else
                resize(images[i], images[i], size, 0, 0, cv::INTER_LINEAR);
        }
        // Only 8-bit input is converted; any other depth must already be
        // CV_32F (asserted when the blob is filled below).
        if (images[i].depth() == CV_8U)
            images[i].convertTo(images[i], CV_32F);
        cv::Scalar mean = mean_;
        cv::Scalar std_num = std_;
        if (swapRB)
        {
            std::swap(mean[0], mean[2]);
            std::swap(std_num[0], std_num[2]);
        }

        // Per-channel normalisation: (pixel - mean) / std.
        images[i] -= mean;
        images[i] /= std_num;
    }

    size_t i, nimages = images.size();
    cv::Mat image0 = images[0];
    int nch = image0.channels();
    CV_Assert(image0.dims == 2);
    cv::Mat image;
    if (nch == 3 || nch == 4)
    {
        int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
        blob_.create(4, sz, CV_32F);
        cv::Mat blob = blob_.getMat();
        cv::Mat ch[4];

        for (i = 0; i < nimages; i++)
        {
            image = images[i];
            CV_Assert(image.depth() == CV_32F);
            nch = image.channels();
            CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
            CV_Assert(image.size() == image0.size());

            // Each ch[j] is a header over plane j of image i inside the blob,
            // so split() writes the channels straight into the blob. Swapping
            // the headers first makes channel 0 land in plane 2 and vice versa.
            for (int j = 0; j < nch; j++)
                ch[j] = cv::Mat(image.rows, image.cols, CV_32F, blob.ptr((int)i, j));
            if (swapRB)
                std::swap(ch[0], ch[2]);
            split(image, ch);
        }
    }
    else
    {
        CV_Assert(nch == 1);
        int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
        blob_.create(4, sz, CV_32F);
        cv::Mat blob = blob_.getMat();

        for (i = 0; i < nimages; i++)
        {
            cv::Mat image = images[i];
            CV_Assert(image.depth() == CV_32F);
            nch = image.channels();
            CV_Assert(image.dims == 2 && (nch == 1));
            CV_Assert(image.size() == image0.size());

            // Single-channel image: copy it into plane 0 of image i in the blob.
            image.copyTo(cv::Mat(image.rows, image.cols, CV_32F, blob.ptr((int)i, 0)));
        }
    }
}
// Convenience wrapper around mblobFromImages that returns the packed blob by
// value instead of writing it through an output argument.
cv::Mat BlobFromImages(cv::InputArrayOfArrays images, cv::Size size,
    const cv::Scalar& mean, const cv::Scalar& std_num, bool swapRB, bool crop)
{
    cv::Mat packed;
    mblobFromImages(images, packed, size, mean, std_num, swapRB, crop);
    return packed;
}
// Print "<head> : d0 d1 ... " followed by a newline, where d0.. are the
// dimensions of `input_tensor`.
void debug_print(ITensor *input_tensor,std::string head)
{
    std::cout << head<< " : ";

    const Dims dims = input_tensor->getDimensions();
    for (int axis = 0; axis < dims.nbDims; ++axis)
    {
        std::cout << dims.d[axis] << " ";
    }
    std::cout<<std::endl;
}
// Load a text weight file into a name -> Weights map.
//
// File layout as parsed here: a decimal blob count, then for each blob a name,
// a decimal element count and that many hexadecimal 32-bit words.
// Every blob is stored as DataType::kFLOAT. The value buffers are allocated
// with malloc and are not freed in this function.
// NOTE(review): callers presumably free the buffers after the engine is built — confirm.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // A counted loop (rather than `while (count--)`) cannot run away when the
    // count is negative and assertions are compiled out.
    for (int32_t blob = 0; blob < count; ++blob)
    {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        assert(input && "Malformed weight blob header.");
        wt.type = DataType::kFLOAT;

        // Load blob. The buffer holds `size` 32-bit words, so it is sized by
        // the element type, not by the size of the pointer.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Layer normalisation built around the "layerNorm_trt" plugin.
//
// The plugin is given `input` and its two outputs are used as follows:
// output 0 is subtracted from the input and the result is divided by output 1.
// NOTE(review): presumably output 0 is the mean and output 1 the standard
// deviation — confirm against layerNorm.h.
// The normalised tensor is then multiplied by `<lname>.weight` and offset by
// `<lname>.bias`, both broadcast along every axis except the last.
// Returns the output tensor of the final addition.
ITensor* m_layerNorm(INetworkDefinition *m_Network,std::map<std::string, Weights> weightMap,ITensor *input, string lname)
{
    auto creator = getPluginRegistry()->getPluginCreator("layerNorm_trt","1");
    assert(creator && "layerNorm_trt plugin creator is not registered.");

    const PluginFieldCollection* pluginData = creator->getFieldNames();
    IPluginV2 *pluginObj = creator->createPlugin(lname.c_str(), pluginData);
    assert(pluginObj);

    ITensor* inputTensors[] = {input};
    auto ln_ms = m_Network->addPluginV2(inputTensors, 1, *pluginObj);
    assert(ln_ms);
    auto ln_m = m_Network->addElementWise(*input,*ln_ms->getOutput(0),ElementWiseOperation::kSUB);
    assert(ln_m);
    auto ln = m_Network->addElementWise(*ln_m->getOutput(0),*ln_ms->getOutput(1),ElementWiseOperation::kDIV);
    assert(ln);

    Weights W = weightMap[lname + ".weight"];
    int len = W.count;

    // Constant shape [1, ..., 1, len]: same rank as the normalised tensor so
    // the element-wise layers can broadcast it over the leading axes.
    Dims wb ;
    wb.nbDims = ln->getOutput(0)->getDimensions().nbDims;
    for (int i = 0 ; i < wb.nbDims; i++)
    {
        wb.d[i] = (i != wb.nbDims - 1) ? 1 : len;
    }

    auto wgts = m_Network->addConstant(wb,W);
    assert(wgts);
    auto p_w = m_Network->addElementWise(*ln->getOutput(0),*wgts->getOutput(0),ElementWiseOperation::kPROD);
    assert(p_w);
    Weights B = weightMap[lname + ".bias"];
    auto bias = m_Network->addConstant(wb,B);
    assert(bias);
    auto sum_bias = m_Network->addElementWise(*p_w->getOutput(0),*bias->getOutput(0),ElementWiseOperation::kSUM);
    assert(sum_bias);
    debug_print(sum_bias->getOutput(0),lname);
    return sum_bias->getOutput(0);
}
// Layer normalisation assembled from built-in TensorRT layers:
//   (x - mean(x)) / sqrt(mean((x - mean(x))^2) + 1e-5)
// followed by an element-wise scale layer that applies `<lname>.weight` as the
// scale and `<lname>.bias` as the shift (power fixed to 1).
// Both reductions use reduce-axes mask 2 with keepDimensions = true.
// Returns the output tensor of the scale layer.
ITensor* layerNorm(INetworkDefinition *m_Network,std::map<std::string, Weights> weightMap,ITensor *input, string lname)
{
    auto mean = m_Network->addReduce(*input, ReduceOperation::kAVG, 2, true);
    assert(mean);

    auto sub_mean = m_Network->addElementWise(*input, *mean->getOutput(0), ElementWiseOperation::kSUB);
    assert(sub_mean);

    // Square the centred values by multiplying the tensor with itself.
    auto pow2 = m_Network->addElementWise(*sub_mean->getOutput(0), *sub_mean->getOutput(0), ElementWiseOperation::kPROD);
    assert(pow2);
    debug_print(pow2->getOutput(0),"pow2");
    auto pow_mean = m_Network->addReduce(*pow2->getOutput(0), ReduceOperation::kAVG, 2, true);
    assert(pow_mean);
    debug_print(pow_mean->getOutput(0),"pow_mean");

    // TensorRT keeps only a pointer to weight data until the engine has been
    // built, so the epsilon must outlive this function. A stack local would
    // dangle as soon as we return; static storage stays valid.
    static const float E = 1e-5;
    Weights EPS{DataType::kFLOAT,nullptr,1};
    EPS.values = &E;
    auto eps = m_Network->addConstant(Dims2{1,1}, EPS);
    assert(eps);

    auto add_eps = m_Network->addElementWise(*pow_mean->getOutput(0), *eps->getOutput(0), ElementWiseOperation::kSUM);
    assert(add_eps);

    auto sqrt = m_Network->addUnary(*add_eps->getOutput(0), UnaryOperation::kSQRT);
    assert(sqrt);

    auto div = m_Network->addElementWise(*sub_mean->getOutput(0), *sqrt->getOutput(0), ElementWiseOperation::kDIV);
    assert(div);
    debug_print(div->getOutput(0),"div");

    string weightsFile = lname + ".weight";
    string biasFile = lname + ".bias";

    int d_model = input->getDimensions().d[input->getDimensions().nbDims - 1];
    cout<<"d_model = "<<d_model<<endl;

    // Power weights of 1.0 for the scale layer. Heap-allocated and not freed
    // here because, like the epsilon above, the data must stay valid while
    // the network still references it.
    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * d_model));
    for (int i = 0; i < d_model; i++) {
        pval[i] = 1.0;
    }
    Weights norm1_power{ DataType::kFLOAT, pval, d_model };
    auto affine = m_Network->addScaleNd(
        *div->getOutput(0),
        ScaleMode::kELEMENTWISE,
        weightMap[biasFile],
        weightMap[weightsFile],
        norm1_power,1);
    assert(affine);
    return affine->getOutput(0);
}
// Appends a square 2-D convolution (kernel k x k, stride s, padding p, one group) to the network.
//
// weightMap : must contain "<lname>.weight" and, when `bias` is true, "<lname>.bias".
// c_out     : number of output feature maps.
// bias      : when false an empty bias Weights object is passed to the layer.
// Returns the convolution output tensor.
ITensor* conv(INetworkDefinition *m_Network,std::map<std::string, Weights> weightMap,ITensor *input, string lname,
              int c_out,bool bias = true,int k = 4 , int s = 4, int p = 0)
{
    Weights biasWeights{ DataType::kFLOAT, nullptr, 0 };
    if (bias)
    {
        biasWeights = weightMap[lname + ".bias"];
    }
    const Weights kernelWeights = weightMap[lname + ".weight"];

    auto layer = m_Network->addConvolutionNd(*input, c_out, Dims2{k, k}, kernelWeights, biasWeights);
    layer->setStrideNd(Dims2{s, s});
    layer->setPaddingNd(Dims2{p, p});
    layer->setNbGroups(1);

    ITensor* result = layer->getOutput(0);
    debug_print(result, lname);
    return result;
}
// Appends a shuffle layer that only reshapes `input` to `reshapeDims` and returns its output.
ITensor* shuffle_reshape(INetworkDefinition *m_Network,ITensor *input,Dims reshapeDims)
{
    auto layer = m_Network->addShuffle(*input);
    layer->setReshapeDimensions(reshapeDims);

    ITensor* reshaped = layer->getOutput(0);
    debug_print(reshaped, "reshape");
    return reshaped;
}
// Appends a shuffle layer that only applies `permutation` (as the first transpose) to `input`
// and returns its output.
ITensor* shuffle_permute(INetworkDefinition *m_Network,ITensor *input,Permutation permutation)
{
    auto layer = m_Network->addShuffle(*input);
    layer->setFirstTranspose(permutation);

    ITensor* permuted = layer->getOutput(0);
    debug_print(permuted, "permute");
    return permuted;
}
// Appends a single shuffle layer that combines a reshape with one transpose.
//
// firstReshape == true  : the permutation is installed as the second transpose of the layer.
// firstReshape == false : the permutation is installed as the first transpose of the layer.
// Returns the shuffle output tensor.
ITensor* shuffle_reshapeApermute(INetworkDefinition *m_Network,ITensor *input,Dims reshapeDims,
                                 Permutation permutation,bool firstReshape)
{
    auto layer = m_Network->addShuffle(*input);
    layer->setReshapeDimensions(reshapeDims);

    if (!firstReshape)
    {
        layer->setFirstTranspose(permutation);
    }
    else
    {
        layer->setSecondTranspose(permutation);
    }

    ITensor* shuffled = layer->getOutput(0);
    debug_print(shuffled, "shuffle");
    return shuffled;
}
// Builds the attention mask used by shifted-window blocks.
//
// Steps performed:
//   1. Create an hw x hw constant in which every cell holds a region label 0..8. Each axis is
//      split into three bands: [0, hw-window_size), [hw-window_size, hw-shift_size) and
//      [hw-shift_size, hw); the label is 3 * rowBand + colBand.
//   2. Reshape the label image into windows of window_size x window_size cells.
//   3. Subtract the window labels from themselves pairwise (a [-1, 1, ws*ws] view minus a
//      [-1, ws*ws, 1] view).
//   4. Feed the difference to the "fillmaskLayer_TRT" plugin and return that plugin's output.
//
// hw          : side length of the (square) label image.
// window_size : window side length; hw is divided by it with integer division.
// shift_size  : size of the last band along each axis.
ITensor* trt_transform_imgMask(INetworkDefinition *m_Network,int hw, int window_size, int shift_size)
{
    const int Hp = hw;
    const int Wp = hw;

    // Band index of a coordinate along one axis. Every coordinate falls into exactly one of the
    // three bands, so no "unclassified" case exists.
    auto band = [window_size, shift_size](int pos, int extent) -> int {
        if (pos < extent - window_size)
            return 0;
        if (pos < extent - shift_size)
            return 1;
        return 2;
    };

    // Heap buffer is intentionally not freed here: the constant layer references it.
    float *mask_param = new float[Hp*Wp];
    for (int i = 0; i < Hp; i++)
    {
        const int rowBand = band(i, Hp);
        for (int j = 0; j < Wp; j++)
        {
            mask_param[i*Wp + j] = static_cast<float>(3 * rowBand + band(j, Wp));
        }
    }
    Weights Mask_param{DataType::kFLOAT, mask_param, Hp*Wp};

    auto img_mask = m_Network->addConstant(Dims4{1,Hp,Wp,1},Mask_param);

    // [1, Hp, Wp, 1] -> [1, Hp/ws, ws, Wp/ws, ws, 1] -> swap axes 2 and 3 to group by window.
    auto img_mask_shuffle = m_Network->addShuffle(*img_mask->getOutput(0));
    Dims shuffle1_dims;
    shuffle1_dims.nbDims = 6;
    const int dims[] = {1,Hp/window_size,window_size,Wp/window_size,window_size,1};
    for (int i = 0; i < 6; i++)
        shuffle1_dims.d[i] = dims[i];
    img_mask_shuffle->setReshapeDimensions(shuffle1_dims);
    img_mask_shuffle->setSecondTranspose(Permutation{0,1,3,2,4,5});

    // Two views of the same windowed labels, broadcast against each other by the subtraction.
    auto img_mask_shuffle2 = m_Network->addShuffle(*img_mask_shuffle->getOutput(0));
    img_mask_shuffle2->setReshapeDimensions(Dims3{-1,1,window_size*window_size});
    auto img_mask_shuffle3 = m_Network->addShuffle(*img_mask_shuffle->getOutput(0));
    img_mask_shuffle3->setReshapeDimensions(Dims3{-1,window_size*window_size,1});
    auto atten_mask = m_Network->addElementWise(*img_mask_shuffle2->getOutput(0),*img_mask_shuffle3->getOutput(0),ElementWiseOperation::kSUB);

    auto creator = getPluginRegistry()->getPluginCreator("fillmaskLayer_TRT", "1");
    // Fail early with a diagnosable assertion if the plugin has not been registered.
    assert(creator);
    const PluginFieldCollection* pluginData = creator->getFieldNames();
    IPluginV2 *pluginObj = creator->createPlugin("fillmask", pluginData);
    assert(pluginObj);
    ITensor* inputTensors[] = {atten_mask->getOutput(0)};
    auto fillmask = m_Network->addPluginV2(inputTensors, 1, *pluginObj);

    debug_print(fillmask->getOutput(0),"imgMask");
    return fillmask->getOutput(0);
}
// Zero-pads the first two dimensions of `input` up to the next multiple of `window_size`.
// The padding is appended at the end of axis 0 first, then at the end of axis 1, by
// concatenating zero-filled constants. The three leading input dimensions are read as h, w, c.
// Returns `input` itself when no padding is needed.
ITensor* trt_transform_pad(INetworkDefinition *m_Network,ITensor *input,int window_size)
{
    const Dims inDims = input->getDimensions();
    const int h = inDims.d[0];
    const int w = inDims.d[1];
    const int c = inDims.d[2];
    const int pad_h = (window_size - h%window_size)%window_size;
    const int pad_w = (window_size - w%window_size)%window_size;

    ITensor* result = input;

    if (pad_h != 0)
    {
        const int rowCount = pad_h*w*c;
        cout<<rowCount<<endl;
        // Value-initialised (all zeros); kept alive because the constant layer references it.
        float *rowZeros = new float[rowCount]();
        Weights rowWeights{DataType::kFLOAT, rowZeros, rowCount};
        auto rowPad = m_Network->addConstant(Dims3{pad_h,w,c}, rowWeights);

        ITensor *rowParts[2] = {result, rowPad->getOutput(0)};
        auto rowCat = m_Network->addConcatenation(rowParts, 2);
        rowCat->setAxis(0);
        result = rowCat->getOutput(0);
    }

    if (pad_w != 0)
    {
        // The column padding must span the already padded height.
        const int paddedH = h + pad_h;
        const int colCount = pad_w*paddedH*c;
        cout<<colCount<<endl;
        float *colZeros = new float[colCount]();
        Weights colWeights{DataType::kFLOAT, colZeros, colCount};
        auto colPad = m_Network->addConstant(Dims3{paddedH,pad_w,c}, colWeights);

        ITensor *colParts[] = {result, colPad->getOutput(0)};
        auto colCat = m_Network->addConcatenation(colParts, 2);
        colCat->setAxis(1);
        result = colCat->getOutput(0);
    }

    debug_print(result, "pad");
    return result;
}
// Cyclically shifts `input` along several axes using slice + concatenation layers.
//
// shifts : shift per axis. A positive shift moves the last `shift` elements of the axis to the
//          front; a negative shift moves the first `|shift|` elements to the back. Shifts are
//          reduced modulo the axis extent, so a shift that is a multiple of the extent is a no-op.
// dims   : axis per shift, ONE-based (the code uses dims[i] - 1 as the TensorRT axis).
//
// Returns the rolled tensor, `input` itself if every shift is a no-op, or nullptr (after
// printing a message) when `dims` is shorter than `shifts`, an axis is out of range, or the
// axis extent is not a positive static size.
ITensor* trt_swinRoll(INetworkDefinition *m_Network,ITensor *input,vector<int> shifts, vector<int> dims)
{
    if (dims.size() < shifts.size())
    {
        cout<<"trt_swinRoll: every shift needs a matching axis"<<endl;
        return nullptr;
    }

    const int len = shifts.size();
    const Dims input_dim = input->getDimensions();
    const int nbdims = input_dim.nbDims;
    ITensor *temp = input;

    for (int i = 0; i < len; i++)
    {
        const int axis = dims[i] - 1;
        if (axis < 0 || axis >= nbdims || input_dim.d[axis] <= 0)
        {
            cout<<"trt_swinRoll: cannot roll axis "<<dims[i]<<endl;
            return nullptr;
        }
        const int extent = input_dim.d[axis];

        // Number of trailing elements that wrap around to the front, in [0, extent).
        // For a negative shift s this is extent - |s|, which yields the same two pieces
        // (first |s| elements, remaining elements) as slicing at |s|.
        const int offset = ((shifts[i] % extent) + extent) % extent;
        if (offset == 0)
            continue;

        // Start from a full-tensor slice and narrow only the rolled axis.
        Dims start, size, stride;
        start.nbDims = nbdims;
        size.nbDims = nbdims;
        stride.nbDims = nbdims;
        for (int j = 0; j < nbdims; j++)
        {
            start.d[j] = 0;
            size.d[j] = input_dim.d[j];
            stride.d[j] = 1;
        }

        // Leading piece: elements [0, extent - offset).
        size.d[axis] = extent - offset;
        auto head = m_Network->addSlice(*temp,start,size,stride);
        // Debug output is emitted for negative shifts only, as before.
        if (shifts[i] < 0)
            debug_print(head->getOutput(0), "cat1 dims : ");

        // Trailing piece: elements [extent - offset, extent).
        start.d[axis] = extent - offset;
        size.d[axis] = offset;
        auto tail = m_Network->addSlice(*temp,start,size,stride);
        if (shifts[i] < 0)
            debug_print(tail->getOutput(0), "cat2 dims : ");

        // Rolled result = trailing piece followed by leading piece.
        ITensor *pieces[] = {tail->getOutput(0), head->getOutput(0)};
        auto joined = m_Network->addConcatenation(pieces,2);
        joined->setAxis(axis);
        temp = joined->getOutput(0);
    }
    return temp;
}
// Splits a tensor whose first three dimensions are read as (h, w, c) into windows.
// The tensor is reshaped to [h/ws, ws, w/ws, ws, c], axes 1 and 2 are swapped, and the
// result is flattened to [-1, ws*ws, c]. h and w are divided by window_size with integer
// division.
ITensor* trt_transform_window_partition(INetworkDefinition *m_Network,ITensor *input,int window_size)
{
    auto gridShuffle = m_Network->addShuffle(*input);

    const Dims inDims = input->getDimensions();
    const int h = inDims.d[0];
    const int w = inDims.d[1];
    const int c = inDims.d[2];

    Dims gridDims;
    gridDims.nbDims = 5;
    gridDims.d[0] = h/window_size;
    gridDims.d[1] = window_size;
    gridDims.d[2] = w/window_size;
    gridDims.d[3] = window_size;
    gridDims.d[4] = c;

    gridShuffle->setReshapeDimensions(gridDims);
    gridShuffle->setSecondTranspose(Permutation{0,2,1,3,4});
    ITensor* grouped = gridShuffle->getOutput(0);
    debug_print(grouped," shuffle1 dims : ");

    auto flatShuffle = m_Network->addShuffle(*grouped);
    flatShuffle->setReshapeDimensions(Dims3{-1,window_size*window_size,c});
    ITensor* windows = flatShuffle->getOutput(0);

    debug_print(windows, "window partition");
    return windows;
}
// Linear (fully connected) layer built from a matrix multiply plus an optional bias add:
//     out = input x W^T (+ b)
// The weight constant is shaped [1, ..., 1, len(W)/c, c] and the bias constant
// [1, ..., 1, 1, len(b)], both with the same rank as `input`, where c is the size of the last
// input dimension. Ranks 2 and 3 produce exactly the shapes used before; higher ranks now get
// fully initialised leading 1-dimensions.
//
// weightMap : must contain "<lname>.weight" and, when `bias` is true, "<lname>.bias".
// bias      : when false only the matrix product is returned.
// Returns the output tensor, or nullptr if required weights are missing and assertions are
// compiled out.
ITensor* trt_swinLinear(INetworkDefinition *m_Network,std::map<std::string, Weights> weightMap,
                        ITensor *input, string lname, bool bias = true)
{
    const Dims inDims = input->getDimensions();
    const int rank = inDims.nbDims;
    // Two trailing dimensions are needed to describe the weight matrix.
    assert(rank >= 2);
    const int c = inDims.d[rank - 1];
    assert(c > 0);

    string fc_wpath = lname + ".weight";
    Weights fcW = weightMap[fc_wpath];
    int len_fcw = fcW.count;
    if(len_fcw == 0)
    {
        cout<<"file is not open,please check it's path: "<<fc_wpath<<endl;
        assert(0);
        return nullptr;
    }

    // Builds [1, ..., 1, rows, cols] with the rank of the input.
    auto trailingMatrixDims = [rank](int rows, int cols) -> Dims {
        Dims d;
        d.nbDims = rank;
        for (int i = 0; i < rank - 2; i++)
            d.d[i] = 1;
        d.d[rank - 2] = rows;
        d.d[rank - 1] = cols;
        return d;
    };

    const Dims fcWdims = trailingMatrixDims(len_fcw/c, c);
    auto fc_w_constant = m_Network->addConstant(fcWdims,fcW);
    // Weights are stored as [out, in], hence the transpose on the second operand.
    auto fc_w_mm = m_Network->addMatrixMultiply(*input,MatrixOperation::kNONE,
                                                *fc_w_constant->getOutput(0),MatrixOperation::kTRANSPOSE);

    if(!bias)
    {
        cout<<lname<<" bias is Null!"<<endl;
        debug_print(fc_w_mm->getOutput(0),lname);
        return fc_w_mm->getOutput(0);
    }

    string fc_bpath = lname +".bias";
    Weights fcB = weightMap[fc_bpath];
    int len_fcb = fcB.count;
    if(len_fcb == 0)
    {
        // A bias was requested but is absent from the weight map.
        cout<<"bias weights not found,please check it's path: "<<fc_bpath<<endl;
        assert(0);
        return nullptr;
    }

    const Dims fcBdims = trailingMatrixDims(1, len_fcb);
    auto fc_b_constant = m_Network->addConstant(fcBdims,fcB);
    auto fc = m_Network->addElementWise(*fc_w_mm->getOutput(0),*fc_b_constant->getOutput(0),ElementWiseOperation::kSUM);
    debug_print(fc->getOutput(0),lname);
    return fc->getOutput(0);
}
// Window multi-head self-attention built from TensorRT primitives.
//
// Pipeline:
//   1. qkv = Linear("<lname>.qkv")(input); reshaped to [b, n, 3, heads, c/heads] and permuted
//      to [3, b, heads, n, c/heads]; q, k and v are the three slices along axis 0.
//   2. attn = (q * scale) x k^T, with scale = 1 / sqrt(dim / num_heads).
//   3. A relative position bias is gathered from "<lname>.relative_position_bias_table" using
//      "<lname>.relative_position_index" (stored as floats, converted to INT32 here) and added.
//   4. If `mask` is not null it is reshaped to insert a size-1 axis at position 1 and added.
//   5. Softmax over axis 3 (axes bit mask 8), multiply by v, reshape to [b, n, c] and apply
//      Linear("<lname>.proj").
//
// input : the first three dimensions are read as (b, n, c).
//         NOTE(review): presumably (num_windows, window_size^2, channels) -- confirm with caller.
// shift_size is accepted for interface compatibility and not used here.
// Returns the output tensor of the projection layer.
ITensor* trt_trainsform_WindowAttention(INetworkDefinition *m_Network,std::map<std::string, Weights> weightMap,ITensor *input,
                                        ITensor* mask,string lname,int dim, int num_heads,int window_size, int shift_size)
{
    const int b = input->getDimensions().d[0];
    const int n = input->getDimensions().d[1];
    const int c = input->getDimensions().d[2];

    auto qkv = trt_swinLinear(m_Network,weightMap,input,lname+".qkv");

    Dims qkv_dim;
    qkv_dim.nbDims = 5;
    const int d[5] = {b,n,3,num_heads,c/num_heads};
    for(int i = 0; i < 5; i++)
        qkv_dim.d[i] = d[i];
    Permutation qkv_p;
    const int p[5] = {2, 0, 3, 1, 4};
    for(int i = 0; i < 5; i++)
        qkv_p.order[i] = p[i];
    auto qkv_shuffle = shuffle_reshapeApermute(m_Network,qkv,qkv_dim,qkv_p,true);

    // Slice q / k / v out of axis 0; all other axes are taken in full.
    const Dims qkvDims = qkv_shuffle->getDimensions();
    Dims sizes, stride;
    sizes.nbDims = 5;
    stride.nbDims = 5;
    sizes.d[0] = 1;
    stride.d[0] = 1;
    for(int i = 1; i < 5; i++)
    {
        sizes.d[i] = qkvDims.d[i];
        stride.d[i] = 1;
    }
    auto sliceAt = [&](int which) {
        Dims start;
        start.nbDims = 5;
        start.d[0] = which;
        for(int i = 1; i < 5; i++)
            start.d[i] = 0;
        return m_Network->addSlice(*qkv_shuffle,start,sizes,stride);
    };
    auto q = sliceAt(0);
    auto k = sliceAt(1);
    auto v = sliceAt(2);

    // q * scale; the single scale value is broadcast through an all-ones 5-D constant shape.
    // The buffer stays allocated because the constant layer references it.
    float *scale = new float[1];
    scale[0] = 1 / sqrt(dim/num_heads);
    Weights scale_w{DataType::kFLOAT,scale,1};
    Dims scale_dim;
    scale_dim.nbDims = 5;
    for(int i = 0 ; i < 5; i++)
        scale_dim.d[i] = 1;
    auto Scale = m_Network->addConstant(scale_dim,scale_w);
    auto qs = m_Network->addElementWise(*q->getOutput(0),*Scale->getOutput(0),ElementWiseOperation::kPROD);

    // Drop the leading size-1 axis so q, k and v become 4-D.
    const Dims4 headDims{qkvDims.d[1],qkvDims.d[2],qkvDims.d[3],qkvDims.d[4]};
    auto qs_ = m_Network->addShuffle(*qs->getOutput(0));
    qs_->setReshapeDimensions(headDims);
    auto k_ = m_Network->addShuffle(*k->getOutput(0));
    k_->setReshapeDimensions(headDims);
    auto attn = m_Network->addMatrixMultiply(*qs_->getOutput(0),MatrixOperation::kNONE,
                                             *k_->getOutput(0),MatrixOperation::kTRANSPOSE);

    // Relative position bias: table rows are selected by the flattened index tensor.
    auto relatbias = m_Network->addConstant(Dims2{(2*window_size -1)*(2*window_size -1),num_heads},weightMap[lname + ".relative_position_bias_table"]);
    Dims r_i_dims;
    r_i_dims.nbDims = 1;
    r_i_dims.d[0] = window_size*window_size * window_size*window_size;

    // Look the index weights up once instead of once per element, and make sure enough
    // values are present before reading them.
    const Weights indexWeights = weightMap[lname+".relative_position_index"];
    assert(indexWeights.values != nullptr && indexWeights.count >= r_i_dims.d[0]);
    const float* indexAsFloat = static_cast<const float*>(indexWeights.values);
    int* idx = new int[r_i_dims.d[0]];
    for (int i = 0; i < r_i_dims.d[0]; i++) {
        idx[i] = (int)indexAsFloat[i];
    }
    Weights index{DataType::kINT32,idx,r_i_dims.d[0]};
    auto relatidx = m_Network->addConstant(r_i_dims,index);
    auto relat = m_Network->addGather(*relatbias->getOutput(0),*relatidx->getOutput(0),0);
    auto relat_view = shuffle_reshapeApermute(m_Network,relat->getOutput(0),
                                              Dims4{1,window_size*window_size,window_size*window_size,-1},
                                              Permutation{0,3,1,2},true);
    auto attn_rv = m_Network->addElementWise(*attn->getOutput(0),*relat_view,ElementWiseOperation::kSUM);
    ITensor *Attn_rv = attn_rv->getOutput(0);

    if (mask != nullptr)
    {
        // Insert a size-1 axis at position 1 so the mask broadcasts over that axis of attn.
        const Dims rawMaskDims = mask->getDimensions();
        Dims maskdims;
        maskdims.nbDims = rawMaskDims.nbDims + 1;
        maskdims.d[0] = rawMaskDims.d[0];
        maskdims.d[1] = 1;
        for(int i = 2; i < maskdims.nbDims; i++)
        {
            maskdims.d[i] = rawMaskDims.d[i-1];
        }
        auto maskshuffle = m_Network->addShuffle(*mask);
        maskshuffle->setReshapeDimensions(maskdims);
        auto attn_rnM = m_Network->addElementWise(*attn_rv->getOutput(0),*maskshuffle->getOutput(0),ElementWiseOperation::kSUM);
        Attn_rv = attn_rnM->getOutput(0);
    }

    auto attn_rv_s = m_Network->addSoftMax(*Attn_rv);
    // Axes argument is a bit mask: 8 == 1 << 3, i.e. axis 3.
    attn_rv_s->setAxes(8);

    auto v_ = m_Network->addShuffle(*v->getOutput(0));
    v_->setReshapeDimensions(headDims);
    auto attn_v = m_Network->addMatrixMultiply(*attn_rv_s->getOutput(0),MatrixOperation::kNONE,
                                               *v_->getOutput(0),MatrixOperation::kNONE);

    // Permute {0,2,1,3} first, then merge the two trailing axes back into c.
    auto x_reshape = shuffle_reshapeApermute(m_Network,attn_v->getOutput(0),Dims3{b,n,c},Permutation{0,2,1,3},false);
    auto x_linear = trt_swinLinear(m_Network,weightMap,x_reshape,lname+".proj");
    return x_linear;
}
// Reassembles windows into a single map: the input is viewed as
// [H/ws, W/ws, ws, ws, -1], axes 1 and 2 are swapped, and the result is reshaped to [H, W, -1].
ITensor* trt_window_reverse(INetworkDefinition *m_Network, ITensor *input, int window_size, int H, int W)
{
    Dims windowGrid;
    windowGrid.nbDims = 5;
    windowGrid.d[0] = H/window_size;
    windowGrid.d[1] = W/window_size;
    windowGrid.d[2] = window_size;
    windowGrid.d[3] = window_size;
    windowGrid.d[4] = -1;

    ITensor* gridView = shuffle_reshape(m_Network, input, windowGrid);

    // firstReshape == false: the permutation runs before the final reshape.
    return shuffle_reshapeApermute(m_Network, gridView, Dims3{H,W,-1}, Permutation{0,2,1,3,4}, false);
}
// Appends the "geluLayer_TRT" (version "1") plugin to the network with `input` as its only
// input and returns the plugin's first output.
ITensor* gelu(INetworkDefinition *m_Network,ITensor *input)
{
    auto geluCreator = getPluginRegistry()->getPluginCreator("geluLayer_TRT", "1");
    const PluginFieldCollection* fields = geluCreator->getFieldNames();
    IPluginV2 *plugin = geluCreator->createPlugin("gelu", fields);

    ITensor* pluginInputs[] = {input};
    auto layer = m_Network->addPluginV2(pluginInputs, 1, *plugin);
    return layer->getOutput(0);
}
//ITensor* adaptiveAvgPool2d(INetworkDefinition *m_Network,ITensor *input)
//{
//    auto creator = getPluginRegistry()->getPluginCreator("adaptiveAvgPooling_TRT", "1");
//    const PluginFieldCollection* pluginData = creator->getFieldNames();
//    IPluginV2 *pluginObj = creator->createPlugin("apAvgPool", pluginData);
//    ITensor* inputTensors[] = {input};
//    auto g = m_Network->addPluginV2(inputTensors, 1, *pluginObj);
//    return g->getOutput(0);
//}
// Two-layer MLP: Linear("<lname>.fc1") -> GELU plugin -> Linear("<lname>.fc2").
// Both linear layers are built with trt_swinLinear; `dim` and `mlp_ratio` are not used by the
// current implementation.
ITensor* trt_transform_mlp(INetworkDefinition *m_Network,std::map<std::string, Weights> weightMap,ITensor *input,
                           string lname,int dim,int mlp_ratio = 4)
{
    ITensor* hidden = trt_swinLinear(m_Network, weightMap, input, lname+".fc1");
    ITensor* activated = gelu(m_Network, hidden);
    return trt_swinLinear(m_Network, weightMap, activated, lname+".fc2");
}
// One transformer block: norm1 -> (shifted) window attention -> residual -> norm2 -> MLP -> residual.
//
// input       : token tensor; it is reshaped to [hw, hw, c] where c is its last dimension, so it
//               must hold hw*hw tokens.
// mask        : attention mask, only used when shift_size > 0.
// lname       : weight-name prefix ("<lname>.norm1", ".attn", ".norm2", ".mlp").
// hw          : side length of the square token grid.
// window_size : attention window side; the grid is zero-padded up to a multiple of it.
// shift_size  : cyclic shift applied before attention and undone afterwards. It is now used
//               directly for the roll instead of a hard-coded 3, so the roll always matches
//               the value passed by the caller.
// mlp_ratio   : not used by the current implementation.
// Returns the block output, shaped [hw*hw, c].
ITensor* blk(INetworkDefinition *m_Network,std::map<std::string, Weights> weightMap,ITensor *input, ITensor* mask, string lname,
             int hw,int dim, int num_heads,int window_size,int shift_size,int mlp_ratio = 4)
{
    int c = input->getDimensions().d[input->getDimensions().nbDims - 1];
    auto x = input;
    auto norm1 = m_layerNorm(m_Network,weightMap,x,lname+".norm1");
    auto view1 = shuffle_reshape(m_Network,norm1,Dims3{hw,hw,c});
    auto pad = trt_transform_pad(m_Network,view1,window_size);
    // Padded grid size; needed to undo the window partition later.
    int hp = pad->getDimensions().d[0];
    int wp = pad->getDimensions().d[1];

    ITensor* shifted_x = pad;
    ITensor* atten_mask = nullptr;
    if(shift_size > 0)
    {
        // trt_swinRoll takes one-based axes: {1,2} are the two spatial axes of [h, w, c].
        shifted_x = trt_swinRoll(m_Network,pad,{-shift_size,-shift_size},{1,2});
        atten_mask = mask;
    }

    auto x_windows = trt_transform_window_partition(m_Network,shifted_x,window_size);
    auto x_atten_windows = trt_trainsform_WindowAttention(m_Network,weightMap,x_windows,atten_mask,lname+".attn",dim,num_heads,
                                                          window_size,shift_size);
    auto x_atten_windows_view = shuffle_reshape(m_Network,x_atten_windows,Dims4{-1,window_size,window_size,c});

    shifted_x = trt_window_reverse(m_Network,x_atten_windows_view,window_size,hp,wp);
    if(shift_size > 0)
    {
        // Undo the cyclic shift applied before attention.
        x = trt_swinRoll(m_Network,shifted_x,{shift_size,shift_size},{1,2});
    }
    else {
        x = shifted_x;
    }

    // Crop away the zero padding added by trt_transform_pad.
    if(hw < hp){
        auto sss = m_Network->addSlice(*x,Dims3{0,0,0},Dims3{hw,hw,c},Dims3{1,1,1});
        x = sss->getOutput(0);
    }
    x = shuffle_reshape(m_Network,x,Dims2{hw*hw,c});

    // First residual connection.
    x = m_Network->addElementWise(*x,*input,ElementWiseOperation::kSUM)->getOutput(0);
    auto norm2 = m_layerNorm(m_Network,weightMap,x,lname+".norm2");
    auto mlp = trt_transform_mlp(m_Network,weightMap,norm2,lname+".mlp",dim);
    // Second residual connection.
    auto out= m_Network->addElementWise(*x,*mlp,ElementWiseOperation::kSUM)->getOutput(0);
    debug_print(out, "blk");
    return out;
}
// Halves the spatial resolution by merging 2x2 neighbourhoods.
// The input is viewed as [hw, hw, c]; four stride-2 slices starting at (0,0), (1,0), (0,1) and
// (1,1) are concatenated along axis 2, flattened to [-1, 4*c], normalised with
// m_layerNorm("<lname>.norm") and projected with the bias-free linear layer "<lname>.reduction".
ITensor* downsample(INetworkDefinition* m_Network,std::map<std::string, Weights> weightMap,ITensor *input,
                    string lname, int hw)
{
    const Dims inDims = input->getDimensions();
    const int c = inDims.d[inDims.nbDims - 1];
    const int half = hw/2;

    ITensor* grid = shuffle_reshape(m_Network, input, Dims3{hw,hw,c});

    // Every second row/column, starting at the given row/column offset.
    auto strided = [&](int rowStart, int colStart) -> ITensor* {
        auto slice = m_Network->addSlice(*grid, Dims3{rowStart,colStart,0}, Dims3{half,half,c}, Dims3{2,2,1});
        return slice->getOutput(0);
    };

    ITensor* quarters[] = { strided(0,0), strided(1,0), strided(0,1), strided(1,1) };
    auto merged = m_Network->addConcatenation(quarters, 4);
    merged->setAxis(2);

    ITensor* tokens = shuffle_reshape(m_Network, merged->getOutput(0), Dims2{-1,4*c});
    ITensor* normed = m_layerNorm(m_Network, weightMap, tokens, lname+".norm");
    return trt_swinLinear(m_Network, weightMap, normed, lname+".reduction", false);
}
// Folds batch-norm parameters into a per-channel scale layer:
//     scale = gamma / sqrt(var + eps)
//     shift = beta - mean * gamma / sqrt(var + eps)
//     power = 1
// Reads "<lname>.weight", ".bias", ".running_mean" and ".running_var" from weightMap; the
// channel count is the element count of ".running_var". The three derived buffers are
// malloc'ed and not freed here because the scale layer references them. They are also stored
// in weightMap, which is a by-value copy local to this call.
// Returns the output tensor of the scale layer.
ITensor* addBatchNorm2d(
INetworkDefinition *network,
std::map<std::string, Weights> weightMap,
ITensor* input,
const std::string& lname,
float eps = 1e-5
) {
    float *gamma = (float*)(weightMap[lname + ".weight"].values);
    float *beta = (float*)(weightMap[lname + ".bias"].values);
    float *mean = (float*)(weightMap[lname + ".running_mean"].values);
    float *var = (float*)(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;

    const size_t bytes = sizeof(float) * len;
    float *scval = reinterpret_cast<float*>(malloc(bytes));
    float *shval = reinterpret_cast<float*>(malloc(bytes));
    float *pval = reinterpret_cast<float*>(malloc(bytes));

    // One pass fills all three buffers; each element depends only on channel ch.
    for (int ch = 0; ch < len; ++ch) {
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    Weights scale{ DataType::kFLOAT, scval, len };
    Weights shift{ DataType::kFLOAT, shval, len };
    Weights power{ DataType::kFLOAT, pval, len };

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(*input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer->getOutput(0);
}
// Conv -> BatchNorm -> ReLU building block.
// The convolution has a k x k kernel, stride s, padding k/2, a single group and no bias; it
// reads "<lname>.conv.weight". Batch-norm parameters are read under the "<lname>.bn" prefix.
// Returns the output tensor of the ReLU activation.
ITensor* transform_lateral_conv(INetworkDefinition* m_Network,std::map<std::string, Weights> weightMap,ITensor* input,
                                string lname, int k = 1, int s = 1,int out_features = 512)
{
    const Weights noBias{DataType::kFLOAT,nullptr,0};
    const Weights kernel = weightMap[lname+".conv.weight"];

    auto convLayer = m_Network->addConvolutionNd(*input, out_features, Dims2{k,k}, kernel, noBias);
    convLayer->setStrideNd(Dims2{s,s});
    convLayer->setNbGroups(1);
    convLayer->setPaddingNd(Dims2{k/2,k/2});

    ITensor* normalised = addBatchNorm2d(m_Network, weightMap, convLayer->getOutput(0), lname+".bn");
    auto relu = m_Network->addActivation(*normalised, ActivationType::kRELU);
    return relu->getOutput(0);
}
// Resizes `input` with the "UpsamplePlugin" (version "1") plugin so that dimensions 1 and 2
// are scaled towards `grid`: the two scale factors are grid / d[1] and grid / d[2] and are
// handed to the plugin through a "scaleFactor" field of two floats.
// Returns the plugin's first output.
ITensor* resize(INetworkDefinition* m_Network, ITensor* input, int grid)
{
    const Dims inDims = input->getDimensions();
    const float scale_h = 1.0*grid / inDims.d[1];
    const float scale_w = 1.0*grid / inDims.d[2];

    auto upsampleCreator = getPluginRegistry()->getPluginCreator("UpsamplePlugin", "1");

    // Heap allocated and not released here, same as before.
    float *factors = new float[2];
    factors[0] = scale_h;
    factors[1] = scale_w;

    PluginField scaleField;
    scaleField.name = "scaleFactor";
    scaleField.data = factors;
    scaleField.type = PluginFieldType::kFLOAT32;
    scaleField.length = 2;

    PluginFieldCollection fieldSet;
    fieldSet.nbFields = 1;
    fieldSet.fields = &scaleField;

    IPluginV2 *plugin = upsampleCreator->createPlugin("upSample", &fieldSet);
    ITensor* pluginInputs[] = {input};
    auto layer = m_Network->addPluginV2(pluginInputs, 1, *plugin);
    return layer->getOutput(0);
}
// One pooling branch: average-pool `input` with kernel and stride
// (d[1] / output_Avg_Size, d[2] / output_Avg_Size), apply a 1x1 conv-bn-relu block with
// `out_features` channels, then resize the result back using the input's d[1] as target size.
ITensor* transform_psp(INetworkDefinition* m_Network,std::map<std::string, Weights> weightMap,ITensor* input,
                       string lname, int output_Avg_Size, int out_features = 512)
{
    const Dims inDims = input->getDimensions();
    const int inH = inDims.d[1];
    const int inW = inDims.d[2];
    const Dims2 window{inH / output_Avg_Size, inW / output_Avg_Size};

    auto pooled = m_Network->addPoolingNd(*input, PoolingType::kAVERAGE, window);
    pooled->setStrideNd(window);

    ITensor* features = transform_lateral_conv(m_Network, weightMap, pooled->getOutput(0), lname, 1, 1, out_features);
    return resize(m_Network, features, inH);
}
// Resizes `input1` to the size given by dimension 1 of `input2` and returns the element-wise
// sum of the resized tensor and `input2`.
ITensor* up_Add(INetworkDefinition* m_Network,ITensor* input1,ITensor* input2)
{
    const int target = input2->getDimensions().d[1];
    ITensor* upsampled = resize(m_Network, input1, target);
    auto sum = m_Network->addElementWise(*upsampled, *input2, ElementWiseOperation::kSUM);
    return sum->getOutput(0);
}


#endif // COMMON_HPP


================================================
FILE: swin-transformer/semantic-segmentation/fillmask.cu
================================================
#include "fillmask.h"
#include <math.h>
namespace nvinfer1
{
    // fillmask: element-wise plugin that writes -100 wherever its input is non-zero and 0
    // elsewhere. The only persistent state is mInputSize, the element count of the input
    // computed in configurePlugin().

    fillmask::fillmask()
    {
    }

    fillmask::~fillmask()
    {
    }

    // Create the plugin at runtime from a byte stream produced by serialize().
    fillmask::fillmask(const void* data, size_t length)
    {
        const char *d = reinterpret_cast<const char *>(data), *a = d;
        Tn::read(d, mInputSize);
        // The stream must contain exactly the serialized fields.
        assert(d == a + length);
    }

    // Write the plugin state (mInputSize) into `buffer`.
    void fillmask::serialize(void* buffer) const
    {
        char* d = static_cast<char*>(buffer), *a = d;
        Tn::write(d, mInputSize);
        assert(d == a + getSerializationSize());
    }

    // Number of bytes written by serialize().
    size_t fillmask::getSerializationSize() const
    {
        return sizeof(mInputSize);
    }

    int fillmask::initialize()
    {
        return 0;
    }

    // The single output has the same dimensions as the single input.
    Dims fillmask::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
    {
        assert(nbInputDims == 1);
        Dims outputDims;
        outputDims.nbDims = inputs[0].nbDims;
        for (int i = 0; i < inputs[0].nbDims; i++) {
            outputDims.d[i] = inputs[0].d[i];
        }
        return outputDims;
    }

    // Set plugin namespace
    void fillmask::setPluginNamespace(const char* pluginNamespace)
    {
        mPluginNamespace = pluginNamespace;
    }

    const char* fillmask::getPluginNamespace() const
    {
        return mPluginNamespace;
    }

    // Return the DataType of the plugin output at the requested index
    DataType fillmask::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
    {
        return DataType::kFLOAT;
    }

    // Return true if output tensor is broadcast across a batch.
    bool fillmask::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
    {
        return false;
    }

    // Return true if plugin can use input that is broadcast across batch without replication.
    bool fillmask::canBroadcastInputAcrossBatch(int inputIndex) const
    {
        return false;
    }

    // Record the number of elements of the first input (product of all its dimensions).
    void fillmask::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
    {

        mInputSize = 1;
        for (int i = 0; i < in[0].dims.nbDims; i++) {
            mInputSize *= in[0].dims.d[i];
        }
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    void fillmask::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
    {
    }

    // Detach the plugin object from its execution context.
    void fillmask::detachFromContext() {}

    const char* fillmask::getPluginType() const
    {
        return "fillmaskLayer_TRT";
    }

    const char* fillmask::getPluginVersion() const
    {
        return "1";
    }

    void fillmask::destroy()
    {
        delete this;
    }

    // Clone the plugin, copying the namespace and the configured input size.
    IPluginV2IOExt* fillmask::clone() const
    {
        fillmask *p = new fillmask();
        p->setPluginNamespace(mPluginNamespace);
        p->setInputSize(mInputSize);
        return p;
    }

    // One thread per element: out[i] = -100 if in[i] != 0, otherwise 0.
    __global__ void fillmaskKer(const float *in, float *out, int size) {
        int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx >= size)
            return;
        if (in[idx] != 0.0)
            out[idx] = -100.0;
        else
            out[idx] = 0.0;
    }

    // Launch the kernel over batchSize * mInputSize elements on the stream supplied by TensorRT.
    void fillmask::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {

        int numElem = batchSize * mInputSize;
        // A grid with zero blocks is an invalid launch configuration; nothing to do anyway.
        if (numElem <= 0)
            return;
        // The work must be enqueued on `stream` so it is ordered with the other layers of the
        // execution context instead of running on the default stream.
        fillmaskKer<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>
            (inputs[0], output, numElem);
    }

    int fillmask::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
    {
        forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
        return 0;
    }

    PluginFieldCollection fillmaskCreator::mFC{};
    std::vector<PluginField> fillmaskCreator::mPluginAttributes;

    // The plugin takes no creation-time fields, so the field collection is empty.
    fillmaskCreator::fillmaskCreator()
    {
        mPluginAttributes.clear();
        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    const char* fillmaskCreator::getPluginName() const
    {
            return "fillmaskLayer_TRT";
    }

    const char* fillmaskCreator::getPluginVersion() const
    {
            return "1";
    }

    const PluginFieldCollection* fillmaskCreator::getFieldNames()
    {
            return &mFC;
    }

    // Create a fresh plugin instance; `name` and `fc` are not used.
    IPluginV2IOExt* fillmaskCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
    {
        fillmask* obj = new fillmask();
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    // Re-create a plugin instance from serialized data.
    IPluginV2IOExt* fillmaskCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
    {
        // The returned object is released through fillmask::destroy().
        fillmask* obj = new fillmask(serialData, serialLength);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }


}


================================================
FILE: swin-transformer/semantic-segmentation/fillmask.h
================================================
#ifndef FILLMASK_H
#define FILLMASK_H


#include <vector>
#include <string>
#include "NvInfer.h"
#include "myhpp.h"
#include <assert.h>
#include "utilsn.h"

namespace nvinfer1
{
    class fillmask:public IPluginV2IOExt
    {
    public:
        explicit fillmask();
        fillmask(const void* data, size_t length);
        ~fillmask();
        int getNbOutputs() const override
        {
            return 1;
        }

        Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;
        int initialize() override;
        virtual void terminate() override {};
        virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}
        virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;
        virtual size_t getSerializationSize() const override;
        virtual void serialize(void* buffer) const override;

        bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
            return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
        }

        const char* getPluginType() const override;
        const char* getPluginVersion() const override;
        void destroy() override;
        IPluginV2IOExt* clone() const override;
        void setPluginNamespace(const char* pluginNamespace) override;
        const char* getPluginNamespace() const override;
        DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;
        bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;
        bool canBroadcastInputAcrossBatch(int inputIndex) const override;
        void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;
        void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;
        void detachFromContext() override;

        void setInputSize(int s) {
            mInputSize = s;
        }

    private:
        void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1);
        int mThreadCount = 256;
        int mInputSize;
        const char* mPluginNamespace;
    };

    // Plugin creator for fillmask; keeps the namespace assigned through
    // setPluginNamespace() in an owned std::string.
    class fillmaskCreator : public IPluginCreator
    {
    public:
        fillmaskCreator();
        ~fillmaskCreator() override = default;

        const char* getPluginName() const override;
        const char* getPluginVersion() const override;
        const PluginFieldCollection* getFieldNames() override;

        IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;
        IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

        // Copy the namespace string into the creator.
        void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }

        // Pointer into the creator-owned namespace string.
        const char* getPluginNamespace() const override { return mNamespace.c_str(); }

    private:
        static PluginFieldCollection mFC;
        static std::vector<PluginField> mPluginAttributes;
        std::string mNamespace;
    };
    REGISTER_TENSORRT_PLUGIN(fillmaskCreator);
};

#endif // FILLMASK_H


================================================
FILE: swin-transformer/semantic-segmentation/gelu.cu
================================================
#include "gelu.h"
#include <math.h>
namespace nvinfer1
{
    // Default constructor: performs no work of its own.
    gelu::gelu() = default;

    // Destructor: the plugin holds no resources that need releasing here.
    gelu::~gelu() = default;
    // Deserializing constructor: restores mInputSize from the byte stream
    // written by serialize().
    gelu::gelu(const void* data, size_t length)
    {
        const char* const start = reinterpret_cast<const char*>(data);
        const char* cursor = start;
        Tn::read(cursor, mInputSize);
        // The read must have consumed exactly `length` bytes.
        assert(cursor == start + length);
    }

    // Write the plugin state (mInputSize) into `buffer`.
    void gelu::serialize(void* buffer) const
    {
        char* const start = static_cast<char*>(buffer);
        char* cursor = start;
        Tn::write(cursor, mInputSize);
        // The write must have produced exactly getSerializationSize() bytes.
        assert(cursor == start + getSerializationSize());
    }

    // Number of bytes serialize() writes: just the mInputSize field.
    size_t gelu::getSerializationSize() const
    {
        const size_t serializedBytes = sizeof(mInputSize);
        return serializedBytes;
    }

    // No initialization work is performed; always returns 0.
    int gelu::initialize()
    {
        const int status = 0;
        return status;
    }

    // The output has the same rank and extents as the single input.
    // `index` is not consulted.
    Dims gelu::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
    {
        assert(nbInputDims == 1);
        const Dims& src = inputs[0];

        Dims outputDims;
        outputDims.nbDims = src.nbDims;
        int axis = 0;
        while (axis < src.nbDims) {
            outputDims.d[axis] = src.d[axis];
            ++axis;
        }
        return outputDims;
    }

    // Remember the namespace pointer; the string itself is not copied.
    void gelu::setPluginNamespace(const char* pluginNamespace)
    {
        this->mPluginNamespace = pluginNamespace;
    }

    // Return the pointer previously stored by setPluginNamespace().
    const char* gelu::getPluginNamespace() const
    {
        return this->mPluginNamespace;
    }

    // The output is always FP32, regardless of the index or the input types.
    DataType gelu::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
    {
        const DataType outputType = DataType::kFLOAT;
        return outputType;
    }

    // Reports whether the output is broadcast across the batch: never.
    bool gelu::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
    {
        const bool broadcast = false;
        return broadcast;
    }

    // Reports whether a batch-broadcast input can be consumed without
    // replication: never.
    bool gelu::canBroadcastInputAcrossBatch(int inputIndex) const
    {
        const bool supported = false;
        return supported;
    }

    // Cache the element count of the first input: the product of all of its
    // dimensions (1 when it has no dimensions).
    void gelu::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
    {
        const Dims& shape = in[0].dims;

        int volume = 1;
        for (int axis = 0; axis < shape.nbDims; ++axis) {
            volume *= shape.d[axis];
        }
        mInputSize = volume;
    }

    // Context attachment hook; none of the offered resources are used, so the
    // body is intentionally empty.
    void gelu::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
    {
        // intentionally empty
    }

    // Context detachment hook; nothing was attached, so nothing is undone.
    void gelu::detachFromContext()
    {
        // intentionally empty
    }

    // Plugin type string; identical to the name geluCreator reports.
    const char* gelu::getPluginType() const
    {
        static const char* const kPluginType = "geluLayer_TRT";
        return kPluginType;
    }

    // Plugin version string.
    const char* gelu::getPluginVersion() const
    {
        static const char* const kPluginVersion = "1";
        return kPluginVersion;
    }

    // Self-destruct: the object must have been allocated with `new` and must
    // not be used after this call.
    void gelu::destroy()
    {
        gelu* const self = this;
        delete self;
    }

    // Create a heap-allocated copy carrying the same input size and the same
    // (non-owning) namespace pointer.
    IPluginV2IOExt* gelu::clone() const
    {
        gelu* const copy = new gelu();
        copy->setInputSize(mInputSize);
        copy->setPluginNamespace(mPluginNamespace);
        return copy;
    }

    // Element-wise GELU, one thread per element:
    //   out[i] = x * 0.5 * (1 + erf(x / sqrt(2)))
    // Threads whose index is >= size do nothing.
    __global__ void geluKer(const float *in, float *out, int size) {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < size) {
            // The arithmetic is carried out in double (the literals are
            // doubles) and narrowed to float on the store.
            // 1.4142135381698608 is sqrt(2) rounded to float precision.
            const double x = in[idx];
            out[idx] = x * 0.5 * (1.0 + erf(x / 1.4142135381698608));
        }
    }
    // Launch geluKer over the whole batch on the given CUDA stream.
    //
    // inputs    - input pointer array; only inputs[0] is handed to the kernel.
    // output    - output buffer handed to the kernel.
    // stream    - CUDA stream the kernel is enqueued on.
    // batchSize - number of batch items; each contributes mInputSize elements.
    void gelu::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {

        const int numElem = batchSize * mInputSize;
        // A grid of zero blocks is an invalid launch configuration, so an
        // empty batch/tensor is treated as a no-op.
        if (numElem <= 0)
            return;

        // Ceil-divide so a partially filled last block still covers the tail;
        // geluKer ignores indices >= numElem.
        const int numBlocks = (numElem + mThreadCount - 1) / mThreadCount;
        // Pass the stream explicitly: without it the kernel is issued on the
        // default stream rather than the one supplied to enqueue().
        geluKer<<<numBlocks, mThreadCount, 0, stream>>>(inputs[0], output, numElem);
    }

    // Plugin execution entry point: forwards the untyped buffers, viewed as
    // float data, to forwardGpu(). `workspace` is not used. Always returns 0.
    int gelu::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
    {
        const float* const* typedInputs = reinterpret_cast<const float* const*>(inputs);
        float* firstOutput = static_cast<float*>(outputs[0]);
        forwardGpu(typedInputs, firstOutput, stream, batchSize);
        return 0;
    }

    // Storage for the creator's static field description.
    PluginFieldCollection geluCreator::mFC{};
    std::vector<PluginField> geluCreator::mPluginAttributes;

    // The attribute list is emptied, so the published field collection
    // describes zero fields.
    geluCreator::geluCreator()
    {
        mPluginAttributes.clear();

        mFC.fields = mPluginAttributes.data();
        mFC.nbFields = mPluginAttributes.size();
    }

    // Name reported by this creator; matches gelu::getPluginType().
    const char* geluCreator::getPluginName() const
    {
        static const char* const kPluginName = "geluLayer_TRT";
        return kPluginName;
    }

    // Version string reported by this creator; matches gelu::getPluginVersion().
    const char* geluCreator::getPluginVersion() const
    {
        static const char* const kPluginVersion = "1";
        return kPluginVersion;
    }

    // Expose the static field collection filled in by the constructor.
    const PluginFieldCollection* geluCreator::getFieldNames()
    {
        const PluginFieldCollection* const fields = &mFC;
        return fields;
    }

    // Build a new gelu plugin tagged with this creator's namespace.
    // `name` and `fc` are not consulted.
    IPluginV2IOExt* geluCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
    {
        gelu* const plugin = new gelu();
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

    // Rebuild a gelu plugin from `serialLength` bytes of serialized state and
    // tag it with this creator's namespace. `name` is not consulted.
    IPluginV2IOExt* geluCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
    {
        // The object is heap-allocated and handed to the caller as a raw
        // pointer; gelu::destroy() is what eventually deletes it.
        gelu* const plugin = new gelu(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }


}


================================================
FILE: swin-transformer/semantic-segmentation/gelu.h
================================================
#ifndef GELU_H
#define GELU_H

#include <vector>
#include <string>
#include "NvInfer.h"
#include "myhpp.h"
#include <assert.h>
#include "utilsn.h"
// pi; guarded so it never redefines an M_PI already supplied by <math.h>/<cmath>
#ifndef M_PI
#define M_PI       3.14159265358979323846
#endif
namespace nvinfer1
{
    class gelu:public IPluginV2IOExt
    {
    public:
        explicit gelu();
        gelu(const void* data, size_t length);
        ~gelu();
        int getNbOutputs() const override
        {
            return 1;
        }

        Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;
        int initialize() override;
        virtual void terminate() override {};
        virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}
        virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;
        virtual size_t getSerializationSize() const override;
        virtual void serialize(void* buffer) const override;

        bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
            return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
        }

        const char* getPluginType() const override;
        const char* getPluginVersion() const override;
        void destroy() override;
        IPluginV2IOExt* clone() const override;
        void setPluginNamespace(const char* pluginNamespace) override;
        const char* getPluginNamespace() const override;
        DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;
        bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;
        bool canBroadcastInputAcrossBatch(int inputIndex) const override;
        void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;
        void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;
        void detachFromContext() override;

        void setInputSize(int s) {
            mInputSize = s;
        }

    private:
        void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1);
        int mThreadCount = 256;
        int mInputSize;
        const char* mPluginNamespace;
    };

    // Plugin creator for gelu; keeps the namespace assigned through
    // setPluginNamespace() in an owned std::string.
    class geluCreator : public IPluginCreator
    {
    public:
        geluCreator();
        ~geluCreator() override = default;

        const char* getPluginName() const override;
        const char* getPluginVersion() const override;
        const PluginFieldCollection* getFieldNames() override;

        IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;
        IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

        // Copy the namespace string into the creator.
        void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }

        // Pointer into the creator-owned namespace string.
        const char* getPluginNamespace() const override { return mNamespace.c_str(); }

    private:
        static PluginFieldCollection mFC;
        static std::vector<PluginField> mPluginAttributes;
        std::string mNamespace;
    };
    REGISTER_TENSORRT_PLUGIN(geluCreator);
};
#endif // GELU_H


================================================
FILE: swin-transformer/semantic-segmentation/gen_wts.py
================================================
import os
import struct
import sys

import torch


def wts_path(pt_file):
    """Return the output path for ``pt_file``: same directory and stem, ``.wts`` extension.

    Only the final extension is replaced, so dots elsewhere in the path
    (``./weights/model.pt``, ``model.v2.pt``) are preserved.
    """
    return os.path.splitext(pt_file)[0] + '.wts'


def float_to_hex(value):
    """Return the hex string of ``value`` packed as a big-endian 32-bit float."""
    return struct.pack('>f', float(value)).hex()


def main(argv):
    """Convert the checkpoint named by ``argv[1]`` into a text ``.wts`` file.

    The checkpoint's ``'model'`` entry is loaded on the CPU as FP32. The output
    starts with the number of state-dict entries, followed by one line per
    entry: ``<name> <count>`` and then the hex encoding of every value.
    """
    if len(argv) < 2:
        sys.exit('usage: python {} <checkpoint.pt>'.format(argv[0] if argv else 'gen_wts.py'))

    # Initialize
    pt_file = argv[1]
    # The model is loaded onto (and kept on) the CPU.
    device = torch.device('cpu')
    # Load model
    model = torch.load(pt_file, map_location=device)['model'].float()  # load to FP32
    model.to(device).eval()

    state = model.state_dict()
    with open(wts_path(pt_file), 'w') as f:
        f.write('{}\n'.format(len(state.keys())))
        for k, v in state.items():
            vr = v.reshape(-1).cpu().numpy()
            f.write('{} {} '.format(k, len(vr)))
            for vv in vr:
                f.write(' ')
                f.write(float_to_hex(vv))
            f.write('\n')


if __name__ == '__main__':
    main(sys.argv)


================================================
FILE: swin-transformer/semantic-segmentation/include/dirent.h
================================================
/*
 * Dirent interface for Microsoft Visual Studio
 *
 * Copyright (C) 1998-2019 Toni Ronkko
 * This file is part of dirent.  Dirent may be freely distributed
 * under the MIT license.  For all details and documentation, see
 * https://github.com/tronkko/dirent
 */
#ifndef DIRENT_H
#define DIRENT_H

/* Hide warnings about unreferenced local functions */
#if defined(__clang__)
#   pragma clang diagnostic ignored "-Wunused-function"
#elif defined(_MSC_VER)
#   pragma warning(disable:4505)
#elif defined(__GNUC__)
#   pragma GCC diagnostic ignored "-Wunused-function"
#endif

/*
 * Include windows.h without Windows Sockets 1.1 to prevent conflicts with
 * Windows Sockets 2.0.
 */
#ifndef WIN32_LEAN_AND_MEAN
#   define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>

#include <stdio.h>
#include <stdarg.h>
#include <wchar.h>
#include <string.h>
#include <stdlib.h>
#include <malloc.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>

/* Indicates that d_type field is available in dirent structure */
#define _DIRENT_HAVE_D_TYPE

/* Indicates that d_namlen field is available in dirent structure */
#define _DIRENT_HAVE_D_NAMLEN

/* Entries missing from MSVC 6.0 */
#if !defined(FILE_ATTRIBUTE_DEVICE)
#   define FILE_ATTRIBUTE_DEVICE 0x40
#endif

/* File type and permission flags for stat(), general mask */
#if !defined(S_IFMT)
#   define S_IFMT _S_IFMT
#endif

/* Directory bit */
#if !defined(S_IFDIR)
#   define S_IFDIR _S_IFDIR
#endif

/* Character device bit */
#if !defined(S_IFCHR)
#   define S_IFCHR _S_IFCHR
#endif

/* Pipe bit */
#if !defined(S_IFFIFO)
#   define S_IFFIFO _S_IFFIFO
#endif

/* Regular file bit */
#if !defined(S_IFREG)
#   define S_IFREG _S_IFREG
#endif

/* Read permission */
#if !defined(S_IREAD)
#   define S_IREAD _S_IREAD
#endif

/* Write permission */
#if !defined(S_IWRITE)
#   define S_IWRITE _S_IWRITE
#endif

/* Execute permission */
#if !defined(S_IEXEC)
#   define S_IEXEC _S_IEXEC
#endif

/* Pipe */
#if !defined(S_IFIFO)
#   define S_IFIFO _S_IFIFO
#endif

/* Block device */
#if !defined(S_IFBLK)
#   define S_IFBLK 0
#endif

/* Link */
#if !defined(S_IFLNK)
#   define S_IFLNK 0
#endif

/* Socket */
#if !defined(S_IFSOCK)
#   define S_IFSOCK 0
#endif

/* Read user permission */
#if !defined(S_IRUSR)
#   define S_IRUSR S_IREAD
#endif

/* Write user permission */
#if !defined(S_IWUSR)
#   define S_IWUSR S_IWRITE
#endif

/* Execute user permission */
#if !defined(S_IXUSR)
#   define S_IXUSR 0
#endif

/* Read group permission */
#if !defined(S_IRGRP)
#   define S_IRGRP 0
#endif

/* Write group permission */
#if !defined(S_IWGRP)
#   define S_IWGRP 0
#endif

/* Execute group permission */
#if !defined(S_IXGRP)
#   define S_IXGRP 0
#endif

/* Read others permission */
#if !defined(S_IROTH)
#   define S_IROTH 0
#endif

/* Write others permission */
#if !defined(S_IWOTH)
#   define S_IWOTH 0
#endif

/* Execute others permission */
#if !defined(S_IXOTH)
#   define S_IXOTH 0
#endif

/* Maximum length of file name */
#if !defined(PATH_MAX)
#   define PATH_MAX MAX_PATH
#endif
#if !defined(FILENAME_MAX)
#   define FILENAME_MAX MAX_PATH
#endif
#if !defined(NAME_MAX)
#   define NAME_MAX FILENAME_MAX
#endif

/* File type flags for d_type */
#define DT_UNKNOWN 0
#define DT_REG S_IFREG
#define DT_DIR S_IFDIR
#define DT_FIFO S_IFIFO
#define DT_SOCK S_IFSOCK
#define DT_CHR S_IFCHR
#define DT_BLK S_IFBLK
#define DT_LNK S_IFLNK

/* Macros for converting between st_mode and d_type */
#define IFTODT(mode) ((mode) & S_IFMT)
#define DTTOIF(type) (type)

/*
 * File type macros.  Note that block devices, sockets and links cannot be
 * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are
 * only defined for compatibility.  These macros should always return false
 * on Windows.
 */
#if !defined(S_ISFIFO)
#   define S_ISFIFO(mode) (((mode) & S_IFMT) == S_IFIFO)
#endif
#if !defined(S_ISDIR)
#   define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR)
#endif
#if !defined(S_ISREG)
#   define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG)
#endif
#if !defined(S_ISLNK)
#   define S_ISLNK(mode) (((mode) & S_IFMT) == S_IFLNK)
#endif
#if !defined(S_ISSOCK)
#   define S_ISSOCK(mode) (((mode) & S_IFMT) == S_IFSOCK)
#endif
#if !defined(S_ISCHR)
#   define S_ISCHR(mode) (((mode) & S_IFMT) == S_IFCHR)
#endif
#if !defined(S_ISBLK)
#   define S_ISBLK(mode) (((mode) & S_IFMT) == S_IFBLK)
#endif

/* Return the exact length of the file name without zero terminator */
#define _D_EXACT_NAMLEN(p) ((p)->d_namlen)

/* Return the maximum size of a file name */
#define _D_ALLOC_NAMLEN(p) ((PATH_MAX)+1)


#ifdef __cplusplus
extern "C" {
#endif


/*
 * Wide-character directory entry, filled in by _wreaddir_r().
 */
struct _wdirent {
    /* Always zero (reset by _wreaddir_r) */
    long d_ino;

    /* File position within stream; _wreaddir_r always stores zero here */
    long d_off;

    /* Structure size; set to sizeof (struct _wdirent) */
    unsigned short d_reclen;

    /* Length of name in characters, without the terminating \0 */
    size_t d_namlen;

    /* File type: DT_CHR, DT_DIR or DT_REG */
    int d_type;

    /* Zero-terminated file name, truncated to PATH_MAX characters */
    wchar_t d_name[PATH_MAX+1];
};
typedef struct _wdirent _wdirent;

/*
 * Wide-character directory stream created by _wopendir() and released by
 * _wclosedir().
 */
struct _WDIR {
    /* Current directory entry; buffer handed out by _wreaddir() */
    struct _wdirent ent;

    /* Private file data filled in by FindFirstFileExW/FindNextFileW */
    WIN32_FIND_DATAW data;

    /* True if data holds an entry that has not been returned yet */
    int cached;

    /* Win32 search handle, or INVALID_HANDLE_VALUE when no search is open */
    HANDLE handle;

    /* Search pattern: directory name followed by \* (heap-allocated) */
    wchar_t *patt;
};
typedef struct _WDIR _WDIR;

/*
 * Multi-byte character directory entry, filled in by readdir_r().
 */
struct dirent {
    /* Always zero (reset by readdir_r) */
    long d_ino;

    /* File position within stream; readdir_r always stores zero here */
    long d_off;

    /* Structure size; set to sizeof (struct dirent) */
    unsigned short d_reclen;

    /* Length of name without \0 */
    size_t d_namlen;

    /* File type: DT_CHR, DT_DIR or DT_REG */
    int d_type;

    /* File name as a multi-byte string */
    char d_name[PATH_MAX+1];
};
typedef struct dirent dirent;

/*
 * Multi-byte directory stream: a thin wrapper around a wide-character
 * stream.
 */
struct DIR {
    /* Entry buffer handed out by readdir() */
    struct dirent ent;
    /* Underlying wide-character stream opened by _wopendir() */
    struct _WDIR *wdirp;
};
typedef struct DIR DIR;


/* Dirent functions */
static DIR *opendir (const char *dirname);
static _WDIR *_wopendir (const wchar_t *dirname);

static struct dirent *readdir (DIR *dirp);
static struct _wdirent *_wreaddir (_WDIR *dirp);

static int readdir_r(
    DIR *dirp, struct dirent *entry, struct dirent **result);
static int _wreaddir_r(
    _WDIR *dirp, struct _wdirent *entry, struct _wdirent **result);

static int closedir (DIR *dirp);
static int _wclosedir (_WDIR *dirp);

static void rewinddir (DIR* dirp);
static void _wrewinddir (_WDIR* dirp);

static int scandir (const char *dirname, struct dirent ***namelist,
    int (*filter)(const struct dirent*),
    int (*compare)(const struct dirent**, const struct dirent**));

static int alphasort (const struct dirent **a, const struct dirent **b);

static int versionsort (const struct dirent **a, const struct dirent **b);


/* For compatibility with Symbian */
#define wdirent _wdirent
#define WDIR _WDIR
#define wopendir _wopendir
#define wreaddir _wreaddir
#define wclosedir _wclosedir
#define wrewinddir _wrewinddir


/* Internal utility functions */
static WIN32_FIND_DATAW *dirent_first (_WDIR *dirp);
static WIN32_FIND_DATAW *dirent_next (_WDIR *dirp);

static int dirent_mbstowcs_s(
    size_t *pReturnValue,
    wchar_t *wcstr,
    size_t sizeInWords,
    const char *mbstr,
    size_t count);

static int dirent_wcstombs_s(
    size_t *pReturnValue,
    char *mbstr,
    size_t sizeInBytes,
    const wchar_t *wcstr,
    size_t count);

static void dirent_set_errno (int error);


/*
 * Open directory stream DIRNAME for read and return a pointer to the
 * internal working area that is used to retrieve individual directory
 * entries.
 *
 * Returns NULL on failure.  errno is set to ENOENT here when DIRNAME is NULL
 * or empty; dirent_first() sets errno when the search cannot be started.
 * On success the stream must be released with _wclosedir().
 */
static _WDIR*
_wopendir(
    const wchar_t *dirname)
{
    _WDIR *dirp;
    DWORD n;
    wchar_t *p;

    /* Must have directory name */
    if (dirname == NULL  ||  dirname[0] == '\0') {
        dirent_set_errno (ENOENT);
        return NULL;
    }

    /* Allocate new _WDIR structure */
    dirp = (_WDIR*) malloc (sizeof (struct _WDIR));
    if (!dirp) {
        return NULL;
    }

    /*
     * Reset _WDIR structure so that _wclosedir() can be used for cleanup
     * from any of the failure paths below.
     */
    dirp->handle = INVALID_HANDLE_VALUE;
    dirp->patt = NULL;
    dirp->cached = 0;

    /*
     * Compute the length of full path plus zero terminator
     *
     * Note that on WinRT there's no way to convert relative paths
     * into absolute paths, so just assume it is an absolute path.
     */
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
    /* Desktop */
    n = GetFullPathNameW (dirname, 0, NULL, NULL);
#else
    /* WinRT */
    n = wcslen (dirname);
#endif

    /*
     * Allocate room for absolute directory name and search pattern.  The
     * extra 16 bytes leave space for the "\*" suffix and terminator that
     * are appended below.
     */
    dirp->patt = (wchar_t*) malloc (sizeof (wchar_t) * n + 16);
    if (dirp->patt == NULL) {
        goto exit_closedir;
    }

    /*
     * Convert relative directory name to an absolute one.  This
     * allows rewinddir() to function correctly even when current
     * working directory is changed between opendir() and rewinddir().
     *
     * Note that on WinRT there's no way to convert relative paths
     * into absolute paths, so just assume it is an absolute path.
     */
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
    /* Desktop */
    n = GetFullPathNameW (dirname, n, dirp->patt, NULL);
    /* NOTE(review): n is a DWORD, so this test is effectively n == 0 */
    if (n <= 0) {
        goto exit_closedir;
    }
    /*
     * NOTE(review): presumably n is now the number of characters written;
     * if the call instead reported a size larger than the buffer, the
     * pointer arithmetic below would run past the allocation - confirm.
     */
#else
    /* WinRT */
    wcsncpy_s (dirp->patt, n+1, dirname, n);
#endif

    /* Append search pattern \* to the directory name */
    p = dirp->patt + n;
    switch (p[-1]) {
    case '\\':
    case '/':
    case ':':
        /* Directory ends in path separator, e.g. c:\temp\ */
        /*NOP*/;
        break;

    default:
        /* Directory name doesn't end in path separator */
        *p++ = '\\';
    }
    *p++ = '*';
    *p = '\0';

    /* Open directory stream and retrieve the first entry */
    if (!dirent_first (dirp)) {
        goto exit_closedir;
    }

    /* Success */
    return dirp;

    /* Failure: _wclosedir() frees the pattern and the structure itself */
exit_closedir:
    _wclosedir (dirp);
    return NULL;
}

/*
 * Fetch the next entry of a wide-character directory stream.
 *
 * The entry lives inside the stream itself (dirp->ent), so every call
 * overwrites the entry returned by the previous one.  Returns NULL once the
 * stream is exhausted.
 */
static struct _wdirent*
_wreaddir(
    _WDIR *dirp)
{
    struct _wdirent *next;

    /*
     * _wreaddir_r() always stores either the entry address or NULL in
     * `next`, so its status code carries no additional information.
     */
    (void) _wreaddir_r (dirp, &dirp->ent, &next);

    return next;
}

/*
 * Read the next directory entry into the caller-supplied ENTRY.
 *
 * On return *RESULT is ENTRY when an entry was read and NULL when the end of
 * the directory stream was reached.  The function itself always returns zero.
 */
static int
_wreaddir_r(
    _WDIR *dirp,
    struct _wdirent *entry,
    struct _wdirent **result)
{
    WIN32_FIND_DATAW *found = dirent_next (dirp);
    size_t len;
    DWORD attr;

    /* No more entries: report end of directory */
    if (found == NULL) {
        *result = NULL;
        return /*OK*/0;
    }

    /*
     * Copy the wide-character file name, stopping after PATH_MAX characters
     * at the latest so that the terminator always fits in the buffer.
     */
    for (len = 0; len < PATH_MAX  &&  found->cFileName[len] != 0; len++) {
        entry->d_name[len] = found->cFileName[len];
    }
    entry->d_name[len] = 0;

    /* Name length does not count the terminator */
    entry->d_namlen = len;

    /* Derive the file type from the Win32 attributes */
    attr = found->dwFileAttributes;
    if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) {
        entry->d_type = DT_CHR;
    } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) {
        entry->d_type = DT_DIR;
    } else {
        entry->d_type = DT_REG;
    }

    /* Fields that carry no real information on Windows */
    entry->d_ino = 0;
    entry->d_off = 0;
    entry->d_reclen = sizeof (struct _wdirent);

    *result = entry;
    return /*OK*/0;
}

/*
 * Close a directory stream opened by _wopendir().  The stream and every
 * entry previously obtained from it become invalid.
 *
 * Returns 0 on success.  A NULL stream yields -1 with errno set to EBADF.
 */
static int
_wclosedir(
    _WDIR *dirp)
{
    /* Invalid directory stream */
    if (!dirp) {
        dirent_set_errno (EBADF);
        return /*failure*/-1;
    }

    /* Release search handle, if one is still open */
    if (dirp->handle != INVALID_HANDLE_VALUE) {
        FindClose (dirp->handle);
    }

    /* Release search pattern, then the directory structure itself */
    free (dirp->patt);
    free (dirp);

    return /*success*/0;
}

/*
 * Restart a directory stream so that the next _wreaddir() call returns the
 * first file name again.  A NULL stream is ignored.
 */
static void
_wrewinddir(
    _WDIR* dirp)
{
    if (!dirp) {
        return;
    }

    /* Drop the search that is currently in progress, if any */
    if (dirp->handle != INVALID_HANDLE_VALUE) {
        FindClose (dirp->handle);
    }

    /* Start a new search from the stored pattern */
    dirent_first (dirp);
}

/*
 * Start a search on dirp->patt and fetch the first entry (internal).
 *
 * On success the entry is left in dirp->data, dirp->cached is set and a
 * pointer to the data is returned.  On failure NULL is returned and errno is
 * set to EACCES, ENOTDIR or ENOENT depending on the Win32 error.
 */
static WIN32_FIND_DATAW*
dirent_first(
    _WDIR *dirp)
{
    /* Open directory and retrieve the first entry */
    dirp->handle = FindFirstFileExW(
        dirp->patt, FindExInfoStandard, &dirp->data,
        FindExSearchNameMatch, NULL, 0);

    if (dirp->handle != INVALID_HANDLE_VALUE) {
        /* A directory entry is now waiting in memory */
        dirp->cached = 1;
        return &dirp->data;
    }

    /* Failed to open directory: no directory entry in memory */
    dirp->cached = 0;

    /* Translate the Win32 error into an errno value */
    switch (GetLastError ()) {
    case ERROR_ACCESS_DENIED:
        /* No read access to directory */
        dirent_set_errno (EACCES);
        break;

    case ERROR_DIRECTORY:
        /* Directory name is invalid */
        dirent_set_errno (ENOTDIR);
        break;

    case ERROR_PATH_NOT_FOUND:
    default:
        /* Cannot find the file */
        dirent_set_errno (ENOENT);
    }

    return NULL;
}

/*
 * Get next directory entry (internal).
 *
 * Returns a pointer to dirp->data holding the next entry, or NULL when the
 * stream is exhausted or the search failed.  After the last entry the search
 * handle is closed and reset to INVALID_HANDLE_VALUE.
 */
static WIN32_FIND_DATAW*
dirent_next(
    _WDIR *dirp)
{
    /* An entry fetched earlier is still pending: hand it out first */
    if (dirp->cached != 0) {
        dirp->cached = 0;
        return &dirp->data;
    }

    /* No open search: end of directory stream reached */
    if (dirp->handle == INVALID_HANDLE_VALUE) {
        return NULL;
    }

    /* Get the next directory entry from stream */
    if (FindNextFileW (dirp->handle, &dirp->data) != FALSE) {
        return &dirp->data;
    }

    /* The very last entry has been processed or an error occurred */
    FindClose (dirp->handle);
    dirp->handle = INVALID_HANDLE_VALUE;
    return NULL;
}

/*
 * Open a directory stream from a multi-byte C string.
 *
 * The name is converted to wide characters and passed to _wopendir().
 * Returns NULL when the name is NULL or empty (errno set to ENOENT), when
 * memory cannot be allocated, when the name cannot be converted, or when
 * _wopendir() fails.
 */
static DIR*
opendir(
    const char *dirname)
{
    struct DIR *dirp;
    wchar_t wname[PATH_MAX + 1];
    size_t n;

    /* Must have directory name */
    if (dirname == NULL  ||  dirname[0] == '\0') {
        dirent_set_errno (ENOENT);
        return NULL;
    }

    /* Allocate memory for DIR structure */
    dirp = (DIR*) malloc (sizeof (struct DIR));
    if (!dirp) {
        return NULL;
    }

    /*
     * Convert directory name to wide-character string.  The conversion
     * fails if the string contains invalid multi-byte sequences or the
     * output buffer is too small to contain the resulting string.
     */
    if (dirent_mbstowcs_s (&n, wname, PATH_MAX + 1, dirname, PATH_MAX + 1) == 0) {

        /* Open directory stream using wide-character name */
        dirp->wdirp = _wopendir (wname);
        if (dirp->wdirp) {
            /* Success */
            return dirp;
        }

    }

    /* Failure */
    free (dirp);
    return NULL;
}

/*
 * Fetch the next entry of a multi-byte directory stream.
 *
 * The entry lives inside the stream itself (dirp->ent), so every call
 * overwrites the entry returned by the previous one.
 */
static struct dirent*
readdir(
    DIR *dirp)
{
    struct dirent *next;

    /*
     * The status code of readdir_r() is deliberately ignored; the entry
     * pointer it stores in `next` is what gets returned.
     */
    (void) readdir_r (dirp, &dirp->ent, &next);

    return next;
}

/*
 * Read next directory entry into caller-allocated buffer.
 *
 * Returns zero on success.  If the end of directory stream is reached, then
 * sets result to NULL and returns zero.
 */
static int
readdir_r(
    DIR *dirp,
    struct dirent *entry,
    struct dirent **result)
{
    WIN32_FIND_DATAW *datap;

    /* Read next directory entry */
    datap = dirent_next (dirp->wdirp);
    if (datap) {
        size_t n;
        int error;

        /* Attempt to convert file name to multi-byte string */
        error = dirent_wcstombs_s(
            &n, entry->d_name, PATH_MAX + 1, datap->cFileName, PATH_MAX + 1);

        /*
         * If the file name cannot be represented by a multi-byte string,
         * then attempt to use old 8+3 file name.  This allows traditional
         * Unix-code to access some file names despite of unicode
         * characters, although file names may seem unfamiliar to the user.
         *
         * Be ware that the code below cannot come up with a short file
         * name unless the file system provides one.  At least
         * VirtualBox shared folders fail to do this.
         */
        if (error  &&  datap->cAlternateFileName[0] != '\0') {
            error = dirent_wcstombs_s(
                &n, entry->d_name, PATH_MAX + 1,
                datap->cAlternateFileName, PATH_MAX + 1);
        }

        if (!error) {
            DWORD attr;

            /* Length of file name excluding zero terminator */
            entry->d_namlen = n - 1;

            /* File attributes */
            attr = datap->dwFileAttributes;
            if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) {
                entry->d_type = DT_CHR;
            } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) {
                entry->d_type = DT_DIR;
            } else {
                entry->d_type = DT_REG;
            }

            /* Reset dummy fields */
            entry->d_ino = 0;
            entry->d_off = 0;
            entry->d_reclen = sizeof (struct dirent);

        } else {

            /*
             * Cannot convert file name to multi-byte string so construct
             * an erroneous directory entry and return that.  Note that
             * we cannot return NULL as that would stop the processing
             * of directory entries completely.
             */
            entry->d_name[0] = '?';
            entry->d_name[1] = '\0';
            entry->d_namlen = 1;
            entry->d_type = DT_UNKNOWN;
            entry->d_ino = 0;
            entry->d_off = -1;
            entry->d_reclen = 0;

        }

        /* Return pointer to directory entry */
        *result = entry;

    } else {

        /* No more directory entries */
        *result = NULL;

    }

    return /*OK*/0;
}

/*
 * Close a directory stream and release its memory.
 *
 * Returns the status of _wclosedir() for a valid stream.  For a NULL stream
 * errno is set to EBADF and -1 is returned.
 */
static int
closedir(
    DIR *dirp)
{
    int status;

    /* Reject an invalid directory stream */
    if (dirp == NULL) {
        dirent_set_errno (EBADF);
        return /*failure*/-1;
    }

    /* Shut down the wide-character stream first, then drop the wrapper */
    status = _wclosedir (dirp->wdirp);
    dirp->wdirp = NULL;
    free (dirp);

    return status;
}

/*
 * Reset a directory stream so that reading starts over from the beginning.
 * The request is forwarded to the wrapped wide-character stream.
 */
static void
rewinddir(
    DIR* dirp)
{
    struct _WDIR *wide_stream = dirp->wdirp;

    _wrewinddir (wide_stream);
}

/*
 * Scan directory for entries.
 *
 * Reads every entry of "dirname", keeps those accepted by "filter" (all of
 * them when filter is NULL) and sorts the kept entries with "compare".  A
 * NULL "compare" leaves the entries in the order they were read.
 *
 * On success the number of kept entries is returned and, if "namelist" is
 * not NULL, *namelist receives a malloc'ed table of malloc'ed entries.  On
 * failure (directory cannot be opened, out of memory) -1 is returned, all
 * memory allocated here is released and *namelist is set to NULL.
 */
static int
scandir(
    const char *dirname,
    struct dirent ***namelist,
    int (*filter)(const struct dirent*),
    int (*compare)(const struct dirent**, const struct dirent**))
{
    struct dirent **files = NULL;
    size_t size = 0;
    size_t allocated = 0;
    const size_t init_size = 1;
    DIR *dir = NULL;
    struct dirent *entry;
    struct dirent *tmp = NULL;
    size_t i;
    int result = 0;

    /* Open directory stream */
    dir = opendir (dirname);
    if (dir) {

        /* Read directory entries to memory */
        while (1) {

            /* Enlarge pointer table to make room for another pointer */
            if (size >= allocated) {
                void *p;
                size_t num_entries;

                /* Compute number of entries in the enlarged pointer table */
                if (size < init_size) {
                    /* Allocate initial pointer table */
                    num_entries = init_size;
                } else {
                    /* Double the size */
                    num_entries = size * 2;
                }

                /* Allocate first pointer table or enlarge existing table */
                p = realloc (files, sizeof (void*) * num_entries);
                if (p != NULL) {
                    /* Got the memory */
                    files = (struct dirent**) p;
                    allocated = num_entries;
                } else {
                    /* Out of memory */
                    result = -1;
                    break;
                }

            }

            /*
             * Allocate room for temporary directory entry.  The buffer is
             * reused across iterations until an entry is actually kept.
             */
            if (tmp == NULL) {
                tmp = (struct dirent*) malloc (sizeof (struct dirent));
                if (tmp == NULL) {
                    /* Cannot allocate temporary directory entry */
                    result = -1;
                    break;
                }
            }

            /* Read directory entry to temporary area */
            if (readdir_r (dir, tmp, &entry) == /*OK*/0) {

                /* Did we get an entry? */
                if (entry != NULL) {
                    int pass;

                    /* Determine whether to include the entry in result */
                    if (filter) {
                        /* Let the filter function decide */
                        pass = filter (tmp);
                    } else {
                        /* No filter function, include everything */
                        pass = 1;
                    }

                    if (pass) {
                        /* Hand ownership of the entry to the pointer table */
                        files[size++] = tmp;
                        tmp = NULL;

                        /* Keep up with the number of files */
                        result++;
                    }

                } else {

                    /*
                     * End of directory stream reached => sort entries and
                     * exit.  Without a comparison function there is nothing
                     * to sort by, and qsort() must not be given a NULL
                     * callback, so the entries stay in read order.
                     */
                    if (compare != NULL) {
                        qsort (files, size, sizeof (void*),
                            (int (*) (const void*, const void*)) compare);
                    }
                    break;

                }

            } else {
                /* Error reading directory entry */
                result = /*Error*/ -1;
                break;
            }

        }

    } else {
        /* Cannot open directory */
        result = /*Error*/ -1;
    }

    /* Release temporary directory entry */
    free (tmp);

    /* Release allocated memory on error */
    if (result < 0) {
        for (i = 0; i < size; i++) {
            free (files[i]);
        }
        free (files);
        files = NULL;
    }

    /* Close directory stream */
    if (dir) {
        closedir (dir);
    }

    /* Pass pointer table to caller */
    if (namelist) {
        *namelist = files;
    }
    return result;
}

/*
 * Comparison callback for scandir(): orders two directory entries by name
 * using the collation rules of the current locale (strcoll).
 */
static int
alphasort(
    const struct dirent **a, const struct dirent **b)
{
    const char *left_name = (*a)->d_name;
    const char *right_name = (*b)->d_name;

    return strcoll (left_name, right_name);
}

/*
 * Comparison callback meant for version-aware ordering.  It currently gives
 * exactly the same ordering as alphasort().
 */
static int
versionsort(
    const struct dirent **a, const struct dirent **b)
{
    /* FIXME: implement strverscmp and use that */
    int order = alphasort (a, b);

    return order;
}

/*
 * Convert multi-byte string to wide character string.
 *
 * With Visual Studio 2005 or later this forwards to mbstowcs_s().
 * Elsewhere it is emulated with mbstowcs(): the result is zero-terminated
 * (truncating to sizeInWords - 1 characters if necessary) and
 * *pReturnValue receives the length INCLUDING the terminator.  Returns zero
 * on success and non-zero on failure.  In the emulation, failure means that
 * an output buffer was supplied and mbstowcs() returned a value that is not
 * smaller than "count".
 */
static int
dirent_mbstowcs_s(
    size_t *pReturnValue,
    wchar_t *wcstr,
    size_t sizeInWords,
    const char *mbstr,
    size_t count)
{
#if defined(_MSC_VER)  &&  _MSC_VER >= 1400

    /* Microsoft Visual Studio 2005 or later */
    return mbstowcs_s (pReturnValue, wcstr, sizeInWords, mbstr, count);

#else

    /* Older Visual Studio or non-Microsoft compiler */
    size_t converted;

    /* Convert to wide-character string (or count characters) */
    converted = mbstowcs (wcstr, mbstr, sizeInWords);

    /* Could not convert string */
    if (wcstr != NULL  &&  converted >= count) {
        return 1;
    }

    /* Zero-terminate output buffer, clamping to its capacity */
    if (wcstr != NULL  &&  sizeInWords != 0) {
        if (converted >= sizeInWords) {
            converted = sizeInWords - 1;
        }
        wcstr[converted] = 0;
    }

    /* Length of resulting wide-character string WITH zero terminator */
    if (pReturnValue != NULL) {
        *pReturnValue = converted + 1;
    }

    return 0;

#endif
}

/*
 * Convert wide-character string to multi-byte string.
 *
 * With Visual Studio 2005 or later this forwards to wcstombs_s().
 * Elsewhere it is emulated with wcstombs(): the result is zero-terminated
 * (truncating to sizeInBytes - 1 bytes if necessary) and *pReturnValue
 * receives the length INCLUDING the terminator.  Returns zero on success
 * and non-zero on failure.  In the emulation, failure means that an output
 * buffer was supplied and wcstombs() returned a value that is not smaller
 * than "count".
 */
static int
dirent_wcstombs_s(
    size_t *pReturnValue,
    char *mbstr,
    size_t sizeInBytes, /* max size of mbstr */
    const wchar_t *wcstr,
    size_t count)
{
#if defined(_MSC_VER)  &&  _MSC_VER >= 1400

    /* Microsoft Visual Studio 2005 or later */
    return wcstombs_s (pReturnValue, mbstr, sizeInBytes, wcstr, count);

#else

    /* Older Visual Studio or non-Microsoft compiler */
    size_t produced;

    /* Convert to multi-byte string (or count the number of bytes needed) */
    produced = wcstombs (mbstr, wcstr, sizeInBytes);

    /* Cannot convert string */
    if (mbstr != NULL  &&  produced >= count) {
        return 1;
    }

    /* Zero-terminate output buffer, clamping to its capacity */
    if (mbstr != NULL  &&  sizeInBytes != 0) {
        if (produced >= sizeInBytes) {
            produced = sizeInBytes - 1;
        }
        mbstr[produced] = '\0';
    }

    /* Length of resulting multi-byte string WITH zero terminator */
    if (pReturnValue != NULL) {
        *pReturnValue = produced + 1;
    }

    return 0;

#endif
}

/*
 * Store an error code in errno.  Visual Studio 2005 and later go through
 * _set_errno(); every other compiler assigns to errno directly.
 */
static void
dirent_set_errno(
    int error)
{
#if !defined(_MSC_VER)  ||  _MSC_VER < 1400

    /* Non-Microsoft compiler or older Microsoft compiler */
    errno = error;

#else

    /* Microsoft Visual Studio 2005 and later */
    _set_errno (error);

#endif
}


#ifdef __cplusplus
}
#endif
#endif /*DIRENT_H*/


================================================
FILE: swin-transformer/semantic-segmentation/layerNorm.cu
================================================
#include <assert.h>
#include "layerNorm.h"
#include "utilsn.h"
#include <assert.h>
#include <vector>


namespace nvinfer1
{

// Default construction: the sizes are filled in later by configurePlugin()
// or setInputSize().
layernorm::layernorm()
{
}

layernorm::~layernorm()
{

}

// Deserializing constructor: restores mInputSize and Length in the order in
// which serialize() wrote them and checks that exactly `length` bytes were
// consumed.
layernorm::layernorm(const void* data, size_t length)
{
    const char *d = reinterpret_cast<const char *>(data), *a = d;
    Tn::read(d, mInputSize);
    Tn::read(d,Length);

    assert(d == a + length);
}

// Nothing to set up; always reports success.
int layernorm::initialize()
{
    return 0;
}

// Writes mInputSize followed by Length into `buffer`; the byte count must
// match getSerializationSize().
void layernorm::serialize(void* buffer) const
{
    char* d = static_cast<char*>(buffer), *a = d;
    Tn::write(d, mInputSize);
    Tn::write(d,Length);
    assert(d == a + getSerializationSize());
}

// Number of bytes produced by serialize().
size_t layernorm::getSerializationSize() const
{
    return sizeof(mInputSize) + sizeof(Length);
}

// Every output (the same shape is returned for any `index`) has the first
// input dimension followed by a single trailing 1.
Dims layernorm::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
{
    return Dims2{inputs[0].d[0],1};
}

// Stores the pointer only; the string itself is not copied.
void layernorm::setPluginNamespace(const char* pluginNamespace)
{
    mPluginNamespace = pluginNamespace;
}

const char* layernorm::getPluginNamespace() const
{
    return mPluginNamespace;
}

const char* layernorm::getPluginType() const
{
    return "layerNorm_trt";
}

const char* layernorm::getPluginVersion() const
{
    return "1";
}

// Outputs use the data type of the first input.
DataType layernorm::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
{
    return inputTypes[0];
}

void layernorm::destroy()
{
    delete this;
}

// Creates a new plugin carrying the same namespace pointer and sizes.
IPluginV2IOExt* layernorm::clone() const
{
    layernorm *ln = new layernorm();
    ln->setPluginNamespace(mPluginNamespace);
    ln->setInputSize(mInputSize,Length);
    return ln;
}

bool layernorm::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
{
    return false;
}

bool layernorm::canBroadcastInputAcrossBatch(int inputIndex) const
{
    return false;
}

// No cuDNN/cuBLAS/allocator resources are used by this plugin.
void layernorm::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
{}

// Derives the per-sample element count (product of all dimensions of input
// 0) and the length of the normalized axis (the last dimension of input 0).
void layernorm::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
{

    int size = 1;
    for(int i = 0 ; i < in[0].dims.nbDims ; i++)
    {
        size *= in[0].dims.d[i];
    }
    mInputSize = size;
    Length = in[0].dims.d[in[0].dims.nbDims - 1];
}

void layernorm::detachFromContext()
{}

// Folds `length` consecutive values into a running Welford accumulator and
// returns the updated accumulator (count, mean, sum of squared deviations).
__device__ welford welford_update(welford a, const float *currValue, int length)
{
    #pragma unroll
    for(int i = 0; i < length; i++){
        a.count += 1;
        float delta = currValue[i] - a.mean;
        a.mean += delta / a.count;
        float delta2 = currValue[i] - a.mean;
        a.M2 += delta * delta2;
    }
    return a;
}

// Computes, with Welford's online update over `l` consecutive values, the
// mean and sqrt(variance + 1e-5), where variance is the population variance
// (sum of squared deviations divided by the element count).  `count`, `m`
// and `s` allow continuing from a previous partial accumulation.
__device__ void mean_std(float* mean, float *std, const float *currValue,int l,int count = 0, float m = 0.0, float s = 0.0)
{
    #pragma unroll
    for(int i = 0; i < l; i++){
        count += 1;
        float delta = currValue[i] - m;
        m += delta / count;
        float delta2 = currValue[i] - m;
        s += delta * delta2;
    }
    *mean = m;
    *std = sqrt((s / count) + 1e-5);
}

// One thread per row: thread `idx` reduces the `l` values starting at
// in + idx * l and writes mean[idx] and std[idx].  Threads with idx >= size
// do nothing.
__global__ void lnCudaKer(const float *in, float *mean, float *std, int size,int l)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= size)
        return;
    mean_std(&mean[idx],&std[idx],in+idx*l,l);
}

// Launches lnCudaKer over batchSize * mInputSize / Length rows of inputs[0].
// The launch is issued on `stream` so that it is ordered with the other work
// queued on that stream by the caller.
void layernorm::forwardGpu(const float *const *inputs, float *mean, float *std, cudaStream_t stream, int batchSize)
{
    // Without a positive row length there is nothing to normalize, and the
    // division below would be undefined.
    if (Length <= 0)
        return;

    const int numElem = batchSize * mInputSize/Length;

    // A grid of zero blocks is not a valid launch configuration.
    if (numElem <= 0)
        return;

    const int numBlocks = (numElem + mThreadCount - 1) / mThreadCount;
    lnCudaKer<<<numBlocks, mThreadCount, 0, stream>>>
        (inputs[0], mean,std, numElem,Length);
}

// outputs[0] receives the means and outputs[1] the standard deviations.
int layernorm::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
{
    forwardGpu((const float *const *)inputs, (float*)outputs[0], (float*)outputs[1], stream, batchSize);
    return 0;
}

PluginFieldCollection layernormCreator::mFC{};
std::vector<PluginField> layernormCreator::mPluginAttributes;

// The plugin takes no creation-time attributes, so the field collection is
// published empty.
layernormCreator::layernormCreator()
{
    mPluginAttributes.clear();

    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

const char* layernormCreator::getPluginName() const
{
    return "layerNorm_trt";
}

const char* layernormCreator::getPluginVersion() const
{
    return "1";
}

const PluginFieldCollection* layernormCreator::getFieldNames()
{
    return &mFC;
}

// `name` and `fc` are ignored; the new plugin only inherits the creator's
// namespace.
IPluginV2IOExt* layernormCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
{
    layernorm* obj = new layernorm();
    obj->setPluginNamespace(mNamespace.c_str());

    return obj;
}

// Rebuilds a plugin from the bytes written by layernorm::serialize().
IPluginV2IOExt* layernormCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
{
    layernorm* obj = new layernorm(serialData, serialLength);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}


}


================================================
FILE: swin-transformer/semantic-segmentation/layerNorm.h
================================================
#ifndef LAYERNORM_H
#define LAYERNORM_H

#include <vector>
#include <string>
#include <iostream>
#include <NvInfer.h>
#include <memory>
#include <string.h>
#include <cstdint>
#include <stdlib.h>

using namespace std;

// Running state of Welford's online mean/variance algorithm:
// number of samples seen, current mean, and the accumulated sum of squared
// deviations from the mean (M2).  All fields start at zero.
struct welford
{
    int count{0};
    double mean{0.0};
    double M2{0.0};
};

namespace nvinfer1{

// TensorRT plugin that produces two outputs per input row (see enqueue in
// the implementation file: outputs[0] is the mean, outputs[1] the standard
// deviation).
class layernorm : public IPluginV2IOExt
{
public:
    layernorm();
    // Rebuilds the plugin from a buffer previously filled by serialize().
    layernorm(const void* data, size_t length);
    ~layernorm();

    // The plugin always exposes two outputs.
    int getNbOutputs() const override
    {
        return 2;
    }

    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;

    int initialize() override;

    virtual void terminate() override {};

    // No scratch memory is requested.
    virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }

    virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;

    virtual size_t getSerializationSize() const override;

    virtual void serialize(void* buffer) const override;

    // Only linear-format FP32 tensors are accepted, for inputs and outputs alike.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }

    void setPluginNamespace(const char* pluginNamespace) override;

    const char* getPluginNamespace() const override;

    const char* getPluginType() const override;

    const char* getPluginVersion() const override;

    void destroy() override;

    IPluginV2IOExt* clone() const override;


    DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const override;

    void attachToContext(
        cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;

    void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;

    void detachFromContext() override;

    // Sets the per-sample element count (s) and the row length (l) directly;
    // used by clone() to copy the configuration.
    void setInputSize(int s, int l) {
        mInputSize = s;
        Length = l;
    }


private:
    void forwardGpu(const float *const * inputs, float *mean, float *std, cudaStream_t stream, int batchSize = 1);

    // Threads per block used for the kernel launch.
    int mThreadCount = 256;
    // Element count of one input sample. Zero until configured, so that a
    // freshly constructed or cloned plugin never carries an indeterminate value.
    int mInputSize = 0;
    // Length of the rows that are reduced. Zero until configured.
    int Length = 0;
    Dims outputDims ;
    // Non-owning pointer set by setPluginNamespace(); starts as an empty
    // string so getPluginNamespace()/clone() are safe before it is set.
    const char* mPluginNamespace = "";
};

// Factory that TensorRT uses to create or deserialize layernorm plugins.
class layernormCreator : public IPluginCreator
{
    public:
        layernormCreator();

        ~layernormCreator() override = default;

        const char* getPluginName() const override;

        const char* getPluginVersion() const override;

        const PluginFieldCollection* getFieldNames() override;

        IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;

        IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

        // The namespace is copied into an owned std::string.
        void setPluginNamespace(const char* libNamespace) override
        {
            mNamespace = libNamespace;
        }

        const char* getPluginNamespace() const override
        {
            return mNamespace.c_str();
        }

    private:
        std::string mNamespace;
        static PluginFieldCollection mFC;
        static std::vector<PluginField> mPluginAttributes;

};

REGISTER_TENSORRT_PLUGIN(layernormCreator);
} // namespace nvinfer1

#endif // LAYERNORM_H


================================================
FILE: swin-transformer/semantic-segmentation/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//! Stream buffer that collects formatted text and, when synchronized or
//! destroyed with pending content, forwards it to an output stream prefixed
//! with a timestamp and a fixed prefix string.
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives prefix and message text
    //! \param prefix    text inserted before every message
    //! \param shouldLog when false, putOutput() does nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Move construction carries over the target stream, the prefix and the
    //! logging flag. All three must be initialized: putOutput() and the
    //! destructor read mPrefix and mShouldLog, and leaving the flag
    //! indeterminate would make that read undefined behavior.
    //! Text still buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Emits the timestamp (on std::cout) followed by prefix and buffered
    //! text (on the target stream), then clears the buffer and flushes the
    //! target stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Owns the LogStreamConsumerBuffer of a LogStreamConsumer.
//!  Being a base class, it is constructed before the std::ostream base of
//!  LogStreamConsumer, so the buffer already exists when its address is
//!  handed to std::ostream.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }

protected:
    //! Buffer that the derived stream writes into.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream that lets log messages be written with C++ stream syntax.
//!  The base classes must stay in the order LogStreamConsumerBase, std::ostream:
//!  LogStreamConsumerBase constructs the LogStreamConsumerBuffer first, and only
//!  then is the address of that buffer given to std::ostream. Swapping the bases
//!  would pass the address of a not yet constructed buffer to std::ostream.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a stream for messages of the given severity.
    //!  Output is produced only if severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the buffer owned by the base class
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a fresh stream with the same severity and logging flag as \p other.
    //!  The buffer is created anew from those settings rather than moved.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the buffer owned by the base class
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this stream produces output for a new threshold.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! Severities >= kINFO go to std::cout, everything else to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Short tag that is placed in front of every message of the given severity.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            return "[F] ";
        case Severity::kERROR:
            return "[E] ";
        case Severity::kWARNING:
            return "[W] ";
        case Severity::kINFO:
            return "[I] ";
        case Severity::kVERBOSE:
            return "[V] ";
        default:
            assert(0);
            return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Console logger for TensorRT tools and samples
//!
//! \details Two kinds of messages are supported:
//!
//! - Messages with a severity (verbose, info, warning, error, internal error/fatal),
//!   emitted through log()
//! - Test status lines (running/passed/failed/waived), emitted through the static
//!   reportTest*() helpers
//!
//! Routing all output through this class keeps verbosity control and formatting in
//! one place.
//!
//! TODO: The class currently derives from nvinfer1::ILogger for backwards
//! compatibility, which mixes messages from the TensorRT library with messages
//! from the sample. Once all users obtain the ILogger through
//! Logger::getTRTLogger(), the inheritance can be replaced by a member.

class Logger : public nvinfer1::ILogger
{
public:
    //! \param severity Initial reporting threshold (kWARNING by default).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief State of a test as printed by the reportTest*() helpers
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Access the nvinfer1::ILogger interface of this object
    //! \return A reference to this Logger, seen as nvinfer1::ILogger
    //!
    //! TODO Once every user registers the logger through this method, the
    //! inheritance from ILogger can be removed.
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of nvinfer1::ILogger::log()
    //!
    //! Writes "[TRT] " followed by the message and a newline to a
    //! LogStreamConsumer built for the given severity. Samples are not meant to
    //! call this directly.
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer sink(mReportableSeverity, severity);
        sink << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Change the reporting threshold
    //!
    //! \param severity New threshold; it is handed to every LogStreamConsumer
    //!                 created afterwards, which emits a message only if the
    //!                 message severity is <= this value.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle describing one test
    //!
    //! Instances can only be created through Logger::defineTest() and are then
    //! passed to Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Create the handle for a test
    //!
    //! \param[in] name The name of the test, e.g. "TensorRT.sample_googlenet"
    //!                 (dot-separated parts made of [A-Za-z0-9_], starting with
    //!                 "TensorRT")
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom for Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        const bool alreadyStarted = false;
        return TestAtom(alreadyStarted, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() that builds the command line from argc/argv
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Print the RUNNING line for a test and mark it as started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Print the final status line of a test.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result One of TestResult::kPASSED, TestResult::kFAILED,
    //!                   TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Report PASSED and return EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Report FAILED and return EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Report WAIVED and return EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Report PASSED or FAILED depending on \p pass and return the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        if (pass)
        {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Current reporting threshold.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief Tag placed in front of a log message of the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            return "[F] ";
        case Severity::kERROR:
            return "[E] ";
        case Severity::kWARNING:
            return "[W] ";
        case Severity::kINFO:
            return "[I] ";
        case Severity::kVERBOSE:
            return "[V] ";
        default:
            assert(0);
            return "";
        }
    }

    //!
    //! \brief Word printed in a test status line for the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING:
            return "RUNNING";
        case TestResult::kPASSED:
            return "PASSED";
        case TestResult::kFAILED:
            return "FAILED";
        case TestResult::kWAIVED:
            return "WAIVED";
        default:
            assert(0);
            return "";
        }
    }

    //!
    //! \brief Stream used for a severity: std::cout for kINFO and above, std::cerr otherwise
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief Print one test status line: "&&&& <RESULT> <name> # <cmdline>"
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief Join the argv values with single spaces into one string
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream joined;
        const char* separator = "";
        for (int i = 0; i < argc; i++)
        {
            // The separator is empty for the first argument only.
            joined << separator << argv[i];
            separator = " ";
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief Creates a LogStreamConsumer for a kVERBOSE message, using the logger's
//!        reportable severity as the threshold argument.
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Creates a LogStreamConsumer for a kINFO message, using the logger's
//!        reportable severity as the threshold argument.
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Creates a LogStreamConsumer for a kWARNING message, using the logger's
//!        reportable severity as the threshold argument.
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Creates a LogStreamConsumer for a kERROR message, using the logger's
//!        reportable severity as the threshold argument.
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Creates a LogStreamConsumer for a kINTERNAL_ERROR ("fatal") message, using the
//!        logger's reportable severity as the threshold argument.
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: swin-transformer/semantic-segmentation/main.cpp
================================================
#include <iostream>


using namespace std;


================================================
FILE: swin-transformer/semantic-segmentation/myhpp.h
================================================
#ifndef MYHPP_H
#define MYHPP_H

#include <assert.h>
#include <iostream>
#include<vector>
#include<map>
#define _USE_MATH_DEFINES
#include <math.h>
#include <cmath>
#include<string>
#include<fstream>
#include<streambuf>
#include<ctime>
#include<chrono>
#include<iomanip>
#include<cuda_runtime.h>
#include<opencv2/core/core.hpp>
#include<opencv2/imgproc/imgproc.hpp>
#include<opencv2/imgcodecs/imgcodecs.hpp>
#include<opencv2/dnn/dnn.hpp>
//#include <opencv2/highgui/highgui.hpp>
#include<stdio.h>
#include<cuda.h>
//#include <cudnn.h>
#include <cublas_v2.h>
#include<driver_types.h>
#include<NvInfer.h>
#include<NvInferPlugin.h>
#include<NvOnnxParser.h>
#include<NvOnnxConfig.h>
#include<cstdint>


#endif // MYHPP_H


================================================
FILE: swin-transformer/semantic-segmentation/trainsform.cpp
================================================
#include "common.hpp"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>

#define USE_FP32

static Logger gLogger;

const char *INPUT_BLOB_NAME = "data";
const char *OUTPUT_BLOB_NAME = "output";
static const int bs = 1;
static const int channels = 96;
static const int ch = 3;
static const int INPUT_H = 576;
static const int INPUT_W = 576;
static const int NUM_CLASSES = 15;
static const int outputSize = 576 * 576;
cudaStream_t m_cudaStream;
vector<void *> m_bindings;
IExecutionContext *m_context;

// Builds the segmentation network layer by layer from the weights stored at wtsPath and
// compiles it into an engine.
//
//   maxBatchSize  accepted for interface compatibility; NOTE(review): it is not used, the
//                 builder batch limit is hard-coded to 12 below — confirm whether that is intended.
//   builder       builder that owns the network definition and compiles the engine.
//   config        builder configuration (workspace size, optional FP16 flag).
//   dt            data type of the network input tensor.
//   wtsPath       path handed to loadWeights().
//
// Returns the compiled engine, or nullptr when the build fails (a message is printed either way).
// NOTE(review): loadWeights() is defined elsewhere; if it heap-allocates the weight blobs they
// are never released here — confirm.
ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt,std::string wtsPath)
{
    INetworkDefinition *network = builder->createNetworkV2(0U);
    assert(network);
    ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ch, INPUT_H, INPUT_W});
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wtsPath);

    // Patch embedding: convolution, flatten to (tokens, channels), layer norm.
    ITensor* conv1 = conv(network, weightMap, data, "backbone.patch_embed.proj", channels);
    ITensor* shuffle1 = shuffle_reshapeApermute(network, conv1, Dims2{channels, -1}, Permutation{1, 0}, true);
    ITensor *ln = m_layerNorm(network, weightMap, shuffle1, "backbone.patch_embed.norm");
    debug_print(ln, "ln");

    // layer0: two blocks at INPUT_H / 4, alternating the last argument between 0 and 3.
    // NOTE(review): 147 looks like INPUT_H / 4 = 144 rounded up to a multiple of 7 — confirm.
    ITensor *mask0 = trt_transform_imgMask(network, 147, 7, 3);
    ITensor *blk00 = blk(network, weightMap, ln, mask0, "backbone.layers.0.blocks.0", INPUT_H / 4, channels, 3, 7, 0);
    debug_print(blk00, "blk00");
    ITensor *blk01 = blk(network, weightMap, blk00, mask0, "backbone.layers.0.blocks.1", INPUT_H / 4, channels, 3, 7, 3);
    debug_print(blk01, "blk01");
    // The normalized stage output feeds the decode head; the un-normalized one is downsampled.
    ITensor* out0 = m_layerNorm(network, weightMap, blk01, "backbone.norm0");
    out0 = shuffle_reshapeApermute(network, out0, Dims3{INPUT_H / 4, INPUT_H / 4, channels}, Permutation{2, 0, 1}, true);
    ITensor *down_layer0 = downsample(network, weightMap, blk01, "backbone.layers.0.downsample", INPUT_H / 4);
    debug_print(down_layer0, "down_blk1");

    // layer1: two blocks at INPUT_H / 8 with twice the channels.
    ITensor *mask1 = trt_transform_imgMask(network, 77, 7, 3);
    ITensor *blk10 = blk(network, weightMap, down_layer0, mask1, "backbone.layers.1.blocks.0", INPUT_H / 8, channels * 2, 6, 7, 0);
    debug_print(blk10, "blk10");
    ITensor *blk11 = blk(network, weightMap, blk10, mask1, "backbone.layers.1.blocks.1", INPUT_H / 8, channels * 2, 6, 7, 3);
    debug_print(blk11, "blk11");
    ITensor* out1 = m_layerNorm(network, weightMap, blk11, "backbone.norm1");
    out1 = shuffle_reshapeApermute(network, out1, Dims3{INPUT_H / 8, INPUT_H / 8, channels * 2}, Permutation{2, 0, 1}, true);
    ITensor *down_layer1 = downsample(network, weightMap, blk11, "backbone.layers.1.downsample", INPUT_H / 8);
    debug_print(down_layer1, "down_layer1");

    // layer2: six blocks at INPUT_H / 16 with four times the channels.
    ITensor *mask2 = trt_transform_imgMask(network, 42, 7, 3);
    ITensor *blk20 = blk(network, weightMap, down_layer1, mask2, "backbone.layers.2.blocks.0", INPUT_H / 16, channels * 4, 12, 7, 0);
    debug_print(blk20, "blk20");
    ITensor *blk21 = blk(network, weightMap, blk20, mask2, "backbone.layers.2.blocks.1", INPUT_H / 16, channels * 4, 12, 7, 3);
    debug_print(blk21, "blk21");
    ITensor *blk22 = blk(network, weightMap, blk21, mask2, "backbone.layers.2.blocks.2", INPUT_H / 16,channels * 4, 12, 7, 0);
    debug_print(blk22, "blk22");
    ITensor *blk23 = blk(network, weightMap, blk22, mask2, "backbone.layers.2.blocks.3", INPUT_H / 16, channels * 4, 12, 7, 3);
    debug_print(blk23, "blk23");
    ITensor *blk24 = blk(network, weightMap, blk23, mask2, "backbone.layers.2.blocks.4", INPUT_H / 16, channels * 4, 12, 7, 0);
    debug_print(blk24, "blk24");
    ITensor *blk25 = blk(network, weightMap, blk24, mask2, "backbone.layers.2.blocks.5", INPUT_H / 16, channels * 4, 12, 7, 3);
    debug_print(blk25, "blk25");
    ITensor* out2 = m_layerNorm(network, weightMap, blk25, "backbone.norm2");
    out2 = shuffle_reshapeApermute(network, out2, Dims3{INPUT_H / 16, INPUT_H / 16, channels * 4}, Permutation{2, 0, 1}, true);
    ITensor *down_layer2 = downsample(network, weightMap, blk25, "backbone.layers.2.downsample", INPUT_H / 16);
    debug_print(down_layer2, "down_layer2");

    // layer3: two blocks at INPUT_H / 32 with eight times the channels.
    ITensor *mask3 = trt_transform_imgMask(network, 21, 7, 3);
    ITensor *blk30 = blk(network, weightMap, down_layer2, mask3, "backbone.layers.3.blocks.0", INPUT_H / 32, channels * 8, 24, 7, 0);
    debug_print(blk30, "blk30");
    ITensor *blk31 = blk(network, weightMap, blk30, mask3, "backbone.layers.3.blocks.1", INPUT_H / 32, channels * 8, 24, 7, 3);
    debug_print(blk31, "blk31");
    ITensor* out3 = m_layerNorm(network, weightMap, blk31, "backbone.norm3");
    out3 = shuffle_reshapeApermute(network, out3, Dims3{INPUT_H / 32, INPUT_H / 32, channels * 8}, Permutation{2, 0, 1}, true);

    // Decode head: lateral convolutions on the three shallower stage outputs ...
    out0 = transform_lateral_conv(network, weightMap, out0, "decode_head.lateral_convs.0");  // 512,INPUT_H/4,INPUT_H/4
    out1 = transform_lateral_conv(network, weightMap, out1, "decode_head.lateral_convs.1");  // 512,INPUT_H/8,INPUT_H/8
    out2 = transform_lateral_conv(network, weightMap, out2, "decode_head.lateral_convs.2");  // 512,INPUT_H/16,INPUT_H/16
    // ... and four PSP branches on the deepest one, concatenated with it along axis 0.
    auto psp_out_0 = transform_psp(network, weightMap, out3, "decode_head.psp_modules.0.1", 1);
    auto psp_out_1 = transform_psp(network, weightMap, out3, "decode_head.psp_modules.1.1", 2);
    auto psp_out_2 = transform_psp(network, weightMap, out3, "decode_head.psp_modules.2.1", 3);
    auto psp_out_3 = transform_psp(network, weightMap, out3, "decode_head.psp_modules.3.1", 6);
    ITensor* psp_outs[5] = {out3, psp_out_0, psp_out_1, psp_out_2, psp_out_3};
    auto PSP_outs = network->addConcatenation(psp_outs, 5);
    PSP_outs->setAxis(0);
    debug_print(PSP_outs->getOutput(0), "PSP_outs");
    out3 = transform_lateral_conv(network, weightMap, PSP_outs->getOutput(0), "decode_head.bottleneck", 3, 1, 512);  // 512,INPUT_H/32,INPUT_H/32
    debug_print(out3, "out3");

    // Top-down path: each level is combined with the next shallower lateral via up_Add.
    auto laterals2 = up_Add(network, out3, out2);
    auto laterals1 = up_Add(network, laterals2, out1);
    auto laterals0 = up_Add(network, laterals1, out0);
    auto fpn0 = transform_lateral_conv(network, weightMap, laterals0, "decode_head.fpn_convs.0", 3, 1, 512);
    auto fpn1 = transform_lateral_conv(network, weightMap, laterals1, "decode_head.fpn_convs.1", 3, 1, 512);
    auto fpn2 = transform_lateral_conv(network, weightMap, laterals2, "decode_head.fpn_convs.2", 3, 1, 512);
    // Bring every level to fpn0's size (dimension 1) before concatenating.
    fpn1 = resize(network, fpn1,fpn0->getDimensions().d[1]);
    fpn2 = resize(network, fpn2,fpn0->getDimensions().d[1]);
    auto fpn3 = resize(network, out3, fpn0->getDimensions().d[1]);
    ITensor* fpn_outs[4] = {fpn0, fpn1, fpn2, fpn3};
    auto FPN_outs = network->addConcatenation(fpn_outs, 4);
    FPN_outs->setAxis(0);
    debug_print(FPN_outs->getOutput(0), "FPN_outs");
    auto fpn_output = transform_lateral_conv(network, weightMap, FPN_outs->getOutput(0), "decode_head.fpn_bottleneck", 3, 1, 512);
    debug_print(fpn_output, "fpn_output");

    // Per-pixel classifier, resized to the input size, then TopK(k=1) over axis 0;
    // output index 1 of the TopK layer is taken as the network output.
    auto seg = network->addConvolutionNd(*fpn_output, NUM_CLASSES, Dims2{1, 1}, weightMap["decode_head.conv_seg.weight"], weightMap["decode_head.conv_seg.bias"]);
    seg->setStrideNd(Dims2{1, 1});
    debug_print(seg->getOutput(0), "seg");
    auto seg_resize = resize(network, seg->getOutput(0), INPUT_H);
    debug_print(seg_resize, "seg_resize");
    auto output = network->addTopK(*seg_resize, TopKOperation::kMAX, 1, 0X01)->getOutput(1);
    debug_print(output, "output");

    std::cout << "set name out" << std::endl;
    output->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*output);
    builder->setMaxBatchSize(12);
    config->setMaxWorkspaceSize((1 << 30)); // 1G
#ifdef USE_FP16
    std::cout<< "use fp16"<<std::endl;
    config->setFlag(BuilderFlag::kFP16);
#endif
    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
    // Only claim success when the builder actually produced an engine.
    if (engine != nullptr)
    {
        std::cout << "build success!" << std::endl;
    }
    else
    {
        std::cerr << "build engine failed!" << std::endl;
    }
    network->destroy();

    return engine;
}

// Builds the engine from the weights at wtsPath and stores its serialized form in *modelStream.
//
//   maxBatchSize  forwarded to createEngine().
//   modelStream   out-parameter; receives the serialized engine, which the caller must destroy().
//   wtsPath       forwarded to createEngine().
//
// Builder, builder config and engine are all released before returning; only the serialized
// blob outlives the call.
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream,std::string wtsPath)
{
    assert(modelStream != nullptr);
    IBuilder *builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig *config = builder->createBuilderConfig();
    assert(config != nullptr);
    ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath);
    assert(engine != nullptr);
    (*modelStream) = engine->serialize();
    engine->destroy();
    // The config was previously leaked; release it before the builder that created it.
    config->destroy();
    builder->destroy();
}

void createEng(std::string wtsPath, std::string engine_name)
{
    char *trtModelStream{nullptr};
    size_t size{0};

    IHostMemory *modelStream{nullptr};
    APIToModel(bs, &modelStream, wtsPath);
    assert(modelStream != nullptr);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p)
    {
        std::cerr << "could not open plan output file" << std::endl;
        return;
    }
    p.write(reinterpret_cast<const char *>(modelStream->data()), modelStream->size());
    modelStream->destroy();
    std::ifstream file(engine_name, std::ios::binary);
    if (file.good())
    {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }
}

// Loads a serialized engine from ENGPath and prepares the global inference state
// (m_context, m_cudaStream, m_bindings).
//
//   ENGPath   path of the serialized engine file.
//   m_engine  receives the deserialized engine (nullptr on failure). It is taken by reference so
//             the caller actually gets the pointer and can destroy it; with the previous by-value
//             parameter the caller's variable was never written.
//
// On any failure a message is printed and the function returns early, leaving the remaining
// globals untouched.
void inference_init(string ENGPath,ICudaEngine *&m_engine)
{
    m_engine = nullptr;

    ifstream cache(ENGPath, ios::binary);
    if (!cache)
    {
        cout << "could not open engine file: " << ENGPath << endl;
        return;
    }
    cache.seekg(0, ios::end);
    const std::streamoff engSize = cache.tellg();
    cache.seekg(0, ios::beg);
    if (engSize <= 0)
    {
        cout << "engine file is empty or unreadable: " << ENGPath << endl;
        return;
    }
    // A vector releases the staging buffer on every return path.
    vector<char> modelMem(static_cast<size_t>(engSize));
    cache.read(modelMem.data(), engSize);
    if (!cache)
    {
        cout << "could not read engine file: " << ENGPath << endl;
        return;
    }
    cache.close();

    IRuntime *runtime = nvinfer1::createInferRuntime(gLogger);
    if (!runtime)
    {
        cout << "create runtime error!" << endl;
        return;
    }
    m_engine = runtime->deserializeCudaEngine(modelMem.data(), modelMem.size());
    runtime->destroy();
    if (!m_engine) {
        cout << "deserialize eng error!" << endl;
        return;
    }
    m_context = m_engine->createExecutionContext();
    if (!m_context) {
        cout << "create execution context error!" << endl;
        return;
    }
    if (cudaStreamCreate(&m_cudaStream) != 0) return;
    int bindings = m_engine->getNbBindings();
    if (bindings < 2)
    {
        cout << "Error! the network have one input and one output at least!" << endl;
        return;
    }
    cout << "1111111111111" << endl;
    m_bindings.resize(bindings, nullptr);
    // Binding 0 holds the float input image batch, binding 1 the 4-byte-per-pixel output.
    CHECK(cudaMalloc(&m_bindings.at(0), bs * ch * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&m_bindings.at(1), bs * outputSize * 4));
}

void doInference(const float *input, int *output)
{
    cout << "do infer:" << endl;
    CHECK(cudaMemcpyAsync(m_bindings.at(0), input, bs * ch * INPUT_H * INPUT_W * sizeof(float),
                          cudaMemcpyHostToDevice, m_cudaStream));

    m_context->enqueue(bs, m_bindings.data(), m_cudaStream, nullptr);

    CHECK(cudaMemcpyAsync(output, m_bindings.at(1), bs * outputSize * 4,
                          cudaMemcpyDeviceToHost, m_cudaStream));

    cudaStreamSynchronize(m_cudaStream);
}


int main(int argc, char** argv)
{
    cout << "begin" << endl;
    //string wts = "G:/shaj/trainsform/ktn5n6_29.511.21.8.wts";
    //string eng = "G:/shaj/trainsform/trainsform.eng";
    std::string argv1 = argv[1];
    if (argv1 == "-s") {
        string wts = argv[2];
        string eng = argv[3];
        createEng(wts,eng);
    } else {
        string eng = argv[2];

        ICudaEngine *m_engine;

        inference_init(eng,m_engine);

        vector<cv::Mat> testVal;
        map<string,cv::Mat> dataProb;
        vector<string> imgs;
        cv::Mat img;
        //string pattern_dir = "G:/shaj/trainsform";
        string pattern_dir = argv[3];
        string pattern = pattern_dir+ "/*.bmp";
        vector<cv::String> images_names;
        cv::glob(pattern, images_names, false);
        int i = 0;
        cv::Scalar Mean = cv::Scalar(123.675, 116.28, 103.53);
        cv::Scalar Std = cv::Scalar(58.395, 57.12, 57.375);
        cv::Size size = {INPUT_H,INPUT_W};

        for (auto image_name: images_names)
        {
            if (i < bs)
            {
                cv::Mat Img = cv::imread(image_name, 1);

                testVal.push_back(Img);
                cout << image_name << endl;
                imgs.push_back(image_name);
            }
        }
        float *data = new float[bs * ch * INPUT_H * INPUT_W];
        int *output = new int[bs * outputSize];

        cv::Mat Transed_t = BlobFromImages(testVal, cv::Size{INPUT_H, INPUT_W}, Mean, Std, true, false);

        memcpy(data, Transed_t.data, bs * ch * INPUT_H * INPUT_W * sizeof(float));

        //for(int i = 0 ; i< 20; i++){

        auto start_time = std::chrono::system_clock::now();
        doInference(data, output);

        auto end_time = std::chrono::system_clock::now();
        float duration;
        duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
        cout << "time:" << duration << endl;
        //}
        //    for(int i = 0; i < 100; i++)
        //        cout<<i<<":"<<output[i]<<endl;

        int n = 0;
        int *out = new int[outputSize];
        string outPath = pattern_dir + "/output";
        for (int i = 0; i < testVal.size(); i++)
        {
            cv::Mat img = cv::imread(imgs[i], 1);
            cv::Mat dst;
            cv::resize(img,dst,cv::Size{INPUT_H, INPUT_W});
            //string outPath_n = outPath + "/"+to_string(n) + ".jpg";
            n += 1;
            out = output + i * outputSize;
            for (int i = 0; i < outputSize; i++)
            {
                if (out[i] != 0)
                {
                    int w = i % (INPUT_H);
                    int h = i / (INPUT_W);
                    dst.at<cv::Vec3b>(h, w)[0] =  out[i] * 10;
                    dst.at<cv::Vec3b>(h, w)[1] =  out[i] * 30;
                    dst.at<cv::Vec3b>(h, w)[2] =  out[i] * 40;
                }
            }
            //cout<<outPath_n<<endl;
            string outPath_result = imgs[i].replace(0, pattern_dir.size(), outPath);
            cout << outPath_result << endl;
            cv::imwrite(outPath_result, dst);
        }
        testVal.clear();
        imgs.clear();
    }

    m_context->destroy();
    m_engine->destroy();
    for (auto bindings: m_bindings) {
        cudaFree(bindings);
    }
    cudaFree(m_cudaStream);

    cout << "swin_transform" << endl;
    return 0;
}


================================================
FILE: swin-transformer/semantic-segmentation/utilsn.h
================================================
#ifndef UTILSN_H
#define UTILSN_H

#include <algorithm>
#include <cstring>
#include <iostream>
#include <vector>
#include <cudnn.h>
#include <NvInfer.h>
#include "myhpp.h"
using namespace std;

#ifndef CUDA_CHECK

// Evaluates `callstr` once and, if the result is not cudaSuccess, prints the numeric error code
// with the current file and line to std::cerr and then hits assert(0).
// The macro expands to a plain brace block, so it also works without a trailing semicolon.
// When assertions are compiled out, execution continues after the message is printed.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

#endif

namespace Tn
{
    class Profiler : public nvinfer1::IProfiler
    {
    public:
        void printLayerTimes(int itrationsTimes)
        {
            float totalTime = 0;
            for (size_t i = 0; i < mProfile.size(); i++)
            {
                printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes);
                totalTime += mProfile[i].second;
            }
            printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes);
        }
    private:
        typedef std::pair<std::string, float> Record;
        std::vector<Record> mProfile;

        virtual void reportLayerTime(const char* layerName, float ms)
        {
            auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
            if (record == mProfile.end())
               { mProfile.push_back(std::make_pair(layerName, ms));}
            else
                record->second += ms;
        }
    };

    // Copies the bytes of val to buffer and advances buffer by sizeof(T).
    // The caller must make sure at least sizeof(T) bytes are writable at buffer.
    // memcpy is used instead of storing through a reinterpret_cast'ed pointer because buffer
    // may sit at any offset of a serialized blob and is not guaranteed to be aligned for T.
    template<typename T>
    void write(char*& buffer, const T& val)
    {
        std::memcpy(buffer, &val, sizeof(T));
        buffer += sizeof(T);
    }

    // Copies sizeof(T) bytes from buffer into val and advances buffer by sizeof(T).
    // The caller must make sure at least sizeof(T) bytes are readable at buffer.
    // memcpy is used instead of loading through a reinterpret_cast'ed pointer because buffer
    // may sit at any offset of a serialized blob and is not guaranteed to be aligned for T.
    template<typename T>
    void read(const char*& buffer, T& val)
    {
        std::memcpy(&val, buffer, sizeof(T));
        buffer += sizeof(T);
    }

//    void* copyToDevice(const void* data, size_t count)
//    {
//        void* deviceData;
//        cudaMalloc(&deviceData, count);
//        cudaMemcpy(deviceData, data, count, cudaMemcpyHostToDevice);
//        return deviceData;
//    }
//    void deserializeToDevice(const char*& hostBuffer, void*& deviceWeights, size_t size)
//    {
//        deviceWeights = copyToDevice(hostBuffer, size);
//        hostBuffer += size;
//    }
//    size_t type2size(nvinfer1::DataType type) { return sizeof(float); }
//    void convertAndCopyToBuffer(char*& buffer, const nvinfer1::Weights& weights)
//    {
//        memcpy(buffer, weights.values, weights.count * type2size(weights.type));
//        buffer += weights.count * type2size(weights.type);
//    }


}

#endif // UTILSN_H


================================================
FILE: tsm/CMakeLists.txt
================================================
# Build script for the TSM sample: compiles tsm_r50.cpp into the `tsm_r50` executable and links
# it against nvinfer and cudart.
cmake_minimum_required(VERSION 2.6)

project(TSM)

# C++11 is requested twice: as a raw compiler flag here and via CMAKE_CXX_STANDARD below.
add_definitions(-std=c++11)

# NOTE(review): option() is documented as option(<variable> "<help text>" [value]); as written,
# "OFF" sits in the help-text position — confirm the intended default.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is forced to Debug for every configuration.
set(CMAKE_BUILD_TYPE Debug)

include_directories(${PROJECT_SOURCE_DIR}/include)
# Include and link directories of CUDA and TensorRT. These are absolute paths; adapt them if
# your installation lives elsewhere.
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)

# tensorrt
include_directories(/home/ubuntu/TensorRT/include/)
link_directories(/home/ubuntu/TensorRT/lib/)

add_executable(tsm_r50 ${PROJECT_SOURCE_DIR}/tsm_r50.cpp)
target_link_libraries(tsm_r50 nvinfer)
target_link_libraries(tsm_r50 cudart)

# Optimisation and threading flags, also passed through add_definitions.
add_definitions(-O2 -pthread)


================================================
FILE: tsm/README.md
================================================
# Temporal Shift Module

TSM-R50 from "TSM: Temporal Shift Module for Efficient Video Understanding" <https://arxiv.org/abs/1811.08383>

TSM is a widely used Action Recognition model. This TensorRT implementation is tested with TensorRT 5.1 and TensorRT 7.2.

For the PyTorch implementation, you can refer to [open-mmlab/mmaction2](https://github.com/open-mmlab/mmaction2) or [mit-han-lab/temporal-shift-module](https://github.com/mit-han-lab/temporal-shift-module).

More details about the shift module (which is the core of TSM) can be found in [test_shift.py](./test_shift.py).

## Tutorial

+ For an end-to-end example, see [demo.sh](./demo.sh)
  + Requirements: Successfully installed `torch>=1.3.0, torchvision`

+ Step 1: Train/Download TSM-R50 checkpoints from the [official GitHub repo](https://github.com/mit-han-lab/temporal-shift-module) or [MMAction2](https://github.com/open-mmlab/mmaction2)
  + Supported settings: `num_segments`, `shift_div`, `num_classes`.
  + Fixed settings: `backbone`(ResNet50), `shift_place`(blockres), `temporal_pool`(False).

+ Step 2: Convert PyTorch checkpoints to TensorRT weights.

```shell
python gen_wts.py /path/to/pytorch.pth --out-filename /path/to/tensorrt.wts
```

+ Step 3: Test Python API.
  + Modify configs in `tsm_r50.py`.
  + Inference with `tsm_r50.py`.

```python
# Supported settings
BATCH_SIZE = 1
NUM_SEGMENTS = 8
INPUT_H = 224
INPUT_W = 224
OUTPUT_SIZE = 400
SHIFT_DIV = 8
```

```shell
usage: tsm_r50.py [-h] [--tensorrt-weights TENSORRT_WEIGHTS] [--input-video INPUT_VIDEO] [--save-engine-path SAVE_ENGINE_PATH] [--load-engine-path LOAD_ENGINE_PATH] [--test-mmaction2] [--mmaction2-config MMACTION2_CONFIG] [--mmaction2-checkpoint MMACTION2_CHECKPOINT] [--test-cpp] [--cpp-result-path CPP_RESULT_PATH]

optional arguments:
  -h, --help            show this help message and exit
  --tensorrt-weights TENSORRT_WEIGHTS
                        Path to TensorRT weights, which is generated by gen_weights.py
  --input-video INPUT_VIDEO
                        Path to local video file
  --save-engine-path SAVE_ENGINE_PATH
                        Save engine to local file
  --load-engine-path LOAD_ENGINE_PATH
                        Saved engine file path
  --test-mmaction2      Compare TensorRT results with MMAction2 Results
  --mmaction2-config MMACTION2_CONFIG
                        Path to MMAction2 config file
  --mmaction2-checkpoint MMACTION2_CHECKPOINT
                        Path to MMAction2 checkpoint url or file path
  --test-cpp            Compare Python API results with C++ API results
  --cpp-result-path CPP_RESULT_PATH
                        Path to C++ API results
```

+ Step 4: Test C++ API.
  + Modify the configs in `tsm_r50.cpp`.
  + Build from source code: `mkdir build && cd build && cmake .. && make`
  + Generate Engine file: `./tsm_r50 -s`
  + Run inference with the generated engine file and write the predictions to a local file: `./tsm_r50 -d`
  + Compare results with Python API: `python tsm_r50.py --tensorrt-weights /path/to/tensorrt.weights --test-cpp --cpp-result-path /path/to/cpp-result.txt`

## TODO

+ [x] Python Shift module.
+ [x] Generate wts of official tsm and mmaction2 tsm.
+ [x] Python API Definition
+ [x] Test with mmaction2 demo
+ [x] Tutorial
+ [x] C++ API Definition


================================================
FILE: tsm/demo.sh
================================================
# Step 1: Get checkpoints from mmaction2
# https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsm
wget https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth

# Step 2: Convert pytorch checkpoints to TensorRT weights
python gen_wts.py tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth --out-filename ./tsm_r50_kinetics400_mmaction2.wts

# Step 3: Test Python API.
# 3.1 Skip this step since we use default settings.
# 3.2 Inference
# 3.2.1 Save local engine file to `./tsm_r50_kinetics400_mmaction2.trt`.
python tsm_r50.py \
    --tensorrt-weights ./tsm_r50_kinetics400_mmaction2.wts \
    --save-engine-path ./tsm_r50_kinetics400_mmaction2.trt

# 3.2.2 Predict the recognition result using a single video `demo.mp4`.
#       Should print `Result class id 6`, aka `arm wrestling`
# Download demo video
wget https://raw.githubusercontent.com/open-mmlab/mmaction2/master/demo/demo.mp4
# # use *.wts as input
# python tsm_r50.py --tensorrt-weights ./tsm_r50_kinetics400_mmaction2.wts \
#     --input-video ./demo.mp4
# use engine file as input
python tsm_r50.py --load-engine-path ./tsm_r50_kinetics400_mmaction2.trt \
    --input-video ./demo.mp4

# 3.2.3 Optional: Compare inference result with MMAction2 TSM-R50 model
#       Have to install MMAction2 First, please refer to https://github.com/open-mmlab/mmaction2/blob/master/docs/install.md
# pip3 install pytest-runner
# pip3 install mmcv
# pip3 install mmaction2
# # use *.wts as input
# python tsm_r50.py \
#     --tensorrt-weights ./tsm_r50_kinetics400_mmaction2.wts \
#     --test-mmaction2 \
#     --mmaction2-config mmaction2_tsm_r50_config.py \
#     --mmaction2-checkpoint tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth
# # use TensorRT engine as input
# python tsm_r50.py \
#     --load-engine-path ./tsm_r50_kinetics400_mmaction2.trt \
#     --test-mmaction2 \
#     --mmaction2-config mmaction2_tsm_r50_config.py \
#     --mmaction2-checkpoint tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth

# Step 4: Test C++ API.
# 4.1 Skip this step since we use default settings.
# 4.2 Build CPP
mkdir build && cd build && cmake .. && make
# 4.3 Generate Engine file
./tsm_r50 -s
# 4.4 Get Predictions
./tsm_r50 -d
# 4.5 Compare C++ Results with Python Results
cd ..
python tsm_r50.py --test-cpp --tensorrt-weights ./tsm_r50_kinetics400_mmaction2.wts


================================================
FILE: tsm/gen_wts.py
================================================
import argparse
import struct

import torch
import numpy as np


def write_one_weight(writer, name, weight):
    """Write one tensor as a single text line: ``<name> <count> <hex> <hex> ...``.

    Args:
        writer: Object with a ``write(str)`` method (e.g. an open text file).
        name: Name written at the start of the line.
        weight: ``np.ndarray`` of any shape; it is flattened before writing.

    Each element is converted to a Python float, packed as a big-endian
    32-bit float and written as 8 hexadecimal digits.
    """
    assert isinstance(weight, np.ndarray)
    flat = weight.ravel()
    writer.write(f"{name} {flat.size}")
    for element in flat:
        # Every value is preceded by a single space separator.
        writer.write(' ')
        packed = struct.pack('>f', float(element))
        writer.write(packed.hex())
    writer.write('\n')


def convert_name(name):
    """Rewrite a checkpoint parameter name by applying a fixed list of substring
    replacements, in order, and return the result.

    The order matters: later rules operate on the output of earlier ones
    (e.g. ``.conv.`` is collapsed before ``downsample.weight`` is expanded).
    """
    rewrites = (
        ("module.", ""),
        ("base_model.", ""),
        ("net.", ""),
        ("new_fc", "fc"),
        ("backbone.", ""),
        ("cls_head.fc_cls", "fc"),
        (".conv.", "."),
        ("conv1.bn", "bn1"),
        ("conv2.bn", "bn2"),
        ("conv3.bn", "bn3"),
        ("downsample.bn", "downsample.1"),
        ("downsample.weight", "downsample.0.weight"),
    )
    converted = name
    for old, new in rewrites:
        converted = converted.replace(old, new)
    return converted


def main(args):
    """Convert a PyTorch checkpoint into a text weights file.

    Args:
        args: Namespace with ``checkpoint`` (input path) and ``out_filename``
            (output path).

    The output starts with a line holding the number of tensors, followed by
    one line per tensor as produced by ``write_one_weight``. Entries whose key
    contains ``num_batches_tracked`` are skipped.
    """
    # map_location keeps checkpoints that were saved from a GPU loadable on CPU-only machines.
    ckpt = torch.load(args.checkpoint, map_location='cpu')
    # Accept both a wrapped checkpoint ({'state_dict': ...}) and a bare state dict.
    if 'state_dict' in ckpt:
        ckpt = ckpt['state_dict']
    ckpt = {k: v for k, v in ckpt.items() if 'num_batches_tracked' not in k}
    with open(args.out_filename, "w") as f:
        f.write(f"{len(ckpt)}\n")
        for k, v in ckpt.items():
            key = convert_name(k)
            write_one_weight(f, key, v.detach().cpu().numpy())


if __name__ == '__main__':
    # Command line: one positional checkpoint path plus an optional output file name.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", type=str, help="Path to checkpoint file")
    parser.add_argument("--out-filename",
                        type=str,
                        default="tsm_r50.wts",
                        help="Path to converted weights file")
    args = parser.parse_args()
    main(args)


================================================
FILE: tsm/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized or destroyed, forwards its accumulated text to an
//!  output stream, preceded by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the formatted output
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() does nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Takes over the target stream, prefix and logging flag of \p other. Previously only the
    //! stream was copied, leaving mShouldLog uninitialized and mPrefix empty.
    //! Buffered text is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the target stream, empties the buffer and
    //! flushes the stream. Does nothing (and keeps the buffered text) when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so the two cannot
            // end up on different streams (e.g. timestamp on stdout, message on stderr)
            std::time_t timestamp = std::time(nullptr);
            // NOTE: std::localtime returns a pointer to shared static storage
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                // the fill character is sticky, so remember it and restore it afterwards
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
public:
    //! \brief Constructs the owned stream buffer, forwarding all three arguments to it unchanged.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    //! Buffer made available to derived classes; it is constructed as part of this base sub-object.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Builds a consumer for messages of level \p severity.
    //!  Output is enabled only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the buffer owned by the first base class to this stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the same severity and enabled state as \p other.
    //!  A fresh buffer is constructed; text already buffered in \p other is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the buffer owned by the first base class to this stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer should log against a new reportable severity
    //!  and passes the result on to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! \brief Selects std::cout for severities >= kINFO and std::cerr for everything else.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter tag; asserts on an unknown value.
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger with the given reportable severity (kWARNING by default)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! A null \p msg is logged as an empty message.
    //!
    void log(Severity severity, const char* msg) override
    {
        // Constructing std::string from a null pointer is undefined behaviour, so substitute an empty string.
        LogStreamConsumer(mReportableSeverity, severity)
            << "[TRT] " << std::string(msg != nullptr ? msg : "") << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set to true by Logger::reportTestStart()
        std::string mName;    //!< Test name printed in result lines
        std::string mCmdline; //!< Command line printed after the test name
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Check the precondition before printing anything, so that a repeated start is caught
        // before a second RUNNING line is emitted.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently stored in this logger
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints one line of the form "&&&& <RESULT> <name> # <cmdline>" to the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief Returns a LogStreamConsumer for messages of severity kVERBOSE, using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Returns a LogStreamConsumer for messages of severity kINFO, using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Returns a LogStreamConsumer for messages of severity kWARNING, using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Returns a LogStreamConsumer for messages of severity kERROR, using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Returns a LogStreamConsumer for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: tsm/mmaction2_tsm_r50_config.py
================================================
# Model settings: a 2D recognizer made of a ResNetTSM backbone and a TSM classification head.
model = {
    'type': 'Recognizer2D',
    'backbone': {
        'type': 'ResNetTSM',
        'pretrained': 'torchvision://resnet50',
        'depth': 50,
        'norm_eval': False,
        'shift_div': 8,
    },
    'cls_head': {
        'type': 'TSMHead',
        'num_classes': 400,
        'in_channels': 2048,
        'spatial_type': 'avg',
        'consensus': {'type': 'AvgConsensus', 'dim': 1},
        'dropout_ratio': 0.5,
        'init_std': 0.001,
        'is_shift': True,
    },
    # model training and testing settings
    'train_cfg': None,
    'test_cfg': {'average_clips': 'prob'},
}


================================================
FILE: tsm/test_shift.py
================================================
import numpy as np
import pycuda.autoinit  # noqa
import pycuda.driver as cuda
import tensorrt as trt
import torch
from numpy.testing import assert_array_almost_equal

# Names given to the network input tensor and to the tensor marked as network output.
INPUT_BLOB_NAME, OUTPUT_BLOB_NAME = 'input', 'output'


def shift_mit(x, num_segments, shift_div=8):
    """Temporal shift following the official TSM implementation.

    Code Reference: https://github.com/mit-han-lab/temporal-shift-module/blob/master/ops/temporal_shift.py # noqa
    Cannot convert to ONNX Model.

    Args:
        x: 4-D tensor of size (nt, c, h, w); it is viewed as
            (nt // num_segments, num_segments, c, h, w).
        num_segments: number of segments each group of frames is split into.
        shift_div: ``c // shift_div`` channels are shifted in each direction.

    Returns:
        A new tensor of size (nt, c, h, w).
    """
    nt, c, h, w = x.size()
    clips = x.view(nt // num_segments, num_segments, c, h, w)

    fold = c // shift_div
    first_fold = slice(0, fold)
    second_fold = slice(fold, 2 * fold)

    # Positions that receive nothing below keep the zeros they start with.
    shifted = torch.zeros_like(clips)
    # first `fold` channels: segment t takes the values of segment t + 1
    shifted[:, :-1, first_fold] = clips[:, 1:, first_fold]
    # next `fold` channels: segment t takes the values of segment t - 1
    shifted[:, 1:, second_fold] = clips[:, :-1, second_fold]
    # remaining channels are copied unchanged
    shifted[:, :, 2 * fold:] = clips[:, :, 2 * fold:]

    return shifted.view(nt, c, h, w)


def shift_mmaction2(x, num_segments, shift_div=8):
    """Temporal shift following the MMAction2 implementation.

    Code Reference: https://github.com/open-mmlab/mmaction2/blob/master/mmaction/models/backbones/resnet_tsm.py # noqa
    Could convert to ONNX Model.

    Args:
        x: 4-D tensor of size (n, c, h, w).
        num_segments: number of segments each group of frames is split into.
        shift_div: ``c // shift_div`` channels are shifted in each direction.

    Returns:
        A tensor of size (n, c, h, w).
    """
    n, c, h, w = x.size()

    # Work on [N // num_segments, num_segments, C, H*W]; the spatial dims are
    # flattened because a 5-D array can't be used on PPL2D backend for caffe.
    clips = x.view(-1, num_segments, c, h * w)
    fold = c // shift_div

    # Channel groups: first `fold`, next `fold`, and everything after.
    left = clips[:, :, :fold, :]
    mid = clips[:, :, fold:2 * fold, :]
    right = clips[:, :, 2 * fold:, :]

    # The zero padding is produced by subtraction rather than
    # torch.zeros / torch.zeros_like, because an array on caffe inference
    # must be got by computing.

    # `left`: drop the first segment and append one zero segment at the end.
    pad = (left - left)[:, :1, :, :]
    left = torch.cat((left[:, 1:, :, :], pad), 1)

    # `mid`: drop the last segment and prepend one zero segment.
    pad = (mid - mid)[:, :1, :, :]
    mid = torch.cat((pad, mid[:, :-1, :, :]), 1)

    # `right` is left untouched; stitch the channel groups back together and
    # restore the original [N, C, H, W] size.
    merged = torch.cat((left, mid, right), 2)
    return merged.view(n, c, h, w)


def _tensorrt_shift_module(network,
                           input,
                           num_segments=8,
                           shift_div=8,
                           input_shape=(16, 64, 32, 32)):
    """Temporal shift module implemented by TensorRT Network Definition API."""
    fold = input_shape[1] // shift_div
    batch_size = input_shape[0] // num_segments

    # reshape
    reshape = network.add_shuffle(input)
    assert reshape
    reshape.reshape_dims = (batch_size, num_segments) + tuple(input_shape[-3:])

    # left
    left_split = network.add_slice(reshape.get_output(0),
                                   start=(0, 1, 0, 0, 0),
                                   shape=(batch_size, num_segments - 1, fold,
                                          input_shape[2], input_shape[3]),
                                   stride=(1, 1, 1, 1, 1))
    assert left_split
    left_split_shape = (batch_size, 1, fold, input_shape[2], input_shape[3])
    left_blank = network.add_constant(shape=left_split_shape,
                                      weights=np.zeros(left_split_shape,
                                                       np.float32))
    assert left_blank
    left = network.add_concatenation(
        [left_split.get_output(0),
         left_blank.get_output(0)])
    assert left
    left.axis = 1

    # mid
    mid_split_shape = (batch_size, 1, fold, input_shape[2], input_shape[3])
    mid_blank = network.add_constant(shape=mid_split_shape,
                                     weights=np.zeros(mid_split_shape,
                                                      np.float32))
    assert mid_blank
    mid_split = network.add_slice(reshape.get_output(0),
                                  start=(0, 0, fold, 0, 0),
                                  shape=(batch_size, num_segments - 1, fold,
                                         input_shape[2], input_shape[3]),
                                  stride=(1, 1, 1, 1, 1))
    assert mid_split
    mid = network.add_concatenation(
        [mid_blank.get_output(0),
         mid_split.get_output(0)])
    assert mid
    mid.axis = 1

    # right
    right = network.add_slice(reshape.get_output(0),
                              start=(0, 0, 2 * fold, 0, 0),
                              shape=(batch_size, num_segments,
                                     input_shape[1] - 2 * fold, input_shape[2],
                                     input_shape[3]),
                              stride=(1, 1, 1, 1, 1))

    # concat
    concat = network.add_concatenation(
        [left.get_output(0),
         mid.get_output(0),
         right.get_output(0)])
    assert concat
    concat.axis = 2

    # reshape
    reshape2 = network.add_shuffle(concat.get_output(0))
    assert reshape2
    reshape2.reshape_dims = input_shape
    return reshape2


def shift_tensorrt(x, num_segments, shift_div, input_shape):
    """Test TensorRT temporal shift module.

    Builds a network that contains only the shift module, runs it once on
    ``x`` and returns the result.

    Args:
        x: ``np.ndarray`` whose flattened contents are copied into a float32
            host buffer of ``trt.volume(input_shape)`` elements.
        num_segments: forwarded to ``_tensorrt_shift_module``.
        shift_div: forwarded to ``_tensorrt_shift_module``.
        input_shape: shape of the network input and of the returned array.

    Returns:
        ``np.ndarray`` of shape ``input_shape`` holding the network output.
    """
    assert isinstance(x, np.ndarray)

    gLogger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(gLogger)
    config = builder.create_builder_config()

    # create engine
    explicit_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicit_flag)
    input = network.add_input(INPUT_BLOB_NAME, trt.float32, input_shape)
    assert input
    output = _tensorrt_shift_module(network,
                                    input,
                                    num_segments=num_segments,
                                    shift_div=shift_div,
                                    input_shape=input_shape)
    assert output

    # generate engine by builder/network/config
    output.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(output.get_output(0))
    builder.max_batch_size = 1
    builder.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)
    del network
    # Fail with a clear message if the build produced no engine, instead of
    # an AttributeError on the next line.
    assert engine is not None, 'failed to build the TensorRT engine'
    assert engine.num_bindings == 2, f'{engine.num_bindings}'
    context = engine.create_execution_context()
    assert context is not None, 'failed to create the execution context'

    # buffers: page-locked host memory plus matching device allocations
    host_in = cuda.pagelocked_empty(trt.volume(input_shape), dtype=np.float32)
    np.copyto(host_in, x.ravel())
    host_out = cuda.pagelocked_empty(trt.volume(input_shape), dtype=np.float32)
    device_in = cuda.mem_alloc(host_in.nbytes)
    device_out = cuda.mem_alloc(host_out.nbytes)
    bindings = [int(device_in), int(device_out)]
    stream = cuda.Stream()

    # do inference: copy in, execute, copy out, then wait for the stream
    cuda.memcpy_htod_async(device_in, host_in, stream)
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_out, device_out, stream)
    stream.synchronize()

    return np.array(host_out.reshape(*input_shape))


if __name__ == '__main__':
    # Problem size used for the comparison.
    INPUT_SHAPE = (16, 64, 32, 32)
    assert len(INPUT_SHAPE) == 4
    NUM_SEGMENTS = 8
    SHIFT_DIV = 8

    # Run the same random float32 input through all three implementations.
    inputs = np.random.rand(*INPUT_SHAPE).astype(np.float32)
    inputs_pytorch = torch.tensor(inputs)
    with torch.no_grad():
        rmit = shift_mit(inputs_pytorch, NUM_SEGMENTS, SHIFT_DIV).numpy()
        rmmaction2 = shift_mmaction2(inputs_pytorch, NUM_SEGMENTS,
                                     SHIFT_DIV).numpy()
    rtensorrt = shift_tensorrt(inputs, NUM_SEGMENTS, SHIFT_DIV, INPUT_SHAPE)

    # Each PyTorch reference must agree with the TensorRT output.
    for reference in (rmit, rmmaction2):
        assert_array_almost_equal(reference, rtensorrt)
    print("Tests PASSED")


================================================
FILE: tsm/tsm_r50.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cstring>

// Evaluates `status` once; if the result is non-zero, prints it to std::cerr and calls abort().
// The do { ... } while (0) wrapper makes the macro expand to a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// Height and width of the network input tensor.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// NOTE(review): presumably the number of output classes; its use is outside this part of the file, confirm.
static const int OUTPUT_SIZE = 400;
// First dimension of the input tensor; also passed to addShift() as the number of segments.
static const int NUM_SEGMENTS = 8;
// Passed to addShift(): channels / SHIFT_DIV channels are shifted in each direction.
static const int SHIFT_DIV = 8;

// Name given to the network input tensor.
const char* INPUT_BLOB_NAME = "data";
// NOTE(review): presumably the name of the network output tensor; confirm against its use further down.
const char* OUTPUT_BLOB_NAME = "prob";
// Weight file read by loadWeights() in createEngine().
const char* WEIGHTS_PATH = "../tsm_r50_kinetics400_mmaction2.wts";
// NOTE(review): presumably where the serialized engine and the results are written; confirm against their use.
const char* ENGINE_PATH = "./tsm_r50_kinetics400_mmaction2_cpp.trt";
const char* RESULT_PATH = "./result.txt";

using namespace nvinfer1;

// Logger instance shared by this file (class Logger comes from logging.h).
static Logger gLogger;

// Load weights from files shared with TensorRT samples.
// The file is whitespace-delimited text:
//   <number of blobs>
//   <name> <size (decimal)> <`size` 32-bit words in hex>      (repeated once per blob)
// Every blob is returned as a kFLOAT Weights entry keyed by its name. The value buffers are
// allocated with malloc() and are not freed by this function.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. Each element is one 32-bit word, so size the buffer by the element type
        // (sizeof(val) would be the size of a pointer and over-allocate).
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Prints `name` followed by the first four dimensions of `tensor` to std::cout.
// `name` is const so that string literals can be passed (C++11 forbids literal -> char*).
// Four entries are printed regardless of the tensor's rank.
void print(const char* name, ITensor* tensor) {
    Dims dim = tensor->getDimensions();
    std::cout << name << " " << dim.d[0] << " " << dim.d[1] << " " << dim.d[2] << " " << dim.d[3] <<std::endl;
}

// Adds the temporal shift layers for a 4-D input whose axis 0 holds `numSegments` entries.
// With fold = inputShape.d[1] / shiftDiv:
//   - channels [0, fold):        entries 1.. along axis 0, followed by one block of zeros
//   - channels [fold, 2*fold):   one block of zeros, followed by entries ..numSegments-2 along axis 0
//   - channels [2*fold, C):      passed through unchanged
// Returns the concatenation (along axis 1) of the three parts.
// The zero buffer is allocated with malloc(), shared by both constant layers and not freed here.
IConcatenationLayer* addShift(INetworkDefinition *network, ITensor& input, Dims4 inputShape, int numSegments, int shiftDiv) {
    int fold = int(inputShape.d[1] / shiftDiv);
    const auto height = inputShape.d[2];
    const auto width = inputShape.d[3];

    // One zero block of fold x H x W floats; sized by the element type
    // (sizeof(zeros) would be the size of a pointer and over-allocate).
    const auto zeroCount = fold * height * width;
    float* zeros = reinterpret_cast<float*>(malloc(sizeof(float) * zeroCount));
    memset(zeros, 0, sizeof(float) * zeroCount);
    Weights zeros_weights{DataType::kFLOAT, zeros, zeroCount};

    // left
    ISliceLayer* left1 = network->addSlice(input, Dims4{1, 0, 0, 0}, Dims4{numSegments - 1, fold, height, width}, Dims4{1, 1, 1, 1});
    IConstantLayer* left2 = network->addConstant(Dims4{1, fold, height, width}, zeros_weights);
    ITensor* tensorsLeft[] = {left1->getOutput(0), left2->getOutput(0)};
    IConcatenationLayer* left = network->addConcatenation(tensorsLeft, 2);
    left->setAxis(0);

    // mid
    IConstantLayer* mid1 = network->addConstant(Dims4{1, fold, height, width}, zeros_weights);
    ISliceLayer* mid2 = network->addSlice(input, Dims4{0, fold, 0, 0}, Dims4{numSegments - 1, fold, height, width}, Dims4{1, 1, 1, 1});
    ITensor* tensorsMid[] = {mid1->getOutput(0), mid2->getOutput(0)};
    IConcatenationLayer* mid = network->addConcatenation(tensorsMid, 2);
    mid->setAxis(0);

    // right
    ISliceLayer* right = network->addSlice(input, Dims4{0, 2 * fold, 0, 0}, Dims4{numSegments, inputShape.d[1] - 2 * fold, height, width}, Dims4{1, 1, 1, 1});

    // concatenate left/mid/right
    ITensor* tensors[] = {left->getOutput(0), mid->getOutput(0), right->getOutput(0)};
    IConcatenationLayer* concat = network->addConcatenation(tensors, 3);
    concat->setAxis(1);
    return concat;
}

// Adds a per-channel scale layer equivalent to batch normalization with fixed statistics:
//   scale = weight / sqrt(running_var + eps)
//   shift = bias - running_mean * weight / sqrt(running_var + eps)
//   power = 1
// The parameters are read from weightMap under lname + ".weight" / ".bias" / ".running_mean" /
// ".running_var"; the three computed arrays are malloc'ed and stored back into weightMap under
// lname + ".scale" / ".shift" / ".power".
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;

    float* scval = static_cast<float*>(malloc(sizeof(float) * len));
    float* shval = static_cast<float*>(malloc(sizeof(float) * len));
    float* pval = static_cast<float*>(malloc(sizeof(float) * len));

    // Fill all three per-channel arrays in a single pass.
    for (int ch = 0; ch < len; ch++) {
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn);
    return bn;
}

// Adds one residual bottleneck block preceded by a temporal shift:
//   main path: shift -> 1x1 conv (outch) -> BN -> ReLU -> 3x3 conv (outch, `stride`, padding 1) -> BN -> ReLU
//              -> 1x1 conv (outch * 4) -> BN
//   shortcut:  the un-shifted `input`, passed through a 1x1 conv (`stride`) + BN when
//              stride != 1 or inch != outch * 4
// The two paths are summed and followed by ReLU, whose layer is returned.
// `lname` is the weight-name prefix (e.g. "layer1.0."); `inputShape` is forwarded to addShift().
IActivationLayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname, Dims4 inputShape) {
    IConcatenationLayer* shift = addShift(network, input, inputShape, NUM_SEGMENTS, SHIFT_DIV);
    assert(shift);

    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv1 = network->addConvolution(*shift->getOutput(0), outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts);
    // checked like the other convolutions before it is dereferenced
    assert(conv1);

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5);

    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IConvolutionLayer* conv2 = network->addConvolution(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts);
    assert(conv2);
    conv2->setStride(DimsHW{stride, stride});
    conv2->setPadding(DimsHW{1, 1});

    IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);

    IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(relu2);

    IConvolutionLayer* conv3 = network->addConvolution(*relu2->getOutput(0), outch * 4, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], emptywts);
    assert(conv3);

    IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5);

    IElementWiseLayer* ew1;
    if (stride != 1 || inch != outch * 4) {
        // projection shortcut: bring the input to the main path's output shape
        IConvolutionLayer* conv4 = network->addConvolution(input, outch * 4, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts);
        assert(conv4);
        conv4->setStride(DimsHW{stride, stride});

        IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5);
        ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
    } else {
        // identity shortcut
        ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM);
    }
    assert(ew1);
    IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU);
    assert(relu3);
    return relu3;
}

// Create the engine using only the TensorRT network-definition API (no parser).
//
// The network is assembled layer by layer from the weights loaded from
// WEIGHTS_PATH: a 7x7 stem convolution + batch norm + ReLU + max pooling,
// 3/4/6/3 bottleneck blocks (layer1..layer4), an average pooling over the
// remaining feature map, a fully connected layer, an AVG reduce and a softmax
// whose output is named OUTPUT_BLOB_NAME and marked as the network output.
//
// maxBatchSize: forwarded to IBuilder::setMaxBatchSize.
// builder:      builder used to create the network and to build the engine.
// dt:           data type of the network input tensor.
// Returns the result of IBuilder::buildCudaEngine; it is not checked here
// (APIToModel asserts that it is non-null).
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, DataType dt)
{
    INetworkDefinition* network = builder->createNetwork();

    // Create input tensor of shape {NUM_SEGMENTS, 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{NUM_SEGMENTS, 3, INPUT_H, INPUT_W});
    assert(data);
    print("input", data);

    std::map<std::string, Weights> weightMap = loadWeights(WEIGHTS_PATH);
    // Empty weights object, passed as the bias of every bias-free convolution.
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // Stem: 7x7 convolution with 64 output maps, stride 2 and padding 3.
    IConvolutionLayer* conv1 = network->addConvolution(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStride(DimsHW{2, 2});
    conv1->setPadding(DimsHW{3, 3});

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5);

    // Add activation layer using the ReLU algorithm.
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    // Add max pooling layer with a 3x3 window, stride of 2x2 and padding of 1x1.
    IPoolingLayer* pool1 = network->addPooling(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStride(DimsHW{2, 2});
    pool1->setPadding(DimsHW{1, 1});
    
    // curHeight/curWidth track the spatial size of the tensor handed to each
    // bottleneck; the last argument of bottleneck() is the shape of its input.
    // layer1: three blocks at 1/4 of the input resolution.
    int curHeight = int(INPUT_H / 4);
    int curWidth = int(INPUT_W / 4);
    IActivationLayer* x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "layer1.0.", Dims4{NUM_SEGMENTS, 64, curHeight, curWidth});
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.1.", Dims4{NUM_SEGMENTS, 256, curHeight, curWidth});
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.2.", Dims4{NUM_SEGMENTS, 256, curHeight, curWidth});
    
    // layer2: the first block has stride 2, so it still receives the 1/4-size
    // input shape; the tracked size is updated to 1/8 only after that call.
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 128, 2, "layer2.0.", Dims4{NUM_SEGMENTS, 256, curHeight, curWidth});
    curHeight = int(INPUT_H / 8);
    curWidth = int(INPUT_W / 8);
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.1.", Dims4{NUM_SEGMENTS, 512, curHeight, curWidth});
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.2.", Dims4{NUM_SEGMENTS, 512, curHeight, curWidth});
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.3.", Dims4{NUM_SEGMENTS, 512, curHeight, curWidth});
    
    // layer3: six blocks, the first with stride 2 (1/8 -> 1/16).
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 2, "layer3.0.", Dims4{NUM_SEGMENTS, 512, curHeight, curWidth});
    curHeight = int(INPUT_H / 16);
    curWidth = int(INPUT_W / 16);
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.1.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth});
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.2.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth});
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.3.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth});
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.4.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth});
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.5.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth});

    // layer4: three blocks, the first with stride 2 (1/16 -> 1/32).
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 2, "layer4.0.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth});
    curHeight = int(INPUT_H / 32);
    curWidth = int(INPUT_W / 32);
    x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.1.", Dims4{NUM_SEGMENTS, 2048, curHeight, curWidth});
    x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.2.", Dims4{NUM_SEGMENTS, 2048, curHeight, curWidth});

    // Average pooling whose window equals the tracked feature-map size.
    IPoolingLayer* pool2 = network->addPooling(*x->getOutput(0), PoolingType::kAVERAGE, DimsHW{curHeight, curWidth});
    assert(pool2);
    pool2->setStride(DimsHW{1, 1});
    
    // Classifier producing OUTPUT_SIZE values.
    IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), OUTPUT_SIZE, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(fc1);

    // Average-reduce the classifier output without keeping the reduced dimension.
    // NOTE(review): the axes argument (1) is presumably a bitmask selecting
    // dimension 0, i.e. the NUM_SEGMENTS axis - confirm against the TensorRT
    // version this file is built with.
    IReduceLayer* reduce = network->addReduce(*fc1->getOutput(0), ReduceOperation::kAVG, 1, false);
    assert(reduce);

    ISoftMaxLayer* softmax = network->addSoftMax(*reduce->getOutput(0));
    assert(softmax);
    // NOTE(review): same axes convention as the reduce layer above - confirm.
    softmax->setAxes(1);

    softmax->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*softmax->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    ICudaEngine* engine = builder->buildCudaEngine(*network);

    // Don't need the network any more
    network->destroy();

    // Release host memory
    // (the buffers are released with free(), so loadWeights presumably
    // allocates them with malloc - verify against loadWeights)
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
}

// Run one synchronous inference pass.
//
// Allocates device buffers for the two engine bindings, copies `input` to the
// device, enqueues the execution context on a private CUDA stream, copies the
// result into `output` and releases every resource again before returning.
//
// context:   execution context of an engine with exactly two bindings named
//            INPUT_BLOB_NAME and OUTPUT_BLOB_NAME.
// input:     host buffer holding batchSize * NUM_SEGMENTS * 3 * INPUT_H * INPUT_W floats.
// output:    host buffer with room for batchSize * OUTPUT_SIZE floats.
// batchSize: batch size passed to IExecutionContext::enqueue.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Compute the byte counts once, widening to size_t first so the product is
    // not evaluated in int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * NUM_SEGMENTS * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    // enqueue() reports failure through its return value; surface it instead of
    // silently copying back whatever happens to be in the output buffer.
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed; the output buffer does not hold valid results" << std::endl;
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work above are reported by the synchronize
    // call, so its status is checked as well.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//
//   ./tsm_r50 -s   build the network, serialize the engine to ENGINE_PATH
//   ./tsm_r50 -d   load the engine from ENGINE_PATH, run one inference on a
//                  constant input and write the scores to RESULT_PATH
//
// Returns 0 on success and -1 on a usage or I/O error.
int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./tsm_r50 -s   // serialize model to plan file" << std::endl;
        std::cerr << "./tsm_r50 -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Serialized engine read back from disk in -d mode.
    char *trtModelStream{nullptr};
    size_t size{0};

    const std::string mode(argv[1]);
    if (mode == "-s") {
        // create a model using the API directly and serialize it to a stream
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p(ENGINE_PATH, std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            // Do not leak the serialized plan on the error path.
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        // Close explicitly so that a failed flush is noticed.
        p.close();
        if (!p)
        {
            std::cerr << "could not write plan output file" << std::endl;
            return -1;
        }
        // Successful serialization: report success to the shell (was 1).
        return 0;
    } else if (mode == "-d") {
        std::ifstream file(ENGINE_PATH, std::ios::binary);
        if (!file.good()) {
            // Without this check a missing plan file ended up as a
            // deserialization of a null buffer further down.
            std::cerr << "could not open plan file, run with -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        const std::streamoff length = file.tellg();
        file.seekg(0, file.beg);
        if (length <= 0) {
            std::cerr << "plan file is empty or unreadable" << std::endl;
            return -1;
        }
        size = static_cast<size_t>(length);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        if (!file) {
            std::cerr << "could not read plan file" << std::endl;
            delete[] trtModelStream;
            return -1;
        }
        file.close();
    } else {
        return -1;
    }

    // Fixed test input: every element is 1.0 (no image is loaded or normalized here).
    static float data[NUM_SEGMENTS * 3 * INPUT_H * INPUT_W];
    for (int i = 0; i < NUM_SEGMENTS * 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    // The serialized buffer is no longer needed once the engine exists.
    delete[] trtModelStream;

    // Run inference
    static float prob[OUTPUT_SIZE];
    doInference(*context, data, prob, 1);

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the first ten and the last ten output scores
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[i] << ", ";
    }
    std::cout << std::endl;
    for (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[OUTPUT_SIZE - 10 + i] << ", ";
    }
    std::cout << std::endl;

    // Write all scores, separated by single spaces, to RESULT_PATH.
    std::fstream writer(RESULT_PATH, std::ios::out);
    if (!writer)
    {
        std::cerr << "could not open result output file" << std::endl;
        return -1;
    }

    writer << prob[0];
    for(int i = 1; i < OUTPUT_SIZE ; i++) {
        writer << " " << prob[i];
    }
    writer.close();

    return 0;
}


================================================
FILE: tsm/tsm_r50.py
================================================
import argparse
import os
import struct

import numpy as np
import pycuda.autoinit  # noqa
import pycuda.driver as cuda
import tensorrt as trt

# Batch size passed to create_engine() and do_inference() by main().
BATCH_SIZE = 1
# Size of the leading dimension of the network input tensor.
NUM_SEGMENTS = 8
# Spatial size of the network input.
INPUT_H = 224
INPUT_W = 224
# Number of outputs of the final fully connected layer.
OUTPUT_SIZE = 400
# Channel divisor of the shift module: (channels // SHIFT_DIV) channels are
# shifted in each direction along the first axis.
SHIFT_DIV = 8

# create_engine() derives every feature-map size with integer division by up
# to 32, hence the divisibility requirement.
assert INPUT_H % 32 == 0 and INPUT_W % 32 == 0, \
    "Input height and width should be a multiple of 32."

# Epsilon added to the running variance in add_batch_norm_2d().
EPS = 1e-5
# Names given to the network input and output tensors.
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"

# Logger shared by the TensorRT builder and runtime created in main().
TRT_LOGGER = trt.Logger(trt.Logger.INFO)


def load_weights(file):
    """Parse a text weight file into a ``{name: float32 array}`` mapping.

    Expected layout: the first line holds the number of entries; each
    following line is ``<name> <count> <hex> <hex> ...`` where every hex word
    is one big-endian 32-bit float.

    Args:
        file: Path of the weight file.

    Returns:
        dict mapping each entry name to a float32 array. Every decoded word
        is kept as a 1-tuple, so a non-empty entry has shape ``(count, 1)``.

    Raises:
        AssertionError: if the file is missing, or a declared count does not
            match the number of lines / values actually present.
    """
    print(f"Loading weights: {file}")

    assert os.path.exists(file), f'Unable to load weight file {file}'

    with open(file, "r") as handle:
        rows = [row.strip() for row in handle]

    expected_entries = int(rows[0])
    assert expected_entries == len(rows) - 1

    tensors = {}
    for row in rows[1:expected_entries + 1]:
        fields = row.split(" ")
        key, declared = fields[0], int(fields[1])
        assert declared + 2 == len(fields)
        # hex string -> 4 raw bytes -> big-endian float (unpack yields a 1-tuple)
        decoded = [
            struct.unpack(">f", bytes.fromhex(word)) for word in fields[2:]
        ]
        tensors[key] = np.array(decoded, dtype=np.float32)

    return tensors


def add_shift_module(network, input, input_shape, num_segments=8, shift_div=8):
    """Add the shift operation to ``network`` and return its output layer.

    With ``fold = input_shape[1] // shift_div`` the channels are split into
    three groups that are re-joined along axis 1:

    * channels ``[0, fold)``: entries ``1..num_segments-1`` of axis 0 followed
      by one all-zero entry (content moves one step towards index 0);
    * channels ``[fold, 2*fold)``: one all-zero entry followed by entries
      ``0..num_segments-2`` (content moves one step away from index 0);
    * remaining channels: copied unchanged.

    Args:
        network: Network definition the layers are added to.
        input: Tensor to shift.
        input_shape: 4-element shape of ``input``; indices 1..3 are read.
        num_segments: Extent of axis 0 of ``input``.
        shift_div: Divisor that determines ``fold``.

    Returns:
        The concatenation layer joining the three channel groups.
    """
    channels, height, width = input_shape[1], input_shape[2], input_shape[3]
    fold = channels // shift_div

    unit_stride = (1, 1, 1, 1)
    moved_shape = (num_segments - 1, fold, height, width)
    zero_shape = (1, fold, height, width)

    # Group 1: drop the first entry of axis 0, pad with zeros at the end.
    fwd_slice = network.add_slice(input,
                                  start=(1, 0, 0, 0),
                                  shape=moved_shape,
                                  stride=unit_stride)
    assert fwd_slice
    fwd_pad = network.add_constant(shape=zero_shape,
                                   weights=np.zeros(zero_shape, np.float32))
    assert fwd_pad
    left = network.add_concatenation(
        [fwd_slice.get_output(0), fwd_pad.get_output(0)])
    assert left
    left.axis = 0

    # Group 2: pad with zeros at the front, drop the last entry of axis 0.
    bwd_pad = network.add_constant(shape=zero_shape,
                                   weights=np.zeros(zero_shape, np.float32))
    assert bwd_pad
    bwd_slice = network.add_slice(input,
                                  start=(0, fold, 0, 0),
                                  shape=moved_shape,
                                  stride=unit_stride)
    assert bwd_slice
    mid = network.add_concatenation(
        [bwd_pad.get_output(0), bwd_slice.get_output(0)])
    assert mid
    mid.axis = 0

    # Group 3: everything after the first 2 * fold channels stays in place.
    untouched = network.add_slice(input,
                                  start=(0, 2 * fold, 0, 0),
                                  shape=(num_segments, channels - 2 * fold,
                                         height, width),
                                  stride=unit_stride)

    # Join the three groups back together along the channel axis.
    merged = network.add_concatenation([
        left.get_output(0),
        mid.get_output(0),
        untouched.get_output(0),
    ])
    assert merged
    merged.axis = 1
    return merged


def add_batch_norm_2d(network, weight_map, input, layer_name, eps):
    """Add a batch-norm layer folded into a single per-channel scale layer.

    The four entries ``<layer_name>.weight``, ``.bias``, ``.running_mean`` and
    ``.running_var`` are read from ``weight_map`` and combined so that the
    scale layer computes ``x * scale + shift`` with
    ``scale = weight / sqrt(running_var + eps)`` and
    ``shift = bias - running_mean * scale``.

    Args:
        network: Network definition the layer is added to.
        weight_map: Mapping from weight name to numpy array.
        input: Input tensor of the layer.
        layer_name: Prefix of the four weight names (without trailing dot).
        eps: Value added to the running variance before the square root.

    Returns:
        The layer returned by ``network.add_scale``.
    """
    gamma, beta, running_mean, running_var = (
        weight_map[layer_name + suffix]
        for suffix in (".weight", ".bias", ".running_mean", ".running_var"))

    std = np.sqrt(running_var + eps)
    channel_scale = gamma / std
    channel_shift = -running_mean / std * gamma + beta

    return network.add_scale(input=input,
                             mode=trt.ScaleMode.CHANNEL,
                             shift=channel_shift,
                             scale=channel_scale)


def bottleneck(network, weight_map, input, in_channels, out_channels, stride,
               layer_name, input_shape):
    """Add one shifted bottleneck block and return its final ReLU layer.

    Main branch: shift module -> 1x1 conv/BN/ReLU -> 3x3 conv (``stride``,
    padding 1)/BN/ReLU -> 1x1 conv to ``4 * out_channels``/BN.
    Skip branch: ``input`` itself, or a strided 1x1 conv + BN
    (``downsample.0`` / ``downsample.1``) when the stride is not 1 or the
    channel count changes. Both branches are summed and passed through ReLU.

    Args:
        network: Network definition the layers are added to.
        weight_map: Mapping from weight name to numpy array.
        input: Input tensor of the block.
        in_channels: Channel count of ``input``.
        out_channels: Width of the block; the output has four times as many
            channels.
        stride: Stride of the 3x3 convolution and of the skip convolution.
        layer_name: Weight-name prefix including the trailing dot,
            e.g. ``"layer1.0."``.
        input_shape: Shape of ``input``, forwarded to the shift module.

    Returns:
        The activation layer closing the block.
    """
    expanded_channels = out_channels * 4

    def conv_bn(tensor, maps, kernel, conv_key, bn_key,
                conv_stride=None, conv_padding=None):
        # Bias-free convolution immediately followed by its folded batch norm.
        conv = network.add_convolution(input=tensor,
                                       num_output_maps=maps,
                                       kernel_shape=kernel,
                                       kernel=weight_map[layer_name + conv_key],
                                       bias=trt.Weights())
        assert conv
        if conv_stride is not None:
            conv.stride = conv_stride
        if conv_padding is not None:
            conv.padding = conv_padding

        norm = add_batch_norm_2d(network, weight_map, conv.get_output(0),
                                 layer_name + bn_key, EPS)
        assert norm
        return norm

    def relu(layer):
        activation = network.add_activation(layer.get_output(0),
                                            type=trt.ActivationType.RELU)
        assert activation
        return activation

    shifted = add_shift_module(network, input, input_shape, NUM_SEGMENTS,
                               SHIFT_DIV)
    assert shifted

    # Main branch.
    stage1 = relu(
        conv_bn(shifted.get_output(0), out_channels, (1, 1), "conv1.weight",
                "bn1"))
    stage2 = relu(
        conv_bn(stage1.get_output(0), out_channels, (3, 3), "conv2.weight",
                "bn2", conv_stride=(stride, stride), conv_padding=(1, 1)))
    residual = conv_bn(stage2.get_output(0), expanded_channels, (1, 1),
                       "conv3.weight", "bn3")

    # Skip branch: identity only when neither resolution nor width changes.
    if stride == 1 and in_channels == expanded_channels:
        skip = input
    else:
        projected = conv_bn(input, expanded_channels, (1, 1),
                            "downsample.0.weight", "downsample.1",
                            conv_stride=(stride, stride))
        skip = projected.get_output(0)

    merged = network.add_elementwise(skip, residual.get_output(0),
                                     trt.ElementWiseOperation.SUM)
    assert merged

    return relu(merged)


def create_engine(maxBatchSize, builder, dt, weights):
    """Build the network layer by layer and compile it into an engine.

    Layout: 7x7 stem convolution/BN/ReLU/max pooling, four stages of
    3/4/6/3 bottleneck blocks, average pooling over the remaining feature
    map, a fully connected layer, a reshape to
    ``(NUM_SEGMENTS, OUTPUT_SIZE)``, an AVG reduce and a softmax.

    Args:
        maxBatchSize: Value assigned to ``builder.max_batch_size``.
        builder: TensorRT builder used for the network and the engine.
        dt: Data type of the input tensor.
        weights: Path of the weight file read by ``load_weights``.

    Returns:
        The result of ``builder.build_cuda_engine``.
    """
    weight_map = load_weights(weights)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt,
                             (NUM_SEGMENTS, 3, INPUT_H, INPUT_W))
    assert data

    # ---- stem ----
    stem_conv = network.add_convolution(input=data,
                                        num_output_maps=64,
                                        kernel_shape=(7, 7),
                                        kernel=weight_map["conv1.weight"],
                                        bias=trt.Weights())
    assert stem_conv
    stem_conv.stride = (2, 2)
    stem_conv.padding = (3, 3)

    stem_bn = add_batch_norm_2d(network, weight_map, stem_conv.get_output(0),
                                "bn1", EPS)
    assert stem_bn

    stem_relu = network.add_activation(stem_bn.get_output(0),
                                       type=trt.ActivationType.RELU)
    assert stem_relu

    stem_pool = network.add_pooling(input=stem_relu.get_output(0),
                                    window_size=trt.DimsHW(3, 3),
                                    type=trt.PoolingType.MAX)
    assert stem_pool
    stem_pool.stride = (2, 2)
    stem_pool.padding = (1, 1)

    # ---- residual stages ----
    # (weight prefix, bottleneck width, number of blocks, stride of block 0)
    stage_table = (
        ("layer1", 64, 3, 1),
        ("layer2", 128, 4, 2),
        ("layer3", 256, 6, 2),
        ("layer4", 512, 3, 2),
    )
    reduction = 4  # the stem conv and the pooling each halve the resolution
    channels = 64
    x = stem_pool
    for prefix, width, block_count, first_stride in stage_table:
        for index in range(block_count):
            block_stride = first_stride if index == 0 else 1
            # A strided block still receives the shape of its (pre-stride) input.
            block_input_shape = (NUM_SEGMENTS, channels,
                                 INPUT_H // reduction, INPUT_W // reduction)
            x = bottleneck(network, weight_map, x.get_output(0), channels,
                           width, block_stride, f"{prefix}.{index}.",
                           block_input_shape)
            channels = width * 4
            reduction *= block_stride

    cur_height = INPUT_H // reduction
    cur_width = INPUT_W // reduction

    # ---- head ----
    pool2 = network.add_pooling(x.get_output(0),
                                window_size=trt.DimsHW(cur_height, cur_width),
                                type=trt.PoolingType.AVERAGE)
    assert pool2
    pool2.stride = (1, 1)

    fc1 = network.add_fully_connected(input=pool2.get_output(0),
                                      num_outputs=OUTPUT_SIZE,
                                      kernel=weight_map['fc.weight'],
                                      bias=weight_map['fc.bias'])
    assert fc1

    reshape = network.add_shuffle(fc1.get_output(0))
    assert reshape
    reshape.reshape_dims = (NUM_SEGMENTS, OUTPUT_SIZE)

    # NOTE(review): `axes` is presumably a bitmask, so 1 would select
    # dimension 0 (the NUM_SEGMENTS axis) - confirm for the TensorRT version
    # in use. The same applies to `softmax.axes` below.
    reduce = network.add_reduce(reshape.get_output(0),
                                op=trt.ReduceOperation.AVG,
                                axes=1,
                                keep_dims=False)
    assert reduce

    softmax = network.add_softmax(reduce.get_output(0))
    assert softmax
    softmax.axes = 1

    prob = softmax.get_output(0)
    prob.name = OUTPUT_BLOB_NAME
    network.mark_output(softmax.get_output(0))

    # ---- build ----
    builder.max_batch_size = maxBatchSize
    builder.max_workspace_size = 1 << 20
    engine = builder.build_cuda_engine(network)

    del network
    del weight_map

    return engine


def do_inference(context, host_in, host_out, batchSize):
    """Run one inference pass and block until ``host_out`` is filled.

    Args:
        context: Execution context of the engine.
        host_in: Host array holding the input; copied to the device.
        host_out: Host array that receives the output.
        batchSize: Batch size passed to ``context.execute_async``.
    """
    # One device buffer per binding, sized after the host arrays.
    device_in = cuda.mem_alloc(host_in.nbytes)
    device_out = cuda.mem_alloc(host_out.nbytes)
    binding_addresses = [int(device_in), int(device_out)]

    stream = cuda.Stream()

    # host -> device, execute, device -> host, all queued on the same stream
    cuda.memcpy_htod_async(device_in, host_in, stream)
    context.execute_async(batch_size=batchSize,
                          bindings=binding_addresses,
                          stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_out, device_out, stream)

    # Wait for everything queued above to finish.
    stream.synchronize()


def inference_mmaction2(inputs, config, checkpoint):
    """Run the reference MMAction2 model on ``inputs`` on the CPU.

    The heavy dependencies are imported lazily so that the rest of the script
    works without torch / mmcv / mmaction installed.

    Args:
        inputs: Array-like input; converted with ``torch.tensor``.
        config: Path of the MMAction2 config file.
        checkpoint: Checkpoint passed to ``load_checkpoint``.

    Returns:
        Whatever the model returns for ``return_loss=False``.
    """
    import torch
    from mmaction.models import build_model
    from mmcv import Config
    from mmcv.runner import load_checkpoint

    settings = Config.fromfile(config)
    # The weights come from `checkpoint`; skip the backbone's own pretrained init.
    settings.model.backbone.pretrained = None

    recognizer = build_model(settings.model,
                             train_cfg=None,
                             test_cfg=settings.get('test_cfg'))
    load_checkpoint(recognizer, checkpoint, map_location='cpu')
    recognizer.eval()

    batch = torch.tensor(inputs)
    with torch.no_grad():
        predictions = recognizer(return_loss=False, imgs=batch)
    return predictions


def main(args):
    """Build or load the engine, then run the requested checks / demo.

    Steps, each controlled by the parsed command-line ``args``:

    1. Load a serialized engine (``--load-engine-path``) or build one from
       ``--tensorrt-weights``; optionally save it (``--save-engine-path``).
    2. ``--test-mmaction2``: compare TensorRT output with MMAction2 on random
       input.
    3. ``--test-cpp``: compare TensorRT output on an all-ones input with the
       result file written by the C++ program.
    4. ``--input-video``: classify one video and print the top class.

    Raises:
        AssertionError: on inconsistent arguments, missing files, a failed
            engine build, unreadable video frames or mismatching results.
    """
    assert not (args.save_engine_path and args.load_engine_path)

    if args.load_engine_path:
        # load from local file
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime
        with open(args.load_engine_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
    else:
        # Create network and engine
        assert args.tensorrt_weights
        builder = trt.Builder(TRT_LOGGER)
        engine = create_engine(BATCH_SIZE, builder, trt.float32,
                               args.tensorrt_weights)
    assert engine
    assert engine.num_bindings == 2

    if args.save_engine_path is not None:
        # save engine to local file
        with open(args.save_engine_path, "wb") as f:
            f.write(engine.serialize())
        print(f"{args.save_engine_path} Generated successfully.")

    context = engine.create_execution_context()
    assert context

    # Page-locked host buffers reused by every inference below.
    host_in = cuda.pagelocked_empty(BATCH_SIZE * NUM_SEGMENTS * 3 * INPUT_H *
                                    INPUT_W,
                                    dtype=np.float32)
    host_out = cuda.pagelocked_empty(BATCH_SIZE * OUTPUT_SIZE,
                                     dtype=np.float32)

    if args.test_mmaction2:
        assert args.mmaction2_config and args.mmaction2_checkpoint, \
            "MMAction2 config and checkpoint couldn't be None"

        data = np.random.randn(BATCH_SIZE, NUM_SEGMENTS, 3, INPUT_H,
                               INPUT_W).astype(np.float32)

        # TensorRT inference
        np.copyto(host_in, data.ravel())
        do_inference(context, host_in, host_out, BATCH_SIZE)

        # pytorch inference
        pytorch_results = inference_mmaction2(data, args.mmaction2_config,
                                              args.mmaction2_checkpoint)

        # test
        from numpy.testing import assert_array_almost_equal
        assert_array_almost_equal(host_out.reshape(-1),
                                  pytorch_results.reshape(-1),
                                  decimal=4)
        print("MMAction2 TEST PASSED")

    if args.test_cpp:
        assert args.cpp_result_path, "Should set --cpp-result-path"
        # The message previously read the non-existent `args.cpp_result`,
        # which turned a missing file into an AttributeError.
        assert os.path.exists(args.cpp_result_path),\
            f"{args.cpp_result_path} doesn't exist"

        # C++ API fixed inputs
        inputs = np.ones((BATCH_SIZE, NUM_SEGMENTS, 3, INPUT_H, INPUT_W),
                         dtype=np.float32)

        # TensorRT inference
        np.copyto(host_in, inputs.ravel())
        do_inference(context, host_in, host_out, BATCH_SIZE)

        # Read cpp inference results
        with open(args.cpp_result_path, "r") as f:
            data = f.read().strip()
        cpp_results = np.array([float(d)
                                for d in data.split(" ")]).astype(np.float32)

        # test
        from numpy.testing import assert_array_almost_equal
        assert_array_almost_equal(host_out.reshape(-1),
                                  cpp_results.reshape(-1),
                                  decimal=4)
        print("CPP TEST PASSED")

    if args.input_video:
        # Get ONE prediction result from ONE video
        # Use demo.mp4 from MMAction2
        import cv2

        cap = cv2.VideoCapture(args.input_video)
        try:
            # get selected frame id of uniform sampling
            sample_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            avg_interval = sample_length / float(NUM_SEGMENTS)
            base_offsets = np.arange(NUM_SEGMENTS) * avg_interval
            clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int32)

            # read frames sequentially, keeping only the sampled indices
            wanted = {int(offset) for offset in clip_offsets}
            sampled = {}
            for i in range(int(clip_offsets.max()) + 1):
                flag, frame = cap.read()
                assert flag, \
                    f"Failed to read frame {i} from {args.input_video}"
                if i in wanted:
                    # cv2.resize takes the target size as (width, height)
                    sampled[i] = cv2.resize(frame, (INPUT_W, INPUT_H))
        finally:
            cap.release()

        # Index by offset so that repeated offsets (videos shorter than
        # NUM_SEGMENTS frames) still yield NUM_SEGMENTS frames.
        frames = np.array([sampled[int(offset)] for offset in clip_offsets])

        # preprocessing frames
        # NOTE(review): cv2 decodes to BGR while these look like RGB-ordered
        # ImageNet statistics - confirm the channel order the model expects.
        mean = np.array([123.675, 116.28, 103.53])
        std = np.array([58.395, 57.12, 57.375])
        frames = (frames - mean) / std
        frames = frames.transpose([0, 3, 1, 2])

        # TensorRT inference
        np.copyto(host_in, frames.ravel())
        do_inference(context, host_in, host_out, BATCH_SIZE)
        # For demo.mp4, should be 6, aka arm wrestling
        class_id = np.argmax(host_out.reshape(-1))
        # Format the raw probability; rounding it first always printed 0.00 or 1.00.
        print(
            f'Result class id {class_id}, score {host_out[class_id]:.2f}'
        )


if __name__ == '__main__':
    # Command-line interface, declared as (flag, add_argument keyword) pairs
    # in the order they should appear in --help.
    cli_options = [
        ("--tensorrt-weights",
         dict(type=str,
              default=None,
              help="Path to TensorRT weights, "
              "which is generated by gen_weights.py")),
        ("--input-video",
         dict(type=str, default=None, help="Path to local video file")),
        ("--save-engine-path",
         dict(type=str, default=None, help="Save engine to local file")),
        ("--load-engine-path",
         dict(type=str, default=None, help="Saved engine file path")),
        ("--test-mmaction2",
         dict(action='store_true',
              help="Compare TensorRT results with MMAction2 Results")),
        ("--mmaction2-config",
         dict(type=str, default=None, help="Path to MMAction2 config file")),
        ("--mmaction2-checkpoint",
         dict(type=str,
              default=None,
              help="Path to MMAction2 checkpoint url or file path")),
        ("--test-cpp",
         dict(action='store_true',
              help="Compare Python API results with C++ API results")),
        ("--cpp-result-path",
         dict(type=str,
              default='./build/result.txt',
              help="Path to C++ API results")),
    ]

    parser = argparse.ArgumentParser()
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)

    main(parser.parse_args())


================================================
FILE: tutorials/check_fp16_int8_support.md
================================================
# Check if Your GPU Supports FP16/INT8

## 1. check your GPU Compute Capability

Visit https://developer.nvidia.com/cuda-gpus#compute and look up the compute capability of your GPU.

For example, GTX1080 is 6.1, Tesla T4 is 7.5.

## 2. check the hardware-precision-matrix

Visit https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix and look up your compute capability in the matrix.

For example, compute capability 6.1 supports FP32 and INT8. 7.5 supports FP32, FP16, INT8, FP16 tensor core, etc.


================================================
FILE: tutorials/faq.md
================================================
# Frequently Asked Questions (FAQ)

## 1. fatal error: NvInfer.h: No such file or directory

`NvInfer.h` is one of the TensorRT headers. If you installed the TensorRT DEB package, the headers should be in `/usr/include/x86_64-linux-gnu/`. If you installed TensorRT from the TAR or ZIP file, it is recommended to manage TensorRT with modern CMake syntax, e.g. [FindTensorRT.cmake](../lenet/FindTensorRT.cmake).

`dpkg -L` can print out the contents of a DEB package.

```
$ dpkg -L libnvinfer-dev
/.
/usr
/usr/lib
/usr/lib/x86_64-linux-gnu
/usr/lib/x86_64-linux-gnu/libnvinfer_static.a
/usr/lib/x86_64-linux-gnu/libmyelin_compiler_static.a
/usr/lib/x86_64-linux-gnu/libmyelin_executor_static.a
/usr/lib/x86_64-linux-gnu/libmyelin_pattern_library_static.a
/usr/lib/x86_64-linux-gnu/libmyelin_pattern_runtime_static.a
/usr/include
/usr/include/x86_64-linux-gnu
/usr/include/x86_64-linux-gnu/NvInfer.h
/usr/include/x86_64-linux-gnu/NvInferRuntime.h
/usr/include/x86_64-linux-gnu/NvInferRuntimeCommon.h
/usr/include/x86_64-linux-gnu/NvInferVersion.h
/usr/include/x86_64-linux-gnu/NvUtils.h
/usr/share
/usr/share/doc
/usr/share/doc/libnvinfer-dev
/usr/share/doc/libnvinfer-dev/copyright
/usr/share/doc/libnvinfer-dev/changelog.Debian
/usr/lib/x86_64-linux-gnu/libmyelin.so
/usr/lib/x86_64-linux-gnu/libnvinfer.so
```

## 2. fatal error: cuda_runtime_api.h: No such file or directory

`cuda_runtime_api.h` is from cuda-cudart. If you hit this error, you need to find where the header is and adapt the CUDA `include_directories` and `link_directories` in `CMakeLists.txt`.

```
$ dpkg -L cuda-cudart-dev-10-0
/.
/usr
/usr/local
/usr/local/cuda-10.0
/usr/local/cuda-10.0/targets
/usr/local/cuda-10.0/targets/x86_64-linux
/usr/local/cuda-10.0/targets/x86_64-linux/lib
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudadevrt.a
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libOpenCL.so.1.1
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libculibos.a
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart_static.a
/usr/local/cuda-10.0/targets/x86_64-linux/include
/usr/local/cuda-10.0/targets/x86_64-linux/include/cuda_runtime_api.h
/usr/local/cuda-10.0/targets/x86_64-linux/include/cudart_platform.h
/usr/local/cuda-10.0/targets/x86_64-linux/include/cuda_device_runtime_api.h
/usr/local/cuda-10.0/targets/x86_64-linux/include/cuda_runtime.h
/usr/lib
/usr/lib/pkgconfig
/usr/lib/pkgconfig/cudart-10.0.pc
/usr/share
/usr/share/doc
/usr/share/doc/cuda-cudart-dev-10-0
/usr/share/doc/cuda-cudart-dev-10-0/changelog.Debian.gz
/usr/share/doc/cuda-cudart-dev-10-0/copyright
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libOpenCL.so
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libOpenCL.so.1
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so
```

## 3. .wts not prepared or not in the right directory

If the .wts file is not in the right directory, the loadWeights() function will report an error. The error log looks like the following.

By default, the .wts file should usually be put in the same directory as `build`, for example `tensorrtx/yolov5/yolov5s.wts`. The .wts path is defined in `yolov5.cpp`.

```
std::map<std::__cxx11::basic_string, nvinfer1::Weights> loadWeights(std::__cxx11::string): Assertion `input.is_open() && "Unable to load weight file."' failed.
Aborted (core dumped)
```

## 4. yolo -s failed, class_num not adapted

If you train your own yolo model, you need to set `CLASS_NUM` in `yololayer.h`, which is `80` by default. Otherwise, you will get errors like the following.

```
[Convolution]: kernel weights has count xxx but xxx was expected
void APIToModel(unsigned int, nvinfer1::IHostMemory**): Assertion `engine != nullptr' failed.
Aborted (core dumped)
```


================================================
FILE: tutorials/from_pytorch_to_trt_stepbystep_hrnet.md
================================================
# 使用 TRT 加速网络-零

本次教程以 HRNet 分类器（HRNet-W18-C-Small-v2）为例子

code：https://github.com/HRNet/HRNet-Image-Classification

paper：https://arxiv.org/abs/1908.07919

## 1 论文网络的基本了解

无论是仅仅使用网络还是要对网络改进，首先都要对网络有一定了解。对于这种比较火的网络，网上大批详解博客，可以多去阅读，加上论文，来对网络理解。

HRNet 分类器网络看起来很简单，如下图

![682463-20200104221712824-157549407](https://user-images.githubusercontent.com/20653176/93749152-ff957680-fc2b-11ea-883c-79046e41ace8.png)

从网络中可看到基本组件很简单：卷积和 upsample。【这里就表明网络 TRT 加速时不会有 plugin 的需求。】

参考博客：

1. https://www.cnblogs.com/darkknightzh/p/12150637.html
2. https://zhuanlan.zhihu.com/p/143385915
3. https://blog.csdn.net/weixin_37993251/article/details/88043650
4. https://blog.csdn.net/weixin_38715903/article/details/101629781?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-2.channel_param&depth_1-utm_source=dis

## 2 pytorch 代码跑通

跑通 demo 是很重要的一步。跑通后就可以一步一步跟进，看到底走了哪些层，这样心里就会有一个基本框架；然后可以生成 wts 文件；同时也可以生成 onnx 文件。

上述的**参考博客 4**中对代码有详细介绍，可以详细分析下。

建议：**对于运行环境，建议使用 anaconda 的 conda create 创建虚拟环境，这样没有一系列环境问题。**

```python
conda create -n xx python=3.7   # 创建环境
activate xx    # 激活
pip install xxxx  # 安装包
deactivate xx  # 退出环境
```

在生成 wts 文件时，没有必要每次都去配置`gen_wts.py`，它主要是读取模型、保存模型参数。只要 demo 文件跑通就可以随时保存为 wts。

## 3 pytorch 代码 debug

这一步骤单独拉出来是因为在 debug 的过程中，要关注经过哪些层，预处理有哪些，后处理有哪些。另外在后面搭建 TRT 网络时，还要根据 debug 过程中的一些信息来调试 trt 网络。

## 4 网络的可视化

将 pytorch 模型保存为 onnx，可有可无。但是建议如果可以保存，就使用 onnx 来可视化网络。这样对网络架构以及每层的输入输出就会非常明了。

如果无法保存 onnx，搭建网络时，要根据 wts 来分析，比较麻烦。

另外强烈建议：**无论是否保存了 onnx，都要手动在纸上将网络再画一遍，并且将每层的输出维度标注下来，这样搭建层比较多的网络时，不会晕，并且在 debug TRT 网络时可以有效定位错误。**

在手动画网络图时，可以给每个节点“标号”，利用该“标号”在搭建 TRT 网络时，可以很清楚知道 **“哪个节点输入，经过某种操作，输出哪个节点。”**

在 onnx 图中看到几个层一定要心里有数：

比如下面红线框出的一大块实际上就是 upsample 层

![](imgs/93747936-0ae7a280-fc2a-11ea-86c1-9f72622402b9.png)

下面的为 FC 层：

![image-20200918141448071](https://user-images.githubusercontent.com/20653176/93749177-0de39280-fc2c-11ea-8a20-b8ab0b3b940f.png)

Conv+BN+Relu 层

![image-20200918141632723](https://user-images.githubusercontent.com/20653176/93749201-189e2780-fc2c-11ea-9aad-0ac7723575c4.png)

ResBlock 层

![image-20200918141709487](https://user-images.githubusercontent.com/20653176/93749220-2358bc80-fc2c-11ea-998a-0892755dfbc0.png)

单击节点。会有详细信息，这些信息使搭建网络变得方便。

![image-20200918141931327](https://user-images.githubusercontent.com/20653176/93749222-2489e980-fc2c-11ea-9025-c5d367efd7f9.png)

如果无法导出 onnx：

搭建网络时需要从 wts 中查看层名，各个卷积层信息需要从代码中分析。

![image_f](https://user-images.githubusercontent.com/20653176/93750398-fd341c00-fc2d-11ea-9077-ee749b6aef41.png)

![image-20200918142959711](https://user-images.githubusercontent.com/20653176/93749484-8fd3bb80-fc2c-11ea-951d-3c1f403e521a.png)

## 5 TRT 搭建网络

搭建网络时就按照 onnx 图一层一层搭建。

几点建议：

1 要不断去查 API 的使用 https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/index.html

2 利用已有的模块，不要重复造轮子

3 各个层名使用 onnx 的 id，这样在搭建网络时不会晕；根据 onnx 的结点信息，各层之间的连接也不会出错。

## 6 TRT 网络 debug

搭建网络过程肯定会出错，debug 是必要的手段：

1 打印每层的维度

```c++
Dims dim = id_1083->getOutput(0)->getDimensions();
std::cout << dim[0] << " " << dim[1] << " " << dim[2] << " " << dim[3] << std::endl;
```

**一般如果出现生成 engine 就失败的情况，就从 createEngine 的第一句开始调试，并且随时关注窗口输出，如果在某一层出现大量提示信息，那么该层就会有问题，就将该层的输入 tensor 维度和输出 tensor 维度信息都打印出来，看输出的维度是否正常。**

2 打印输出

TRT 是先构建网络，然后再 enqueue 时才能得到各层的输出信息，因此若想对比每一层的输出，需要将该层设置为 output 层

```c++
out->getOutput(0)->setName(OUTPUT_BLOB_NAME);  // out可替换为任意一层
network->markOutput(*out->getOutput(0));
```

3 关注输入层 data

数据层的 debug 无需第 2 步的做法，直接可以查看预处理后的结果。在 debug

## 7 TRT 代码整理

这里就是将 TRT 搭建的网络，能封装函数，就封装为函数模块，增加代码可读性。


================================================
FILE: tutorials/getting_started.md
================================================
# Getting Started with TensorRTx

## 1. Setup the development environment

(**RECOMMENDED**) If you prefer to run everything in a docker container, check [HERE](../docker/README.md)

If you prefer to install every dependencies locally, check [HERE](./install.md)

## 2. Run TensorRTx demo

It is recommended to go through the [lenet5](https://github.com/wang-xinyu/tensorrtx/tree/master/lenet) or [mlp](https://github.com/wang-xinyu/tensorrtx/tree/master/mlp) first. But if you are proficient in TensorRT, please check the readme file of the model you want directly.

We use "lenet5" to explain how we build DL network with TensorRT API.

### 2.1. Export lenet5 weights in pytorch

1. Clone the [wang-xinyu/pytorchx](https://github.com/wang-xinyu/pytorchx) in your machine, then enter lenet folder:

   ```bash
   pip install torch
   git clone https://github.com/wang-xinyu/pytorchx
   cd pytorchx/lenet
   ```

2. Run lenet5.py to generate lenet5.pth which is the pytorch serialized model. The lenet5 arch is defined in lenet5.py.

   ```bash
   python lenet5.py
   ```

3. Run inference.py to generate lenet5.wts, which is weights file for tensorrt.

   ```bash
   python inference.py
   ```

The terminal output would be like:

```txt
the output of lenet5 is [[0.0950, 0.0998, 0.1101, 0.0975, 0.0966, 0.1097, 0.0948, 0.1056, 0.0992, 0.0917]], shape is [1, 10].

cuda device count:  2
input:  torch.Size([1, 1, 32, 32])
conv1 torch.Size([1, 6, 28, 28])
pool1:  torch.Size([1, 6, 14, 14])
conv2 torch.Size([1, 16, 10, 10])
pool2 torch.Size([1, 16, 5, 5])
view:  torch.Size([1, 400])
fc1:  torch.Size([1, 120])
lenet out: tensor([[0.0950, 0.0998, 0.1101, 0.0975, 0.0966, 0.1097, 0.0948, 0.1056, 0.0992,
         0.0917]], device='cuda:0', grad_fn=<SoftmaxBackward>)
```

### 2.2. Run lenet5 in TensorRT

Clone the wang-xinyu/tensorrtx in your machine. Enter lenet folder, copy lenet5.wts generated above, and cmake&make c++ code.

And of course you should install cuda/cudnn/tensorrt first. You might need to adapt the tensorrt path in CMakeLists.txt if you install tensorrt from tar package.

```bash
git clone https://github.com/wang-xinyu/tensorrtx
cd tensorrtx/lenet
cp [PATH-OF-pytorchx]/pytorchx/lenet/lenet5.wts .
cmake -S . -B build
cd build
make
```

If `make` succeeds, the executable `lenet` will be generated.

Run lenet to build tensorrt engine and serialize it to file `lenet5.engine`.

```bash
./lenet -s
```

Deserialize the engine and run inference.

```bash
./lenet -d
```

You should see the output like this,

```txt
Output:

0.0949623, 0.0998472, 0.110072, 0.0975036, 0.0965564, 0.109736, 0.0947979, 0.105618, 0.099228, 0.0916792,
```

## 3. Compare the two output

The input to pytorch and tensorrt is the same, i.e. a [1,1,32,32] all-ones tensor.

So the outputs should be the same; otherwise there must be something wrong.

```txt
The pytorch output is
0.0950, 0.0998, 0.1101, 0.0975, 0.0966, 0.1097, 0.0948, 0.1056, 0.0992, 0.0917

The tensorrt output is
0.0949623, 0.0998472, 0.110072, 0.0975036, 0.0965564, 0.109736, 0.0947979, 0.105618, 0.099228, 0.0916792
```

Same! exciting, isn't it?

## 4. The `.wts` content format

The `.wts` file is a plain text file. For example, part of the contents of `lenet5.wts` is:

```txt
10
conv1.weight 150 be40ee1b bd20bab8 bdc4bc53 ...
conv1.bias 6 bd327058 ...
conv2.weight 2400 3c6f2220 3c693090 ...
conv2.bias 16 bd183967 bcb1ac8a ...
fc1.weight 48000 3c162c20 bd25196a ...
fc1.bias 120 3d3c3d49 bc64b948 ...
fc2.weight 10080 bce095a4 3d33b9dc ...
fc2.bias 84 bc71eaa0 3d9b276c ...
fc3.weight 840 3c252870 3d855351 ...
fc3.bias 10 bdbe4bb8 3b119ee0 ...
...
```

The first line is a number indicating how many lines follow, excluding the first line itself.

And then each line is

`[weight name] [value count = N] [value1] [value2] ... [valueN]`

The value is in HEX format.

## 5. Frequently Asked Questions (FAQ)

check [HERE](./faq.md) for the answers of questions you may encounter.


================================================
FILE: tutorials/install.md
================================================
# Install the dependencies of tensorrtx

Using docker as the development environment is strongly recommended; you may check [HERE](../docker/README.md) for the deployment instructions of the docker container and _ignore_ the rest of this document.

While if this is not your case, we always recommend using major LTS version of your OS, Nvidia driver, CUDA, and so on.

## OS

Ubuntu-22.04 is recommended. It is strongly recommended to use `apt` to manage packages in Ubuntu.

## Nvidia Related

### Driver

You should install the nvidia driver first before anything else, go to [Ubuntu Driver Installation Guide](https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/index.html#ubuntu) for more details.

**NOTE**: Since version 560, the installation step is a little different than before, check [HERE](https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/index.html#recent-updates) for more details.

### CUDA

Go to [NVIDIA CUDA Installation Guide for Linux](https://developer.nvidia.com/cuda-10.0-download-archive) for the detailed steps.

**NOTE**:

- Do not forget to check [Post-installation Actions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions) to setup the environment correctly.
- Make your CUDA version comply with your driver version
- If you want multi-version CUDA, docker is strongly recommended.

### TensorRT

check [HERE](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#downloading) to install TensorRT.

### (Optional) OpenCV

```
sudo apt-get update && sudo apt install libgtk-3-dev libopencv-dev
```

## Verify installation

```
dpkg -l | grep cuda
dpkg -l | grep nvinfer
dpkg -l | grep opencv
```


================================================
FILE: tutorials/measure_performance.md
================================================
# Measure performance of TensorRT

## 1. add some variables and structures

see https://github.com/NVIDIA/TensorRT/tree/master/samples/sampleNMT for more detail.

```c++
// for rcnn, you can put these code into common.hpp
#include "logging.h" // rcnn/logging.h
static Logger gLogger{ Logger::Severity::kINFO };
static LogStreamConsumer gLogInfo{ LOG_INFO(gLogger) };

struct SimpleProfiler : public nvinfer1::IProfiler
{
    struct Record
    {
        float time{ 0 };
        int count{ 0 };
    };

    virtual void reportLayerTime(const char* layerName, float ms)
    {
        mProfile[layerName].count++;
        mProfile[layerName].time += ms;
        if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end())
        {
            mLayerNames.push_back(layerName);
        }
    }

    SimpleProfiler(const char* name, const std::vector<SimpleProfiler>& srcProfilers = std::vector<SimpleProfiler>())
        : mName(name)
    {
        for (const auto& srcProfiler : srcProfilers)
        {
            for (const auto& rec : srcProfiler.mProfile)
            {
                auto it = mProfile.find(rec.first);
                if (it == mProfile.end())
                {
                    mProfile.insert(rec);
                }
                else
                {
                    it->second.time += rec.second.time;
                    it->second.count += rec.second.count;
                }
            }
        }
    }

    friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value)
    {
        out << "========== " << value.mName << " profile ==========" << std::endl;
        float totalTime = 0;
        std::string layerNameStr = "TensorRT layer name";
        int maxLayerNameLength = std::max(static_cast<int>(layerNameStr.size()), 70);
        for (const auto& elem : value.mProfile)
        {
            totalTime += elem.second.time;
            maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
        }

        auto old_settings = out.flags();
        auto old_precision = out.precision();
        // Output header
        {
            out << std::setw(maxLayerNameLength) << layerNameStr << " ";
            out << std::setw(12) << "Runtime, "
                << "%"
                << " ";
            out << std::setw(12) << "Invocations"
                << " ";
            out << std::setw(12) << "Runtime, ms" << std::endl;
        }
        for (size_t i = 0; i < value.mLayerNames.size(); i++)
        {
            const std::string layerName = value.mLayerNames[i];
            auto elem = value.mProfile.at(layerName);
            out << std::setw(maxLayerNameLength) << layerName << " ";
            out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << "%"
                << " ";
            out << std::setw(12) << elem.count << " ";
            out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << std::endl;
        }
        out.flags(old_settings);
        out.precision(old_precision);
        out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl;

        return out;
    }

private:
    std::string mName;
    std::vector<std::string> mLayerNames;
    std::map<std::string, Record> mProfile;
};
```

## 2. set profiler for context and print the log

```c++
// you'd better set name for every layers
// build engine
// build context
auto sp = SimpleProfiler("test");
context->setProfiler(&sp);
context->enqueue(...);
gLogInfo << sp << std::endl;
```


================================================
FILE: tutorials/migration_guide.md
================================================
# Migration Guide

## <u>Newest</u> Migration Guide

Please check [Page](https://docs.nvidia.com/deeplearning/tensorrt/migration-guide/index.html)

For any archived version, please check this [Page](https://docs.nvidia.com/deeplearning/tensorrt/archives/index.html)

## (DEPRECATED) Migrating from TensorRT 4.x to 7.x

**NOTE**: Both TensorRT 4.x and 7.x are **DEPRECATED** by NVIDIA officially, so this part is **outdated**.

The following APIs are deprecated and replaced in TensorRT 7.
- `DimsCHW`, replaced by `Dims3`
- `addConvolution()`, replaced by `addConvolutionNd()`
- `addPooling()`, replaced by `addPoolingNd()`
- `addDeconvolution()`, replaced by `addDeconvolutionNd()`
- `createNetwork()`, replaced by `createNetworkV2()`
- `buildCudaEngine()`, replaced by `buildEngineWithConfig()`
- `createPReLUPlugin()`, replaced by `addActivation()` with `ActivationType::kLEAKY_RELU`
- `IPlugin` and `IPluginExt` class, replaced by `IPluginV2IOExt` or `IPluginV2DynamicExt`
- Use the new `Logger` class defined in `logging.h`


================================================
FILE: tutorials/multi_GPU_processing.md
================================================
# How to Implement Multi-GPU Processing

Maybe you hope to take advantage of multiple GPU to make inference even faster. Here are few tips to help you deal with it! Take **YOLO V4** as an example.

## 1. Make custom plugins (i.e. YOLO layer and Mish layer for YOLO V4) run asynchronously.

To do this, we need to use CudaStream parameter in the kernels of all custom layers and use asynchronous functions.
For example, in function ` forwardGpu()` of **yololayer.cu**, you need to do the following changes to make sure that the engine will be running on a specific CudaStream.

  1) Change `cudaMemset(output + idx*outputElem, 0, sizeof(float))` to `cudaMemsetAsync(output + idx*outputElem, 0, sizeof(float), stream)`
  2) Change `CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>(inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem)` to `CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>(inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem)`

  ## 2. Create an engine for each device you want to use.

  Maybe it is a good idea to create a struct to store the engine, context and buffer for each device individually. For example,
  ```
  // Per-device bundle of everything needed to run inference on one GPU.
  struct Plan{
    IRuntime* runtime;
    ICudaEngine* engine;
    IExecutionContext* context;
    // Array of two buffer pointers; `void buffers[2]` (an array of void) is not valid C++.
    // Presumably one input and one output binding — adjust the size to your engine.
    void* buffers[2];
    cudaStream_t stream;
  };
  ```
  Then use `cudaSetDevice()` to make each engine you create run on a specific device. Moreover, to maximize performance, make sure that the engine file you deserialize is the one TensorRT optimized for that device.

  ## 3. Use function wisely
  Here are some things I learned when trying to parallelize the inference.
  1) Do not call synchronizing functions, such as `cudaFree()`, during inference.
  2) Use `cudaMallocHost()` instead of `malloc()` when allocating memory on the host side.


================================================
FILE: tutorials/run_on_windows.md
================================================
# How to Compile and Run on Windows

This tutorial can be applied to any model in this repo. You only need to adapt a couple of lines.

## Environments

* vs (only vs2015, vs2017 tested)
* cuda
* TensorRT
* Cmake
* opencv
* dirent.h for windows, put into tensorrtx/include, download from https://github.com/tronkko/dirent

  ![image-20200828131208257](https://user-images.githubusercontent.com/20653176/91524367-99217f00-e931-11ea-9a13-fb420403b73b.png)

## Compile and Run

### 1. Modify CmakeLists.txt

```cmake
# Example CMakeLists.txt for building the yolov5 demo; the lines tagged #1-#8 are the ones to adapt.
cmake_minimum_required(VERSION 2.6)

project(yolov5) # 1
set(OpenCV_DIR "D:\\opencv\\opencv346\\build")  #2
set(TRT_DIR "D:\\TensorRT-7.0.0.11.Windows10.x86_64.cuda-10.2.cudnn7.6\\TensorRT-7.0.0.11")  #3

add_definitions(-std=c++11)
# NOTE(review): option() is documented as option(<variable> "<help_text>" [value]); here "OFF" sits in
# the help-text position — confirm the intent was to default CUDA_USE_STATIC_CUDA_RUNTIME to OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads)

# setup CUDA
find_package(CUDA REQUIRED)
message(STATUS "    libraries: ${CUDA_LIBRARIES}")
message(STATUS "    include path: ${CUDA_INCLUDE_DIRS}")

include_directories(${CUDA_INCLUDE_DIRS})

####
enable_language(CUDA)  # add this line, then no need to setup cuda path in vs
####
# ${PROJECT_SOURCE_DIR}/include is an extra header directory inside the project tree;
# the TensorRT headers are taken from TRT_DIR set above.
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${TRT_DIR}\\include)

# -D_MWAITXINTRIN_H_INCLUDED for solving error: identifier "__builtin_ia32_mwaitx" is undefined
# NOTE(review): -std=c++11 -Wall -Ofast are GCC/Clang-style flags — verify how the Visual Studio toolchain treats them.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -D_MWAITXINTRIN_H_INCLUDED")

# setup opencv
# All default search locations are disabled, so the package is located through OpenCV_DIR set above.
find_package(OpenCV QUIET
    NO_MODULE
    NO_DEFAULT_PATH
    NO_CMAKE_PATH
    NO_CMAKE_ENVIRONMENT_PATH
    NO_SYSTEM_ENVIRONMENT_PATH
    NO_CMAKE_PACKAGE_REGISTRY
    NO_CMAKE_BUILDS_PATH
    NO_CMAKE_SYSTEM_PATH
    NO_CMAKE_SYSTEM_PACKAGE_REGISTRY
)

# Print what was found so a wrong OpenCV_DIR is easy to spot in the configure log.
message(STATUS "OpenCV library status:")
message(STATUS "    version: ${OpenCV_VERSION}")
message(STATUS "    libraries: ${OpenCV_LIBS}")
message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")

include_directories(${OpenCV_INCLUDE_DIRS})
# link_directories must come before add_executable for the target to pick it up.
link_directories(${TRT_DIR}\\lib)

add_executable(yolov5 ${PROJECT_SOURCE_DIR}/yolov5.cpp ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/yololayer.h)   #4

target_link_libraries(yolov5 "nvinfer" "nvinfer_plugin")   #5
target_link_libraries(yolov5 ${OpenCV_LIBS})          #6
target_link_libraries(yolov5 ${CUDA_LIBRARIES})   #7
target_link_libraries(yolov5 Threads::Threads)       #8
```

Notice: 8 lines to adapt in CMakeLists.txt, marked with #1-#8

- #1 project name, set according to your project name
- #2 your opencv path
- #3 your tensorrt path
- #4 source file needed, including .cpp .cu .h
- #5-#8 libs needed

### 2. run cmake-gui to config the project

#### 2.1 open cmake-gui and set the path

![image-20200828124434245](https://user-images.githubusercontent.com/20653176/91524158-1dbfcd80-e931-11ea-8a82-518eaf391d5a.png)

#### 2.2 click **Configure** and set the envs

![image-20200828124902923](https://user-images.githubusercontent.com/20653176/91524303-75f6cf80-e931-11ea-8591-64a8a1a9292b.png)

#### 2.3 click **Finish**, and wait for the `Configuring done`

![image-20200828124951872](https://user-images.githubusercontent.com/20653176/91524340-8b6bf980-e931-11ea-9ea4-141f5b94aa0a.png)

#### 2.4 click **Generate**

![image-20200828125046738](https://user-images.githubusercontent.com/20653176/91524350-8eff8080-e931-11ea-9ed1-82c5af2f558f.png)

#### 2.5 click **Open Project**

![image-20200828125215067](https://user-images.githubusercontent.com/20653176/91524352-9030ad80-e931-11ea-877e-dc08bfaef731.png)

#### 2.6 Click **Generate -> Generate solution**

![image-20200828125402056](https://user-images.githubusercontent.com/20653176/91524356-9161da80-e931-11ea-84ba-177e12200e04.png)

### 3. run in command line

cd to the path of exe (e.g. E:\LearningCodes\GithubRepo\tensorrtx\yolov5\build\Debug)

```
yolov5.exe -s             // serialize model to plan file i.e. 'yolov5s.engine'
yolov5.exe -d  ../samples // deserialize plan file and run inference, the images in samples will be processed.
```

**Notice**: while serializing the model, the .wts file should be placed in the parent directory of xxx.vcxproj; alternatively, modify the .wts path in yolov5.cpp

![image-20200828125938472](https://user-images.githubusercontent.com/20653176/91524358-93c43480-e931-11ea-81b6-ae01b92e1146.png)

### 4. run in vs

In vs, firstly `Set As Startup Project`, and then setup `Project ==> Properties ==> Configuration Properties ==> Debugging ==> Command Arguments` as `-s` or `-d ../yolov3-spp/samples`. Then can run or debug.

![image-20200828130117902](https://user-images.githubusercontent.com/20653176/91524360-94f56180-e931-11ea-9873-39bed7ee19f1.png)

![image-20200828130415658](https://user-images.githubusercontent.com/20653176/91524362-96bf2500-e931-11ea-8c79-8db3a25fc135.png)

![image-20200828131516231](https://user-images.githubusercontent.com/20653176/91524370-9a52ac00-e931-11ea-8c1a-acf828fe81b4.png)

**Notice**: The .dll of tensorrt and opencv should be put in the same directory with exe file. Or set environment variables in windows.(Not recommended)


================================================
FILE: ufld/CMakeLists.txt
================================================
# Build script for the `lane_det` executable (UFLD demo), built from lane_det.cpp.
cmake_minimum_required(VERSION 2.6)

project(lane_det)

# C++11 is requested twice: as a raw compiler flag here and through CMAKE_CXX_STANDARD below.
add_definitions(-std=c++11)

# NOTE(review): option() is documented as option(<variable> "<help_text>" [value]); here "OFF" sits in
# the help-text position — confirm the intent was to default CUDA_USE_STATIC_CUDA_RUNTIME to OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

# cuda directory
# Hard-coded CUDA location; adapt these two paths if CUDA is installed elsewhere.
include_directories(/usr/local/cuda/include/)
link_directories(/usr/local/cuda/lib64/)

# tensorrt
# Uncomment and adapt the two lines below when the TensorRT headers/libs are not on the default search paths.
#include_directories(/workspace/TensorRT-7.2.3.4/include/)
#link_directories(/workspace/TensorRT-7.2.3.4/lib/)


find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# The demo is a single translation unit linked against nvinfer, cudart and OpenCV.
add_executable(lane_det ${PROJECT_SOURCE_DIR}/lane_det.cpp)
target_link_libraries(lane_det nvinfer)
target_link_libraries(lane_det cudart)
target_link_libraries(lane_det ${OpenCV_LIBS})

# NOTE(review): -O2 is added while CMAKE_BUILD_TYPE is Debug — confirm this combination is intended.
add_definitions(-O2 -pthread)


================================================
FILE: ufld/README.md
================================================
# Ultra-Fast-Lane-Detection(UFLD)

The Pytorch implementation is [Ultra-Fast-Lane-Detection](https://github.com/cfzd/Ultra-Fast-Lane-Detection).

## How to Run
```
1. generate lane.wts and lane.onnx from pytorch with tusimple_18.pth

git clone https://github.com/wang-xinyu/tensorrtx.git
git clone https://github.com/cfzd/Ultra-Fast-Lane-Detection.git
// download its weights 'tusimple_18.pth'
// copy tensorrtx/ufld/gen_wts.py into Ultra-Fast-Lane-Detection/
// ensure the file name is tusimple_18.pth and lane.wts in gen_wts.py
// go to Ultra-Fast-Lane-Detection
python gen_wts.py
// a file 'lane.wts' will be generated.
// then ( not necessary )
python pth2onnx.py
//a file 'lane.onnx' will be generated.

2. build tensorrtx/ufld and run

mkdir build
cd build
cmake ..
make
sudo ./lane_det -s          // serialize model to plan file i.e. 'lane.engine'
sudo ./lane_det -d  PATH_TO_YOUR_IMAGE_FOLDER // deserialize plan file and run inference, the images will be processed.

```

## More Information
1. Changed the preprocess and postprocess in tensorrtx, give a different way to convert NHWC to NCHW in preprocess and just show the result using opencv rather than saving the result in postprocess.
2. If there are some bugs where you inference with multi batch_size, just modify the code in preprocess or postprocess, it's not complicated.
3. Some results are stored in resluts folder.


================================================
FILE: ufld/common.hpp
================================================
#ifndef LANE_DET_COMMON_H_
#define LANE_DET_COMMON_H_

#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <sstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include "dirent.h"
#include "NvInfer.h"
#include <chrono>

// Evaluates `status` exactly once; when the result is non-zero, prints
// "Cuda failure: <value>" to std::cerr and terminates the process with abort().
// The do { ... } while (0) wrapper makes the macro usable as a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

using namespace nvinfer1;

// Loads a .wts weight file into a name -> Weights map.
//
// The file is whitespace-delimited text:
//   first token : number of blobs
//   each blob   : [name] [count] <count 32-bit values in hex>
//
// Every blob is tagged DataType::kFLOAT. The value buffers are malloc'ed and are not
// freed in this function; the returned Weights::values pointers refer to them.
// A file that cannot be opened, or a non-positive blob count, trips an assert.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs (initialised so a failed read is caught by the assert
    // instead of testing an indeterminate value)
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob (the count is decimal, the values are hex)
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. Size the buffer by the element type: sizeof(val) would be the size of
        // the pointer and over-allocate on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Unable to allocate weight buffer.");
        for (uint32_t x = 0; x < size; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a batch-norm as a per-channel scale layer: out = (in * scale + shift) ^ power, with
//   scale = weight / sqrt(running_var + eps)
//   shift = bias - running_mean * weight / sqrt(running_var + eps)
//   power = 1
// The parameters are read from weightMap under lname + ".weight" / ".bias" / ".running_mean" /
// ".running_var". The three derived buffers are malloc'ed and recorded in weightMap as
// lname + ".scale" / ".shift" / ".power".
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float *bnWeight = (float*)weightMap[lname + ".weight"].values;
    float *bnBias = (float*)weightMap[lname + ".bias"].values;
    float *runMean = (float*)weightMap[lname + ".running_mean"].values;
    float *runVar = (float*)weightMap[lname + ".running_var"].values;
    int channels = weightMap[lname + ".running_var"].count;

    float *scaleBuf = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float *shiftBuf = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float *powerBuf = reinterpret_cast<float*>(malloc(sizeof(float) * channels));

    // One pass fills all three per-channel buffers.
    for (int c = 0; c < channels; ++c) {
        scaleBuf[c] = bnWeight[c] / sqrt(runVar[c] + eps);
        shiftBuf[c] = bnBias[c] - runMean[c] * bnWeight[c] / sqrt(runVar[c] + eps);
        powerBuf[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleBuf, channels};
    Weights shift{DataType::kFLOAT, shiftBuf, channels};
    Weights power{DataType::kFLOAT, powerBuf, channels};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Adds a square-kernel convolution (with bias), optionally followed by batch-norm, and then a
// ReLU activation (despite the "Leaky" in the name, the activation added is kRELU).
// Weights are looked up as lname + ".conv<i>.weight" / ".conv<i>.bias"; when use_bn is set,
// the batch-norm parameters come from lname + ".batchnorm<i>".
// Returns the activation layer.
ILayer* convBnLeaky( INetworkDefinition *network, std::map<std::string, Weights>& weightMap,
                     ITensor& input, int outch, int ksize, int s, int p, int g,
                     std::string lname, int i, bool use_bn = false )
{
    const std::string idx = std::to_string(i);
    const std::string convKey = lname + ".conv" + idx;

    IConvolutionLayer* conv = network->addConvolution(input, outch, DimsHW{ ksize, ksize },
                                                      weightMap[convKey + ".weight"], weightMap[convKey + ".bias"]);
    assert(conv);
    conv->setStride(DimsHW{s, s});
    conv->setPadding(DimsHW{p, p});
    conv->setNbGroups(g);

    // Tensor that feeds the activation: the conv output, or the batch-norm output when enabled.
    ITensor* preActivation = conv->getOutput(0);
    if (use_bn)
    {
        IScaleLayer* bn = addBatchNorm2d(network, weightMap, *preActivation, lname + ".batchnorm" + idx, 1e-5);
        preActivation = bn->getOutput(0);
    }

    IActivationLayer* relu = network->addActivation(*preActivation, ActivationType::kRELU);
    assert(relu);
    return relu;
}

// Adds a two-convolution residual block:
//   3x3 conv (stride `stride`, pad 1) -> BN -> ReLU -> 3x3 conv (pad 1) -> BN,
// summed element-wise with a shortcut and passed through a final ReLU.
// The shortcut is the block input itself, or, when inch != outch, a 1x1 conv (stride `stride`)
// followed by BN using the lname + "downsample.0" / "downsample.1" weights.
// `lname` is used as a raw prefix, so it must already end with the separator (e.g. "model.layer1.0.").
// Returns the final activation layer.
IActivationLayer* basicBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    // The convolutions carry no bias.
    Weights noBias{ DataType::kFLOAT, nullptr, 0 };

    // Main branch, first half.
    IConvolutionLayer* convA = network->addConvolution(input, outch, DimsHW{ 3, 3 }, weightMap[lname + "conv1.weight"], noBias);
    assert(convA);
    convA->setStride(DimsHW{ stride, stride });
    convA->setPadding(DimsHW{ 1, 1 });
    IScaleLayer* bnA = addBatchNorm2d(network, weightMap, *convA->getOutput(0), lname + "bn1", 1e-5);
    IActivationLayer* reluA = network->addActivation(*bnA->getOutput(0), ActivationType::kRELU);
    assert(reluA);

    // Main branch, second half.
    IConvolutionLayer* convB = network->addConvolution(*reluA->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + "conv2.weight"], noBias);
    assert(convB);
    convB->setPadding(DimsHW{ 1, 1 });
    IScaleLayer* bnB = addBatchNorm2d(network, weightMap, *convB->getOutput(0), lname + "bn2", 1e-5);

    // Shortcut branch: identity unless the channel count changes.
    ITensor* shortcut = &input;
    if (inch != outch) {
        IConvolutionLayer* convS = network->addConvolution(input, outch, DimsHW{ 1, 1 }, weightMap[lname + "downsample.0.weight"], noBias);
        assert(convS);
        convS->setStride(DimsHW{ stride, stride });
        IScaleLayer* bnS = addBatchNorm2d(network, weightMap, *convS->getOutput(0), lname + "downsample.1", 1e-5);
        shortcut = bnS->getOutput(0);
    }

    IElementWiseLayer* sum = network->addElementWise(*shortcut, *bnB->getOutput(0), ElementWiseOperation::kSUM);
    IActivationLayer* reluOut = network->addActivation(*sum->getOutput(0), ActivationType::kRELU);
    assert(reluOut);
    return reluOut;
}

// Appends the name of every entry in directory `p_dir_name`, except "." and "..", to `file_names`.
// Only the bare entry name is stored, not a path that includes the directory.
// Returns 0 on success, or -1 when the directory cannot be opened (file_names is left untouched).
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir = opendir(p_dir_name);
    if (dir == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.push_back(std::string(entry->d_name));
    }

    closedir(dir);
    return 0;
}

#endif


================================================
FILE: ufld/gen_wts.py
================================================
import torch
import struct
#import models.crnn as crnn
from model.model import parsingNet

# Initialize
# NOTE(review): the model is instantiated and put in eval mode, but the checkpoint is never
# loaded into it; the weights written below come straight from the checkpoint's state dict.
model = parsingNet(pretrained = False, backbone='18', cls_dim = (101, 56, 4), use_aux=False)
device = 'cpu'
# Load model
state_dict = torch.load('tusimple_18.pth', map_location='cpu')['model']
model.to(device).eval()

# Write the .wts text file: the first line is the number of tensors, then one line per tensor
# with its name, element count and every value as the hex of its big-endian float32 encoding.
# The context manager guarantees the file is flushed and closed even if a write fails.
with open('lane.wts', 'w') as f:
    f.write('{}\n'.format(len(state_dict.keys())))
    for k, v in state_dict.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f',float(vv)).hex())
        f.write('\n')


================================================
FILE: ufld/lane_det.cpp
================================================
#include <iostream>
#include <chrono>
#include <string>
#include <sstream>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"

// ---- Build / runtime configuration for the lane detection sample ----
// When defined, createEngine() requests FP16 if the platform reports fast FP16 support.
#define USE_FP16  // comment out this if want to use FP32
// Device index passed to cudaSetDevice() at the start of main().
#define DEVICE 0  // GPU id
// Batch size used both when building the engine and when calling doInference().
#define BATCH_SIZE 1
// Input tensor shape (channels, height, width) registered with addInput().
static const int INPUT_C = 3;
static const int INPUT_H = 288;
static const int INPUT_W = 800;
// Output tensor shape; the network output is reshaped to (OUTPUT_C, OUTPUT_H, OUTPUT_W).
// NOTE(review): main() treats class index 100 (the last of the 101) as "no detection" - confirm against the model.
static const int OUTPUT_C = 101;
static const int OUTPUT_H = 56;
static const int OUTPUT_W = 4;
// Number of floats in one output sample.
static const int OUTPUT_SIZE = OUTPUT_C * OUTPUT_H * OUTPUT_W;
// Tensor names used to register the input/output and to look up binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// Logger instance handed to the TensorRT builder and runtime.
static Logger gLogger;

// Create the engine using only the TensorRT network-definition API (no parser).
//
// Builds the network layer by layer from the weights in "../lane.wts":
// a stem (7x7 conv + batch norm + ReLU + max pool), eight basicBlock() stages,
// a 1x1 convolution down to 8 channels, and two fully connected layers that are
// expressed as matrix multiply + bias add. The result is reshaped to
// (101, 56, 4) and marked as the output tensor.
//
// maxBatchSize   maximum batch size configured on the builder
// builder        TensorRT builder used to create the network and the engine
// builderConfig  builder configuration (workspace size, FP16 flag)
// dt             data type of the input tensor
// returns        the built engine, or nullptr when the build failed
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder,IBuilderConfig* builderConfig, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{INPUT_C, INPUT_H, INPUT_W });
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../lane.wts");
#if 0
    /* print layer names */
    for(std::map<std::string, Weights>::iterator iter = weightMap.begin(); iter != weightMap.end() ; iter++)
    {
        std::cout << iter->first << std::endl;
    }
#endif
    // Stem: 7x7 stride-2 convolution without bias, followed by batch norm, ReLU and max pooling.
    auto conv1 = network->addConvolution(*data, 64, DimsHW{ 7, 7 }, weightMap["model.conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStride(DimsHW{2, 2});
    conv1->setPadding(DimsHW{3, 3});
    conv1->setNbGroups(1);

    auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "model.bn1", 1e-5);
    assert(bn1);
    auto relu0 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu0);
    IPoolingLayer* pool0 = network->addPooling(*relu0->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 });
    // Validate the layer before configuring it (the check used to come after the first use).
    assert(pool0);
    pool0->setStride( DimsHW{ 2, 2 } );
    pool0->setPadding( DimsHW{ 1, 1 } );

    // Four stages of two basic blocks each; the first block of stages 2-4 uses stride 2.
    auto basic0 = basicBlock(network, weightMap, *pool0->getOutput(0), 64, 64, 1, "model.layer1.0.");
    auto basic1 = basicBlock(network, weightMap, *basic0->getOutput(0), 64, 64, 1, "model.layer1.1.");
    auto basic2_0 = basicBlock(network, weightMap, *basic1->getOutput(0), 64, 128, 2, "model.layer2.0.");

    auto basic2_1 = basicBlock(network, weightMap, *basic2_0->getOutput(0), 128, 128, 1, "model.layer2.1.");

    auto basic3_0 = basicBlock(network, weightMap, *basic2_1->getOutput(0), 128, 256, 2, "model.layer3.0.");

    auto basic3_1 = basicBlock(network, weightMap, *basic3_0->getOutput(0), 256, 256, 1, "model.layer3.1.");

    auto basic4_0 = basicBlock(network, weightMap, *basic3_1->getOutput(0), 256, 512, 2, "model.layer4.0.");

    auto basic4_1 = basicBlock(network, weightMap, *basic4_0->getOutput(0), 512, 512, 1, "model.layer4.1.");

#if 0
    /* just for debug */
    Dims dims1 = basic4_1->getOutput(0)->getDimensions();
    for (int i = 0; i < dims1.nbDims; i++)
    {
        std::cout << dims1.d[i] << "-" << (int)dims1.type[i] << "   ";
    }
    std::cout << std::endl;
#endif

    // 1x1 convolution (stored under the "pool" weight names) reducing to 8 channels.
    auto conv2 = network->addConvolution(*basic4_1->getOutput(0), 8, DimsHW{ 1, 1 }, weightMap["pool.weight"], weightMap["pool.bias"]);
    assert(conv2);
    conv2->setStride(DimsHW{1, 1});
    conv2->setPadding(DimsHW{0, 0});
    conv2->setNbGroups(1);

    // Flatten to a 1 x 1800 row vector for the first fully connected layer.
    // NOTE(review): 1800 presumably equals 8 channels x 9 x 25 spatial positions - confirm.
    IShuffleLayer* permute0 = network->addShuffle(*conv2->getOutput(0));
    assert(permute0);
    permute0->setReshapeDimensions( Dims2{1, 1800});

    // First fully connected layer: x * W^T, W stored as (2048, 1800).
    auto fcwts0 = network->addConstant(nvinfer1::Dims2(2048, 1800), weightMap["cls.0.weight"]);
    assert(fcwts0 != nullptr);
    auto matrixMultLayer0 = network->addMatrixMultiply(*permute0->getOutput(0), MatrixOperation::kNONE, *fcwts0->getOutput(0), MatrixOperation::kTRANSPOSE);

    assert(matrixMultLayer0 != nullptr);
    // Add elementwise layer for adding bias
    auto fcbias0 = network->addConstant(nvinfer1::Dims2(1, 2048), weightMap["cls.0.bias"]);
    assert(fcbias0 != nullptr);

    auto addBiasLayer0 = network->addElementWise(*matrixMultLayer0->getOutput(0), *fcbias0->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    assert(addBiasLayer0 != nullptr);

    auto relu = network->addActivation(*addBiasLayer0->getOutput(0), ActivationType::kRELU);
    assert(relu != nullptr);

    // Second fully connected layer: 2048 -> 22624 (= 101 * 56 * 4) values.
    auto fcwts1 = network->addConstant(nvinfer1::Dims2(22624, 2048), weightMap["cls.2.weight"]);
    assert(fcwts1 != nullptr);
    auto matrixMultLayer1 = network->addMatrixMultiply(*relu->getOutput(0), MatrixOperation::kNONE, *fcwts1->getOutput(0), MatrixOperation::kTRANSPOSE);

    assert(matrixMultLayer1 != nullptr);
    // Add elementwise layer for adding bias
    auto fcbias1 = network->addConstant(nvinfer1::Dims2(1, 22624), weightMap["cls.2.bias"]);
    assert(fcbias1 != nullptr);

    auto addBiasLayer1 = network->addElementWise(*matrixMultLayer1->getOutput(0), *fcbias1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    assert(addBiasLayer1 != nullptr);

    // Reshape the flat result into the (101, 56, 4) output tensor.
    IShuffleLayer* permute1 = network->addShuffle(*addBiasLayer1->getOutput(0));
    assert(permute1);
    permute1->setReshapeDimensions( Dims3{ 101, 56, 4 });

    permute1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*permute1->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    builderConfig->setMaxWorkspaceSize(16 * (1 << 20));// 16MB

#ifdef USE_FP16
    if(builder->platformHasFastFp16()) {
        std::cout << "Platform supports fp16 mode and use it !!!" << std::endl;
        builderConfig->setFlag(BuilderFlag::kFP16);
    } else {
        std::cout << "Platform doesn't support fp16 mode so you can't use it !!!" << std::endl;
    }
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *builderConfig);
    // Only claim success when the builder actually produced an engine.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
    }

    return engine;
}

// Build the engine with createEngine() and hand back its serialized form.
//
// maxBatchSize  maximum batch size forwarded to createEngine()
// modelStream   output: receives the serialized engine; the caller owns it and
//               must release it with destroy()
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* builderConfig = builder->createBuilderConfig();
    assert(builderConfig != nullptr);
    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, builderConfig, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The builder config was previously leaked; release it
    // before the builder that created it.
    engine->destroy();
    builderConfig->destroy();
    builder->destroy();
}

// Run one synchronous inference pass.
//
// Allocates device buffers for the two engine bindings, copies `input` to the
// device, enqueues the execution, copies the result into `output`, then
// releases the stream and the device buffers.
//
// context    execution context of a deserialized engine with exactly 2 bindings
// input      host buffer holding batchSize * INPUT_C * INPUT_H * INPUT_W floats
// output     host buffer receiving batchSize * OUTPUT_SIZE floats
// batchSize  number of samples in `input`
//
// When enqueue() reports failure an error is printed and `output` is left
// untouched instead of being filled from an uninitialized device buffer.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    const size_t inputBytes = static_cast<size_t>(batchSize) * INPUT_C * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (context.enqueue(batchSize, buffers, stream, nullptr)) {
        CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    } else {
        std::cerr << "doInference: enqueue failed, output buffer left unchanged" << std::endl;
    }
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Convert a BGR image into the normalized, planar float buffer fed to the network.
//
// Steps: BGR -> RGB, resize to INPUT_W x INPUT_H, scale to [0, 1], split into
// channel planes (HWC -> CHW) and normalize each plane with a per-channel mean
// and standard deviation.
//
// img      source image; it is only read - the colour conversion is written to
//          a local copy so the caller's image keeps its original channel order
// returns  INPUT_C * INPUT_H * INPUT_W floats, one full plane per channel
std::vector<float> prepareImage(cv::Mat & img)
{
    cv::Mat rgb;
    cv::cvtColor(img, rgb, cv::COLOR_BGR2RGB);
    cv::Mat resized;
    cv::resize(rgb, resized, cv::Size(INPUT_W, INPUT_H));

    cv::Mat img_float;

    resized.convertTo(img_float, CV_32FC3, 1. / 255.);

    // HWC TO CHW
    std::vector<cv::Mat> input_channels(INPUT_C);
    cv::split(img_float, input_channels);

    // normalize
    std::vector<float> result(INPUT_H * INPUT_W * INPUT_C);
    float* dst = result.data();
    const int channelLength = INPUT_H * INPUT_W;
    // Per-channel statistics, indexed in RGB order.
    static const float mean[] = {0.485, 0.456, 0.406};
    static const float stddev[] = {0.229, 0.224, 0.225};
    for (int i = 0; i < INPUT_C; ++i) {
        cv::Mat normed_channel = (input_channels[i] - mean[i]) / stddev[i];
        memcpy(dst, normed_channel.data, channelLength * sizeof(float));
        dst += channelLength;
    }

    return result;
}

/*
 * Softmax over the channel axis followed by the expected bin position.
 *
 * x is laid out as x[k * rows * cols + i * cols + j] (channel-major). For every
 * (i, j) cell the first `chan - 1` channels are turned into a softmax
 * distribution p[k], and y[i * cols + j] receives sum_k p[k] * (k + 1).
 * The last channel is ignored.
 *
 * x is modified in place: on return the first `chan - 1` channels hold
 * p[k] * (k + 1).
 *
 * The per-cell maximum is subtracted before exponentiation so that large
 * logits no longer overflow to inf (which produced NaN results); this does
 * not change the mathematical value of the softmax.
 */
void softmax_mul(float* x, float* y, int rows, int cols, int chan)
{
    const int plane = rows * cols;
    const int nBins = chan - 1;
    for(int i = 0; i < rows; i++)
    {
        for(int j = 0; j < cols; j++)
        {
            const int cell = i * cols + j;
            if(nBins <= 0)
            {
                // Nothing to average over: the expectation is defined as 0.
                y[cell] = 0.0f;
                continue;
            }

            // Largest logit of this cell, used to keep exp() in range.
            float peak = x[cell];
            for(int k = 1; k < nBins; k++)
            {
                const float v = x[k * plane + cell];
                if(v > peak)
                {
                    peak = v;
                }
            }

            float sum = 0.0f;
            for(int k = 0; k < nBins; k++)
            {
                const float e = exp(x[k * plane + cell] - peak);
                x[k * plane + cell] = e;
                sum += e;
            }

            float expect = 0.0f;
            for(int k = 0; k < nBins; k++)
            {
                const float weighted = x[k * plane + cell] / sum * (k + 1);
                x[k * plane + cell] = weighted;
                expect += weighted;
            }
            y[cell] = expect;
        }
    }
}
/*
 * Index of the largest value along the channel axis.
 *
 * x is laid out as x[k * rows * cols + i * cols + j] (channel-major). For every
 * (i, j) cell, y[i * cols + j] receives the channel index k with the largest
 * value; ties keep the lowest index. When chan <= 0 the result is -1.
 *
 * The running maximum is kept as a float and seeded from the first channel.
 * It used to be an int, which truncated every stored value and could select
 * the wrong channel (e.g. {0.7, 0.2} yielded index 1).
 */
void argmax(float* x, float* y, int rows, int cols, int chan)
{
    const int plane = rows * cols;
    for(int i = 0; i < rows; i++)
    {
        for(int j = 0; j < cols; j++)
        {
            const int cell = i * cols + j;
            float best_val = 0.0f;
            int best_ind = -1;
            for(int k = 0; k < chan; k++)
            {
                const float v = x[k * plane + cell];
                if(best_ind < 0 || v > best_val)
                {
                    best_val = v;
                    best_ind = k;
                }
            }
            y[cell] = static_cast<float>(best_ind);
        }
    }
}

int main(int argc, char** argv)
{
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{ nullptr };
    size_t size{ 0 };

    if (argc == 2 && std::string(argv[1]) == "-s")
    {
            IHostMemory* modelStream{ nullptr };
            APIToModel(BATCH_SIZE, &modelStream);
            assert(modelStream != nullptr);
            std::ofstream p("lane_det.engine", std::ios::binary);
            if (!p) {
                    std::cerr << "could not open plan output file" << std::endl;
                    return -1;
            }
            p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
            modelStream->destroy();
            return 0;
    }
    else if (argc == 3 && std::string(argv[1]) == "-d")
    {
            std::ifstream file("lane_det.engine", std::ios::binary);
            if (file.good()) {
                    file.seekg(0, file.end);
                    size = file.tellg();
                    file.seekg(0, file.beg);
                    trtModelStream = new char[size];
                    assert(trtModelStream);
                    file.read(trtModelStream, size);
                    file.close();
            }
    }
    else
    {
            std::cerr << "arguments not right!" << std::endl;
            std::cerr << "./crnn -s  // serialize model to plan file" << std::endl;
            std::cerr << "./crnn -d ../samples  // deserialize plan file and run inference" << std::endl;
            return -1;
    }

    /* prepare input data */
    static float data[BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W];
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    std::vector<std::string> file_names;
    if (read_files_in_dir(argv[2], file_names) < 0) {
            std::cout << "read_files_in_dir failed." << std::endl;
            return -1;
    }

    int fcount = 0;
    int vis_h = 720;
    int vis_w = 1280;
    int col_sample_w = 8;
    for (int f = 0; f < (int)file_names.size(); f++)
    {
        cv::Mat vis;
        fcount++;
        if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue;
        for (int b = 0; b < fcount; b++)
        {
            cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b], 1);
            if (img.empty()) continue;
            cv::resize(img, vis, cv::Size(vis_w, vis_h));
            std::vector<float> result(INPUT_C * INPUT_W * INPUT_H);
            result = prepareImage(img);
            memcpy(data, &result[0], INPUT_C * INPUT_W * INPUT_H * sizeof(float));
        }

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, BATCH_SIZE); //prob: size (101, 56, 4)
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time is "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << " ms" << std::endl;

        std::vector<int> tusimple_row_anchor
            { 64,  68,  72,  76,  80,  84,  88,  92,  96,  100, 104, 108, 112,
              116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164,
              168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216,
              220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268,
              272, 276, 280, 284 };

        float max_ind[BATCH_SIZE * OUTPUT_H * OUTPUT_W];
        float prob_reverse[BATCH_SIZE * OUTPUT_SIZE];
        /* do out_j = out_j[:, ::-1, :] in python list*/
        float expect[BATCH_SIZE * OUTPUT_H * OUTPUT_W];
        for (int k = 0, wh = OUTPUT_W * OUTPUT_H; k < OUTPUT_C; k++)
        {
            for(int j = 0; j < OUTPUT_H; j ++)
            {
                for(int l = 0; l < OUTPUT_W; l++)
                {
                    prob_reverse[k * wh + (OUTPUT_H - 1 - j) * OUTPUT_W + l] =
                        prob[k * wh + j * OUTPUT_W + l];
                }
            }
        }

        argmax(prob_reverse, max_ind, OUTPUT_H, OUTPUT_W, OUTPUT_C);
        /* calculate softmax and Expect */
        softmax_mul(prob_reverse, expect, OUTPUT_H, OUTPUT_W, OUTPUT_C);
        for(int k = 0; k < OUTPUT_H; k++) {
            for(int j = 0; j < OUTPUT_W; j++) {
                max_ind[k * OUTPUT_W + j] == 100 ? expect[k * OUTPUT_W + j] = 0 :
                    expect[k * OUTPUT_W + j] = expect[k * OUTPUT_W + j];
            }
        }
        std::vector<int> i_ind;
        for(int k = 0; k < OUTPUT_W; k++) {
            int ii = 0;
            for(int g = 0; g < OUTPUT_H; g++) {
                if(expect[g * OUTPUT_W + k] != 0)
                    ii++;
            }
            if(ii > 2) {
                i_ind.push_back(k);
            }
        }
        for(int k = 0; k < OUTPUT_H; k++) {
            for(int ll = 0; ll < i_ind.size(); ll++) {
                if(expect[OUTPUT_W * k + i_ind[ll]] > 0) {
                    cv::Point pp =
                        { int(expect[OUTPUT_W * k + i_ind[ll]] * col_sample_w * vis_w / INPUT_W) - 1,
                          int( vis_h * tusimple_row_anchor[OUTPUT_H - 1 - k] / INPUT_H) - 1 };
                    cv::circle(vis, pp, 8, CV_RGB(0, 255 ,0), 2);
                }
            }
        }
        cv::imshow("lane_vis",vis);
        cv::waitKey(0);
    }

    return 0;
}


================================================
FILE: ufld/logging.h
================================================
#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

// Shorthand for the TensorRT logger severity enumeration used throughout this header.
using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream, prefixed by a timestamp
//!        and a fixed prefix, each time it is synchronized.
//!
//! Nothing is emitted while logging is disabled (see setShouldLog()).
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //!
    //! \param stream    stream that receives the formatted messages
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog whether messages are forwarded at all
    //!
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //!
    //! \brief Move constructor: takes over the target stream, the prefix and the logging flag.
    //!
    //! Text already buffered in \p other is not transferred. The prefix and flag used to be
    //! left uninitialized here, which made the moved-to buffer read an indeterminate flag.
    //!
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //!
    //! \brief Write "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer>" to the target stream, then clear the buffer.
    //!
    //! Does nothing when logging is disabled; in that case the buffer is not cleared either.
    //!
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp (written to the same stream as the message, so that
            // messages routed to std::cerr keep their timestamp)
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                // zero-pad the fields, and restore the stream's fill character afterwards
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! \brief Enable or disable forwarding of buffered messages.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // destination stream
    std::string mPrefix;   // text placed in front of every message
    bool mShouldLog;       // when false, putOutput() is a no-op
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!
//! Deriving from this class first guarantees that the buffer is fully constructed
//! before its address is handed to the std::ostream base of LogStreamConsumer.
//!
class LogStreamConsumerBase
{
public:
    //! \brief Forwards all three arguments to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog) {}

protected:
    //! Buffer shared with the derived stream class.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream that lets callers log with the usual C++ `<<` syntax.
//!
//!  The base classes must stay in the order LogStreamConsumerBase, std::ostream:
//!  LogStreamConsumerBase owns the LogStreamConsumerBuffer, and std::ostream is
//!  given the address of that buffer, so the buffer has to be constructed first.
//!  Swapping the bases would pass the address of an unconstructed buffer to
//!  std::ostream. Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a stream for messages of the given severity.
    //!
    //!  Messages are emitted only when `severity <= reportableSeverity`.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Move constructor: builds a fresh buffer with the severity and logging flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this stream logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! \brief Severities of kINFO and above (in enum value) go to std::cout, the others to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Short tag placed in front of each message of the given severity.
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Constructs a logger with the given reportable severity (kWARNING by default).
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is written through a temporary LogStreamConsumer, prefixed with "[TRT] ".
    //! TRT_NOEXCEPT comes from macros.h and expands to noexcept for TensorRT 8 and later.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override 
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        // Only Logger (a friend) can create a TestAtom; see Logger::defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set to true by Logger::reportTestStart()
        std::string mName;    // test name printed in the result line
        std::string mCmdline; // command line printed in the result line
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Note: the RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed.
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed.
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived.
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass.
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently used to filter messages.
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints one line of the form "&&&& <RESULT> <test name> # <command line>".
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces, without any quoting.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; // threshold handed to each LogStreamConsumer created by log()
};

namespace
{

//!
//! \brief Returns a LogStreamConsumer that logs at severity kVERBOSE.
//!
//! Usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kVERBOSE);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kINFO.
//!
//! Usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINFO);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kWARNING.
//!
//! Usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kWARNING);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kERROR.
//!
//! Usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kERROR);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kINTERNAL_ERROR
//!        (the "fatal" severity).
//!
//! Usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: ufld/macros.h
================================================
// Portability macros: symbol export/import decoration (API) and TensorRT
// version-dependent qualifiers (TRT_NOEXCEPT, TRT_CONST_ENQUEUE).
#ifndef __MACROS_H
#define __MACROS_H

// API: decorates symbols exported from / imported into a shared library.
// Define API_EXPORTS when building the library itself.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// For TensorRT 8 and later these expand to `noexcept` / `const`; otherwise to nothing.
// NOTE: this header includes nothing itself, so NV_TENSORRT_MAJOR must already be
// defined by a header included earlier. An undefined macro evaluates to 0 in #if,
// which silently selects the empty definitions below.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: ufld/pth2onnx.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import torch.onnx as torch_onnx
from model.model import parsingNet

# Path of the trained checkpoint that is exported to ONNX.
MODELPATH = "tusimple_18.pth"

# Build the network on the GPU with the configuration used for the export.
net = parsingNet(pretrained = False, backbone='18', cls_dim = (101, 56, 4), use_aux=False).cuda()

# The checkpoint keeps the weights under its 'model' key.
state_dict = torch.load(MODELPATH, map_location='cpu')['model']

# Previously the loaded weights were never applied to `net`, so the exported graph
# contained the randomly initialised parameters instead of the trained ones.
# Keys may carry a 'module.' prefix (as produced when a model is saved while wrapped
# in DataParallel); strip it so the names match the bare network.
compatible_state_dict = {
    (k[len('module.'):] if k.startswith('module.') else k): v
    for k, v in state_dict.items()
}
# strict=False tolerates checkpoint entries without a counterpart in this network.
# NOTE(review): presumably these are the auxiliary-head weights, which do not exist
# when use_aux=False — confirm against the checkpoint contents.
net.load_state_dict(compatible_state_dict, strict=False)

# Switch to inference mode before tracing.
net.train(False)

# Dummy input used to trace the graph: batch 1, 3 channels, 288x800.
x = torch.randn(1, 3, 288, 800).cuda()

torch_onnx.export(net, x, "lane.onnx", verbose=True, input_names=["input"], output_names=["output"],opset_version=11)


================================================
FILE: unet/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.6)

project(unet)

add_definitions(-std=c++11)

# option() takes (<variable> "<help text>" [value]). The previous call passed OFF in
# the help-text position, leaving the value to fall back to its default.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

# cuda directory
include_directories(/usr/local/cuda/include/)
link_directories(/usr/local/cuda/lib64/)

# tensorrt (adjust these two paths to the local TensorRT installation)
include_directories(/workspace/TensorRT-7.2.3.4/include/)
link_directories(/workspace/TensorRT-7.2.3.4/lib/)

# opencv library
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# link library and add exec file
add_executable(unet ${PROJECT_SOURCE_DIR}/unet.cpp)
target_link_libraries(unet nvinfer)
target_link_libraries(unet cudart)
target_link_libraries(unet ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: unet/README.md
================================================
# UNet

Pytorch model from [Pytorch-UNet](https://github.com/milesial/Pytorch-UNet).

## Contributors

<a href="https://github.com/YuzhouPeng"><img src="https://avatars.githubusercontent.com/u/13601004?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/East-Face"><img src="https://avatars.githubusercontent.com/u/35283869?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/irvingzhang0512"><img src="https://avatars.githubusercontent.com/u/22089207?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/wang-xinyu"><img src="https://avatars.githubusercontent.com/u/15235574?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/nengwp"><img src="https://avatars.githubusercontent.com/u/44516353?s=96&v=4" width="40px;" alt=""/></a>


## Requirements

TensorRT 8.x is now supported.
The previous bug was caused by the stride setting of the pooling layers.

## Build and Run

1. Generate .wts
```
cp {path-of-tensorrtx}/unet/gen_wts.py Pytorch-UNet/
cd Pytorch-UNet/
wget https://github.com/milesial/Pytorch-UNet/releases/download/v3.0/unet_carvana_scale0.5_epoch2.pth
python gen_wts.py unet_carvana_scale0.5_epoch2.pth
```

2. Generate TensorRT engine
```
cd tensorrtx/unet/
mkdir build
cd build
cmake ..
make
cp {path-of-Pytorch-UNet}/unet.wts .
./unet -s
```

3. Run inference
```
wget https://raw.githubusercontent.com/wang-xinyu/tensorrtx/f60dcc7bec28846cd973fc95ac829c4e57a11395/unet/samples/0cdf5b5d0ce1_01.jpg
./unet -d 0cdf5b5d0ce1_01.jpg
```

4. Check result.jpg

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/207358769-dacf908e-f65d-4b2e-bc53-4fa2a9114c2a.jpg" height="360px;">
</p>

## Benchmark

Pytorch | TensorRT FP32 | TensorRT FP16
---- | ----- | ------ 
816x672  | 816x672 | 816x672
58ms  | 43ms (batchsize 8) | 14ms (batchsize 8)

## More Information

See the README on the [home page](https://github.com/wang-xinyu/tensorrtx).


================================================
FILE: unet/common.hpp
================================================
#ifndef UNET_COMMON_H_
#define UNET_COMMON_H_

#include <fstream>
#include <map>
#include <sstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"

// CHECK(status): evaluates `status` exactly once; if the result compares unequal to 0,
// prints "Cuda failure: <value>" to std::cerr and terminates the process with abort().
// Wrapped in do { ... } while (0) so it behaves as a single statement after if/else.
// NOTE(review): this header does not include <iostream> or <cstdlib> itself, so std::cerr
// and abort() must be made visible by the including file — confirm.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

using namespace nvinfer1;

// TensorRT weight (.wts) files use a simple whitespace-delimited text format:
//   <number of blobs>
//   <name> <element count> <element 0 in hex> <element 1 in hex> ...
//
// Reads every blob of `file` into a freshly malloc'ed buffer of 32-bit words and returns
// a map from blob name to a Weights descriptor of type kFLOAT pointing at that buffer.
// The buffers are not released here; the caller owns them and must free() them.
// Failure to open the file or a non-positive blob count trips an assert.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob: one 32-bit word per element. The allocation previously used
        // sizeof(val), i.e. the size of the pointer, which over-allocated on 64-bit builds.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((size == 0 || val != nullptr) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Expresses a batch-normalisation with fixed statistics as a per-channel scale layer.
//
// Reads "<lname>.weight", "<lname>.bias", "<lname>.running_mean" and "<lname>.running_var"
// from weightMap and computes, for every channel c:
//     scale[c] = weight[c] / sqrt(running_var[c] + eps)
//     shift[c] = bias[c] - running_mean[c] * weight[c] / sqrt(running_var[c] + eps)
//     power[c] = 1
// The three malloc'ed arrays are stored back into weightMap under "<lname>.scale",
// "<lname>.shift" and "<lname>.power", then passed to network->addScale(kCHANNEL).
// Returns the created layer (asserted non-null).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* gamma = (const float*)weightMap[lname + ".weight"].values;
    const float* beta = (const float*)weightMap[lname + ".bias"].values;
    const float* mean = (const float*)weightMap[lname + ".running_mean"].values;
    const float* var = (const float*)weightMap[lname + ".running_var"].values;
    const int channels = weightMap[lname + ".running_var"].count;

    // One buffer per scale-layer parameter, each holding one float per channel.
    const auto bytes = sizeof(float) * channels;
    float* scaleVals = reinterpret_cast<float*>(malloc(bytes));
    float* shiftVals = reinterpret_cast<float*>(malloc(bytes));
    float* powerVals = reinterpret_cast<float*>(malloc(bytes));

    for (int c = 0; c < channels; ++c) {
        scaleVals[c] = gamma[c] / sqrt(var[c] + eps);
        shiftVals[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        powerVals[c] = 1.0;
    }

    const Weights scaleW{DataType::kFLOAT, scaleVals, channels};
    const Weights shiftW{DataType::kFLOAT, shiftVals, channels};
    const Weights powerW{DataType::kFLOAT, powerVals, channels};

    // Register the buffers in the map so whoever releases the map's buffers also covers these.
    weightMap[lname + ".scale"] = scaleW;
    weightMap[lname + ".shift"] = shiftW;
    weightMap[lname + ".power"] = powerW;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shiftW, scaleW, powerW);
    assert(bnLayer);
    return bnLayer;
}

#endif


================================================
FILE: unet/gen_wts.py
================================================
import torch
import sys
import struct

def main():
  """Convert a PyTorch state dict into the text .wts format.

  Loads the checkpoint named by the first command-line argument onto the CPU and
  writes "unet.wts" in the current directory. The first line is the number of
  entries; every following line is "<key> <element count>" followed by each
  element as the hex encoding of its big-endian 32-bit float representation.

  Exits with a usage message when no checkpoint path is given.
  """
  if len(sys.argv) < 2:
    sys.exit("usage: python gen_wts.py <path-to-checkpoint.pth>")

  device = torch.device('cpu')
  state_dict = torch.load(sys.argv[1], map_location=device)

  # The context manager closes the file even if a tensor fails to convert midway.
  with open("unet.wts", 'w') as f:
    f.write("{}\n".format(len(state_dict.keys())))
    for k, v in state_dict.items():
      print('key: ', k)
      print('value: ', v.shape)
      # Flatten to 1-D so elements are written in storage order.
      vr = v.reshape(-1).cpu().numpy()
      f.write("{} {}".format(k, len(vr)))
      for vv in vr:
        f.write(" ")
        f.write(struct.pack(">f", float(vv)).hex())
      f.write("\n")

if __name__ == '__main__':
  main()


================================================
FILE: unet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes its contents to an output stream,
//!  preceded by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    Stream that receives the formatted output.
    //! \param prefix    Text inserted between the timestamp and the buffered message.
    //! \param shouldLog When false, putOutput() writes nothing.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, prefix and logging flag of \p other.
    //! Previously only the stream was taken, which left mShouldLog uninitialized
    //! (read later by putOutput()) and dropped the prefix. Buffered characters of
    //! \p other are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the two differ there is unflushed text left, so emit it before the buffer goes away
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the output stream, then clears the
    //! buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so that the two
            // cannot end up on different streams (it was previously always sent to std::cout)
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                // the fill character is sticky on a stream, so remember and restore it
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holds the LogStreamConsumerBuffer so that, as the first base class of
//!  LogStreamConsumer, it is constructed before the std::ostream base that points at it.
//!
class LogStreamConsumerBase
{
protected:
    //! Buffer that the derived stream is attached to.
    LogStreamConsumerBuffer mBuffer;

public:
    //! Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for a single severity level, usable with the regular << syntax.
//!
//!  The base-class order (LogStreamConsumerBase first, std::ostream second) is load-bearing:
//!  the base owns the LogStreamConsumerBuffer, and std::ostream receives that buffer's
//!  address, so the buffer has to be constructed first. Do not reorder the bases.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a stream that logs messages of level \p severity.
    //!  Output is enabled when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the same severity and enabled state as \p other;
    //!  a fresh buffer is created rather than taken over from \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this stream's severity is reportable and
    //!  propagates the result to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! Severities comparing >= kINFO are routed to std::cout, all others to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed one-letter tag; unknown values assert and yield "".
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Console logger shared by TensorRT tools and samples.
//!
//! \details Two kinds of output are produced:
//!
//! - Severity-tagged messages, delivered through the nvinfer1::ILogger::log() override
//! - Test status lines ("&&&& <STATE> <name> # <cmdline>") via the static report* helpers
//!
//! Routing every message through this class keeps verbosity control and formatting in one place.
//!
//! TODO: The class currently inherits from nvinfer1::ILogger for backwards compatibility with
//! existing samples, so messages from the TensorRT library and from the sample are not cleanly
//! separated. Once all samples obtain the ILogger through Logger::getTRTLogger(), the inheritance
//! can be replaced by an ILogger member.
//!
class Logger : public nvinfer1::ILogger
{
public:
    //! \brief Creates a logger with the given reportable severity (kWARNING by default).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief State of a test as printed in the status line
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible accessor for the nvinfer1::ILogger associated with this Logger
    //! \return This object, viewed as nvinfer1::ILogger
    //!
    //! TODO Once every sample registers the logger through this method, the inheritance
    //! of Logger from ILogger can be removed.
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] " followed by \p msg and a newline through a LogStreamConsumer built from
    //! the current reportable severity and \p severity. Samples should not call this directly.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        LogStreamConsumer consumer(mReportableSeverity, severity);
        consumer << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! Instances are created through Logger::defineTest() and passed to
    //! Logger::reportTest{Start,End}(); only Logger can read the contents.
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set once reportTestStart() has run for this atom
        std::string mName;    //!< Test name printed in the status line
        std::string mCmdline; //!< Command line printed after the '#'
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Convenience overload of defineTest() that builds the command line from argc/argv
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        const std::string cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // The RUNNING line is printed first; the precondition is checked afterwards.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        if (pass)
        {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the severity threshold currently configured on this logger.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        const char* label = "";
        switch (result)
        {
        case TestResult::kRUNNING: label = "RUNNING"; break;
        case TestResult::kPASSED: label = "PASSED"; break;
        case TestResult::kFAILED: label = "FAILED"; break;
        case TestResult::kWAIVED: label = "WAIVED"; break;
        default: assert(0); break;
        }
        return label;
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream joined;
        const char* separator = "";
        for (int i = 0; i < argc; ++i)
        {
            // the separator is empty for the first argument and a single space afterwards
            joined << separator << argv[i];
            separator = " ";
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief Creates a LogStreamConsumer for severity kVERBOSE, enabled according to the
//!        reportable severity of \p logger at the time of the call.
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kVERBOSE);
}

//!
//! \brief Creates a LogStreamConsumer for severity kINFO, enabled according to the
//!        reportable severity of \p logger at the time of the call.
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINFO);
}

//!
//! \brief Creates a LogStreamConsumer for severity kWARNING, enabled according to the
//!        reportable severity of \p logger at the time of the call.
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kWARNING);
}

//!
//! \brief Creates a LogStreamConsumer for severity kERROR, enabled according to the
//!        reportable severity of \p logger at the time of the call.
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kERROR);
}

//!
//! \brief Creates a LogStreamConsumer for severity kINTERNAL_ERROR ("fatal" severity),
//!        enabled according to the reportable severity of \p logger at the time of the call.
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: unet/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

// API: symbol-visibility annotation for declarations shared across a library boundary.
//   - API_EXPORTS defined:     __declspec(dllexport) on MSVC, default visibility elsewhere.
//   - API_EXPORTS not defined: __declspec(dllimport) on MSVC, empty elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` when NV_TENSORRT_MAJOR >= 8
// and to nothing otherwise.
// NOTE(review): NV_TENSORRT_MAJOR is not defined in this header; an undefined macro evaluates
// to 0 inside #if, so the header providing it has to be included before this one — confirm
// at each include site.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: unet/unet.cpp
================================================
#include <iostream>
#include <chrono>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"

// Build-time configuration of the UNet sample.
// DEVICE: presumably the index of the GPU to run on — its use is outside this excerpt; confirm.
#define DEVICE 0
// Precision switch: createEngine() enables the FP16 builder flag only when USE_FP16 is defined.
#define USE_FP32  // USE_FP32 or USE_FP16
// CONF_THRESH / BATCH_SIZE: not referenced by the network-building code in this excerpt —
// TODO confirm how the rest of the file uses them.
#define CONF_THRESH 0.5
#define BATCH_SIZE 1
// Number of output maps of the final 1x1 convolution (outConv) and a factor of OUTPUT_SIZE.
// NOTE(review): a lowercase macro named `cls` replaces every identifier spelled `cls` in any
// code that follows, including headers included afterwards.
#define cls 2
// true: up() upsamples with a linear resize layer; false: with a 2x2 stride-2 deconvolution.
// createEngine() also derives the channel count of "down4" from this flag (512 vs 1024).
#define BILINEAR false

// stuff we know about the network and the input/output blobs
// The network input is declared as {3, INPUT_H, INPUT_W}.
static const int INPUT_H = 640;
static const int INPUT_W = 959;
// Number of output elements per image: one INPUT_H x INPUT_W map for each of the `cls` outputs.
static const int OUTPUT_SIZE = INPUT_H * INPUT_W * cls;
// Names under which the input and output tensors are registered with the network.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// Logger instance handed to createInferBuilder().
static Logger gLogger;

using namespace nvinfer1;

// Adds the block (convolution -> batch norm -> activation) twice and returns the last layer.
//
//   network   network definition the layers are appended to
//   weightMap weights keyed by name; "<lname>.double_conv.{0,3}.weight" are the two kernels,
//             "<lname>.double_conv.{1,4}.*" the batch-norm parameters
//   input     input tensor of the block
//   outch     number of output maps of the second convolution
//   ksize     kernel size of the first convolution (the second is fixed to 3x3)
//   lname     name prefix of this block inside weightMap
//   midch     number of output maps of the first convolution
//
// Both convolutions have no bias and use stride 1 and padding 1.
ILayer* doubleConv(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, std::string lname, int midch) {
  // PyTorch's BatchNorm2d adds eps (1e-5 by default) to the variance. The layers were
  // previously folded with eps = 0, which deviates from the trained model and divides
  // by zero for a channel whose running variance is exactly 0.
  const float bnEps = 1e-5f;

  Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
  IConvolutionLayer* conv1 = network->addConvolutionNd(input, midch, DimsHW{ ksize, ksize }, weightMap[lname + ".double_conv.0.weight"], emptywts);
  assert(conv1);
  conv1->setStrideNd(DimsHW{ 1, 1 });
  conv1->setPaddingNd(DimsHW{ 1, 1 });
  conv1->setNbGroups(1);
  IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".double_conv.1", bnEps);
  // NOTE(review): kLEAKY_RELU is used without an explicit slope being set on the layer;
  // confirm this matches the activation of the exported PyTorch model.
  IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
  assert(relu1);
  IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + ".double_conv.3.weight"], emptywts);
  assert(conv2);
  conv2->setStrideNd(DimsHW{ 1, 1 });
  conv2->setPaddingNd(DimsHW{ 1, 1 });
  conv2->setNbGroups(1);
  IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".double_conv.4", bnEps);
  IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kLEAKY_RELU);
  assert(relu2);
  return relu2;
}

// Adds a 2x2 max pooling with stride 2 followed by a doubleConv block and returns its last layer.
//
//   network   network definition the layers are appended to
//   weightMap weights keyed by name; the convolution block reads "<lname>.maxpool_conv.1.*"
//   input     input tensor of the stage
//   outch     number of output maps of both convolutions in the block
//   p         unused
//   lname     name prefix of this stage inside weightMap
ILayer* down(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int p, std::string lname) {
  IPoolingLayer* pool1 = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{ 2, 2 });
  // Check the pointer before the first use; the assert used to come after setStrideNd().
  assert(pool1);
  pool1->setStrideNd(DimsHW{ 2, 2 });
  ILayer* dcov1 = doubleConv(network, weightMap, *pool1->getOutput(0), outch, 3, lname + ".maxpool_conv.1", outch);
  assert(dcov1);
  return dcov1;
}

ILayer* up(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input1, ITensor& input2, int resize, int outch, int midch, std::string lname) {
  if (BILINEAR) {
    // add upsample bilinear
    IResizeLayer* deconv1 = network->addResize(input1);
    auto outdims = input2.getDimensions();
    deconv1->setOutputDimensions(outdims);
    deconv1->setResizeMode(ResizeMode::kLINEAR);
    deconv1->setAlignCorners(true);

    int diffx = input2.getDimensions().d[1] - deconv1->getOutput(0)->getDimensions().d[1];
    int diffy = input2.getDimensions().d[2] - deconv1->getOutput(0)->getDimensions().d[2];

    ILayer* pad1 = network->addPaddingNd(*deconv1->getOutput(0), DimsHW{ diffx / 2, diffy / 2 }, DimsHW{ diffx - (diffx / 2), diffy - (diffy / 2) });
    // dcov1->setPaddingNd(DimsHW{diffx / 2, diffx - diffx / 2},DimsHW{diffy / 2, diffy - diffy / 2});
    ITensor* inputTensors[] = { &input2,pad1->getOutput(0) };
    auto cat = network->addConcatenation(inputTensors, 2);
    assert(cat);
    if (midch == 64) {
      ILayer* dcov1 = doubleConv(network, weightMap, *cat->getOutput(0), outch, 3, lname + ".conv", outch);
      assert(dcov1);
      return dcov1;
    } else {
      int midch1 = outch / 2;
      ILayer* dcov1 = doubleConv(network, weightMap, *cat->getOutput(0), midch1, 3, lname + ".conv", outch);
      assert(dcov1);
      return dcov1;
    }
  } else {
    IDeconvolutionLayer* deconv1 = network->addDeconvolutionNd(input1, resize, DimsHW{ 2, 2 }, weightMap[lname + ".up.weight"], weightMap[lname + ".up.bias"]);
    deconv1->setStrideNd(DimsHW{ 2, 2 });
    deconv1->setNbGroups(1);

    int diffx = input2.getDimensions().d[1] - deconv1->getOutput(0)->getDimensions().d[1];
    int diffy = input2.getDimensions().d[2] - deconv1->getOutput(0)->getDimensions().d[2];

    ILayer* pad1 = network->addPaddingNd(*deconv1->getOutput(0), DimsHW{ diffx / 2, diffy / 2 }, DimsHW{ diffx - (diffx / 2), diffy - (diffy / 2) });
    // dcov1->setPaddingNd(DimsHW{diffx / 2, diffx - diffx / 2},DimsHW{diffy / 2, diffy - diffy / 2});
    ITensor* inputTensors[] = { &input2,pad1->getOutput(0) };
    auto cat = network->addConcatenation(inputTensors, 2);
    assert(cat);
    ILayer* dcov1 = doubleConv(network, weightMap, *cat->getOutput(0), midch, 3, lname + ".conv", outch);
    assert(dcov1);
    return dcov1;
  }
}

// Adds the final 1x1 convolution (stride 1, no padding, one group) and returns it.
//
//   network   network definition the layer is appended to
//   weightMap weights keyed by name; reads "<lname>.conv.weight" and "<lname>.conv.bias"
//   input     input tensor of the layer
//   outch     unused: the number of output maps is taken from the `cls` macro
//   lname     name prefix of this layer inside weightMap
ILayer* outConv(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, std::string lname) {
  const std::string prefix = lname + ".conv";
  IConvolutionLayer* head = network->addConvolutionNd(input, cls, DimsHW{ 1, 1 }, weightMap[prefix + ".weight"], weightMap[prefix + ".bias"]);
  assert(head);

  head->setStrideNd(DimsHW{ 1, 1 });
  head->setPaddingNd(DimsHW{ 0, 0 });
  head->setNbGroups(1);
  return head;
}

// Builds the UNet network layer by layer from the weights in `wts_path` and compiles it
// into an engine.
//
//   maxBatchSize maximum batch size configured on the builder
//   builder      builder used to create the network and the engine
//   config       builder configuration; workspace size and (with USE_FP16) the FP16 flag
//                are set here
//   dt           data type of the input tensor
//   wts_path     path of the .wts file to load
//
// Returns the built engine, or nullptr if the build failed. The network definition and all
// host-side weight buffers are released before returning.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string wts_path) {
  INetworkDefinition* network = builder->createNetworkV2(0U);
  assert(network);

  // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
  ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
  assert(data);

  std::map<std::string, Weights> weightMap = loadWeights(wts_path);

  // build network: one input block, four down stages, four up stages fed by the matching
  // down-stage outputs, and the final 1x1 convolution
  auto x1 = doubleConv(network, weightMap, *data, 64, 3, "inc", 64);
  auto x2 = down(network, weightMap, *x1->getOutput(0), 128, 1, "down1");
  auto x3 = down(network, weightMap, *x2->getOutput(0), 256, 1, "down2");
  auto x4 = down(network, weightMap, *x3->getOutput(0), 512, 1, "down3");
  auto channel = 512;
  if (!BILINEAR) {
    channel = 1024;
  }
  auto x5 = down(network, weightMap, *x4->getOutput(0), channel, 1, "down4");
  ILayer* x6 = up(network, weightMap, *x5->getOutput(0), *x4->getOutput(0), 512, 512, 512, "up1");
  ILayer* x7 = up(network, weightMap, *x6->getOutput(0), *x3->getOutput(0), 256, 256, 256, "up2");
  ILayer* x8 = up(network, weightMap, *x7->getOutput(0), *x2->getOutput(0), 128, 128, 128, "up3");
  ILayer* x9 = up(network, weightMap, *x8->getOutput(0), *x1->getOutput(0), 64, 64, 64, "up4");
  ILayer* x10 = outConv(network, weightMap, *x9->getOutput(0), OUTPUT_SIZE, "outc");

  x10->getOutput(0)->setName(OUTPUT_BLOB_NAME);
  network->markOutput(*x10->getOutput(0));

  // Build engine
  builder->setMaxBatchSize(maxBatchSize);
  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
  config->setFlag(BuilderFlag::kFP16);
#endif
  std::cout << "Building engine, please wait for a while..." << std::endl;
  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  // Only claim success when an engine was actually produced; the message used to be
  // printed unconditionally.
  if (engine != nullptr) {
    std::cout << "Build engine successfully!" << std::endl;
  } else {
    std::cerr << "Build engine failed!" << std::endl;
  }

  // Don't need the network any more
  network->destroy();

  // Release host memory (this includes the buffers addBatchNorm2d stored in the map)
  for (auto& mem : weightMap) {
    free((void*)(mem.second.values));
  }

  return engine;
}

// Builds the engine from the weight file and serializes it.
//
// maxBatchSize  forwarded to createEngine()
// model_stream  out-parameter; receives the result of engine->serialize(). The caller destroys it.
// wts_path      weight file forwarded to createEngine()
void APIToModel(unsigned int maxBatchSize, IHostMemory** model_stream, std::string wts_path) {
  // Create builder
  IBuilder* builder = createInferBuilder(gLogger);
  assert(builder != nullptr);
  IBuilderConfig* config = builder->createBuilderConfig();
  assert(config != nullptr);

  // Create model to populate the network, then set the outputs and create an engine
  ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wts_path);
  assert(engine != nullptr);

  // Serialize the engine
  (*model_stream) = engine->serialize();

  // Close everything down; the config was previously leaked, so release it
  // before the builder that created it
  engine->destroy();
  config->destroy();
  builder->destroy();
}

// Runs one batch through the execution context.
//
// context    execution context whose engine exposes exactly two bindings (input and output)
// input      host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats
// output     host buffer receiving batchSize * OUTPUT_SIZE floats
// batchSize  number of images in the batch
//
// Device buffers and the stream are created and released on every call. CUDA errors abort via CHECK.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
  const ICudaEngine& engine = context.getEngine();

  // Pointers to input and output device buffers to pass to engine.
  // Engine requires exactly IEngine::getNbBindings() number of buffers.
  assert(engine.getNbBindings() == 2);
  void* buffers[2];

  // In order to bind the buffers, we need to know the names of the input and output tensors.
  // Note that indices are guaranteed to be less than IEngine::getNbBindings()
  const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
  const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

  // Byte sizes are computed once, in size_t, so the products cannot overflow int
  const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
  const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

  // Create GPU buffers on device
  CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
  CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

  // Create stream
  cudaStream_t stream;
  CHECK(cudaStreamCreate(&stream));

  // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
  CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
  if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
    // enqueue() reports failure through its return value; surface it instead of dropping it
    std::cerr << "TensorRT enqueue failed" << std::endl;
  }
  CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));

  // Errors from the asynchronous work above are reported by the synchronize call
  CHECK(cudaStreamSynchronize(stream));

  // Release stream and buffers
  CHECK(cudaStreamDestroy(stream));
  CHECK(cudaFree(buffers[inputIndex]));
  CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//
//   ./unet -s          build the engine from unet.wts and write it to unet.engine
//   ./unet -d <image>  load unet.engine, run inference on <image> and write result.jpg
//
// Returns 0 on success and -1 on bad arguments or when a required file cannot be read.
int main(int argc, char** argv) {
  cudaSetDevice(DEVICE);

  char* trt_model_stream = nullptr;
  size_t size = 0;
  std::string engine_name = "unet.engine";
  std::string wts_path = "unet.wts";

  if (argc == 2 && std::string(argv[1]) == "-s") {
    // Create a TensorRT model and serialize it to a file
    IHostMemory* model_stream{ nullptr };
    APIToModel(BATCH_SIZE, &model_stream, wts_path);
    assert(model_stream != nullptr);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
      std::cerr << "could not open plan output file" << std::endl;
      return -1;
    }
    p.write(reinterpret_cast<const char*>(model_stream->data()), model_stream->size());
    model_stream->destroy();
    return 0;
  } else if (argc == 3 && std::string(argv[1]) == "-d") {
    // Load engine file; without it there is nothing to deserialize, so stop early
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
      std::cerr << "could not read engine file " << engine_name << std::endl;
      return -1;
    }
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trt_model_stream = new char[size];
    assert(trt_model_stream);
    file.read(trt_model_stream, size);
    file.close();
  } else {
    std::cerr << "arguments not right!" << std::endl;
    std::cerr << "./unet -s  // serialize model to plan file" << std::endl;
    std::cerr << "./unet -d ../samples  // deserialize plan file and run inference" << std::endl;
    return -1;
  }

  // Prepare input output data (static: too large for the stack)
  static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
  static float prob[BATCH_SIZE * OUTPUT_SIZE];

  // Deserialize engine
  IRuntime* runtime = createInferRuntime(gLogger);
  assert(runtime != nullptr);
  ICudaEngine* engine = runtime->deserializeCudaEngine(trt_model_stream, size);
  assert(engine != nullptr);
  IExecutionContext* context = engine->createExecutionContext();
  assert(context != nullptr);
  delete[] trt_model_stream;

  cv::Mat img = cv::imread(argv[2]);
  if (img.empty()) {
    // cv::imread returns an empty Mat when the file is missing or cannot be decoded
    std::cerr << "could not read image " << argv[2] << std::endl;
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return -1;
  }

  // Preprocess: resize, scale to [0, 1] and split the interleaved pixels into three planes,
  // taking channel 2 first and channel 0 last
  cv::resize(img, img, cv::Size(INPUT_W, INPUT_H));
  for (int i = 0; i < INPUT_H * INPUT_W; i++) {
    data[i] = (img.at<cv::Vec3b>(i)[2]) / 255.0;
    data[i + INPUT_H * INPUT_W] = (img.at<cv::Vec3b>(i)[1]) / 255.0;
    data[i + 2 * INPUT_H * INPUT_W] = (img.at<cv::Vec3b>(i)[0]) / 255.0;
  }

  // Run inference
  auto start = std::chrono::system_clock::now();
  doInference(*context, data, prob, BATCH_SIZE);
  auto end = std::chrono::system_clock::now();
  std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

  // Postprocess: per pixel, pick the class plane with the highest score and paint class 1 white
  cv::Mat result = cv::Mat::zeros(INPUT_H, INPUT_W, CV_8UC3);
  for (int i = 0; i < INPUT_H * INPUT_W; i++) {
    float fmax = prob[i];
    int index = 0;
    for (int j = 1; j < cls; j++) {
      if (prob[i + j * INPUT_H * INPUT_W] > fmax) {
        index = j;
        fmax = prob[i + j * INPUT_H * INPUT_W];
      }
    }

    if (index == 1) {
      result.at<cv::Vec3b>(i) = cv::Vec3b(255, 255, 255);
    }
  }

  cv::imwrite("result.jpg", result);

  // Destroy the engine
  context->destroy();
  engine->destroy();
  runtime->destroy();

  return 0;
}


================================================
FILE: vgg/CMakeLists.txt
================================================
# CMake 4.0 and newer reject projects whose policy version is below 3.5.
# The "...3.5" suffix raises the policy version there; CMake older than 3.12 ignores the suffix.
cmake_minimum_required(VERSION 2.6...3.5)

project(vgg)

add_definitions(-std=c++11)

# option() expects (<variable> "<help text>" <default>); without the help text
# the word OFF was parsed as the description rather than as the default value.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# Single executable built from vgg11.cpp, linked against TensorRT and the CUDA runtime
add_executable(vgg ${PROJECT_SOURCE_DIR}/vgg11.cpp)
target_link_libraries(vgg nvinfer)
target_link_libraries(vgg cudart)

add_definitions(-O2 -pthread)


================================================
FILE: vgg/README.md
================================================
# vgg

VGG 11-layer model (configuration "A") from
    "Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>

For the PyTorch implementation, you can refer to [pytorchx/vgg](https://github.com/wang-xinyu/pytorchx/tree/master/vgg)

VGG's architecture is simple, just some conv, relu, maxpool, and fc layers.

```
// 1. generate vgg.wts from [pytorchx/vgg](https://github.com/wang-xinyu/pytorchx/tree/master/vgg)

// 2. put vgg.wts into tensorrtx/vgg

// 3. build and run

cd tensorrtx/vgg

mkdir build

cd build

cmake ..

make

sudo ./vgg -s   // serialize model to plan file i.e. 'vgg.engine'
sudo ./vgg -d   // deserialize plan file and run inference

// 4. check that the output matches the output of pytorchx/vgg
```


================================================
FILE: vgg/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes its contents to a target stream,
//!  preceded by a timestamp and a fixed prefix. Nothing is written when logging is disabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the formatted output
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() does nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Takes over the target stream, prefix and logging flag of \p other.
    //! Every member is initialized here so that mShouldLog is never read uninitialized.
    //! Buffered characters of \p other are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer contents>" to the target stream,
    //! clears the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so that
            // both stay together when the target is not std::cout
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                mOutput << "[";
                mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // target stream for timestamp, prefix and message
    std::string mPrefix;   // inserted before every message
    bool mShouldLog;       // when false, putOutput() is a no-op
};

//!
//! \class LogStreamConsumerBase
//! \brief Owns the LogStreamConsumerBuffer. LogStreamConsumer lists this class as its first base so
//!  that the buffer already exists when its address is handed to std::ostream.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }

protected:
    //! Buffer used by the derived stream class.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for log messages of one fixed severity, usable with the usual << syntax.
//!  The base class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  the buffer held by LogStreamConsumerBase has to be constructed before its address is given
//!  to std::ostream. Do not swap the base classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a stream for messages of level \p severity.
    //!  Output is enabled when \p severity <= \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(
              severityOstream(severity), severityPrefix(severity), isReportable(severity, reportableSeverity))
        , std::ostream(&mBuffer) // attach the buffer owned by the first base class
        , mShouldLog(isReportable(severity, reportableSeverity))
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the severity and logging flag of \p other.
    //!  A fresh buffer is created; nothing buffered in \p other is carried over.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the buffer owned by the first base class
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this stream emits output for the new threshold.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = isReportable(mSeverity, reportableSeverity);
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! True when a message of level \p severity passes the \p reportableSeverity threshold.
    static bool isReportable(Severity severity, Severity reportableSeverity)
    {
        return severity <= reportableSeverity;
    }

    //! Severities >= kINFO are written to std::cout, all others to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Bracketed one-letter tag for the severity; asserts on an unknown value.
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //! \param severity Threshold used by log(); defaults to kWARNING.
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! Declared noexcept so that the override is accepted both by TensorRT versions whose
    //! ILogger::log() is noexcept and by older versions where it is not. A null \p msg is
    //! logged as an empty message.
    //!
    void log(Severity severity, const char* msg) noexcept override
    {
        LogStreamConsumer(mReportableSeverity, severity)
            << "[TRT] " << std::string(msg != nullptr ? msg : "") << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the severity threshold currently used by log().
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief Stream for messages of severity kVERBOSE, using the logger's current threshold.
//!
//! Usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Stream for messages of severity kINFO, using the logger's current threshold.
//!
//! Usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Stream for messages of severity kWARNING, using the logger's current threshold.
//!
//! Usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Stream for messages of severity kERROR, using the logger's current threshold.
//!
//! Usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Stream for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        using the logger's current threshold.
//!
//! Usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: vgg/vgg11.cpp
================================================
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "logging.h"

// Evaluates `status` once; if the result is non-zero, prints "Cuda failure: <value>" to
// std::cerr and calls abort(). Wrapped in do { } while (0) so that the macro behaves like a
// single statement and must be followed by a semicolon.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
// INPUT_H / INPUT_W: height and width of the input tensor, which is declared as {3, INPUT_H, INPUT_W}
static const int INPUT_H = 224;
static const int INPUT_W = 224;
// OUTPUT_SIZE: number of floats in the output per batch item; used to size the output buffers
static const int OUTPUT_SIZE = 1000;

// Tensor names: assigned when the network is built and used again to look up the binding indices
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

// Logger instance passed to createInferBuilder() and createInferRuntime()
static Logger gLogger;

// Load weights from files shared with TensorRT samples.
//
// As read by this function, the file starts with the number of blobs, followed by one
// whitespace-separated entry per blob:
//   <name> <count in decimal> <count values in hex, 32 bits each>
//
// Returns a map from blob name to a Weights struct of type kFLOAT. Each `values` buffer is
// allocated with malloc(); the caller releases it with free().
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. The buffer holds `size` 32-bit words, so size it by the element type
        // (the previous sizeof(val) measured the pointer and over-allocated on 64-bit targets).
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Creat the engine using only the API and not any parser.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../vgg.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["features.0.weight"], weightMap["features.0.bias"]);
    assert(conv1);
    conv1->setPaddingNd(DimsHW{1, 1});
    IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    assert(relu1);
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});

    conv1 = network->addConvolutionNd(*pool1->getOutput(0), 128, DimsHW{3, 3}, weightMap["features.3.weight"], weightMap["features.3.bias"]);
    conv1->setPaddingNd(DimsHW{1, 1});
    relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool1->setStrideNd(DimsHW{2, 2});

    conv1 = network->addConvolutionNd(*pool1->getOutput(0), 256, DimsHW{3, 3}, weightMap["features.6.weight"], weightMap["features.6.bias"]);
    conv1->setPaddingNd(DimsHW{1, 1});
    relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    conv1 = network->addConvolutionNd(*relu1->getOutput(0), 256, DimsHW{3, 3}, weightMap["features.8.weight"], weightMap["features.8.bias"]);
    conv1->setPaddingNd(DimsHW{1, 1});
    relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool1->setStrideNd(DimsHW{2, 2});

    conv1 = network->addConvolutionNd(*pool1->getOutput(0), 512, DimsHW{3, 3}, weightMap["features.11.weight"], weightMap["features.11.bias"]);
    conv1->setPaddingNd(DimsHW{1, 1});
    relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    conv1 = network->addConvolutionNd(*relu1->getOutput(0), 512, DimsHW{3, 3}, weightMap["features.13.weight"], weightMap["features.13.bias"]);
    conv1->setPaddingNd(DimsHW{1, 1});
    relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool1->setStrideNd(DimsHW{2, 2});

    conv1 = network->addConvolutionNd(*pool1->getOutput(0), 512, DimsHW{3, 3}, weightMap["features.16.weight"], weightMap["features.16.bias"]);
    conv1->setPaddingNd(DimsHW{1, 1});
    relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    conv1 = network->addConvolutionNd(*relu1->getOutput(0), 512, DimsHW{3, 3}, weightMap["features.18.weight"], weightMap["features.18.bias"]);
    conv1->setPaddingNd(DimsHW{1, 1});
    relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool1->setStrideNd(DimsHW{2, 2});

    IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool1->getOutput(0), 4096, weightMap["classifier.0.weight"], weightMap["classifier.0.bias"]);
    assert(fc1);
    relu1 = network->addActivation(*fc1->getOutput(0), ActivationType::kRELU);
    fc1 = network->addFullyConnected(*relu1->getOutput(0), 4096, weightMap["classifier.3.weight"], weightMap["classifier.3.bias"]);
    relu1 = network->addActivation(*fc1->getOutput(0), ActivationType::kRELU);
    fc1 = network->addFullyConnected(*relu1->getOutput(0), 1000, weightMap["classifier.6.weight"], weightMap["classifier.6.bias"]);

    fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*fc1->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

// Runs one batch through the execution context.
//
// context    execution context whose engine exposes exactly two bindings (input and output)
// input      host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats
// output     host buffer receiving batchSize * OUTPUT_SIZE floats
// batchSize  number of items in the batch
//
// Device buffers and the stream are created and released on every call. CUDA errors abort via CHECK.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Byte sizes are computed once, in size_t, so the products cannot overflow int
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr))
    {
        // enqueue() reports failure through its return value; surface it instead of dropping it
        std::cerr << "TensorRT enqueue failed" << std::endl;
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work above are reported by the synchronize call
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

/**
 * Entry point of the VGG sample.
 *
 *   ./vgg -s   build the network and serialize it to "vgg.engine"
 *   ./vgg -d   load "vgg.engine", run inference 10 times on an all-ones
 *              input and print the timings and the output vector
 *
 * Returns -1 on bad arguments or I/O errors, 1 after a successful "-s"
 * (kept as in the original sample) and 0 after a successful "-d".
 */
int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./vgg -s   // serialize model to plan file" << std::endl;
        std::cerr << "./vgg -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    const std::string mode(argv[1]);
    if (mode == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("vgg.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            // Do not leak the serialized engine on the error path.
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        // NOTE(review): exit code 1 on success is unusual but kept so that
        // existing scripts relying on it keep working.
        return 1;
    } else if (mode == "-d") {
        std::ifstream file("vgg.engine", std::ios::binary);
        if (!file.good()) {
            // Previously execution continued with a null buffer and size 0.
            std::cerr << "could not open plan file vgg.engine, run './vgg -s' first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        const std::streamoff length = file.tellg();
        if (length <= 0) {
            // tellg() returns -1 on failure; an empty file cannot hold an engine.
            std::cerr << "plan file vgg.engine is empty or unreadable" << std::endl;
            return -1;
        }
        size = static_cast<size_t>(length);
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        file.read(trtModelStream, size);
        if (!file) {
            std::cerr << "failed to read plan file vgg.engine" << std::endl;
            delete[] trtModelStream;
            return -1;
        }
        file.close();
    } else {
        return -1;
    }

    // Dummy input: every element set to 1.
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    // The serialized blob is no longer needed once the engine exists.
    delete[] trtModelStream;

    // Run inference
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 10; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        std::cout << prob[i] << ", ";
        if (i % 10 == 0) std::cout << i / 10 << std::endl;
    }
    std::cout << std::endl;

    return 0;
}


================================================
FILE: vit/CMakeLists.txt
================================================
# Build script for the `vit` executable: it compiles vit.cc, cuda_allocator.cc
# and profiler.cc and links them against CUDA, TensorRT, OpenCV and threads.
cmake_minimum_required(VERSION 3.17.0)

project(
  vit
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default list of GPU architectures; only applied when the caller has not
# already defined CMAKE_CUDA_ARCHITECTURES.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES 80 86 89 90 100 120)
endif()

# Language standards: C++20 for host code, C++17 for CUDA sources.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Emit compile_commands.json for tooling.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

# Define the TensorRT::TensorRT target via the bundled FindTensorRT.cmake
# unless a parent project has already created it.
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

# The executable is named after the project; its main source is <project>.cc.
add_executable(${PROJECT_NAME} "${PROJECT_NAME}.cc" "cuda_allocator.cc"
                               "profiler.cc")
target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
# CUDA::cuda_driver is linked in addition to the runtime.
target_link_libraries(
  ${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart CUDA::cuda_driver
                         TensorRT::TensorRT ${OpenCV_LIBS})

# On Windows select the static MSVC runtime (debug variant in Debug builds).
if(WIN32)
  set_target_properties(
    ${PROJECT_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY
                               "MultiThreaded$<$<CONFIG:Debug>:Debug>")
endif()


================================================
FILE: vit/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Filters the candidate directories passed after the two named arguments and
# keeps those that exist and contain every entry of the `required_files` list.
# The surviving directories are returned, in their original order, as a list in
# `<var_name>` in the caller's scope. Configuration stops with a FATAL_ERROR
# when no candidate qualifies.
function(_guess_path var_name required_files)
  set(_accepted "")

  foreach(_candidate IN LISTS ARGN)
    if(EXISTS "${_candidate}")
      # Assume the candidate is complete until a required file is missing.
      set(_complete TRUE)
      foreach(_needed IN LISTS required_files)
        if(NOT EXISTS "${_candidate}/${_needed}")
          set(_complete FALSE)
          message(DEBUG "'${_candidate}' missing '${_needed}'")
          break()
        endif()
      endforeach()

      if(_complete)
        list(APPEND _accepted "${_candidate}")
        message(DEBUG "accept '${_candidate}'")
      else()
        message(DEBUG "reject '${_candidate}'")
      endif()
    else()
      message(DEBUG "skip non-existing path '${_candidate}'")
    endif()
  endforeach()

  list(LENGTH _accepted _accepted_count)
  if(_accepted_count EQUAL 0)
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_accepted}"
      PARENT_SCOPE)
endfunction()

# Imported interface target carrying the TensorRT libraries and include paths.
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

# TRT_VERSION may be provided as a cache variable (-DTRT_VERSION=...) or via
# the TRT_VERSION environment variable; the environment variable wins.
set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# All expansions below are quoted: an unquoted empty expansion disappears from
# the argument list and changes how if()/string() parse their arguments.
if(NOT "${TRT_VERSION}" STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# The version is needed below to pick library names; fail with a clear
# message instead of a cryptic argument-count error from string(REGEX ...).
if("${TRT_VERSION}" STREQUAL "")
  message(
    FATAL_ERROR
      "TRT_VERSION is not set; pass -DTRT_VERSION=<version> or set the TRT_VERSION environment variable"
  )
endif()

# The major version is the first run of digits in the version string.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
if("${_match}" STREQUAL "")
  message(
    FATAL_ERROR
      "cannot extract a major version number from TRT_VERSION='${TRT_VERSION}'"
  )
endif()
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

# Platform-specific discovery of the TensorRT library/include directories and
# of the list of library names (`_modules`) to link.
if(WIN32)
  # Windows: TensorRT is expected under "C:/Program Files/TensorRT-<TRT_VERSION>".
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(FATAL_ERROR "TensorRT_DIR=${TensorRT_DIR} does not exist!")
  endif()

  # For major version >= 10 the library names carry a "_10" suffix.
  if(${TRT_MAJOR_VERSION} GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  # Unix: probe a fixed list of candidate directories chosen by CPU
  # architecture; _guess_path keeps the ones that contain the required files.
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include"
                                "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  # The extra plugin/dispatch/lean libraries are only requested for major
  # version >= 8.
  set(_modules nvinfer nvinfer_plugin)
  if(${TRT_MAJOR_VERSION} GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  # Both results may be lists when several candidate directories qualify.
  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Resolve every module name to a library path, using the discovered library
# directory as a hint, and collect the results in TensorRT_LIBRARIES.
# NOTE(review): a library that is not found is appended as "<var>-NOTFOUND";
# there is no explicit check here.
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

# Attach the include directory (and a few build properties) to the target.
# NOTE(review): the non-INTERFACE properties are set on an INTERFACE IMPORTED
# target; confirm they have the intended effect on consumers.
set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

# Drop the helper variables so they do not leak into the including scope.
unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)


================================================
FILE: vit/README.md
================================================
# Vision Transformers (ViT)

## 1. Overview

This is a handwritten TensorRT implementation of the Vision Transformer (ViT) paper, [arXiv:2010.11929](https://arxiv.org/abs/2010.11929).

**Note**:

- The Swi-GeLU activation layer is natively supported since TensorRT SDK **10.0**; we can also use an approximation, as TensorRT itself does. See below for details.

## 2. Details

### 2.1 Features

- Support TensorRT SDK 8.5.1+ ~ 10.15.1+
- Support Windows11 OS
- Support native or self-implemented Swi-GeLU
- Support native or self-implemented multihead self-attention
- Support a dummy profiler by default
- Support a dummy output allocator by default
- Use optimization profile by default

### 2.2 Current limitations

- `IAttention` cannot be used with TensorRT SDK 10.14 ~ 10.15 because of bugs in TensorRT
- TensorRT < 8 is not supported because some ops are not implemented in cuDNN
- SM < 86, TensorRT < 10, CUDA < 12 cases are _NOT_ fully tested yet

### 2.3 Usage

1. use `gen_wts.py` to generate `.wts` file.

```bash
python gen_wts.py
```

2. build C++ code

```bash
pushd tensorrtx/vit
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

3. serialize `.wts` model to engine file.

```bash
./build/vit -s
```

4. run inference

```bash
./build/vit -d
```

On **RTX 4080, TensorRT 10.15.1 SDK**, the output looks like:

```bash
...
====
1880us
-1.125, 0.4623, -0.1215, -0.007384, -0.004307, -0.7021, -0.748, 0.2031, -0.4862, -0.008939, -1.151, -0.408, -0.3259, 0.2202, 0.04537, -2.008, -0.2832, 0.04394, 0.5326, 0.1724, 0.5655,
====
prediction result:
Top: 0 idx: 285, logits: 8.262, label: Egyptian cat
Top: 1 idx: 281, logits: 7.872, label: tabby, tabby cat
Top: 2 idx: 282, logits: 6.477, label: tiger cat
========== VisionTransformerProfiler ==========
                                                                                                          TensorRT layer name    Runtime, %  Invocations Runtime, ms
                                                                  Reformatting CopyNode for Input Tensor 0 to patch embedding          3.2%           20         0.95
                                                                                                              patch embedding          1.5%           20         0.45
Reformatting CopyNode for Input Tensor 0 to {ForeignNode[(Unnamed Layer* 3) [Constant]...(Unnamed Layer* 518) [ElementWise]]}          0.2%           20         0.06
                                                                                                        __myl_ReshTran_myl3_0          0.8%           20         0.24
                                                                __myl_ConcAddCastMeanSubMulMeanAddSqrtDivMulCastMulAdd_myl3_1          0.3%           20         0.08
                vit.encoder.layer.0.attentionvalue+vit.encoder.layer.0.attentionkey+vit.encoder.layer.0.attentionquery_myl3_2          1.4%           20         0.40
                                                                                                    __myl_TranReshMove_myl3_3          0.2%           20         0.06
                                                                                                    __myl_TranReshMove_myl3_4          0.2%           20         0.07
                                                                                                    __myl_TranReshMove_myl3_5          0.2%           20         0.06
                                                                                                          _gemm_mha_v2_myl3_6          0.5%           20         0.14
                                                                                                    __myl_MoveReshTran_myl3_7          0.2%           20         0.06
...
========== VisionTransformerProfiler total runtime = 29.67 ms ==========
```

As shown above, we successfully triggered TensorRT's internal MHA kernel fusion pass (i.e., **"Myelin"**, or **"myl"** for short); note in particular the fused MHA kernel `_gemm_mha_v2_myl3_6`.

## 3. transformer details

`ViTLayer()` builds one ViT encoder block (Transformer encoder layer) using TensorRT primitives. The implementation corresponds to a **Pre-LayerNorm** Transformer layer (typical for ViT), including:

- LayerNorm before attention
- Multi-Head Self-Attention (MHSA): QKV projections → scaled dot-product attention → output projection
- Residual connection
- LayerNorm after attention
- Feed-Forward Network (FFN / MLP): dense → GeLU → dense
- Residual connection

The function returns the final residual output tensor.

### 3.1 Notation and Tensor Shapes

Let the input tensor (TensorRT `input`) be:

$$
\mathbf{X} \in \mathbb{R}^{N \times L \times D}
$$

Where:

- (N): batch size (represented by `N` in the code)
- (L): sequence length (number of tokens; dynamic in code via `-1`)
- (D): hidden size, fixed at 768 in this implementation

The attention head configuration:

$$
H = \tt{param.head\_num}, \qquad d = \frac{D}{H}
$$

### 3.2 Weight shapes (conceptual)

For a standard Transformer block:

- Q/K/V projection weights:
  $$
  \mathbf{W}_Q, \mathbf{W}_K, \mathbf{W}_V \in \mathbb{R}^{D \times D}
  $$
- Q/K/V biases (**NOTE**:Not used by native nvidia interface):
  $$
  \mathbf{b}_Q, \mathbf{b}_K, \mathbf{b}_V \in \mathbb{R}^{D}
  $$
- Output projection:
  $$
  \mathbf{W}_O \in \mathbb{R}^{D \times D}, \quad \mathbf{b}_O \in \mathbb{R}^{D}
  $$
- FFN (MLP) with expansion ratio 4:
  $$
  \mathbf{W}_1 \in \mathbb{R}^{D \times 4D}, \ \mathbf{b}_1 \in \mathbb{R}^{4D}
  $$
  $$
  \mathbf{W}_2 \in \mathbb{R}^{4D \times D}, \ \mathbf{b}_2 \in \mathbb{R}^{D}
  $$
  Here ($4 D = 3072$).

### 3.3 High-Level Block Structure

_Pre-LN Transformer Encoder Layer_ implements the following canonical computation:

$$
\begin{aligned}
\mathbf{X}' &= \mathrm{LN}_1(\mathbf{X}) \\
\mathbf{A} &= \mathrm{MHSA}(\mathbf{X}') \\
\mathbf{Y} &= \mathbf{X} + \mathbf{A} \\
\mathbf{Y}' &= \mathrm{LN}_2(\mathbf{Y}) \\
\mathbf{F} &= \mathrm{FFN}(\mathbf{Y}') \\
\mathbf{Z} &= \mathbf{Y} + \mathbf{F}
\end{aligned}
$$

The function returns ($\mathbf{Z}$).

### 3.4 LayerNorm Definition

LayerNorm is applied over the **last dimension** (D) (hidden size), independently for each ($(n, \ell)$) position.

For a token vector ($\mathbf{x} \in \mathbb{R}^{D}$):

$$
\mathrm{LN}(\mathbf{x}) = \gamma \odot \frac{\mathbf{x} - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta
$$

Where:

$$
\mu = \frac{1}{D}\sum_{i=1}^{D} x_i,
\qquad
\sigma^2 = \frac{1}{D}\sum_{i=1}^{D}(x_i - \mu)^2
$$

- ($\gamma$) corresponds to `.weight`
- ($\beta$) corresponds to `.bias`
- ($\varepsilon = \tt{param.lnorm\_eps}$)

### 3.5 QKV Projections (Code Section 2.1)

#### 3.5.1 Linear projections

Let:

$$
\mathbf{X}' = \mathrm{LN}_1(\mathbf{X})
$$

Compute:

$$
\begin{aligned}
\mathbf{Q} &= \mathbf{X}' \mathbf{W}_Q^\top + \mathbf{b}_Q \\
\mathbf{K} &= \mathbf{X}' \mathbf{W}_K^\top + \mathbf{b}_K \\
\mathbf{V} &= \mathbf{X}' \mathbf{W}_V^\top + \mathbf{b}_V
\end{aligned}
\qquad
\mathbf{Q},\mathbf{K},\mathbf{V} \in \mathbb{R}^{N \times L \times D}
$$

#### 3.5.2 Multi-Head Reshape + Transpose (Shuffle Layers)

Multi-head attention splits the hidden dimension (D) into (H) heads of size (d).

#### 3.5.3 Reshape and transpose

Starting from:

$$
\mathbf{Q} \in \mathbb{R}^{N \times L \times D}
$$

Reshape:

$$
\mathbf{Q}_r \in \mathbb{R}^{N \times L \times H \times d}
$$

Transpose (swap axes to put heads first):

$$
\mathbf{Q}_h \in \mathbb{R}^{N \times H \times L \times d}
$$

Same for ($\mathbf{K}$) and ($\mathbf{V}$).

Code:

```cpp
q_s->setReshapeDimensions(Dims4{N, -1, H, d});
q_s->setSecondTranspose({0, 2, 1, 3}); // (N,H,L,d)
```

#### 3.5.4 SDPA (Scaled Dot-Product Attention)

For each batch (n) and head (h), define:

$$
\mathbf{Q}^{(n,h)} \in \mathbb{R}^{L \times d}, \quad
\mathbf{K}^{(n,h)} \in \mathbb{R}^{L \times d}, \quad
\mathbf{V}^{(n,h)} \in \mathbb{R}^{L \times d}
$$

#### 3.5.5 Attention logits ($QK^\top$)

$$
\mathbf{S}^{(n,h)} = \mathbf{Q}^{(n,h)} \left(\mathbf{K}^{(n,h)}\right)^\top
\in \mathbb{R}^{L \times L}
$$

In tensor form:

$$
\mathbf{S} \in \mathbb{R}^{N \times H \times L \times L}
$$

Code:

```cpp
qk = MatMul(q_s, NONE, k_s, TRANSPOSE); // (N,H,L,d) x (N,H,d,L) -> (N,H,L,L)
```

#### 3.5.6 Scaling

Scaled dot-product uses:

$$
\alpha = \frac{1}{\sqrt{d}}
$$

$$
\tilde{\mathbf{S}} = \alpha \mathbf{S}
$$

Code:

```cpp
scale_val = 1/sqrt(d);
attn_qk = qk * scale; // ElementWise PROD
```

#### 3.5.7 Softmax normalization

Softmax is applied over the **last dimension** (the key index) for each query position, so:

$$
\mathbf{P} \in \mathbb{R}^{N \times H \times L \times L}
$$

Code:

```cpp
qk_softmax = SoftMax(attn_qk);
qk_softmax->setAxes(1U << (nbDims-1)); // last axis
```

#### 3.5.8 Weighted sum of values

Each head output:

$$
\mathbf{O}^{(n,h)} = \mathbf{P}^{(n,h)} \mathbf{V}^{(n,h)}
\in \mathbb{R}^{L \times d}
$$

Thus:

$$
\mathbf{O} \in \mathbb{R}^{N \times H \times L \times d}
$$

Code:

```cpp
attn_qkv = MatMul(qk_softmax, NONE, v_s, NONE); // (N,H,L,L)x(N,H,L,d)->(N,H,L,d)
```

### 3.6 Merge Heads + Output Projection

#### 3.6.1 Merge heads

Transpose back:

$$
\mathbf{O} \in \mathbb{R}^{N \times H \times L \times d}
\ \xrightarrow{\text{transpose}}
\mathbb{R}^{N \times L \times H \times d}
$$

Then reshape:

$$
\mathbb{R}^{N \times L \times (H\cdot d)} = \mathbb{R}^{N \times L \times D}
$$

Code:

```cpp
attn_out->setFirstTranspose({0, 2, 1, 3}); // (N,L,H,d)
attn_out->setReshapeDimensions(Dims3{N, -1, 768}); // (N,L,D)
```

#### 3.6.2 Output projection

$$
\mathbf{A} = \mathbf{O}_{\text{merged}} \mathbf{W}_O^\top + \mathbf{b}_O
\quad\in\mathbb{R}^{N \times L \times D}
$$

Code:

```cpp
attn_fcw = MatMul(attn_out, out_proj_w^T);
attn_fcb = attn_fcw + out_proj_b;
```

### 3.7 Residual Connection After Attention

$$
\mathbf{Y} = \mathbf{X} + \mathbf{A}
\quad\in\mathbb{R}^{N \times L \times D}
$$

Code:

```cpp
attn_residual = input + attn_fcb;
```

This identity path is crucial for gradient flow and stability; at inference time it preserves a “direct” signal path even if attention becomes sharp or noisy.

### 3.8 Post-Attention LayerNorm

$$
\mathbf{Y}' = \mathrm{LN}_2(\mathbf{Y})
$$

Code:

```cpp
post_lnorm = Normalization(attn_residual, post_ln_scale, post_ln_bias)
```

### 3.9 Feed-Forward Network (FFN / MLP)

ViT uses a 2-layer MLP with expansion ratio 4 and GeLU activation.

#### 3.9.1 First dense layer (expand to 3072)

$$
\mathbf{H} = \mathbf{Y}' \mathbf{W}_1^\top + \mathbf{b}_1
\quad\in\mathbb{R}^{N \times L \times 4D}
$$

Code:

```cpp
inter0 = MatMul(post_lnorm, iw^T); // iw shape conceptually (4D, D)
inter1 = inter0 + ib;
```

#### 3.9.2 GeLU activation

$$
\mathrm{GeLU}(x) = x \Phi(x)
$$

Where $\Phi$ is the standard normal CDF.

Common tanh approximation (widely used in implementations):

$$
\mathrm{GeLU}(x) \approx \frac
{x\times \bigg(1+\tanh\Big(\sqrt\frac{2}{\pi}\times (x+0.044715\times x^3)\Big)\bigg)}
{2}
$$

Code calls:

```cpp
inter_act = addGeLU(net, inter1);
```

#### 3.9.3 Second dense layer (project back to 768)

$$
\mathbf{F} = \mathrm{GeLU}(\mathbf{H}) \mathbf{W}_2^\top + \mathbf{b}_2
\quad\in\mathbb{R}^{N \times L \times D}
$$

Code:

```cpp
out0 = MatMul(inter_act, ow^T); // ow conceptually (D, 4D)
out1 = out0 + ob;
```

### 3.10 Final Residual Connection

$$
\mathbf{Z} = \mathbf{Y} + \mathbf{F}
\quad\in\mathbb{R}^{N \times L \times D}
$$

Code:

```cpp
output_residual = out1 + attn_residual;
return output_residual;
```

## 4. Compact Step-by-Step Shape Trace

Below is a shape trace aligned with the main operations (assuming dynamic (L)):

Input

$$ \mathbf{X}: (N, L, 768) $$

Pre-LN

$$ \mathbf{X}': (N, L, 768) $$

Q/K/V projections

$$ \mathbf{Q},\mathbf{K},\mathbf{V}: (N, L, 768) $$

Reshape + transpose to heads

$$ \mathbf{Q}\_h,\mathbf{K}\_h,\mathbf{V}\_h: (N, H, L, d) $$

Attention logits

$$ \mathbf{S}: (N, H, L, L) $$

Softmax weights

$$ \mathbf{P}: (N, H, L, L) $$

Head outputs

$$ \mathbf{O}: (N, H, L, d) $$

Merge heads

$$ \mathbf{O}\_{\text{merged}}: (N, L, 768) $$

Output projection

$$ \mathbf{A}: (N, L, 768) $$

Residual

$$ \mathbf{Y}: (N, L, 768) $$

Post-LN

$$ \mathbf{Y}': (N, L, 768) $$

FFN expand

$$ \mathbf{H}: (N, L, 3072) $$

FFN project

$$ \mathbf{F}: (N, L, 768) $$

Final residual

$$ \mathbf{Z}: (N, L, 768) $$


================================================
FILE: vit/cuda_allocator.cc
================================================
#include "cuda_allocator.h"
#include <cuda.h>
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <mutex>
#include "macros.h"
#include "utils.h"

namespace {
// Minimum CUDA version codes that CudaOutputAllocator::Create() compares
// against the values reported by cudaRuntimeGetVersion()/cudaDriverGetVersion().
// Presumably encoded as 1000 * major + 10 * minor (11020 = 11.2, 12000 = 12.0)
// -- confirm against the CUDA documentation.
// Runtime version from which the cudaMallocAsync path is selected.
constexpr int kCudaVersionAsyncMin = 11020;
// Runtime and driver version from which the cuMem* path is selected.
constexpr int kCudaVersionCuMemMin = 12000;
}  // namespace

// Bookkeeping record for one output buffer owned by the allocator.
// A default-constructed record (ptr == nullptr) means "no buffer".
struct CudaOutputAllocator::Allocation {
    // Device pointer handed back to the caller of reallocateOutput*().
    void* ptr{nullptr};
    // Number of bytes that were requested for this buffer.
    std::size_t size{0};
    // Allocation path that produced `ptr`; release() uses it to pick the matching free routine.
    OutputAllocKind kind{OutputAllocKind::kCudaMallocManaged};
    // The three fields below are only filled in by the kCuMem path of allocate().
    CUmemGenericAllocationHandle handle{};
    CUdeviceptr addr{};
    // Size actually created/mapped, i.e. `size` rounded up to the allocation granularity.
    std::size_t mapped_size{0};
};

static auto getCudaRuntimeVersion() -> int {
    int version = 0;
    if (cudaRuntimeGetVersion(&version) != cudaSuccess) {
        return 0;
    }
    return version;
}

static auto getCudaDriverVersion() -> int {
    int version = 0;
    if (cudaDriverGetVersion(&version) != cudaSuccess) {
        return 0;
    }
    return version;
}

// Factory: makes `device` current, then picks the allocation strategy from the
// installed CUDA versions:
//   - runtime and driver both >= kCudaVersionCuMemMin -> kCuMem
//   - otherwise runtime >= kCudaVersionAsyncMin       -> kCudaMallocAsync
//   - otherwise                                       -> kCudaMallocManaged
std::unique_ptr<CudaOutputAllocator> CudaOutputAllocator::Create(cudaStream_t stream, int device) {
    CHECK(cudaSetDevice(device));
    const int runtime_version = getCudaRuntimeVersion();
    const int driver_version = getCudaDriverVersion();

    const bool cumem_available =
            runtime_version >= kCudaVersionCuMemMin && driver_version >= kCudaVersionCuMemMin;
    const bool async_available = runtime_version >= kCudaVersionAsyncMin;

    OutputAllocKind chosen = OutputAllocKind::kCudaMallocManaged;
    if (cumem_available) {
        chosen = OutputAllocKind::kCuMem;
    } else if (async_available) {
        chosen = OutputAllocKind::kCudaMallocAsync;
    }
    return std::make_unique<CudaOutputAllocator>(stream, chosen, device);
}

// Stores the stream, allocation strategy and device id; no CUDA call is made here.
CudaOutputAllocator::CudaOutputAllocator(cudaStream_t stream, OutputAllocKind kind, int device)
    : stream_(stream), kind_(kind), device_(device) {}

// Frees every buffer still tracked by the allocator while holding the mutex.
CudaOutputAllocator::~CudaOutputAllocator() {
    std::lock_guard<std::mutex> guard(mutex_);
    for (auto& [tensor_name, allocation] : allocations_) {
        release(tensor_name, allocation);
    }
}

#if TRT_VERSION < 10000
// Output-allocation callback for TensorRT < 10.
//
// Returns a buffer of at least `size` bytes for `tensorName`:
//   - an existing buffer that is already large enough is reused;
//   - a too-small existing buffer is released and replaced;
//   - with no tracked buffer and size == 0, `currentMemory` is returned as-is
//     when it is non-null.
// Returns nullptr when `tensorName` is null or the allocation fails.
// `alignment` is ignored.
// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
void* CudaOutputAllocator::reallocateOutput(const char* tensorName, void* currentMemory, uint64_t size,
                                            uint64_t alignment) TRT_NOEXCEPT {
    (void)alignment;
    if (!tensorName) {
        return nullptr;
    }
    std::lock_guard<std::mutex> lock(mutex_);
    // operator[] creates an empty record the first time a tensor is seen.
    auto& alloc = allocations_[tensorName];
    if (alloc.ptr && size <= alloc.size) {
        return alloc.ptr;
    }
    if (alloc.ptr) {
        release(tensorName, alloc);
    } else if (currentMemory != nullptr && size == 0) {
        return currentMemory;
    }

    Allocation fresh = allocate(static_cast<std::size_t>(size));
    if (!fresh.ptr) {
        return nullptr;
    }
    alloc = fresh;
    return alloc.ptr;
}
#else
// Output-allocation callback for TensorRT >= 10.
//
// Same contract as the pre-10 variant, plus: a non-null `stream` replaces the
// stream stored in the allocator, which allocate()/release() use for the
// stream-ordered (cudaMallocAsync/cudaFreeAsync) path. A null `stream` keeps
// the previously stored one.
// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
void* CudaOutputAllocator::reallocateOutputAsync(const char* tensorName, void* currentMemory, uint64_t size,
                                                 uint64_t alignment, cudaStream_t stream) TRT_NOEXCEPT {
    (void)alignment;
    if (!tensorName) {
        return nullptr;
    }
    // Take the lock before touching stream_: it is shared state that
    // allocate() and release() read while holding the same mutex.
    std::lock_guard<std::mutex> lock(mutex_);
    if (stream != nullptr) {
        stream_ = stream;
    }
    // operator[] creates an empty record the first time a tensor is seen.
    auto& alloc = allocations_[tensorName];
    if (alloc.ptr && size <= alloc.size) {
        return alloc.ptr;
    }
    if (alloc.ptr) {
        release(tensorName, alloc);
    } else if (currentMemory != nullptr && size == 0) {
        return currentMemory;
    }

    Allocation fresh = allocate(static_cast<std::size_t>(size));
    if (!fresh.ptr) {
        return nullptr;
    }
    alloc = fresh;
    return alloc.ptr;
}
#endif

// Shape notification callback; intentionally a no-op, both arguments are ignored.
void CudaOutputAllocator::notifyShape(const char* /*tensorName*/, nvinfer1::Dims const& /*dims*/) TRT_NOEXCEPT {}

// Allocates `size` bytes using the strategy selected in kind_.
//
// Returns a filled Allocation on success. On any failure, or when size == 0,
// a default-constructed Allocation (ptr == nullptr) is returned and every
// partially created driver resource is released again.
CudaOutputAllocator::Allocation CudaOutputAllocator::allocate(std::size_t size) {
    Allocation alloc{};
    if (size == 0) {
        return alloc;
    }
    // Path 1: allocation via cudaMallocAsync on stream_.
    if (kind_ == OutputAllocKind::kCudaMallocAsync) {
        void* ptr = nullptr;
        if (cudaMallocAsync(&ptr, size, stream_) != cudaSuccess) {
            return alloc;
        }
        alloc.ptr = ptr;
        alloc.size = size;
        alloc.kind = OutputAllocKind::kCudaMallocAsync;
        return alloc;
    }
    // Path 2: managed memory via cudaMallocManaged.
    if (kind_ == OutputAllocKind::kCudaMallocManaged) {
        void* ptr = nullptr;
        if (cudaMallocManaged(&ptr, size, cudaMemAttachGlobal) != cudaSuccess) {
            return alloc;
        }
        alloc.ptr = ptr;
        alloc.size = size;
        alloc.kind = OutputAllocKind::kCudaMallocManaged;
        return alloc;
    }

    // Path 3 (kCuMem): driver-API sequence create -> reserve -> map -> set access.
    if (cudaSetDevice(device_) != cudaSuccess) {
        return alloc;
    }
    if (cuInit(0) != CUDA_SUCCESS) {
        return alloc;
    }

    // Request pinned memory located on device_.
    CUmemAllocationProp prop{};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = device_;

    std::size_t granularity = 0;
    if (cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) {
        return alloc;
    }

    // Round the request up to a multiple of the minimum granularity.
    const std::size_t alloc_size = ((size + granularity - 1) / granularity) * granularity;
    CUmemGenericAllocationHandle handle{};
    if (cuMemCreate(&handle, alloc_size, &prop, 0) != CUDA_SUCCESS) {
        return alloc;
    }

    // Each failure below undoes the steps that already succeeded, in reverse order.
    CUdeviceptr addr = 0;
    if (cuMemAddressReserve(&addr, alloc_size, 0, 0, 0) != CUDA_SUCCESS) {
        cuMemRelease(handle);
        return alloc;
    }

    if (cuMemMap(addr, alloc_size, 0, handle, 0) != CUDA_SUCCESS) {
        cuMemAddressFree(addr, alloc_size);
        cuMemRelease(handle);
        return alloc;
    }

    // Grant read/write access to the mapping for the device it lives on.
    CUmemAccessDesc access_desc{};
    access_desc.location = prop.location;
    access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    if (cuMemSetAccess(addr, alloc_size, &access_desc, 1) != CUDA_SUCCESS) {
        cuMemUnmap(addr, alloc_size);
        cuMemAddressFree(addr, alloc_size);
        cuMemRelease(handle);
        return alloc;
    }
    // The device address is reinterpreted as a host-side pointer value below.
    static_assert(sizeof(void*) == sizeof(CUdeviceptr));
    alloc.ptr = reinterpret_cast<void*>(addr);  // NOLINT(performance-no-int-to-ptr)
    // `size` keeps the requested byte count; `mapped_size` keeps the rounded-up
    // size that release() must pass to cuMemUnmap/cuMemAddressFree.
    alloc.size = size;
    alloc.kind = OutputAllocKind::kCuMem;
    alloc.handle = handle;
    alloc.addr = addr;
    alloc.mapped_size = alloc_size;
    return alloc;
}

// Frees the buffer described by `alloc` with the routine that matches the way
// it was allocated, then resets `alloc` to the empty state. An already empty
// record is left untouched. The tensor name is unused.
void CudaOutputAllocator::release(const std::string& /*tensorName*/, Allocation& alloc) {
    if (alloc.ptr == nullptr) {
        return;
    }
    switch (alloc.kind) {
        case OutputAllocKind::kCudaMallocAsync:
            cudaFreeAsync(alloc.ptr, stream_);
            break;
        case OutputAllocKind::kCudaMallocManaged:
            cudaFree(alloc.ptr);
            break;
        case OutputAllocKind::kCuMem:
            // Unmap, drop the physical allocation handle, then free the address range.
            cuMemUnmap(alloc.addr, alloc.mapped_size);
            cuMemRelease(alloc.handle);
            cuMemAddressFree(alloc.addr, alloc.mapped_size);
            break;
    }
    alloc = Allocation{};
}

// Looks up the buffer currently tracked for `tensorName`.
// Returns nullptr when the tensor has no record.
void* CudaOutputAllocator::getBuffer(const std::string& tensorName) const {
    std::lock_guard<std::mutex> guard(mutex_);
    const auto found = allocations_.find(tensorName);
    return found != allocations_.end() ? found->second.ptr : nullptr;
}

// Looks up the requested byte size recorded for `tensorName`.
// Returns 0 when the tensor has no record.
std::size_t CudaOutputAllocator::getSize(const std::string& tensorName) const {
    std::lock_guard<std::mutex> guard(mutex_);
    const auto found = allocations_.find(tensorName);
    return found != allocations_.end() ? found->second.size : 0;
}

// Returns the allocation strategy this allocator was constructed with.
OutputAllocKind CudaOutputAllocator::kind() const {
    return kind_;
}


================================================
FILE: vit/cuda_allocator.h
================================================
#pragma once
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <cstddef>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "macros.h"

// Allocation strategy used by CudaOutputAllocator: cudaMallocAsync,
// cudaMallocManaged, or the driver-level cuMem* virtual memory API.
enum class OutputAllocKind : std::uint8_t { kCudaMallocAsync, kCudaMallocManaged, kCuMem };

// nvinfer1::IOutputAllocator implementation that keeps one device buffer per
// output tensor name and grows it on demand. Buffers stay owned by the
// allocator and are freed in its destructor.
class CudaOutputAllocator final : public nvinfer1::IOutputAllocator {
   public:
    // Factory that selects the allocation strategy from the CUDA runtime and
    // driver versions reported at call time.
    static std::unique_ptr<CudaOutputAllocator> Create(cudaStream_t stream, int device = 0);

    // Constructs an allocator with an explicitly chosen strategy.
    explicit CudaOutputAllocator(cudaStream_t stream, OutputAllocKind kind, int device = 0);
    // Releases every buffer that is still tracked.
    ~CudaOutputAllocator() override;

    // TensorRT callback; which variant is overridden depends on TRT_VERSION.
    // Returns a buffer of at least `size` bytes for `tensorName`, or nullptr
    // on failure. `alignment` is ignored by the implementation.
#if TRT_VERSION < 10000
    void* reallocateOutput(const char* tensorName, void* currentMemory, uint64_t size,
                           uint64_t alignment) TRT_NOEXCEPT override;
#else
    void* reallocateOutputAsync(const char* tensorName, void* currentMemory, uint64_t size, uint64_t alignment,
                                cudaStream_t stream) TRT_NOEXCEPT override;
#endif
    // TensorRT callback; implemented as a no-op.
    void notifyShape(const char* tensorName, nvinfer1::Dims const& dims) TRT_NOEXCEPT override;

    // Buffer currently tracked for `tensorName`, or nullptr if there is none.
    void* getBuffer(const std::string& tensorName) const;
    // Requested byte size recorded for `tensorName`, or 0 if there is none.
    std::size_t getSize(const std::string& tensorName) const;
    // Strategy this allocator was constructed with.
    OutputAllocKind kind() const;

   private:
    // Per-tensor bookkeeping record; defined in the .cc file.
    struct Allocation;
    // Allocates `size` bytes with the configured strategy; the result has a
    // null pointer on failure.
    Allocation allocate(std::size_t size);
    // Frees the buffer held by `alloc` and resets the record.
    void release(const std::string& tensorName, Allocation& alloc);

    // Stream used for the stream-ordered allocation path.
    cudaStream_t stream_{};
    OutputAllocKind kind_{OutputAllocKind::kCudaMallocManaged};
    int device_{0};
    // Guards allocations_; mutable so the const getters can lock it.
    mutable std::mutex mutex_;
    // Tensor name -> buffer record.
    std::unordered_map<std::string, Allocation> allocations_;
};


================================================
FILE: vit/gen_wts.py
================================================
import struct

import cv2
import numpy as np
import torch
from transformers import AutoConfig, AutoImageProcessor, AutoModelForImageClassification


def read_imagenet_labels() -> dict[int, str]:
    """
    Read the ImageNet-1000 class labels.

    Each line of ``../assets/imagenet1000_clsidx_to_labels.txt`` is expected to
    look like ``<id>: '<label>',`` followed by a newline. The id becomes the
    key; the label is the text after the first ``": "`` with its first
    character and last three characters stripped.

    Returns:
        dict[int, str]: mapping from class id to label text. If an id occurs
        more than once, the first occurrence is kept.
    """
    clsid2label = {}
    with open("../assets/imagenet1000_clsidx_to_labels.txt", "r") as f:
        for line in f:
            # Split on the first separator only, so a label that itself
            # contains ": " no longer breaks the unpacking.
            k, v = line.split(": ", 1)
            clsid2label.setdefault(int(k), v[1:-3])
    return clsid2label


# When True, preprocessing is delegated to the Hugging Face image processor;
# otherwise the image is resized to 224x224 and normalized by hand.
USE_HF_PREPROCESS = False

if __name__ == "__main__":
    # Load the pretrained ViT classifier with the eager attention implementation.
    hub_model_id = "google/vit-base-patch16-224"
    config = AutoConfig.from_pretrained(hub_model_id)
    config._attn_implementation = "eager"
    model = AutoModelForImageClassification.from_pretrained(
        hub_model_id,
        ignore_mismatched_sizes=False,
        config=config,
    )

    model.eval()

    img = cv2.imread("../assets/cats.jpg", cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError("could not read ../assets/cats.jpg")
    # NOTE(review): cv2.imread yields BGR channel order and no conversion is
    # done here -- confirm this matches the preprocessing on the C++ side.

    if USE_HF_PREPROCESS:
        image_processor = AutoImageProcessor.from_pretrained(hub_model_id)
        img = image_processor(img, return_tensors="pt")
        img = img["pixel_values"]
    else:
        # The third positional parameter of cv2.resize is `dst`, so the
        # interpolation flag has to be passed by keyword.
        img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
        # Scale to [0, 1], then normalize with mean 0.5 and std 0.5 per channel.
        img = (img.astype(np.float32) / 255.0 - np.array([0.5, 0.5, 0.5])) / np.array([0.5, 0.5, 0.5])
        # HWC -> CHW, plus a leading batch dimension.
        img = torch.from_numpy(np.transpose(img, (2, 0, 1))[None, ...])

    # Sanity check: print the top-3 predictions (no gradients needed).
    with torch.no_grad():
        output = model(img)
    labels = read_imagenet_labels()
    for i, j in enumerate(torch.topk(output.logits[0], k=3).indices):
        print(f"Top: {i} is {labels[int(j)]}")

    # Dump the weights as text: first line is the tensor count, then one line
    # per tensor with "<name> <count>" followed by each value as the hex of
    # its big-endian float32 encoding.
    state_dict = model.state_dict()
    with open("../models/vit.wts", "w") as f:
        f.write("{}\n".format(len(state_dict.keys())))
        for k, v in state_dict.items():
            print("key: ", k)
            print("value: ", v.shape)
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {}".format(k, len(vr)))
            for vv in vr:
                f.write(" ")
                f.write(struct.pack(">f", float(vv)).hex())
            f.write("\n")


================================================
FILE: vit/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <cstdint>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include <utility>
#include "NvInferRuntime.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards what was written into it to a target stream, preceded by a
//!        "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix, when it is synchronized or destroyed.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted records
    //! \param prefix    text inserted between the timestamp and the buffered message
    //! \param shouldLog when false, buffered text is discarded instead of being emitted
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    //! Takes over the target stream, prefix and logging switch of \p other.
    //! The std::stringbuf base is default-constructed, so text buffered in \p other is not carried over.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // pbase() is the start of the put area and pptr() the current write position;
        // they differ exactly when there is text that has not been emitted yet.
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    //! Emits the buffered text (see putOutput()) and reports success.
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the target stream when logging is enabled,
    //! then empties the buffer in either case so suppressed text cannot pile up or be emitted
    //! later if logging gets enabled.
    void putOutput() {
        if (mShouldLog) {
            // The timestamp goes to the same stream as the message so both stay together
            // (e.g. when the record is routed to std::cerr).
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                // std::setfill is sticky; remember the caller's fill character and put it back.
                const char old_fill = mOutput.fill();
                mOutput << "[";
                mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
                mOutput.fill(old_fill);
            }
            // std::stringbuf::str() returns the buffered text
            mOutput << mPrefix << str();
            mOutput.flush();
        }
        // reset the buffer whether or not its content was emitted
        str("");
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
   public:
    //! Constructs the owned buffer from the target stream, the message prefix and the logging switch.
    LogStreamConsumerBase(std::ostream& stream, std::string prefix, bool shouldLog)
        : mBuffer{stream, std::move(prefix), shouldLog} {}

   protected:
    //! Buffer whose address a derived class hands to std::ostream; keeping it in a base class
    //! lets it be constructed before that std::ostream base.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Builds a consumer for messages of \p severity.
    //!  The message is emitted only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          // mBuffer lives in the first base class, so it is already constructed when its address is used here
          std::ostream(&mBuffer),
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a fresh consumer with the severity and logging switch of \p other.
    //!  A new buffer is created; nothing buffered in \p other is transferred.
    LogStreamConsumer(LogStreamConsumer&& other) noexcept
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer's messages are emitted under a new threshold.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! Severities comparing >= kINFO are routed to std::cout, all others to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed one-letter tag.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   private:
    struct TestInfo;

   public:
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult : std::uint8_t {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n';
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, TestInfo info)
            : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom{false, TestInfo{name, cmdline}};
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    struct TestInfo {
        std::string name;
        std::string cmdline;
    };
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << '\n';
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace {

// Each helper below returns a LogStreamConsumer for one fixed severity, filtered by the
// logger's current reportable severity. Typical use:
//
//     LOG_INFO(logger) << "hello world" << std::endl;

//!
//! \brief stream-style consumer for messages of severity kVERBOSE
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief stream-style consumer for messages of severity kINFO
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief stream-style consumer for messages of severity kWARNING
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief stream-style consumer for messages of severity kERROR
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief stream-style consumer for messages of severity kINTERNAL_ERROR ("fatal" severity)
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: vit/macros.h
================================================
#pragma once
#include <NvInfer.h>

// API: symbol-visibility decoration.
//  - API_EXPORTS defined:     dllexport on MSVC, default visibility elsewhere.
//  - API_EXPORTS not defined: dllimport on MSVC, empty elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_VERSION: the TensorRT version folded into one integer as
// major * 1000 + minor * 100 + patch * 10 + build.
// The fields are weighted, not placed in fixed digits, so a field >= 10 carries into the next one:
// e.g. major 10 / minor 15 / patch 0 / build 0 yields 11500, and 7.2.2 build 0 yields 7220.
#define TRT_VERSION \
    ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD)

// Refuse to build against anything older than the minimum supported release.
#if TRT_VERSION < 7220
#error "TensorRT >= 7.2.2 is required for this demo."
#endif

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` from TRT_VERSION 8000 on and to
// nothing before that, so the same declarations can be compiled on both sides of that boundary.
#if TRT_VERSION >= 8000
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif


================================================
FILE: vit/profiler.cc
================================================
#include "profiler.h"
#include <NvInfer.h>
#include <algorithm>
#include <iomanip>
#include <string>

void Profiler::reportLayerTime(const char* layerName, float ms) noexcept {
    mProfile[layerName].count++;
    mProfile[layerName].time += ms;
    if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end()) {
        mLayerNames.emplace_back(layerName);
    }
}

// Builds a profiler called `name` whose records are the per-layer sums (time and invocation count)
// of all `srcProfilers`.
//
// The layer names of the sources are merged as well, in first-seen order. operator<< walks
// mLayerNames to print its rows, so without this step a merged profiler would print a total
// but no per-layer lines.
Profiler::Profiler(const char* name, const std::vector<Profiler>& srcProfilers) : mName(name) {
    for (const auto& srcProfiler : srcProfilers) {
        for (const auto& rec : srcProfiler.mProfile) {
            // operator[] yields a zero-initialized Record for layers not seen before
            Record& merged = mProfile[rec.first];
            merged.time += rec.second.time;
            merged.count += rec.second.count;
        }
        for (const auto& layerName : srcProfiler.mLayerNames) {
            if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end()) {
                mLayerNames.push_back(layerName);
            }
        }
    }
}

// Prints a per-layer timing table for `value`: one row per layer in mLayerNames order with its
// share of the total time, its invocation count and its accumulated time, framed by a title line
// and a total line. The stream's flags, precision and fill character are restored before returning.
std::ostream& operator<<(std::ostream& out, const Profiler& value) {
    out << "========== " << value.mName << " ==========\n";
    float totalTime = 0;
    const std::string layerNameStr = "TensorRT layer name";
    // the name column is at least 70 wide and grows to fit the longest layer name
    int maxLayerNameLength = std::max(static_cast<int>(layerNameStr.size()), 70);
    for (const auto& elem : value.mProfile) {
        totalTime += elem.second.time;
        maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
    }

    const auto old_settings = out.flags();
    const auto old_precision = out.precision();
    // fill() is not part of flags(), so it has to be saved and restored separately
    const auto old_fill = out.fill(' ');

    // Output header
    out << std::setw(maxLayerNameLength) << layerNameStr << " ";
    out << std::setw(12) << "Runtime, " << "%" << " ";
    out << std::setw(12) << "Invocations" << " ";
    // the newline is written outside the padded field so this column lines up with the rows below
    out << std::setw(12) << "Runtime, ms" << "\n";

    for (const auto& layerName : value.mLayerNames) {
        const auto& elem = value.mProfile.at(layerName);
        // with nothing measured there is no meaningful share; print 0 instead of NaN
        const float percent = totalTime > 0.0F ? elem.time * 100.0F / totalTime : 0.0F;
        out << std::setw(maxLayerNameLength) << layerName << " ";
        out << std::setw(12) << std::fixed << std::setprecision(1) << percent << "%" << " ";
        out << std::setw(12) << elem.count << " ";
        out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << "\n";
    }
    out.flags(old_settings);
    out.precision(old_precision);
    out.fill(old_fill);
    out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========\n";

    return out;
}

================================================
FILE: vit/profiler.h
================================================
#include <NvInfer.h>

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Layer-time profiler implementing nvinfer1::IProfiler: accumulates the time and the number of
// invocations reported for each layer name and can print the result as a table.
class Profiler final : public nvinfer1::IProfiler {
   public:
    // Accumulated measurements of one layer.
    struct Record {
        float time{0};  // sum of the `ms` values reported for the layer
        int count{0};   // number of times the layer was reported
    };
    // Creates a profiler called `name`; the per-layer records of `srcProfilers` (none by default)
    // are summed into the new profiler.
    Profiler(const char* name, const std::vector<Profiler>& srcProfilers = std::vector<Profiler>());
    // Adds one invocation taking `ms` to the record of `layerName`.
    void reportLayerTime(const char* layerName, float ms) noexcept override;
    // Writes the timing table of `value` to `out`.
    friend std::ostream& operator<<(std::ostream& out, const Profiler& value);

   private:
    std::string mName;                       // title used in the printed report
    std::vector<std::string> mLayerNames;    // layer names in the order they were first reported
    std::map<std::string, Record> mProfile;  // layer name -> accumulated record
};


================================================
FILE: vit/utils.h
================================================
#pragma once
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <numeric>
#include <opencv2/opencv.hpp>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "macros.h"

constexpr const std::size_t WORKSPACE_SIZE = 16 << 20;
namespace {
// Evaluates a CUDA runtime call once and aborts the process when it does not return cudaSuccess.
// The message carries the numeric code, the runtime's description of it and the call site.
// The local uses a reserved-looking name so that an argument expression mentioning a caller
// variable (such as `ret`) cannot be captured by the macro's own declaration.
#define CHECK(status)                                                                       \
    do {                                                                                    \
        const auto check_status_ = (status);                                                \
        if (check_status_ != cudaSuccess) {                                                 \
            std::cerr << "Cuda failure: " << check_status_ << " ("                          \
                      << cudaGetErrorString(check_status_) << ") at " << __FILE__ << ":"    \
                      << __LINE__ << "\n";                                                  \
            std::abort();                                                                   \
        }                                                                                   \
    } while (0)

static void checkTrtEnv(int device = 0) {
#if TRT_VERSION < 8000
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    CHECK(cudaGetDeviceProperties(&prop, device));
    const int sm = prop.major * 10 + prop.minor;
    if (sm > 86) {
        std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU.";
        std::abort();
    }
#endif
}

/**
 * @brief Load a TensorRT ".wts" weight file.
 *
 * The file is whitespace delimited: the first token is the number of blobs, followed by one
 * entry per blob of the form
 * [name] [count] <count 32-bit values in hex>
 *
 * Every blob is returned as kFLOAT weights whose values point to a buffer allocated with
 * new uint32_t[count]; ownership of these buffers passes to the caller.
 *
 * The file is validated at run time (not with assert, which is compiled out under NDEBUG):
 * on a missing, malformed or truncated file a message is printed and the process aborts.
 *
 * @param file input weight file path
 * @return std::map<std::string, nvinfer1::Weights> blob name -> weights
 */
static auto loadWeights(const std::string& file) {
    std::cout << "Loading weights: " << file << "\n";
    std::map<std::string, nvinfer1::Weights> weightMap;

    const auto fail = [&file](const char* reason) {
        std::cerr << reason << " (" << file << ")\n";
        std::abort();
    };

    // Open weights file
    std::ifstream input(file);
    if (!input.is_open()) {
        fail("Unable to load weight file.");
    }

    // Read number of weight blobs
    int32_t count = 0;
    if (!(input >> count) || count <= 0) {
        fail("Invalid weight map file.");
    }

    for (int32_t blob = 0; blob < count; ++blob) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

        // Read name and element count of the blob
        std::string name;
        if (!(input >> name >> std::dec >> wt.count) || wt.count < 0) {
            fail("Invalid weight map file: bad blob header.");
        }

        // Load blob: each value is the hex bit pattern of one 32-bit element
        auto* val = new uint32_t[wt.count];
        input >> std::hex;
        for (int64_t x = 0; x < wt.count; ++x) {
            input >> val[x];
        }
        if (!input) {
            delete[] val;
            fail("Invalid weight map file: truncated or malformed blob data.");
        }
        wt.values = val;
        weightMap[name] = wt;
    }

    return weightMap;
}

/**
 * @brief Resize and normalize a 3-channel image and pack it as fp16 planes for every batch slot.
 *
 * The image is modified in place: optional BGR->RGB swap, resize to (w, h) with linear
 * interpolation, conversion to float scaled by 1/255, then (pixel - mean) / std per channel.
 * Aborts if the image does not have exactly 3 channels.
 *
 * @param img opencv image, 3 channels; overwritten with the preprocessed float image
 * @param bgr2rgb whether to swap the channel order from BGR to RGB first
 * @param mean per-channel value subtracted after scaling to [0, 1]
 * @param std per-channel divisor applied after subtracting mean
 * @param n batch size; the same image is written into every batch slot
 * @param h resize height
 * @param w resize width
 * @return std::vector<half> of n * 3 * h * w elements, laid out batch by batch as channel planes (CHW)
 */
static auto preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array<const float, 3>& mean,
                           const std::array<const float, 3>& std, int64_t n, int32_t h, int32_t w) {
    const auto channels = img.channels();
    const auto imageSize = channels * h * w;
    if (channels != 3) {
        std::cerr << "this demo only supports 3 channel input image.\n";
        std::abort();
    }
    if (bgr2rgb) {
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    }
    cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);

    // Normalization is done in fp32 on the CPU; only the final packing converts to fp16.
    img.convertTo(img, CV_32FC3, 1.f / 255.f);
    img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]);
    std::vector<half> chw(static_cast<std::size_t>(n) * channels * h * w);

    // de-interleave HWC pixels into three planes, repeated for each batch slot
    const auto planeSize = h * w;
    for (int batch = 0; batch < n; ++batch) {
        const auto batchOffset = batch * imageSize;
        for (int row = 0; row < h; ++row) {
            for (int col = 0; col < w; ++col) {
                const cv::Vec3f pixel = img.at<cv::Vec3f>(row, col);
                const auto pixelOffset = batchOffset + row * w + col;
                for (int ch = 0; ch < 3; ++ch) {
                    chw[pixelOffset + ch * planeSize] = __float2half(pixel[ch]);
                }
            }
        }
    }
    return chw;
}

// Returns the (index, value) pairs of the largest entries of `v`, ordered from largest to
// smallest. At most `k` pairs are returned (fewer when `v` is shorter); k <= 0 yields an empty result.
static auto topk(const std::vector<float>& v, int k) -> std::vector<std::pair<int, float>> {
    std::vector<std::pair<int, float>> best;
    if (k <= 0) {
        return best;
    }

    const std::ptrdiff_t total = static_cast<std::ptrdiff_t>(v.size());
    const std::ptrdiff_t keep = k < total ? k : total;

    // sort indices rather than values so the original positions are kept
    std::vector<int> order(v.size());
    std::iota(order.begin(), order.end(), 0);
    const auto byValueDescending = [&v](int lhs, int rhs) { return v[lhs] > v[rhs]; };
    std::partial_sort(order.begin(), order.begin() + keep, order.end(), byValueDescending);

    best.reserve(keep);
    std::for_each(order.begin(), order.begin() + keep, [&](int index) { best.emplace_back(index, v[index]); });
    return best;
}

// Reads a class-index -> label table written like a Python dict literal, one entry per line:
//
//     {0: 'tench, Tinca tinca',
//      607: "jack-o'-lantern",
//      999: 'toilet tissue'}
//
// For each line the integer before the first ':' is the index and the text between the first
// quote after the ':' and the next quote of the same kind is the label. Both ' and " are
// accepted as delimiters so that labels containing an apostrophe are read in full.
// Leading blanks and an opening '{' before the index are ignored. Lines without a ':' or a
// quoted label, or whose index is not a number, are skipped.
// Returns an empty map when the file cannot be opened.
static auto loadImagenetLabelMap(const std::string& path) {
    std::map<int, std::string> labels;
    std::ifstream in(path);
    if (!in.is_open()) {
        return labels;
    }
    std::string line;
    while (std::getline(in, line)) {
        const auto colon = line.find(':');
        if (colon == std::string::npos) {
            continue;
        }
        const auto open_quote = line.find_first_of("'\"", colon);
        if (open_quote == std::string::npos) {
            continue;
        }
        // the label ends at the next quote of the same kind that opened it
        const auto close_quote = line.find(line[open_quote], open_quote + 1);
        if (close_quote == std::string::npos) {
            continue;
        }
        // skip indentation and the '{' that opens the dict on the first line
        const auto key_begin = line.find_first_not_of(" \t{");
        if (key_begin == std::string::npos || key_begin >= colon) {
            continue;
        }
        int idx = 0;
        try {
            idx = std::stoi(line.substr(key_begin, colon - key_begin));
        } catch (const std::logic_error&) {
            // std::invalid_argument / std::out_of_range: not a usable index, ignore the line
            continue;
        }
        labels[idx] = line.substr(open_quote + 1, close_quote - open_quote - 1);
    }
    return labels;
}
}  // namespace


================================================
FILE: vit/vit.cc
================================================
#include <NvInfer.h>
#include <cuda_fp16.h>
#include <cassert>
#include <cstring>
#include <fstream>
#include <iostream>
#include "cuda_allocator.h"
#include "logging.h"
#include "macros.h"
#include "profiler.h"
#include "utils.h"

using namespace nvinfer1;
using WeightMap = std::map<std::string, Weights>;
using M = nvinfer1::MatrixOperation;
using E = nvinfer1::ElementWiseOperation;
using NDCF = nvinfer1::NetworkDefinitionCreationFlag;

static constexpr const int64_t N = 1;
static constexpr const int64_t INPUT_H = 224;
static constexpr const int64_t INPUT_W = 224;

static constexpr const char* WTS_PATH = "../models/vit.wts";
static constexpr const char* ENGINE_PATH = "../models/vit.engine";
static constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt";
static constexpr const std::array<const char*, 2> NAMES = {"input", "logits"};
static constexpr const std::array<int64_t, 2> SIZES = {3 * INPUT_H * INPUT_W, 1000};
static constexpr const std::array<const float, 3> mean = {0.5f, 0.5f, 0.5f};
static constexpr const std::array<const float, 3> stdv = {0.5f, 0.5f, 0.5f};

static Logger gLogger;

// Size in bytes of one element of TensorRT data type `t`.
// Types not listed here print a message and abort the process.
static auto bytesPerElement(DataType t) -> std::size_t {
    std::size_t width = 0;
    switch (t) {
        case DataType::kFLOAT:
        case DataType::kINT32:
            width = 4;
            break;
        case DataType::kHALF:
            width = 2;
            break;
        case DataType::kINT8:
        // the enumerators below are only referenced when TRT_VERSION is high enough
#if TRT_VERSION >= 8000
        case DataType::kBOOL:
#endif
#if TRT_VERSION >= 8500
        case DataType::kUINT8:
#endif
            width = 1;
            break;
        default:
            std::cerr << "Unsupported TensorRT DataType\n";
            std::abort();
    }
    return width;
}

// Converts every non-empty kFLOAT entry of `w` to kHALF in place.
// For each such entry a new half buffer is allocated, the 32-bit values are reinterpreted as
// float and narrowed, and the previous buffer is released with delete[] as a uint32_t array -
// so the old buffer is expected to have been allocated with new uint32_t[].
// Entries of another type, with a null pointer or with count <= 0 are left untouched.
static void convertWeightMapToHalf(WeightMap& w) {
    for (auto it = w.begin(); it != w.end(); ++it) {
        Weights& blob = it->second;
        const bool convertible = blob.type == DataType::kFLOAT && blob.values != nullptr && blob.count > 0;
        if (!convertible) {
            continue;
        }

        const auto* bits = static_cast<const uint32_t*>(blob.values);
        auto* converted = new half[blob.count];
        for (int64_t n = 0; n < blob.count; ++n) {
            // copy the bit pattern into a float instead of casting pointers
            float asFloat;
            std::memcpy(&asFloat, bits + n, sizeof(float));
            converted[n] = __float2half(asFloat);
        }

        delete[] bits;
        blob.values = converted;
        blob.type = DataType::kHALF;
    }
}

// Settings for building one ViT encoder layer (passed to ViTLayer).
struct ViTParam {
    // Encoder layer number; appended to "vit.encoder.layer." to form the weight-name prefix.
    uint32_t index;
    // Number of attention heads; ViTLayer divides 768 by it to get the per-head size.
    uint32_t head_num;
    // Layer-norm epsilon.
    // NOTE(review): no read of this field appears in the part of ViTLayer visible here - verify it is used.
    float lnorm_eps = 1e-12f;
};

// Appends a GELU activation applied to `input` and returns the layer producing the result.
//
//  - TRT_VERSION < 10000: built from elementwise layers using the tanh approximation
//      0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
//  - otherwise: a single activation layer of type kGELU_ERF.
// The two branches are therefore different formulations of GELU (tanh vs. erf).
static auto addGeLU(INetworkDefinition* net, ITensor& input) -> ILayer* {
#if TRT_VERSION < 10000
    const auto inputDims = input.getDimensions();

    // constants get the same rank as the input with every extent 1 so they broadcast against it
    Dims scalarDims{};
    scalarDims.nbDims = inputDims.nbDims;
    for (int i = 0; i < scalarDims.nbDims; ++i) {
        scalarDims.d[i] = 1;
    }

    // Static storage: the Weights objects below only hold pointers to these values, so they
    // must stay valid after this function returns.
    static const float kHalf = 0.5f;
    static const float kOne = 1.0f;
    // sqrt(2 / pi) written out, avoiding the non-standard M_PI macro and a <cmath> dependency
    static const float kSqrt2DivPi = 0.7978845608f;
    static const float kCoeff = 0.044715f;
    auto* wHalf = net->addConstant(scalarDims, Weights{DataType::kFLOAT, &kHalf, 1});
    auto* wOne = net->addConstant(scalarDims, Weights{DataType::kFLOAT, &kOne, 1});
    auto* wSqrt2DivPi = net->addConstant(scalarDims, Weights{DataType::kFLOAT, &kSqrt2DivPi, 1});
    auto* wCoeff = net->addConstant(scalarDims, Weights{DataType::kFLOAT, &kCoeff, 1});

    // inner = sqrt(2/pi) * (x + 0.044715 * x^3)
    auto* x2 = net->addElementWise(input, input, E::kPROD);
    auto* x3 = net->addElementWise(*x2->getOutput(0), input, E::kPROD);
    auto* x3Scaled = net->addElementWise(*x3->getOutput(0), *wCoeff->getOutput(0), E::kPROD);
    auto* poly = net->addElementWise(input, *x3Scaled->getOutput(0), E::kSUM);
    auto* inner = net->addElementWise(*poly->getOutput(0), *wSqrt2DivPi->getOutput(0), E::kPROD);

    // result = (0.5 * x) * (1 + tanh(inner))
    auto* t = net->addActivation(*inner->getOutput(0), ActivationType::kTANH);
    auto* onePlus = net->addElementWise(*t->getOutput(0), *wOne->getOutput(0), E::kSUM);
    auto* halfX = net->addElementWise(input, *wHalf->getOutput(0), E::kPROD);
    return net->addElementWise(*halfX->getOutput(0), *onePlus->getOutput(0), E::kPROD);
#else
    // erf formulation
    return net->addActivation(input, ActivationType::kGELU_ERF);
#endif
}

static auto addLinearNorm(INetworkDefinition* net, ITensor& input, ITensor& scale, ITensor& bias,
                          uint32_t axesMask) noexcept -> ILayer* {
#if TRT_VERSION >= 11500
    auto* ln = net->addNormalizationV2(input, scale, bias, axesMask);
#else
    auto* ln = net->addNormalization(input, scale, bias, axesMask);
#endif
    ln->setEpsilon(1e-12f);
    return ln;
}

/**
 * Build one ViT encoder block and return its output tensor.
 *
 * The block is: pre-attention layer norm -> multi-head self-attention -> output projection ->
 * residual add -> post-attention layer norm -> intermediate dense + GeLU -> output dense ->
 * residual add.
 *
 * Notes:
 *  - The hidden size (768) and the intermediate size (3072) are hard-coded in the constant shapes.
 *  - Only `param.index` and `param.head_num` are read here; the layer-norm epsilon is fixed inside
 *    addLinearNorm().
 *  - A heap-allocated attention scale is stored in `w` under "<layer>.attention.scale"; whoever
 *    releases the weight map owns that allocation.
 *  - All pre-loaded weights are fetched with `WeightMap::at()` so that a missing key fails with
 *    std::out_of_range instead of silently inserting an empty `Weights` entry.
 *
 * @param net   network definition that receives the layers
 * @param w     weight map holding the "vit.encoder.layer.<index>.*" entries
 * @param input hidden-state tensor; its last dimension is the normalized axis
 * @param param per-layer parameters (layer index, number of attention heads)
 * @return output tensor of the final residual addition
 */
auto ViTLayer(INetworkDefinition* net, WeightMap& w, ITensor& input, const ViTParam& param) -> ITensor* {
    std::string name = "vit.encoder.layer." + std::to_string(param.index);
    auto attn_name = name + ".attention";
    int64_t attn_head_size = 768LL / param.head_num;

    auto* qw = net->addConstant(Dims3{1, 768, 768}, w.at(attn_name + ".attention.query.weight"));
    auto* kw = net->addConstant(Dims3{1, 768, 768}, w.at(attn_name + ".attention.key.weight"));
    auto* vw = net->addConstant(Dims3{1, 768, 768}, w.at(attn_name + ".attention.value.weight"));
    /** 1. layer norm before attention */
    auto pre_ln_name = name + ".layernorm_before";
    auto dims = input.getDimensions();
    // normalize over the last axis only
    uint32_t axes = 1U << static_cast<uint32_t>(dims.nbDims - 1);
    auto* ln_scale = net->addConstant(Dims3{1, 1, dims.d[dims.nbDims - 1]}, w.at(pre_ln_name + ".weight"));
    auto* ln_bias = net->addConstant(Dims3{1, 1, dims.d[dims.nbDims - 1]}, w.at(pre_ln_name + ".bias"));
    auto* pre_lnorm = addLinearNorm(net, input, *ln_scale->getOutput(0), *ln_bias->getOutput(0), axes);

    /** 2. multi-head self-attention */
    auto* qb = net->addConstant(Dims3{1, 1, 768}, w.at(attn_name + ".attention.query.bias"));
    auto* kb = net->addConstant(Dims3{1, 1, 768}, w.at(attn_name + ".attention.key.bias"));
    auto* vb = net->addConstant(Dims3{1, 1, 768}, w.at(attn_name + ".attention.value.bias"));
    auto* _lno = pre_lnorm->getOutput(0);
    // 2.1 Q, K, V projections: x * W^T + b
    auto* _q_attn = net->addMatrixMultiply(*_lno, M::kNONE, *qw->getOutput(0), M::kTRANSPOSE);
    auto* _k_attn = net->addMatrixMultiply(*_lno, M::kNONE, *kw->getOutput(0), M::kTRANSPOSE);
    auto* _v_attn = net->addMatrixMultiply(*_lno, M::kNONE, *vw->getOutput(0), M::kTRANSPOSE);
    _q_attn->setName((attn_name + ".query").c_str());
    _k_attn->setName((attn_name + ".key").c_str());
    _v_attn->setName((attn_name + ".value").c_str());
    auto* q_attn = net->addElementWise(*_q_attn->getOutput(0), *qb->getOutput(0), E::kSUM);
    auto* k_attn = net->addElementWise(*_k_attn->getOutput(0), *kb->getOutput(0), E::kSUM);
    auto* v_attn = net->addElementWise(*_v_attn->getOutput(0), *vb->getOutput(0), E::kSUM);
    // split the hidden dimension into heads, then move the head axis in front of the sequence axis
    auto* q_s = net->addShuffle(*q_attn->getOutput(0));
    auto* k_s = net->addShuffle(*k_attn->getOutput(0));
    auto* v_s = net->addShuffle(*v_attn->getOutput(0));
    q_s->setReshapeDimensions(Dims4{0, 0, param.head_num, attn_head_size});
    q_s->setSecondTranspose({0, 2, 1, 3});
    k_s->setReshapeDimensions(Dims4{0, 0, param.head_num, attn_head_size});
    k_s->setSecondTranspose({0, 2, 1, 3});
    v_s->setReshapeDimensions(Dims4{0, 0, param.head_num, attn_head_size});
    v_s->setSecondTranspose({0, 2, 1, 3});

    // 2.2 Q, K scaling (and softmax / fused attention)
    const float scale_f = 1.0f / std::sqrt(static_cast<float>(attn_head_size));
    if (input.getType() == DataType::kHALF) {
        auto* scale_val = new half[1];
        scale_val[0] = __float2half(scale_f);
        w[attn_name + ".scale"] = Weights{.type = DataType::kHALF, .values = scale_val, .count = 1};
    } else {
        // stored as uint32_t[] so it can be released the same way as the other FP32 weights
        auto* scale_val = new uint32_t[1];
        std::memcpy(scale_val, &scale_f, sizeof(float));
        w[attn_name + ".scale"] = Weights{.type = DataType::kFLOAT, .values = scale_val, .count = 1};
    }
    auto* qk_scale_w = net->addConstant(Dims4{1, 1, 1, 1}, w.at(attn_name + ".scale"));

    // 2.3 QKV attention output and reshape
#if TRT_VERSION >= 11400 && TRT_VERSION < 11500
    gLogger.log(Severity::kWARNING,
                "IAttention is available in TensorRT 10.14.1 SDK but have bugs, use 10.15.1+ to enable native fused "
                "kernel");
#endif
#if TRT_VERSION >= 11500
    using ANO = AttentionNormalizationOp;
    auto* q_scaled = net->addElementWise(*q_s->getOutput(0), *qk_scale_w->getOutput(0), E::kPROD)->getOutput(0);
    auto* attn = net->addAttention(*q_scaled, *k_s->getOutput(0), *v_s->getOutput(0), ANO::kSOFTMAX, false);
    assert(attn != nullptr);
    auto status = attn->setDecomposable(false);
    assert(status);
    (void)status;  // only inspected by the assert; silences unused warnings under NDEBUG
    auto* attn_out = net->addShuffle(*attn->getOutput(0));
#else
    auto* qk = net->addMatrixMultiply(*q_s->getOutput(0), M::kNONE, *k_s->getOutput(0), M::kTRANSPOSE);
    auto* attn_qk = net->addElementWise(*qk->getOutput(0), *qk_scale_w->getOutput(0), E::kPROD);
    auto* qk_softmax = net->addSoftMax(*attn_qk->getOutput(0));
    qk_softmax->setAxes(1U << static_cast<uint32_t>(attn_qk->getOutput(0)->getDimensions().nbDims - 1));
    auto* attn_qkv = net->addMatrixMultiply(*qk_softmax->getOutput(0), M::kNONE, *v_s->getOutput(0), M::kNONE);
    attn_qkv->setName((attn_name + ".attn_qkv").c_str());
    auto* attn_out = net->addShuffle(*attn_qkv->getOutput(0));
#endif
    // undo the head split: swap head/sequence axes back and merge heads into the hidden dimension
    attn_out->setFirstTranspose({0, 2, 1, 3});
    attn_out->setReshapeDimensions(Dims3{0, 0, 768});
    // 2.4 attention output projection
    auto* out_proj_w = net->addConstant(Dims3{1, 768, 768}, w.at(attn_name + ".output.dense.weight"))->getOutput(0);
    auto* out_proj_b = net->addConstant(Dims3{1, 1, 768}, w.at(attn_name + ".output.dense.bias"))->getOutput(0);
    auto* attn_fcw = net->addMatrixMultiply(*attn_out->getOutput(0), M::kNONE, *out_proj_w, M::kTRANSPOSE);
    auto* attn_fcb = net->addElementWise(*attn_fcw->getOutput(0), *out_proj_b, E::kSUM);
    attn_fcb->setName((attn_name + ".out_proj").c_str());

    /** 3. attention and hidden state residual connection */
    auto* attn_residual = net->addElementWise(input, *attn_fcb->getOutput(0), E::kSUM);
    attn_residual->setName((name + ".attn_residual").c_str());

    /** 4. layer norm after attention */
    auto post_ln_name = name + ".layernorm_after";
    ln_scale = net->addConstant(Dims3{1, 1, dims.d[dims.nbDims - 1]}, w.at(post_ln_name + ".weight"));
    ln_bias = net->addConstant(Dims3{1, 1, dims.d[dims.nbDims - 1]}, w.at(post_ln_name + ".bias"));
    auto* _res = attn_residual->getOutput(0);
    axes = 1U << static_cast<uint32_t>(_res->getDimensions().nbDims - 1);
    auto* post_lnorm = addLinearNorm(net, *_res, *ln_scale->getOutput(0), *ln_bias->getOutput(0), axes);

    /** 5. intermediate (feed-forward) layer and activation */
    auto intermediate_name = name + ".intermediate.dense";
    std::cout << "Building: " << intermediate_name << "\n";
    auto* iw = net->addConstant(Dims3{1, 3072, 768}, w.at(intermediate_name + ".weight"));
    auto* ib = net->addConstant(Dims3{1, 1, 3072}, w.at(intermediate_name + ".bias"));
    ib->setName((intermediate_name + ".bias").c_str());
    auto* inter0 = net->addMatrixMultiply(*post_lnorm->getOutput(0), M::kNONE, *iw->getOutput(0), M::kTRANSPOSE);
    auto* inter1 = net->addElementWise(*inter0->getOutput(0), *ib->getOutput(0), E::kSUM);
    auto* inter_act = addGeLU(net, *inter1->getOutput(0));

    /** 6. output projection */
    auto output_name = name + ".output.dense";
    std::cout << "Building: " << output_name << "\n";
    auto* ow = net->addConstant(Dims3{1, 768, 3072}, w.at(output_name + ".weight"));
    auto* ob = net->addConstant(Dims3{1, 1, 768}, w.at(output_name + ".bias"));
    ob->setName((output_name + ".bias").c_str());
    auto* out0 = net->addMatrixMultiply(*inter_act->getOutput(0), M::kNONE, *ow->getOutput(0), M::kTRANSPOSE);
    auto* out1 = net->addElementWise(*out0->getOutput(0), *ob->getOutput(0), E::kSUM);

    /** 7. residual */
    auto* output_residual = net->addElementWise(*out1->getOutput(0), *attn_residual->getOutput(0), E::kSUM);
    output_residual->setName((name + ".output_residual").c_str());
    return output_residual->getOutput(0);
}

/**
 * Create the engine using only the TensorRT network API, without any parser.
 *
 * Loads the weights from WTS_PATH (converting them to FP16 when `dt` is kHALF), builds the ViT
 * network (patch embedding, cls token + position embedding, 12 encoder layers, final layer norm,
 * classifier head), builds the engine and finally releases every host weight buffer in the map.
 *
 * @param N       batch size: used as the kOPT profile batch, for the classifier slice/reshape,
 *                and (doubled) as the kMAX profile batch
 * @param runtime runtime used to deserialize the built plan (TensorRT 8+ path)
 * @param builder builder that creates the network and the plan
 * @param config  builder configuration; workspace limit and optimization profile are set here
 * @param dt      data type of the network input and of the weights
 * @return the built engine, or nullptr when building or deserializing failed
 */
auto createEngine(int64_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config,
                  DataType dt) -> ICudaEngine* {
    WeightMap w = loadWeights(WTS_PATH);
    if (dt == DataType::kHALF) {
        convertWeightMapToHalf(w);
    }

#if TRT_VERSION >= 10000
    auto* net = builder->createNetworkV2(1U << static_cast<uint32_t>(NDCF::kSTRONGLY_TYPED));
#else
    auto* net = builder->createNetworkV2(1U << static_cast<int>(NDCF::kEXPLICIT_BATCH));
#endif

    // 1. patch embedding
    ITensor* data = net->addInput(NAMES[0], dt, Dims4{-1, 3, INPUT_H, INPUT_W});
    std::string name = "vit.embeddings.patch_embeddings.projection.";
    auto* embed = net->addConvolutionNd(*data, 768, DimsHW{16, 16}, w[name + "weight"], w[name + "bias"]);
    embed->setName("patch embedding");
    embed->setStrideNd(DimsHW{16, 16});
    auto* s = net->addShuffle(*embed->getOutput(0));
    s->setReshapeDimensions(Dims3{0, 768, 14LL * 14});
    s->setSecondTranspose({0, 2, 1});

    // 2. add cls token and position embedding
    auto* cls_token = net->addConstant(Dims3{1, 1, 768}, w["vit.embeddings.cls_token"]);
    auto* pos_embed = net->addConstant(Dims3{1, 197, 768}, w["vit.embeddings.position_embeddings"]);
    const std::array<ITensor*, 2> _cat = {cls_token->getOutput(0), s->getOutput(0)};
    auto* cat = net->addConcatenation(_cat.data(), 2);
    cat->setAxis(1);
    cat->setName("cat_clstoken_embed");
    auto* pos_added = net->addElementWise(*cat->getOutput(0), *pos_embed->getOutput(0), ElementWiseOperation::kSUM);
    pos_added->setName("position_embed");

    // 3. transformer encoder layers
    ITensor* input = pos_added->getOutput(0);
    for (auto i = 0u; i < 12; i++) {
        auto* vit = ViTLayer(net, w, *input, {.index = i, .head_num = 12, .lnorm_eps = 1e-12f});
        input = vit;
    }

    // 4. layer norm after transformer encoder
    auto* ln_scale = net->addConstant(Dims3{1, 1, 768}, w["vit.layernorm.weight"]);
    auto* ln_bias = net->addConstant(Dims3{1, 1, 768}, w["vit.layernorm.bias"]);
    uint32_t axes = 1U << static_cast<uint32_t>(input->getDimensions().nbDims - 1);
    auto* post_lnorm = addLinearNorm(net, *input, *ln_scale->getOutput(0), *ln_bias->getOutput(0), axes);
    // 5. classifier head: keep only the first (cls) token of every sequence
    auto* slice = net->addSlice(*post_lnorm->getOutput(0), Dims3{0, 0, 0}, Dims3{N, 1, 768}, Dims3{1, 1, 1});
    auto* shuffle = net->addShuffle(*slice->getOutput(0));
    shuffle->setReshapeDimensions(Dims2{N, 768});
    auto* cls_w = net->addConstant(DimsHW{1000, 768}, w["classifier.weight"]);
    auto* cls_b = net->addConstant(DimsHW{1, 1000}, w["classifier.bias"]);
    auto* cls_0 = net->addMatrixMultiply(*shuffle->getOutput(0), M::kNONE, *cls_w->getOutput(0), M::kTRANSPOSE);
    auto* cls_1 = net->addElementWise(*cls_b->getOutput(0), *cls_0->getOutput(0), E::kSUM);
    net->markOutput(*cls_1->getOutput(0));

    Dims4 _min{1, 3, INPUT_H, INPUT_W}, _opt{N, 3, INPUT_H, INPUT_W}, _max{2 * N, 3, INPUT_H, INPUT_W};
#if TRT_VERSION >= 8000
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE);
    config->setHardwareCompatibilityLevel(HardwareCompatibilityLevel::kAMPERE_PLUS);
    auto* profile = builder->createOptimizationProfile();
    profile->setDimensions(NAMES[0], OptProfileSelector::kMIN, _min);
    profile->setDimensions(NAMES[0], OptProfileSelector::kOPT, _opt);
    profile->setDimensions(NAMES[0], OptProfileSelector::kMAX, _max);
    config->addOptimizationProfile(profile);
    ICudaEngine* engine = nullptr;
    IHostMemory* plan = builder->buildSerializedNetwork(*net, *config);
    if (plan != nullptr) {
        engine = runtime->deserializeCudaEngine(plan->data(), plan->size());
        // the serialized plan is no longer needed once the engine has been deserialized
        delete plan;
    } else {
        std::cerr << "failed to build serialized network\n";
    }
    delete net;
#else
    builder->setMaxBatchSize(N);
    config->setMaxWorkspaceSize(WORKSPACE_SIZE);
    ICudaEngine* engine = builder->buildEngineWithConfig(*net, *config);
    net->destroy();
#endif

    std::cout << "build finished\n";
    // Release host memory
    for (auto& entry : w) {
        if (entry.second.values == nullptr) {
            continue;
        }
        if (entry.second.type == DataType::kHALF) {
            delete[] reinterpret_cast<const half*>(entry.second.values);
        } else {
            // loadWeights() allocates with new uint32_t[]
            delete[] reinterpret_cast<const uint32_t*>(entry.second.values);
        }
    }

    return engine;
}

/**
 * Run one inference on `context` and return every output tensor as a float vector.
 *
 * A CUDA stream is created and destroyed on each call. Tensor 0 is treated as the input and is
 * copied from `input` to the device; all remaining IO tensors are treated as outputs. On
 * TensorRT 10+ output memory is provided by a CudaOutputAllocator; on older versions it is
 * allocated here with cudaMalloc.
 *
 * The enqueue call is executed unconditionally and only its result is asserted, so inference is
 * still launched when the code is compiled with NDEBUG.
 *
 * @param context   execution context of a deserialized engine
 * @param input     host buffer holding `batchSize * SIZES[0]` input elements
 * @param batchSize number of samples in `input`
 * @return one float vector per output tensor, each holding `batchSize * SIZES[i]` values
 */
std::vector<std::vector<float>> doInference(IExecutionContext& context, __half* input, std::size_t batchSize) {
    const ICudaEngine& engine = context.getEngine();
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    std::vector<void*> buffers;
#if TRT_VERSION >= 10000
    auto allocator = CudaOutputAllocator::Create(stream);
#endif

#if TRT_VERSION >= 8000
    const int32_t nIO = engine.getNbIOTensors();
#else
    const int32_t nIO = engine.getNbBindings();
#endif

    buffers.resize(nIO, nullptr);
    for (auto i = 0; i < nIO; ++i) {

#if TRT_VERSION >= 8000
        // TensorRT 8+ use name based SDK
        auto* tensor_name = engine.getIOTensorName(i);
        const auto dtype = engine.getTensorDataType(tensor_name);
        std::size_t size = batchSize * SIZES[i] * bytesPerElement(dtype);
#if TRT_VERSION >= 10000
        // TensorRT 10+ use output allocator
        if (i == 0) {
            CHECK(cudaMalloc(&buffers[i], size));
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
            context.setTensorAddress(tensor_name, buffers[i]);
        } else {
            context.setOutputAllocator(tensor_name, allocator.get());
        }
#else
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
        context.setTensorAddress(tensor_name, buffers[i]);
#endif
#else
        std::size_t size = batchSize * SIZES[i] * sizeof(float);
        const int32_t idx = engine.getBindingIndex(NAMES[i]);
        assert(idx == i);
        (void)idx;  // only inspected by the assert
        CHECK(cudaMalloc(&buffers[i], size));
        if (i == 0) {
            CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream));
        }
#endif
    }

    // NOTE: the enqueue must not live inside assert(), otherwise it is compiled out under NDEBUG.
#if TRT_VERSION >= 8000
    const bool enqueued = context.enqueueV3(stream);
#else
    const bool enqueued = context.enqueueV2(buffers.data(), stream, nullptr);
#endif
    if (!enqueued) {
        std::cerr << "TensorRT enqueue failed\n";
    }
    assert(enqueued);

    std::vector<std::vector<float>> prob;
    for (int i = 1; i < nIO; ++i) {
#if TRT_VERSION >= 10000
        auto* tensor_name = engine.getIOTensorName(i);
        const auto dtype = engine.getTensorDataType(tensor_name);
        std::size_t size = batchSize * SIZES[i] * bytesPerElement(dtype);
        void* out_ptr = allocator->getBuffer(tensor_name);
        // D2H data transfer
        if (dtype == DataType::kHALF) {
            std::vector<__half> tmp_h(batchSize * SIZES[i]);
            CHECK(cudaMemcpyAsync(tmp_h.data(), out_ptr, size, cudaMemcpyDeviceToHost, stream));
            // the half buffer is converted right away, so the copy has to be complete here
            CHECK(cudaStreamSynchronize(stream));
            std::vector<float> tmp(batchSize * SIZES[i]);
            for (std::size_t j = 0; j < tmp.size(); ++j) {
                tmp[j] = __half2float(tmp_h[j]);
            }
            prob.emplace_back(std::move(tmp));
        } else {
            std::vector<float> tmp(batchSize * SIZES[i], std::nanf(""));
            CHECK(cudaMemcpyAsync(tmp.data(), out_ptr, size, cudaMemcpyDeviceToHost, stream));
            // moving the vector keeps its heap buffer, so the pending async copy stays valid
            prob.emplace_back(std::move(tmp));
        }
#else
        std::vector<float> tmp(batchSize * SIZES[i], std::nanf(""));
        std::size_t size = batchSize * SIZES[i] * sizeof(float);
        CHECK(cudaMemcpyAsync(tmp.data(), buffers[i], size, cudaMemcpyDeviceToHost, stream));
        prob.emplace_back(std::move(tmp));
#endif
    }
    CHECK(cudaStreamSynchronize(stream));

    for (auto& buffer : buffers) {
        if (buffer != nullptr) {
            CHECK(cudaFree(buffer));
        }
    }
#if TRT_VERSION >= 10000
    allocator.reset();
#endif
    CHECK(cudaStreamDestroy(stream));
    return prob;
}

/**
 * Build the FP16 engine through the network API and hand back its serialized form.
 *
 * The builder, builder config and engine created here are released before returning.
 *
 * @param N           batch size forwarded to createEngine()
 * @param runtime     runtime forwarded to createEngine()
 * @param modelStream receives the serialized engine, or nullptr when the engine could not be
 *                    built; the caller owns the returned memory
 */
void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) {
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    ICudaEngine* engine = createEngine(N, runtime, builder, config, DataType::kHALF);
    if (engine != nullptr) {
        (*modelStream) = engine->serialize();
    } else {
        // report the failure through the out-parameter instead of dereferencing a null engine
        std::cerr << "failed to create the engine\n";
        (*modelStream) = nullptr;
    }

#if TRT_VERSION >= 8000
    delete engine;
    delete config;
    delete builder;
#else
    if (engine != nullptr) {
        engine->destroy();
    }
    config->destroy();
    builder->destroy();
#endif
}

auto main(int argc, char** argv) -> int {
    std::cout << "TensorRT version: " << TRT_VERSION << "\n";
    if (argc != 2) {
        std::cerr << "arguments not right!\n";
        std::cerr << "./vit -s  // serialize model to plan file\n";
        std::cerr << "./vit -d  // deserialize plan file and run inference\n";

        return 1;
    }
#ifndef NDEBUG
    gLogger.setReportableSeverity(nvinfer1::ILogger::Severity::kVERBOSE);
#endif
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    char* trtModelStream{nullptr};
    std::streamsize size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(N, runtime, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc);
        if (!p) {
            std::cerr << "could not open plan output file\n";
            return -1;
        }
        if (modelStream->size() > static_cast<std::size_t>(std::numeric_limits<std::streamsize>::max())) {
            std::cerr << "this model is too large to serialize\n";
            return -1;
        }
        const auto* data_ptr = reinterpret_cast<const char*>(modelStream->data());
        auto data_size = static_cast<std::streamsize>(modelStream->size());
        p.write(data_ptr, data_size);
#if TRT_VERSION >= 8000
        delete modelStream;
#else
        modelStream->destroy();
#endif
        return 0;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file(ENGINE_PATH, std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        } else {
            std::cerr << "read engine file error!\n";
            return -1;
        }

#if TRT_VERSION >= 8000
        ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
#else
        ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
#endif
        assert(engine != nullptr);
        auto* context = engine->createExecutionContext();
        assert(context != nullptr);

        // VIT use default BGR order
        auto img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR);
        auto input = preprocess_img(img, false, mean, stdv, N, INPUT_H, INPUT_W);

        Profiler profiler("VisionTransformerProfiler");

        // Warmup: run a few iterations without profiling.
        for (int i = 0; i < 5; ++i) {
            (void)doInference(*context, input.data(), N);
        }

        // Profiled runs
        context->setProfiler(&profiler);
        for (int i = 0; i < 20; ++i) {
            auto start = std::chrono::system_clock::now();
            auto prob = doInference(*context, input.data(), N);
            auto end = std::chrono::system_clock::now();
            auto period = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
            std::cout << period.count() << "us\n";

            for (const auto& vector : prob) {
                int idx = 0;
                for (auto v : vector) {
                    std::cout << std::setprecision(4) << v << ", " << std::flush;
                    if (++idx > 20) {
                        std::cout << "\n====\n";
                        break;
                    }
                }
            }

            if (i == 19) {
                std::cout << "prediction result: \n";
                auto labels = loadImagenetLabelMap(LABELS_PATH);
                int _top = 0;
                for (auto& [idx, logits] : topk(prob[0], 3)) {
                    std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits
                              << ", label: " << labels[idx] << "\n";
                }
                std::cout << profiler << "\n";
            }
        }
        return 0;
    }
    return 0;
}


================================================
FILE: yolo11/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.10)

project(yolov11)

# Compiler flags / definitions shared by every target in this directory.
add_definitions(-std=c++11 -DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

# CUDA toolchain
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)

# Project headers
include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message("embed_platform on")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
  message("embed_platform off")

  # cuda
  include_directories(/usr/local/cuda/include)
  link_directories(/usr/local/cuda/lib64)

  # tensorrt
  include_directories(/workspace/shared/TensorRT-8.6.1.6/include)
  link_directories(/workspace/shared/TensorRT-8.6.1.6/lib)
endif()

# Plugin library holding the custom YOLO layer
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Sources shared by all executables
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)

# One executable per task; each is built from <name>.cpp plus the shared sources
# and links against the same set of libraries.
foreach(exe yolo11_det yolo11_cls yolo11_seg yolo11_pose yolo11_obb)
  add_executable(${exe} ${PROJECT_SOURCE_DIR}/${exe}.cpp ${SRCS})
  target_link_libraries(${exe} nvinfer cudart myplugins ${OpenCV_LIBS})
endforeach()


================================================
FILE: yolo11/gen_wts.py
================================================
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse the command line of the .pt -> .wts converter.

    Returns:
        tuple: ``(weights_path, output_path, model_type)``.

        * When ``--output`` is omitted, the output path is the weights path
          with its extension replaced by ``.wts``.
        * When ``--output`` names an existing directory, the output file is
          placed in that directory and named after the weights file.

    Raises:
        SystemExit: if the weights path is not an existing file.
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output', help='Output (.wts) file path (optional)')
    cli.add_argument('-t', '--type', type=str, default='detect',
                     choices=['detect', 'cls', 'seg', 'pose', 'obb'],
                     help='determines the model is detection/classification')
    opts = cli.parse_args()

    if not os.path.isfile(opts.weights):
        raise SystemExit('Invalid input file')

    if not opts.output:
        # keep the directory of the weights file, swap only the extension
        stem, _ext = os.path.splitext(opts.weights)
        opts.output = stem + '.wts'
    elif os.path.isdir(opts.output):
        file_stem, _ext = os.path.splitext(os.path.basename(opts.weights))
        opts.output = os.path.join(opts.output, file_stem + '.wts')

    return opts.weights, opts.output, opts.type


# Resolve input checkpoint, output file and model type from the command line.
pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')
print(f'Loading {pt_file}')

# Everything is done on the CPU.
device = 'cpu'

# Load the checkpoint and pick the EMA weights when present, as FP32.
checkpoint = torch.load(pt_file, map_location=device, weights_only=False)
model_key = 'ema' if checkpoint.get('ema') else 'model'
model = checkpoint[model_key].float()

if m_type in ['detect', 'seg', 'pose', 'obb']:
    head = model.model[-1]
    anchor_grid = head.anchors * head.stride[..., None, None]

    # drop the attribute from the last module before the state dict is exported
    delattr(head, 'anchors')

model.to(device).eval()

# File layout: first line is the number of entries; then one line per entry:
#   "<name> <count> " followed by " <big-endian float32 as hex>" for every value.
state = model.state_dict()
with open(wts_file, 'w') as f:
    f.write(f'{len(state.keys())}\n')
    for key, tensor in state.items():
        flat = tensor.reshape(-1).cpu().numpy()
        f.write(f'{key} {len(flat)} ')
        f.write(''.join(' ' + struct.pack('>f', float(value)).hex() for value in flat))
        f.write('\n')


================================================
FILE: yolo11/include/block.h
================================================
#pragma once

#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"

// Network building blocks for the yolo11 models. Only the prototypes live here; the
// descriptions below are derived from names and signatures — NOTE(review): confirm the exact
// behavior against the definitions in the corresponding source file.

// Loads a weight map (tensor name -> nvinfer1::Weights) from the given file path.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

// Adds a batch-norm step (returned as an IScaleLayer) on `input`.
// `lname` is presumably the weight-name prefix of the layer; `eps` the batch-norm epsilon.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps);

// Presumably convolution + batch norm + SiLU activation.
// `ch`: output channels, `k`: kernel size, `s`: stride — TODO confirm.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname);

// C2F block. Note: the weight map is taken by value here.
nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                 int c2, int n, bool shortcut, float e, std::string lname);

// C2 block. Note: the weight map is taken by non-const reference here.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname);

// SPPF block; `k` is presumably the pooling kernel size — TODO confirm.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname);

// DFL block, returned as a shuffle layer.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);

// Adds the YOLO plugin layer on top of the detection-head concatenations in `dets`.
// `px_arry` points to `px_arry_num` ints; the three flags select segmentation / pose / obb mode.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb);

// C3K2 block; `c3k` presumably selects the C3k variant of the inner blocks — TODO confirm.
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int n, bool c3k, bool shortcut, float e, std::string lname);

// C2PSA block.
nvinfer1::ILayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap,
                        nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname);

// Presumably a depth-wise convolution block — TODO confirm.
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname);


================================================
FILE: yolo11/include/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
   public:
    //! \param batchsize        calibration batch size
    //! \param input_w          network input width
    //! \param input_h          network input height
    //! \param img_dir          directory holding the calibration images
    //! \param calib_table_name path of the calibration cache file
    //! \param input_blob_name  name of the network input tensor
    //! \param read_cache       defaults to true; presumably enables reuse of an existing
    //!                         calibration cache -- TODO confirm against the implementation
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
                           const char* input_blob_name, bool read_cache = true);
    virtual ~Int8EntropyCalibrator2();
    //! Overrides of the nvinfer1::IInt8EntropyCalibrator2 interface.
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

   private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;  // presumably the index of the next image to feed -- TODO confirm
    std::string img_dir_;
    std::vector<std::string> img_files_;  // presumably the file names found in img_dir_ -- TODO confirm
    size_t input_count_;  // presumably the number of input elements per batch -- TODO confirm
    std::string calib_table_name_;
    // NOTE(review): stored as a raw pointer, so the string passed to the constructor must
    // outlive this object.
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;  // presumably a device buffer for one input batch -- TODO confirm
    std::vector<char> calib_cache_;
};

#endif  // ENTROPY_CALIBRATOR_H


================================================
FILE: yolo11/include/config.h
================================================
#ifndef TRTX_YOLO11_CONFIG_H_
#define TRTX_YOLO11_CONFIG_H_

// Compile-time configuration of the yolo11 samples.
// The include guard makes repeated inclusion in one translation unit safe; without it every
// constant below would be redefined.

// Inference precision: enable exactly one of the following.
#define USE_FP16
// #define USE_FP32
// #define USE_INT8

// Tensor names
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";
const static char* kProtoTensorName = "proto";
// detection model's number of classes
const static int kNumClass = 80;
// pose model's number of classes
const static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;  // number of keypoints total
// obb model's number of classes
constexpr static int kObbNumClass = 15;
const static int kObbNe = 1;  // number of extra parameters
const static int kBatchSize = 1;
const static int kGpuId = 0;
// default input shape
const static int kInputH = 640;
const static int kInputW = 640;
// obb model's input shape
const static int kObbInputH = 1024;
const static int kObbInputW = 1024;
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 1000;
// Quantization input image folder path
const static char* kInputQuantizationFolder = "./coco_calib";

// Classification model's number of classes
constexpr static int kClsNumClass = 1000;
// Classification model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

#endif  // TRTX_YOLO11_CONFIG_H_


================================================
FILE: yolo11/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

// The macro below expands to std::cerr and assert, so the header pulls them in itself instead
// of relying on the include order of its users.
#include <cassert>
#include <iostream>

// CUDA_CHECK(call): evaluates `call` once; when the result is not cudaSuccess it prints the
// numeric error code, its description, and the source location to stderr, then trips an assert.
// NOTE: assert() is compiled out when NDEBUG is defined, so in such builds the error is only
// reported and execution continues.
// The expansion is a plain brace block (not do/while), so it works with or without a trailing
// semicolon at the call site.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)                                                                          \
    {                                                                                                \
        cudaError_t error_code = (callstr);                                                          \
        if (error_code != cudaSuccess) {                                                             \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code)       \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                        \
            assert(0);                                                                               \
        }                                                                                            \
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: yolo11/include/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//! Stream buffer that collects formatted text and, on every sync (flush), emits it to a target stream as
//! "[MM/DD/YYYY-HH:MM:SS] <prefix><text>". Nothing is emitted while logging is disabled.
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    Destination stream; must outlive this buffer (only a reference is stored).
    //! \param prefix    Text inserted between the timestamp and every emitted message.
    //! \param shouldLog When false, putOutput() is a no-op.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Carries over the destination, prefix and enable flag of \p other.
    //! Previously only mOutput was initialized, so the new buffer lost its prefix and read an
    //! uninitialized mShouldLog. Pending text stays in \p other and is flushed by its destructor.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the two differ there is unflushed text, so emit it before the buffer goes away
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    //! Synchronizes the stream buffer: inserts the buffered text into the destination stream,
    //! resets the buffer and flushes the destination. Always returns 0 (success).
    int sync() override {
        putOutput();
        return 0;
    }

    //! Emits timestamp + prefix + buffered text to the destination stream and clears the buffer.
    //! Does nothing (and keeps the buffered text) when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // Build the timestamp in a private stream so that the setw/setfill manipulators do not leak
            // into the destination stream, and so that it lands on the same stream as the message
            // (it used to be written to std::cout even when the message went to std::cerr).
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            std::ostringstream stamp;
            if (tm_local != nullptr) {
                stamp << "[";
                stamp << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                stamp << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables emission for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // destination stream (not owned)
    std::string mPrefix;    // severity tag inserted before each message
    bool mShouldLog;        // whether putOutput() actually emits
};

//!
//! \class LogStreamConsumerBase
//! \brief Owns the LogStreamConsumerBuffer so that, as the first base class of LogStreamConsumer,
//!  the buffer is fully constructed before its address is handed to the std::ostream base.
//!
class LogStreamConsumerBase {
   protected:
    LogStreamConsumerBuffer mBuffer;

   public:
    //! Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog} {}
};

//!
//! \class LogStreamConsumer
//! \brief std::ostream front-end that lets callers log with the usual `<<` syntax.
//!  The base classes must stay in the order LogStreamConsumerBase, std::ostream: the first base owns
//!  the LogStreamConsumerBuffer, and its address is passed to the std::ostream constructor. With the
//!  opposite order std::ostream would receive the address of a buffer that has not been constructed yet.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  The message is emitted only when `severity <= reportableSeverity`.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the freshly built buffer to this stream
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a new consumer with the same severity and enable flag as \p other.
    //!  A fresh buffer is created; text still pending in \p other is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the freshly built buffer to this stream
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer emits, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! Severities at or above kINFO go to std::cout, everything below it goes to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed tag; asserts on an unknown value and then returns "".
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Creates a logger whose reportable severity is \p severity (kWARNING by default).
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief State of a test as printed by the reportTest* helpers
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Returns this object viewed through the nvinfer1::ILogger interface
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples register the logger with TensorRT through this method,
    //! the inheritance of Logger from ILogger can be removed
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] " followed by \p msg and a newline through a LogStreamConsumer built from the current
    //! reportable severity and \p severity. Samples should not call this directly.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer consumer(mReportableSeverity, severity);
        consumer << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Sets the severity threshold used for subsequent log() calls
    //!
    //! \param severity The new reportable severity.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! Obtain one from Logger::defineTest() and pass it to Logger::reportTest{Start,End}().
    //! Only Logger can construct it or read its fields.
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        const bool started = false;
        return TestAtom(started, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() that builds the command line from (argc, argv)
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // The RUNNING line is printed first; the precondition is asserted afterwards.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as PASSED and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as FAILED and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as WAIVED and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() when \p pass is true, otherwise to reportFail().
    static int reportTest(const TestAtom& testAtom, bool pass) {
        if (pass) {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current reportable severity.
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns the bracketed tag for a severity; asserts and returns "" for an unknown value
    //!
    static const char* severityPrefix(Severity severity) {
        if (severity == Severity::kINTERNAL_ERROR) {
            return "[F] ";
        }
        if (severity == Severity::kERROR) {
            return "[E] ";
        }
        if (severity == Severity::kWARNING) {
            return "[W] ";
        }
        if (severity == Severity::kINFO) {
            return "[I] ";
        }
        if (severity == Severity::kVERBOSE) {
            return "[V] ";
        }
        assert(0);
        return "";
    }

    //!
    //! \brief returns the word printed for a test result; asserts and returns "" for an unknown value
    //!
    static const char* testResultString(TestResult result) {
        if (result == TestResult::kRUNNING) {
            return "RUNNING";
        }
        if (result == TestResult::kPASSED) {
            return "PASSED";
        }
        if (result == TestResult::kFAILED) {
            return "FAILED";
        }
        if (result == TestResult::kWAIVED) {
            return "WAIVED";
        }
        assert(0);
        return "";
    }

    //!
    //! \brief returns std::cout for severities at or above kINFO and std::cerr otherwise
    //!
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief prints "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief joins the argv entries with single spaces into one command-line string
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream joined;
        const char* separator = "";
        for (int i = 0; i < argc; ++i) {
            joined << separator << argv[i];
            separator = " ";
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief Builds a LogStreamConsumer for kVERBOSE messages, gated by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Builds a LogStreamConsumer for kINFO messages, gated by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Builds a LogStreamConsumer for kWARNING messages, gated by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Builds a LogStreamConsumer for kERROR messages, gated by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Builds a LogStreamConsumer for kINTERNAL_ERROR ("fatal") messages, gated by the logger's
//!        reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: yolo11/include/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

// API: symbol import/export decoration.
//   MSVC:            dllexport when API_EXPORTS is defined, dllimport otherwise.
//   other compilers: default visibility when API_EXPORTS is defined, nothing otherwise.
#if defined(_MSC_VER)
#if defined(API_EXPORTS)
#define API __declspec(dllexport)
#else
#define API __declspec(dllimport)
#endif
#elif defined(API_EXPORTS)
#define API __attribute__((visibility("default")))
#else
#define API
#endif  // API

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE: expand to `noexcept` / `const` when NV_TENSORRT_MAJOR >= 8,
// and to nothing for older versions (or when NV_TENSORRT_MAJOR is not defined).
#if NV_TENSORRT_MAJOR < 8
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#else
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#endif

#endif  // __MACROS_H


================================================
FILE: yolo11/include/model.h
================================================
#pragma once

#include <assert.h>
#include <string>
#include "NvInfer.h"

// Engine builders, one per YOLO11 task. The definitions are not in this header.
// Every builder takes a TensorRT builder and builder config, a data type, the path of a weights file and
// the scaling inputs gd / gw / max_channels / type, and returns an nvinfer1::IHostMemory*.
// NOTE(review): gd / gw are presumably the depth and width multipliers and the result presumably the
// serialized engine owned by the caller — confirm against the implementation.

// Classification model.
// NOTE(review): unlike the other four builders, this one takes `type` BEFORE `max_channels`, and takes
// `max_channels` by value rather than by reference — keep that in mind at call sites.
nvinfer1::IHostMemory* buildEngineYolo11Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            std::string& type, int max_channels);

// Detection model.
nvinfer1::IHostMemory* buildEngineYolo11Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type);

// Segmentation model.
nvinfer1::IHostMemory* buildEngineYolo11Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type);

// Pose (keypoint) model.
nvinfer1::IHostMemory* buildEngineYolo11Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels, std::string& type);

// Oriented-bounding-box (OBB) model.
nvinfer1::IHostMemory* buildEngineYolo11Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type);


================================================
FILE: yolo11/include/postprocess.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Post-processing entry points (declarations only; the definitions are not in this header).
// NOTE(review): this header names std::vector, std::string and std::unordered_map but does not include
// <vector>, <string> or <unordered_map> itself; it relies on the headers above to pull them in.

// Preprocessing functions
// Returns a cv::Rect computed from `img` and a 4-float box.
// NOTE(review): presumably maps a network-space box (Detection::bbox) back to image pixels — confirm.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Processing functions
// Fill `res_batch` (one vector of Detection per image) from a host-side decode buffer.
// NOTE(review): `bbox_element` is presumably the number of floats per decoded box — confirm.
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch);

void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch);

// Single-image variants of the two functions above; `count` is supplied by the caller.
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count);

void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count);

// NMS functions
// Host-side non-maximum suppression over a raw output buffer. `nms_thresh` defaults to 0.5 in all four.
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);

void batch_nms(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh = 0.5);

void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);

void batch_nms_obb(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh = 0.5);

// CUDA-related functions
// Each takes the cudaStream_t to work on.
// NOTE(review): `predict` and `parray` are presumably device pointers — confirm.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream);

void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream);

void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

// Drawing functions
// Take the images by non-const reference together with the detections for each image.
// NOTE(review): presumably they draw onto the images in place — confirm.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

// Single-image variant that also takes one mask per detection and a class-id -> label-name map.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map);


================================================
FILE: yolo11/include/preprocess.h
================================================
#pragma once

#include <map>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// GPU preprocessing entry points (declarations only; the definitions are not in this header).

// Set-up step taking an upper bound on image size.
// NOTE(review): presumably allocates staging buffers sized for `max_image_size`, and the unit (pixels vs
// bytes) is not visible here — confirm against the implementation.
void cuda_preprocess_init(int max_image_size);

// Counterpart of cuda_preprocess_init().
// NOTE(review): presumably releases what init allocated — confirm.
void cuda_preprocess_destroy();

// Processes one 8-bit source image of src_width x src_height into a float destination of
// dst_width x dst_height on `stream`.
// NOTE(review): channel order, normalization and letterboxing are not visible here — confirm.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);

// Batch variant taking the images as cv::Mat and a single destination pointer.
// NOTE(review): presumably writes the images back-to-back into `dst` — confirm.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);


================================================
FILE: yolo11/include/types.h
================================================
#pragma once
#include "config.h"

// One decoded detection. Every member is a float (or float array) and the struct is float-aligned:
// the YOLO plugin sizes its output as sizeof(Detection) / sizeof(float) floats per object and writes
// Detection records directly into a float buffer, so the member order and types are a binary contract.
struct alignas(float) Detection {
    //center_x center_y w h
    float bbox[4];
    float conf;  // bbox_conf * cls_conf
    float class_id;  // class index, stored as a float
    float mask[32];  // 32 extra values per detection, used by the segmentation model
    float keypoints[kNumberOfPoints * 3];  // 17*3 keypoints (three floats per point; kNumberOfPoints comes from config.h)
    float angle;                           // obb angle
};

// Six floats holding an affine transform.
// NOTE(review): presumably a row-major 2x3 matrix — confirm where it is filled in.
struct AffineMatrix {
    float value[6];
};

// Number of floats per decoded box: 6 (the float count of AffineMatrix) + 1 = 7.
// NOTE(review): the value is derived from AffineMatrix's size although the seven fields listed below are
// unrelated to that matrix; if AffineMatrix ever changes size this constant changes with it.
const int bbox_element =
        sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag


================================================
FILE: yolo11/include/utils.h
================================================
#pragma once
#include <dirent.h>
#include <fstream>
#include <opencv2/opencv.hpp>

// Letterboxes `img` into an input_w x input_h 8-bit BGR canvas: the image is resized with its aspect
// ratio preserved (bilinear interpolation), centred along the axis that has slack, and the rest of the
// canvas is filled with grey (128, 128, 128). Returns the new canvas; `img` itself is not modified.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);

    int resized_w, resized_h, offset_x, offset_y;
    if (scale_h > scale_w) {
        // Width is the limiting side: fill it completely and pad above and below.
        resized_w = input_w;
        resized_h = scale_w * img.rows;
        offset_x = 0;
        offset_y = (input_h - resized_h) / 2;
    } else {
        // Height is the limiting side: fill it completely and pad left and right.
        resized_w = scale_h * img.cols;
        resized_h = input_h;
        offset_x = (input_w - resized_w) / 2;
        offset_y = 0;
    }

    cv::Mat resized(resized_h, resized_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(offset_x, offset_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Appends the name of every entry of directory `p_dir_name`, except "." and "..", to `file_names`.
// Only the bare entry name is stored; it is not joined with the directory path.
// Returns 0 on success, or -1 (leaving `file_names` untouched) when the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (dir_handle == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.emplace_back(entry->d_name);
    }

    closedir(dir_handle);
    return 0;
}

// Returns a copy of `str` with leading AND trailing whitespace removed (despite the name).
// Whitespace means space, tab, newline, carriage return, form feed and vertical tab, so lines read from
// CRLF files lose their trailing '\r'. A string that is empty or consists only of whitespace yields "".
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char* const kWhitespace = " \t\n\r\f\v";
    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        // Nothing but whitespace: the trimmed result is empty, not the untouched input.
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}

// Formats `a_value` in fixed-point notation with `n` digits after the decimal point (2 by default).
// Src: https://stackoverflow.com/questions/16605967
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatter;
    formatter.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatter.precision(n);
    formatter << a_value;
    return formatter.str();
}

// Reads one label per line from `labels_filename` into `labels_map`, keyed by zero-based line number.
// Each line is passed through trim_leading_whitespace() before being stored; existing keys are overwritten.
// Returns 0 on success, or -1 (leaving `labels_map` untouched) when the file cannot be opened.
// Previously an unreadable file was silently reported as success.
static inline int read_labels(const std::string& labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        return -1;
    }

    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        labels_map[index] = trim_leading_whitespace(line);
        ++index;
    }
    // `file` is closed by its destructor.
    return 0;
}


================================================
FILE: yolo11/plugin/yololayer.cu
================================================
#include <assert.h>
#include <math.h>
#include <iostream>
#include <vector>
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"

// Helpers for (de)serializing plain values through a moving byte cursor.
// Values are copied with memcpy rather than through a reinterpret_cast'ed T*: the cursor is only
// byte-aligned (bools are interleaved with ints, and the buffer's own alignment is not known here),
// and a typed load/store through a misaligned pointer is undefined behaviour.
// T must be trivially copyable; the plugin uses int, float and bool.
namespace Tn {
// Copies `val` into the buffer at the cursor and advances the cursor by sizeof(T).
template <typename T>
void write(char*& buffer, const T& val) {
    memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

// Copies sizeof(T) bytes at the cursor into `val` and advances the cursor by sizeof(T).
template <typename T>
void read(const char*& buffer, T& val) {
    memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}
}  // namespace Tn

// Logistic function 1 / (1 + e^-x), device-side.
__device__ float sigmoid(float x) {
    const float denominator = 1.0f + exp(-x);
    return 1.0f / denominator;
}

namespace nvinfer1 {
// Builds a plugin from explicit parameters. The `strides` array is copied into storage owned by the
// plugin (released in the destructor), so the caller keeps ownership of its own array.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth,
                                 int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb,
                                 const int* strides, int stridesLength) {
    // Scalar configuration.
    mClassCount = classCount;
    mNumberofpoints = numberofpoints;
    mConfthreshkeypoints = confthreshkeypoints;
    mYoloV8NetWidth = netWidth;
    mYoloV8netHeight = netHeight;
    mMaxOutObject = maxOut;

    // Private copy of the stride table.
    mStridesLength = stridesLength;
    mStrides = new int[stridesLength];
    for (int i = 0; i < stridesLength; ++i) {
        mStrides[i] = strides[i];
    }

    // Task flags.
    is_segmentation_ = is_segmentation;
    is_pose_ = is_pose;
    is_obb_ = is_obb;
}

// Releases the stride table owned by the plugin.
YoloLayerPlugin::~YoloLayerPlugin() {
    // delete[] on a null pointer is a no-op, so no explicit null check is needed.
    delete[] mStrides;
    mStrides = nullptr;
}

// Rebuilds a plugin from the byte stream produced by serialize(). Fields are read in exactly the order
// serialize() writes them; the stride table is allocated here from the stored length.
// In debug builds, asserts that exactly `length` bytes were consumed.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    const char* const begin = static_cast<const char*>(data);
    const char* cursor = begin;

    Tn::read(cursor, mClassCount);
    Tn::read(cursor, mNumberofpoints);
    Tn::read(cursor, mConfthreshkeypoints);
    Tn::read(cursor, mThreadCount);
    Tn::read(cursor, mYoloV8NetWidth);
    Tn::read(cursor, mYoloV8netHeight);
    Tn::read(cursor, mMaxOutObject);

    Tn::read(cursor, mStridesLength);
    mStrides = new int[mStridesLength];
    for (int i = 0; i < mStridesLength; ++i) {
        Tn::read(cursor, mStrides[i]);
    }

    Tn::read(cursor, is_segmentation_);
    Tn::read(cursor, is_pose_);
    Tn::read(cursor, is_obb_);

    assert(cursor == begin + length);
}

// Writes the plugin state into `buffer`, which must hold at least getSerializationSize() bytes.
// The field order here defines the format read back by the deserializing constructor.
// In debug builds, asserts that exactly getSerializationSize() bytes were written.
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    char* const begin = static_cast<char*>(buffer);
    char* cursor = begin;

    Tn::write(cursor, mClassCount);
    Tn::write(cursor, mNumberofpoints);
    Tn::write(cursor, mConfthreshkeypoints);
    Tn::write(cursor, mThreadCount);
    Tn::write(cursor, mYoloV8NetWidth);
    Tn::write(cursor, mYoloV8netHeight);
    Tn::write(cursor, mMaxOutObject);

    Tn::write(cursor, mStridesLength);
    for (int i = 0; i < mStridesLength; ++i) {
        Tn::write(cursor, mStrides[i]);
    }

    Tn::write(cursor, is_segmentation_);
    Tn::write(cursor, is_pose_);
    Tn::write(cursor, is_obb_);

    assert(cursor == begin + getSerializationSize());
}

// Number of bytes serialize() writes: every scalar field, the stride count, one int per stride,
// and the three task flags.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    size_t total = 0;
    total += sizeof(mClassCount);
    total += sizeof(mNumberofpoints);
    total += sizeof(mConfthreshkeypoints);
    total += sizeof(mThreadCount);
    total += sizeof(mYoloV8netHeight);
    total += sizeof(mYoloV8NetWidth);
    total += sizeof(mMaxOutObject);
    total += sizeof(mStridesLength);
    total += sizeof(int) * mStridesLength;
    total += sizeof(is_segmentation_);
    total += sizeof(is_pose_);
    total += sizeof(is_obb_);
    return total;
}

// No initialization work is performed; always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// The output shape is (N + 1, 1, 1), where N is the number of floats needed for mMaxOutObject Detection
// records; the kernel uses the one extra leading float as the detection counter. The arguments are ignored.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    const int detection_floats = mMaxOutObject * sizeof(Detection) / sizeof(float);
    const int with_counter = detection_floats + 1;
    return nvinfer1::Dims3(with_counter, 1, 1);
}

// Stores the given pointer as the plugin namespace. Only the pointer is kept (the string is not copied),
// so the caller's string must stay valid for as long as it may be read back.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Returns the pointer last stored by setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The output is always kFLOAT, regardless of the output index or the input types.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                                      int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// Always false: no output is reported as broadcast across the batch. The arguments are ignored.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {

    return false;
}

// Always false: no input is reported as broadcastable across the batch. The argument is ignored.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {

    return false;
}

// Intentionally empty: nothing is derived from the tensor descriptions.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}

// Intentionally empty: none of the supplied context handles is stored or used.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}

// Intentionally empty: attachToContext() stores nothing, so there is nothing to release.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Returns the plugin type string, "YoloLayer_TRT".
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {

    return "YoloLayer_TRT";
}

// Returns the plugin version string, "1".
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Deletes this plugin object; the instance must not be used after this call.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Returns a newly allocated plugin constructed from this plugin's parameters, with the same plugin
// namespace pointer. The stride table is copied by the constructor, so the clone owns its own copy.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    auto* copy = new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth,
                                     mYoloV8netHeight, mMaxOutObject, is_segmentation_, is_pose_, is_obb_, mStrides,
                                     mStridesLength);
    copy->setPluginNamespace(mPluginNamespace);
    return copy;
}

// Runs the decode on `stream` for `batchSize` samples.
//
// The parameter qualifiers match the declaration in yololayer.h exactly
// (`const void* const* inputs`, `void* TRT_CONST_ENQUEUE* outputs`). The
// previous definition had the two qualifiers swapped, which only compiles when
// TRT_CONST_ENQUEUE expands to `const`.
//
// inputs    - one device pointer per detection head, read as float arrays.
// outputs   - outputs[0] is the float buffer that receives the detections.
// workspace - unused (getWorkspaceSize() reports 0).
// Returns 0 unconditionally.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    (void)workspace;
    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
    return 0;
}

// Logistic function 1 / (1 + e^-x), evaluated in single precision on the device.
__device__ float Logist(float data) {
    const float denominator = 1.0f + expf(-data);
    return 1.0f / denominator;
}

// Decodes one detection head: each thread handles one grid cell of one batch item.
//
// input        - head tensor; the value of channel c for cell e of batch b is read at
//                input[b * total_grid * info_len + e + c * total_grid].
// output       - per batch item, `outputElem` floats: the first float is a running
//                detection counter, followed by packed Detection records.
// numElements  - number of threads that do work (callers pass grid_h * grid_w * batch).
// maxoutobject - maximum number of Detection records written per batch item.
// grid_h/grid_w/stride - grid size of this head and the factor used to scale cell
//                coordinates.
// classes      - number of class-score channels following the 4 box channels.
// nk / confkeypoints - keypoint count and keypoint confidence threshold (pose only).
// is_segmentation / is_pose / is_obb - select which extra channels exist and are decoded.
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
                             int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem,
                             bool is_segmentation, bool is_pose, bool is_obb) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= numElements)
        return;

    const int N_kpts = nk;
    int total_grid = grid_h * grid_w;
    // Channels per cell: 4 box values, class scores, then the optional mask / keypoint / angle channels.
    int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0);
    int batchIdx = idx / total_grid;
    int elemIdx = idx % total_grid;
    const float* curInput = input + batchIdx * total_grid * info_len;
    int outputIdx = batchIdx * outputElem;

    // Pick the class with the highest logistic-activated score.
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 4; i < 4 + classes; i++) {
        float p = Logist(curInput[elemIdx + i * total_grid]);
        if (p > max_cls_prob) {
            max_cls_prob = p;
            class_id = i - 4;
        }
    }

    // Hard-coded pre-filter: cells whose best class score is below 0.1 produce no detection.
    if (max_cls_prob < 0.1)
        return;

    // Reserve a slot by bumping the per-batch counter stored in the first float.
    // The counter is incremented even when the slot is rejected below, so its
    // final value can exceed maxoutobject.
    int count = (int)atomicAdd(output + outputIdx, 1);
    if (count >= maxoutobject)
        return;
    // Records start right after the counter float.
    char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
    Detection* det = (Detection*)(data);

    int row = elemIdx / grid_w;
    int col = elemIdx % grid_w;

    det->conf = max_cls_prob;
    det->class_id = class_id;
    // Channels 0..3 are offsets from the cell centre (col + 0.5, row + 0.5):
    // 0 and 1 are subtracted, 2 and 3 are added, then everything is scaled by stride.
    det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
    det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
    det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
    det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;

    if (is_segmentation) {
        // Copy the 32 mask values; their channel offset skips any pose / obb channels.
        for (int k = 0; k < 32; ++k) {
            det->mask[k] =
                    curInput[elemIdx + (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid];
        }
    }

    if (is_pose) {
        for (int kpt = 0; kpt < N_kpts; kpt++) {
            // Each keypoint occupies three consecutive channels: x, y, confidence.
            int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid;
            int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid;
            int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 2) * total_grid;

            // NOTE(review): `sigmoid` is defined outside this block; presumably the same
            // logistic function as Logist() above - confirm.
            float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);

            float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
            float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;

            bool is_within_bbox =
                    kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];

            // Keypoints that are low-confidence or fall outside the box are flagged with -1.
            if (kpt_confidence < confkeypoints || !is_within_bbox) {
                det->keypoints[kpt * 3] = -1;
                det->keypoints[kpt * 3 + 1] = -1;
                det->keypoints[kpt * 3 + 2] = -1;
            } else {
                det->keypoints[kpt * 3] = kpt_x;
                det->keypoints[kpt * 3 + 1] = kpt_y;
                det->keypoints[kpt * 3 + 2] = kpt_confidence;
            }
        }
    }

    if (is_obb) {
        // `pi` is a double, so the angle and the trigonometry below are evaluated in double precision.
        double pi = CV_PI;
        auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) +
                                             0) * total_grid];
        // Maps the activated angle channel to (sigmoid - 0.25) * pi.
        auto angle = (sigmoid(angle_inx) - 0.25f) * pi;

        auto cos1 = cos(angle);
        auto sin1 = sin(angle);
        // Half the difference of opposite offsets = centre displacement before rotation.
        auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2;
        auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2;

        // Rotate the displacement by `angle`.
        auto x = xf * cos1 - yf * sin1;
        auto y = xf * sin1 + yf * cos1;

        float cx = (col + 0.5f + x) * stride;
        float cy = (row + 0.5f + y) * stride;

        float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride;
        float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride;
        // For oriented boxes bbox[] is overwritten with centre x, centre y, width, height.
        det->bbox[0] = cx;
        det->bbox[1] = cy;
        det->bbox[2] = w1;
        det->bbox[3] = h1;
        det->angle = angle;
    }
}

void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                                 int mYoloV8NetWidth, int batchSize) {
    int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
    cudaMemsetAsync(output, 0, sizeof(float), stream);
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }
    int numElem = 0;

    //    const int maxGrids = mStridesLength;
    //    int grids[maxGrids][2];
    //    for (int i = 0; i < maxGrids; ++i) {
    //        grids[i][0] = mYoloV8netHeight / mStrides[i];
    //        grids[i][1] = mYoloV8NetWidth / mStrides[i];
    //    }

    int maxGrids = mStridesLength;
    int flatGridsLen = 2 * maxGrids;
    int* flatGrids = new int[flatGridsLen];

    for (int i = 0; i < maxGrids; ++i) {
        flatGrids[2 * i] = mYoloV8netHeight / mStrides[i];
        flatGrids[2 * i + 1] = mYoloV8NetWidth / mStrides[i];
    }

    for (unsigned int i = 0; i < maxGrids; i++) {
        // Access the elements of the original 2D array from the flattened 1D array
        int grid_h = flatGrids[2 * i];      // Corresponds to the access of grids[i][0]
        int grid_w = flatGrids[2 * i + 1];  // Corresponds to the access of grids[i][1]
        int stride = mStrides[i];
        numElem = grid_h * grid_w * batchSize;  // Calculate the total number of elements
        if (numElem < mThreadCount)             // Adjust the thread count if needed
            mThreadCount = numElem;

        // The CUDA kernel call remains unchanged
        CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>(
                inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints,
                mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_, is_obb_);
    }

    delete[] flatGrids;
}

// Out-of-class definitions of the creator's static members. mFC is
// value-initialized; the constructor points it at mPluginAttributes.
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// Empties the shared attribute list and makes the shared field collection
// describe it, so it ends up reporting zero fields.
YoloPluginCreator::YoloPluginCreator() {
    auto& attributes = mPluginAttributes;
    attributes.clear();
    mFC.nbFields = attributes.size();
    mFC.fields = attributes.data();
}

// Name under which this creator is looked up; identical to YoloLayerPlugin::getPluginType().
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Version string of the plugin this creator produces.
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Hands out the address of the shared static field collection.
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    const PluginFieldCollection* collection = &mFC;
    return collection;
}

// Creates a YoloLayerPlugin from a single int32 field named "combinedInfo".
//
// Layout of the field data (ints):
//   [0] class count          [1] number of keypoints   [2] keypoint confidence threshold
//   [3] input width          [4] input height          [5] max output objects
//   [6] is_segmentation      [7] is_pose               [8] is_obb
//   [9..] stride values, one per detection head
//
// name - unused.
// fc   - field collection; must hold exactly one field as described above.
// Returns the new plugin (namespace set from this creator), or nullptr when `fc`
// is malformed. The checks are real runtime checks rather than asserts so that
// they still protect builds compiled with NDEBUG.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    (void)name;
    const int netinfo_count = 9;

    if (fc == nullptr || fc->nbFields != 1 || fc->fields == nullptr) {
        return nullptr;
    }
    const PluginField& field = fc->fields[0];
    if (field.name == nullptr || strcmp(field.name, "combinedInfo") != 0) {
        return nullptr;
    }
    // The fixed header must be fully present, otherwise the reads below run off
    // the buffer and the stride count would become negative.
    if (field.data == nullptr || field.length < netinfo_count) {
        return nullptr;
    }

    const int* combinedInfo = static_cast<const int*>(field.data);
    int class_count = combinedInfo[0];
    int numberofpoints = combinedInfo[1];
    // NOTE(review): the threshold travels through an int slot, so only its
    // integer value reaches this point.
    float confthreshkeypoints = combinedInfo[2];
    int input_w = combinedInfo[3];
    int input_h = combinedInfo[4];
    int max_output_object_count = combinedInfo[5];
    bool is_segmentation = combinedInfo[6];
    bool is_pose = combinedInfo[7];
    bool is_obb = combinedInfo[8];
    const int* px_arry = combinedInfo + netinfo_count;
    int px_arry_length = field.length - netinfo_count;

    YoloLayerPlugin* obj =
            new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h,
                                max_output_object_count, is_segmentation, is_pose, is_obb, px_arry, px_arry_length);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

// Rebuilds a plugin from its serialized bytes and tags it with this creator's
// namespace. `name` is not used.
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) TRT_NOEXCEPT {
    // The returned object is released later through YoloLayerPlugin::destroy()
    // when the network that owns it is destroyed.
    auto* plugin = new YoloLayerPlugin(serialData, serialLength);
    const char* ns = mNamespace.c_str();
    plugin->setPluginNamespace(ns);
    return plugin;
}

}  // namespace nvinfer1


================================================
FILE: yolo11/plugin/yololayer.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"

namespace nvinfer1 {
// TensorRT plugin layer (IPluginV2IOExt) with a single FP32, linear-format
// output. The decode work itself is done by the private forwardGpu().
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    // Builds a plugin from explicit settings: class count, keypoint count and
    // keypoint confidence threshold, network input size, maximum number of
    // output objects, the task flags, and the `stridesLength` stride values.
    YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight,
                    int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength);

    // Builds a plugin from `length` bytes previously produced by serialize().
    YoloLayerPlugin(const void* data, size_t length);

    ~YoloLayerPlugin();

    // The layer has exactly one output.
    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

    int initialize() TRT_NOEXCEPT override;

    // Nothing to release.
    virtual void terminate() TRT_NOEXCEPT override {}

    // No scratch workspace is requested.
    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

    virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;

    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

    // Every input and output must be linear-format FP32.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }

    const char* getPluginType() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    void destroy() TRT_NOEXCEPT override;

    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

    const char* getPluginNamespace() const TRT_NOEXCEPT override;

    // Marked `override` like every other overridden virtual in this class, so a
    // signature drift against the base interface is caught at compile time.
    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;

    void detachFromContext() TRT_NOEXCEPT override;

   private:
    // Launches the decode kernels on `stream`. The two size parameters are named
    // like (and therefore shadow) the corresponding members.
    void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                    int mYoloV8NetWidth, int batchSize);

    int mThreadCount = 256;  // upper bound for threads per block in kernel launches
    // Non-owning pointer set via setPluginNamespace(); defaults to an empty
    // string so getPluginNamespace() never returns an indeterminate pointer.
    const char* mPluginNamespace = "";
    int mClassCount;
    int mNumberofpoints;
    float mConfthreshkeypoints;
    int mYoloV8NetWidth;
    int mYoloV8netHeight;
    int mMaxOutObject;
    bool is_segmentation_;
    bool is_pose_;
    bool is_obb_;
    int* mStrides;  // one stride per detection head, mStridesLength entries
    int mStridesLength;
};

// Creator (factory) for YoloLayerPlugin objects; holds the namespace string that
// is handed to every plugin it produces.
class API YoloPluginCreator : public IPluginCreator {
   public:
    YoloPluginCreator();

    ~YoloPluginCreator() override = default;

    // Identification of the plugin this creator produces.
    const char* getPluginName() const TRT_NOEXCEPT override;
    const char* getPluginVersion() const TRT_NOEXCEPT override;

    // Field description shared by all creator instances.
    const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

    // Builds a plugin from a field collection.
    IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;

    // Builds a plugin from previously serialized bytes.
    IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                      size_t serialLength) TRT_NOEXCEPT override;

    // Copies the given namespace string into this creator.
    void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override {
        mNamespace = libNamespace;
    }

    // Returns the stored namespace as a C string.
    const char* getPluginNamespace() const TRT_NOEXCEPT override {
        return mNamespace.c_str();
    }

   private:
    std::string mNamespace;
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};

REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
}  // namespace nvinfer1


================================================
FILE: yolo11/readme.md
================================================
## Introduction

Yolo11 model supports TensorRT-8.

Training code [link](https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.3.38.zip)

## Environment

* cuda 11.8
* cudnn 8.9.1.23
* tensorrt 8.6.1.6
* opencv 4.8.0
* ultralytics 8.3.0

## Support

* [x] YOLO11-det support FP32/FP16/INT8 and Python/C++ API
* [x] YOLO11-cls support FP32/FP16/INT8 and Python/C++ API
* [x] YOLO11-seg support FP32/FP16/INT8 and Python/C++ API
* [x] YOLO11-pose support FP32/FP16/INT8 and Python/C++ API
* [x] YOLO11-obb support FP32/FP16/INT8 and Python/C++ API

## Config

* Choose the YOLO11 sub-model n/s/m/l/x from command line arguments.
* Other configs please check [src/config.h](src/config.h)

## Build and Run

1. Generate the .wts file from the PyTorch .pt weights, or download a .wts file from the model zoo

```shell
# Download ultralytics
wget https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.3.0.zip -O ultralytics-8.3.0.zip
# Unzip ultralytics
unzip ultralytics-8.3.0.zip
cd ultralytics-8.3.0
# Download models
wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt -O yolo11n.pt
wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-cls.pt -O yolo11n-cls.pt
wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-seg.pt -O yolo11n-seg.pt
wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-pose.pt -O yolo11n-pose.pt
wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-obb.pt -O yolo11n-obb.pt
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolo11/gen_wts.py .
python gen_wts.py -w yolo11n.pt -o yolo11n.wts -t detect
python gen_wts.py -w yolo11n-cls.pt -o yolo11n-cls.wts -t cls
python gen_wts.py -w yolo11n-seg.pt -o yolo11n-seg.wts -t seg
python gen_wts.py -w yolo11n-pose.pt -o yolo11n-pose.wts -t pose
python gen_wts.py -w yolo11n-obb.pt -o yolo11n-obb.wts -t obb
# A file 'yolo11n.wts' will be generated.
```

2. build tensorrtx/yolo11 and run
```shell
cd [PATH-TO-TENSORRTX]/yolo11
mkdir build
cd build
cmake ..
make
```

### Detection
```shell
cp [PATH-TO-ultralytics]/yolo11n.wts .
# Build and serialize TensorRT engine
./yolo11_det -s yolo11n.wts yolo11n.engine [n/s/m/l/x]
# Run inference
./yolo11_det -d yolo11n.engine ../images [c/g]
# results saved in build directory
```

### Classification
```shell
cp [PATH-TO-ultralytics]/yolo11n-cls.wts .
# Build and serialize TensorRT engine
./yolo11_cls -s yolo11n-cls.wts yolo11n-cls.engine [n/s/m/l/x]
# Download ImageNet labels
wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt
# Run inference
./yolo11_cls -d yolo11n-cls.engine ../images
```

### Segmentation
```shell
cp [PATH-TO-ultralytics]/yolo11n-seg.wts .
# Build and serialize TensorRT engine
./yolo11_seg -s yolo11n-seg.wts yolo11n-seg.engine [n/s/m/l/x]
# Download the labels file
wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt
# Run inference
./yolo11_seg -d yolo11n-seg.engine ../images c coco.txt
```

### Pose
```shell
cp [PATH-TO-ultralytics]/yolo11n-pose.wts .
# Build and serialize TensorRT engine
./yolo11_pose -s yolo11n-pose.wts yolo11n-pose.engine [n/s/m/l/x]
# Run inference
./yolo11_pose -d yolo11n-pose.engine ../images
```

### Obb
```shell
cp [PATH-TO-ultralytics]/yolo11n-obb.wts .
# Build and serialize TensorRT engine
./yolo11_obb -s yolo11n-obb.wts yolo11n-obb.engine [n/s/m/l/x]
# Download the image
wget -O P0015.png https://github.com/mpj1234/YOLO11-series-TensorRT8/releases/download/images/P0015.png
mv P0015.png ../images
# Run inference
./yolo11_obb -d yolo11n-obb.engine ../images
```

3. Optional, load and run the tensorrt model in Python
```shell
# Install python-tensorrt, pycuda, etc.
# Make sure yolo11n.engine and libmyplugins.so have been built
python yolo11_det_trt.py ./build/yolo11n.engine ./build/libmyplugins.so
# FAQ: if this fails with `pycuda._driver.LogicError` on Windows
# or with a segmentation fault on Linux,
# add the following imports to the .py file:
# import pycuda.autoinit
# import pycuda.driver as cuda
```

## INT8 Quantization
1. Prepare calibration images: you can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh
2. unzip it in yolo11/build
3. set the macro `USE_INT8` in src/config.h and make again
4. serialize the model and test

## More Information
See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: yolo11/src/block.cpp
================================================
#include "block.h"
#include <assert.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include "config.h"
#include "model.h"
#include "yololayer.h"

// Reads a .wts text file into a name -> Weights map.
//
// File format as parsed here: a decimal entry count, then for every entry a
// name, a decimal element count and that many hexadecimal 32-bit words.
//
// file - path of the .wts file.
// Returns the map; each entry's `values` points at a malloc'ed buffer that this
// function does not free.
//
// Fix over the previous version: the buffer was sized with sizeof(val), i.e. the
// size of a pointer, instead of the size of one 32-bit element. `count` and
// `size` are also initialized now so a failed read cannot leave them indeterminate.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> WeightMap;

    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        std::string name;
        input >> name >> std::dec >> size;

        // One 32-bit word per weight; the words are the raw bit patterns read as hex.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0; x < size; x++) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        WeightMap[name] = wt;
    }
    return WeightMap;
}

// Adds a per-channel scale layer computing scale * x + shift with
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
// and a power of 1 for every channel. The parameters are read from
// `lname` + ".weight" / ".bias" / ".running_mean" / ".running_var".
//
// NOTE: weightMap is received by value, so the ".scale"/".shift"/".power"
// entries stored below land in the local copy only.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps) {
    float* gamma = (float*)weightMap[lname + ".weight"].values;
    float* beta = (float*)weightMap[lname + ".bias"].values;
    float* mean = (float*)weightMap[lname + ".running_mean"].values;
    float* var = (float*)weightMap[lname + ".running_var"].values;
    int channels = weightMap[lname + ".running_var"].count;

    const size_t bytes = sizeof(float) * channels;
    float* scaleValues = reinterpret_cast<float*>(malloc(bytes));
    float* shiftValues = reinterpret_cast<float*>(malloc(bytes));
    float* powerValues = reinterpret_cast<float*>(malloc(bytes));

    // Fill all three per-channel arrays in a single pass.
    for (int c = 0; c < channels; ++c) {
        scaleValues[c] = gamma[c] / sqrt(var[c] + eps);
        shiftValues[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        powerValues[c] = 1.0;
    }

    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scaleValues, channels};
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shiftValues, channels};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, powerValues, channels};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    nvinfer1::IScaleLayer* bn = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn);
    return bn;
}

// Convolution (no bias) -> batch norm -> x * sigmoid(x).
//
// ch    - number of output channels.
// k     - kernel size as {height, width}; padding is k / 2 per dimension.
// s     - stride, applied to both dimensions.
// lname - weight-name prefix; uses `lname`.conv.weight and `lname`.bn.*.
// Returns the element-wise product layer that closes the block.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname) {
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]},
                                           weightMap[lname + ".conv.weight"], noBias);
    assert(conv);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});

    // "Same"-style automatic padding: half the kernel size in each dimension.
    const int padH = k[0] / 2;
    const int padW = k[1] / 2;
    conv->setPaddingNd(nvinfer1::DimsHW{padH, padW});

    nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);

    // Multiply the normalized tensor by its own sigmoid.
    auto* gate = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
    auto* product =
            network->addElementWise(*bn->getOutput(0), *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(product);
    return product;
}

// Two stacked convBnSiLU blocks (`lname`.cv1 with kernel k1, `lname`.cv2 with
// kernel k2), both stride 1. The hidden width is c2 * e, truncated to int.
// When `shortcut` is set and c1 == c2, the input is added to the result.
static nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int c1, int c2, bool shortcut, std::vector<int> k1, std::vector<int> k2, float e,
                                    std::string lname) {
    const int hidden = (int)((float)c2 * e);
    auto* first = convBnSiLU(network, weightMap, input, hidden, k1, 1, lname + ".cv1");
    auto* second = convBnSiLU(network, weightMap, *first->getOutput(0), c2, k2, 1, lname + ".cv2");

    const bool addResidual = shortcut && c1 == c2;
    if (!addResidual) {
        return second;
    }
    return network->addElementWise(input, *second->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// SPPF block: a 1x1 convBnSiLU down to c1 / 2 channels, three chained k x k max
// pools (stride 1, padding k / 2), concatenation of the conv output with the
// three pool outputs, then a 1x1 convBnSiLU to c2 channels.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname) {
    const int hidden = c1 / 2;
    nvinfer1::IElementWiseLayer* reduce = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv1");

    // stages[0] is the conv output; stages[i] is the i-th pool applied to stages[i - 1].
    const int kStages = 4;
    nvinfer1::ITensor* stages[kStages];
    stages[0] = reduce->getOutput(0);
    for (int i = 1; i < kStages; ++i) {
        nvinfer1::IPoolingLayer* pool =
                network->addPoolingNd(*stages[i - 1], nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
        pool->setStrideNd(nvinfer1::DimsHW{1, 1});
        pool->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
        stages[i] = pool->getOutput(0);
    }

    nvinfer1::IConcatenationLayer* merged = network->addConcatenation(stages, kStages);
    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

// DFL head: reshape the input to (kBatchSize, 4, 16, grid), swap axes 1 and 2,
// apply softmax with axes mask 1 << 1, run a bias-free 1x1 convolution with
// the weights named `lname` (stride s, padding p), and reshape the result to
// (kBatchSize, 4, grid). `ch` and `k` are not used.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
    auto* toBins = network->addShuffle(input);
    toBins->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid});
    toBins->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3});

    auto* probs = network->addSoftMax(*toBins->getOutput(0));
    probs->setAxes(1 << 1);

    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* project =
            network->addConvolutionNd(*probs->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], noBias);
    project->setStrideNd(nvinfer1::DimsHW{s, s});
    project->setPaddingNd(nvinfer1::DimsHW{p, p});

    auto* flatten = network->addShuffle(*project->getOutput(0));
    flatten->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid});
    return flatten;
}

// Creates the "YoloLayer_TRT" plugin and appends it to `network`.
//
// dets        - concatenation layers whose first outputs become the plugin inputs.
// px_arry     - `px_arry_num` values appended after the fixed header.
// is_segmentation / is_pose / is_obb - task flags forwarded to the plugin; pose
//               and obb also select the class count, obb the input size.
// Returns the plugin layer that was added to the network.
//
// The plugin receives everything through one int32 field "combinedInfo":
// 9 header values followed by the px_arry values.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    // The creator is dereferenced below, so fail loudly if the lookup returned nothing.
    assert(creator && "YoloLayer_TRT plugin creator is not registered");

    const int netinfo_count = 9;                          // number of fixed header values
    const int total_count = netinfo_count + px_arry_num;  // header + px_arry values

    std::vector<int> combinedInfo(total_count);
    int class_num = kNumClass;
    if (is_pose)
        class_num = kPoseNumClass;
    else if (is_obb)
        class_num = kObbNumClass;
    int input_w = kInputW;
    if (is_obb)
        input_w = kObbInputW;
    int input_h = kInputH;
    if (is_obb)
        input_h = kObbInputH;

    // Fill in the 9 header values.
    combinedInfo[0] = class_num;
    combinedInfo[1] = kNumberOfPoints;
    // NOTE(review): this slot is an int; if kConfThreshKeypoints is a fractional
    // floating-point constant it is truncated here - confirm against config.h.
    combinedInfo[2] = kConfThreshKeypoints;
    combinedInfo[3] = input_w;
    combinedInfo[4] = input_h;
    combinedInfo[5] = kMaxNumOutputBbox;
    combinedInfo[6] = is_segmentation;
    combinedInfo[7] = is_pose;
    combinedInfo[8] = is_obb;

    // Append the px_arry values after the header.
    std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);

    // Wrap the combined array in a single plugin field.
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = combinedInfo.size();

    nvinfer1::PluginFieldCollection pluginFieldCollection;
    pluginFieldCollection.nbFields = 1;
    pluginFieldCollection.fields = &pluginField;

    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    assert(pluginObject && "failed to create the YoloLayer_TRT plugin");

    // The first output of every detection branch feeds the plugin.
    std::vector<nvinfer1::ITensor*> inputTensors;
    inputTensors.reserve(dets.size());
    for (auto det : dets) {
        inputTensors.push_back(det->getOutput(0));
    }

    nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject);
    assert(yoloLayer);

    return yoloLayer;
}

// C3k block: two parallel 1x1 convBnSiLU branches (`lname`.cv1 and `lname`.cv2)
// of width c2 * e; the cv1 branch passes through n bottlenecks
// (`lname`.m.<i>, expansion 1.0, kernels k1/k2); both branches are concatenated
// and fused by a 1x1 convBnSiLU (`lname`.cv3) to c2 channels. `c1` is not used.
static nvinfer1::ILayer* C3k(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, std::vector<int> k1,
                             std::vector<int> k2, float e, std::string lname) {
    const int hidden = (int)((float)c2 * e);
    auto mainBranch = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv1");
    auto sideBranch = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv2");

    // Chain the bottlenecks on the main branch.
    nvinfer1::ITensor* current = mainBranch->getOutput(0);
    for (int i = 0; i < n; ++i) {
        const std::string stageName = lname + ".m." + std::to_string(i);
        auto stage = bottleneck(network, weightMap, *current, hidden, hidden, shortcut, k1, k2, 1.0, stageName);
        current = stage->getOutput(0);
    }

    nvinfer1::ITensor* toMerge[] = {current, sideBranch->getOutput(0)};
    auto merged = network->addConcatenation(toMerge, 2);

    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv3");
}

// C3K2 block: a 1x1 convBnSiLU (`lname`.cv1) to 2 * (c2 * e) channels is sliced
// into two halves along axis 1; the second half runs through n stages
// (`lname`.m.<i>), each either a C3k (when `c3k`) or a bottleneck, and every
// stage output is concatenated onto the running tensor. A final 1x1 convBnSiLU
// (`lname`.cv2) maps the result to c2 channels.
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int n, bool c3k, bool shortcut, float e, std::string lname) {
    const int hidden = (float)c2 * e;

    nvinfer1::IElementWiseLayer* expand = convBnSiLU(network, weightMap, input, 2 * hidden, {1, 1}, 1, lname + ".cv1");
    nvinfer1::ITensor* expanded = expand->getOutput(0);
    const nvinfer1::Dims dims = expanded->getDimensions();

    // Both slices share the same extent and unit stride; only the start differs.
    const nvinfer1::Dims4 halfExtent{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]};
    const nvinfer1::Dims4 unitStride{1, 1, 1, 1};
    nvinfer1::ISliceLayer* firstHalf =
            network->addSlice(*expanded, nvinfer1::Dims4{0, 0, 0, 0}, halfExtent, unitStride);
    nvinfer1::ISliceLayer* secondHalf =
            network->addSlice(*expanded, nvinfer1::Dims4{0, dims.d[1] / 2, 0, 0}, halfExtent, unitStride);

    nvinfer1::ITensor* halves[] = {firstHalf->getOutput(0), secondHalf->getOutput(0)};
    nvinfer1::IConcatenationLayer* running = network->addConcatenation(halves, 2);

    nvinfer1::ITensor* current = secondHalf->getOutput(0);
    for (int i = 0; i < n; ++i) {
        const std::string stageName = lname + ".m." + std::to_string(i);
        nvinfer1::ILayer* stage =
                c3k ? C3k(network, weightMap, *current, hidden, hidden, 2, shortcut, {3, 3}, {3, 3}, 0.5, stageName)
                    : bottleneck(network, weightMap, *current, hidden, hidden, shortcut, {3, 3}, {3, 3}, 0.5,
                                 stageName);
        current = stage->getOutput(0);

        // Append this stage's output to everything gathered so far.
        nvinfer1::ITensor* grown[] = {running->getOutput(0), stage->getOutput(0)};
        running = network->addConcatenation(grown, 2);
    }

    return convBnSiLU(network, weightMap, *running->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

// Bias-free k x k convolution (stride s, padding k / 2, g groups) followed by
// batch norm; no activation. Uses `lname`.conv.weight and `lname`.bn.*.
// Returns the batch-norm scale layer.
static nvinfer1::ILayer* convBn(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int ch,
                                int k, int s, std::string lname, int g = 1) {
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"],
                                           noBias);
    assert(conv);

    const int pad = k / 2;
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{pad, pad});
    conv->setNbGroups(g);

    return addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
}

// Builds the multi-head attention sub-graph used by PSABlock.
//
// Layers are added in this order: a 1x1 convBn ("<lname>.qkv"), a reshape to
// (B, num_heads, -1, H*W), three slices for q / k / v, q^T @ k, a uniform scale by
// key_dim^-0.5, a softmax over the last axis, v @ attn^T, a reshape back to (B, -1, H, W),
// a 3x3 grouped convBn ("<lname>.pe", groups == dim) on the reshaped v, an elementwise sum,
// and a final 1x1 convBn ("<lname>.proj").
//
// network    : network definition the layers are appended to.
// weightMap  : weights keyed by layer name, forwarded to convBn.
// input      : 4-D tensor; d[0], d[2] and d[3] are read as B, H and W.
// dim        : channel count of the attention output.
// num_heads  : number of attention heads; must be positive.
// attn_ratio : ratio of key dimension to head dimension.
// lname      : weight-name prefix for this module.
//
// Returns the ".proj" convBn layer.
static nvinfer1::ILayer* Attention(nvinfer1::INetworkDefinition* network,
                                   std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                   int dim, int num_heads, float attn_ratio, std::string lname) {
    assert(num_heads > 0);  // guards the integer division below
    int head_dim = dim / num_heads;
    int key_dim = head_dim * attn_ratio;
    float scale = pow(key_dim, -0.5);
    int nh_kd = key_dim * num_heads;
    int h = dim + nh_kd * 2;

    auto d = input.getDimensions();
    int B = d.d[0];
    int H = d.d[2];
    int W = d.d[3];
    int N = H * W;
    auto* qkv = convBn(network, weightMap, input, h, 1, 1, lname + ".qkv");
    // qkv.view(B, self.num_heads, -1, N)
    auto shuffle = network->addShuffle(*qkv->getOutput(0));
    shuffle->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, -1, N});
    // q, k, v = .split([self.key_dim, self.key_dim, self.head_dim], dim=2)
    auto d1 = shuffle->getOutput(0)->getDimensions();
    auto q = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0},
                               nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    auto k = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim, 0},
                               nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    auto v = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim * 2, 0},
                               nvinfer1::Dims4{d1.d[0], d1.d[1], head_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    // attn = ((q.transpose(-2, -1) @ k) * self.scale)
    auto qT = network->addShuffle(*q->getOutput(0));
    qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});
    auto matmul = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0),
                                             nvinfer1::MatrixOperation::kNONE);
    // Backing storage for the scale-layer parameters. The buffers referenced by Weights must
    // stay valid while the network is built, so they live in static storage instead of being
    // malloc'd per call and never freed. Shift (0) and power (1) are constants; the scale is a
    // pure function of key_dim, so one map entry per distinct key_dim suffices (std::map
    // never relocates its elements). The map is not synchronized: network construction is
    // assumed to be single-threaded.
    static const float kZeroShift = 0.0f;
    static const float kUnitPower = 1.0f;
    static std::map<int, float> scale_by_key_dim;
    const float* scale_val = &scale_by_key_dim.emplace(key_dim, scale).first->second;
    nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1};
    nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, &kZeroShift, 1};
    nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, &kUnitPower, 1};
    nvinfer1::IScaleLayer* scaleLayer =
            network->addScale(*matmul->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w);
    // attn = attn.softmax(dim=-1)
    nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*scaleLayer->getOutput(0));
    softmax->setAxes(1 << 3);  // bit 3 selects the last of the four axes
    // x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) + self.pe(v.reshape(B, -1, H, W))
    auto attnT = network->addShuffle(*softmax->getOutput(0));
    attnT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});
    auto matmul2 = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attnT->getOutput(0),
                                              nvinfer1::MatrixOperation::kNONE);
    auto reshape = network->addShuffle(*matmul2->getOutput(0));
    reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W});
    auto v_reshape = network->addShuffle(*v->getOutput(0));
    v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W});
    // self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)
    auto pe = convBn(network, weightMap, *v_reshape->getOutput(0), dim, 3, 1, lname + ".pe", dim);
    auto sum = network->addElementWise(*reshape->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    // x = self.proj(x)
    // self.proj = Conv(dim, dim, 1, act=False)
    auto proj = convBn(network, weightMap, *sum->getOutput(0), dim, 1, 1, lname + ".proj");
    return proj;
}

// Builds one PSA block: an attention stage followed by a two-layer 1x1 feed-forward stage.
// When `shortcut` is true each stage is wrapped in a residual sum (x = x + stage(x));
// otherwise the stage output is used directly.
//
// network    : network definition the layers are appended to.
// weightMap  : weights keyed by layer name, forwarded to the helper builders.
// input      : block input tensor.
// dim        : channel count of the block; the feed-forward stage expands to dim * 2.
// attn_ratio : forwarded to Attention.
// num_heads  : forwarded to Attention.
// shortcut   : enables the residual connections.
// lname      : weight-name prefix (".attn", ".ffn.0" and ".ffn.1" are appended).
//
// Returns the last layer of the block.
static nvinfer1::ILayer* PSABlock(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int dim,
                                  float attn_ratio, int num_heads, bool shortcut, std::string lname) {
    // Attention stage: x = x + self.attn(x) if self.add else self.attn(x)
    nvinfer1::ILayer* attn_out = Attention(network, weightMap, input, dim, num_heads, attn_ratio, lname + ".attn");
    nvinfer1::ILayer* after_attn = attn_out;
    if (shortcut) {
        after_attn = network->addElementWise(input, *attn_out->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    }
    nvinfer1::ITensor& x = *after_attn->getOutput(0);

    // Feed-forward stage: self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False))
    auto* ffn_expand = convBnSiLU(network, weightMap, x, dim * 2, {1, 1}, 1, lname + ".ffn.0");
    auto* ffn_project = convBn(network, weightMap, *ffn_expand->getOutput(0), dim, 1, 1, lname + ".ffn.1");

    // x = x + self.ffn(x) if self.add else self.ffn(x)
    if (!shortcut) {
        return ffn_project;
    }
    return network->addElementWise(x, *ffn_project->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// Builds a C2PSA module: cv1 (1x1 conv) -> split channels in two halves -> run the second
// half through `n` PSABlocks -> concatenate the untouched first half with the PSA result ->
// cv2 (1x1 conv).
//
// network   : network definition the layers are appended to; must not be null.
// weightMap : weights keyed by layer name.
// input     : module input tensor.
// c1        : input channel count; the hidden width is int(c1 * e).
// c2        : output channel count of cv2.
// n         : number of chained PSABlocks.
// e         : hidden-channel expansion ratio.
// lname     : weight-name prefix (".cv1", ".m.<i>" and ".cv2" are appended).
//
// Returns the cv2 layer.
nvinfer1::ILayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap,
                        nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname) {
    assert(network != nullptr);
    const int hidden = c1 * e;

    // cv1 produces 2 * hidden channels.
    nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, 2 * hidden, {1, 1}, 1, lname + ".cv1");
    nvinfer1::ITensor* cv1_tensor = cv1->getOutput(0);

    // Slice the cv1 output into two equal halves along the channel axis.
    const nvinfer1::Dims shape = cv1_tensor->getDimensions();
    const auto half = shape.d[1] / 2;
    nvinfer1::ISliceLayer* first_half =
            network->addSlice(*cv1_tensor, nvinfer1::Dims4{0, 0, 0, 0},
                              nvinfer1::Dims4{shape.d[0], half, shape.d[2], shape.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    nvinfer1::ISliceLayer* second_half =
            network->addSlice(*cv1_tensor, nvinfer1::Dims4{0, half, 0, 0},
                              nvinfer1::Dims4{shape.d[0], half, shape.d[2], shape.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});

    // Chain n PSABlocks on the second half; each block consumes the previous block's output.
    nvinfer1::ITensor* psa_out = second_half->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        const std::string block_name = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* block = PSABlock(network, weightMap, *psa_out, hidden, 0.5, hidden / 64, true, block_name);
        psa_out = block->getOutput(0);
    }

    // Concatenate the untouched first half with the PSA-processed second half.
    nvinfer1::ITensor* merged_inputs[2] = {first_half->getOutput(0), psa_out};
    nvinfer1::IConcatenationLayer* merged = network->addConcatenation(merged_inputs, 2);

    // cv2 projects the concatenation to c2 channels.
    nvinfer1::IElementWiseLayer* cv2 =
            convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
    return cv2;
}

// Adds a grouped convolution (groups == ch, no bias) followed by batch norm and a
// SiLU activation, the latter expressed as bn(x) * sigmoid(bn(x)).
//
// network   : network definition the layers are appended to.
// weightMap : weights keyed by layer name; reads "<lname>.conv.weight" and forwards the map
//             to addBatchNorm2d with "<lname>.bn".
// input     : tensor fed to the convolution.
// ch        : number of output feature maps, also used as the group count.
// k         : kernel size as {height, width}.
// s         : stride applied to both spatial axes.
// lname     : weight-name prefix for this layer.
//
// Returns the elementwise product layer.
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname) {
    nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    const int kernel_h = k[0];
    const int kernel_w = k[1];

    nvinfer1::IConvolutionLayer* conv_layer = network->addConvolutionNd(
            input, ch, nvinfer1::DimsHW{kernel_h, kernel_w}, weightMap[lname + ".conv.weight"], no_bias);
    assert(conv_layer);
    conv_layer->setStrideNd(nvinfer1::DimsHW{s, s});
    conv_layer->setNbGroups(ch);
    // Auto pad: half the kernel size on each axis (integer division).
    conv_layer->setPaddingNd(nvinfer1::DimsHW{kernel_h / 2, kernel_w / 2});

    nvinfer1::IScaleLayer* batch_norm =
            addBatchNorm2d(network, weightMap, *conv_layer->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor& normalized = *batch_norm->getOutput(0);

    // SiLU: x * sigmoid(x)
    nvinfer1::IActivationLayer* gate = network->addActivation(normalized, nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* silu =
            network->addElementWise(normalized, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}


================================================
FILE: yolo11/src/calibrator.cpp
================================================
#include "calibrator.h"
#include <fstream>
#include <iostream>
#include <iterator>
#include <opencv2/dnn/dnn.hpp>
#include "cuda_utils.h"
#include "utils.h"

// Stores the calibration settings, allocates the device-side input buffer and collects the
// list of files found in `img_dir`.
//
// batchsize        : number of images per calibration batch.
// input_w, input_h : network input width and height.
// img_dir          : directory scanned for calibration images.
// calib_table_name : path of the calibration cache file.
// input_blob_name  : expected name of the network input binding.
// read_cache       : whether readCalibrationCache may load an existing cache file.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
    // Element count of one batch: 3 channels * width * height * batch size.
    input_count_ = 3 * input_w * input_h * batchsize;

    // Device buffer holding one batch of float inputs.
    const size_t device_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMalloc(&device_input_, device_bytes));

    read_files_in_dir(img_dir, img_files_);
}

// Releases the device input buffer that the constructor allocated with cudaMalloc.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the calibration batch size that was passed to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next `batchsize_` images, preprocesses them into a single blob, copies the blob
// to the device input buffer and publishes that buffer through bindings[0].
//
// bindings   : output array; bindings[0] receives the device input pointer.
// names      : binding names; names[0] is asserted to equal the configured input blob name.
// nbBindings : unused.
//
// Returns false when fewer than `batchsize_` images remain or when an image cannot be read;
// returns true after a batch has been uploaded.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    const auto batch_end = img_idx_ + batchsize_;
    if (batch_end > static_cast<int>(img_files_.size())) {
        return false;
    }

    std::vector<cv::Mat> batch_images;
    for (int idx = img_idx_; idx < batch_end; ++idx) {
        std::cout << img_files_[idx] << "  " << idx << std::endl;
        cv::Mat raw = cv::imread(img_dir_ + "/" + img_files_[idx]);
        if (raw.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        cv::Mat prepared = preprocess_img(raw, input_w_, input_h_);
        batch_images.push_back(prepared);
    }
    // Advance the cursor only after every image of the batch was read successfully.
    img_idx_ = batch_end;

    // Scale pixel values by 1/255, swap the R and B channels, no cropping.
    const cv::Size target_size(input_w_, input_h_);
    cv::Mat blob = cv::dnn::blobFromImages(batch_images, 1.0 / 255.0, target_size, cv::Scalar(0, 0, 0), true, false);

    const size_t blob_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), blob_bytes, cudaMemcpyHostToDevice));

    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Loads the calibration cache file into `calib_cache_` when cache reading is enabled and the
// file can be opened.
//
// length : receives the number of cached bytes (0 when nothing was loaded).
//
// Returns a pointer to the cached bytes, or nullptr when the cache is empty.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream cache_file(calib_table_name_, std::ios::binary);
    // Keep whitespace bytes: the stream iterator below would otherwise skip them.
    cache_file >> std::noskipws;

    const bool may_load = read_cache_ && cache_file.good();
    if (may_load) {
        std::istream_iterator<char> first(cache_file);
        std::istream_iterator<char> last;
        std::copy(first, last, std::back_inserter(calib_cache_));
    }

    length = calib_cache_.size();
    if (length == 0) {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes the calibration cache to the configured cache file in binary mode.
//
// cache  : pointer to the cache bytes.
// length : number of bytes to write.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;

    const char* bytes = static_cast<const char*>(cache);
    std::ofstream table_file(calib_table_name_, std::ios::binary);
    table_file.write(bytes, length);
}


================================================
FILE: yolo11/src/model.cpp
================================================
#include <math.h>
#include <iostream>

#include "block.h"
#include "calibrator.h"
#include "config.h"
#include "model.h"

// Computes a scaled channel count: caps `x` at `max_channels`, multiplies by the width
// multiplier `gw`, and rounds the result up to the next multiple of `divisor`.
static int get_width(int x, float gw, int max_channels, int divisor = 8) {
    const int capped = std::min(x, max_channels);
    // Number of `divisor`-sized groups needed to hold capped * gw channels.
    const int groups = int(ceil((capped * gw) / divisor));
    return groups * divisor;
}

// Computes a scaled repeat count: returns 1 when x == 1, otherwise x * gd rounded to the
// nearest integer with exact halves going to the even neighbour, clamped to at least 1.
static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }

    const float scaled = x * gd;
    const int truncated = int(scaled);
    int rounded = round(scaled);

    // round() moves exact halves away from zero; step back when the lower neighbour is even
    // so that ties resolve to the even value.
    const bool exact_half = (scaled - truncated == 0.5);
    if (exact_half && (truncated % 2) == 0) {
        rounded -= 1;
    }
    return std::max<int>(rounded, 1);
}

// Fills `strides` with reference_size divided by dimension 2 of each layer's first output.
//
// conv_layers    : layers whose output sizes are inspected.
// size           : number of entries in `conv_layers` and `strides`.
// reference_size : size that each feature-map size is divided into.
// strides        : output array receiving one integer quotient per layer.
void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) {
    for (int idx = 0; idx < size; ++idx) {
        nvinfer1::ILayer* current = conv_layers[idx];
        const nvinfer1::Dims out_dims = current->getOutput(0)->getDimensions();
        const int map_size = out_dims.d[2];
        strides[idx] = reference_size / map_size;
    }
}

// Builds and serializes the YOLO11 classification network.
//
// builder      : TensorRT builder used to create the network and the serialized engine.
// config       : builder configuration; the workspace limit and precision flags are set here.
// dt           : data type of the network input tensor.
// wts_path     : path of the weight file loaded with loadWeights.
// gd, gw       : depth and width multipliers passed to get_depth / get_width.
// type         : model size letter; "m", "l" and "x" enable the c3k variant in the early
//                C3K2 blocks.
// max_channels : upper bound on the unscaled channel count.
//
// Returns the serialized engine as produced by buildSerializedNetwork (nullptr when the build
// fails). The network definition and every buffer in the weight map are released before
// returning.
nvinfer1::IHostMemory* buildEngineYolo11Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            std::string& type, int max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    //	nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    // ****************************************** YOLO11 INPUT **********************************************
    nvinfer1::ITensor* data =
            network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kClsInputH, kClsInputW});
    assert(data);

    // ***************************************** YOLO11 BACKBONE ********************************************
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0),
                                                    get_width(128, gw, max_channels), {3, 3}, 2, "model.1");
    // The early C3K2 blocks use the c3k variant only for the m / l / x model sizes.
    const bool c3k = (type == "m" || type == "l" || type == "x");
    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2");
    nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                    get_width(256, gw, max_channels), {3, 3}, 2, "model.3");
    nvinfer1::IElementWiseLayer* conv4 =
            C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4");
    nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0),
                                                    get_width(512, gw, max_channels), {3, 3}, 2, "model.5");
    nvinfer1::IElementWiseLayer* conv6 =
            C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0),
                                                    get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");
    nvinfer1::IElementWiseLayer* conv8 =
            C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8");
    auto* conv9 = C2PSA(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                        get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.9");

    // ********************************************* YOLO11 HEAD *********************************************

    auto conv_class = convBnSiLU(network, weightMap, *conv9->getOutput(0), 1280, {1, 1}, 1, "model.10.conv");
    // With an explicit batch dimension the feature map has 4 dimensions
    // (batch, channels, height, width).
    nvinfer1::Dims dims = conv_class->getOutput(0)->getDimensions();
    assert(dims.nbDims == 4);

    // Average pooling over the full spatial extent (window == height x width).
    nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), nvinfer1::PoolingType::kAVERAGE,
                                                           nvinfer1::DimsHW{dims.d[2], dims.d[3]});
    assert(pool2);

    // Fully connected layer, expressed as flatten -> matmul with W^T -> add bias.
    auto shuffle_0 = network->addShuffle(*pool2->getOutput(0));
    shuffle_0->setReshapeDimensions(nvinfer1::Dims2{kBatchSize, 1280});
    auto linear_weight = weightMap["model.10.linear.weight"];
    auto constant_weight = network->addConstant(nvinfer1::Dims2{kClsNumClass, 1280}, linear_weight);
    // The bias holds one value per class, so the constant is declared as a single row and
    // broadcast across the batch by the elementwise sum below. Declaring it with kBatchSize
    // rows would make the declared volume exceed the number of stored weights whenever
    // kBatchSize > 1.
    auto constant_bias = network->addConstant(nvinfer1::Dims2{1, kClsNumClass}, weightMap["model.10.linear.bias"]);
    auto linear_matrix_multiply =
            network->addMatrixMultiply(*shuffle_0->getOutput(0), nvinfer1::MatrixOperation::kNONE,
                                       *constant_weight->getOutput(0), nvinfer1::MatrixOperation::kTRANSPOSE);
    auto yolo = network->addElementWise(*linear_matrix_multiply->getOutput(0), *constant_bias->getOutput(0),
                                        nvinfer1::ElementWiseOperation::kSUM);
    assert(yolo);

    // Set the name for the output tensor and mark it as network output
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Limit the builder workspace memory pool to 16 MiB
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

    // Configuration according to the precision mode being used
#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform supports int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is handed to `config` and is not deleted in this function.
    auto* calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputH, kInputQuantizationFolder,
                                                  "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    // Begin building the engine; this may take a while
    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when the builder actually produced an engine.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Cleanup the network definition and allocated weights
    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

nvinfer1::IHostMemory* buildEngineYolo11Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    //	nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
    ******************************************  YOLO11 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLO11 BACKBONE  ********************************************
    *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0),
                                                    get_width(128, gw, max_channels), {3, 3}, 2, "model.1");
    // 11233
    bool c3k = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k = true;
    }
    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2");
    nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                    get_width(256, gw, max_channels), {3, 3}, 2, "model.3");
    // 22466
    nvinfer1::IElementWiseLayer* conv4 =
            C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4");
    nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0),
                                                    get_width(512, gw, max_channels), {3, 3}, 2, "model.5");
    // 22466
    nvinfer1::IElementWiseLayer* conv6 =
            C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0),
                                                    get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");
    // 11233
    nvinfer1::IElementWiseLayer* conv8 =
            C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 =
            SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = C2PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels),
                         get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10");
    /*******************************************************************************************************
    *********************************************  YOLO11 HEAD  ********************************************
    *******************************************************************************************************/
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0));
    assert(upsample11);
    upsample11->setResizeMode(nvinfer1::InterpolationMode::kNEAREST);
    upsample11->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);

    nvinfer1::IElementWiseLayer* conv13 =
            C3K2(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0));
    assert(upsample14);
    upsample14->setResizeMode(nvinfer1::InterpolationMode::kNEAREST);
    upsample14->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2);

    nvinfer1::IElementWiseLayer* conv16 =
            C3K2(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.16");

    nvinfer1::IElementWiseLayer* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.17");
    nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2);
    nvinfer1::IElementWiseLayer* conv19 =
            C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.19");

    nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.20");
    nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2);
    nvinfer1::IElementWiseLayer* conv22 =
            C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.22");

    /*******************************************************************************************************
    *********************************************  YOLO11 OUTPUT  ******************************************
    *******************************************************************************************************/
    // c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100));

    // output0
    nvinfer1::IElementWiseLayer* conv23_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_0_2 =
            network->addConvolutionNd(*conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.0.2.weight"], weightMap["model.23.cv2.0.2.bias"]);
    conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_0_0_0 = DWConv(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3},
                                    1, "model.23.cv3.0.0.0");
    auto* conv23_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.0.1");
    auto* conv23_cv3_0_1_0 =
            DWConv(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.0.1.0");
    auto* conv23_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_0_2 =
            network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.0.2.weight"], weightMap["model.23.cv3.0.2.bias"]);
    conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2);

    // output1
    nvinfer1::IElementWiseLayer* conv23_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv19->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_1_2 =
            network->addConvolutionNd(*conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.1.2.weight"], weightMap["model.23.cv2.1.2.bias"]);
    conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_1_0_0 = DWConv(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3},
                                    1, "model.23.cv3.1.0.0");
    auto* conv23_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.0.1");
    auto* conv23_cv3_1_1_0 =
            DWConv(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.1.1.0");
    auto* conv23_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_1_2 =
            network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.1.2.weight"], weightMap["model.23.cv3.1.2.bias"]);
    conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2);

    // output2
    nvinfer1::IElementWiseLayer* conv23_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv22->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_2_2 =
            network->addConvolutionNd(*conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.2.2.weight"], weightMap["model.23.cv2.2.2.bias"]);
    conv23_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_2_0_0 = DWConv(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels),
                                    {3, 3}, 1, "model.23.cv3.2.0.0");
    auto* conv23_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.0.1");
    auto* conv23_cv3_2_1_0 =
            DWConv(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.2.1.0");
    auto* conv23_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_2_2 =
            network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.2.2.weight"], weightMap["model.23.cv3.2.2.bias"]);
    conv23_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2);

    /*******************************************************************************************************
    *********************************************  YOLO11 DETECT  ******************************************
    *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0));
    shuffle23_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split23_0_0 = network->addSlice(
            *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_0_1 =
            network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* dfl23_0 =
            DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2);
    cat22_dfl_0->setAxis(1);

    nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0));
    shuffle23_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split23_1_0 = network->addSlice(
            *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_1_1 =
            network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_1 =
            DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2);
    cat22_dfl_1->setAxis(1);

    nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0));
    shuffle23_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split23_2_0 = network->addSlice(
            *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_2_1 =
            network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_2 =
            DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2);
    cat22_dfl_2->setAxis(1);

    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2},
                         strides, stridesLength, false, false, false);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Appends Conv2d (no bias) -> BatchNorm -> SiLU to `network` and returns the element-wise layer that
// multiplies the normalised tensor by its sigmoid.
//
// Parameters:
//   network   - network definition the layers are added to
//   weightMap - weights keyed by name; lname + ".conv.weight" is read here and the map is also handed to
//               addBatchNorm2d() together with the prefix lname + ".bn"
//               NOTE(review): the map is taken by value, so it is copied on every call; the signature is
//               left untouched here to stay compatible with existing declarations/callers.
//   input     - tensor fed to the convolution
//   ch        - number of convolution output channels
//   k, s, p   - square kernel size, stride and padding of the convolution
//   lname     - prefix used for the weight lookups and for the layer names
//
// The four layers are named lname + ".conv", ".bn", ".sigmoid" and ".silu". When USE_INT8 is defined a
// single-input concatenation is inserted between the batch norm and the activation.
static nvinfer1::IElementWiseLayer* convBnSiLUProto(nvinfer1::INetworkDefinition* network,
                                                    std::map<std::string, nvinfer1::Weights> weightMap,
                                                    nvinfer1::ITensor& input, int ch, int k, int s, int p,
                                                    std::string lname) {
    // The convolution carries no bias of its own; an empty Weights object is passed instead.
    nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv =
            network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty);
    assert(conv);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{p, p});
    conv->setName((lname + ".conv").c_str());

    nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
    assert(bn);
    bn->setName((lname + ".bn").c_str());

    // Tensor that feeds both the sigmoid and the final multiplication.
    nvinfer1::ITensor* siluInput = bn->getOutput(0);

#if defined(USE_INT8)
    // This single-input concatenation does no computation. It is only inserted so that the builder does
    // not attempt a layer fusion for which it has no INT8 implementation, which otherwise fails with e.g.:
    // Error Code 10: Internal Error (Could not find any implementation for node
    // model.22.proto.cv3.conv + model.22.proto.cv3.sigmoid + PWN(PWN((Unnamed Layer* 353) [Activation]), PWN(model.22.proto.cv3.silu)).)
    nvinfer1::ITensor* inputTensors[] = {bn->getOutput(0)};
    auto concat = network->addConcatenation(inputTensors, 1);
    assert(concat);
    siluInput = concat->getOutput(0);
#endif

    nvinfer1::IActivationLayer* sigmoid = network->addActivation(*siluInput, nvinfer1::ActivationType::kSIGMOID);
    assert(sigmoid);
    // Name the activation itself; the batch-norm layer keeps its ".bn" name.
    sigmoid->setName((lname + ".sigmoid").c_str());

    nvinfer1::IElementWiseLayer* ew =
            network->addElementWise(*siluInput, *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(ew);
    ew->setName((lname + ".silu").c_str());
    return ew;
}

// Builds the segmentation prototype branch:
//   convBnSiLU (lname + ".cv1") -> transposed convolution (lname + ".upsample", 2x2 kernel, stride 2,
//   zero padding) -> convBnSiLU (lname + ".cv2") -> convBnSiLUProto (lname + ".cv3", 32 channels, 1x1).
//
// Parameters:
//   network      - network definition the layers are added to
//   weightMap    - weights keyed by name; the ".upsample.weight" / ".upsample.bias" entries under `lname`
//                  are read here, the rest is looked up by the helper functions
//   input        - tensor the branch is attached to
//   lname        - weight/layer name prefix of the branch (e.g. "model.23.proto")
//   gw           - forwarded to get_width() to size the intermediate channel count
//   max_channels - forwarded to get_width()
//
// Returns the final element-wise layer produced by convBnSiLUProto().
static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network,
                                          std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
                                          std::string lname, float gw, int max_channels) {
    int mid_channel = get_width(256, gw, max_channels);
    auto cv1 = convBnSiLU(network, weightMap, input, mid_channel, {3, 3}, 1, lname + ".cv1");
    assert(cv1);

    // Transposed convolution using the weight and bias stored in the weight map.
    auto convTranpsose = network->addDeconvolutionNd(*cv1->getOutput(0), mid_channel, nvinfer1::DimsHW{2, 2},
                                                     weightMap[lname + ".upsample.weight"],
                                                     weightMap[lname + ".upsample.bias"]);
    assert(convTranpsose);
    convTranpsose->setStrideNd(nvinfer1::DimsHW{2, 2});
    // Use the Nd setter like every other layer in this file instead of the 2-D-only setPadding().
    convTranpsose->setPaddingNd(nvinfer1::DimsHW{0, 0});

    auto cv2 = convBnSiLU(network, weightMap, *convTranpsose->getOutput(0), mid_channel, {3, 3}, 1, lname + ".cv2");
    assert(cv2);
    auto cv3 = convBnSiLUProto(network, weightMap, *cv2->getOutput(0), 32, 1, 1, 0, lname + ".cv3");
    assert(cv3);
    return cv3;
}

// Builds one "cv4" branch of the output stage: two convBnSiLU blocks (lname + ".0", lname + ".1")
// followed by a 1x1 convolution (lname + ".2") whose output is reshaped to
// {kBatchSize, <task channels>, grid_shape}.
//
// The number of task channels depends on `algo_type`:
//   "seg"  -> 32
//   "pose" -> kNumberOfPoints * 3
//   "obb"  -> kObbNe
// Any other value logs an error to std::cerr and yields nullptr.
//
// The two convBnSiLU blocks use max(get_width(256, gw, max_channels) / 4, <task channels>) channels.
static nvinfer1::IShuffleLayer* cv4_conv_combined(nvinfer1::INetworkDefinition* network,
                                                  std::map<std::string, nvinfer1::Weights>& weightMap,
                                                  nvinfer1::ITensor& input, std::string lname, int grid_shape, float gw,
                                                  const std::string& algo_type, int max_channels) {
    // Channel count of the final 1x1 convolution, selected by task.
    int taskChannels = 0;
    if (algo_type == "seg") {
        taskChannels = 32;
    } else if (algo_type == "pose") {
        taskChannels = kNumberOfPoints * 3;
    } else if (algo_type == "obb") {
        taskChannels = kObbNe;
    } else {
        std::cerr << "Unknown algo type: " << algo_type << std::endl;
        return nullptr;
    }

    // Width of the two leading blocks: never narrower than the task output.
    const int hiddenChannels = std::max(get_width(256, gw, max_channels) / 4, taskChannels);

    auto stem0 = convBnSiLU(network, weightMap, input, hiddenChannels, {3, 3}, 1, lname + ".0");
    auto stem1 = convBnSiLU(network, weightMap, *stem0->getOutput(0), hiddenChannels, {3, 3}, 1, lname + ".1");

    // Re-wrap the stored bias as a kFLOAT Weights object for the projection convolution.
    const std::string projPrefix = lname + ".2";
    const nvinfer1::Weights storedBias = weightMap[projPrefix + ".bias"];
    const int biasCount = storedBias.count;
    nvinfer1::Weights projBias{nvinfer1::DataType::kFLOAT, storedBias.values, biasCount};

    auto proj = network->addConvolutionNd(*stem1->getOutput(0), taskChannels, nvinfer1::DimsHW{1, 1},
                                          weightMap[projPrefix + ".weight"], projBias);
    proj->setStrideNd(nvinfer1::DimsHW{1, 1});

    // Collapse the spatial dimensions into a single axis of length grid_shape.
    nvinfer1::IShuffleLayer* flattened = network->addShuffle(*proj->getOutput(0));
    flattened->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, taskChannels, grid_shape});
    return flattened;
}

// Builds the YOLO11 segmentation network and returns it as a serialized TensorRT engine.
//
// The network is assembled layer by layer from the weights loaded from `wts_path`: backbone
// (model.0 .. model.10), head (up to model.22) and the model.23 output stage. The output stage consists of
// three per-scale branches (box regression, class scores and cv4 "seg" coefficients) that are fed to the
// YOLO plugin layer, plus a prototype branch built by Proto(). Two tensors are marked as network outputs:
// the plugin output (named kOutputTensorName) and the Proto output (named kProtoTensorName).
//
// Parameters:
//   builder      - used to create the network definition and to build the serialized engine
//   config       - builder configuration; this function sets the workspace pool limit and, depending on
//                  USE_FP16 / USE_INT8, a precision flag (plus an INT8 calibrator)
//   dt           - data type of the network input tensor
//   wts_path     - path handed to loadWeights()
//   gd           - forwarded to get_depth() (presumably the depth multiplier of the model variant)
//   gw           - forwarded to get_width() (presumably the width multiplier of the model variant)
//   max_channels - forwarded to get_width()
//   type         - model variant; "m", "l" and "x" switch on the c3k flag passed to most C3K2 blocks
//
// Returns the pointer produced by IBuilder::buildSerializedNetwork(). The network definition and every
// buffer referenced by the weight map are released before returning.
// NOTE(review): the build result is not checked for null before "Build engine successfully!" is printed.
nvinfer1::IHostMemory* buildEngineYolo11Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    // The network is created with the explicit-batch flag; kBatchSize appears in the tensor shapes below.
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
    ******************************************  YOLO11 INPUT  **********************************************
    *******************************************************************************************************/
    // Single input tensor of shape {kBatchSize, 3, kInputH, kInputW}.
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLO11 BACKBONE  ********************************************
    *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0),
                                                    get_width(128, gw, max_channels), {3, 3}, 2, "model.1");
    // c3k is enabled only for the m, l and x variants. model.6, model.8 and model.22 pass a literal `true`
    // in the same argument position regardless of the variant.
    bool c3k = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k = true;
    }
    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2");
    nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                    get_width(256, gw, max_channels), {3, 3}, 2, "model.3");
    // 22466
    nvinfer1::IElementWiseLayer* conv4 =
            C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4");
    nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0),
                                                    get_width(512, gw, max_channels), {3, 3}, 2, "model.5");
    // 22466
    nvinfer1::IElementWiseLayer* conv6 =
            C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0),
                                                    get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");
    // 11233
    nvinfer1::IElementWiseLayer* conv8 =
            C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 =
            SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = C2PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels),
                         get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10");

    /*******************************************************************************************************
    *********************************************  YOLO11 HEAD  ********************************************
    *******************************************************************************************************/
    // Per-dimension resize factors for the 4-D feature maps: the first two dimensions are kept, the last
    // two are doubled. Both resize layers below use nearest-neighbour interpolation.
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0));
    assert(upsample11);
    upsample11->setResizeMode(nvinfer1::InterpolationMode::kNEAREST);
    upsample11->setScales(scale, 4);

    // Upsampled model.10 output joined with the model.6 output.
    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);

    nvinfer1::IElementWiseLayer* conv13 =
            C3K2(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0));
    assert(upsample14);
    upsample14->setResizeMode(nvinfer1::InterpolationMode::kNEAREST);
    upsample14->setScales(scale, 4);

    // Upsampled model.13 output joined with the model.4 output.
    nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2);

    // conv16 feeds the first output branch, the first cv4 branch and the Proto branch further down.
    nvinfer1::IElementWiseLayer* conv16 =
            C3K2(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.16");

    nvinfer1::IElementWiseLayer* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.17");
    // model.17 output joined with the model.13 output.
    nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2);
    // conv19 feeds the second output branch and the second cv4 branch.
    nvinfer1::IElementWiseLayer* conv19 =
            C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.19");

    nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.20");
    // model.20 output joined with the model.10 output.
    nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2);
    // conv22 feeds the third output branch and the third cv4 branch.
    nvinfer1::IElementWiseLayer* conv22 =
            C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.22");

    /*******************************************************************************************************
    *********************************************  YOLO11 OUTPUT  ******************************************
    *******************************************************************************************************/
    // Channel widths of the cv2 and cv3 branches, mirroring the Python expression quoted below:
    // c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100));

    // Each of the three outputs below has the same layout:
    //   cv2: two convBnSiLU blocks with c2 channels, then a 1x1 convolution with 64 outputs
    //   cv3: DWConv + convBnSiLU, DWConv + convBnSiLU (c3 channels), then a 1x1 convolution with
    //        kNumClass outputs
    // and the two results are concatenated (64 + kNumClass channels, see the reshape in the DETECT section).

    // output0
    nvinfer1::IElementWiseLayer* conv23_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_0_2 =
            network->addConvolutionNd(*conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.0.2.weight"], weightMap["model.23.cv2.0.2.bias"]);
    conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_0_0_0 = DWConv(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3},
                                    1, "model.23.cv3.0.0.0");
    auto* conv23_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.0.1");
    auto* conv23_cv3_0_1_0 =
            DWConv(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.0.1.0");
    auto* conv23_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_0_2 =
            network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.0.2.weight"], weightMap["model.23.cv3.0.2.bias"]);
    conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2);

    // output1
    nvinfer1::IElementWiseLayer* conv23_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv19->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_1_2 =
            network->addConvolutionNd(*conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.1.2.weight"], weightMap["model.23.cv2.1.2.bias"]);
    conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_1_0_0 = DWConv(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3},
                                    1, "model.23.cv3.1.0.0");
    auto* conv23_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.0.1");
    auto* conv23_cv3_1_1_0 =
            DWConv(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.1.1.0");
    auto* conv23_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_1_2 =
            network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.1.2.weight"], weightMap["model.23.cv3.1.2.bias"]);
    conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2);

    // output2
    nvinfer1::IElementWiseLayer* conv23_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv22->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_2_2 =
            network->addConvolutionNd(*conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.2.2.weight"], weightMap["model.23.cv2.2.2.bias"]);
    conv23_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_2_0_0 = DWConv(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels),
                                    {3, 3}, 1, "model.23.cv3.2.0.0");
    auto* conv23_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.0.1");
    auto* conv23_cv3_2_1_0 =
            DWConv(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.2.1.0");
    auto* conv23_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_2_2 =
            network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.2.2.weight"], weightMap["model.23.cv3.2.2.bias"]);
    conv23_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2);

    /*******************************************************************************************************
    *********************************************  YOLO11 DETECT  ******************************************
    *******************************************************************************************************/

    // calculateStrides() fills strides[] (one entry per listed layer) from conv3, conv5, conv7 and kInputH.
    // The grid size of scale i is then (kInputH / strides[i]) * (kInputW / strides[i]).
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // For every scale: flatten the spatial dimensions into one axis, slice channels [0, 64) for the box
    // branch and [64, 64 + kNumClass) for the class branch, and run DFL() on the 64-channel slice.

    // Scale 0
    nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0));
    shuffle23_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split23_0_0 = network->addSlice(
            *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_0_1 =
            network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* dfl23_0 =
            DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.23.dfl.conv.weight");

    // Scale 1
    nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0));
    shuffle23_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split23_1_0 = network->addSlice(
            *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_1_1 =
            network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_1 =
            DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.23.dfl.conv.weight");

    // Scale 2
    nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0));
    shuffle23_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split23_2_0 = network->addSlice(
            *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_2_1 =
            network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_2 =
            DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.23.dfl.conv.weight");

    // For every scale the cv4 "seg" branch is built on the same feature map as the box/class branches, and
    // the DFL output, the class slice and the cv4 output are concatenated along axis 1.

    // det0
    auto proto_coef_0 = cv4_conv_combined(network, weightMap, *conv16->getOutput(0), "model.23.cv4.0",
                                          (kInputH / strides[0]) * (kInputW / strides[0]), gw, "seg", max_channels);
    nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0),
                                                proto_coef_0->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 3);
    cat23_dfl_0->setAxis(1);

    // det1
    auto proto_coef_1 = cv4_conv_combined(network, weightMap, *conv19->getOutput(0), "model.23.cv4.1",
                                          (kInputH / strides[1]) * (kInputW / strides[1]), gw, "seg", max_channels);
    nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0),
                                                proto_coef_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 3);
    cat23_dfl_1->setAxis(1);

    // det2
    auto proto_coef_2 = cv4_conv_combined(network, weightMap, *conv22->getOutput(0), "model.23.cv4.2",
                                          (kInputH / strides[2]) * (kInputW / strides[2]), gw, "seg", max_channels);
    nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0),
                                                proto_coef_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 3);
    cat23_dfl_2->setAxis(1);

    // The three per-scale tensors go into the YOLO plugin layer; its output is the first network output.
    // NOTE(review): the trailing (true, false, false) flags presumably select segmentation over pose/obb -
    // confirm against addYoLoLayer().
    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2},
                         strides, stridesLength, true, false, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // The prototype branch is attached to conv16 and exposed as the second network output.
    auto proto = Proto(network, weightMap, *conv16->getOutput(0), "model.23.proto", gw, max_channels);
    proto->getOutput(0)->setName(kProtoTensorName);
    network->markOutput(*proto->getOutput(0));

    // Limit the builder's workspace memory pool to 16 * 2^20.
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

    // Precision is chosen at compile time: FP16, INT8 with an entropy calibrator, or neither flag.
#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // The calibrator is heap-allocated and handed to `config`; it is not deleted in this function.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // The network definition is no longer needed once the engine has been serialized.
    delete network;

    // Release the weight buffers with free() (assumes loadWeights() allocated them with malloc - TODO confirm).
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

/**
 * Builds the YOLO11 pose network with the TensorRT network-definition API and returns the serialized engine.
 *
 * The function loads the weights from `wts_path`, creates an explicit-batch network with one input of shape
 * {kBatchSize, 3, kInputH, kInputW}, stacks the backbone (model.0 - model.10), the head (model.11 - model.22) and
 * the three-scale output branches (model.23, fed by conv16, conv19 and conv22), appends the YoLo plugin layer and
 * marks its output (named kOutputTensorName) as the network output. It then configures the builder (workspace
 * limit, optional FP16 / INT8 flags) and serializes the network.
 *
 * The network definition and every buffer held in the weight map are released before returning, regardless of
 * whether the build succeeded.
 *
 * @param builder       TensorRT builder used to create the network and to build the serialized engine.
 * @param config        Builder configuration; the workspace limit and precision flags are set on it here.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path handed to loadWeights().
 * @param gd            Depth multiplier, forwarded to get_depth().
 * @param gw            Width multiplier, forwarded to get_width() and cv4_conv_combined().
 * @param max_channels  Channel cap, forwarded to get_width() and cv4_conv_combined().
 * @param type          Model size tag; "m", "l" and "x" turn on the c3k flag of the C3K2 blocks that use it.
 * @return The serialized network as returned by IBuilder::buildSerializedNetwork(), or nullptr when the build
 *         failed (a failure message is written to std::cerr in that case).
 */
nvinfer1::IHostMemory* buildEngineYolo11Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels, std::string& type) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    // Checked like the other layer/tensor pointers below; every following statement dereferences it.
    assert(network);

    /*******************************************************************************************************
    ******************************************  YOLO11 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLO11 BACKBONE  ********************************************
    *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0),
                                                    get_width(128, gw, max_channels), {3, 3}, 2, "model.1");
    // The c3k flag is only enabled for the "m", "l" and "x" variants; model.6, model.8 and model.22 always pass true.
    bool c3k = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k = true;
    }
    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2");
    nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                    get_width(256, gw, max_channels), {3, 3}, 2, "model.3");
    // 22466
    nvinfer1::IElementWiseLayer* conv4 =
            C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4");
    nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0),
                                                    get_width(512, gw, max_channels), {3, 3}, 2, "model.5");
    // 22466
    nvinfer1::IElementWiseLayer* conv6 =
            C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0),
                                                    get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");
    // 11233
    nvinfer1::IElementWiseLayer* conv8 =
            C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 =
            SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = C2PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels),
                         get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10");
    /*******************************************************************************************************
    *********************************************  YOLO11 HEAD  ********************************************
    *******************************************************************************************************/
    // Nearest-neighbour upsampling by 2 on the last two dimensions; the first two dimensions are left unscaled.
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0));
    assert(upsample11);
    upsample11->setResizeMode(nvinfer1::InterpolationMode::kNEAREST);
    upsample11->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);

    nvinfer1::IElementWiseLayer* conv13 =
            C3K2(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0));
    assert(upsample14);
    upsample14->setResizeMode(nvinfer1::InterpolationMode::kNEAREST);
    upsample14->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2);

    nvinfer1::IElementWiseLayer* conv16 =
            C3K2(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.16");

    nvinfer1::IElementWiseLayer* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.17");
    nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2);
    nvinfer1::IElementWiseLayer* conv19 =
            C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.19");

    nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.20");
    nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2);
    nvinfer1::IElementWiseLayer* conv22 =
            C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.22");

    /*******************************************************************************************************
    *********************************************  YOLO11 OUTPUT  ******************************************
    *******************************************************************************************************/
    // c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kPoseNumClass, 100));

    // Each of the three branches below produces a 64-channel cv2 output and a kPoseNumClass-channel cv3 output
    // and concatenates them. (64 presumably equals reg_max * 4 with reg_max = 16 - confirm against the model.)

    // output0 (from conv16)
    nvinfer1::IElementWiseLayer* conv23_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_0_2 =
            network->addConvolutionNd(*conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.0.2.weight"], weightMap["model.23.cv2.0.2.bias"]);
    conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_0_0_0 = DWConv(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3},
                                    1, "model.23.cv3.0.0.0");
    auto* conv23_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.0.1");
    auto* conv23_cv3_0_1_0 =
            DWConv(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.0.1.0");
    auto* conv23_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_0_2 =
            network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.0.2.weight"], weightMap["model.23.cv3.0.2.bias"]);
    conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2);

    // output1 (from conv19)
    nvinfer1::IElementWiseLayer* conv23_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv19->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_1_2 =
            network->addConvolutionNd(*conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.1.2.weight"], weightMap["model.23.cv2.1.2.bias"]);
    conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_1_0_0 = DWConv(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3},
                                    1, "model.23.cv3.1.0.0");
    auto* conv23_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.0.1");
    auto* conv23_cv3_1_1_0 =
            DWConv(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.1.1.0");
    auto* conv23_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_1_2 =
            network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.1.2.weight"], weightMap["model.23.cv3.1.2.bias"]);
    conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2);

    // output2 (from conv22)
    nvinfer1::IElementWiseLayer* conv23_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv22->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_2_2 =
            network->addConvolutionNd(*conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.2.2.weight"], weightMap["model.23.cv2.2.2.bias"]);
    conv23_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_2_0_0 = DWConv(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels),
                                    {3, 3}, 1, "model.23.cv3.2.0.0");
    auto* conv23_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.0.1");
    auto* conv23_cv3_2_1_0 =
            DWConv(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.2.1.0");
    auto* conv23_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_2_2 =
            network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.2.2.weight"], weightMap["model.23.cv3.2.2.bias"]);
    conv23_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2);
    /*******************************************************************************************************
    *********************************************  YOLO11 DETECT  ******************************************
    *******************************************************************************************************/

    // One stride per output scale, filled in by calculateStrides() from conv3 / conv5 / conv7 and kInputH.
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // For every scale: flatten the spatial dimensions, split the tensor into its first 64 channels (fed to DFL)
    // and the following kPoseNumClass channels, then concatenate the DFL output, the class channels and the
    // "pose" output of cv4_conv_combined along axis 1.

    /***********************************************  P3  *************************************************/
    nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0));
    shuffle23_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split23_0_0 = network->addSlice(
            *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_0_1 = network->addSlice(
            *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
            nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
            nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* dfl23_0 =
            DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.23.dfl.conv.weight");

    // det0
    auto shuffle_conv16 = cv4_conv_combined(network, weightMap, *conv16->getOutput(0), "model.23.cv4.0",
                                            (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose", max_channels);

    nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0),
                                                shuffle_conv16->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 3);
    cat23_dfl_0->setAxis(1);

    /***********************************************  P4  *************************************************/
    nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0));
    shuffle23_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split23_1_0 = network->addSlice(
            *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_1_1 = network->addSlice(
            *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
            nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
            nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_1 =
            DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.23.dfl.conv.weight");

    // det1
    auto shuffle_conv19 = cv4_conv_combined(network, weightMap, *conv19->getOutput(0), "model.23.cv4.1",
                                            (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose", max_channels);

    nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0),
                                                shuffle_conv19->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 3);
    cat23_dfl_1->setAxis(1);

    /***********************************************  P5  *************************************************/
    nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0));
    shuffle23_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split23_2_0 = network->addSlice(
            *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_2_1 = network->addSlice(
            *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
            nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
            nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_2 =
            DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.23.dfl.conv.weight");

    // det2
    auto shuffle_conv22 = cv4_conv_combined(network, weightMap, *conv22->getOutput(0), "model.23.cv4.2",
                                            (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose", max_channels);
    nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0),
                                                shuffle_conv22->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 3);
    cat23_dfl_2->setAxis(1);

    // NOTE(review): the three trailing booleans presumably select (segmentation, pose, obb) post-processing in
    // the plugin; only the second one is set here - confirm against addYoLoLayer().
    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2},
                         strides, stridesLength, false, true, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // 16 * 2^20 bytes = 16 MiB of builder workspace.
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is allocated here and never deleted in this function; confirm whether it is
    // intentionally left alive for the lifetime of `config` or should be released after the build.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when a serialized engine was actually produced; a failed build yields nullptr.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // The network definition and the host-side weight buffers are released here, whether or not the build
    // succeeded.
    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

nvinfer1::IHostMemory* buildEngineYolo11Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    //	nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
    ******************************************  YOLO11 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data =
            network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kObbInputH, kObbInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLO11 BACKBONE  ********************************************
    *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0),
                                                    get_width(128, gw, max_channels), {3, 3}, 2, "model.1");
    // 11233
    bool c3k = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k = true;
    }
    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2");
    nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                    get_width(256, gw, max_channels), {3, 3}, 2, "model.3");
    // 22466
    nvinfer1::IElementWiseLayer* conv4 =
            C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4");
    nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0),
                                                    get_width(512, gw, max_channels), {3, 3}, 2, "model.5");
    // 22466
    nvinfer1::IElementWiseLayer* conv6 =
            C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0),
                                                    get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");
    // 11233
    nvinfer1::IElementWiseLayer* conv8 =
            C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 =
            SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = C2PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels),
                         get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10");
    /*******************************************************************************************************
    *********************************************  YOLO11 HEAD  ********************************************
    *******************************************************************************************************/
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0));
    assert(upsample11);
    upsample11->setResizeMode(nvinfer1::InterpolationMode::kNEAREST);
    upsample11->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);

    nvinfer1::IElementWiseLayer* conv13 =
            C3K2(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0));
    assert(upsample14);
    upsample14->setResizeMode(nvinfer1::InterpolationMode::kNEAREST);
    upsample14->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2);

    nvinfer1::IElementWiseLayer* conv16 =
            C3K2(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.16");

    nvinfer1::IElementWiseLayer* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.17");
    nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2);
    nvinfer1::IElementWiseLayer* conv19 =
            C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.19");

    nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.20");
    nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2);
    nvinfer1::IElementWiseLayer* conv22 =
            C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.22");

    /*******************************************************************************************************
    *********************************************  YOLO11 OUTPUT  ******************************************
    *******************************************************************************************************/
    // c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
    // c4 = max(ch[0] // 4, self.ne)
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kObbNumClass, 100));
    int c4 = std::max(get_width(256, gw, max_channels) / 4, kObbNe);

    // output0
    nvinfer1::IElementWiseLayer* conv23_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_0_2 =
            network->addConvolutionNd(*conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.0.2.weight"], weightMap["model.23.cv2.0.2.bias"]);
    conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_0_0_0 = DWConv(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3},
                                    1, "model.23.cv3.0.0.0");
    auto* conv23_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.0.1");
    auto* conv23_cv3_0_1_0 =
            DWConv(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.0.1.0");
    auto* conv23_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_0_2 =
            network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.0.2.weight"], weightMap["model.23.cv3.0.2.bias"]);
    conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2);

    // output1
    nvinfer1::IElementWiseLayer* conv23_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv19->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_1_2 =
            network->addConvolutionNd(*conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.1.2.weight"], weightMap["model.23.cv2.1.2.bias"]);
    conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_1_0_0 = DWConv(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3},
                                    1, "model.23.cv3.1.0.0");
    auto* conv23_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.0.1");
    auto* conv23_cv3_1_1_0 =
            DWConv(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.1.1.0");
    auto* conv23_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_1_2 =
            network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.1.2.weight"], weightMap["model.23.cv3.1.2.bias"]);
    conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2);

    // output2
    nvinfer1::IElementWiseLayer* conv23_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv22->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv23_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_2_2 =
            network->addConvolutionNd(*conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv2.2.2.weight"], weightMap["model.23.cv2.2.2.bias"]);
    conv23_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_2_0_0 = DWConv(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels),
                                    {3, 3}, 1, "model.23.cv3.2.0.0");
    auto* conv23_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.0.1");
    auto* conv23_cv3_2_1_0 =
            DWConv(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.2.1.0");
    auto* conv23_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv3_2_2 =
            network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.23.cv3.2.2.weight"], weightMap["model.23.cv3.2.2.bias"]);
    conv23_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2);

    /*******************************************************************************************************
    *********************************************  YOLO11 DETECT  ******************************************
    *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kObbInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0));
    shuffle23_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kObbNumClass, (kObbInputH / strides[0]) * (kObbInputW / strides[0])});
    nvinfer1::ISliceLayer* split23_0_0 =
            network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
                              nvinfer1::Dims3{kBatchSize, 64, (kObbInputH / strides[0]) * (kObbInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_0_1 = network->addSlice(
            *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
            nvinfer1::Dims3{kBatchSize, kObbNumClass, (kObbInputH / strides[0]) * (kObbInputW / strides[0])},
            nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_0 =
            DFL(network, weightMap, *split23_0_0->getOutput(0), 4,
                (kObbInputH / strides[0]) * (kObbInputW / strides[0]), 1, 1, 0, "model.23.dfl.conv.weight");

    nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0));
    shuffle23_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kObbNumClass, (kObbInputH / strides[1]) * (kObbInputW / strides[1])});
    nvinfer1::ISliceLayer* split23_1_0 =
            network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
                              nvinfer1::Dims3{kBatchSize, 64, (kObbInputH / strides[1]) * (kObbInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_1_1 = network->addSlice(
            *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
            nvinfer1::Dims3{kBatchSize, kObbNumClass, (kObbInputH / strides[1]) * (kObbInputW / strides[1])},
            nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_1 =
            DFL(network, weightMap, *split23_1_0->getOutput(0), 4,
                (kObbInputH / strides[1]) * (kObbInputW / strides[1]), 1, 1, 0, "model.23.dfl.conv.weight");

    nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0));
    shuffle23_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kObbNumClass, (kObbInputH / strides[2]) * (kObbInputW / strides[2])});
    nvinfer1::ISliceLayer* split23_2_0 =
            network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
                              nvinfer1::Dims3{kBatchSize, 64, (kObbInputH / strides[2]) * (kObbInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_2_1 = network->addSlice(
            *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
            nvinfer1::Dims3{kBatchSize, kObbNumClass, (kObbInputH / strides[2]) * (kObbInputW / strides[2])},
            nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_2 =
            DFL(network, weightMap, *split23_2_0->getOutput(0), 4,
                (kObbInputH / strides[2]) * (kObbInputW / strides[2]), 1, 1, 0, "model.23.dfl.conv.weight");

    // det0
    auto shuffle_conv16 =
            cv4_conv_combined(network, weightMap, *conv16->getOutput(0), "model.23.cv4.0",
                              (kObbInputH / strides[0]) * (kObbInputW / strides[0]), gw, "obb", max_channels);

    nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0),
                                                shuffle_conv16->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 3);
    cat23_dfl_0->setAxis(1);

    // det1
    auto shuffle_conv19 =
            cv4_conv_combined(network, weightMap, *conv19->getOutput(0), "model.23.cv4.1",
                              (kObbInputH / strides[1]) * (kObbInputW / strides[1]), gw, "obb", max_channels);
    nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0),
                                                shuffle_conv19->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 3);
    cat23_dfl_1->setAxis(1);

    // det2
    auto shuffle_conv22 =
            cv4_conv_combined(network, weightMap, *conv22->getOutput(0), "model.23.cv4.2",
                              (kObbInputH / strides[2]) * (kObbInputW / strides[2]), gw, "obb", max_channels);
    nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0),
                                                shuffle_conv22->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 3);
    cat23_dfl_2->setAxis(1);

    // yolo layer
    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2},
                         strides, stridesLength, false, false, true);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(1, kObbInputW, kObbInputH, kInputQuantizationFolder,
                                                  "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}


================================================
FILE: yolo11/src/postprocess.cpp
================================================
#include "postprocess.h"
#include "utils.h"

// Converts a box given in network-input coordinates (kInputW x kInputH canvas) into a
// cv::Rect expressed in the pixel coordinates of `img`.
//
// img  : image the box should be mapped onto (only cols/rows are read).
// bbox : {left, top, right, bottom} in network-input coordinates.
// Returns a rectangle whose origin is clamped to >= 0 and whose size is clamped so it does
// not extend past the right/bottom edge of `img` (never negative).
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    // Scale factors that would fit the image into the network canvas along each axis.
    const float scale_w = kInputW / (img.cols * 1.0);
    const float scale_h = kInputH / (img.rows * 1.0);

    float left = bbox[0];
    float top = bbox[1];
    float right = bbox[2];
    float bottom = bbox[3];

    if (scale_h > scale_w) {
        // Width is the limiting side: strip half of the unused vertical extent, then undo the scale.
        const auto pad_y = (kInputH - scale_w * img.rows) / 2;
        top = top - pad_y;
        bottom = bottom - pad_y;
        left = left / scale_w;
        right = right / scale_w;
        top = top / scale_w;
        bottom = bottom / scale_w;
    } else {
        // Height is the limiting side: strip half of the unused horizontal extent, then undo the scale.
        const auto pad_x = (kInputW - scale_h * img.cols) / 2;
        left = left - pad_x;
        right = right - pad_x;
        left = left / scale_h;
        right = right / scale_h;
        top = top / scale_h;
        bottom = bottom / scale_h;
    }

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Same mapping as get_rect, but relative to the OBB network canvas (kObbInputW x kObbInputH).
//
// img  : image the box should be mapped onto (only cols/rows are read).
// bbox : {left, top, right, bottom} in OBB network-input coordinates.
// Returns a rectangle in `img` pixel coordinates, origin clamped to >= 0 and size clamped so
// it does not extend past the right/bottom edge of `img`.
cv::Rect get_rect_obb(cv::Mat& img, float bbox[4]) {
    // Scale factors that would fit the image into the OBB canvas along each axis.
    const float scale_w = kObbInputW / (img.cols * 1.0);
    const float scale_h = kObbInputH / (img.rows * 1.0);

    float left = bbox[0];
    float top = bbox[1];
    float right = bbox[2];
    float bottom = bbox[3];

    if (scale_h > scale_w) {
        // Width is the limiting side: strip half of the unused vertical extent, then undo the scale.
        const auto pad_y = (kObbInputH - scale_w * img.rows) / 2;
        top = top - pad_y;
        bottom = bottom - pad_y;
        left = left / scale_w;
        right = right / scale_w;
        top = top / scale_w;
        bottom = bottom / scale_w;
    } else {
        // Height is the limiting side: strip half of the unused horizontal extent, then undo the scale.
        const auto pad_x = (kObbInputW - scale_h * img.cols) / 2;
        left = left - pad_x;
        right = right - pad_x;
        left = left / scale_h;
        right = right / scale_h;
        top = top / scale_h;
        bottom = bottom / scale_h;
    }

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Maps a box AND its keypoints from network-input coordinates to `img` pixel coordinates.
//
// img  : image the box should be mapped onto (only cols/rows are read).
// bbox : {left, top, right, bottom} in network-input coordinates (not modified).
// lmk  : kNumberOfPoints triplets; the first two values of each triplet are rewritten IN PLACE
//        with image coordinates, the third value is left untouched.
// Returns the box as a cv::Rect clamped the same way as get_rect.
// Note: because `lmk` is modified in place, calling this twice on the same data rescales twice.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
    const float scale_w = kInputW / (img.cols * 1.0);
    const float scale_h = kInputH / (img.rows * 1.0);

    float left, right, top, bottom;
    if (scale_h > scale_w) {
        // Width is the limiting side: only the y axis carries an offset.
        const auto pad_y = (kInputH - scale_w * img.rows) / 2;
        left = bbox[0] / scale_w;
        right = bbox[2] / scale_w;
        top = (bbox[1] - pad_y) / scale_w;
        bottom = (bbox[3] - pad_y) / scale_w;
        for (int p = 0; p < kNumberOfPoints; ++p) {
            float* point = lmk + 3 * p;
            point[0] /= scale_w;
            point[1] = (point[1] - pad_y) / scale_w;
            // point[2] is kept as is
        }
    } else {
        // Height is the limiting side: only the x axis carries an offset.
        const auto pad_x = (kInputW - scale_h * img.cols) / 2;
        left = (bbox[0] - pad_x) / scale_h;
        right = (bbox[2] - pad_x) / scale_h;
        top = bbox[1] / scale_h;
        bottom = bbox[3] / scale_h;
        for (int p = 0; p < kNumberOfPoints; ++p) {
            float* point = lmk + 3 * p;
            point[0] = (point[0] - pad_x) / scale_h;
            point[1] /= scale_h;
            // point[2] is kept as is
        }
    }

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Intersection-over-union of two axis-aligned boxes.
//
// lbox, rbox : boxes as {x1, y1, x2, y2} (indices 0/2 span x, indices 1/3 span y).
// Returns the ratio intersection / union, or 0 when the boxes do not overlap or when the
// union has no area (e.g. two degenerate zero-size boxes), so the result is never NaN.
static float iou(float lbox[4], float rbox[4]) {
    // Extent of the overlap region on each axis.
    const float x_lo = (std::max)(lbox[0], rbox[0]);
    const float x_hi = (std::min)(lbox[2], rbox[2]);
    const float y_lo = (std::max)(lbox[1], rbox[1]);
    const float y_hi = (std::min)(lbox[3], rbox[3]);

    if (y_lo > y_hi || x_lo > x_hi)
        return 0.0f;

    const float inter_area = (x_hi - x_lo) * (y_hi - y_lo);
    const float union_area =
            (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - inter_area;

    // Touching zero-area boxes pass the overlap test above but would give 0 / 0 here.
    if (union_area <= 0.0f)
        return 0.0f;

    return inter_area / union_area;
}

// Sort predicate for detections: higher confidence first; detections with equal confidence
// are ordered by ascending bbox[0].
static bool cmp(const Detection& a, const Detection& b) {
    return (a.conf == b.conf) ? (a.bbox[0] < b.bbox[0]) : (a.conf > b.conf);
}

void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4]))
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

// Runs nms() once per image of a batch.
//
// res_batch   : resized to batch_size; element b receives the detections of image b.
// output      : concatenated per-image buffers, image b starting at output + b * output_size.
// output_size : number of floats per image in `output`.
// conf_thresh, nms_thresh : forwarded unchanged to nms().
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = output + b * output_size;
        nms(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}

void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count) {
    Detection det;
    for (int i = 0; i < count; i++) {
        int basic_pos = 1 + i * bbox_element;
        int keep_flag = decode_ptr_host[basic_pos + 6];
        if (keep_flag == 1) {
            det.bbox[0] = decode_ptr_host[basic_pos + 0];
            det.bbox[1] = decode_ptr_host[basic_pos + 1];
            det.bbox[2] = decode_ptr_host[basic_pos + 2];
            det.bbox[3] = decode_ptr_host[basic_pos + 3];
            det.conf = decode_ptr_host[basic_pos + 4];
            det.class_id = decode_ptr_host[basic_pos + 5];
            res.push_back(det);
        }
    }
}

void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}

// Draws every detection of every image: a rectangle plus the integer class id as a label.
//
// img_batch : images to draw on.
// res_batch : res_batch[i] holds the detections of img_batch[i]; must have at least
//             img_batch.size() entries.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

    for (size_t i = 0; i < img_batch.size(); i++) {
        // cv::Mat copy; presumably a header-only copy sharing pixels with img_batch[i] (OpenCV semantics).
        cv::Mat img = img_batch[i];
        for (Detection& det : res_batch[i]) {
            const cv::Rect box = get_rect(img, det.bbox);
            cv::rectangle(img, box, box_color, 2);
            cv::putText(img, std::to_string((int)det.class_id), cv::Point(box.x, box.y - 1), cv::FONT_HERSHEY_PLAIN,
                        1.2, text_color, 2);
        }
    }
}

// Draws boxes, class ids, keypoints and the lines connecting keypoint pairs for every image.
//
// img_batch : images to draw on.
// res_batch : res_batch[i] holds the detections of img_batch[i]. The keypoints of each
//             detection are rescaled IN PLACE by get_rect_adapt_landmark.
//
// A keypoint is drawn only when the third value of its triplet exceeds 0.5; a line is drawn
// only when both of its end points pass that test.
// NOTE(review): the pair table indexes keypoints 0..16, i.e. it assumes kNumberOfPoints >= 17.
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    // Keypoint index pairs that are joined by a line.
    const std::vector<std::pair<int, int>> bones = {
            {0, 1}, {0, 2},  {0, 5}, {0, 6},  {1, 2},   {1, 3},   {2, 4},   {5, 6},   {5, 7},  {5, 11},
            {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};

    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);
    const cv::Scalar joint_color(0, 0x27, 0xC1);

    for (size_t i = 0; i < img_batch.size(); i++) {
        cv::Mat img = img_batch[i];
        for (Detection& det : res_batch[i]) {
            const cv::Rect box = get_rect_adapt_landmark(img, det.bbox, det.keypoints);
            cv::rectangle(img, box, box_color, 2);
            cv::putText(img, std::to_string((int)det.class_id), cv::Point(box.x, box.y - 1), cv::FONT_HERSHEY_PLAIN,
                        1.2, text_color, 2);

            // Keypoint dots.
            for (int p = 0; p < kNumberOfPoints; ++p) {
                const float* kp = det.keypoints + 3 * p;
                if (kp[2] > 0.5) {
                    cv::circle(img, cv::Point((int)kp[0], (int)kp[1]), 3, joint_color, -1);
                }
            }

            // Connecting lines.
            for (const auto& bone : bones) {
                const float* from = det.keypoints + bone.first * 3;
                const float* to = det.keypoints + bone.second * 3;
                if (from[2] > 0.5 && to[2] > 0.5) {
                    const cv::Point p1((int)from[0], (int)from[1]);
                    const cv::Point p2((int)to[0], (int)to[1]);
                    cv::line(img, p1, p2, joint_color, 2);
                }
            }
        }
    }
}

// Crops the part of a network-sized mask that corresponds to the image content and resizes
// it to the image size.
//
// mask : mask on the kInputW x kInputH canvas.
// img  : target image; only its size is used.
// Returns a new matrix of size img.size().
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    const float scale_w = kInputW / (img.cols * 1.0);
    const float scale_h = kInputH / (img.rows * 1.0);

    // Region of the canvas actually covered by the (scaled) image; the rest is padding.
    int roi_x, roi_y, roi_w, roi_h;
    if (scale_h > scale_w) {
        roi_w = kInputW;
        roi_h = scale_w * img.rows;  // truncated to int
        roi_x = 0;
        roi_y = (kInputH - roi_h) / 2;
    } else {
        roi_w = scale_h * img.cols;  // truncated to int
        roi_h = kInputH;
        roi_x = (kInputW - roi_w) / 2;
        roi_y = 0;
    }

    cv::Mat scaled;
    cv::resize(mask(cv::Rect(roi_x, roi_y, roi_w, roi_h)), scaled, img.size());
    return scaled;
}

// Overlays instance masks, boxes and "<label> <confidence>" captions on an image.
//
// img        : image drawn on in place; pixels are accessed as cv::Vec3b.
// dets       : detections to draw.
// masks      : masks[i] belongs to dets[i] and is read as float; assumes masks.size() >= dets.size().
// labels_map : class id -> label text. Looked up with operator[], so an unknown id inserts an
//              empty label into the map.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < dets.size(); i++) {
        const Detection& det = dets[i];
        cv::Mat img_mask = scale_mask(masks[i], img);

        // Palette entries are 0xRRGGBB; unpack into the B, G, R order used by the Scalar.
        const auto color = colors[(int)det.class_id % colors.size()];
        const auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

        cv::Rect r = get_rect(img, dets[i].bbox);

        // Blend the class colour 50/50 into every pixel of the box whose mask value exceeds 0.5.
        for (int x = r.x; x < r.x + r.width; x++) {
            for (int y = r.y; y < r.y + r.height; y++) {
                if (img_mask.at<float>(y, x) <= 0.5)
                    continue;
                cv::Vec3b& pixel = img.at<cv::Vec3b>(y, x);
                pixel[0] = pixel[0] / 2 + bgr[0] / 2;
                pixel[1] = pixel[1] / 2 + bgr[1] / 2;
                pixel[2] = pixel[2] / 2 + bgr[2] / 2;
            }
        }

        cv::rectangle(img, r, bgr, 2);

        // Caption: filled background sized from the text extent, then the text itself.
        const auto caption = labels_map[(int)det.class_id] + " " + to_string_with_precision(det.conf);
        const cv::Size text_size = cv::getTextSize(caption, cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);
        const cv::Point top_left(r.x, r.y - text_size.height);
        const cv::Point bottom_right(r.x + text_size.width, r.y + text_size.height);
        cv::rectangle(img, top_left, bottom_right, bgr, -1);

        cv::putText(img, caption, cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
    }
}

void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count) {
    Detection det;
    for (int i = 0; i < count; i++) {
        int basic_pos = 1 + i * bbox_element;
        int keep_flag = decode_ptr_host[basic_pos + 6];
        if (keep_flag == 1) {
            det.bbox[0] = decode_ptr_host[basic_pos + 0];
            det.bbox[1] = decode_ptr_host[basic_pos + 1];
            det.bbox[2] = decode_ptr_host[basic_pos + 2];
            det.bbox[3] = decode_ptr_host[basic_pos + 3];
            det.conf = decode_ptr_host[basic_pos + 4];
            det.class_id = decode_ptr_host[basic_pos + 5];
            det.angle = decode_ptr_host[basic_pos + 7];
            res.push_back(det);
        }
    }
}

void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}

// Computes the three terms (a, b, c) used by probiou() for one oriented box.
//
// res : detection whose bbox[2]/bbox[3] are read as width/height and whose `angle` is passed
//       to std::cos / std::sin (i.e. radians).
// Returns {a, b, c} with
//   a = w^2/12 * cos^2 + h^2/12 * sin^2
//   b = w^2/12 * sin^2 + h^2/12 * cos^2
//   c = (w^2/12 - h^2/12) * cos * sin
std::tuple<float, float, float> convariance_matrix(Detection res) {
    const float width = res.bbox[2];
    const float height = res.bbox[3];

    const float var_w = width * width / 12.0;
    const float var_h = height * height / 12.0;

    const float cos_r = std::cos(res.angle);
    const float sin_r = std::sin(res.angle);
    const float cos_sq = cos_r * cos_r;
    const float sin_sq = sin_r * sin_r;

    const float a_term = var_w * cos_sq + var_h * sin_sq;
    const float b_term = var_w * sin_sq + var_h * cos_sq;
    const float c_term = (var_w - var_h) * cos_r * sin_r;

    return std::make_tuple(a_term, b_term, c_term);
}

// Probabilistic IoU between two oriented boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
//
// res1, res2 : detections; bbox[0]/bbox[1] are used as centre x/y, and the remaining box
//              parameters enter through convariance_matrix().
// eps        : small constant added to denominators and logarithm/sqrt arguments.
// Returns 1 - hd, where hd is derived from the distance bd, which is clamped to [eps, 100].
static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) {
    // Initialise straight from the helper; no placeholder values are ever read.
    const std::tuple<float, float, float> cov1 = convariance_matrix(res1);
    const std::tuple<float, float, float> cov2 = convariance_matrix(res2);
    const float a1 = std::get<0>(cov1);
    const float b1 = std::get<1>(cov1);
    const float c1 = std::get<2>(cov1);
    const float a2 = std::get<0>(cov2);
    const float b2 = std::get<1>(cov2);
    const float c2 = std::get<2>(cov2);

    const float x1 = res1.bbox[0], y1 = res1.bbox[1];
    const float x2 = res2.bbox[0], y2 = res2.bbox[1];

    // Term shared by t1, t2 and t3; `auto` keeps the type the expression had inline.
    const auto denom = (a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2);

    float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) / (denom + eps);
    float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / (denom + eps);
    float t3 = std::log(
            denom / (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) +
                     eps) +
            eps);

    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    bd = std::max(std::min(bd, 100.0f), eps);
    float hd = std::sqrt(1.0 - std::exp(-bd) + eps);

    return 1 - hd;
}

void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {

        if (output[1 + det_size * i + 4] <= conf_thresh)
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (probiou(item, dets[n]) >= nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

// Runs nms_obb() once per image of a batch.
//
// res_batch   : resized to batch_size; element b receives the detections of image b.
// output      : concatenated per-image buffers, image b starting at output + b * output_size.
// output_size : number of floats per image in `output`.
// conf_thresh, nms_thresh : forwarded unchanged to nms_obb().
void batch_nms_obb(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = output + b * output_size;
        nms_obb(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}

// Computes the four corner points, in `img` pixel coordinates, of an oriented box.
//
// img : image the corners are mapped onto and clamped to.
// box : detection with bbox = {centre x, centre y, width, height} in OBB network-input
//       coordinates and `angle` in radians.
// Returns 4 points, each clamped to [0, cols-1] x [0, rows-1].
//
// NOTE(review): the three std::cout lines print on every call and look like leftover debug
// output; they are kept because removing them would change what the program writes to stdout.
static std::vector<cv::Point> get_corner(cv::Mat& img, const Detection& box) {
    const float center_x = box.bbox[0];
    const float center_y = box.bbox[1];
    float box_w = box.bbox[2];
    float box_h = box.bbox[3];
    float angle = box.angle * 180.0f / CV_PI;  // radians -> degrees

    std::cout << "Original angle: " << angle << std::endl;

    // Make the longer side the width; the orientation then shifts by 90 degrees.
    if (box_h >= box_w) {
        std::swap(box_w, box_h);
        angle = fmod(angle + 90.0f, 180.0f);
    }

    // Shift negative angles up by a full turn, then fold anything above 180 back by 180.
    if (angle < 0) {
        angle += 360.0f;
    }
    if (angle > 180.0f) {
        angle -= 180.0f;
    }

    std::cout << "Adjusted angle: " << angle << std::endl;

    // Only reported below; the corner computation uses `angle`.
    float normal_angle = fmod(angle, 180.0f);
    if (normal_angle < 0) {
        normal_angle += 180.0f;
    }

    std::cout << "Normal angle: " << normal_angle << std::endl;

    const float cos_value = std::cos(angle * CV_PI / 180.0f);  // degrees -> radians
    const float sin_value = std::sin(angle * CV_PI / 180.0f);

    // Axis-aligned extent of the (possibly swapped) box, rescaled to image coordinates.
    float bbox[4] = {center_x - box_w / 2, center_y - box_h / 2, center_x + box_w / 2, center_y + box_h / 2};
    const cv::Rect rect = get_rect_obb(img, bbox);

    // Integer division: the centre is truncated to whole pixels before conversion to float.
    const float x_ = (rect.x + rect.x + rect.width) / 2;
    const float y_ = (rect.y + rect.y + rect.height) / 2;
    const float width = rect.width;
    const float height = rect.height;

    // Half-extent vectors along the rotated width and height axes.
    const float vec1x = width / 2 * cos_value;
    const float vec1y = width / 2 * sin_value;
    const float vec2x = -height / 2 * sin_value;
    const float vec2y = height / 2 * cos_value;

    std::vector<cv::Point> corner_points = {
            cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y))),
            cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y))),
            cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y))),
            cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y))),
    };

    // Clamp every corner so it lies inside the image.
    for (cv::Point& corner : corner_points) {
        corner.x = std::max(0, std::min(corner.x, img.cols - 1));
        corner.y = std::max(0, std::min(corner.y, img.rows - 1));
    }

    return corner_points;
}

// Draws every oriented detection of every image: the box outline plus a "<class id>:<conf>"
// caption on a filled background anchored at the first corner point.
//
// img_batch : images to draw on (modified in place).
// res_batch : res_batch[i] holds the detections of img_batch[i]; must have at least
//             img_batch.size() entries.
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < img_batch.size(); i++) {
        cv::Mat& img = img_batch[i];
        for (Detection& obj : res_batch[i]) {
            // Palette entries are 0xRRGGBB; unpack into the B, G, R order used by the Scalar.
            const auto color = colors[(int)obj.class_id % colors.size()];
            const auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

            const auto corner_points = get_corner(img, obj);
            cv::polylines(img, std::vector<std::vector<cv::Point>>{corner_points}, true, bgr, 1);

            const auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf));
            const cv::Size text_size = cv::getTextSize(text, 0, 0.3, 1, nullptr);
            const int text_w = text_size.width;
            const int text_h = text_size.height;

            // Put the caption above the anchor when there is room, otherwise below it.
            const cv::Point anchor = corner_points[0];
            const bool outside = anchor.y - text_h >= 3;

            const cv::Point p1(anchor.x, anchor.y);
            const cv::Point p2(anchor.x + text_w, outside ? anchor.y - text_h - 3 : anchor.y + text_h + 3);
            cv::rectangle(img, p1, p2, bgr, -1, cv::LINE_AA);

            const cv::Point text_origin(anchor.x, outside ? anchor.y - 2 : anchor.y + text_h + 2);
            cv::putText(img, text, text_origin, 0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA);
        }
    }
}


================================================
FILE: yolo11/src/postprocess.cu
================================================
//
// Created by lindsay on 23-7-17.
//
#include "postprocess.h"
#include "types.h"

// Copies confident oriented-box predictions into a compact output array; one thread per
// candidate record.
//
// predict              : predict[0] is the number of candidate records; records of
//                        sizeof(Detection)/sizeof(float) floats follow.
// num_bboxes           : not used by this kernel.
// confidence_threshold : candidates with confidence below this value are skipped.
// parray               : parray[0] is an atomically incremented counter; accepted records are
//                        written from parray + 1 with a stride of bbox_element floats, as
//                        {cx, cy, width, height, confidence, label, keep_flag = 1, angle}.
// max_objects          : maximum number of records written; parray[0] may exceed it.
//
// The confidence test runs BEFORE a slot is reserved, so rejected candidates neither consume
// output capacity nor inflate the counter.
static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                         int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    // Reserve an output slot only for candidates that passed the filter.
    int index = atomicAdd(parray, 1.0f);
    if (index >= max_objects)
        return;

    //[center_x center_y w h conf class_id  mask[32] keypoints[51] angle]
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];
    float label = pitem[5];
    // Hard-coded offset of the angle given the layout above (4 + 1 + 1 + 32 + 51 = 89).
    float angle = pitem[89];

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = cx;
    *pout_item++ = cy;
    *pout_item++ = width;
    *pout_item++ = height;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
    *pout_item++ = angle;
}

// Copies confident box predictions into a compact output array; one thread per candidate
// record.
//
// predict              : predict[0] is the number of candidate records; records of
//                        sizeof(Detection)/sizeof(float) floats follow.
// num_bboxes           : not used by this kernel.
// confidence_threshold : candidates with confidence below this value are skipped.
// parray               : parray[0] is an atomically incremented counter; accepted records are
//                        written from parray + 1 with a stride of bbox_element floats, as
//                        {left, top, right, bottom, confidence, label, keep_flag = 1}.
// max_objects          : maximum number of records written; parray[0] may exceed it.
//
// The confidence test runs BEFORE a slot is reserved, so rejected candidates neither consume
// output capacity nor inflate the counter.
static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                     int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    // Reserve an output slot only for candidates that passed the filter.
    int index = atomicAdd(parray, 1.0f);
    if (index >= max_objects)
        return;

    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}

// Intersection-over-union of two axis-aligned boxes given as (left, top, right, bottom).
// Returns 0 when the boxes do not overlap.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
                                float bright, float bbottom) {
    // Extent of the overlap rectangle along each axis, clamped at zero for disjoint boxes.
    const float overlap_w = max(min(aright, bright) - max(aleft, bleft), 0.0f);
    const float overlap_h = max(min(abottom, bbottom) - max(atop, btop), 0.0f);
    const float intersection = overlap_w * overlap_h;
    if (intersection == 0.0f)
        return 0.0f;

    const float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    const float union_area = area_a + area_b - intersection;
    return intersection / union_area;
}

// Non-maximum suppression over the records stored in `bboxes`.
//
// bboxes[0] is the record count (clamped to `max_objects`); record k starts at
// bboxes + 1 + k * bbox_element. Each thread owns one record and clears its keep flag
// (element 6) when a same-class record that ranks higher overlaps it by more than `threshold`.
// "Ranks higher" means a larger confidence (element 4), or an equal confidence and a larger index.
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {
    const int self_idx = (blockDim.x * blockIdx.x + threadIdx.x);
    const int total = min(static_cast<int>(bboxes[0]), max_objects);
    if (self_idx >= total)
        return;

    float* self = bboxes + 1 + self_idx * bbox_element;
    for (int other_idx = 0; other_idx < total; ++other_idx) {
        if (other_idx == self_idx)
            continue;

        float* other = bboxes + 1 + other_idx * bbox_element;
        // Only boxes of the same class compete with each other.
        if (self[5] != other[5])
            continue;
        // Skip candidates that do not score at least as high as this box.
        if (!(other[4] >= self[4]))
            continue;
        // Tie on confidence: the box with the larger index wins, so ignore smaller indices.
        if (other[4] == self[4] && other_idx < self_idx)
            continue;

        const float overlap = box_iou(self[0], self[1], self[2], self[3], other[0], other[1], other[2], other[3]);
        if (overlap > threshold) {
            self[6] = 0;
            return;
        }
    }
}

// Computes the three distinct entries (a, b, c) of the 2x2 covariance matrix used by
// box_probiou for a box of size (w, h) rotated by angle r (radians, as passed to cosf/sinf).
// Results are returned through the reference parameters.
static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) {
    const float var_w = w * w / 12.0f;
    const float var_h = h * h / 12.0f;
    const float cos_r = cosf(r);
    const float sin_r = sinf(r);

    const float out_a = var_w * cos_r * cos_r + var_h * sin_r * sin_r;
    const float out_b = var_w * sin_r * sin_r + var_h * cos_r * cos_r;
    const float out_c = (var_w - var_h) * sin_r * cos_r;

    a = out_a;
    b = out_b;
    c = out_c;
}

// Similarity score between two oriented boxes, each given as (cx, cy, w, h, r).
// The score is 1 - hd, where hd is derived from the distance bd computed below;
// `eps` is added to denominators and to the log/sqrt arguments to keep them away from zero.
static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2,
                                    float h2, float r2, float eps = 1e-7) {

    // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
    // (a, b, c) are the covariance entries of each box.
    float a1, b1, c1, a2, b2, c2;
    convariance_matrix(w1, h1, r1, a1, b1, c1);
    convariance_matrix(w2, h2, r2, a2, b2, c2);

    // t1 and t2 share the denominator (a1 + a2) * (b1 + b2) - (c1 + c2)^2 + eps.
    float t1 = ((a1 + a2) * powf(cy1 - cy2, 2) + (b1 + b2) * powf(cx1 - cx2, 2)) /
               ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
    float t2 = ((c1 + c2) * (cx2 - cx1) * (cy1 - cy2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
    // Log-ratio term; the per-box terms a*b - c^2 are clamped at zero before the square root.
    float t3 = logf(((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2)) /
                            (4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) * sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) + eps) +
                    eps);
    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    // Clamp bd to [eps, 100] before exponentiating.
    bd = fmaxf(fminf(bd, 100.0f), eps);
    float hd = sqrtf(1.0f - expf(-bd) + eps);
    return 1 - hd;
}

// Non-maximum suppression for oriented boxes stored in `bboxes`.
//
// bboxes[0] is the record count (clamped to `max_objects`); record k starts at
// bboxes + 1 + k * bbox_element, with the rotation angle in element 7. Each thread owns one
// record and clears its keep flag (element 6) when a same-class record that ranks higher has a
// box_probiou score above `threshold`. "Ranks higher" means a larger confidence (element 4),
// or an equal confidence and a larger index.
static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) {
    const int self_idx = (blockDim.x * blockIdx.x + threadIdx.x);
    const int total = min(static_cast<int>(bboxes[0]), max_objects);
    if (self_idx >= total)
        return;

    float* self = bboxes + 1 + self_idx * bbox_element;
    for (int other_idx = 0; other_idx < total; ++other_idx) {
        if (other_idx == self_idx)
            continue;

        float* other = bboxes + 1 + other_idx * bbox_element;
        // Only boxes of the same class compete with each other.
        if (self[5] != other[5])
            continue;
        // Skip candidates that do not score at least as high as this box.
        if (!(other[4] >= self[4]))
            continue;
        // Tie on confidence: the box with the larger index wins, so ignore smaller indices.
        if (other[4] == self[4] && other_idx < self_idx)
            continue;

        const float overlap = box_probiou(self[0], self[1], self[2], self[3], self[7], other[0], other[1],
                                          other[2], other[3], other[7]);
        if (overlap > threshold) {
            self[6] = 0;
            return;
        }
    }
}

// Launches decode_kernel on `stream` with one thread per entry of `num_bboxes`.
// Does nothing when `num_bboxes` is not positive (a zero-sized grid is an invalid launch).
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream) {
    if (num_bboxes <= 0)
        return;

    const int block = 256;
    // Integer ceiling division; avoids the float round-trip of ceil(n / (float)block).
    const int grid = (num_bboxes + block - 1) / block;
    decode_kernel<<<grid, block, 0, stream>>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects);
}

// Launches nms_kernel on `stream` with one thread per possible record (`max_objects`).
// Does nothing when `max_objects` is not positive: the original computed a block size of 0
// and a 0/0 grid size in that case.
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    if (max_objects <= 0)
        return;

    const int block = max_objects < 256 ? max_objects : 256;
    // Integer ceiling division; avoids the float round-trip of ceil(n / (float)block).
    const int grid = (max_objects + block - 1) / block;
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}

// Launches decode_kernel_obb on `stream` with one thread per entry of `num_bboxes`.
// Does nothing when `num_bboxes` is not positive (a zero-sized grid is an invalid launch).
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream) {
    if (num_bboxes <= 0)
        return;

    const int block = 256;
    // Integer ceiling division; avoids the float round-trip of ceil(n / (float)block).
    const int grid = (num_bboxes + block - 1) / block;
    decode_kernel_obb<<<grid, block, 0, stream>>>((float*)predict, num_bboxes, confidence_threshold, parray,
                                                  max_objects);
}

// Launches nms_kernel_obb on `stream` with one thread per possible record (`max_objects`).
// Does nothing when `max_objects` is not positive: the original computed a block size of 0
// and a 0/0 grid size in that case.
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    if (max_objects <= 0)
        return;

    const int block = max_objects < 256 ? max_objects : 256;
    // Integer ceiling division; avoids the float round-trip of ceil(n / (float)block).
    const int grid = (max_objects + block - 1) / block;
    nms_kernel_obb<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}


================================================
FILE: yolo11/src/preprocess.cu
================================================
#include "cuda_utils.h"
#include "preprocess.h"

static uint8_t* img_buffer_host = nullptr;
static uint8_t* img_buffer_device = nullptr;

// Affine-warps an interleaved 3-channel 8-bit image into a planar float image.
//
// Each thread produces one destination pixel (`position` in [0, edge)). The destination
// coordinate is mapped to a source coordinate through the 2x3 matrix `d2s`, the source is
// sampled with bilinear interpolation (pixels outside the source use `const_value_st`),
// channels 0 and 2 are swapped, values are divided by 255 and written as three consecutive
// planes of dst_width * dst_height floats.
//
// src            : source pixels, 3 bytes per pixel, `src_line_size` bytes per row
// src_width/height: source size in pixels
// dst            : output buffer holding 3 * dst_width * dst_height floats
// const_value_st : fill value for samples that fall outside the source
// d2s            : destination-to-source affine coefficients (6 floats, row-major 2x3)
// edge           : number of destination pixels to process
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    // Unpack the 2x3 matrix: row 1 = (m_x1, m_y1, m_z1), row 2 = (m_x2, m_y2, m_z2).
    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Destination pixel handled by this thread.
    int dx = position % dst_width;
    int dy = position / dst_width;
    // Source-space sample position; the 0.5f offset is applied after the affine mapping.
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // Integer corners of the 2x2 neighbourhood around the sample position.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        // Fractional offsets and the four bilinear weights.
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Each corner defaults to the fill colour and is redirected into `src` only when the
        // corner lies inside the source image.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        // Weighted sum of the four corners, per channel.
        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

// Uploads one interleaved 3-channel 8-bit image and warps it into `dst` on `stream`.
//
// The image is staged through the file-level host buffer, copied to the file-level device
// buffer, and then resampled by warpaffine_kernel using a uniform scale that fits the source
// inside dst_width x dst_height, centred, with fill value 128 outside the source.
// The rows of `src` are read as tightly packed (src_width * 3 bytes each).
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    const int byte_count = src_width * src_height * 3;

    // Stage through the host buffer first, then start the asynchronous upload.
    memcpy(img_buffer_host, src, byte_count);
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, byte_count, cudaMemcpyHostToDevice, stream));

    // Largest uniform scale at which the whole source still fits in the destination.
    const float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    // Source-to-destination transform: scale, then centre inside the destination.
    AffineMatrix src_to_dst;
    src_to_dst.value[0] = scale;
    src_to_dst.value[1] = 0;
    src_to_dst.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    src_to_dst.value[3] = 0;
    src_to_dst.value[4] = scale;
    src_to_dst.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel needs the inverse mapping (destination -> source).
    AffineMatrix dst_to_src;
    cv::Mat forward(2, 3, CV_32F, src_to_dst.value);
    cv::Mat inverse(2, 3, CV_32F, dst_to_src.value);
    cv::invertAffineTransform(forward, inverse);
    memcpy(dst_to_src.value, inverse.ptr<float>(0), sizeof(dst_to_src.value));

    // One thread per destination pixel.
    const int pixel_count = dst_height * dst_width;
    const int threads_per_block = 256;
    const int block_count = ceil(pixel_count / (float)threads_per_block);
    warpaffine_kernel<<<block_count, threads_per_block, 0, stream>>>(img_buffer_device, src_width * 3, src_width,
                                                                     src_height, dst, dst_width, dst_height, 128,
                                                                     dst_to_src, pixel_count);
}

// Preprocesses every image of `img_batch` into consecutive slots of `dst`.
// Slot i starts at dst + i * (3 * dst_width * dst_height). The stream is synchronized after
// each image, before the next image is staged.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int floats_per_image = dst_width * dst_height * 3;
    size_t slot = 0;
    for (cv::Mat& image : img_batch) {
        float* slot_begin = &dst[floats_per_image * slot];
        cuda_preprocess(image.ptr(), image.cols, image.rows, slot_begin, dst_width, dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        ++slot;
    }
}

// Allocates the file-level staging buffers used by cuda_preprocess:
// max_image_size * 3 bytes of pinned host memory and the same amount of device memory.
void cuda_preprocess_init(int max_image_size) {
    const int buffer_bytes = max_image_size * 3;

    // Pinned host staging buffer.
    CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, buffer_bytes));
    // Matching device-side buffer.
    CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, buffer_bytes));
}

// Releases the staging buffers allocated by cuda_preprocess_init and clears the file-level
// pointers so they no longer refer to freed memory after this call.
void cuda_preprocess_destroy() {
    CUDA_CHECK(cudaFree(img_buffer_device));
    img_buffer_device = nullptr;
    CUDA_CHECK(cudaFreeHost(img_buffer_host));
    img_buffer_host = nullptr;
}


================================================
FILE: yolo11/yolo11_cls.cpp
================================================
#include "calibrator.h"
#include "config.h"
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "utils.h"

#include <chrono>
#include <cmath>
#include <iostream>
#include <numeric>
#include <opencv2/opencv.hpp>

using namespace nvinfer1;

static Logger gLogger;
const static int kOutputSize = kClsNumClass;

// Converts a batch of BGR images into a planar (NCHW) float buffer.
//
// For every image: crop the centred square, resize it to dst_width x dst_height, convert
// BGR -> RGB, scale to [0, 1], and write the three channel planes to
// output + b * 3 * dst_height * dst_width.
//
// imgs      : input images; they are only read (the original in-place resize/cvtColor could
//             overwrite the caller's pixels when the crop already had the target size)
// output    : buffer with room for imgs.size() * 3 * dst_height * dst_width floats
void batch_preprocess(std::vector<cv::Mat>& imgs, float* output, int dst_width = 224, int dst_height = 224) {
    const size_t plane = static_cast<size_t>(dst_width) * static_cast<size_t>(dst_height);

    for (size_t b = 0; b < imgs.size(); b++) {
        const cv::Mat& src = imgs[b];
        const int side = std::min(src.rows, src.cols);
        const cv::Rect center_square((src.cols - side) / 2, (src.rows - side) / 2, side, side);

        // Every stage writes to its own Mat, so no stage can alias the caller's image.
        cv::Mat resized, rgb, normalized;
        cv::resize(src(center_square), resized, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR);
        cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);
        rgb.convertTo(normalized, CV_32F, 1 / 255.0);

        // Wrap the three destination planes in Mat headers and let cv::split write the
        // channels straight into the output buffer (CHW layout).
        float* sample = output + b * 3 * plane;
        std::vector<cv::Mat> planes;
        for (int c = 0; c < 3; ++c) {
            planes.push_back(cv::Mat(dst_height, dst_width, CV_32FC1, sample + c * plane));
        }
        cv::split(normalized, planes);
    }
}

// Numerically stable softmax over the first `n` values of `prob`.
//
// The maximum input is subtracted before exponentiation so that large logits do not overflow
// expf (which would turn the result into NaN). Returns an empty vector when `prob` is null
// or `n` is not positive.
std::vector<float> softmax(float* prob, int n) {
    std::vector<float> res;
    if (prob == nullptr || n <= 0) {
        return res;
    }
    res.reserve(n);

    float max_value = prob[0];
    for (int i = 1; i < n; i++) {
        if (prob[i] > max_value) {
            max_value = prob[i];
        }
    }

    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        const float t = expf(prob[i] - max_value);
        res.push_back(t);
        sum += t;
    }
    // sum >= 1 for finite input because the maximum element contributes exp(0) == 1.
    for (int i = 0; i < n; i++) {
        res[i] /= sum;
    }
    return res;
}

// Returns the indices of the `k` largest values of `vec`, ordered from largest to smallest.
//
// Equal values are ordered by ascending index, so the result is deterministic. If `k` exceeds
// vec.size() all indices are returned; a non-positive `k` yields an empty vector. Only the
// first k positions are sorted (std::partial_sort) instead of the whole index array.
std::vector<int> topk(const std::vector<float>& vec, int k) {
    std::vector<size_t> order(vec.size());
    std::iota(order.begin(), order.end(), 0);

    const size_t wanted = k > 0 ? static_cast<size_t>(k) : 0;
    const size_t keep = std::min(vec.size(), wanted);

    std::partial_sort(order.begin(), order.begin() + keep, order.end(),
                      [&vec](size_t lhs, size_t rhs) -> bool {
                          if (vec[lhs] != vec[rhs])
                              return vec[lhs] > vec[rhs];
                          return lhs < rhs;
                      });

    std::vector<int> topk_index;
    topk_index.reserve(keep);
    for (size_t i = 0; i < keep; ++i) {
        topk_index.push_back(static_cast<int>(order[i]));
    }
    return topk_index;
}

// Reads a label file and returns one entry per line, in file order.
//
// A trailing '\r' is removed from each line so that label files saved with CRLF line endings
// do not carry a carriage return into the printed class names. If the file cannot be opened
// an error is printed and assert(0) fires; with assertions disabled an empty list is returned.
std::vector<std::string> read_classes(std::string file_name) {
    std::vector<std::string> classes;
    std::ifstream ifs(file_name, std::ios::in);
    if (!ifs.is_open()) {
        std::cerr << file_name << " is not found, pls refer to README and download it." << std::endl;
        assert(0);
    }
    std::string s;
    while (std::getline(ifs, s)) {
        if (!s.empty() && s[s.size() - 1] == '\r') {
            s.erase(s.size() - 1);
        }
        classes.push_back(s);
    }
    return classes;
}

// Parses the command line of the classification tool.
//
//   -s <wts> <engine> <n|s|m|l|x>   fills wts, engine, gd, gw, max_channels and type
//   -d <engine> <image dir>         fills engine and img_dir
//
// Only the first character of the model argument is examined. Returns false for any other
// argument shape or an unknown model letter; in the latter case wts and engine have already
// been assigned.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw,
                std::string& img_dir, std::string& type, int& max_channels) {
    // Depth multiple, width multiple and channel cap for every supported model size.
    struct ModelSize {
        char letter;
        float depth;
        float width;
        int channels;
    };
    static const ModelSize kModelSizes[] = {
            {'n', 0.50f, 0.25f, 1024}, {'s', 0.50f, 0.50f, 1024}, {'m', 0.50f, 1.00f, 512},
            {'l', 1.00f, 1.00f, 512},  {'x', 1.00f, 1.50f, 512},
    };

    if (argc < 4)
        return false;

    const std::string mode(argv[1]);

    if (mode == "-d" && argc == 4) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        return true;
    }

    if (mode != "-s" || argc != 5)
        return false;

    wts = std::string(argv[2]);
    engine = std::string(argv[3]);
    const std::string net(argv[4]);
    for (const ModelSize& size : kModelSizes) {
        if (net[0] != size.letter)
            continue;
        gd = size.depth;
        gw = size.width;
        max_channels = size.channels;
        type = std::string(1, size.letter);
        return true;
    }
    return false;
}

// Allocates the device and host buffers for one batch of classification input and output.
//
// gpu_input_buffer / gpu_output_buffer receive cudaMalloc'ed device pointers;
// cpu_input_buffer / output_buffer_host receive new[]-allocated host arrays.
// Sizes are kBatchSize * 3 * kClsInputH * kClsInputW floats for the input and
// kBatchSize * kOutputSize floats for the output. The engine is asserted to expose exactly
// two bindings, input at index 0 and output at index 1.
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer,
                     float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);

    // Look the bindings up by tensor name and check they sit where the caller expects them.
    const int input_binding = engine->getBindingIndex(kInputTensorName);
    const int output_binding = engine->getBindingIndex(kOutputTensorName);
    assert(input_binding == 0);
    assert(output_binding == 1);

    const int input_floats = kBatchSize * 3 * kClsInputH * kClsInputW;
    const int output_floats = kBatchSize * kOutputSize;

    // Device side.
    CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, input_floats * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, output_floats * sizeof(float)));

    // Host side.
    *cpu_input_buffer = new float[input_floats];
    *output_buffer_host = new float[output_floats];
}

// Runs one batch through the engine on `stream` and blocks until the output is on the host.
//
// input   : host buffer with batchSize * 3 * kClsInputH * kClsInputW floats
// output  : host buffer receiving batchSize * kOutputSize floats
// buffers : device bindings; buffers[0] is used as input, buffers[1] as output
//
// A failed enqueue is reported on stderr (the output buffer then holds no valid result), and
// the final synchronize goes through CUDA_CHECK like the other CUDA calls in this function.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output,
           int batchSize) {
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "enqueueV2 failed, inference output is not valid" << std::endl;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                               stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

void serialize_engine(float& gd, float& gw, std::string& wts_name, std::string& engine_name, std::string& type,
                      int max_channels) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    // Create model to populate the network, then set the outputs and create an engine
    IHostMemory* serialized_engine = nullptr;
    //engine = buildEngineYolo11Cls(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
    serialized_engine = buildEngineYolo11Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw, type, max_channels);
    assert(serialized_engine);
    // Save engine to file
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "Could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    // Close everything down
    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

int main(int argc, char** argv) {
    // yolo11_cls -s ../models/yolo11n-cls.wts ../models/yolo11n-cls.fp32.trt n
    // yolo11_cls -d ../models/yolo11n-cls.fp32.trt ../images
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    float gd = 0.0f, gw = 0.0f;
    std::string img_dir;
    std::string type;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir, type, max_channels)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolo11_cls -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolo11_cls -d [.engine] ../images  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(gd, gw, wts_name, engine_name, type, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* cpu_input_buffer = nullptr;
    float* output_buffer_host = nullptr;
    prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host);

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // Read imagenet labels
    auto classes = read_classes("imagenet_classes.txt");

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }

        // Preprocess
        batch_preprocess(img_batch, cpu_input_buffer);

        // Run inference
        auto start = std::chrono::system_clock::now();
        infer(*context, stream, (void**)device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize);
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;

        // Postprocess and get top-k result
        for (size_t b = 0; b < img_name_batch.size(); b++) {
            float* p = &output_buffer_host[b * kOutputSize];
            auto res = softmax(p, kOutputSize);
            auto topk_idx = topk(res, 3);
            std::cout << img_name_batch[b] << std::endl;
            for (auto idx : topk_idx) {
                std::cout << "  " << classes[idx] << " " << res[idx] << std::endl;
            }
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    delete[] cpu_input_buffer;
    delete[] output_buffer_host;
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
    return 0;
}


================================================
FILE: yolo11/yolo11_cls_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import os
import shutil
import sys
import threading
import time
import cv2
import numpy as np
import torch
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path into lists of
                 at most batch_size entries, in the order os.walk reports them.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory to scan
    return:
        list of lists of file paths; the last batch may be shorter
    """
    all_paths = (
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(img_dir)
        for file_name in file_names
    )

    batches = []
    current = []
    for path in all_paths:
        # Close the running batch as soon as it is full, then start a new one.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)

    if len(current) > 0:
        batches.append(current)
    return batches


# Class labels, one per line of imagenet_classes.txt, with surrounding whitespace removed.
with open("imagenet_classes.txt") as label_file:
    classes = [label.strip() for label in label_file]


class YoLo11TRT(object):
    """
    description: A YOLO11 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and allocate one
                     page-locked host buffer and one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []
        self.mean = (0.485, 0.456, 0.406)
        self.std = (0.229, 0.224, 0.225)

        for binding in engine:
            print('binding:', binding, engine.get_binding_shape(binding))
            # First dimension of the binding shape is taken as the batch size.
            self.batch_size = engine.get_binding_shape(binding)[0]
            size = trt.volume(engine.get_binding_shape(
                binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings

    def infer(self, raw_image_generator):
        """
        description: Preprocess the given images, run the engine once and draw the
                     top-1 class name on every input image.
        param:
            raw_image_generator: iterable of BGR images (at most batch_size of them)
        return:
            batch_image_raw: list of the input images with the label drawn on them
            use_time:        float, seconds spent on upload, inference and download
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess. Zero-filled so that unused slots of a partial batch
            # hold defined values.
            batch_image_raw = []
            batch_input_image = np.zeros(
                shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                batch_image_raw.append(image_raw)
                # Resize to the engine's input size rather than a fixed 224x224.
                input_image = self.preprocess_cls_image(
                    image_raw, self.input_w, self.input_h)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it,
            # even when preprocessing or inference raised.
            self.ctx.pop()

        output = host_outputs[0]
        # Do postprocess once for the whole batch, then label each image that was
        # actually supplied (the last batch may hold fewer than batch_size images).
        classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls(output)
        for i, image in enumerate(batch_image_raw):
            cv2.putText(image, str(classes_ls[i]), (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
            print(classes_ls[i], predicted_conf_ls[i])
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_cls_image(self, raw_bgr_image, dst_width=224, dst_height=224):
        """
            description: Crop the center square of a BGR image,
                         resize it to the target size, convert it to RGB,
                         normalize to [0,1] and reorder it to channel-first layout.
            param:
                raw_bgr_image: numpy array, raw BGR image (H, W, 3)
                dst_width: int, target image width
                dst_height: int, target image height
            return:
                batch_data: float32 numpy array of shape
                            (1, 1, 3, dst_height, dst_width); the two leading
                            singleton axes come from the two expand_dims calls below
        """
        h, w, c = raw_bgr_image.shape
        # Crop the center square frame
        m = min(h, w)
        top = (h - m) // 2
        left = (w - m) // 2
        image = raw_bgr_image[top:top + m, left:left + m]

        # Resize the square crop to the target size
        image = cv2.resize(image, (dst_width, dst_height), interpolation=cv2.INTER_LINEAR)

        # Convert BGR to RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Normalize to [0,1]
        image = image.astype(np.float32) / 255.0

        # HWC to CHW format
        image = image.transpose(2, 0, 1)

        # CHW to NCHW format (add batch dimension)
        image = np.expand_dims(image, axis=0)

        # Convert the image to row-major order, also known as "C order"
        image = np.ascontiguousarray(image)

        # Kept for backward compatibility: callers receive a second leading axis.
        batch_data = np.expand_dims(image, axis=0)

        return batch_data

    def postprocess_cls(self, output_data):
        """
        description: Apply softmax to the raw engine output and pick the best class
                     for every row of the batch.
        param:
            output_data: flat numpy array holding batch_size rows of class scores
        return:
            classes_ls:        list of class names (top-1 per row)
            predicted_conf_ls: list of top-1 probabilities
            category_id_ls:    list of top-1 class indices
        """
        classes_ls = []
        predicted_conf_ls = []
        category_id_ls = []
        output_data = output_data.reshape(self.batch_size, -1)
        output_data = torch.Tensor(output_data)
        p = torch.nn.functional.softmax(output_data, dim=1)
        # Never ask for more entries than there are classes.
        score, index = torch.topk(p, min(3, p.shape[1]))
        for ind in range(index.shape[0]):
            input_category_id = index[ind][0].item()  # 716
            category_id_ls.append(input_category_id)
            predicted_confidence = score[ind][0].item()
            predicted_conf_ls.append(predicted_confidence)
            classes_ls.append(classes[input_category_id])
        return classes_ls, predicted_conf_ls, category_id_ls


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and writes
                 the annotated images into the output/ directory.
    """

    def __init__(self, yolo11_wrapper, image_path_batch):
        super(inferThread, self).__init__()
        self.yolo11_wrapper = yolo11_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolo11_wrapper
        images = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(images)

        # Save every annotated image under its original file name.
        for position, img_path in enumerate(self.image_path_batch):
            save_name = os.path.join('output', os.path.basename(img_path))
            cv2.imwrite(save_name, batch_image_raw[position])

        elapsed_ms = use_time * 1000
        print('input->{}, time->{:.2f}ms, saving into output/'.format(
            self.image_path_batch, elapsed_ms))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on all-zero images to warm up the engine.
    """

    def __init__(self, yolo11_wrapper):
        super(warmUpThread, self).__init__()
        self.yolo11_wrapper = yolo11_wrapper

    def run(self):
        wrapper = self.yolo11_wrapper
        blank_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(blank_images)

        first_shape = batch_image_raw[0].shape
        elapsed_ms = use_time * 1000
        print(
            'warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed_ms))


if __name__ == "__main__":
    # Engine path: first command line argument when given, otherwise the default file.
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "./yolo11x-cls-fp32.engine"

    # Start from an empty output directory.
    output_dir = 'output/'
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # a YoLo11TRT instance
    yolo11_wrapper = YoLo11TRT(engine_file_path)
    try:
        print('batch size is', yolo11_wrapper.batch_size)

        image_path_batches = get_img_path_batches(
            yolo11_wrapper.batch_size, "images/")

        # Ten warm-up runs, each in its own thread and joined before the next one starts.
        for _ in range(10):
            warm_up = warmUpThread(yolo11_wrapper)
            warm_up.start()
            warm_up.join()

        # One inference thread per batch, run one after another.
        for path_batch in image_path_batches:
            worker = inferThread(yolo11_wrapper, path_batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolo11_wrapper.destroy()


================================================
FILE: yolo11/yolo11_det.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

Logger gLogger;
using namespace nvinfer1;
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

void serialize_engine(std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels,
                      std::string& type) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    serialized_engine = buildEngineYolo11Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Allocates the device input/output buffers and, depending on the post-processing
// mode, either the host output buffer ("c") or the host/device decode buffers ("g").
// Any other mode allocates only the two device buffers.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    // The engine is expected to expose exactly one input (index 0) and one output (index 1).
    assert(engine->getNbBindings() == 2);
    const int input_idx = engine->getBindingIndex(kInputTensorName);
    const int output_idx = engine->getBindingIndex(kOutputTensorName);
    assert(input_idx == 0);
    assert(output_idx == 1);

    // Device buffers sized for a full batch.
    const size_t input_bytes = kBatchSize * 3 * kInputH * kInputW * sizeof(float);
    const size_t output_bytes = kBatchSize * kOutputSize * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, input_bytes));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, output_bytes));

    const bool cpu_post = (cuda_post_process == "c");
    const bool gpu_post = (cuda_post_process == "g");
    if (cpu_post) {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (gpu_post) {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(0);
        }
        // One leading float followed by kMaxNumOutputBbox records of bbox_element floats,
        // allocated once on the host and once on the device.
        const auto decode_floats = 1 + kMaxNumOutputBbox * bbox_element;
        *decode_ptr_host = new float[decode_floats];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * decode_floats));
    }
}

// Runs one forward pass on `stream` and brings the results back to the host.
//
//   cuda_post_process == "c": copies the raw engine output (batchsize * kOutputSize
//                             floats from buffers[1]) into `output`.
//   cuda_post_process == "g": clears decode_ptr_device, runs cuda_decode + cuda_nms on
//                             buffers[1] and copies the decoded boxes into
//                             decode_ptr_host.
//   any other value:          only enqueues the engine and synchronizes.
//
// The function returns after the stream has been synchronized, so the host buffers
// are valid on return. The reported time is taken AFTER the synchronization; all
// work above is asynchronous, so stopping the clock earlier would only measure how
// long it takes to enqueue the work, not to execute it.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    const bool cpu_post = (cuda_post_process == "c");
    const bool gpu_post = (cuda_post_process == "g");

    // steady_clock is monotonic, which is what an interval measurement needs.
    const auto start = std::chrono::steady_clock::now();

    // infer on the batch asynchronously, and DMA output back to host
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "enqueueV2 failed, the output buffers may hold stale data" << std::endl;
    }

    if (cpu_post) {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
    } else if (gpu_post) {
        const size_t decode_bytes = sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element);
        CUDA_CHECK(cudaMemsetAsync(decode_ptr_device, 0, decode_bytes, stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, decode_bytes, cudaMemcpyDeviceToHost, stream));
    }

    // Wait for everything queued above before stopping the clock.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    const auto end = std::chrono::steady_clock::now();
    const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
    if (cpu_post) {
        std::cout << "inference time: " << elapsed_ms << "ms" << std::endl;
    } else if (gpu_post) {
        std::cout << "inference and gpu postprocess time: " << elapsed_ms << "ms" << std::endl;
    }
}

// Parses the command line.
//
//   -s <wts> <engine> <n|s|m|l|x>   serialize: fills wts, engine, type and the scaling
//                                   parameters gd, gw and max_channels chosen from the
//                                   first character of the sub-type argument.
//   -d <engine> <img_dir> <c|g>     deserialize: fills engine, img_dir and
//                                   cuda_post_process.
//
// Returns false for any other argument layout, for an unknown model sub-type, and for
// a post-processing mode other than "c" or "g" (the only two modes the rest of the
// program handles).
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    // Both supported forms take exactly four arguments after the program name.
    if (argc != 5)
        return false;

    const std::string mode(argv[1]);
    if (mode == "-s") {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string sub_type(argv[4]);

        // Only the first character selects the model size; an empty argument is rejected.
        const char size_key = sub_type.empty() ? '\0' : sub_type[0];
        switch (size_key) {
            case 'n':
                gd = 0.50;
                gw = 0.25;
                max_channels = 1024;
                type = "n";
                break;
            case 's':
                gd = 0.50;
                gw = 0.50;
                max_channels = 1024;
                type = "s";
                break;
            case 'm':
                gd = 0.50;
                gw = 1.00;
                max_channels = 512;
                type = "m";
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                max_channels = 512;
                type = "l";
                break;
            case 'x':
                gd = 1.0;
                gw = 1.50;
                max_channels = 512;
                type = "x";
                break;
            default:
                return false;
        }
        return true;
    }

    if (mode == "-d") {
        const std::string post_mode(argv[4]);
        // Reject unknown modes here instead of letting inference run without any
        // post-processing buffers.
        if (post_mode != "c" && post_mode != "g")
            return false;
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = post_mode;
        return true;
    }

    return false;
}

int main(int argc, char** argv) {
    // yolo11_det -s ../models/yolo11n.wts ../models/yolo11n.fp32.trt n
    // yolo11_det -d ../models/yolo11n.fp32.trt ../images c
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string cuda_post_process;
    std::string type;
    int model_bboxes;
    float gd = 0, gw = 0;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolo11_det -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolo11_det -d [.engine] ../images  [c/g]// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, gd, gw, max_channels, type);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        // 保存output_buffer_host的前100个值，一行一个
        //        std::ofstream out("../models/output.txt");
        //        for (int j = 0; j < 100; j++) {
        //            out << output_buffer_host[j] << std::endl;
        //        }
        //        out.close();
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            //Process gpu decode and nms results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: yolo11/yolo11_det_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Minimum confidence for a detection to survive filtering
# (passed to non_max_suppression as conf_thres).
CONF_THRESH = 0.5
# IoU above which an overlapping lower-scored box is dropped
# (passed to non_max_suppression as nms_thres).
IOU_THRESHOLD = 0.4
# The four *_NUM values below are summed in post_process to obtain the number of
# floats that make up one detection record in the engine output.
# NOTE(review): presumably keypoint, box, mask-coefficient and angle slots of the
# shared Detection struct -- confirm against the C++ definition.
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into
                 consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory to walk
    return:
        a list of lists of paths; every batch but the last holds batch_size paths,
        and an empty directory yields an empty list
    """
    batches = []
    current = []
    # Lazily enumerate every file below img_dir, in os.walk order.
    all_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )
    for path in all_paths:
        # Close the running batch once it is full, before adding the next path.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    # Keep the trailing, possibly shorter, batch.
    if len(current) > 0:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag, on img in place.
    param:
        x:      a box like [x1,y1,x2,y2]
        img:    an opencv image object
        color:  color of the rectangle, such as (0,255,0); three random channel
                values are used when omitted
        label:  str, text drawn above the top-left corner when given
        line_thickness: int, derived from the image size when omitted
    return:
        no return

    """
    # line/font thickness: explicit value if given, otherwise scaled with the image size
    if line_thickness:
        tl = line_thickness
    else:
        tl = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=tl, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(tl - 1, 1)
    font_scale = tl / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The tag sits directly above the box, anchored at its top-left corner.
    tag_corner = top_left[0] + text_w, top_left[1] - text_h - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLo11TRT(object):
    """
    description: A YOLO11 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Deserialize a TensorRT engine and allocate one page-locked host
                     buffer and one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized engine file
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            shape = engine.get_binding_shape(binding)
            print('bingding:', binding, shape)
            # The first dimension of the binding shape is used as the batch size.
            self.batch_size = shape[0]
            size = trt.volume(shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = shape[-1]
                self.input_h = shape[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        # The output host buffer covers the whole batch (its size is the volume of a
        # shape whose first dimension is the batch size), so the per-image length is
        # the total divided by the batch size. infer() slices the buffer with it.
        total_output_length = host_outputs[0].shape[0]
        if self.batch_size and self.batch_size > 0:
            self.det_output_length = total_output_length // self.batch_size
        else:
            self.det_output_length = total_output_length

    def infer(self, raw_image_generator):
        """
        description: Preprocess the given images, run the engine once and draw the
                     detections on the original images.
        param:
            raw_image_generator: iterable of BGR images, at most batch_size of them
        return:
            batch_image_raw: list of the input images with boxes drawn on them,
                             one entry per image actually supplied
            use_time:        seconds spent on host<->device copies and execution
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that the unused slots of a partial batch are deterministic.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in a finally block so a failed inference does not leave it pushed.
            self.ctx.pop()

        output = host_outputs[0]
        # Do postprocess, only for the images that were really supplied: the last
        # batch of a directory may hold fewer than batch_size images.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context that __init__ created and left active.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read the images of one batch, one cv2.imread result per path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup, batch_size all-zero uint8 images of the
                     network input size (image_path_batch is not used)
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: the image as an HxWxC BGR array
        return:
            image:  the processed image, shape (1, 3, input_h, input_w), float32
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: fill it and pad top/bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: fill it and pad left/right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map nx4 boxes from network-input coordinates back to the original
                        image: columns 0 and 2 are treated as x coordinates, columns 1 and 3
                        as y coordinates; the letterbox padding is subtracted and the result
                        is divided by the resize ratio. Despite the name, no
                        center/size to corner conversion is performed.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box in network-input coordinates
        return:
            y:          A boxes numpy of the same shape, in original-image coordinates
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded at top/bottom: shift y coordinates only.
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Image was padded at left/right: shift x coordinates only.
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array for one image: the number of boxes followed by
                        fixed-length detection records whose first six values are
                        four box coordinates, conf and cls_id
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray and keep only the valid records
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two sets of bounding boxes
        param:
            box1: boxes, shape (n, 4), either (x1, y1, x2, y2) or (x, y, w, h)
            box2: boxes, shape (m, 4), same format as box1
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou, broadcast over box1 and box2
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (widths/heights are inclusive, hence the +1; clipped at 0)
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon avoids a division by zero.
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, one row per box; columns 0-3 are the box, column 4 is
                        conf, column 5 is cls_id; any further columns are carried along
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, rows laid out like prediction with the box columns
                   mapped to original-image coordinates; an empty array when nothing is kept
        """
        # Get the boxes that score >= conf_thres
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the box columns from network-input to original-image coordinates
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            # The class id lives in column 5 (the column post_process reads it from).
            # The last column is only the class id when a record has exactly six values.
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """Thread that runs inference on one batch of image paths and saves the drawn results."""

    def __init__(self, yolo11_wrapper, image_path_batch):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        """Infer on the batch, write each result under output/ and print the timing."""
        wrapper = self.yolo11_wrapper
        frames = wrapper.get_raw_image(self.image_path_batch)
        annotated, elapsed_s = wrapper.infer(frames)
        for idx, src_path in enumerate(self.image_path_batch):
            # Keep only the file name; every result lands directly in output/.
            target = os.path.join('output', os.path.basename(src_path))
            # Save image
            cv2.imwrite(target, annotated[idx])
        elapsed_ms = elapsed_s * 1000
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, elapsed_ms))


class warmUpThread(threading.Thread):
    """Thread that performs a single warm-up inference on blank images."""

    def __init__(self, yolo11_wrapper):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper

    def run(self):
        """Run infer() on the wrapper's all-zero images and print shape and latency."""
        wrapper = self.yolo11_wrapper
        dummy_frames = wrapper.get_raw_image_zeros()
        results, seconds = wrapper.infer(dummy_frames)
        first_shape = results[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, seconds * 1000))


if __name__ == "__main__":
    # Positional CLI arguments: [engine file] [plugin library]; both are optional.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "yolo11s.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "build/libmyplugins.so"

    # load custom plugin before the engine is deserialized
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels; YoLo11TRT.infer reads this module-level list by class id
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
        "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
        "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
        "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
        "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
        "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]

    # Always start from an empty output/ directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLo11TRT instance
    yolo11_wrapper = YoLo11TRT(engine_file_path)
    try:
        print('batch size is', yolo11_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolo11_wrapper.batch_size, image_dir)

        # Ten warm-up passes; each runs in its own thread and is joined before the next starts.
        for _ in range(10):
            thread1 = warmUpThread(yolo11_wrapper)
            thread1.start()
            thread1.join()

        # One inference thread per batch, again strictly one after another.
        for batch in image_path_batches:
            thread1 = inferThread(yolo11_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance even if warm-up or inference raised
        yolo11_wrapper.destroy()


================================================
FILE: yolo11/yolo11_obb.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger instance handed to createInferBuilder() / createInferRuntime() in this file.
Logger gLogger;
using namespace nvinfer1;
// Number of floats reserved per image in the engine output buffer: room for
// kMaxNumOutputBbox Detection records (sizeof(Detection) / sizeof(float) floats each)
// plus one extra leading float.
// NOTE(review): the extra float is presumably the detection count written by the
// decode layer -- confirm against the plugin implementation.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& type, float& gd, float& gw,
                      int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    serialized_engine = buildEngineYolo11Obb(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Allocates the device input/output buffers and the host-side buffers needed by the chosen
// post-processing mode.
//
//   engine                 engine whose two bindings (input at index 0, output at index 1) are checked
//   input_buffer_device    receives a device buffer of kBatchSize * 3 * kObbInputH * kObbInputW floats
//   output_buffer_device   receives a device buffer of kBatchSize * kOutputSize floats
//   output_buffer_host     allocated only when cuda_post_process == "c"
//   decode_ptr_host, decode_ptr_device
//                          allocated only when cuda_post_process == "g"
//   cuda_post_process      "c" or "g"; any other value allocates only the two device buffers
//
// GPU post-processing with kBatchSize > 1 is unsupported; that case terminates the process with a
// non-zero status so the failure is visible to the calling shell.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);

    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // The indices are only consumed by assert(); keep NDEBUG builds free of unused-variable warnings.
    (void)inputIndex;
    (void)outputIndex;

    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kObbInputH * kObbInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));

    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(1);
        }
        // Host and device buffers for the decoded boxes: one leading float plus
        // kMaxNumOutputBbox records of bbox_element floats each.
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}

// Runs one inference on `stream` and brings the result back to the host.
//
//   context            execution context used to enqueue the network
//   stream             CUDA stream on which all work is queued
//   buffers            device bindings; buffers[1] is read as the network output
//   output             host destination for the raw output ("c" mode)
//   batchsize          number of images whose output is copied in "c" mode
//   decode_ptr_host, decode_ptr_device
//                      host/device buffers for decoded boxes ("g" mode)
//   model_bboxes       box count forwarded to cuda_decode_obb ("g" mode)
//   cuda_post_process  "c": copy the raw output to `output`;
//                      "g": decode + NMS on the GPU, then copy to decode_ptr_host;
//                      anything else: only the inference itself is run
//
// The elapsed time is measured after the stream has been synchronized, so it covers the queued work
// itself rather than relying on the copies happening to block the host.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // steady_clock is monotonic, which is what an interval measurement needs.
    const auto start = std::chrono::steady_clock::now();
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "enqueueV2 failed, the output of this batch is not valid" << std::endl;
    }

    // Set only for the modes that report a timing.
    const char* timing_label = nullptr;
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        timing_label = "inference time: ";
    } else if (cuda_post_process == "g") {
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode_obb((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms_obb(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        timing_label = "inference and gpu postprocess time: ";
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (timing_label != nullptr) {
        const auto end = std::chrono::steady_clock::now();
        std::cout << timing_label << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    }
}

// Parses the command line.
//
//   -s <wts> <engine> <n|s|m|l|x>   serialize: fills wts, engine, type, gd, gw and max_channels.
//                                   Only the first character of the sub-type argument is examined.
//   -d <engine> <img_dir> <c|g>     deserialize + infer: fills engine, img_dir and cuda_post_process.
//
// Returns true when the arguments match one of the two forms, false otherwise. A post-process mode other
// than "c" or "g" is rejected here; it would otherwise run inference with no result buffer and no
// post-processing at all.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    // Both forms take exactly four arguments after the program name.
    if (argc != 5)
        return false;

    const std::string mode(argv[1]);
    if (mode == "-s") {
        struct ModelScale {
            char tag;
            float gd;
            float gw;
            int max_channels;
        };
        static const ModelScale kScales[] = {
                {'n', 0.50f, 0.25f, 1024}, {'s', 0.50f, 0.50f, 1024}, {'m', 0.50f, 1.00f, 512},
                {'l', 1.00f, 1.00f, 512},  {'x', 1.00f, 1.50f, 512},
        };

        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string sub_type(argv[4]);
        // For an empty argument sub_type[0] is '\0', which matches no tag.
        for (const ModelScale& scale : kScales) {
            if (sub_type[0] == scale.tag) {
                gd = scale.gd;
                gw = scale.gw;
                max_channels = scale.max_channels;
                type = std::string(1, scale.tag);
                return true;
            }
        }
        return false;
    }

    if (mode == "-d") {
        const std::string post_process(argv[4]);
        if (post_process != "c" && post_process != "g")
            return false;
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = post_process;
        return true;
    }

    return false;
}

int main(int argc, char** argv) {
    // yolo11_obb -s ../models/yolo11n-obb.wts ../models/yolo11n-obb.fp32.trt n
    // yolo11_obb -d ../models/yolo11n-obb.fp32.trt ../images c
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string type;
    std::string cuda_post_process;
    int model_bboxes;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolo11_obb -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolo11_obb -d [.engine] ../images  [c/g]// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, type, gd, gw, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kObbInputW, kObbInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms_obb(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            //Process gpu decode and nms results
            //            batch_process_obb(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
            // todo seg in gpu
            std::cerr << "obb_postprocess is not support in gpu right now" << std::endl;
        }
        // Draw bounding boxes
        draw_bbox_obb(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    // std::cout << "\nOutput:\n\n";
    // for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    // std::cout << std::endl;

    return 0;
}


================================================
FILE: yolo11/yolo11_obb_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import sys
import threading
import time
import cv2
import math
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Detections scoring below this value are discarded in YoLo11TRT.post_process.
CONF_THRESH = 0.5
# Two same-class boxes whose ProbIoU exceeds this value are treated as duplicates during NMS.
IOU_THRESHOLD = 0.4
# Field counts that together give the number of floats in one detection record
# (see YoLo11TRT.post_process, which sums them).
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1

# Model input size. These are defaults only: YoLo11TRT.__init__ overwrites both with the
# dimensions of the engine's input binding.
INPUT_W = 640
INPUT_H = 640

# Class names used to label detections in YoLo11TRT.infer (DOTA v1.5 label set).
# Defined at module level so that the class also works when this file is imported
# rather than run as a script.
categories = ["plane", "ship", "storage tank", "baseball diamond", "tennis court",
              "basketball court", "ground track field", "harbor",
              "bridge", "large vehicle", "small vehicle", "helicopter",
              "roundabout", "soccer ball field", "swimming pool", "container crane"]


class Detection:
    """
    description: One oriented detection. The constructor stores its arguments unchanged.
    attributes:
        bbox:       box geometry; the rest of this file reads it as [cx, cy, w, h]
        score:      confidence value
        class_id:   class index
        angle:      box rotation; the rest of this file treats it as radians
    """

    def __init__(self, bbox, score, class_id, angle):
        self.bbox, self.score = bbox, score
        self.class_id, self.angle = class_id, angle


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into batches.
    param:
        batch_size: int, maximum number of paths per batch
        img_dir:    str, root directory to walk
    return:
        a list of lists of paths; every batch except possibly the last holds batch_size paths
    """
    all_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )

    batches = []
    current = []
    for path in all_paths:
        # Close the running batch once it is full, before adding the next path
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)

    if current:
        batches.append(current)
    return batches


def get_corner(img, box: Detection):
    """
    description: Get the four corner points of a rotated bounding box, in pixel coordinates of img
    param:
        img:    an opencv image object (numpy array)
        box:    a Detection object; bbox is read as [cx, cy, w, h] and angle as radians
    return:
        corners: int32 numpy array [[x1,y1], [x2,y2], [x3,y3], [x4,y4]], clipped to the image
    """
    # Extract box parameters
    cx, cy, w, h = box.bbox
    angle = box.angle * 180.0 / math.pi  # Convert radians to degrees

    # Make w the longer side; the swap is compensated by rotating 90 degrees
    if h >= w:
        w, h = h, w
        angle = (angle + 90.0) % 180.0

    # Bring the angle into the 0..180 degree range
    if angle < 0:
        angle += 360.0
    if angle > 180.0:
        angle -= 180.0

    # Convert back to radians for calculation
    angle_rad = angle * math.pi / 180.0
    cos_val = math.cos(angle_rad)
    sin_val = math.sin(angle_rad)

    # Axis-aligned extent of the unrotated box
    l_x = cx - w / 2
    r_x = cx + w / 2
    t_y = cy - h / 2
    b_y = cy + h / 2

    # Map the extent from model-input coordinates to image coordinates
    rect = get_rect_obb(img, [l_x, t_y, r_x, b_y])

    # Center and size of the mapped box (rect is [x, y, width, height])
    x_ = (rect[0] + rect[0] + rect[2]) / 2  # rect.x + rect.width/2
    y_ = (rect[1] + rect[1] + rect[3]) / 2  # rect.y + rect.height/2
    width = rect[2]
    height = rect[3]

    # Half-extent vectors along the rotated box axes
    vec1x = width / 2 * cos_val
    vec1y = width / 2 * sin_val
    vec2x = -height / 2 * sin_val
    vec2y = height / 2 * cos_val

    # Four corners = center +/- vec1 +/- vec2
    corners = np.array([
        [int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y))],
        [int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y))],
        [int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y))],
        [int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y))]
    ], dtype=np.int32)

    # Clip to image boundaries
    img_h, img_w = img.shape[:2]
    corners[:, 0] = np.clip(corners[:, 0], 0, img_w - 1)
    corners[:, 1] = np.clip(corners[:, 1], 0, img_h - 1)

    return corners


def get_rect_obb(img, bbox):
    """
    description: Map a box from model-input coordinates (INPUT_W x INPUT_H, padded) to the
                 coordinates of img by removing the padding offset and undoing the resize ratio
    param:
        img:    OpenCV image (numpy array)
        bbox:   [left, top, right, bottom] in model-input coordinates
    return:
        [x, y, width, height] as ints, limited to the image area
    """
    left, top, right, bottom = bbox
    img_h, img_w = img.shape[0], img.shape[1]
    ratio_w = INPUT_W / img_w
    ratio_h = INPUT_H / img_h

    if ratio_h > ratio_w:
        # Width is the limiting side: padding sits above and below the image
        pad = (INPUT_H - ratio_w * img_h) / 2
        top, bottom = top - pad, bottom - pad
        scale = ratio_w
    else:
        # Height is the limiting side: padding sits left and right of the image
        pad = (INPUT_W - ratio_h * img_w) / 2
        left, right = left - pad, right - pad
        scale = ratio_h

    left, right = left / scale, right / scale
    top, bottom = top / scale, bottom / scale

    left = max(0.0, left)
    top = max(0.0, top)
    x = int(round(left))
    y = int(round(top))
    width = max(0, min(int(round(right - left)), img_w - x))
    height = max(0, min(int(round(bottom - top)), img_h - y))

    return [x, y, width, height]


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Plots one rotated bounding box on image img (img is modified in place)
    param:
        x:      a Detection object (passed unchanged to get_corner)
        img:    an opencv image object
        color:  color to draw the box with; a random color is used when omitted
        label:  str, optional text drawn at the first corner
        line_thickness: int, derived from the image size when omitted
    """
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    # The drawing calls below need an actual color value, so the None default is replaced here
    if color is None:
        color = [np.random.randint(0, 255) for _ in range(3)]

    # Get four corner points
    corners = get_corner(img, x)
    corners = corners.astype(int)

    # Draw the rotated rectangle
    cv2.polylines(img, [corners], isClosed=True, color=color, thickness=tl, lineType=cv2.LINE_AA)

    if label:
        tf = max(tl - 1, 1)  # font thickness
        # Use the first corner for label placement; plain Python ints keep numpy scalars
        # out of the point tuples handed to cv2
        p1 = (int(corners[0][0]), int(corners[0][1]))
        w, h = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]

        # Put the label above the corner when there is room, below it otherwise
        outside = p1[1] - h >= 3
        p2 = (p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3)

        cv2.rectangle(img, p1, p2, color, -1, cv2.LINE_AA)  # filled
        cv2.putText(
            img,
            label,
            (p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
            0,
            tl / 3,
            [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA
        )


class YoLo11TRT(object):
    """
    description: A YOLO11 class that warps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, load the serialized engine and allocate
                     one page-locked host buffer plus one device buffer per engine binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        global INPUT_W, INPUT_H

        # Create a Context on this device
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        trt_logger = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(trt_logger)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as engine_file:
            engine = runtime.deserialize_cuda_engine(engine_file.read())
        context = engine.create_execution_context()

        host_inputs, cuda_inputs = [], []
        host_outputs, cuda_outputs = [], []
        bindings = []

        for binding in engine:
            shape = engine.get_binding_shape(binding)
            print('bingding:', binding, shape)
            # First dimension of the binding shape is taken as the batch size
            self.batch_size = shape[0]
            size = trt.volume(shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Device address goes into the bindings list in engine order
            bindings.append(int(cuda_mem))
            if not engine.binding_is_input(binding):
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
                continue
            # Input binding: its last two dimensions define the model input size, which is
            # also published through the module-level INPUT_W / INPUT_H
            self.input_w = shape[-1]
            INPUT_W = self.input_w
            self.input_h = shape[-2]
            INPUT_H = self.input_h
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.det_output_length = host_outputs[0].shape[0]

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine on it, post-process the
                     output and draw the resulting boxes on the original images.
        param:
            raw_image_generator: iterable of BGR images; it should yield at most
                                 self.batch_size images
        return:
            batch_image_raw: list of the original images with boxes drawn on them
            use_time:        seconds spent on host->device copy, inference and device->host copy
        """
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Always deactivate the context again, also when preprocessing or inference raised,
            # so the context stack stays balanced.
            self.ctx.pop()

        output = host_outputs[0]
        # Post-process only the slots that actually hold an image: the last batch of a
        # directory can contain fewer than self.batch_size images.
        for i in range(len(batch_image_raw)):
            keep = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for box in keep:
                # Seeding with the class id gives every class a stable color
                np.random.seed(int(box.class_id))
                color = [np.random.randint(0, 255) for _ in range(3)]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(box.class_id)], box.score
                    ),
                    color=color,
                    line_thickness=1
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert a BGR image to RGB, resize it with its aspect ratio kept,
                     pad it to the model input size, scale it to [0,1] and
                     lay it out as a contiguous NCHW float32 array.
        param:
            raw_bgr_image: a 3-channel BGR image (numpy array, HWC)
        return:
            image:      the processed image, shape (1, 3, input_h, input_w)
            image_raw:  the original image, untouched
            h:          original height
            w:          original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        rgb = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)

        # Pick the resize target and the padding for whichever side is limiting
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw, th = self.input_w, int(r_w * h)
            pad_left = pad_right = 0
            pad_top = int((self.input_h - th) / 2)
            pad_bottom = self.input_h - th - pad_top
        else:
            tw, th = int(r_h * w), self.input_h
            pad_left = int((self.input_w - tw) / 2)
            pad_right = self.input_w - tw - pad_left
            pad_top = pad_bottom = 0

        # Resize, then fill the remaining border with (128,128,128)
        resized = cv2.resize(rgb, (tw, th))
        padded = cv2.copyMakeBorder(
            resized, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )

        tensor = padded.astype(np.float32)
        # Normalize to [0,1]
        tensor /= 255.0
        # HWC -> CHW -> NCHW, stored in C order
        tensor = np.ascontiguousarray(np.expand_dims(np.transpose(tensor, [2, 0, 1]), axis=0))
        return tensor, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def covariance_matrix(self, res: Detection):
        """
        description: Generating covariance matrix from obbs.
        param:
            box (np.ndarray): A numpy array representing rotated bounding box, with xywhr format.

        return:
            tuple: (a, b, c) values of covariance matrix
        """
        w = res.bbox[2]
        h = res.bbox[3]
        angle = res.angle

        a = w * w / 12.0
        b = h * h / 12.0
        c = angle

        cos_r = math.cos(c)
        sin_r = math.sin(c)

        cos_r2 = cos_r * cos_r
        sin_r2 = sin_r * sin_r

        a_val = a * cos_r2 + b * sin_r2
        b_val = a * sin_r2 + b * cos_r2
        c_val = (a - b) * cos_r * sin_r

        return a_val, b_val, c_val

    def probiou(self, box1: Detection, box2: Detection, eps=1e-7):
        """
        description: Calculate the prob IoU between oriented bounding boxes.
        param:
            box1 (np.ndarray): First box in xywhr format
            box2 (np.ndarray): Second box in xywhr format
            eps (float): Small value to avoid division by zero
        return:
            float: 1 - hd where hd is the Bhattacharyya distance
        """
        a1, b1, c1 = self.covariance_matrix(box1)
        a2, b2, c2 = self.covariance_matrix(box2)

        x1, y1 = box1.bbox[0], box1.bbox[1]
        x2, y2 = box2.bbox[0], box2.bbox[1]

        t1 = ((a1 + a2) * (y1 - y2) ** 2 + (b1 + b2) * (x1 - x2) ** 2) / \
             ((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2 + eps)
        t1 *= 0.25

        t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / \
             ((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2 + eps)
        t2 *= 0.5

        t3 = ((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2) / \
             (4 * math.sqrt(max(a1 * b1 - c1 * c1, 0.0)) *
              math.sqrt(max(a2 * b2 - c2 * c2, 0.0)) + eps)
        t3 = math.log(t3 + eps) * 0.5

        bd = max(min(t1 + t2 + t3, 100.0), eps)
        hd = math.sqrt(1.0 - math.exp(-bd) + eps)

        return 1 - hd

    def post_process(self, output, origin_h, origin_w):
        """
        description: Turn the raw output of one image into a list of detections:
                     confidence filtering followed by per-class NMS based on ProbIoU.
        param:
            output:     A flat numpy array [num_boxes, record, record, ...]. Each record holds
                        DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM floats; this method reads
                        cx, cy, w, h (columns 0-3), conf (4), cls_id (5) and the angle, which is
                        the first column after the DET/SEG/POSE fields.
            origin_h:   height of original image (not used here; boxes stay in model-input
                        coordinates)
            origin_w:   width of original image (not used here)
        return:
            keep:       list of Detection objects that survived NMS and whose score is
                        above CONF_THRESH
        """
        # Column of the angle, derived from the record layout instead of a literal index
        angle_index = DET_NUM + SEG_NUM + POSE_NUM
        num_values_per_detection = angle_index + OBB_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray, one record per row
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]

        # Filter by confidence threshold
        pred = pred[pred[:, 4] >= CONF_THRESH]
        if len(pred) == 0:
            return []

        # Group detections by class id; NMS is applied within each class separately
        dets_by_class = {}
        for row in pred:
            class_id = int(row[5])
            dets_by_class.setdefault(class_id, []).append(
                Detection(row[:4], row[4], class_id, row[angle_index])
            )

        res = []
        for dets in dets_by_class.values():
            dets = sorted(dets, key=lambda det: det.score, reverse=True)
            for m, item in enumerate(dets):
                # A score of 0.0 marks a detection suppressed by a better one
                if item.score == 0.0:
                    continue
                res.append(item)
                for other in dets[m + 1:]:
                    if other.score == 0.0:
                        continue
                    if self.probiou(item, other) > IOU_THRESHOLD:
                        other.score = 0.0

        return [det for det in res if det.score > CONF_THRESH]


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and saves every
                 annotated image into the 'output' directory under its original file name.
    """

    def __init__(self, yolo11_wrapper, image_path_batch):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolo11_wrapper
        image_source = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(image_source)
        for index, img_path in enumerate(self.image_path_batch):
            save_name = os.path.join('output', os.path.basename(img_path))
            # Save image
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on the wrapper's all-zero warmup images and
                 prints the shape of the first returned image together with the elapsed time.
    """

    def __init__(self, yolo11_wrapper):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper

    def run(self):
        wrapper = self.yolo11_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image_zeros())
        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))


if __name__ == "__main__":
    # Usage: python yolo11_obb_trt.py [engine_file_path] [plugin_library_path]
    # Both arguments are optional and fall back to the defaults below.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "yolo11n-obb.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "./build/libmyplugins.so"

    # Load the custom plugin library before the engine is deserialized
    ctypes.CDLL(PLUGIN_LIBRARY)

    # DOTA v1.5 labels
    categories = ["plane", "ship", "storage tank", "baseball diamond", "tennis court",
                  "basketball court", "ground track field", "harbor",
                  "bridge", "large vehicle", "small vehicle", "helicopter",
                  "roundabout", "soccer ball field", "swimming pool", "container crane"]

    # Start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLo11TRT instance
    yolo11_wrapper = YoLo11TRT(engine_file_path)
    try:
        print('batch size is', yolo11_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolo11_wrapper.batch_size, image_dir)

        # Ten warmup runs, each in its own thread and joined before the next one starts
        for _ in range(10):
            thread1 = warmUpThread(yolo11_wrapper)
            thread1.start()
            thread1.join()

        # One thread per batch, again run strictly one after another
        for batch in image_path_batches:
            thread1 = inferThread(yolo11_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance
        yolo11_wrapper.destroy()


================================================
FILE: yolo11/yolo11_pose.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger handed to the TensorRT builder (createInferBuilder) and runtime (createInferRuntime) in this file.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in one image's raw output: room for kMaxNumOutputBbox Detection records plus one
// leading float (presumably the detection count written by the output layer — confirm against the plugin).
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

// Builds the YOLO11-Pose network from a weight file and writes the serialized engine (plan) to disk.
//
//   wts_name      path of the input weight file
//   engine_name   path of the plan file to create
//   type, gd, gw, max_channels
//                 model-variant parameters, forwarded unchanged to buildEngineYolo11Pose
//
// Any failure is reported on stderr and terminates the process with a non-zero status. Explicit checks
// are used instead of assert() so that the error handling is still present when NDEBUG is defined.
void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& type, float& gd, float& gw,
                      int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    if (builder == nullptr) {
        std::cerr << "could not create TensorRT builder" << std::endl;
        exit(1);
    }
    IBuilderConfig* config = builder->createBuilderConfig();
    if (config == nullptr) {
        std::cerr << "could not create TensorRT builder config" << std::endl;
        exit(1);
    }

    IHostMemory* serialized_engine =
            buildEngineYolo11Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);
    if (serialized_engine == nullptr) {
        std::cerr << "could not build engine from " << wts_name << std::endl;
        exit(1);
    }

    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "could not open plan output file" << std::endl;
        exit(1);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    p.close();
    // A short write (e.g. disk full) would otherwise leave a truncated plan behind without any diagnostic.
    if (!p) {
        std::cerr << "could not write plan output file " << engine_name << std::endl;
        exit(1);
    }

    delete serialized_engine;
    delete config;
    delete builder;
}

// Loads a serialized engine from disk and creates the runtime, engine and execution context for it.
//
//   engine_name   path of the plan file to read
//   runtime, engine, context
//                 out-parameters; on return each points to a newly created object owned by the caller
//
// Any failure is reported on stderr and terminates the process with a non-zero status. Explicit checks
// are used instead of assert(): with NDEBUG defined, a missing file previously led to tellg() == -1 being
// used as the allocation size.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }
    file.seekg(0, file.end);
    const std::streamoff length = file.tellg();
    if (length <= 0) {
        std::cerr << "engine file " << engine_name << " is empty or its size cannot be determined" << std::endl;
        exit(1);
    }
    file.seekg(0, file.beg);

    const size_t size = static_cast<size_t>(length);
    // std::vector releases the buffer on every exit path from this function.
    std::vector<char> serialized_engine(size);
    file.read(serialized_engine.data(), static_cast<std::streamsize>(size));
    if (!file) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }
    file.close();

    *runtime = createInferRuntime(gLogger);
    if (*runtime == nullptr) {
        std::cerr << "could not create TensorRT runtime" << std::endl;
        exit(1);
    }
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
    if (*engine == nullptr) {
        std::cerr << "could not deserialize engine from " << engine_name << std::endl;
        exit(1);
    }
    *context = (*engine)->createExecutionContext();
    if (*context == nullptr) {
        std::cerr << "could not create execution context" << std::endl;
        exit(1);
    }
}

// Allocates the buffers used by one inference pass.
//   input_buffer_device / output_buffer_device  always allocated on the device
//   output_buffer_host                          allocated only when cuda_post_process == "c"
//   decode_ptr_host / decode_ptr_device         allocated only when cuda_post_process == "g"
// Any other mode string leaves the mode-specific pointers untouched.
// The "g" mode terminates the process when kBatchSize > 1.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);

    // Device-side input and raw output buffers.
    const size_t input_bytes = kBatchSize * 3 * kInputH * kInputW * sizeof(float);
    const size_t output_bytes = kBatchSize * kOutputSize * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, input_bytes));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, output_bytes));

    if (cuda_post_process == "c") {
        // Host copy of the raw network output.
        *output_buffer_host = new float[kBatchSize * kOutputSize];
        return;
    }

    if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(0);
        }
        // One leading float followed by kMaxNumOutputBbox records of bbox_element floats,
        // mirrored on host and device.
        const auto decode_len = 1 + kMaxNumOutputBbox * bbox_element;
        *decode_ptr_host = new float[decode_len];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * decode_len));
    }
}

// Runs one inference pass on `stream` and brings the result back to the host.
//   buffers            device bindings; buffers[1] is read as the network output
//   output             host destination for the raw output ("c" mode)
//   batchsize          number of images whose output is copied back ("c" mode)
//   decode_ptr_host / decode_ptr_device  host/device decode buffers ("g" mode)
//   model_bboxes       box count forwarded to cuda_decode ("g" mode)
//   cuda_post_process  "c": copy raw output to host; "g": decode + NMS on the GPU
// The function returns only after the stream has been synchronized. The reported
// time is taken after that synchronization, so it covers the work that actually
// executed on the stream rather than just the time needed to enqueue it.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // steady_clock is monotonic, which is what an interval measurement needs.
    const auto start = std::chrono::steady_clock::now();

    // infer on the batch asynchronously, and DMA output back to host
    context.enqueueV2(buffers, stream, nullptr);

    const char* timing_label = nullptr;
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        timing_label = "inference time: ";
    } else if (cuda_post_process == "g") {
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        timing_label = "inference and gpu postprocess time: ";
    }

    // Everything above was only queued on the stream; wait for it before reading the clock.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (timing_label != nullptr) {
        const auto end = std::chrono::steady_clock::now();
        std::cout << timing_label << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms"
                  << std::endl;
    }
}

// Parses the command line into the caller's variables.
//   -s <wts> <engine> <n|s|m|l|x>   (argc 5 or 7) fills wts, engine, type, gd, gw, max_channels
//   -d <engine> <img_dir> <mode>    (argc 5)      fills engine, img_dir, cuda_post_process
// Only the first character of the model-size argument is inspected.
// Returns false for any other argument layout or an unknown model size; in the
// latter case wts and engine have already been assigned.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4) {
        return false;
    }
    const std::string mode(argv[1]);

    if (mode == "-s" && (argc == 5 || argc == 7)) {
        wts = argv[2];
        engine = argv[3];
        const std::string sub_type(argv[4]);
        switch (sub_type[0]) {
            case 'n':
                gd = 0.50;
                gw = 0.25;
                max_channels = 1024;
                type = "n";
                break;
            case 's':
                gd = 0.50;
                gw = 0.50;
                max_channels = 1024;
                type = "s";
                break;
            case 'm':
                gd = 0.50;
                gw = 1.00;
                max_channels = 512;
                type = "m";
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                max_channels = 512;
                type = "l";
                break;
            case 'x':
                gd = 1.0;
                gw = 1.50;
                max_channels = 512;
                type = "x";
                break;
            default:
                return false;
        }
        return true;
    }

    if (mode == "-d" && argc == 5) {
        engine = argv[2];
        img_dir = argv[3];
        cuda_post_process = argv[4];
        return true;
    }

    return false;
}

int main(int argc, char** argv) {
    // yolo11_pose -s ../models/yolo11n-pose.wts ../models/yolo11n-pose.fp32.trt n
    // yolo11_pose -d ../models/yolo11n-pose.fp32.trt ../images c
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string type;
    std::string cuda_post_process;
    int model_bboxes;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolo11_pose -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolo11_pose -d [.engine] ../images  [c/g]// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, type, gd, gw, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            // todo pose in gpu
            std::cerr << "pose_postprocess is not support in gpu right now" << std::endl;
        }
        // Draw bounding boxes
        draw_bbox_keypoints_line(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: yolo11/yolo11_pose_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1
keypoint_pairs = [
    (0, 1), (0, 2), (0, 5), (0, 6), (1, 2),
    (1, 3), (2, 4), (5, 6), (5, 7), (5, 11),
    (6, 8), (6, 12), (7, 9), (8, 10), (11, 12),
    (11, 13), (12, 14), (13, 15), (14, 16)
]


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory to walk
    return:
        a list of lists of paths; a batch is closed once it holds batch_size paths,
        and the final batch holds whatever is left over
    """
    walked_paths = (
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(img_dir)
        for file_name in file_names
    )
    batches = []
    current = []
    for path in walked_paths:
        # Close the running batch before adding the path that would overflow it.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag, on img in place.
    param:
        x:      a box like [x1, y1, x2, y2]
        img:    an opencv image object
        color:  color of the rectangle, such as (0, 255, 0); a random one is picked when falsy
        label:  str, text drawn above the top-left corner; nothing is drawn when falsy
        line_thickness: int, derived from the image size when falsy
    return:
        no return
    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The tag sits directly above the box's top-left corner.
    tag_corner = (top_left[0] + text_w, top_left[1] - text_h - 3)
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLo11TRT(object):
    """
    description: A YOLO11 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the engine and
                     allocate one page-locked host buffer plus one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized engine file
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        logger = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(logger)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as plan_file:
            engine = runtime.deserialize_cuda_engine(plan_file.read())
        context = engine.create_execution_context()

        host_inputs, cuda_inputs = [], []
        host_outputs, cuda_outputs = [], []
        bindings = []

        for binding in engine:
            shape = engine.get_binding_shape(binding)
            print('bingding:', binding, shape)
            # Overwritten on every iteration, so the last binding's leading dimension wins.
            self.batch_size = shape[0]
            size = trt.volume(shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = shape[-1]
                self.input_h = shape[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        # Length of the whole first output buffer (all batch entries together).
        self.det_output_size = host_outputs[0].shape[0]

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine on them and draw
                     boxes, labels and skeleton lines on the original images.
        param:
            raw_image_generator: iterable yielding BGR images, at most batch_size of them
        return:
            batch_image_raw: list of the input images with the detections drawn on them
            use_time:        seconds spent on the host-to-device copy, inference and
                             device-to-host copy
        """
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that the unused slots of a partial batch hold defined data.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in `finally` so a failure above cannot leave the context pushed.
            self.ctx.pop()

        # Here we use the first row of output in that batch_size = 1
        # NOTE(review): det_output_size is the length of the whole output buffer, so the
        # per-image slices below look correct only for batch_size == 1 - confirm.
        output = host_outputs[0]
        # Do postprocess, only for the images that were actually supplied
        # (the last batch of a directory may hold fewer than batch_size images).
        for i in range(len(batch_image_raw)):

            result_boxes, result_scores, result_classid, keypoints = self.post_process(
                output[i * (self.det_output_size): (i + 1) * (self.det_output_size)],
                batch_origin_h[i], batch_origin_w[i]
            )

            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )

                # Keypoints are stored as flat (x, y, confidence) triples.
                num_keypoints = len(keypoints[j]) // 3
                points = []
                for k in range(num_keypoints):
                    x = keypoints[j][k * 3]
                    y = keypoints[j][k * 3 + 1]
                    confidence = keypoints[j][k * 3 + 2]
                    if confidence > 0:
                        points.append((int(x), int(y)))
                    else:
                        points.append(None)

                # Draw skeleton lines between the keypoint index pairs
                for pair in keypoint_pairs:
                    partA, partB = pair
                    if points[partA] and points[partB]:
                        cv2.line(batch_image_raw[i], points[partA], points[partB], (0, 255, 0), 2)

        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context held in self.ctx (created in __init__ with
                     make_context()) off the context stack. Call once when done with
                     the wrapper.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB, resize it keeping the aspect ratio,
                     pad it to (input_h, input_w) with gray, normalize to [0,1]
                     and transform to NCHW format.
        param:
            raw_bgr_image: a 3-dimensional (H, W, C) BGR image array
        return:
            image:  the processed image, float32 with shape (1, C, input_h, input_w)
            image_raw: the original image (same object that was passed in)
            h: original height
            w: original width
        """
        h, w, _channels = raw_bgr_image.shape
        rgb = cv2.cvtColor(raw_bgr_image, cv2.COLOR_BGR2RGB)

        # Calculate width and height and paddings
        ratio_w = self.input_w / w
        ratio_h = self.input_h / h
        if ratio_h > ratio_w:
            # Width is the limiting side: fill it and pad top/bottom.
            new_w = self.input_w
            new_h = int(ratio_w * h)
            pad_left = pad_right = 0
            pad_top = int((self.input_h - new_h) / 2)
            pad_bottom = self.input_h - new_h - pad_top
        else:
            # Height is the limiting side: fill it and pad left/right.
            new_w = int(ratio_h * w)
            new_h = self.input_h
            pad_left = int((self.input_w - new_w) / 2)
            pad_right = self.input_w - new_w - pad_left
            pad_top = pad_bottom = 0

        # Resize the image with long side while maintaining ratio
        resized = cv2.resize(rgb, (new_w, new_h))
        # Pad the short side with (128,128,128)
        padded = cv2.copyMakeBorder(
            resized, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        # Normalize to [0,1]
        normalized = padded.astype(np.float32) / 255.0
        # HWC to CHW format, then CHW to NCHW format
        nchw = np.expand_dims(np.transpose(normalized, [2, 0, 1]), axis=0)
        # Convert the image to row-major order, also known as "C order":
        return np.ascontiguousarray(nchw), raw_bgr_image, h, w

    def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints):

        n = len(boxes)
        box_array = np.zeros_like(boxes)
        keypoint_array = np.zeros_like(keypoints)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        for i in range(n):
            if r_h > r_w:
                box = boxes[i]
                lmk = keypoints[i]
                box_array[i, 0] = box[0] / r_w
                box_array[i, 2] = box[2] / r_w
                box_array[i, 1] = (box[1] - (self.input_h - r_w * origin_h) / 2) / r_w
                box_array[i, 3] = (box[3] - (self.input_h - r_w * origin_h) / 2) / r_w

                for j in range(0, len(lmk), 3):
                    keypoint_array[i, j] = lmk[j] / r_w
                    keypoint_array[i, j + 1] = (lmk[j + 1] - (self.input_h - r_w * origin_h) / 2) / r_w
                    keypoint_array[i, j + 2] = lmk[j + 2]
            else:

                box = boxes[i]
                lmk = keypoints[i]

                box_array[i, 0] = (box[0] - (self.input_w - r_h * origin_w) / 2) / r_h
                box_array[i, 2] = (box[2] - (self.input_w - r_h * origin_w) / 2) / r_h
                box_array[i, 1] = box[1] / r_h
                box_array[i, 3] = box[3] / r_h

                for j in range(0, len(lmk), 3):
                    keypoint_array[i, j] = (lmk[j] - (self.input_w - r_h * origin_w) / 2) / r_h
                    keypoint_array[i, j + 1] = lmk[j + 1] / r_h
                    keypoint_array[i, j + 2] = lmk[j + 2]

        return box_array, keypoint_array

    def post_process(self, output, origin_h, origin_w):
        """
        description: Post-process the raw prediction of one image, including pose keypoints
        param:
            output:     A flat numpy array: the detection count first, followed by rows of
                        DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM values, one row per detection
            origin_h:   Height of original image
            origin_w:   Width of original image
        return:
            result_boxes:     Final boxes, a numpy array, each row is a box [x1, y1, x2, y2]
            result_scores:    Final scores, a numpy array, each element is the score corresponding to box
            result_classid:   Final classID, a numpy array, each element is the classid corresponding to box
            result_keypoints: Final keypoints, a numpy array with POSE_NUM values per box
            (all four are empty arrays when nothing survives the filtering)
        """
        # Values per detection: box/conf/class + mask coefficients + keypoint triples + angle
        row_len = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # The first element is the number of boxes detected
        detection_count = int(output[0])
        # Reshape the rest to one row per detection and keep only the valid rows
        rows = np.reshape(output[1:], (-1, row_len))
        pred = rows[:detection_count, :]

        # Perform non-maximum suppression to filter the detections
        kept = self.non_max_suppression(
            pred[:, :row_len], origin_h, origin_w,
            conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)

        if not len(kept):
            return np.array([]), np.array([]), np.array([]), np.array([])

        # Split the surviving rows into boxes, scores, class ids and keypoints;
        # the keypoints sit right before the last column.
        result_boxes = kept[:, :4]
        result_scores = kept[:, 4]
        result_classid = kept[:, 5]
        result_keypoints = kept[:, -POSE_NUM - 1:-1]
        return result_boxes, result_scores, result_classid, result_keypoints

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = np.clip(
            inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Drop detections whose confidence (column 4) is below 'conf_thres',
                     map boxes and keypoints back to original-image coordinates, clip the
                     boxes to the image, then run per-class Non-Maximum Suppression.
        param:
            prediction: 2-D array, one detection per row; columns 0-3 are the box,
                        4 the confidence, 5 the class id, and the POSE_NUM columns
                        before the last one the keypoints
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            the surviving rows stacked into a 2-D array (same column layout as the
            input), or an empty array when nothing survives
        """
        kpt_cols = slice(-POSE_NUM - 1, -1)

        # Get the boxes that score >= conf_thres
        candidates = prediction[prediction[:, 4] >= conf_thres]

        # Rescale boxes and keypoints on copies, then write them into a copy of the rows
        detections = np.copy(candidates)
        scaled_boxes, scaled_kpts = self.xywh2xyxy_with_keypoints(
            origin_h, origin_w, np.copy(candidates[:, :4]), np.copy(candidates[:, kpt_cols]))
        detections[:, :4] = scaled_boxes
        detections[:, kpt_cols] = scaled_kpts

        # clip the box coordinates to the image
        for col, upper in ((0, origin_w - 1), (2, origin_w - 1), (1, origin_h - 1), (3, origin_h - 1)):
            detections[:, col] = np.clip(detections[:, col], 0, upper)

        # Sort by confidence, highest first
        detections = detections[np.argsort(-detections[:, 4])]

        # Greedy NMS: keep the best row, drop same-class rows that overlap it too much
        # (the best row always drops itself), repeat on what is left.
        kept = []
        while detections.shape[0]:
            best = detections[0]
            overlaps = self.bbox_iou(np.expand_dims(best[:4], 0), detections[:, :4]) > nms_thres
            same_class = best[5] == detections[:, 5]
            kept.append(best)
            detections = detections[~(overlaps & same_class)]

        return np.stack(kept, 0) if len(kept) else np.array([])


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and saves
                 every annotated image under 'output' with its original file name.
    """

    def __init__(self, yolo11_wrapper, image_path_batch):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolo11_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image(self.image_path_batch))
        for index, img_path in enumerate(self.image_path_batch):
            # Save image under its base name
            save_name = os.path.join('output', os.path.basename(img_path))
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference pass on the wrapper's all-zero
                 warmup images and prints the elapsed time.
    """

    def __init__(self, yolo11_wrapper):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper

    def run(self):
        wrapper = self.yolo11_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image_zeros())
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Usage: <script> [engine_file_path] [plugin_library]
    # load custom plugin and engine
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "yolo11n-pose.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "./build/libmyplugins.so"

    ctypes.CDLL(PLUGIN_LIBRARY)

    # class names indexed by class id; read as a module global by YoLo11TRT.infer
    categories = ["person"]

    # start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLo11TRT instance
    yolo11_wrapper = YoLo11TRT(engine_file_path)
    try:
        print('batch size is', yolo11_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolo11_wrapper.batch_size, image_dir)

        # ten warm-up passes, each in its own thread and run to completion
        for _ in range(10):
            warm_up = warmUpThread(yolo11_wrapper)
            warm_up.start()
            warm_up.join()

        # one inference thread per batch, run one after another
        for batch in image_path_batches:
            worker = inferThread(yolo11_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolo11_wrapper.destroy()


================================================
FILE: yolo11/yolo11_seg.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

Logger gLogger;
using namespace nvinfer1;
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4);

// Converts a box given as (left, top, width, height) into an integer rectangle
// on a grid that is `scale` times coarser. Before scaling, the left/top edges are
// clamped to >= 0 and the right/bottom edges to <= kInputW / kInputH.
static cv::Rect get_downscale_rect(float bbox[4], float scale) {
    float x_min = bbox[0];
    float y_min = bbox[1];
    float x_max = bbox[0] + bbox[2];
    float y_max = bbox[1] + bbox[3];

    // Clamp to the network input area.
    if (x_min < 0) {
        x_min = 0;
    }
    if (y_min < 0) {
        y_min = 0;
    }
    if (x_max > kInputW) {
        x_max = kInputW;
    }
    if (y_max > kInputH) {
        y_max = kInputH;
    }

    x_min /= scale;
    y_min /= scale;
    x_max /= scale;
    y_max /= scale;

    // Truncation toward zero, for both the origin and the extent.
    const int rect_x = int(x_min);
    const int rect_y = int(y_min);
    const int rect_w = int(x_max - x_min);
    const int rect_h = int(y_max - y_min);
    return cv::Rect(rect_x, rect_y, rect_w, rect_h);
}

std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets) {

    std::vector<cv::Mat> masks;
    for (size_t i = 0; i < dets.size(); i++) {

        cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1);
        auto r = get_downscale_rect(dets[i].bbox, 4);

        for (int x = r.x; x < r.x + r.width; x++) {
            for (int y = r.y; y < r.y + r.height; y++) {
                float e = 0.0f;
                for (int j = 0; j < 32; j++) {
                    e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
                }
                e = 1.0f / (1.0f + expf(-e));
                mask_mat.at<float>(y, x) = e;
            }
        }
        cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH));
        masks.push_back(mask_mat);
    }
    return masks;
}

void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& type, float& gd, float& gw,
                      int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    serialized_engine = buildEngineYolo11Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}

// Loads a serialized TensorRT plan from `engine_name` and creates the runtime, engine and
// execution context from it.
//
// engine_name : path of the plan file (read in binary mode).
// runtime, engine, context : out-parameters receiving the created objects; the caller owns
//                            them and deletes them (main does so at shutdown).
//
// Any failure (unreadable/empty file, short read, runtime/engine/context creation) is
// reported on stderr and terminates the process with a non-zero status. The checks are
// explicit instead of assert() so they are not compiled out under NDEBUG.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }

    file.seekg(0, file.end);
    // tellg() yields -1 on failure; converting that straight to size_t would request a huge buffer.
    const std::streamoff end_pos = file.tellg();
    file.seekg(0, file.beg);
    if (end_pos <= 0) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }
    const size_t size = static_cast<size_t>(end_pos);

    // std::vector releases the buffer on every path out of this function.
    std::vector<char> serialized_engine(size);
    file.read(serialized_engine.data(), static_cast<std::streamsize>(size));
    if (!file) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }
    file.close();

    *runtime = createInferRuntime(gLogger);
    if (!*runtime) {
        std::cerr << "failed to create TensorRT runtime" << std::endl;
        exit(1);
    }
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
    if (!*engine) {
        std::cerr << "failed to deserialize engine from " << engine_name << std::endl;
        exit(1);
    }
    *context = (*engine)->createExecutionContext();
    if (!*context) {
        std::cerr << "failed to create execution context" << std::endl;
        exit(1);
    }
}

// Allocates the device buffers for the three engine bindings and the host/device buffers
// needed by the selected post-processing mode.
//
// engine                    : engine whose bindings are checked (input at 0, detection
//                             output at 1, prototype output at 2).
// input_buffer_device       : receives a device buffer of kBatchSize * 3 * kInputH * kInputW floats.
// output_buffer_device      : receives a device buffer of kBatchSize * kOutputSize floats.
// output_seg_buffer_device  : receives a device buffer of kBatchSize * kOutputSegSize floats.
// output_buffer_host, output_seg_buffer_host : allocated with new[] only when
//                             cuda_post_process == "c".
// decode_ptr_host, decode_ptr_device : allocated only when cuda_post_process == "g".
// cuda_post_process         : "c" or "g"; for any other value no mode-specific buffer is
//                             allocated and the corresponding pointers are left untouched.
//
// GPU post-processing with kBatchSize > 1 is not supported: the process then terminates
// with a non-zero exit status.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host,
                    float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) {
    assert(engine->getNbBindings() == 3);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    const int outputIndex_seg = engine->getBindingIndex(kProtoTensorName);

    assert(inputIndex == 0);
    assert(outputIndex == 1);
    assert(outputIndex_seg == 2);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float)));

    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
        *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            // This is an error path: report failure to the calling shell instead of success.
            exit(1);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}

// Runs one inference on `stream` and brings the results back to the host.
//
// context           : execution context used to enqueue the network.
// stream            : CUDA stream all work is queued on; it is synchronized before returning.
// buffers           : device bindings (buffers[1] = detection output, buffers[2] = prototype output).
// output            : host destination for the detection output   ("c" mode only).
// output_seg        : host destination for the prototype output   ("c" mode only).
// batchsize         : number of images whose outputs are copied back in "c" mode.
// decode_ptr_host / decode_ptr_device : host/device buffers for decoded boxes ("g" mode only).
// model_bboxes      : number of candidate boxes passed to cuda_decode ("g" mode only).
// cuda_post_process : "c" copies the raw outputs to the host; "g" decodes and runs NMS on the
//                     GPU and copies the decoded boxes; any other value only runs the network.
//
// The elapsed time is printed after the stream has been synchronized: everything queued here
// is asynchronous, so stopping the clock earlier would only measure the time spent enqueuing.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg,
           int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes,
           std::string cuda_post_process) {
    const bool cpu_post = (cuda_post_process == "c");
    const bool gpu_post = (cuda_post_process == "g");

    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "enqueueV2 failed, results of this batch are not valid" << std::endl;
    }

    if (cpu_post) {
        std::cout << "kOutputSize:" << kOutputSize << std::endl;
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        std::cout << "kOutputSegSize:" << kOutputSegSize << std::endl;
        CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream));
    } else if (gpu_post) {
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
    }

    // Wait for the queued work to finish before stopping the clock.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto end = std::chrono::system_clock::now();
    const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

    if (cpu_post) {
        std::cout << "inference time: " << elapsed_ms << "ms" << std::endl;
    } else if (gpu_post) {
        std::cout << "inference and gpu postprocess time: " << elapsed_ms << "ms" << std::endl;
    }
}

// Parses the command line into the caller's variables.
//
// Two invocations are accepted:
//   -s <wts> <engine> <n|s|m|l|x>            (argc == 5)  build and serialize an engine
//   -d <engine> <img_dir> <c|g> <labels>     (argc == 6)  deserialize and run inference
//
// For -s the first character of the last argument selects the model variant and sets
// gd, gw, max_channels and type accordingly. For -d the post-processing mode must be
// exactly "c" or "g"; other values are rejected here because the rest of the program
// would otherwise run inference without producing any output.
//
// Returns true when the arguments are valid, false otherwise (the caller prints usage).
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                std::string& cuda_post_process, std::string& labels_filename, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);

    if (mode == "-s" && argc == 5) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string sub_type(argv[4]);
        // Only the first character matters, so e.g. "n" and "nano" select the same variant.
        switch (sub_type.empty() ? '\0' : sub_type[0]) {
            case 'n':
                gd = 0.50;
                gw = 0.25;
                max_channels = 1024;
                type = "n";
                break;
            case 's':
                gd = 0.50;
                gw = 0.50;
                max_channels = 1024;
                type = "s";
                break;
            case 'm':
                gd = 0.50;
                gw = 1.00;
                max_channels = 512;
                type = "m";
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                max_channels = 512;
                type = "l";
                break;
            case 'x':
                gd = 1.0;
                gw = 1.50;
                max_channels = 512;
                type = "x";
                break;
            default:
                return false;
        }
        return true;
    }

    if (mode == "-d" && argc == 6) {
        const std::string post_mode(argv[4]);
        if (post_mode != "c" && post_mode != "g") {
            return false;
        }
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = post_mode;
        labels_filename = std::string(argv[5]);
        return true;
    }

    return false;
}

int main(int argc, char** argv) {
    // yolo11_seg -s ../models/yolo11n-seg.wts ../models/yolo11n-seg.fp32.trt n
    // yolo11_seg -d ../models/yolo11n-seg.fp32.trt ../images c coco.txt
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string type;
    std::string cuda_post_process;
    std::string labels_filename = "coco.txt";
    int model_bboxes;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, labels_filename, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolo11_seg -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolo11_seg -d [.engine] ../images  [c/g] coco_file// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, type, gd, gw, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[3];
    float* output_buffer_host = nullptr;
    float* output_seg_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    std::unordered_map<int, std::string> labels_map;
    read_labels(labels_filename, labels_map);
    assert(kNumClass == labels_map.size());

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host,
                   &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process);

    // // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize,
              decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
            for (size_t b = 0; b < img_batch.size(); b++) {
                auto& res = res_batch[b];
                cv::Mat img = img_batch[b];
                auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res);
                draw_mask_bbox(img, res, masks, labels_map);
                cv::imwrite("_" + img_name_batch[b], img);
            }
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            // batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
            // todo seg in gpu
            std::cerr << "seg_postprocess is not support in gpu right now" << std::endl;
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(device_buffers[2]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    delete[] output_seg_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    // std::cout << "\nOutput:\n\n";
    // for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    // std::cout << std::endl;

    return 0;
}


================================================
FILE: yolo11/yolo11_seg_trt.py
================================================
"""
An example that uses TensorRT's Python API to run inference.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Minimum confidence (column 4 of a detection row) for a box to survive non_max_suppression.
CONF_THRESH = 0.5
# IoU above which a lower-scored box of the same class is suppressed during NMS.
IOU_THRESHOLD = 0.4
# The *_NUM constants are per-detection field counts; YoLo11TRT adds POSE_NUM, DET_NUM and
# OBB_NUM to the prototype channel count to obtain the width of one detection row.
# presumably 17 keypoints x (x, y, score) -- TODO confirm against the plugin output layout
POSE_NUM = 17 * 3
# Leading box fields of a row: four coordinates, confidence, class id (columns 0-5).
DET_NUM = 6
# Mask coefficients that follow the box fields (post_process slices DET_NUM:DET_NUM + SEG_NUM).
SEG_NUM = 32
# presumably a single rotation-angle slot -- TODO confirm against the plugin output layout
OBB_NUM = 1


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory to walk
    return:
        a list of lists of paths; a batch is closed once it holds batch_size paths and
        another file arrives, and the trailing non-empty batch is appended at the end
    """
    batches = []
    current = []
    all_paths = (
        os.path.join(folder, fname)
        for folder, _subdirs, fnames in os.walk(img_dir)
        for fname in fnames
    )
    for path in all_paths:
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw a single bounding box, and optionally a filled label tag above its
                 top-left corner, directly onto img.
    param:
        x:      a box like [x1, y1, x2, y2]; values are truncated to int
        img:    an opencv image object, modified in place
        color:  color of the rectangle, such as (0,255,0); three random values when omitted
        label:  str, text of the tag; no tag is drawn when empty or None
        line_thickness: int, line width; derived from the image size when omitted
    return:
        no return

    """
    rows, cols = img.shape[0], img.shape[1]
    # line/font thickness
    thickness = line_thickness or round(0.002 * (rows + cols) / 2) + 1
    color = color or [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The tag starts at the box corner and extends up and to the right by the text size.
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    text_origin = (top_left[0], top_left[1] - 2)
    cv2.putText(
        img,
        label,
        text_origin,
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLo11TRT(object):
    """
    description: A YOLO11 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and allocate one
                     page-locked host buffer and one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            self.batch_size = engine.get_binding_shape(binding)[0]
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings

        # Data length
        self.det_output_length = host_outputs[0].shape[0]
        self.seg_output_length = host_outputs[1].shape[0]
        self.seg_w = int(self.input_w / 4)
        self.seg_h = int(self.input_h / 4)
        self.seg_c = self._seg_channel_count(self.seg_output_length, self.seg_h, self.seg_w)
        self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + OBB_NUM

        # Draw mask
        self.colors_obj = Colors()

    @staticmethod
    def _seg_channel_count(seg_output_length, seg_h, seg_w):
        """
        description: Number of seg_h x seg_w planes contained in a flat prototype buffer.
                     The plane area is height * width, so this also holds for non-square inputs.
        param:
            seg_output_length: int, number of elements in the prototype buffer
            seg_h: int, plane height
            seg_w: int, plane width
        return:
            int, number of planes (prototype channels)
        """
        return int(seg_output_length / (seg_h * seg_w))

    def infer(self, raw_image_generator):
        """
        description: Preprocess the images, run the engine on them and draw masks, boxes
                     and labels onto the raw images.
        param:
            raw_image_generator: iterable yielding at most batch_size BGR images
        return:
            batch_image_raw: list of the raw images with the results drawn on them
            use_time:        seconds spent on host-to-device copy, inference and
                             device-to-host copy
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so the unused slots of a partial batch hold defined data.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)

            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it,
            # even when preprocessing or inference raised.
            self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        output_proto_mask = host_outputs[1]
        # Do postprocess, only for the images that were actually supplied: the last batch of
        # a directory may hold fewer than batch_size images.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid, result_proto_coef = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )

            if result_proto_coef.shape[0] == 0:
                continue
            result_masks = self.process_mask(output_proto_mask, result_proto_coef, result_boxes, batch_origin_h[i],
                                             batch_origin_w[i])

            self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid],
                           im_src=batch_image_raw[i])

            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: the BGR image to process
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map nx4 corner boxes from network-input coordinates back to the
                        original image: the letterbox padding is subtracted on the padded
                        axis and all four values are divided by the resize ratio.
                        Despite the name, no center/size to corner conversion is done here;
                        columns 0/2 are treated as x values and columns 1/3 as y values.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [x1, y1, x2, y2] in input coordinates
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2] in original coordinates
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array: element 0 is the number of boxes, followed by
                        rows of det_row_output_length values each
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
            result_proto_coef: mask coefficients, a numpy, one row of SEG_NUM values per box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, self.det_row_output_length))[:num, :]

        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        result_proto_coef = boxes[:, DET_NUM:int(DET_NUM + SEG_NUM)] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid, result_proto_coef

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, each row starts with (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, rows keep the input layout with the first four columns
                   mapped to original image coordinates
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the boxes from network-input coordinates to original image coordinates
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def scale_mask(self, mask, ih, iw):
        """
        description: Resize a prototype-resolution mask to the network input size, crop away
                     the letterbox padding and resize the crop to the original image size.
        param:
            mask: 2D mask at prototype resolution
            ih:   rows of original image
            iw:   cols of original image
        return:
            crop: mask of shape (ih, iw)
        """
        mask = cv2.resize(mask, (self.input_w, self.input_h))
        r_w = self.input_w / (iw * 1.0)
        r_h = self.input_h / (ih * 1.0)
        if r_h > r_w:
            w = self.input_w
            h = int(r_w * ih)
            x = 0
            y = int((self.input_h - h) / 2)
        else:
            w = int(r_h * iw)
            h = self.input_h
            x = int((self.input_w - w) / 2)
            y = 0
        crop = mask[y:y + h, x:x + w]
        crop = cv2.resize(crop, (iw, ih))
        return crop

    def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
        """
        description: Mask pred by yolo11 instance segmentation ,
        param:
            output_proto_mask: prototype mask e.g. (32, 160, 160) for 640x640 input
            result_proto_coef: prototype mask coefficients (n, 32), n represents n results
            result_boxes     : boxes in original image coordinates, one per result
            ih: rows of original image
            iw: cols of original image
        return:
            mask_result: (n, ih, iw)
        """
        # NOTE(review): the whole prototype buffer is reshaped here, which looks like it only
        # matches a single-image batch -- confirm before relying on batch_size > 1.
        result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
        c, mh, mw = result_proto_masks.shape
        print(result_proto_masks.shape)
        print(result_proto_coef.shape)
        masks = self.sigmoid((result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1))).reshape(-1, mh,
                                                                                                                 mw)

        mask_result = []
        for mask, box in zip(masks, result_boxes):
            mask_s = np.zeros((ih, iw))
            crop_mask = self.scale_mask(mask, ih, iw)
            x1 = int(box[0])
            y1 = int(box[1])
            x2 = int(box[2])
            y2 = int(box[3])
            # Only the region inside the box is kept, binarized at 0.5.
            crop = crop_mask[y1:y2, x1:x2]
            crop = np.where(crop >= 0.5, 1, 0)
            crop = crop.astype(np.uint8)
            mask_s[y1:y2, x1:x2] = crop

            mask_result.append(mask_s)
        mask_result = np.array(mask_result)
        return mask_result

    def draw_mask(self, masks, colors_, im_src, alpha=0.5):
        """
        description: Draw mask on image ,
        param:
            masks  : result_mask
            colors_: color to draw mask
            im_src : original image
            alpha  : scale between original  image and mask
        return:
            no return
        """
        if len(masks) == 0:
            return
        masks = np.asarray(masks, dtype=np.uint8)
        masks = np.ascontiguousarray(masks.transpose(1, 2, 0))
        masks = np.asarray(masks, dtype=np.float32)
        colors_ = np.asarray(colors_, dtype=np.float32)
        s = masks.sum(2, keepdims=True).clip(0, 1)
        masks = (masks @ colors_).clip(0, 255)
        im_src[:] = masks * alpha + im_src * (1 - s * alpha)


class inferThread(threading.Thread):
    """
    description: Thread that runs the wrapper's inference on one batch of image paths and
                 saves each returned image under output/ using the original file name.
    """

    def __init__(self, yolo11_wrapper, image_path_batch):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolo11_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        drawn_images, elapsed = wrapper.infer(raw_images)
        for idx in range(len(self.image_path_batch)):
            file_name = os.path.basename(self.image_path_batch[idx])
            # Save image
            cv2.imwrite(os.path.join('output', file_name), drawn_images[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, elapsed * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on the wrapper's all-zero warm-up images and
                 prints the shape of the first returned image together with the elapsed time.
    """

    def __init__(self, yolo11_wrapper):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper

    def run(self):
        wrapper = self.yolo11_wrapper
        warmup_images = wrapper.get_raw_image_zeros()
        drawn_images, elapsed = wrapper.infer(warmup_images)
        first_shape = drawn_images[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed * 1000))


class Colors:
    """Fixed 20-entry color palette addressed by integer index (wraps around)."""

    def __init__(self):
        hex_codes = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A',
                     '92CC17', '3DDB86', '1A9334', '00D4BB', '2C99A8', '00C2FF',
                     '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF',
                     'FF95C8', 'FF37C7')
        self.palette = [self.hex2rgb('#' + code) for code in hex_codes]
        self.n = len(self.palette)

    def __call__(self, i, bgr=False):
        """Return the palette color for index ``i`` (modulo palette size).

        The tuple is (r, g, b) by default and (b, g, r) when ``bgr`` is true.
        """
        rgb = self.palette[int(i) % self.n]
        if bgr:
            return (rgb[2], rgb[1], rgb[0])
        return rgb

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        """Convert a '#RRGGBB' string into an (r, g, b) tuple of ints."""
        # Offsets 1, 3, 5 skip the leading '#' and step through the three byte pairs.
        return tuple(int(h[start:start + 2], 16) for start in (1, 3, 5))


# Script entry point: load the plugin library and engine, warm up, then run
# inference over every image batch found in images/ and save results to output/.
if __name__ == "__main__":
    # load custom plugin and engine
    PLUGIN_LIBRARY = "build/libmyplugins.so"
    engine_file_path = "yolo11n-seg.engine"

    # Optional CLI overrides: argv[1] = engine file, argv[2] = plugin library.
    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]

    # Load the plugin shared library into the process before the engine is created.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels
    # NOTE(review): `categories` is not referenced inside this block; presumably it is
    # read as a module-level global elsewhere in the file - confirm before renaming.

    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
                  "traffic light",
                  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
                  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
                  "frisbee",
                  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
                  "surfboard",
                  "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                  "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
                  "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
                  "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
                  "teddy bear",
                  "hair drier", "toothbrush"]

    # Start from an empty output/ directory: any previous results are deleted.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLo11TRT instance
    yolo11_wrapper = YoLo11TRT(engine_file_path)
    try:
        print('batch size is', yolo11_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolo11_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each in its own thread that is joined before the next starts.
        for i in range(10):
            # create a new thread to do warm_up
            thread1 = warmUpThread(yolo11_wrapper)
            thread1.start()
            thread1.join()
        # Batches are processed one at a time: each thread is joined before the next starts.
        for batch in image_path_batches:
            # create a new thread to do inference
            thread1 = inferThread(yolo11_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance
        yolo11_wrapper.destroy()


================================================
FILE: yolo11_tripy/.gitignore
================================================
imagenet_classes.txt
*.JPEG
*.pt


================================================
FILE: yolo11_tripy/README.md
================================================
# YOLO11 Tripy

This example implements a YOLO11 classifier model using [Tripy](https://nvidia.github.io/TensorRT-Incubator/).

## Running The Example

Run the following commands from the [`yolo11_tripy`](./) directory:

1. Install Dependencies:

    ```bash
    python3 -m pip install -r requirements.txt
    ```

2. Download ImageNet classes file:

    ```bash
    wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt
    ```

3. [*Optional*] Download some images:

    ```bash
    wget https://raw.githubusercontent.com/EliSchwartz/imagenet-sample-images/master/n01558993_robin.JPEG
    wget https://raw.githubusercontent.com/EliSchwartz/imagenet-sample-images/master/n04389033_tank.JPEG
    ```

    You can skip this step if you already have images you'd like to classify.

4. Build the model:

    ```bash
    python3 compile_classifier.py
    ```

    You can configure various aspects of the model when you compile.
    Run `python3 compile_classifier.py -h` for details.

5. Run inference:

    ```bash
    python3 classify.py n01558993_robin.JPEG n04389033_tank.JPEG
    ```

    The `classify.py` script allows you to pass one or more image file paths on the command line.
    The images are batched and classified in a single forward pass.


================================================
FILE: yolo11_tripy/classify.py
================================================
import argparse
import os

import cv2
import numpy as np
import nvtripy as tp
import time
from constants import IMAGE_H, IMAGE_W

CURDIR = os.path.realpath(os.path.dirname(__file__))


def load_image(path):
    """Read an image file from disk with OpenCV.

    Args:
        path: Path of the image file to read.

    Returns:
        The array returned by ``cv2.imread``.

    Raises:
        OSError: If OpenCV could not read the file. ``cv2.imread`` signals a
            missing, unreadable or undecodable file by returning ``None``
            rather than raising, which would otherwise only surface later as
            an unrelated attribute error during preprocessing.
    """
    image = cv2.imread(path)
    if image is None:
        raise OSError(f"Could not read image (missing file or unsupported format): {path}")
    return image


def preprocess(image):
    """Convert one image into a normalized, contiguous NCHW float32 batch of one.

    Steps: center-crop to a square, resize to the model input size, convert
    BGR to RGB, scale to [0, 1], and reorder HWC -> CHW -> NCHW.

    Args:
        image: H x W x C array; it is passed through ``cv2.COLOR_BGR2RGB``, so
            it is treated as BGR.

    Returns:
        A C-contiguous ``float32`` array with a leading batch dimension of 1.
    """
    h, w, _ = image.shape
    # Crop the center square frame
    m = min(h, w)
    top = (h - m) // 2
    left = (w - m) // 2
    image = image[top:top + m, left:left + m]

    # Resize the square crop to the model input size.
    # cv2.resize takes dsize as (width, height), not (height, width).
    image = cv2.resize(image, (IMAGE_W, IMAGE_H), interpolation=cv2.INTER_LINEAR)

    # Convert BGR to RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Normalize to [0,1]
    image = image.astype(np.float32) / 255.0

    # HWC to CHW format
    image = image.transpose(2, 0, 1)

    # CHW to NCHW format (add batch dimension)
    image = np.expand_dims(image, axis=0)

    # Convert the image to row-major order, also known as "C order"
    image = np.ascontiguousarray(image)

    return image


def main():
    """Command-line entry point: classify the given images with a compiled model.

    Loads the class names and the compiled executable, preprocesses all images
    into a single batch, runs one warm-up call and one timed call, then prints
    the predictions returned by the model for every image.
    """
    parser = argparse.ArgumentParser(description="Classify an image using a YOLO11 classifier model.")
    parser.add_argument("images", help="Images to classify", nargs="+")
    parser.add_argument(
        "--model-path",
        help="Path to the compiled model",
        default=os.path.join(CURDIR, "yolo11-cls.tpymodel"),
    )
    parser.add_argument(
        "--imagenet-classes-file",
        help="Path to the ImageNet classes file (imagenet_classes.txt)",
        default=os.path.join(CURDIR, "imagenet_classes.txt"),
    )
    # Unknown arguments are tolerated and discarded.
    args, _ = parser.parse_known_args()

    with open(args.imagenet_classes_file) as classes_file:
        class_names = [line.strip() for line in classes_file]

    print(f"Loading model: {args.model_path}...")
    model = tp.Executable.load(args.model_path)

    batch_info = model.input_infos["batch"]
    dtype = batch_info.dtype

    # Refuse batches larger than the maximum the executable was compiled for.
    max_batch = batch_info.shape_bounds.max[0]
    num_images = len(args.images)
    if max_batch < num_images:
        raise ValueError(
            f"Model was compiled for a maximum of {max_batch} image(s) "
            f"per batch, but {num_images} were provided."
            f"\nPlease recompile the model with a larger maximum batch size using the "
            f"`--max-images` argument in `compile_classifier.py`."
        )

    preprocessed = []
    for path in args.images:
        preprocessed.append(preprocess(load_image(path)))
    batch = tp.Tensor(np.concatenate(preprocessed, axis=0))

    # Warm up the model:
    _, _ = model(tp.zeros_like(batch, dtype=dtype).eval())

    # Cast the input based on the model type.
    # Note that the result will be in GPU memory, so we don't need an explicit copy.
    batch = tp.cast(batch, dtype).eval()

    start = time.perf_counter()
    batch_scores, batch_preds = model(batch)
    elapsed_ms = (time.perf_counter() - start) * 1000

    print(f"Inference + Postprocessing took: {elapsed_ms:.3f} ms")

    # Copy the scores back to CPU memory and convert to numpy:
    batch_scores = np.from_dlpack(tp.copy(batch_scores, device=tp.device("cpu")))
    batch_preds = np.from_dlpack(tp.copy(batch_preds, device=tp.device("cpu")))

    for path, scores, preds in zip(args.images, batch_scores, batch_preds):
        print(f"Top {len(preds)} predictions for:", path)
        for rank, (score, pred) in enumerate(zip(scores, preds), start=1):
            print(f"    {rank}. (confidence: {score:.3f}) {class_names[pred]}")
        print()


if __name__ == "__main__":
    main()


================================================
FILE: yolo11_tripy/compile_classifier.py
================================================
import argparse
import os

import nvtripy as tp
import requests
import torch
from constants import IMAGE_C, IMAGE_H, IMAGE_W
from model.model import Yolo11Cls
from tqdm import tqdm

CURDIR = os.path.realpath(os.path.dirname(__file__))


def get_model_config(model_variant):
    """Return the constructor configuration for a model variant.

    The result always contains ``"model_variant"``. For the known variants
    (n, s, m, l, x) it also contains ``"gd"``, ``"gw"`` and ``"max_channels"``;
    any other variant gets no extra keys.
    """
    scaling_by_variant = {
        "n": {"gd": 0.50, "gw": 0.25, "max_channels": 1024},
        "s": {"gd": 0.50, "gw": 0.50, "max_channels": 1024},
        "m": {"gd": 0.50, "gw": 1.00, "max_channels": 512},
        "l": {"gd": 1.0, "gw": 1.0, "max_channels": 512},
        "x": {"gd": 1.0, "gw": 1.50, "max_channels": 512},
    }
    config = {"model_variant": model_variant}
    config.update(scaling_by_variant.get(model_variant, {}))
    return config


def download_weights(model_variant, directory):
    """Download the classifier checkpoint for a variant unless it is already cached.

    Args:
        model_variant: Variant letter inserted into the checkpoint file name
            (``yolo11{model_variant}-cls.pt``).
        directory: Directory in which the checkpoint is stored; created if needed.

    Returns:
        Path of the checkpoint file inside ``directory``.

    Raises:
        requests.HTTPError: If the server answers with an error status.

    The file is first written to ``<out_path>.part`` and only renamed to its
    final name once the download has completed. Otherwise an interrupted
    download would leave a truncated file behind that the "already exists"
    check above would accept as a valid checkpoint on the next run.
    """
    out_path = os.path.join(directory, f"yolo11{model_variant}-cls.pt")

    if os.path.exists(out_path):
        print(f"Checkpoint already exists at: {out_path}, skipping download.")
        return out_path

    URL = f"https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11{model_variant}-cls.pt"

    tmp_path = out_path + ".part"
    try:
        # The timeout guards against a stalled connection hanging forever; the
        # context manager releases the connection even if writing fails.
        with requests.get(URL, stream=True, timeout=30) as response:
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))

            os.makedirs(directory, exist_ok=True)

            with open(tmp_path, "wb") as f, tqdm(
                desc=f"Downloading checkpoint: yolo11{model_variant}-cls.pt",
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    progress_bar.update(len(chunk))

        # Publish the finished file under its final name in one step.
        os.replace(tmp_path, out_path)
    except BaseException:
        # Also covers Ctrl-C: never leave a partial download behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    return out_path


def load_weights(weights_path, dtype):
    """Load a PyTorch checkpoint and return its weights as Tripy tensors.

    Args:
        weights_path: Path of the checkpoint file; its ``"model"`` entry is used.
        dtype: ``tp.float16`` or ``tp.float32``; the torch model is converted
            to the matching precision before its state dict is read.

    Returns:
        Dict mapping state-dict names to ``tp.Tensor`` values, without the
        ``num_batches_tracked`` entries.
    """
    # NOTE(review): weights_only=False unpickles arbitrary objects from the
    # checkpoint file - only load checkpoints from a trusted source.
    torch_model = torch.load(weights_path, weights_only=False)["model"].eval()

    if dtype == tp.float16:
        torch_model = torch_model.half()
    else:
        assert dtype == tp.float32, "Unsupported dtype"
        torch_model = torch_model.float()

    weights = {}
    for name, weight in torch_model.state_dict().items():
        # Some weights from the training graph are not needed for inference:
        if "num_batches_tracked" in name:
            continue
        weights[name] = tp.Tensor(weight)
    return weights


def main():
    """Command-line entry point: build, compile and save the classifier executable.

    Builds the model for the requested variant and dtype, loads the downloaded
    checkpoint into it, compiles the model together with its softmax/top-k
    postprocessing, and saves the executable to the output path.
    """
    parser = argparse.ArgumentParser(description="Compiles a YOLO11 classifier model with Tripy.")
    variant_choices = ["n", "s", "m", "l", "x"]
    parser.add_argument(
        "--model-variant", help="Model variant (n, s, m, l, x)", default="n", choices=variant_choices
    )
    parser.add_argument("-o", "--output", help="Where to save the Tripy executable", default="yolo11-cls.tpymodel")
    parser.add_argument(
        "--checkpoints-dir", help="Where to save PyTorch checkpoints", default=os.path.join(CURDIR, "checkpoints")
    )
    parser.add_argument(
        "--max-images",
        help="Maximum number of images the model will be able to classify at once, i.e. the maximum batch size.",
        default=10,
        type=int,
    )
    dtype_choices = ["float32", "float16"]
    parser.add_argument("--dtype", help="Data type to use for inference", default="float16", choices=dtype_choices)

    # Unknown arguments are tolerated and discarded.
    args, _ = parser.parse_known_args()

    # The --dtype string names an attribute of the tripy module.
    dtype = getattr(tp, args.dtype)
    model = Yolo11Cls(**get_model_config(args.model_variant), dtype=dtype)

    weights_path = download_weights(args.model_variant, args.checkpoints_dir)
    model.load_state_dict(load_weights(weights_path, dtype))

    # We compile not only the classifier itself, but also accelerate the postprocessing:
    def infer(batch):
        probabilities = tp.softmax(model(batch), dim=1)
        batch_scores, batch_preds = tp.topk(probabilities, 3, dim=-1)
        return batch_scores, batch_preds

    # Support a range of batch sizes from 1 to `max_images`, optimizing for the midpoint:
    batch_bounds = (1, (args.max_images + 1) // 2, args.max_images)
    batch_info = tp.InputInfo([batch_bounds, IMAGE_C, IMAGE_H, IMAGE_W], dtype=dtype)

    print("Compiling YOLO11 classifier + postprocessing. This may take a few moments...")
    executable = tp.compile(infer, args=[batch_info])

    print(f"Saving compiled executable to: {args.output}")
    executable.save(args.output)


if __name__ == "__main__":
    main()


================================================
FILE: yolo11_tripy/constants.py
================================================
# Dimensions of a single model input image (channels, height, width).
IMAGE_C = 3
IMAGE_H = 224
IMAGE_W = 224


================================================
FILE: yolo11_tripy/model/block.py
================================================
import nvtripy as tp


class ConvBnSilu(tp.Module):
    """Bias-free convolution followed by batch normalization and SiLU."""

    def __init__(self, in_channels, out_channels, kernel_dims, stride, dtype):
        super().__init__()
        # Pad every spatial dimension by half the kernel size on both sides.
        half_kernel_padding = [(k // 2, k // 2) for k in kernel_dims]
        self.conv = tp.Conv(
            in_channels,
            out_channels,
            kernel_dims,
            stride=stride,
            padding=half_kernel_padding,
            bias=False,
            dtype=dtype,
        )
        self.bn = tp.BatchNorm(out_channels, eps=1e-3, dtype=dtype)

    def forward(self, x):
        return tp.silu(self.bn(self.conv(x)))


class Bottleneck(tp.Module):
    """Two stacked ConvBnSilu layers with an optional residual connection.

    The residual is only applied when ``shortcut`` is set and the input and
    output channel counts match.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        shortcut,
        kernel_dims1,
        kernel_dims2,
        expansion_ratio,
        dtype,
    ):
        super().__init__()
        hidden_channels = int(out_channels * expansion_ratio)
        unit_stride = (1, 1)
        self.cv1 = ConvBnSilu(in_channels, hidden_channels, kernel_dims1, stride=unit_stride, dtype=dtype)
        self.cv2 = ConvBnSilu(hidden_channels, out_channels, kernel_dims2, stride=unit_stride, dtype=dtype)

        self.shortcut = shortcut and in_channels == out_channels

    def forward(self, x):
        out = self.cv2(self.cv1(x))
        if self.shortcut:
            out += x
        return out


class C3k(tp.Module):
    """Two parallel 1x1 branches, one passed through a Bottleneck stack, then fused.

    ``cv1`` feeds ``num_layers`` Bottlenecks, ``cv2`` is a plain 1x1 branch;
    both are concatenated on the channel axis and fused by ``cv3``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        num_layers,
        shortcut,
        kernel_dims1,
        kernel_dims2,
        expansion_ratio,
        dtype,
    ):
        super().__init__()
        hidden_channels = int(out_channels * expansion_ratio)

        def pointwise(c_in, c_out):
            # 1x1 convolution block with unit stride.
            return ConvBnSilu(c_in, c_out, kernel_dims=(1, 1), stride=(1, 1), dtype=dtype)

        self.cv1 = pointwise(in_channels, hidden_channels)
        self.cv2 = pointwise(in_channels, hidden_channels)

        bottlenecks = []
        for _ in range(num_layers):
            bottlenecks.append(
                Bottleneck(hidden_channels, hidden_channels, shortcut, kernel_dims1, kernel_dims2, 1.0, dtype=dtype)
            )
        self.m = tp.Sequential(*bottlenecks)

        self.cv3 = pointwise(2 * hidden_channels, out_channels)

    def forward(self, x):
        deep_branch = self.cv1(x)
        skip_branch = self.cv2(x)

        deep_branch = self.m(deep_branch)
        merged = tp.concatenate((deep_branch, skip_branch), dim=1)
        return self.cv3(merged)


class C3K2(tp.Module):
    """1x1 conv, a chain of C3k or Bottleneck layers, and a fusing 1x1 conv.

    The second half of ``cv1``'s output channels is fed through the layers in
    ``m`` one after another; ``cv1``'s full output and every intermediate layer
    output are concatenated on the channel axis and fused by ``cv2``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        num_layers,
        use_c3k,
        shortcut,
        expansion_ratio,
        dtype,
    ):
        super().__init__()

        hidden_channels = int(out_channels * expansion_ratio)
        self.cv1 = ConvBnSilu(
            in_channels,
            2 * hidden_channels,
            kernel_dims=(1, 1),
            stride=(1, 1),
            dtype=dtype,
        )

        layers = []
        for _ in range(num_layers):
            if use_c3k:
                layer = C3k(hidden_channels, hidden_channels, 2, shortcut, (3, 3), (3, 3), 0.5, dtype=dtype)
            else:
                layer = Bottleneck(hidden_channels, hidden_channels, shortcut, (3, 3), (3, 3), 0.5, dtype=dtype)
            layers.append(layer)
        self.m = tp.Sequential(*layers)

        # Number of input channels to CV2 is the output channels of CV1 plus all
        # output channels from the layers in `m`.
        fused_in_channels = (2 * hidden_channels) + (hidden_channels * num_layers)
        self.cv2 = ConvBnSilu(fused_in_channels, out_channels, (1, 1), (1, 1), dtype=dtype)

    def forward(self, x):
        x = self.cv1(x)

        # Only the second half of the channels is fed into the layer chain.
        _, chain_input = tp.split(x, 2, dim=1)

        stacked = x
        # We manually iterate over the Sequential module here since we need to access the intermediate outputs.
        for layer in self.m:
            chain_input = layer(chain_input)
            stacked = tp.concatenate((stacked, chain_input), dim=1)
        return self.cv2(stacked)


class ConvBn(tp.Module):
    """Bias-free (optionally grouped) convolution followed by batch normalization."""

    def __init__(self, in_channels, out_channels, kernel_dims, stride, dtype, num_groups=1):
        super().__init__()
        # Pad every spatial dimension by half the kernel size on both sides.
        half_kernel_padding = [(k // 2, k // 2) for k in kernel_dims]
        self.conv = tp.Conv(
            in_channels,
            out_channels,
            kernel_dims,
            stride=stride,
            padding=half_kernel_padding,
            bias=False,
            groups=num_groups,
            dtype=dtype,
        )
        self.bn = tp.BatchNorm(out_channels, eps=1e-3, dtype=dtype)

    def forward(self, x):
        return self.bn(self.conv(x))


class Attention(tp.Module):
    """Multi-head self-attention over the spatial positions of a feature map.

    ``qkv`` produces, per head, ``key_dim`` query channels, ``key_dim`` key
    channels and ``head_dim`` value channels. ``pe`` is a depthwise 3x3
    convolution applied to the values and added to the attention output before
    the final ``proj`` convolution.

    Args:
        dim: Number of input and output channels.
        num_heads: Number of attention heads; ``dim // num_heads`` is the
            per-head value width.
        attn_ratio: Ratio of the query/key width to the per-head value width.
        dtype: Data type of the layer parameters.
    """

    def __init__(self, dim, num_heads, attn_ratio, dtype):
        super().__init__()

        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = self.dim // num_heads
        self.key_dim = int(self.head_dim * attn_ratio)
        self.scale = self.key_dim**-0.5
        nh_kd = self.key_dim * num_heads
        # Total qkv channels: values (dim) plus queries and keys (nh_kd each).
        h = self.dim + nh_kd * 2

        self.qkv = ConvBn(self.dim, h, (1, 1), (1, 1), dtype=dtype)
        self.pe = ConvBn(self.dim, self.dim, (3, 3), (1, 1), dtype=dtype, num_groups=self.dim)
        self.proj = ConvBn(self.dim, self.dim, (1, 1), (1, 1), dtype=dtype)

    def forward(self, x):
        B, _, H, W = x.shape
        N = H * W

        qkv = self.qkv(x)

        # (B, h, H, W) -> (B, heads, channels per head, N)
        qkv = tp.reshape(qkv, (B, self.num_heads, -1, N))

        # The value slice is `head_dim` wide. The previous hard-coded `key_dim * 2`
        # only equals `head_dim` when attn_ratio == 0.5.
        q, k, v = tp.split(qkv, [self.key_dim, self.key_dim, self.head_dim], dim=2)

        q_t = tp.transpose(q, 2, 3)

        attn = tp.softmax((q_t @ k) * self.scale, dim=3)

        attn_t = tp.transpose(attn, 2, 3)

        attended = tp.reshape(v @ attn_t, (B, -1, H, W))

        # Positional term computed from the values in their spatial layout.
        positional = self.pe(tp.reshape(v, (B, self.dim, H, W)))

        return self.proj(attended + positional)


class PSABlock(tp.Module):
    """Attention followed by a two-layer 1x1 feed-forward network.

    With ``shortcut`` set, each of the two stages adds its output to its input;
    otherwise each stage's output replaces its input.
    """

    def __init__(self, dim, attn_ratio, num_heads, shortcut, dtype):
        super().__init__()

        self.attn = Attention(dim, num_heads, attn_ratio, dtype=dtype)
        self.shortcut = shortcut

        expand = ConvBnSilu(dim, dim * 2, (1, 1), (1, 1), dtype=dtype)
        project = ConvBn(dim * 2, dim, (1, 1), (1, 1), dtype=dtype)
        self.ffn = tp.Sequential(expand, project)

    def forward(self, x):
        attn_out = self.attn(x)
        x = x + attn_out if self.shortcut else attn_out

        ffn_out = self.ffn(x)
        return x + ffn_out if self.shortcut else ffn_out


class C2PSA(tp.Module):
    """1x1 conv, split in two halves, PSABlocks on the second half, 1x1 fuse.

    The first half of ``cv1``'s output is passed through unchanged; the second
    half goes through ``num_layers`` PSABlocks. Both are concatenated on the
    channel axis and fused by ``cv2``.
    """

    def __init__(self, input_channels, output_channels, num_layers, expansion_ratio, dtype):
        super().__init__()

        hidden_channels = int(input_channels * expansion_ratio)

        self.cv1 = ConvBnSilu(input_channels, 2 * hidden_channels, (1, 1), (1, 1), dtype=dtype)

        blocks = []
        for _ in range(num_layers):
            # attn_ratio 0.5, one head per 64 channels, shortcut enabled.
            blocks.append(PSABlock(hidden_channels, 0.5, hidden_channels // 64, True, dtype=dtype))
        self.m = tp.Sequential(*blocks)

        self.cv2 = ConvBnSilu(2 * hidden_channels, output_channels, (1, 1), (1, 1), dtype=dtype)

    def forward(self, x):
        x = self.cv1(x)

        passthrough, attended = tp.split(x, 2, dim=1)

        attended = self.m(attended)

        merged = tp.concatenate((passthrough, attended), dim=1)
        return self.cv2(merged)


================================================
FILE: yolo11_tripy/model/model.py
================================================
import math

import nvtripy as tp

from .block import C2PSA, C3K2, ConvBnSilu

NUM_CLASSES = 1000


def get_width(w, gw, max_channels, divisor=8):
    """Scale a channel count and round it up to a multiple of ``divisor``.

    ``w`` is first capped at ``max_channels``, then multiplied by ``gw``.
    """
    scaled = min(w, max_channels) * gw
    multiples = int(math.ceil(scaled / divisor))
    return multiples * divisor


def get_depth(d, gd):
    """Scale a layer repeat count by the depth multiple.

    Args:
        d: Base number of repeats.
        gd: Depth multiple.

    Returns:
        ``d`` unchanged when it is 1; otherwise ``round(d * gd)``, but never
        less than 1.
    """
    if d == 1:
        return d

    # Python's round() already rounds ties to the nearest even integer, so no
    # manual tie correction is needed. The previous extra `r -= 1` applied the
    # correction a second time and under-counted (e.g. d=5, gd=0.5 gave 1, not 2).
    return max(round(d * gd), 1)


class Yolo11Head(tp.Module):
    """Classification head: 1x1 conv to 1280 channels, global average pool, linear."""

    def __init__(self, input_channels, dtype):
        super().__init__()
        self.conv = ConvBnSilu(input_channels, 1280, (1, 1), (1, 1), dtype=dtype)
        self.linear = tp.Linear(1280, NUM_CLASSES, dtype=dtype)

    def forward(self, x):
        features = self.conv(x)
        # Global average pooling:
        pooled = tp.mean(features, dim=(2, 3), keepdim=True)
        flattened = tp.reshape(pooled, (-1, 1280))
        return self.linear(flattened)


class Yolo11Cls(tp.Module):
    """YOLO11 classifier: a sequence of conv, C3K2 and C2PSA blocks plus the head.

    Args:
        model_variant: Variant letter; "m", "l" and "x" use C3k layers inside
            the first two C3K2 stages (the last two always do).
        gd: Depth multiple passed to ``get_depth``.
        gw: Width multiple passed to ``get_width``.
        max_channels: Channel cap passed to ``get_width``.
        dtype: Data type of all layer parameters.
    """

    def __init__(self, model_variant, gd, gw, max_channels, dtype=tp.float32):
        # Initialize the base module first, like every other block in this package.
        super().__init__()
        use_c3k = model_variant in {"m", "l", "x"}

        # Scaled channel widths, computed once instead of at every use.
        c64, c128, c256, c512, c1024 = (get_width(w, gw, max_channels) for w in (64, 128, 256, 512, 1024))
        depth = get_depth(2, gd)

        def downsample(c_in, c_out):
            # 3x3 convolution block with stride 2.
            return ConvBnSilu(c_in, c_out, (3, 3), (2, 2), dtype=dtype)

        # The position of each layer in this Sequential must not change: the
        # checkpoint weights are loaded by name via `load_state_dict`.
        self.model = tp.Sequential(
            downsample(3, c64),
            downsample(c64, c128),
            C3K2(c128, c256, depth, use_c3k, True, 0.25, dtype=dtype),
            downsample(c256, c256),
            C3K2(c256, c512, depth, use_c3k, True, 0.25, dtype=dtype),
            downsample(c512, c512),
            C3K2(c512, c512, depth, True, True, 0.5, dtype=dtype),
            downsample(c512, c1024),
            C3K2(c1024, c1024, depth, True, True, 0.5, dtype=dtype),
            C2PSA(c1024, c1024, depth, 0.5, dtype=dtype),
            Yolo11Head(c1024, dtype=dtype),
        )

    def forward(self, x):
        x = self.model(x)
        return x


================================================
FILE: yolo11_tripy/requirements.txt
================================================
-f https://nvidia.github.io/TensorRT-Incubator/packages.html
nvtripy>=0.1.1
opencv-python-headless
numpy
torch
# Imported by compile_classifier.py to download checkpoints with a progress bar:
requests
tqdm


================================================
FILE: yolo26/.clang-format
================================================
# Google C/C++ Code Style settings (with 4-space)
# Referred to https://github.com/kehanXue/google-style-clang-format/blob/master/.clang-format

Language: Cpp
BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: None
AlignOperands: Align
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: Empty
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never  # To avoid conflict, set this "Never" and each "if statement" should include brace when coding
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BreakBeforeBraces: Custom
BraceWrapping:
  AfterCaseLabel: false
  AfterClass: false
  AfterStruct: false
  AfterControlStatement: Never
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeColon
BreakInheritanceList: BeforeColon
ColumnLimit: 120
CompactNamespaces: false
ContinuationIndentWidth: 8
Cpp11BracedListStyle: true
DerivePointerAlignment: false  # Make sure the * or & align on the left
EmptyLineBeforeAccessModifier: LogicalBlock
FixNamespaceComments: true
IncludeBlocks: Preserve
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
KeepEmptyLinesAtTheStartOfBlocks: true
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PointerAlignment: Left
ReflowComments: false
# SeparateDefinitionBlocks: Always   # Only support since clang-format 14
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeSquareBrackets: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInContainerLiterals: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: c++11
TabWidth: 8
UseTab: Never

================================================
FILE: yolo26/.gitignore
================================================
**/build/**
**/models/**
**/*.onnx
**/*.engine
**/*.pt


================================================
FILE: yolo26/CMakeLists.txt
================================================
# Build script for the yolo26 plugin library and the det/obb/cls executables.
cmake_minimum_required(VERSION 3.10)

project(yolo26)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

# The CUDA compiler path is hard-coded; adapt it if nvcc lives elsewhere.
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message("embed_platform on")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
  message("embed_platform off")

  # cuda
  include_directories(/usr/local/cuda/include)
  link_directories(/usr/local/cuda/lib64)

  # tensorrt
  include_directories(/workspace/shared/TensorRT-8.6.3/include)
  link_directories(/workspace/shared/TensorRT-8.6.3/lib)
endif()

# Shared library containing the custom layer plugin.
add_library(yololayerplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(yololayerplugins nvinfer cudart)

# Every executable below links ${OpenCV_LIBS}, so fail at configure time with a
# clear message if OpenCV is missing instead of failing later during the build.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

# Sources under src/ are compiled into each of the three executables.
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)

add_executable(yolo26_det ${PROJECT_SOURCE_DIR}/yolo26_det.cpp ${SRCS})
target_link_libraries(yolo26_det nvinfer)
target_link_libraries(yolo26_det cudart)
target_link_libraries(yolo26_det yololayerplugins)
target_link_libraries(yolo26_det ${OpenCV_LIBS})

add_executable(yolo26_obb ${PROJECT_SOURCE_DIR}/yolo26_obb.cpp ${SRCS})
target_link_libraries(yolo26_obb nvinfer)
target_link_libraries(yolo26_obb cudart)
target_link_libraries(yolo26_obb yololayerplugins)
target_link_libraries(yolo26_obb ${OpenCV_LIBS})

add_executable(yolo26_cls ${PROJECT_SOURCE_DIR}/yolo26_cls.cpp ${SRCS})
target_link_libraries(yolo26_cls nvinfer)
target_link_libraries(yolo26_cls cudart)
target_link_libraries(yolo26_cls yololayerplugins)
target_link_libraries(yolo26_cls ${OpenCV_LIBS})

================================================
FILE: yolo26/README.md
================================================
## Introduction

Yolo26 model supports TensorRT-8.

Training code [link](https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.4.0.zip)

## Environment

* cuda 12.4
* cudnn 9.1.0.70
* tensorrt 8.6.3
* opencv 4.8.0
* ultralytics 8.4.0

## Support

* [✅] Yolo26n-det, Yolo26s-det, Yolo26m-det, Yolo26l-det, Yolo26x-det, support FP32/FP16 and C++ API
* [✅] Yolo26n-obb, Yolo26s-obb, Yolo26m-obb, Yolo26l-obb, Yolo26x-obb, support FP32/FP16 and C++ API
* [✅] Yolo26n-cls, Yolo26s-cls, Yolo26m-cls, Yolo26l-cls, Yolo26x-cls, support FP32/FP16 and C++ API

## COMING FEATURES
* [⏳] Windows OS Support
* [⏳] Support Batched Inputs
* [⏳] Support Quantization
* [⏳] Yolo26-cls models
* [⏳] Yolo26-pose models
* [⏳] Yolo26-seg models

## Config

* Choose the YOLO26 sub-model n/s/m/l/x from command line arguments.
* Other configs please check [include/config.h](include/config.h)

## Build and Run

1. generate .wts from pytorch with .pt, or download .wts from model zoo

```shell
# Download ultralytics
wget https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.4.4.zip -O ultralytics-8.4.4.zip
# Unzip ultralytics
unzip ultralytics-8.4.4.zip
cd ultralytics-8.4.4
# Download models For Detection
wget https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26n.pt -O yolo26n.pt # to download other models, replace 'yolo26n.pt' with 'yolo26s.pt', 'yolo26m.pt', 'yolo26l.pt' or 'yolo26x.pt'
# Generate .wts
cp [PATH-TO-MAIN-FOLDER]/gen_wts.py .
python gen_wts.py -w yolo26n.pt -o yolo26n.wts -t detect
# A file 'yolo26n.wts' will be generated.

# Download models for Obb
wget https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26n-obb.pt -O yolo26n-obb.pt # to download other models, replace 'yolo26n-obb.pt' with 'yolo26s-obb.pt', 'yolo26m-obb.pt', 'yolo26l-obb.pt' or 'yolo26x-obb.pt'
# Generate .wts
cp [PATH-TO-MAIN-FOLDER]/gen_wts.py .
python gen_wts.py -w yolo26n-obb.pt -o yolo26n-obb.wts -t obb
# A file 'yolo26n-obb.wts' will be generated.

# Download models for Cls
wget https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26n-cls.pt -O yolo26n-cls.pt # to download other models, replace 'yolo26n-cls.pt' with 'yolo26s-cls.pt', 'yolo26m-cls.pt', 'yolo26l-cls.pt' or 'yolo26x-cls.pt'
# Generate .wts
cp [PATH-TO-MAIN-FOLDER]/gen_wts.py .
python gen_wts.py -w yolo26n-cls.pt -o yolo26n-cls.wts -t cls
# A file 'yolo26n-cls.wts' will be generated.

```

2. build and run
```shell
cd [PATH-TO-MAIN-FOLDER]
mkdir build
cd build
cmake ..
make
```

### Detection
```shell
cp [PATH-TO-ultralytics]/yolo26n.wts .
# Build and serialize TensorRT engine
./yolo26_det -s yolo26n.wts yolo26n.engine [n/s/m/l/x]
# Run inference
./yolo26_det -d yolo26n.engine ../images
# results saved in build directory
```

### Obb
```shell
cp [PATH-TO-ultralytics]/yolo26n-obb.wts .
# Build and serialize TensorRT engine
./yolo26_obb -s yolo26n-obb.wts yolo26n-obb.engine [n/s/m/l/x]
# Run inference
./yolo26_obb -d yolo26n-obb.engine ../images
# results saved in build directory
```

### Cls
```shell
# Generate the classification labels text file in the build folder, or download it:
# wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt

cp [PATH-TO-ultralytics]/yolo26n-cls.wts .
# Build and serialize TensorRT engine
./yolo26_cls -s yolo26n-cls.wts yolo26n-cls.engine [n/s/m/l/x]
# Run inference
./yolo26_cls -d yolo26n-cls.engine ../images
# results saved in build directory
```

## More Information
See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)

================================================
FILE: yolo26/gen_wts.py
================================================
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse the command line of the .pt -> .wts converter.

    Returns:
        A ``(weights, output, type)`` tuple where ``weights`` is the input
        ``.pt`` path, ``output`` is the resolved ``.wts`` path and ``type`` is
        one of ``detect``, ``cls``, ``seg``, ``pose`` or ``obb``.

    Raises:
        SystemExit: if the weights path does not point to an existing file
            (argparse itself also exits on malformed arguments).
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', required=True,
                     help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output',
                     help='Output (.wts) file path (optional)')
    cli.add_argument('-t', '--type', type=str, default='detect',
                     choices=['detect', 'cls', 'seg', 'pose', 'obb'],
                     help='determines the model is detection/classification')
    opts = cli.parse_args()

    weights_path = opts.weights
    if not os.path.isfile(weights_path):
        raise SystemExit('Invalid input file')

    output_path = opts.output
    if not output_path:
        # No output given: write next to the weights, swapping the extension.
        output_path = os.path.splitext(weights_path)[0] + '.wts'
    elif os.path.isdir(output_path):
        # Output is a directory: reuse the weights file's base name inside it.
        stem = os.path.splitext(os.path.basename(weights_path))[0]
        output_path = os.path.join(output_path, stem + '.wts')
    return weights_path, output_path, opts.type


pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')

# Everything is done on the CPU; no GPU is needed to dump weights.
device = 'cpu'

# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects from the
# checkpoint, so only run this on .pt files obtained from a trusted source.
model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float()  # load to FP32

if m_type in ['detect', 'seg', 'pose', 'obb']:
    # The head's `anchors` attribute is dropped before export. The previous
    # code also multiplied it by the strides into an `anchor_grid` local that
    # was never read; that dead computation has been removed.
    delattr(model.model[-1], 'anchors')

model.to(device).eval()

# Build the state dict once and reuse it for both the header and the body.
state = model.state_dict()

# .wts layout: first line is the tensor count, then one line per tensor:
#   <name> <element count> <hex of each big-endian float32, space separated>
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(state)))
    for k, v in state.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')


================================================
FILE: yolo26/include/block.h
================================================
#pragma once

#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"

// NOTE(review): a using-directive in a header leaks `std` into every file that includes it;
// every declaration below already spells out `std::`, so this may be removable — confirm that
// no translation unit including this header relies on it before dropping it.
using namespace std;

// Loads a name -> nvinfer1::Weights map from `file`.
// NOTE(review): presumably `file` is a .wts file produced by gen_wts.py — the definition is not
// visible here, confirm against block.cpp.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

// The builders below each append layers to `network`, take their parameters from `weightMap`
// using `lname` as the layer-name key/prefix (presumably — definitions are not visible here),
// consume `input`, and return the last layer they added.
//
// NOTE(review): some of these take `weightMap` by value (copying the whole map on every call)
// while C3K2 and C2PSA take it by reference. Changing the signatures requires changing the
// definitions in lockstep, so it is only flagged here.

// Adds a scale layer; by its name presumably a folded BatchNorm2d using epsilon `eps` — TODO confirm.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps);

// By its name presumably Conv + BatchNorm + SiLU with `ch` output channels, kernel `k`, stride `s`
// and `g` groups (default 1) — TODO confirm against the definition.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname, int g = 1);

// C3K2 block builder. Parameter meanings (c1/c2 channel counts, n repeats, e expansion ratio,
// the c3k/shortcut/atnn switches) are inferred from the names only — TODO confirm.
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int n, bool c3k, bool shortcut, bool atnn, float e, std::string lname);

// SPPF block builder; `k` is presumably the pooling kernel size — TODO confirm.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, bool shortcut, std::string lname);

// C2PSA block builder; parameter meanings inferred from names only — TODO confirm.
nvinfer1::IElementWiseLayer* C2PSA(nvinfer1::INetworkDefinition* network,
                                   std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
                                   int c1, int c2, int n, float e, std::string lname);

// By its name presumably a depthwise convolution block — TODO confirm.
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname);

// Convolution block builder with `g` groups (default 1); `act` (default true) presumably toggles
// the trailing activation — TODO confirm.
nvinfer1::ILayer* conv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                       nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname, int g = 1,
                       bool act = true);

// Adds the YOLO plugin layer to `network` on top of `input`. The four booleans select the task
// variant; how `strides`, `fm_sizes`, `stridesLength` and `anchorCount` are consumed is defined
// by the implementation, which is not visible here.
nvinfer1::IPluginV2Layer* addYoloLayer(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input,
                                       const std::vector<int>& strides, const std::vector<int>& fm_sizes,
                                       int stridesLength, bool is_detection, bool is_segmentation, bool is_pose,
                                       bool is_obb, int anchorCount);

================================================
FILE: yolo26/include/config.h
================================================
// Compile-time configuration shared by the yolo26 detection / OBB / classification programs.
//
// The include guard makes the header safe to include more than once in a translation unit;
// without it a second direct include would redefine every constant below.
#ifndef TRTX_YOLO26_CONFIG_H_
#define TRTX_YOLO26_CONFIG_H_

// Precision selector: exactly one of the three macros is expected to be defined.
#define USE_FP16
// #define USE_FP32
// #define USE_INT8

// Names of the engine's input / output / proto tensors.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";
const static char* kProtoTensorName = "proto";
// Number of classes of the detection model.
const static int kNumClass = 80;
const static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;  // number of keypoints total
// obb model's number of classes
constexpr static int kObbNumClass = 15;
const static int kObbNe = 1;  // number of extra parameters
const static int kBatchSize = 1;
const static int kGpuId = 0;
// Network input resolution for detection.
const static int kInputH = 640;
const static int kInputW = 640;
// Network input resolution for the obb model.
const static int kObbInputH = 1024;
const static int kObbInputW = 1024;
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.3f;
const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
// Largest source image (in pixels) the preprocessing buffers are sized for.
const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 300;
// Quantization input image folder path
const static char* kInputQuantizationFolder = "./coco_calib";

// Classification model's number of classes
constexpr static int kClsNumClass = 1000;
// Classification model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

#endif  // TRTX_YOLO26_CONFIG_H_

================================================
FILE: yolo26/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

// The macro below expands to std::cerr and assert(), so pull them in here instead of relying on
// whatever the including file happened to include first.
#include <cassert>
#include <iostream>

// CUDA_CHECK(call): evaluates `call` (an expression yielding cudaError_t) exactly once. On any
// result other than cudaSuccess it prints the numeric code, the runtime's description of it and
// the source location to std::cerr, then trips assert(0).
//
// NOTE: assert() compiles to nothing when NDEBUG is defined, so in such builds the error is
// reported but execution continues.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)                                                                       \
    {                                                                                             \
        cudaError_t error_code = callstr;                                                         \
        if (error_code != cudaSuccess) {                                                          \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code)    \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                     \
            assert(0);                                                                            \
        }                                                                                         \
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_

================================================
FILE: yolo26/include/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//! Stream buffer that collects formatted text and, on flush, forwards it to a target stream
//! prefixed with a timestamp and a severity prefix.
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    Target stream that receives the finished log lines.
    //! \param prefix    Text inserted between the timestamp and the message.
    //! \param shouldLog When false, putOutput() emits nothing.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Carries over the target stream, prefix and logging flag of \p other.
    //! (Previously only the stream was copied, leaving mShouldLog uninitialized and mPrefix empty.)
    //! As before, any text still buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync() {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the target stream, clears the
    //! buffer and flushes the target. Does nothing (and keeps the buffered text) when logging is
    //! disabled.
    void putOutput() {
        if (mShouldLog) {
            // Format the timestamp in a scratch stream so that setw/setfill do not leak into the
            // target stream's formatting state, then send it to the SAME stream as the message.
            // (It used to go to std::cout unconditionally, which split error lines between
            // stdout and stderr.)
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                std::ostringstream stamp;
                stamp << "[";
                stamp << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                stamp << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                stamp << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
                mOutput << stamp.str();
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // target stream; not owned
    std::string mPrefix;    // severity prefix placed before each message
    bool mShouldLog;        // gate checked by putOutput()
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
   public:
    // Forwards all three arguments to the LogStreamConsumerBuffer member so that the buffer is
    // fully constructed before any later base class of the deriving type is initialized.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog) {}

   protected:
    // Buffer handed to std::ostream by the deriving LogStreamConsumer.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!  A message is emitted when `severity <= reportableSeverity`.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Move constructor: builds a fresh buffer from \p other's severity and logging flag.
    //!  The buffer is re-created rather than moved, so text still pending in \p other is not
    //!  carried over.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer should log and propagates the result to the buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! \brief Selects the target stream: std::cout when `severity >= kINFO`, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter prefix; asserts on an unknown value.
    static std::string severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    bool mShouldLog;     // cached result of `mSeverity <= reportableSeverity`
    Severity mSeverity;  // severity this consumer was created for
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Creates a logger whose reportable severity defaults to kWARNING.
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! A temporary LogStreamConsumer is created per call; it decides from mReportableSeverity
    //! whether the "[TRT] "-tagged message is emitted.
    //! NOTE(review): \p msg is passed to std::string unchecked, so a null pointer is not handled.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!        (The filter applied by LogStreamConsumer is `messageSeverity <= severity`.)
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // Only Logger (a friend) can create atoms; see Logger::defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;         // set to true by Logger::reportTestStart()
        std::string mName;     // test name printed in result lines
        std::string mCmdline;  // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    //! Note that the RUNNING line is printed before the precondition is asserted.
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED for \p testAtom and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED for \p testAtom and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED for \p testAtom and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() or reportFail() depending on \p pass.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    //! NOTE(review): not referenced by any other member of this class as shown here; the
    //! prefixing used by log() comes from LogStreamConsumer's own severityPrefix().
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints "&&&& <RESULT> <name> # <cmdline>" to the stream selected for kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  // threshold handed to every LogStreamConsumer created by log()
};

namespace {

//!
//! \brief builds a LogStreamConsumer for the given message severity, filtered by the logger's
//!        current reportable severity (shared by the LOG_* helpers below)
//!
inline LogStreamConsumer makeLogStreamConsumer(const Logger& logger, Severity messageSeverity) {
    return LogStreamConsumer(logger.getReportableSeverity(), messageSeverity);
}

//!
//! \brief stream for messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return makeLogStreamConsumer(logger, Severity::kVERBOSE);
}

//!
//! \brief stream for messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return makeLogStreamConsumer(logger, Severity::kINFO);
}

//!
//! \brief stream for messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return makeLogStreamConsumer(logger, Severity::kWARNING);
}

//!
//! \brief stream for messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return makeLogStreamConsumer(logger, Severity::kERROR);
}

//!
//! \brief stream for messages of severity kINTERNAL_ERROR ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return makeLogStreamConsumer(logger, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H

================================================
FILE: yolo26/include/macros.h
================================================
// Portability macros: symbol export/import decoration (API) and TensorRT-version-dependent
// qualifiers (TRT_NOEXCEPT, TRT_CONST_ENQUEUE).
//
// NOTE(review): identifiers containing a double underscore are reserved for the implementation
// in C++; consider renaming this guard (e.g. TRTX_MACROS_H_) once it is confirmed that nothing
// else tests `__MACROS_H`.
#ifndef __MACROS_H
#define __MACROS_H

// Provides NV_TENSORRT_MAJOR, which is tested below.
#include "NvInfer.h"

// API: expands to an export decoration when the including project defines API_EXPORTS,
// otherwise to an import decoration on MSVC and to nothing elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// From TensorRT 8 on these expand to `noexcept` / `const`; for older versions they expand to
// nothing, so the same source can be built against either.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H

================================================
FILE: yolo26/include/model.h
================================================
#pragma once

#include <assert.h>
#include <string>
#include "NvInfer.h"

// Engine builders for the three yolo26 task variants. All three share one signature:
//   builder, config  - TensorRT builder objects supplied by the caller
//   dt               - data type passed to the network construction
//   wts_path         - path of the weights file to load
//   gd, gw           - presumably depth / width multipliers of the model size (see parse_args
//                      in utils.h, which fills them per n/s/m/l/x) — TODO confirm in model.cpp
//   max_channels     - channel cap associated with the model size (also set by parse_args)
//   type             - model size letter ("n", "s", "m", "l" or "x")
// Each returns an nvinfer1::IHostMemory*; presumably the serialized engine — TODO confirm
// ownership and contents against the definitions, which are not visible here.

// Detection model.
nvinfer1::IHostMemory* buildEngineYolo26Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type);

// Oriented-bounding-box (obb) model.
nvinfer1::IHostMemory* buildEngineYolo26Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type);

// Classification model.
nvinfer1::IHostMemory* buildEngineYolo26Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type);


================================================
FILE: yolo26/include/postprocess.h
================================================
#pragma once

// Standard containers used in the signatures below; included explicitly so this header does not
// depend on what the OpenCV umbrella header happens to pull in.
#include <string>
#include <unordered_map>
#include <vector>

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Preprocessing functions
// Returns a cv::Rect for `bbox` relative to `img` (see the definition for the exact mapping).
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// NMS functions
// Decodes one image's raw `output` buffer into `res`.
void decode(std::vector<Detection>& res, float* output);

// Batched form of decode(): `output` holds `batch_size` consecutive results of `output_size` floats.
void batch_decode(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size);

// Same as decode(), for the obb model's output.
void decode_obb(std::vector<Detection>& res, float* output);

// Same as batch_decode(), for the obb model's output.
void batch_decode_obb(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size);

// Drawing functions
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map);

================================================
FILE: yolo26/include/preprocess.h
================================================
#pragma once

// uint8_t and std::vector appear in the signatures below; include them explicitly instead of
// relying on transitive includes.
#include <stdint.h>
#include <map>
#include <vector>

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Sets up the preprocessing resources for source images of up to `max_image_size`
// (see kMaxInputImageSize in config.h). Pair with cuda_preprocess_destroy().
void cuda_preprocess_init(int max_image_size);

// Releases what cuda_preprocess_init() set up.
void cuda_preprocess_destroy();

// Preprocesses one `src_width` x `src_height` image from `src` into `dst`, sized
// `dst_width` x `dst_height`, issuing the work on `stream`.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);

// Preprocesses every image of `img_batch` into `dst` on `stream`.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);

================================================
FILE: yolo26/include/types.h
================================================
#pragma once
#include "config.h"

// One detection record. Every member is a float and the struct is aligned as a float, so a
// record can be viewed as a flat run of floats.
struct alignas(float) Detection {
    // center_x center_y w h
    float bbox[4];
    float conf;  // bbox_conf * cls_conf
    float class_id;  // class index stored as a float
    float mask[32];  // presumably segmentation mask coefficients — TODO confirm against the decoder
    float keypoints[kNumberOfPoints * 3];  // 17*3 keypoints
    float angle;                           // obb angle
};

// Six floats; by its name presumably a 2x3 affine transform — TODO confirm against the user.
struct AffineMatrix {
    float value[6];
};

// Evaluates to 7 (six floats of AffineMatrix plus one), matching the seven fields listed in the
// trailing comment.
// NOTE(review): the count is derived from AffineMatrix's size rather than from the listed
// fields — confirm this coupling is intended.
const int bbox_element =
        sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag

================================================
FILE: yolo26/include/utils.h
================================================
#pragma once
#include <dirent.h>
#include <fstream>
#include <opencv2/opencv.hpp>

// Letterboxes `img` into an `input_w` x `input_h` BGR canvas: the image is resized with its
// aspect ratio preserved, centred, and the remaining border is filled with grey (128,128,128).
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);
    // The smaller scale wins; when it is the horizontal one the image spans the full width.
    const bool width_limited = scale_h > scale_w;

    const int w = width_limited ? input_w : static_cast<int>(scale_h * img.cols);
    const int h = width_limited ? static_cast<int>(scale_w * img.rows) : input_h;
    const int x = width_limited ? 0 : (input_w - w) / 2;
    const int y = width_limited ? (input_h - h) / 2 : 0;

    cv::Mat resized(h, w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    resized.copyTo(canvas(cv::Rect(x, y, resized.cols, resized.rows)));
    return canvas;
}

// Appends the name of every entry of directory `p_dir_name` (except "." and "..") to
// `file_names`. Names are bare entry names, not prefixed with the directory path.
// Returns 0 on success, -1 if the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir = opendir(p_dir_name);
    if (!dir) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* name = entry->d_name;
        const bool is_dot_entry = strcmp(name, ".") == 0 || strcmp(name, "..") == 0;
        if (is_dot_entry) {
            continue;
        }
        file_names.push_back(std::string(name));
    }

    closedir(dir);
    return 0;
}

// Reads `file_name` line by line and returns the lines in order (one class name per line,
// lines are stored unmodified). If the file cannot be opened an error is printed to std::cerr
// and assert(0) is hit.
inline std::vector<std::string> read_classes(std::string file_name) {
    std::ifstream input(file_name, std::ios::in);
    if (!input.is_open()) {
        std::cerr << file_name << " is not found, pls refer to README and download it." << std::endl;
        assert(0);
    }

    std::vector<std::string> classes;
    for (std::string line; std::getline(input, line);) {
        classes.push_back(line);
    }
    input.close();
    return classes;
}

// Trims leading AND trailing whitespace from a string (the name is historical).
//
// Whitespace means space, tab, carriage return, newline, form feed and vertical tab, so lines
// read with std::getline from a CRLF file lose their trailing '\r'. A string that is empty or
// consists only of whitespace yields an empty string. (Previously only ' ' was stripped and an
// all-blank string was returned unchanged.)
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char* const kWhitespace = " \t\r\n\f\v";
    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}

// Src: https://stackoverflow.com/questions/16605967
// Formats `a_value` in fixed-point notation with exactly `n` digits after the decimal point.
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatter;
    // Equivalent to streaming std::fixed: select fixed notation in the float field.
    formatter.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatter.precision(n);
    formatter << a_value;
    return formatter.str();
}

// Reads one label per line from `labels_filename` into `labels_map`, keyed by the zero-based
// line number. Each line is passed through trim_leading_whitespace() before being stored.
//
// Returns 0 on success and -1 if the file cannot be opened, in which case `labels_map` is left
// untouched. (Previously a missing file was silently reported as success.)
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        return -1;
    }

    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        // Store the stripped line, using the running line index as the key.
        labels_map[index] = trim_leading_whitespace(line);
        index++;
    }
    // The stream closes itself when it goes out of scope.
    return 0;
}

// Parses the command line of the yolo26 programs.
//
//   <prog> -s <wts> <engine> <n|s|m|l|x>   serialize: fills wts, engine, type, gd, gw, max_channels
//   <prog> -d <engine> <image dir>         inference: fills engine, img_dir
//
// Only the first character of the size argument is inspected. Returns true when the arguments
// match one of the two forms, false otherwise. In serialize mode `wts` and `engine` are assigned
// before the size is validated, so they are set even when false is returned for an unknown size.
static inline bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir,
                              std::string& type, float& gd, float& gw, int& max_channels) {
    if (argc < 4) {
        return false;
    }
    const std::string mode(argv[1]);

    if (mode == "-s" && argc == 5) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string sub_type(argv[4]);

        // Scaling presets per model size letter.
        struct Preset {
            char key;
            float depth;
            float width;
            int channels;
        };
        static const Preset kPresets[] = {
                {'n', 0.50f, 0.25f, 1024}, {'s', 0.50f, 0.50f, 1024}, {'m', 0.50f, 1.00f, 512},
                {'l', 1.0f, 1.0f, 512},    {'x', 1.0f, 1.50f, 512},
        };

        const char key = sub_type[0];
        for (const Preset& preset : kPresets) {
            if (preset.key == key) {
                gd = preset.depth;
                gw = preset.width;
                max_channels = preset.channels;
                type = std::string(1, preset.key);
                return true;
            }
        }
        return false;
    }

    if (mode == "-d" && argc == 4) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        return true;
    }
    return false;
}

================================================
FILE: yolo26/plugin/yololayer.cu
================================================
#include <assert.h>
#include <math.h>
#include <string.h>
#include <iostream>
#include <vector>
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"

// Device-side confidence threshold compared against each anchor's best class score in gatherKernel.
// Starts at 0.4 and is overwritten from the host by setPluginDeviceParams().
__device__ float d_confThreshold = 0.4f;

// Minimal byte-stream helpers used by the plugin's serialize / deserialize code.
namespace Tn {
// Copies the object representation of `val` to `buffer` and advances `buffer` past it.
// memcpy is used instead of a typed store so `buffer` does not have to be aligned for T.
template <typename T>
void write(char*& buffer, const T& val) {
    memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

// Fills `val` from the next sizeof(T) bytes at `buffer` and advances `buffer` past them.
// memcpy is used instead of a typed load so `buffer` does not have to be aligned for T.
template <typename T>
void read(const char*& buffer, T& val) {
    memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}
}  // namespace Tn

// Logistic function 1 / (1 + e^-x), callable from device code.
__device__ float sigmoid(float x) {
    const auto decay = exp(-x);
    return 1.0f / (1.0f + decay);
}

namespace nvinfer1 {

// Copies `confThreshold` from the host into the device symbol d_confThreshold.
// A failed copy is reported on stderr rather than silently ignored; the device value is then
// whatever it was before the call.
void setPluginDeviceParams(float confThreshold) {
    const cudaError_t status = cudaMemcpyToSymbol(d_confThreshold, &confThreshold, sizeof(float));
    if (status != cudaSuccess) {
        std::cerr << "setPluginDeviceParams: cudaMemcpyToSymbol failed: " << cudaGetErrorString(status) << std::endl;
    }
}

// Builds a plugin from explicit configuration values. The launch block size (mThreadCount) is
// fixed at 256; every other member is copied from the matching argument.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberOfPoints, int maxDetections, bool isDetection,
                                 bool isSegmentation, bool isPose, bool isObb, int anchor_count)
    : mThreadCount(256),
      mClassCount(classCount),
      mNumberOfPoints(numberOfPoints),
      mMaxDetections(maxDetections),
      mIsDetection(isDetection),
      mIsSegmentation(isSegmentation),
      mIsPose(isPose),
      mIsObb(isObb),
      mAnchorCount(anchor_count) {}

// Nothing to release: the plugin holds no resources of its own.
YoloLayerPlugin::~YoloLayerPlugin() = default;

// Rebuilds a plugin from the byte stream produced by serialize().
// Fields are consumed in exactly the order serialize() writes them.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    const char* const begin = reinterpret_cast<const char*>(data);
    const char* cursor = begin;

    Tn::read(cursor, mClassCount);
    Tn::read(cursor, mNumberOfPoints);
    Tn::read(cursor, mThreadCount);
    Tn::read(cursor, mMaxDetections);
    Tn::read(cursor, mIsDetection);
    Tn::read(cursor, mIsSegmentation);
    Tn::read(cursor, mIsPose);
    Tn::read(cursor, mIsObb);
    Tn::read(cursor, mAnchorCount);

    // Debug-only check that the whole buffer, and nothing more, was consumed.
    assert(cursor == begin + length);
}

// Writes the plugin configuration to `buffer` as raw field bytes, back to back.
// The field order here must match the deserializing constructor and getSerializationSize().
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    char* const begin = static_cast<char*>(buffer);
    char* cursor = begin;

    Tn::write(cursor, mClassCount);
    Tn::write(cursor, mNumberOfPoints);
    Tn::write(cursor, mThreadCount);
    Tn::write(cursor, mMaxDetections);
    Tn::write(cursor, mIsDetection);
    Tn::write(cursor, mIsSegmentation);
    Tn::write(cursor, mIsPose);
    Tn::write(cursor, mIsObb);
    Tn::write(cursor, mAnchorCount);

    // Debug-only check that the byte count agrees with the advertised size.
    assert(cursor == begin + getSerializationSize());
}

// Number of bytes serialize() writes: one term per serialized field, in the same order.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    size_t bytes = 0;
    bytes += sizeof(mClassCount);
    bytes += sizeof(mNumberOfPoints);
    bytes += sizeof(mThreadCount);
    bytes += sizeof(mMaxDetections);
    bytes += sizeof(mIsDetection);
    bytes += sizeof(mIsSegmentation);
    bytes += sizeof(mIsPose);
    bytes += sizeof(mIsObb);
    bytes += sizeof(mAnchorCount);
    return bytes;
}

// No setup is performed; always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Output shape is (N + 1, 1, 1) floats, where N is the number of floats needed to hold
// mMaxDetections Detection records and the extra leading float holds the detection counter.
// The input dimensions are not consulted.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    const int detection_floats = mMaxDetections * sizeof(Detection) / sizeof(float);
    const int with_counter = detection_floats + 1;
    return nvinfer1::Dims3(with_counter, 1, 1);
}

// Stores the caller's pointer as-is (the string is not copied), so the pointed-to characters
// must stay valid for as long as this plugin may return them from getPluginNamespace().
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Returns the pointer last given to setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The output is always reported as kFLOAT, regardless of the input types.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                                      int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// Always false: the output is never reported as broadcast across the batch.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {
    return false;
}

// Always false: no input is accepted as a batch-broadcast tensor.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
    return false;
}

// Intentionally empty: the tensor descriptions are ignored and no state is derived from them.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int32_t nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT {}

// Intentionally empty: the cuDNN / cuBLAS handles and the allocator are not used by this plugin.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}

// Intentionally empty: attachToContext() keeps nothing that would need releasing.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Plugin type string; it is the same literal that YoloLayerPluginCreator::getPluginName() returns.
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Plugin version string; it is the same literal that YoloLayerPluginCreator::getPluginVersion() returns.
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Deletes this object; the plugin must therefore have been allocated with `new`
// (as clone(), createPlugin() and deserializePlugin() do) and must not be used afterwards.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Returns a heap-allocated copy carrying the same configuration and namespace pointer.
// The copy is built through the value constructor, so its thread count is that constructor's 256.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    auto* copy = new YoloLayerPlugin(mClassCount, mNumberOfPoints, mMaxDetections, mIsDetection, mIsSegmentation,
                                     mIsPose, mIsObb, mAnchorCount);
    copy->setPluginNamespace(mPluginNamespace);
    return copy;
}

// Queues the gather kernel on `stream`, reading from the input pointer array and writing to the
// first output. The workspace is unused and the return value is always 0.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace,
                             cudaStream_t stream) TRT_NOEXCEPT {
    const float* const* typed_inputs = reinterpret_cast<const float* const*>(inputs);
    float* typed_output = reinterpret_cast<float*>(outputs[0]);

    gatherKernelLauncher(typed_inputs, typed_output, stream, batchSize);
    return 0;
}

// Single-precision logistic function 1 / (1 + e^-data), callable from device code.
__device__ float Logist(float data) {
    const float decay = expf(-data);
    return 1.f / (1.f + decay);
}

// Filters per-anchor predictions by confidence and appends the survivors to `output`.
//
// One thread handles one anchor. `input` is read as `num_elements` consecutive records:
//   detection: [xmin, ymin, xmax, ymax, class scores...]          (4 + class_count floats)
//   obb:       [xmin, ymin, xmax, ymax, class scores..., angle]   (5 + class_count floats)
// When both flags are set the detection layout is used.
//
// output[0] is a float counter bumped atomically by every anchor whose best class score reaches
// d_confThreshold; Detection records follow from output[1]. Anchors that arrive after
// `max_out_object` slots are taken are dropped, but they have already bumped the counter, so a
// reader must clamp it to max_out_object. Coordinates are copied through unchanged; no
// scale/offset correction is applied here.
//
// nk, is_segmentation and is_pose are accepted but not used yet. If neither is_detection nor
// is_obb is set the kernel writes nothing: the record layout would be unknown, and the earlier
// code indexed `input` with a stride of -1 in that case.
__global__ void gatherKernel(const float* input, float* output, int num_elements, int max_out_object, int class_count,
                             int nk, int output_elem, bool is_detection, bool is_segmentation, bool is_pose,
                             bool is_obb) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_elements)
        return;

    int outputIdx = 0 * output_elem;  // TODO: ADD BATCH SUPPORT HERE

    // Number of floats in one anchor record.
    int anchor_size;
    if (is_detection) {
        anchor_size = 4 + class_count;
    } else if (is_obb) {
        anchor_size = 5 + class_count;
    } else {
        // Unsupported head type: bail out instead of reading before the start of `input`.
        return;
    }

    const float* anchor = input + idx * anchor_size;

    // The angle is only read in OBB-only mode; with the detection layout there is no angle slot.
    float angle = 0.0f;
    if (!is_detection && is_obb) {
        angle = anchor[4 + class_count];
    }

    float xmin = anchor[0];
    float ymin = anchor[1];
    float xmax = anchor[2];
    float ymax = anchor[3];

    // Best class by raw score; class_id stays -1 when no score is strictly positive.
    float score = 0.0f;
    int class_id = -1;
    for (int c = 0; c < class_count; c++) {
        float conf = anchor[4 + c];
        if (conf > score) {
            score = conf;
            class_id = c;
        }
    }

    if (score < d_confThreshold) {
        return;
    }

    // Reserve an output slot; the value returned is the counter before this increment.
    int count = (int)atomicAdd(output + outputIdx, 1.0f);
    if (count >= max_out_object) {
        return;
    }

    int det_size = sizeof(Detection) / sizeof(float);
    Detection* det = (Detection*)(output + outputIdx + 1 + count * det_size);

    det->conf = score;
    det->class_id = class_id;
    det->bbox[0] = xmin;
    det->bbox[1] = ymin;
    det->bbox[2] = xmax;
    det->bbox[3] = ymax;

    if (is_obb) {
        det->angle = angle;
    }

    // TODO: ADD KEYPOINTS, SEGMENTATION, OBB HERE
}

// Clears the output buffer and launches gatherKernel over mAnchorCount anchors on `stream`.
//
// Only inputs[0] is read and the kernel only fills the first batch item, although the clear
// covers `batchSize` output records.
// TODO: ADD BATCH SUPPORT, CURRENTLY ONLY BATCH=1 IS SUPPORTED
// TODO: ADD SEGMENTATION, POSE, OBB SUPPORT
// TODO: num_elem = batch_size * anchor_num
void YoloLayerPlugin::gatherKernelLauncher(const float* const* inputs, float* outputs, cudaStream_t stream,
                                           int batchSize) {
    const float* input = inputs[0];

    // One counter float followed by mMaxDetections Detection records.
    int outputElem = mMaxDetections * sizeof(Detection) / sizeof(float) + 1;
    int num_elem = mAnchorCount;  // Use anchor count from model configuration

    // Zero the counter (and stale records) before any thread appends to it.
    cudaMemsetAsync(outputs, 0, batchSize * outputElem * sizeof(float), stream);  // TODO: adjust for batch size

    if (num_elem <= 0) {
        // A grid with zero blocks is an invalid launch configuration; the cleared buffer already
        // reports zero detections, so there is nothing left to do.
        return;
    }

    dim3 blockSize(mThreadCount);
    dim3 gridSize((num_elem + mThreadCount - 1) / mThreadCount);

    gatherKernel<<<gridSize, blockSize, 0, stream>>>(input, outputs, num_elem, mMaxDetections, mClassCount,
                                                     mNumberOfPoints, outputElem, mIsDetection, mIsSegmentation,
                                                     mIsPose, mIsObb);
}

// Storage for the creator's static members: the field collection handed out by getFieldNames()
// and the vector that backs its `fields` pointer.
PluginFieldCollection YoloLayerPluginCreator::mFC{};
std::vector<PluginField> YoloLayerPluginCreator::mPluginAttributes;

// Resets the shared attribute list to empty and points mFC at it, so getFieldNames()
// advertises zero fields.
YoloLayerPluginCreator::YoloLayerPluginCreator() {
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

// Creator name; it is the same literal that YoloLayerPlugin::getPluginType() returns.
const char* YoloLayerPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Creator version; it is the same literal that YoloLayerPlugin::getPluginVersion() returns.
const char* YoloLayerPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Returns the address of the static field collection shared by all creator instances.
const PluginFieldCollection* YoloLayerPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Creates a YoloLayerPlugin from a single plugin field named "combinedInfo".
//
// The field's data is read as an int array with this layout:
//   [0] class count        [1] number of points   [2] max detections
//   [3] is detection       [4] is segmentation    [5] is pose
//   [6] is obb             [7] anchor count
// The four flags are treated as booleans (non-zero means true).
//
// The collection is validated at runtime (not only via assert, which disappears under NDEBUG):
// there must be exactly one field, it must be named "combinedInfo", and its `length` must cover
// the eight ints read here. On any violation a message is written to stderr and nullptr is
// returned. `name` is not used.
IPluginV2IOExt* YoloLayerPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    // Number of ints consumed from the combinedInfo array below.
    const int kCombinedInfoCount = 8;

    if (fc == nullptr || fc->fields == nullptr || fc->nbFields != 1) {
        std::cerr << "YoloLayer_TRT: createPlugin expects exactly one plugin field" << std::endl;
        return nullptr;
    }

    const PluginField& field = fc->fields[0];
    if (field.name == nullptr || strcmp(field.name, "combinedInfo") != 0) {
        std::cerr << "YoloLayer_TRT: createPlugin expects a field named combinedInfo" << std::endl;
        return nullptr;
    }

    const int net_info_count = field.length;
    if (field.data == nullptr || net_info_count < kCombinedInfoCount) {
        std::cerr << "YoloLayer_TRT: combinedInfo must hold at least " << kCombinedInfoCount << " ints, got "
                  << net_info_count << std::endl;
        return nullptr;
    }

    const int* combinedInfo = static_cast<const int*>(field.data);
    int class_count = combinedInfo[0];
    int number_of_points = combinedInfo[1];
    int max_detections = combinedInfo[2];
    bool is_detection = combinedInfo[3] != 0;
    bool is_segmentation = combinedInfo[4] != 0;
    bool is_pose = combinedInfo[5] != 0;
    bool is_obb = combinedInfo[6] != 0;
    int anchor_count = combinedInfo[7];

    YoloLayerPlugin* plugin = new YoloLayerPlugin(class_count, number_of_points, max_detections, is_detection,
                                                  is_segmentation, is_pose, is_obb, anchor_count);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}

// Rebuilds a plugin from serialized bytes and tags it with this creator's namespace.
// `name` is not used. The namespace pointer handed over refers to this creator's own string.
IPluginV2IOExt* YoloLayerPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                          size_t serialLength) TRT_NOEXCEPT {
    auto* restored = new YoloLayerPlugin(serialData, serialLength);
    restored->setPluginNamespace(mNamespace.c_str());
    return restored;
}

}  // namespace nvinfer1

================================================
FILE: yolo26/plugin/yololayer.h
================================================
#pragma once
#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"

namespace nvinfer1 {

// Sets the confidence threshold used on the device by the YOLO layer's gather kernel.
void setPluginDeviceParams(float confThreshold);

// TensorRT plugin that gathers above-threshold predictions into a fixed-size detection buffer.
// It has a single output and accepts only linear-layout kFLOAT tensors.
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    // Builds a plugin from explicit configuration values.
    YoloLayerPlugin(int classCount, int numberOfPoints, int maxDetections, bool isDetection, bool isSegmentation,
                    bool isPose, bool isObb, int anchor_count);
    // Rebuilds a plugin from `length` bytes previously produced by serialize().
    YoloLayerPlugin(const void* data, size_t length);

    ~YoloLayerPlugin();

    // The plugin always produces exactly one output tensor.
    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

    int initialize() TRT_NOEXCEPT override;

    // Nothing to tear down.
    virtual void terminate() TRT_NOEXCEPT override {}

    // No scratch workspace is requested.
    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

    virtual int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;

    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

    // Accepts a tensor position only when it is kFLOAT in kLINEAR format; the check looks at
    // inOut[pos] alone and ignores the other positions.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
    }

    const char* getPluginType() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    void destroy() TRT_NOEXCEPT override;

    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

    const char* getPluginNamespace() const TRT_NOEXCEPT override;

    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;

    void detachFromContext() TRT_NOEXCEPT override;

   private:
    // Launches the gather kernel on `stream`.
    void gatherKernelLauncher(const float* const* inputs, float* outputs, cudaStream_t stream, int batchSize);
    // CUDA block size used for the kernel launch.
    int mThreadCount = 256;
    // Non-owning pointer to the namespace string; defaults to an empty literal.
    const char* mPluginNamespace = "";
    // Number of class scores per anchor.
    int mClassCount;
    // Forwarded to the kernel as `nk`.
    int mNumberOfPoints;
    // Capacity of the output buffer, in Detection records.
    int mMaxDetections;
    // Head-type flags forwarded to the kernel.
    bool mIsDetection;
    bool mIsSegmentation;
    bool mIsPose;
    bool mIsObb;
    // Number of anchors processed per launch.
    int mAnchorCount;
};

// Factory for YoloLayerPlugin: builds instances from plugin fields or from serialized bytes.
class API YoloLayerPluginCreator : public IPluginCreator {
   public:
    YoloLayerPluginCreator();

    const char* getPluginName() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

    IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;

    IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                      size_t serialLength) TRT_NOEXCEPT override;

    // Copies the namespace into this creator's own std::string.
    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override { mNamespace = pluginNamespace; }

    // The returned pointer refers to mNamespace's buffer, so it is only valid while this creator
    // lives and until the namespace is changed again.
    const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

   private:
    std::string mNamespace;
    // Shared by all creator instances.
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};

// NOTE(review): TensorRT-provided macro; presumably registers YoloLayerPluginCreator with the
// global plugin registry during static initialization - confirm against the TensorRT headers in use.
REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator);
}  // namespace nvinfer1

================================================
FILE: yolo26/src/block.cpp
================================================
#include "block.h"
#include <assert.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include "config.h"
#include "model.h"
#include "yololayer.h"

// Loads a .wts weight file into a name -> nvinfer1::Weights map.
//
// File layout as parsed here: a decimal entry count, then for each entry a name, a decimal
// element count and that many hexadecimal 32-bit words. Every entry is tagged kFLOAT.
//
// The value buffers are malloc'd and not freed here; they are handed to the caller through
// Weights::values.
//
// A file that cannot be opened, a non-positive entry count, a truncated or malformed entry, or a
// failed allocation is reported on stderr and aborts the process. These checks are always active;
// they do not depend on assert, which disappears under NDEBUG and previously left `count` and
// `size` uninitialized on bad input.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> WeightMap;

    std::ifstream input(file);
    if (!input.is_open()) {
        std::cerr << "Unable to load weight file. please check if the .wts file path is right!!!!!!" << std::endl;
        abort();
    }

    int32_t count = 0;
    input >> count;
    if (!input || count <= 0) {
        std::cerr << "Invalid weight map file." << std::endl;
        abort();
    }

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        std::string name;
        if (!(input >> name >> std::dec >> size)) {
            std::cerr << "Invalid weight map file: truncated or malformed entry header." << std::endl;
            abort();
        }
        wt.type = nvinfer1::DataType::kFLOAT;

        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        if (val == nullptr && size > 0) {
            std::cerr << "Out of memory while loading weights for " << name << std::endl;
            abort();
        }

        // Values are stored as the hexadecimal bit patterns of the floats.
        for (uint32_t x = 0, y = size; x < y; x++) {
            input >> std::hex >> val[x];
        }
        if (!input) {
            std::cerr << "Invalid weight map file: truncated or malformed values for " << name << std::endl;
            abort();
        }

        wt.values = val;
        wt.count = size;
        WeightMap[name] = wt;
    }
    return WeightMap;
}

// Adds an inference-time batch norm as a per-channel scale layer:
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// Parameters are looked up under lname + ".weight" / ".bias" / ".running_mean" / ".running_var".
// The three coefficient buffers are malloc'd and not freed here.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // All three coefficient arrays are filled in one pass over the channels.
    for (int ch = 0; ch < len; ++ch) {
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};

    // weightMap is a by-value copy, so these entries are only visible inside this call.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    nvinfer1::IScaleLayer* bn_layer = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    bn_layer->setName(lname.c_str());
    return bn_layer;
}

// Conv (no bias) -> batch norm (eps 1e-3) -> SiLU, with SiLU built as x * sigmoid(x).
// Padding is k / 2 per kernel axis, the stride is s on both axes and g is the group count.
// Returns the final element-wise product layer.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname, int g) {
    const nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};

    auto* conv_layer = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]},
                                                 weightMap[lname + ".conv.weight"], no_bias);
    assert(conv_layer);
    conv_layer->setStrideNd(nvinfer1::DimsHW{s, s});
    // "Auto" padding: half the kernel extent on each axis.
    conv_layer->setPaddingNd(nvinfer1::DimsHW{k[0] / 2, k[1] / 2});
    conv_layer->setNbGroups(g);
    conv_layer->setName((lname + "/conv/Conv").c_str());

    auto* bn_layer = addBatchNorm2d(network, weightMap, *conv_layer->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor& normalized = *bn_layer->getOutput(0);

    auto* gate = network->addActivation(normalized, nvinfer1::ActivationType::kSIGMOID);
    gate->setName((lname + "/act/Sigmoid").c_str());

    auto* silu = network->addElementWise(normalized, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    silu->setName((lname + "/act/Mul").c_str());
    return silu;
}

// Conv (no bias) -> batch norm (eps 1e-3), optionally followed by SiLU built as x * sigmoid(x).
// With act == false the batch-norm layer is returned; otherwise the element-wise product layer.
nvinfer1::ILayer* conv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                       nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname, int g,
                       bool act) {
    const nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};

    auto* conv_layer = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]},
                                                 weightMap[lname + ".conv.weight"], no_bias);
    assert(conv_layer);
    conv_layer->setStrideNd(nvinfer1::DimsHW{s, s});
    // "Auto" padding: half the kernel extent on each axis.
    conv_layer->setPaddingNd(nvinfer1::DimsHW{k[0] / 2, k[1] / 2});
    conv_layer->setNbGroups(g);
    conv_layer->setName((lname + "/conv/Conv").c_str());

    auto* bn_layer = addBatchNorm2d(network, weightMap, *conv_layer->getOutput(0), lname + ".bn", 1e-3);
    if (!act) {
        return bn_layer;
    }

    nvinfer1::ITensor& normalized = *bn_layer->getOutput(0);
    auto* gate = network->addActivation(normalized, nvinfer1::ActivationType::kSIGMOID);
    gate->setName((lname + "/act/Sigmoid").c_str());

    auto* silu = network->addElementWise(normalized, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    silu->setName((lname + "/act/Mul").c_str());
    return silu;
}

// Two stacked conv-bn-SiLU layers (cv1 -> cv2) with an optional residual add.
// cv1 outputs int(c2 * e) channels; cv2 outputs c2 channels using g groups.
// The residual (input + cv2) is added only when `shortcut` is set and c1 == c2.
static nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int c1, int c2, bool shortcut, std::vector<int> k1, std::vector<int> k2, float e,
                                    std::string lname, int g = 1) {
    const int hidden = (int)((float)c2 * e);

    auto* first = convBnSiLU(network, weightMap, input, hidden, k1, 1, lname + ".cv1");
    auto* second = convBnSiLU(network, weightMap, *first->getOutput(0), c2, k2, 1, lname + ".cv2", g);

    const bool add_residual = shortcut && c1 == c2;
    if (!add_residual) {
        return second;
    }

    auto* residual = network->addElementWise(input, *second->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    residual->setName((lname + ".add").c_str());
    return residual;
}

// Square-kernel conv (no bias) followed by batch norm (eps 1e-3), with no activation.
// Padding is k / 2 and the stride is s on both axes; g is the group count.
// Returns the batch-norm layer.
static nvinfer1::ILayer* convBn(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int ch,
                                int k, int s, std::string lname, int g = 1) {
    const nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};

    auto* conv_layer =
            network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], no_bias);
    assert(conv_layer);
    conv_layer->setStrideNd(nvinfer1::DimsHW{s, s});
    const int pad = k / 2;
    conv_layer->setPaddingNd(nvinfer1::DimsHW{pad, pad});
    conv_layer->setNbGroups(g);

    return addBatchNorm2d(network, weightMap, *conv_layer->getOutput(0), lname + ".bn", 1e-3);
}

// Builds a multi-head self-attention sub-graph and returns its final projection layer.
//
// Steps, following the Python-style reference lines kept in the comments below:
//   1. A 1x1 conv+bn produces q, k and v stacked along the channel axis.
//   2. The result is reshaped to (B, num_heads, -1, N) with N = H * W and sliced into q, k, v.
//   3. attn = softmax((q^T @ k) * key_dim^-0.5) over the last axis.
//   4. The output is (v @ attn^T) reshaped to (B, -1, H, W), plus a depthwise 3x3 conv+bn
//      ("pe") of v, followed by a 1x1 conv+bn projection.
//
// NOTE(review): B, H and W are read from dimensions 0, 2 and 3 of `input`, so a 4-D tensor
// with static dimensions is assumed - confirm against callers.
static nvinfer1::ILayer* Attention(nvinfer1::INetworkDefinition* network,
                                   std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                   int dim, int num_heads, float attn_ratio, std::string lname) {
    int head_dim = dim / num_heads;
    // The float product is truncated to int here.
    int key_dim = head_dim * attn_ratio;
    float scale = pow(key_dim, -0.5);
    int nh_kd = key_dim * num_heads;
    // Total qkv channels: dim for v plus key_dim * num_heads each for q and k.
    int h = dim + nh_kd * 2;

    auto d = input.getDimensions();
    int B = d.d[0];
    int H = d.d[2];
    int W = d.d[3];
    int N = H * W;
    auto* qkv = convBn(network, weightMap, input, h, 1, 1, lname + ".qkv");
    // qkv.view(B, self.num_heads, -1, N)
    auto shuffle = network->addShuffle(*qkv->getOutput(0));
    shuffle->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, -1, N});
    // q, k, v = .split([self.key_dim, self.key_dim, self.head_dim], dim=2)
    auto d1 = shuffle->getOutput(0)->getDimensions();
    auto q = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0},
                               nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    auto k = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim, 0},
                               nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    auto v = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim * 2, 0},
                               nvinfer1::Dims4{d1.d[0], d1.d[1], head_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    // attn = ((q.transpose(-2, -1) @ k) * self.scale)
    auto qT = network->addShuffle(*q->getOutput(0));
    qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});
    auto matmul = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0),
                                             nvinfer1::MatrixOperation::kNONE);
    // Single-element scale / shift / power buffers for a uniform scale layer that multiplies by
    // `scale`. They are malloc'd and not freed in this function (small known leak).
    float* scale_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    scale_val[0] = scale;
    nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1};
    float* shift_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    shift_val[0] = 0;
    nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1};
    float* power_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    power_val[0] = 1;
    nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1};
    nvinfer1::IScaleLayer* scaleLayer =
            network->addScale(*matmul->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w);
    // attn = attn.softmax(dim=-1)
    nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*scaleLayer->getOutput(0));
    // Bit 3 of the axis mask selects the last of the four dimensions.
    softmax->setAxes(1 << 3);
    // x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) + self.pe(v.reshape(B, -1, H, W))
    auto attnT = network->addShuffle(*softmax->getOutput(0));
    attnT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});
    auto matmul2 = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attnT->getOutput(0),
                                              nvinfer1::MatrixOperation::kNONE);
    auto reshape = network->addShuffle(*matmul2->getOutput(0));
    reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W});
    auto v_reshape = network->addShuffle(*v->getOutput(0));
    v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W});
    // self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)
    auto pe = convBn(network, weightMap, *v_reshape->getOutput(0), dim, 3, 1, lname + ".pe", dim);
    auto sum = network->addElementWise(*reshape->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    // x = self.proj(x)
    // self.proj = Conv(dim, dim, 1, act=False)
    auto proj = convBn(network, weightMap, *sum->getOutput(0), dim, 1, 1, lname + ".proj");
    return proj;
}

// Attention followed by a two-layer 1x1 feed-forward network, each with an optional residual.
//   x = x + attn(x)   if shortcut else attn(x)
//   x = x + ffn(x)    if shortcut else ffn(x)
// where ffn = conv-bn-SiLU(dim -> 2 * dim) followed by conv-bn(2 * dim -> dim).
static nvinfer1::ILayer* PSABlock(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int dim,
                                  float attn_ratio, int num_heads, bool shortcut, std::string lname) {
    nvinfer1::ILayer* attn_out = Attention(network, weightMap, input, dim, num_heads, attn_ratio, lname + ".attn");

    // First residual: add the block input to the attention output when requested.
    nvinfer1::ILayer* stage1 = attn_out;
    if (shortcut) {
        stage1 = network->addElementWise(input, *attn_out->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    }

    // Feed-forward: expand to 2 * dim with SiLU, then project back to dim without activation.
    auto* expand = convBnSiLU(network, weightMap, *stage1->getOutput(0), dim * 2, {1, 1}, 1, lname + ".ffn.0");
    auto* project = convBn(network, weightMap, *expand->getOutput(0), dim, 1, 1, lname + ".ffn.1");

    if (!shortcut) {
        return project;
    }
    // Second residual around the feed-forward network.
    return network->addElementWise(*stage1->getOutput(0), *project->getOutput(0),
                                   nvinfer1::ElementWiseOperation::kSUM);
}

// Builds a C3k block: two parallel 1x1 conv branches from `input`, a chain of `n`
// bottlenecks on the first branch, a concatenation of both branches and a final 1x1 conv.
//
//   c1       : accepted for signature parity; not read by this function
//   c2       : output channel count of the closing ".cv3" convolution
//   n        : number of chained bottlenecks (named lname + ".m.<i>")
//   shortcut : forwarded to every bottleneck
//   k1, k2   : kernel sizes forwarded to every bottleneck
//   e        : hidden-width ratio; hidden channels = (int)(c2 * e)
//
// Returns the ".cv3" layer.
static nvinfer1::ILayer* C3k(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, std::vector<int> k1,
                             std::vector<int> k2, float e, std::string lname) {
    const int hidden = static_cast<int>(static_cast<float>(c2) * e);

    auto main_branch = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv1");
    auto side_branch = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv2");

    // Thread the main branch through the bottleneck chain.
    nvinfer1::ITensor* chain_out = main_branch->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        const std::string block_name = lname + ".m." + std::to_string(idx);
        chain_out = bottleneck(network, weightMap, *chain_out, hidden, hidden, shortcut, k1, k2, 1.0, block_name)
                            ->getOutput(0);
    }

    nvinfer1::ITensor* merged_inputs[] = {chain_out, side_branch->getOutput(0)};
    auto merged = network->addConcatenation(merged_inputs, 2);
    merged->setName((lname + ".cat").c_str());

    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv3");
}

// Builds a C3k2 block.
//
// A 1x1 conv (".cv1") produces 2 * hidden channels, which are sliced into two halves
// along dimension 1. Starting from the second half, `n` sub-blocks are chained; the
// output of every sub-block is appended to a running concatenation that starts with
// both halves. A closing 1x1 conv (".cv2") maps the concatenation to `c2` channels.
//
//   c1       : not read by this function
//   c2       : output channels of ".cv2"
//   n        : number of chained sub-blocks
//   c3k      : when true (and `attn` is false) each sub-block is a C3k, else a bottleneck
//   shortcut : forwarded to the sub-blocks
//   attn     : when true each sub-block is a bottleneck (".m.<i>.0") followed by a
//              PSABlock (".m.<i>.1"); takes precedence over `c3k`
//   e        : hidden-width ratio; hidden channels = (int)(c2 * e)
//
// Returns the ".cv2" layer.
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int n, bool c3k, bool shortcut, bool attn, float e, std::string lname) {
    const int hidden = static_cast<int>(static_cast<float>(c2) * e);

    nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, 2 * hidden, {1, 1}, 1, lname + ".cv1");
    nvinfer1::ITensor* cv1_out = cv1->getOutput(0);
    const nvinfer1::Dims shape = cv1_out->getDimensions();
    const auto half_channels = shape.d[1] / 2;

    // Two equally sized slices along dimension 1.
    nvinfer1::ISliceLayer* first_half =
            network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0},
                              nvinfer1::Dims4{shape.d[0], half_channels, shape.d[2], shape.d[3]},
                              nvinfer1::Dims4{1, 1, 1, 1});
    first_half->setName((lname + ".split1").c_str());
    nvinfer1::ISliceLayer* second_half =
            network->addSlice(*cv1_out, nvinfer1::Dims4{0, half_channels, 0, 0},
                              nvinfer1::Dims4{shape.d[0], half_channels, shape.d[2], shape.d[3]},
                              nvinfer1::Dims4{1, 1, 1, 1});
    second_half->setName((lname + ".split2").c_str());

    // The running concatenation starts with both halves.
    nvinfer1::ITensor* seed_inputs[] = {first_half->getOutput(0), second_half->getOutput(0)};
    nvinfer1::IConcatenationLayer* running_cat = network->addConcatenation(seed_inputs, 2);
    running_cat->setName((lname + ".cat0").c_str());

    nvinfer1::ITensor* chain_out = second_half->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        const std::string block_name = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* stage = nullptr;
        if (attn) {
            stage = bottleneck(network, weightMap, *chain_out, hidden, hidden, shortcut, {3, 3}, {3, 3}, 0.5,
                               block_name + ".0");
            stage = PSABlock(network, weightMap, *stage->getOutput(0), hidden, 0.5, max(1, hidden / 64), shortcut,
                             block_name + ".1");
        } else if (c3k) {
            stage = C3k(network, weightMap, *chain_out, hidden, hidden, 2, shortcut, {3, 3}, {3, 3}, 0.5, block_name);
        } else {
            stage = bottleneck(network, weightMap, *chain_out, hidden, hidden, shortcut, {3, 3}, {3, 3}, 0.5,
                               block_name);
        }
        chain_out = stage->getOutput(0);

        // Append this stage's output to everything gathered so far.
        nvinfer1::ITensor* grown_inputs[] = {running_cat->getOutput(0), chain_out};
        running_cat = network->addConcatenation(grown_inputs, 2);
        running_cat->setName((lname + ".cat" + std::to_string(idx + 1)).c_str());
    }

    return convBnSiLU(network, weightMap, *running_cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

// Builds an SPPF block.
//
// ".cv1" (1x1 conv created through conv(...) with its last argument false) reduces the
// input to c1 / 2 channels. Three max-pooling layers with window k x k, stride 1 and
// padding k / 2 are then applied one after another. The conv output and the three
// pooled outputs are concatenated and passed through ".cv2" (1x1 conv to c2 channels).
//
//   shortcut : when true and c1 == c2, `input` is added to the ".cv2" output
//
// Returns the residual sum when the shortcut applies, otherwise the ".cv2" layer.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, bool shortcut, std::string lname) {
    const int hidden = c1 / 2;
    nvinfer1::ILayer* reduce = conv(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv1", 1, false);

    // pyramid[0] is the conv output; pyramid[i] is the i-th successive max-pool of it.
    const int kStages = 4;
    nvinfer1::ITensor* pyramid[kStages];
    pyramid[0] = reduce->getOutput(0);
    for (int level = 1; level < kStages; ++level) {
        nvinfer1::IPoolingLayer* pooled =
                network->addPoolingNd(*pyramid[level - 1], nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
        pooled->setStrideNd(nvinfer1::DimsHW{1, 1});
        pooled->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
        pyramid[level] = pooled->getOutput(0);
    }

    nvinfer1::IConcatenationLayer* merged = network->addConcatenation(pyramid, kStages);
    nvinfer1::IElementWiseLayer* fuse =
            convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");

    const bool use_residual = shortcut && (c1 == c2);
    if (!use_residual) {
        return fuse;
    }
    return network->addElementWise(input, *fuse->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// Builds a C2PSA block.
//
// ".cv1" (1x1 conv) produces 2 * c channels with c = (int)(c2 * e). The result is
// sliced into two halves along dimension 1. The second half is passed through `n`
// chained PSABlocks (".m.<i>", shortcut enabled, attn_ratio 0.5, c / 64 heads). The
// untouched first half and the PSA output are concatenated and fed to ".cv2", a 1x1
// conv with `c1` output channels.
//
// Returns the ".cv2" layer.
nvinfer1::IElementWiseLayer* C2PSA(nvinfer1::INetworkDefinition* network,
                                   std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
                                   int c1, int c2, int n, float e, std::string lname) {
    const int c = c2 * e;

    nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, 2 * c, {1, 1}, 1, lname + ".cv1");
    nvinfer1::ITensor* cv1_out = cv1->getOutput(0);

    // Slice the ".cv1" output into two equal halves along dimension 1.
    const nvinfer1::Dims shape = cv1_out->getDimensions();
    const auto half_channels = shape.d[1] / 2;
    nvinfer1::ISliceLayer* pass_through =
            network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0},
                              nvinfer1::Dims4{shape.d[0], half_channels, shape.d[2], shape.d[3]},
                              nvinfer1::Dims4{1, 1, 1, 1});
    nvinfer1::ISliceLayer* attended =
            network->addSlice(*cv1_out, nvinfer1::Dims4{0, half_channels, 0, 0},
                              nvinfer1::Dims4{shape.d[0], half_channels, shape.d[2], shape.d[3]},
                              nvinfer1::Dims4{1, 1, 1, 1});

    // Run the second half through the PSA chain; each block consumes the previous output.
    nvinfer1::ITensor* psa_out = attended->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        auto* psa = PSABlock(network, weightMap, *psa_out, c, 0.5, c / 64, true, lname + ".m." + std::to_string(idx));
        psa_out = psa->getOutput(0);
    }

    // Join the untouched first half with the PSA result.
    nvinfer1::ITensor* merged_inputs[2] = {pass_through->getOutput(0), psa_out};
    nvinfer1::IConcatenationLayer* merged = network->addConcatenation(merged_inputs, 2);

    return convBnSiLU(network, weightMap, *merged->getOutput(0), c1, {1, 1}, 1, lname + ".cv2");
}

// Builds a depth-wise convolution block: bias-free conv with `ch` groups, batch norm
// (eps 1e-3) and an x * sigmoid(x) activation.
//
//   ch    : number of output maps and number of groups
//   k     : kernel size as {k[0], k[1]}; padding is {k[0] / 2, k[1] / 2}
//   s     : stride applied in both spatial dimensions
//   lname : name prefix; weights are read from lname + ".conv.weight" and lname + ".bn"
//
// Returns the element-wise product layer that closes the block.
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname) {
    // The convolution carries no bias; batch norm follows immediately.
    nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* depthwise = network->addConvolutionNd(
            input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], no_bias);
    assert(depthwise);
    depthwise->setStrideNd(nvinfer1::DimsHW{s, s});
    depthwise->setNbGroups(ch);

    // auto pad
    const int pad_h = k[0] / 2;
    const int pad_w = k[1] / 2;
    depthwise->setPaddingNd(nvinfer1::DimsHW{pad_h, pad_w});

    nvinfer1::IScaleLayer* norm = addBatchNorm2d(network, weightMap, *depthwise->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor* norm_out = norm->getOutput(0);

    // x * sigmoid(x)
    nvinfer1::IActivationLayer* gate = network->addActivation(*norm_out, nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* activated =
            network->addElementWise(*norm_out, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(activated);
    return activated;
}

// Appends the "YoloLayer_TRT" (version "1") plugin to `network`, fed by `input`.
//
// The plugin receives a single INT32 field named "combinedInfo" laid out as:
//   [0] class count  (kNumClass, or kPoseNumClass when is_pose, or kObbNumClass when
//                     is_obb; is_obb wins if both flags are set)
//   [1] kNumberOfPoints
//   [2] kMaxNumOutputBbox
//   [3] is_detection   [4] is_segmentation   [5] is_pose   [6] is_obb
//   [7] anchorCount
//   [8 .. 8 + stridesLength) left at zero
//
// NOTE(review): `strides` and `fm_sizes` are accepted but never read here; only
// `stridesLength` influences the field length. Confirm against the plugin whether the
// trailing slots are expected to carry the stride values.
//
// Returns the plugin layer produced by addPluginV2 (may be nullptr if TensorRT rejects it).
nvinfer1::IPluginV2Layer* addYoloLayer(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input,
                                       const std::vector<int>& strides, const std::vector<int>& fm_sizes,
                                       int stridesLength, bool is_detection, bool is_segmentation, bool is_pose,
                                       bool is_obb, int anchorCount) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    // The registry lookup yields nullptr when the plugin has not been registered;
    // fail loudly here instead of dereferencing it below.
    assert(creator);

    const int netinfo_count = 8;
    const int total_count = netinfo_count + stridesLength;
    int class_num = kNumClass;
    if (is_pose) {
        class_num = kPoseNumClass;
    }

    if (is_obb) {
        class_num = kObbNumClass;
    }

    // Value-initialised, so every slot not written below stays 0.
    std::vector<int> combinedInfo(total_count);
    combinedInfo[0] = class_num;
    combinedInfo[1] = kNumberOfPoints;
    combinedInfo[2] = kMaxNumOutputBbox;
    combinedInfo[3] = is_detection;
    combinedInfo[4] = is_segmentation;
    combinedInfo[5] = is_pose;
    combinedInfo[6] = is_obb;
    combinedInfo[7] = anchorCount;

    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = static_cast<int32_t>(combinedInfo.size());

    nvinfer1::PluginFieldCollection pluginFieldCollection;
    pluginFieldCollection.nbFields = 1;
    pluginFieldCollection.fields = &pluginField;

    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    // addPluginV2 takes the plugin by reference, so a failed creation must be caught first.
    assert(pluginObject);

    // Use the single input tensor instead of multiple detection heads
    nvinfer1::ITensor* inputTensors[] = {&input};
    nvinfer1::IPluginV2Layer* yololayer = network->addPluginV2(inputTensors, 1, *pluginObject);
    return yololayer;
}

================================================
FILE: yolo26/src/model.cpp
================================================
#include <math.h>
#include <iostream>

#include "block.h"
// #include "calibrator.h"
#include "config.h"
#include "model.h"

// Scales a nominal channel count: clamp `x` to `max_channels`, multiply by the width
// factor `gw`, then round up to the next multiple of `divisor`.
static int get_width(int x, float gw, int max_channels, int divisor = 8) {
    const int capped = std::min(x, max_channels);
    const int groups = int(ceil((capped * gw) / divisor));
    return groups * divisor;
}

// Scales a nominal repeat count `x` by the depth factor `gd`.
//
// A nominal count of 1 is returned unchanged. Otherwise x * gd is rounded with
// round(); when the product sits exactly on .5 above an even integer the result is
// lowered by one, so exact halves land on the even neighbour. The result is never
// smaller than 1.
static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }

    const float scaled = x * gd;
    const int whole = int(scaled);
    const bool exact_half_above_even = (scaled - whole == 0.5) && (whole % 2 == 0);

    int repeats = round(scaled);
    if (exact_half_above_even) {
        repeats -= 1;
    }
    return std::max<int>(repeats, 1);
}

// Fills strides[i] with reference_size divided (integer division) by dimension 2 of
// the first output of conv_layers[i], for i in [0, size).
void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) {
    int i = 0;
    while (i < size) {
        nvinfer1::ILayer* current = conv_layers[i];
        const nvinfer1::Dims out_shape = current->getOutput(0)->getDimensions();
        const int map_extent = out_shape.d[2];
        strides[i] = reference_size / map_extent;
        ++i;
    }
}

// Builds the YOLO26 detection network layer by layer and returns it serialized.
//
// Steps performed here:
//   1. load the weights from `wts_path` into a name -> Weights map;
//   2. create an explicit-batch network with one input of shape
//      {kBatchSize, 3, kInputH, kInputW};
//   3. add the backbone ("model.0" .. "model.10"), the head ("model.11" .. "model.22")
//      and the one-to-one detection branches ("model.23.one2one_cv2/cv3.*");
//   4. combine the box branch with a constant grid and a constant per-anchor stride,
//      concatenate with the sigmoid of the class branch, transpose, and feed the
//      result to the YoloLayer plugin through addYoloLayer();
//   5. configure the builder, build the serialized network, then delete the network
//      and free every weight buffer in the map.
//
// Parameters:
//   builder      : used to create the network and to build the serialized engine
//   config       : receives the workspace limit and, under USE_FP16, the FP16 flag
//   dt           : data type of the network input tensor
//   wts_path     : path handed to loadWeights()
//   gd           : depth factor passed to get_depth()
//   gw           : width factor passed to get_width()
//   max_channels : channel cap passed to get_width()
//   type         : model variant; "m", "l" and "x" enable c3k for "model.2"/"model.4"
//                  and select c2 * 2 (instead of c2 * 4) channels for
//                  "model.23.one2one_cv3.2.0.0"
//
// Returns the pointer produced by buildSerializedNetwork(); it is not checked here, so
// callers should test it for nullptr.
nvinfer1::IHostMemory* buildEngineYolo26Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type)

{
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);

    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
     ******************************************  YOLO26 INPUT  **********************************************
     *******************************************************************************************************/

    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLO26 BACKBONE  ********************************************
    *******************************************************************************************************/

    // model.0 / model.1: two 3x3 stride-2 convolutions.
    nvinfer1::IElementWiseLayer* block0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");

    nvinfer1::IElementWiseLayer* block1 = convBnSiLU(network, weightMap, *block0->getOutput(0),
                                                     get_width(128, gw, max_channels), {3, 3}, 2, "model.1");

    // model.2 and model.4 use C3k sub-blocks only for the "m", "l" and "x" variants;
    // the later C3K2 blocks below always pass c3k = true.
    bool c3k = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k = true;
    }

    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *block1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.2");

    nvinfer1::IElementWiseLayer* block3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.3");

    nvinfer1::IElementWiseLayer* block4 =
            C3K2(network, weightMap, *block3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.4");

    nvinfer1::IElementWiseLayer* block5 = convBnSiLU(network, weightMap, *block4->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.5");

    nvinfer1::IElementWiseLayer* block6 =
            C3K2(network, weightMap, *block5->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.6");

    nvinfer1::IElementWiseLayer* block7 = convBnSiLU(network, weightMap, *block6->getOutput(0),
                                                     get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");

    nvinfer1::IElementWiseLayer* block8 =
            C3K2(network, weightMap, *block7->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.8");

    // model.9: SPPF with pooling window 5 and the shortcut flag enabled.
    nvinfer1::IElementWiseLayer* block9 =
            SPPF(network, weightMap, *block8->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, true, "model.9");

    nvinfer1::IElementWiseLayer* block10 =
            C2PSA(network, weightMap, *block9->getOutput(0), get_width(1024, gw, max_channels),
                  get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10");
    /*******************************************************************************************************
    *********************************************  YOLO26 HEAD  ********************************************
    *******************************************************************************************************/
    // Resize scales per dimension: the last two dimensions are doubled, the first two kept.
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*block10->getOutput(0));
    assert(upsample11);

    upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample11->setScales(scale, 4);
    // Top-down path: upsampled model.10 output joined with the model.6 output.
    nvinfer1::ITensor* inputTensors12[] = {upsample11->getOutput(0), block6->getOutput(0)};

    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensors12, 2);

    nvinfer1::IElementWiseLayer* block13 =
            C3K2(network, weightMap, *cat12->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = network->addResize(*block13->getOutput(0));
    assert(upsample14);

    upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample14->setScales(scale, 4);

    // Upsampled model.13 output joined with the model.4 output.
    nvinfer1::ITensor* inputTensors15[] = {upsample14->getOutput(0), block4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensors15, 2);

    nvinfer1::IElementWiseLayer* block16 =
            C3K2(network, weightMap, *cat15->getOutput(0), get_width(512, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.16");

    // Bottom-up path: stride-2 conv of model.16 joined with the model.13 output.
    nvinfer1::IElementWiseLayer* block17 = convBnSiLU(network, weightMap, *block16->getOutput(0),
                                                      get_width(256, gw, max_channels), {3, 3}, 2, "model.17");

    nvinfer1::ITensor* inputTensors18[] = {block17->getOutput(0), block13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensors18, 2);

    nvinfer1::IElementWiseLayer* block19 =
            C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.19");

    // Stride-2 conv of model.19 joined with the model.10 output.
    nvinfer1::IElementWiseLayer* block20 = convBnSiLU(network, weightMap, *block19->getOutput(0),
                                                      get_width(512, gw, max_channels), {3, 3}, 2, "model.20");

    nvinfer1::ITensor* inputTensors21[] = {block20->getOutput(0), block10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensors21, 2);

    // model.22 is the only C3K2 built with attn = true (bottleneck + PSABlock sub-blocks),
    // and its repeat count is fixed at 1 rather than derived from gd.
    nvinfer1::IElementWiseLayer* block22 =
            C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 1, true, true, true, 0.5,
                 "model.22");  // WARN: get_depth(2, gd) changed to 1.

    /*******************************************************************************************************
    *********************************************  YOLO26 OUTPUT  ********************************************
    *******************************************************************************************************/

    // c2: base width of the detection branches (at least 64);
    // c3: width of the class branch (at least min(kNumClass, 100)).
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels)), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100));

    /////////////////////////////////////////////////////
    // Class branch on model.16: two (grouped 3x3 conv, 1x1 conv) pairs, then a 1x1 conv
    // with bias producing kNumClass channels, flattened to {1, kNumClass, -1}.
    // NOTE(review): the reshapes in this section hard-code a leading 1 while the input
    // uses kBatchSize - confirm this path is only used with kBatchSize == 1.

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_0_0 =
            convBnSiLU(network, weightMap, *block16->getOutput(0), c2, {3, 3}, 1, "model.23.one2one_cv3.0.0.0", c2);

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.0.0.1", 1);

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_1_0 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1,
                       "model.23.one2one_cv3.0.1.0", c3);

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.0.1.1", 1);

    nvinfer1::IConvolutionLayer* conv23_one2one_cv3_0_2 = network->addConvolutionNd(
            *conv23_one2one_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv3.0.2.weight"], weightMap["model.23.one2one_cv3.0.2.bias"]);
    conv23_one2one_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv3_0_2->setNbGroups(1);

    nvinfer1::IShuffleLayer* reshape23_3 = network->addShuffle(*conv23_one2one_cv3_0_2->getOutput(0));
    reshape23_3->setReshapeDimensions(nvinfer1::Dims3{1, kNumClass, -1});

    /////////////////////////////////////////////////////
    // Class branch on model.19 (first conv uses c2 * 2 channels and groups).

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_0_0 = convBnSiLU(
            network, weightMap, *block19->getOutput(0), c2 * 2, {3, 3}, 1, "model.23.one2one_cv3.1.0.0", c2 * 2);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.1.0.1", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_1_0 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1,
                       "model.23.one2one_cv3.1.1.0", c3);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.1.1.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv3_1_2 = network->addConvolutionNd(
            *conv23_one2one_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv3.1.2.weight"], weightMap["model.23.one2one_cv3.1.2.bias"]);
    conv23_one2one_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv3_1_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_4 = network->addShuffle(*conv23_one2one_cv3_1_2->getOutput(0));
    reshape23_4->setReshapeDimensions(nvinfer1::Dims3{1, kNumClass, -1});

    /////////////////////////////////////////////////////
    // Class branch on model.22: the first conv uses c2 * 2 channels/groups for the
    // "m", "l" and "x" variants and c2 * 4 otherwise.
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_0_0;
    if (type == "m" || type == "l" || type == "x") {
        conv23_one2one_cv3_2_0_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 * 2, {3, 3}, 1,
                                              "model.23.one2one_cv3.2.0.0", c2 * 2);
    } else {
        conv23_one2one_cv3_2_0_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 * 4, {3, 3}, 1,
                                              "model.23.one2one_cv3.2.0.0", c2 * 4);
    }

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.2.0.1", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_1_0 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1,
                       "model.23.one2one_cv3.2.1.0", c3);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.2.1.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv3_2_2 = network->addConvolutionNd(
            *conv23_one2one_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv3.2.2.weight"], weightMap["model.23.one2one_cv3.2.2.bias"]);
    conv23_one2one_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv3_2_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_5 = network->addShuffle(*conv23_one2one_cv3_2_2->getOutput(0));
    reshape23_5->setReshapeDimensions(nvinfer1::Dims3{1, kNumClass, -1});

    /////////////////////////////////////////////////////
    // Join the three flattened class maps along axis 2 and apply a sigmoid.

    nvinfer1::ITensor* inputTensors23_1[] = {reshape23_3->getOutput(0), reshape23_4->getOutput(0),
                                             reshape23_5->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensors23_1, 3);
    cat23_1->setAxis(2);
    nvinfer1::IActivationLayer* sigmoid23 = network->addActivation(
            *cat23_1->getOutput(0),
            nvinfer1::ActivationType::kSIGMOID);  // TODO: THIS IS UNNECESSARY, REMOVE AFTER PLUGIN IS READY

    /////////////////////////////////////////////////////
    // Box branch on model.16: two 3x3 convs with c2 / 4 channels, then a 1x1 conv with
    // bias producing 4 channels, flattened to {1, 4, -1}.

    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_0_0 =
            convBnSiLU(network, weightMap, *block16->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.0.0", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv2_0_0->getOutput(0), c2 / 4, {3, 3}, 1,
                       "model.23.one2one_cv2.0.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv2_0_2 = network->addConvolutionNd(
            *conv23_one2one_cv2_0_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv2.0.2.weight"], weightMap["model.23.one2one_cv2.0.2.bias"]);
    conv23_one2one_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv2_0_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23 = network->addShuffle(*conv23_one2one_cv2_0_2->getOutput(0));
    reshape23->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1});

    /////////////////////////////////////////////////////
    // Box branch on model.19.

    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_1_0 =
            convBnSiLU(network, weightMap, *block19->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.1.0", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv2_1_0->getOutput(0), c2 / 4, {3, 3}, 1,
                       "model.23.one2one_cv2.1.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv2_1_2 = network->addConvolutionNd(
            *conv23_one2one_cv2_1_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv2.1.2.weight"], weightMap["model.23.one2one_cv2.1.2.bias"]);
    conv23_one2one_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv2_1_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_1 = network->addShuffle(*conv23_one2one_cv2_1_2->getOutput(0));
    reshape23_1->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1});

    /////////////////////////////////////////////////////
    // Box branch on model.22.

    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_2_0 =
            convBnSiLU(network, weightMap, *block22->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.2.0", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv2_2_0->getOutput(0), c2 / 4, {3, 3}, 1,
                       "model.23.one2one_cv2.2.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv2_2_2 = network->addConvolutionNd(
            *conv23_one2one_cv2_2_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv2.2.2.weight"], weightMap["model.23.one2one_cv2.2.2.bias"]);
    conv23_one2one_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv2_2_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_2 = network->addShuffle(*conv23_one2one_cv2_2_2->getOutput(0));
    reshape23_2->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1});

    /////////////////////////////////////////////////////
    // Join the three flattened box maps along axis 2 -> {1, 4, anchor_num}.

    nvinfer1::ITensor* inputTensors23[] = {reshape23->getOutput(0), reshape23_1->getOutput(0),
                                           reshape23_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23 = network->addConcatenation(inputTensors23, 3);
    cat23->setAxis(2);

    /////////////////////////////////////////////////////
    // Split the 4 box channels into two halves along dimension 1:
    // slice23_1 holds the first half, slice23 the second half.

    nvinfer1::ISliceLayer* slice23_1 = network->addSlice(
            *cat23->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{cat23->getOutput(0)->getDimensions().d[0], cat23->getOutput(0)->getDimensions().d[1] / 2,
                            cat23->getOutput(0)->getDimensions().d[2]},
            nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* slice23 = network->addSlice(
            *cat23->getOutput(0), nvinfer1::Dims3{0, cat23->getOutput(0)->getDimensions().d[1] / 2, 0},
            nvinfer1::Dims3{cat23->getOutput(0)->getDimensions().d[0], cat23->getOutput(0)->getDimensions().d[1] / 2,
                            cat23->getOutput(0)->getDimensions().d[2]},
            nvinfer1::Dims3{1, 1, 1});

    // Total anchor count, read back from the concatenated box tensor's last dimension.
    const int anchor_num = cat23->getOutput(0)->getDimensions().d[2];

    // Feature-map sizes are taken from dimension 2 of each head input only.
    std::vector<int> fm_sizes;
    int fm_h_0 = block16->getOutput(0)->getDimensions().d[2];  // P3
    int fm_h_1 = block19->getOutput(0)->getDimensions().d[2];  // P4
    int fm_h_2 = block22->getOutput(0)->getDimensions().d[2];  // P5

    fm_sizes.push_back(fm_h_0);
    fm_sizes.push_back(fm_h_1);
    fm_sizes.push_back(fm_h_2);

    // Per-anchor stride: the first fm0^2 anchors get strides[0], the next fm1^2 get
    // strides[1], the remainder get strides[2].
    // NOTE(review): strides use kInputH only and every level is treated as
    // fm_size x fm_size, so this assumes square feature maps (kInputH == kInputW) - confirm.
    std::vector<int> strides = {kInputH / fm_h_0, kInputH / fm_h_1, kInputH / fm_h_2};
    std::vector<float> grid(anchor_num * 2);
    std::vector<float> stride_vec(anchor_num);
    std::fill(stride_vec.begin(), stride_vec.begin() + fm_sizes[0] * fm_sizes[0], strides[0]);
    std::fill(stride_vec.begin() + fm_sizes[0] * fm_sizes[0],
              stride_vec.begin() + fm_sizes[0] * fm_sizes[0] + fm_sizes[1] * fm_sizes[1], strides[1]);
    std::fill(stride_vec.begin() + fm_sizes[0] * fm_sizes[0] + fm_sizes[1] * fm_sizes[1], stride_vec.end(), strides[2]);

    // Cell-centre grid laid out as two rows of anchor_num values:
    // grid[0 .. anchor_num) holds x + 0.5, grid[anchor_num .. 2 * anchor_num) holds y + 0.5,
    // enumerated level by level in row-major order.
    int idx = 0;
    for (int s = 0; s < fm_sizes.size(); ++s) {
        int h = fm_sizes[s];
        int w = fm_sizes[s];

        for (int y = 0; y < h; ++y) {
            for (int x = 0; x < w; ++x) {
                grid[idx] = x + 0.5f;
                grid[idx + anchor_num] = y + 0.5f;

                idx++;
            }
        }
    }

    nvinfer1::Dims gridDims;
    gridDims.nbDims = 3;
    gridDims.d[0] = 1;
    gridDims.d[1] = 2;
    gridDims.d[2] = anchor_num;

    // `grid` (and `stride_vec` below) back the constant layers and stay alive until the
    // end of this function, i.e. past the buildSerializedNetwork() call.
    nvinfer1::IConstantLayer* constant_grid = network->addConstant(
            gridDims, nvinfer1::Weights{nvinfer1::DataType::kFLOAT, grid.data(), (int64_t)grid.size()});

    // grid + second half of the box channels.
    nvinfer1::IElementWiseLayer* conv23_add_1 = network->addElementWise(
            *constant_grid->getOutput(0), *slice23->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);

    // grid - first half of the box channels.
    nvinfer1::IElementWiseLayer* conv23_sub_1 = network->addElementWise(
            *constant_grid->getOutput(0), *slice23_1->getOutput(0), nvinfer1::ElementWiseOperation::kSUB);

    // Stack (grid - first half, grid + second half) back into 4 channels.
    // NOTE(review): presumably this yields corner-style box coordinates in grid units - confirm.
    nvinfer1::ITensor* tensor23[] = {conv23_sub_1->getOutput(0), conv23_add_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(tensor23, 2);
    cat23_2->setAxis(1);

    nvinfer1::IConstantLayer* constant_stride = network->addConstant(
            nvinfer1::Dims3{1, 1, anchor_num},
            nvinfer1::Weights{nvinfer1::DataType::kFLOAT, stride_vec.data(), (int64_t)stride_vec.size()});

    // Multiply every anchor's 4 box values by that anchor's stride.
    nvinfer1::IElementWiseLayer* mul23_2 = network->addElementWise(
            *cat23_2->getOutput(0), *constant_stride->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);

    ///////////////////////////////////////////////////////////
    // Concatenate boxes (4 channels) with class scores (kNumClass channels) along axis 1,
    // then swap the last two dimensions.

    nvinfer1::IConcatenationLayer* cat23_3 = network->addConcatenation(
            std::array<nvinfer1::ITensor*, 2>{mul23_2->getOutput(0), sigmoid23->getOutput(0)}.data(), 2);
    cat23_3->setAxis(1);

    nvinfer1::IShuffleLayer* transpose = network->addShuffle(*cat23_3->getOutput(0));
    transpose->setFirstTranspose(nvinfer1::Permutation{0, 2, 1});
    // transpose->setReshapeDimensions(nvinfer1::Dims3{1, anchor_num, kNumClass + 4});

    ///////////////////////////////////////////////////////////
    // Hand the transposed tensor to the YoloLayer plugin in detection mode.

    int stridesLength = strides.size();
    nvinfer1::IPluginV2Layer* yolo = addYoloLayer(network, *transpose->getOutput(0), strides, fm_sizes, stridesLength,
                                                  true, false, false, false, anchor_num);
    assert(yolo);

    ///////////////////////////////////////////////////////////

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Use setMemoryPoolLimit instead of deprecated setMaxWorkspaceSize
    // (workspace limit: 16 MiB).
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cerr << "INT8 not supported for YOLO26 model yet." << std::endl;
#endif

    // NOTE(review): the success message below is printed without checking whether
    // buildSerializedNetwork() returned nullptr.
    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    // Release every weight buffer held by the map (released with free()).
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Builds the YOLO26 oriented-bounding-box (OBB) network layer by layer with the TensorRT
// network-definition API and returns the serialized engine.
//
//   builder, config : TensorRT builder and builder configuration. The workspace pool limit and,
//                     when USE_FP16 is defined, the FP16 flag are set on `config` here.
//   dt              : data type of the network input tensor.
//   wts_path        : path handed to loadWeights() to obtain the name -> weights map.
//   gd, gw          : forwarded to get_depth() / get_width() (presumably the depth and width
//                     multipliers of the model variant).
//   max_channels    : forwarded to get_width() (presumably a channel cap).
//   type            : model variant; "m", "l" and "x" enable c3k for model.2 / model.4 and use
//                     c2 * 2 instead of c2 * 4 for the first cv3 conv of the last head branch.
//
// Returns the result of buildSerializedNetwork(); this is nullptr when the build fails, in which
// case a message is written to stderr. The returned object is not released here. The network
// definition and every buffer in the weight map are released before returning.
nvinfer1::IHostMemory* buildEngineYolo26Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type)

{
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);

    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
     ******************************************  YOLO26-Obb INPUT  **********************************************
     *******************************************************************************************************/

    nvinfer1::ITensor* data =
            network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kObbInputH, kObbInputW});
    assert(data);

    /*******************************************************************************************************
     ****************************************  YOLO26-Obb BACKBONE  ********************************************
     *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* block0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");

    nvinfer1::IElementWiseLayer* block1 = convBnSiLU(network, weightMap, *block0->getOutput(0),
                                                     get_width(128, gw, max_channels), {3, 3}, 2, "model.1");

    // Only the m / l / x variants use c3k in model.2 and model.4.
    bool c3k = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k = true;
    }

    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *block1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.2");

    nvinfer1::IElementWiseLayer* block3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.3");

    nvinfer1::IElementWiseLayer* block4 =
            C3K2(network, weightMap, *block3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.4");

    nvinfer1::IElementWiseLayer* block5 = convBnSiLU(network, weightMap, *block4->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.5");

    nvinfer1::IElementWiseLayer* block6 =
            C3K2(network, weightMap, *block5->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.6");

    nvinfer1::IElementWiseLayer* block7 = convBnSiLU(network, weightMap, *block6->getOutput(0),
                                                     get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");

    nvinfer1::IElementWiseLayer* block8 =
            C3K2(network, weightMap, *block7->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.8");

    nvinfer1::IElementWiseLayer* block9 = SPPF(network, weightMap, *block8->getOutput(0),
                                               get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5,
                                               true, "model.9");  // TODO: VERIFY THIS BLOCK FOR OTHER YOLO26 MODELS

    nvinfer1::IElementWiseLayer* block10 =
            C2PSA(network, weightMap, *block9->getOutput(0), get_width(1024, gw, max_channels),
                  get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10");

    /*******************************************************************************************************
    *********************************************  YOLO26-Obb HEAD  ********************************************
    *******************************************************************************************************/

    // Nearest-neighbour 2x upsampling of the two spatial axes (N and C are left at 1.0).
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*block10->getOutput(0));
    assert(upsample11);

    upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample11->setScales(scale, 4);
    nvinfer1::ITensor* inputTensors12[] = {upsample11->getOutput(0), block6->getOutput(0)};

    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensors12, 2);

    nvinfer1::IElementWiseLayer* block13 =
            C3K2(network, weightMap, *cat12->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = network->addResize(*block13->getOutput(0));
    assert(upsample14);

    upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample14->setScales(scale, 4);

    nvinfer1::ITensor* inputTensors15[] = {upsample14->getOutput(0), block4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensors15, 2);

    // block16 / block19 / block22 are the three feature maps consumed by the output head below.
    nvinfer1::IElementWiseLayer* block16 =
            C3K2(network, weightMap, *cat15->getOutput(0), get_width(512, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.16");

    nvinfer1::IElementWiseLayer* block17 = convBnSiLU(network, weightMap, *block16->getOutput(0),
                                                      get_width(256, gw, max_channels), {3, 3}, 2, "model.17");

    nvinfer1::ITensor* inputTensors18[] = {block17->getOutput(0), block13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensors18, 2);

    nvinfer1::IElementWiseLayer* block19 =
            C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.19");

    nvinfer1::IElementWiseLayer* block20 = convBnSiLU(network, weightMap, *block19->getOutput(0),
                                                      get_width(512, gw, max_channels), {3, 3}, 2, "model.20");

    nvinfer1::ITensor* inputTensors21[] = {block20->getOutput(0), block10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensors21, 2);

    nvinfer1::IElementWiseLayer* block22 =
            C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 1, true, true, true, 0.5,
                 "model.22");  // WARN: get_depth(2, gd) changed to 1.

    /*******************************************************************************************************
    *********************************************  YOLO26-Obb OUTPUT  ********************************************
    *******************************************************************************************************/

    // Hidden channel counts of the head: c2 feeds the cv2 / cv4 / cv3.*.0.0 convs, c3 the remaining
    // class-branch convs.
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels)), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kObbNumClass, 100));

    //cv.2.*.*
    /////////////////////////////////////////////////////
    // Box branch: for each of the three feature maps, two 3x3 convBnSiLU blocks followed by a 1x1
    // convolution with 4 output channels, flattened to {1, 4, H*W}.

    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_0_0 =
            convBnSiLU(network, weightMap, *block16->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.0.0", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv2_0_0->getOutput(0), c2 / 4, {3, 3}, 1,
                       "model.23.one2one_cv2.0.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv2_0_2 = network->addConvolutionNd(
            *conv23_one2one_cv2_0_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv2.0.2.weight"], weightMap["model.23.one2one_cv2.0.2.bias"]);
    conv23_one2one_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv2_0_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23 = network->addShuffle(*conv23_one2one_cv2_0_2->getOutput(0));
    reshape23->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1});

    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_1_0 =
            convBnSiLU(network, weightMap, *block19->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.1.0", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv2_1_0->getOutput(0), c2 / 4, {3, 3}, 1,
                       "model.23.one2one_cv2.1.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv2_1_2 = network->addConvolutionNd(
            *conv23_one2one_cv2_1_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv2.1.2.weight"], weightMap["model.23.one2one_cv2.1.2.bias"]);
    conv23_one2one_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv2_1_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_1 = network->addShuffle(*conv23_one2one_cv2_1_2->getOutput(0));
    reshape23_1->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1});

    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_2_0 =
            convBnSiLU(network, weightMap, *block22->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.2.0", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv2_2_0->getOutput(0), c2 / 4, {3, 3}, 1,
                       "model.23.one2one_cv2.2.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv2_2_2 = network->addConvolutionNd(
            *conv23_one2one_cv2_2_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv2.2.2.weight"], weightMap["model.23.one2one_cv2.2.2.bias"]);
    conv23_one2one_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv2_2_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_2 = network->addShuffle(*conv23_one2one_cv2_2_2->getOutput(0));
    reshape23_2->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1});

    // Join the three scales along the flattened-location axis: {1, 4, grid_num}.
    nvinfer1::ITensor* inputTensors23[] = {reshape23->getOutput(0), reshape23_1->getOutput(0),
                                           reshape23_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23 = network->addConcatenation(inputTensors23, 3);
    cat23->setAxis(2);

    //cv.4.*.*
    /////////////////////////////////////////////////////
    // Angle branch: same layout as the box branch but with a single output channel per location,
    // flattened to {1, 1, H*W}.
    nvinfer1::IElementWiseLayer* conv23_one2one_cv4_0_0 =
            convBnSiLU(network, weightMap, *block16->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv4.0.0", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv4_0_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv4_0_0->getOutput(0), c2 / 4, {3, 3}, 1,
                       "model.23.one2one_cv4.0.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv4_0_2 = network->addConvolutionNd(
            *conv23_one2one_cv4_0_1->getOutput(0), 1, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv4.0.2.weight"], weightMap["model.23.one2one_cv4.0.2.bias"]);
    conv23_one2one_cv4_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv4_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv4_0_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_6 = network->addShuffle(*conv23_one2one_cv4_0_2->getOutput(0));
    reshape23_6->setReshapeDimensions(nvinfer1::Dims3{1, 1, -1});

    nvinfer1::IElementWiseLayer* conv23_one2one_cv4_1_0 =
            convBnSiLU(network, weightMap, *block19->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv4.1.0", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv4_1_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv4_1_0->getOutput(0), c2 / 4, {3, 3}, 1,
                       "model.23.one2one_cv4.1.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv4_1_2 = network->addConvolutionNd(
            *conv23_one2one_cv4_1_1->getOutput(0), 1, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv4.1.2.weight"], weightMap["model.23.one2one_cv4.1.2.bias"]);
    conv23_one2one_cv4_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv4_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv4_1_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_7 = network->addShuffle(*conv23_one2one_cv4_1_2->getOutput(0));
    reshape23_7->setReshapeDimensions(nvinfer1::Dims3{1, 1, -1});

    nvinfer1::IElementWiseLayer* conv23_one2one_cv4_2_0 =
            convBnSiLU(network, weightMap, *block22->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv4.2.0", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv4_2_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv4_2_0->getOutput(0), c2 / 4, {3, 3}, 1,
                       "model.23.one2one_cv4.2.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv4_2_2 = network->addConvolutionNd(
            *conv23_one2one_cv4_2_1->getOutput(0), 1, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv4.2.2.weight"], weightMap["model.23.one2one_cv4.2.2.bias"]);
    conv23_one2one_cv4_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv4_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv4_2_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_8 = network->addShuffle(*conv23_one2one_cv4_2_2->getOutput(0));
    reshape23_8->setReshapeDimensions(nvinfer1::Dims3{1, 1, -1});

    nvinfer1::ITensor* inputTensors23_2[] = {reshape23_6->getOutput(0), reshape23_7->getOutput(0),
                                             reshape23_8->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensors23_2, 3);
    cat23_2->setAxis(2);

    /////////////////////////////////////////////////////
    // Box decoding. The 4 box channels are split into a first and a second half of 2 channels each
    // (presumably the left/top and right/bottom distances - TODO confirm against the exporter).
    nvinfer1::ISliceLayer* split23__0 = network->addSlice(
            *cat23->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{cat23->getOutput(0)->getDimensions().d[0], cat23->getOutput(0)->getDimensions().d[1] / 2,
                            cat23->getOutput(0)->getDimensions().d[2]},
            nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23__1 = network->addSlice(
            *cat23->getOutput(0), nvinfer1::Dims3{0, cat23->getOutput(0)->getDimensions().d[1] / 2, 0},
            nvinfer1::Dims3{cat23->getOutput(0)->getDimensions().d[0], cat23->getOutput(0)->getDimensions().d[1] / 2,
                            cat23->getOutput(0)->getDimensions().d[2]},
            nvinfer1::Dims3{1, 1, 1});
    // (second half - first half) / 2: offset of the box centre from the grid point, before rotation.
    nvinfer1::IElementWiseLayer* sub23 = network->addElementWise(*split23__1->getOutput(0), *split23__0->getOutput(0),
                                                                 nvinfer1::ElementWiseOperation::kSUB);

    // Divide by 2
    // `two` has static storage so the pointer stored in the Weights struct stays valid for as long
    // as the network definition may read it.
    static float two = 2.0f;
    nvinfer1::Weights two_weights{nvinfer1::DataType::kFLOAT, &two, 1};
    nvinfer1::IConstantLayer* const_two = network->addConstant(nvinfer1::Dims3{1, 1, 1}, two_weights);
    nvinfer1::IElementWiseLayer* div23 = network->addElementWise(*sub23->getOutput(0), *const_two->getOutput(0),
                                                                 nvinfer1::ElementWiseOperation::kDIV);

    // Separate the two offset components (channel 0 and channel 1 of div23).
    nvinfer1::ISliceLayer* split23_1__0 = network->addSlice(
            *div23->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{div23->getOutput(0)->getDimensions().d[0], div23->getOutput(0)->getDimensions().d[1] / 2,
                            div23->getOutput(0)->getDimensions().d[2]},
            nvinfer1::Dims3{1, 1, 1});

    nvinfer1::ISliceLayer* split23_1__1 = network->addSlice(
            *div23->getOutput(0), nvinfer1::Dims3{0, div23->getOutput(0)->getDimensions().d[1] / 2, 0},
            nvinfer1::Dims3{div23->getOutput(0)->getDimensions().d[0], div23->getOutput(0)->getDimensions().d[1] / 2,
                            div23->getOutput(0)->getDimensions().d[2]},
            nvinfer1::Dims3{1, 1, 1});

    // Rotate the offset by the predicted angle:
    //   sub23_1 = c0 * cos(a) - c1 * sin(a)
    //   add23   = c0 * sin(a) + c1 * cos(a)
    nvinfer1::IUnaryLayer* cos23 = network->addUnary(*cat23_2->getOutput(0), nvinfer1::UnaryOperation::kCOS);
    nvinfer1::IUnaryLayer* sin23 = network->addUnary(*cat23_2->getOutput(0), nvinfer1::UnaryOperation::kSIN);

    nvinfer1::IElementWiseLayer* mul23 = network->addElementWise(*split23_1__0->getOutput(0), *cos23->getOutput(0),
                                                                 nvinfer1::ElementWiseOperation::kPROD);
    nvinfer1::IElementWiseLayer* mul23_1 = network->addElementWise(*split23_1__1->getOutput(0), *sin23->getOutput(0),
                                                                   nvinfer1::ElementWiseOperation::kPROD);
    nvinfer1::IElementWiseLayer* sub23_1 =
            network->addElementWise(*mul23->getOutput(0), *mul23_1->getOutput(0), nvinfer1::ElementWiseOperation::kSUB);

    nvinfer1::IElementWiseLayer* mul23_2 = network->addElementWise(*split23_1__0->getOutput(0), *sin23->getOutput(0),
                                                                   nvinfer1::ElementWiseOperation::kPROD);
    nvinfer1::IElementWiseLayer* mul23_3 = network->addElementWise(*split23_1__1->getOutput(0), *cos23->getOutput(0),
                                                                   nvinfer1::ElementWiseOperation::kPROD);
    nvinfer1::IElementWiseLayer* add23 = network->addElementWise(*mul23_2->getOutput(0), *mul23_3->getOutput(0),
                                                                 nvinfer1::ElementWiseOperation::kSUM);

    nvinfer1::ITensor* tensor23[] = {sub23_1->getOutput(0), add23->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_3 = network->addConcatenation(tensor23, 2);
    cat23_3->setAxis(1);

    // Heights of the three head feature maps; the asserts below require them to be square and to
    // divide the input height evenly.
    std::vector<int> fm_sizes;
    int fm_h_0 = block16->getOutput(0)->getDimensions().d[2];  // P3
    int fm_h_1 = block19->getOutput(0)->getDimensions().d[2];  // P4
    int fm_h_2 = block22->getOutput(0)->getDimensions().d[2];  // P5

    fm_sizes.push_back(fm_h_0);
    fm_sizes.push_back(fm_h_1);
    fm_sizes.push_back(fm_h_2);

    // Total number of grid locations over the three scales.
    int grid_num = fm_h_0 * fm_h_0 + fm_h_1 * fm_h_1 + fm_h_2 * fm_h_2;

    assert((kObbInputH % fm_h_0) == 0 && (kObbInputH % fm_h_1) == 0 && (kObbInputH % fm_h_2) == 0);
    assert((fm_h_0 == block16->getOutput(0)->getDimensions().d[3]) &&
           (fm_h_1 == block19->getOutput(0)->getDimensions().d[3]) &&
           (fm_h_2 == block22->getOutput(0)->getDimensions().d[3]));  // verify fm_w == fm_h

    assert(cat23_3->getOutput(0)->getDimensions().d[2] == grid_num);

    // Cell-centre coordinates in grid units, laid out as {1, 2, grid_num}: the first grid_num
    // entries hold x + 0.5, the next grid_num entries hold y + 0.5, scale after scale in row-major
    // order. `grid` and `stride_vec` are referenced (not copied) by the Weights structs below and
    // stay in scope until after buildSerializedNetwork().
    int idx = 0;
    std::vector<float> grid(grid_num * 2);
    auto fill_grid = [&](int fm_h) {
        for (int y = 0; y < fm_h; ++y) {
            for (int x = 0; x < fm_h; ++x) {
                grid[idx] = x + 0.5f;
                grid[idx + grid_num] = y + 0.5f;
                idx++;
            }
        }
    };
    fill_grid(fm_h_0);
    fill_grid(fm_h_1);
    fill_grid(fm_h_2);

    // Per-location stride (input pixels per grid cell), in the same location order as `grid`.
    std::vector<float> stride_vec(grid_num);
    idx = 0;
    auto fill_stride = [&](int fm_h, int fm_w, int stride) {
        for (int y = 0; y < fm_h; ++y) {
            for (int x = 0; x < fm_w; ++x) {
                stride_vec[idx] = static_cast<float>(stride);
                idx++;
            }
        }
    };

    std::vector<int> strides = {kObbInputH / fm_h_0, kObbInputH / fm_h_1, kObbInputH / fm_h_2};
    fill_stride(fm_h_0, fm_h_0, strides[0]);
    fill_stride(fm_h_1, fm_h_1, strides[1]);
    fill_stride(fm_h_2, fm_h_2, strides[2]);

    nvinfer1::Dims gridDims{3, {1, 2, grid_num}};
    nvinfer1::IConstantLayer* constant_grid = network->addConstant(
            gridDims, nvinfer1::Weights{nvinfer1::DataType::kFLOAT, grid.data(), (int64_t)grid.size()});

    nvinfer1::Dims strideDims{3, {1, 1, grid_num}};
    nvinfer1::IConstantLayer* constant_stride = network->addConstant(
            strideDims, nvinfer1::Weights{nvinfer1::DataType::kFLOAT, stride_vec.data(), (int64_t)stride_vec.size()});

    // Centre = rotated offset + cell centre (2 channels).
    nvinfer1::IElementWiseLayer* add23_1 = network->addElementWise(*cat23_3->getOutput(0), *constant_grid->getOutput(0),
                                                                   nvinfer1::ElementWiseOperation::kSUM);

    // Size = first half + second half of the box channels (2 channels).
    nvinfer1::IElementWiseLayer* add23_2 = network->addElementWise(*split23__0->getOutput(0), *split23__1->getOutput(0),
                                                                   nvinfer1::ElementWiseOperation::kSUM);

    nvinfer1::ITensor* tensor23_4[] = {add23_1->getOutput(0), add23_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_4 = network->addConcatenation(tensor23_4, 2);
    cat23_4->setAxis(1);

    // Scale centre and size from grid units by the per-location stride (broadcast over channels).
    nvinfer1::IElementWiseLayer* mul23_4 = network->addElementWise(
            *cat23_4->getOutput(0), *constant_stride->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);

    /////////////////////////////////////////////////////
    // Class branch (cv3): per scale, four convBnSiLU blocks alternating 3x3 and 1x1 kernels, then a
    // 1x1 convolution with kObbNumClass outputs, flattened to {1, kObbNumClass, H*W}. The trailing
    // convBnSiLU argument equals the output channel count on the 3x3 blocks and 1 on the 1x1 blocks
    // (presumably the group count, i.e. depthwise + pointwise - TODO confirm in convBnSiLU).
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_0_0 =
            convBnSiLU(network, weightMap, *block16->getOutput(0), c2, {3, 3}, 1, "model.23.one2one_cv3.0.0.0", c2);

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.0.0.1", 1);

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_1_0 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1,
                       "model.23.one2one_cv3.0.1.0", c3);

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.0.1.1", 1);

    nvinfer1::IConvolutionLayer* conv23_one2one_cv3_0_2 = network->addConvolutionNd(
            *conv23_one2one_cv3_0_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv3.0.2.weight"], weightMap["model.23.one2one_cv3.0.2.bias"]);
    conv23_one2one_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv3_0_2->setNbGroups(1);

    nvinfer1::IShuffleLayer* reshape23_3 = network->addShuffle(*conv23_one2one_cv3_0_2->getOutput(0));
    reshape23_3->setReshapeDimensions(nvinfer1::Dims3{1, kObbNumClass, -1});

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_0_0 = convBnSiLU(
            network, weightMap, *block19->getOutput(0), c2 * 2, {3, 3}, 1, "model.23.one2one_cv3.1.0.0", c2 * 2);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.1.0.1", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_1_0 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1,
                       "model.23.one2one_cv3.1.1.0", c3);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.1.1.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv3_1_2 = network->addConvolutionNd(
            *conv23_one2one_cv3_1_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv3.1.2.weight"], weightMap["model.23.one2one_cv3.1.2.bias"]);
    conv23_one2one_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv3_1_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_4 = network->addShuffle(*conv23_one2one_cv3_1_2->getOutput(0));
    reshape23_4->setReshapeDimensions(nvinfer1::Dims3{1, kObbNumClass, -1});

    // The first conv of the last branch is c2 * 2 wide for m / l / x and c2 * 4 wide otherwise.
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_0_0;
    if (type == "m" || type == "l" || type == "x") {
        conv23_one2one_cv3_2_0_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 * 2, {3, 3}, 1,
                                              "model.23.one2one_cv3.2.0.0", c2 * 2);
    } else {
        conv23_one2one_cv3_2_0_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 * 4, {3, 3}, 1,
                                              "model.23.one2one_cv3.2.0.0", c2 * 4);
    }

    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.2.0.1", 1);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_1_0 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1,
                       "model.23.one2one_cv3.2.1.0", c3);
    nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1,
                       "model.23.one2one_cv3.2.1.1", 1);
    nvinfer1::IConvolutionLayer* conv23_one2one_cv3_2_2 = network->addConvolutionNd(
            *conv23_one2one_cv3_2_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1},
            weightMap["model.23.one2one_cv3.2.2.weight"], weightMap["model.23.one2one_cv3.2.2.bias"]);
    conv23_one2one_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_one2one_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv23_one2one_cv3_2_2->setNbGroups(1);
    nvinfer1::IShuffleLayer* reshape23_5 = network->addShuffle(*conv23_one2one_cv3_2_2->getOutput(0));
    reshape23_5->setReshapeDimensions(nvinfer1::Dims3{1, kObbNumClass, -1});

    nvinfer1::ITensor* tensor23_1[] = {reshape23_3->getOutput(0), reshape23_4->getOutput(0), reshape23_5->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(tensor23_1, 3);
    cat23_1->setAxis(2);
    nvinfer1::IActivationLayer* sigmoid23 = network->addActivation(
            *cat23_1->getOutput(0),
            nvinfer1::ActivationType::kSIGMOID);  // TODO: THIS IS UNNECESSARY, REMOVE AFTER PLUGIN IS READY
    /////////////////////////////////////////////////////

    // Final per-location record along axis 1: 4 box values, kObbNumClass class scores, 1 angle.
    // It is then transposed to {1, grid_num, 4 + kObbNumClass + 1} for the plugin.
    nvinfer1::ITensor* tensor23_5[] = {mul23_4->getOutput(0), sigmoid23->getOutput(0), cat23_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_5 = network->addConcatenation(tensor23_5, 3);
    cat23_5->setAxis(1);

    nvinfer1::IShuffleLayer* transpose = network->addShuffle(*cat23_5->getOutput(0));
    transpose->setFirstTranspose(nvinfer1::Permutation{0, 2, 1});

    nvinfer1::IPluginV2Layer* yolo = addYoloLayer(network, *transpose->getOutput(0), strides, fm_sizes, strides.size(),
                                                  false, false, false, true, grid_num);
    // Fail early (in debug builds) instead of dereferencing a null plugin layer below.
    assert(yolo);

    /////////////////////////////////////////////////////

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));
    // Use setMemoryPoolLimit instead of deprecated setMaxWorkspaceSize
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cerr << "INT8 not supported for YOLO26 model yet." << std::endl;
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // buildSerializedNetwork() returns nullptr on failure; only announce success when it succeeded.
    if (serialized_model) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;

    // The weight buffers are released with free() once the network has been built.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Builds the YOLO26 classification network with the TensorRT network-definition API and returns
// the serialized engine.
//
//   builder, config : TensorRT builder and builder configuration. The workspace pool limit and,
//                     when USE_FP16 is defined, the FP16 flag are set on `config` here.
//   dt              : data type of the network input tensor.
//   wts_path        : path handed to loadWeights() to obtain the name -> weights map.
//   gd, gw          : forwarded to get_depth() / get_width() (presumably the depth and width
//                     multipliers of the model variant).
//   max_channels    : forwarded to get_width() (presumably a channel cap).
//   type            : model variant; "m", "l" and "x" enable c3k for model.2 / model.4.
//
// The network output is a {kBatchSize, kClsNumClass} tensor of sigmoid-activated scores.
// Returns the result of buildSerializedNetwork(); this is nullptr when the build fails, in which
// case a message is written to stderr. The returned object is not released here.
nvinfer1::IHostMemory* buildEngineYolo26Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);

    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
     ******************************************  YOLO26 INPUT  **********************************************
     *******************************************************************************************************/

    nvinfer1::ITensor* data =
            network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kClsInputH, kClsInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLO26 BACKBONE  ********************************************
    *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* block0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");

    nvinfer1::IElementWiseLayer* block1 = convBnSiLU(network, weightMap, *block0->getOutput(0),
                                                     get_width(128, gw, max_channels), {3, 3}, 2, "model.1");

    // Only the m / l / x variants use c3k in model.2 and model.4.
    bool c3k = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k = true;
    }

    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *block1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.2");

    nvinfer1::IElementWiseLayer* block3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.3");

    nvinfer1::IElementWiseLayer* block4 =
            C3K2(network, weightMap, *block3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.4");

    nvinfer1::IElementWiseLayer* block5 = convBnSiLU(network, weightMap, *block4->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.5");

    nvinfer1::IElementWiseLayer* block6 =
            C3K2(network, weightMap, *block5->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.6");

    nvinfer1::IElementWiseLayer* block7 = convBnSiLU(network, weightMap, *block6->getOutput(0),
                                                     get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");

    nvinfer1::IElementWiseLayer* block8 =
            C3K2(network, weightMap, *block7->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.8");

    nvinfer1::IElementWiseLayer* block9 =
            C2PSA(network, weightMap, *block8->getOutput(0), get_width(1024, gw, max_channels),
                  get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.9");

    /////////////////////////////////////////////////////
    // Classification head (model.10): 1x1 conv to 1280 channels -> global average pool ->
    // linear layer -> sigmoid.

    nvinfer1::IElementWiseLayer* block10_convbn =
            convBnSiLU(network, weightMap, *block9->getOutput(0), 1280, {1, 1}, 1, "model.10.conv");
    // Spatial size of the conv output; used as the pooling window so the pool is global.
    nvinfer1::Dims dims = block10_convbn->getOutput(0)->getDimensions();
    assert(dims.nbDims == 4);
    nvinfer1::IPoolingLayer* block10_pool = network->addPoolingNd(
            *block10_convbn->getOutput(0), nvinfer1::PoolingType::kAVERAGE, nvinfer1::DimsHW{dims.d[2], dims.d[3]});
    nvinfer1::IShuffleLayer* block10_reshape = network->addShuffle(*block10_pool->getOutput(0));
    block10_reshape->setReshapeDimensions(nvinfer1::Dims2{kBatchSize, 1280});
    nvinfer1::IConstantLayer* block10_linear_weight =
            network->addConstant(nvinfer1::Dims2{kClsNumClass, 1280}, weightMap["model.10.linear.weight"]);
    // The bias must be a row vector {1, kClsNumClass} so that it broadcasts over the batch axis of
    // the {kBatchSize, kClsNumClass} matmul result. A column vector {kClsNumClass, 1} would instead
    // broadcast to {kClsNumClass, kClsNumClass} for batch size 1 and add bias[row] rather than
    // bias[class].
    nvinfer1::IConstantLayer* block10_linear_bias =
            network->addConstant(nvinfer1::Dims2{1, kClsNumClass}, weightMap["model.10.linear.bias"]);
    // {kBatchSize, 1280} x transpose({kClsNumClass, 1280}) -> {kBatchSize, kClsNumClass}
    nvinfer1::IMatrixMultiplyLayer* block10_linear_matrix_multiply =
            network->addMatrixMultiply(*block10_reshape->getOutput(0), nvinfer1::MatrixOperation::kNONE,
                                       *block10_linear_weight->getOutput(0), nvinfer1::MatrixOperation::kTRANSPOSE);
    nvinfer1::IElementWiseLayer* block10_linear_add =
            network->addElementWise(*block10_linear_matrix_multiply->getOutput(0), *block10_linear_bias->getOutput(0),
                                    nvinfer1::ElementWiseOperation::kSUM);
    // NOTE(review): a sigmoid is applied to the logits here; confirm this matches the activation
    // the exported model uses at inference time.
    nvinfer1::IActivationLayer* output =
            network->addActivation(*block10_linear_add->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
    assert(output);

    output->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*output->getOutput(0));
    // Use setMemoryPoolLimit instead of deprecated setMaxWorkspaceSize
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cerr << "INT8 not supported for YOLO26 model yet." << std::endl;
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // buildSerializedNetwork() returns nullptr on failure; only announce success when it succeeded.
    if (serialized_model) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;

    // The weight buffers are released with free() once the network has been built.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}


================================================
FILE: yolo26/src/postprocess.cpp
================================================

#include "postprocess.h"
#include "utils.h"

// Converts a box from letterboxed network-input coordinates to pixel
// coordinates of the original image.
//
// img:  original image; only its cols/rows are read.
// bbox: box in network-input space, read as {left, top, right, bottom}.
// Returns the box in image space. The origin is clamped to >= 0 and the
// width/height are clamped so the rectangle does not extend past the image.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float ratio_w = kInputW / (img.cols * 1.0);
    const float ratio_h = kInputH / (img.rows * 1.0);

    float left = bbox[0];
    float top = bbox[1];
    float right = bbox[2];
    float bottom = bbox[3];
    float scale;

    if (ratio_h > ratio_w) {
        // The width ratio was applied: remove the vertical padding.
        const auto pad_y = (kInputH - ratio_w * img.rows) / 2;
        top -= pad_y;
        bottom -= pad_y;
        scale = ratio_w;
    } else {
        // The height ratio was applied: remove the horizontal padding.
        const auto pad_x = (kInputW - ratio_h * img.cols) / 2;
        left -= pad_x;
        right -= pad_x;
        scale = ratio_h;
    }

    // Undo the resize.
    left /= scale;
    right /= scale;
    top /= scale;
    bottom /= scale;

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x0 = int(round(left));
    const int y0 = int(round(top));
    const int box_w = std::max(0, std::min(int(round(right - left)), img.cols - x0));
    const int box_h = std::max(0, std::min(int(round(bottom - top)), img.rows - y0));

    return cv::Rect(x0, y0, box_w, box_h);
}

// Same mapping as get_rect(), but relative to the OBB model input size
// (kObbInputW x kObbInputH).
//
// img:  original image; only its cols/rows are read.
// bbox: box in network-input space, read as {left, top, right, bottom}.
// Returns the box in image space, clamped to the image.
cv::Rect get_rect_obb(cv::Mat& img, float bbox[4]) {
    const float ratio_w = kObbInputW / (img.cols * 1.0);
    const float ratio_h = kObbInputH / (img.rows * 1.0);
    const bool width_limited = ratio_h > ratio_w;

    float x_min = bbox[0];
    float y_min = bbox[1];
    float x_max = bbox[2];
    float y_max = bbox[3];

    if (width_limited) {
        // Padding sits above and below the resized content.
        const auto offset = (kObbInputH - ratio_w * img.rows) / 2;
        y_min -= offset;
        y_max -= offset;
        x_min /= ratio_w;
        x_max /= ratio_w;
        y_min /= ratio_w;
        y_max /= ratio_w;
    } else {
        // Padding sits left and right of the resized content.
        const auto offset = (kObbInputW - ratio_h * img.cols) / 2;
        x_min -= offset;
        x_max -= offset;
        x_min /= ratio_h;
        x_max /= ratio_h;
        y_min /= ratio_h;
        y_max /= ratio_h;
    }

    x_min = std::max(0.0f, x_min);
    y_min = std::max(0.0f, y_min);

    const int origin_x = int(round(x_min));
    const int origin_y = int(round(y_min));
    const int clipped_w = std::max(0, std::min(int(round(x_max - x_min)), img.cols - origin_x));
    const int clipped_h = std::max(0, std::min(int(round(y_max - y_min)), img.rows - origin_y));

    return cv::Rect(origin_x, origin_y, clipped_w, clipped_h);
}

// Maps a box and its keypoints from letterboxed network-input coordinates to
// original-image coordinates.
//
// img:  original image; only its cols/rows are read.
// bbox: box in network-input space, read as {left, top, right, bottom}.
// lmk:  kNumberOfPoints triplets. The first two values of each triplet are
//       rescaled IN PLACE as x and y; the third value is left untouched.
// Returns the box in image space, clamped to the image.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
    const float ratio_w = kInputW / (img.cols * 1.0);
    const float ratio_h = kInputH / (img.rows * 1.0);

    float left, right, top, bottom;
    if (ratio_h > ratio_w) {
        // Vertical padding was added; x only needs rescaling.
        const auto pad_y = (kInputH - ratio_w * img.rows) / 2;
        left = bbox[0] / ratio_w;
        right = bbox[2] / ratio_w;
        top = (bbox[1] - pad_y) / ratio_w;
        bottom = (bbox[3] - pad_y) / ratio_w;
        for (int p = 0; p < kNumberOfPoints; ++p) {
            float* point = lmk + p * 3;
            point[0] /= ratio_w;
            point[1] = (point[1] - pad_y) / ratio_w;
        }
    } else {
        // Horizontal padding was added; y only needs rescaling.
        const auto pad_x = (kInputW - ratio_h * img.cols) / 2;
        left = (bbox[0] - pad_x) / ratio_h;
        right = (bbox[2] - pad_x) / ratio_h;
        top = bbox[1] / ratio_h;
        bottom = bbox[3] / ratio_h;
        for (int p = 0; p < kNumberOfPoints; ++p) {
            float* point = lmk + p * 3;
            point[0] = (point[0] - pad_x) / ratio_h;
            point[1] /= ratio_h;
        }
    }

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x0 = int(round(left));
    const int y0 = int(round(top));
    const int box_w = std::max(0, std::min(int(round(right - left)), img.cols - x0));
    const int box_h = std::max(0, std::min(int(round(bottom - top)), img.rows - y0));

    return cv::Rect(x0, y0, box_w, box_h);
}

// Intersection-over-union of two axis-aligned boxes.
//
// lbox, rbox: boxes read as {x1, y1, x2, y2}.
// Returns a value in [0, 1]. Returns 0 when the boxes do not overlap, and
// also when the union area is not positive (for example two identical
// zero-area boxes), which would otherwise evaluate 0/0 and yield NaN.
static float iou(float lbox[4], float rbox[4]) {
    const float inter_left = (std::max)(lbox[0], rbox[0]);
    const float inter_right = (std::min)(lbox[2], rbox[2]);
    const float inter_top = (std::max)(lbox[1], rbox[1]);
    const float inter_bottom = (std::min)(lbox[3], rbox[3]);

    if (inter_top > inter_bottom || inter_left > inter_right)
        return 0.0f;

    const float inter_area = (inter_right - inter_left) * (inter_bottom - inter_top);
    const float union_area =
            (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - inter_area;

    // Degenerate boxes: avoid dividing by zero.
    if (union_area <= 0.0f)
        return 0.0f;

    return inter_area / union_area;
}

// Ordering predicate for detections: higher confidence first; when the
// confidences are equal, the detection with the smaller bbox[0] comes first.
static bool cmp(const Detection& a, const Detection& b) {
    if (a.conf != b.conf) {
        return a.conf > b.conf;
    }
    return a.bbox[0] < b.bbox[0];
}

// Unpacks the raw output of one image into Detection records.
//
// res:    detections are APPENDED to this vector (it is not cleared).
// output: output[0] holds the number of detections, followed by that many
//         records of sizeof(Detection) / sizeof(float) floats each.
//         A null pointer is treated as "no detections".
void decode(std::vector<Detection>& res, float* output) {
    if (output == nullptr) {
        return;
    }

    const int det_size = sizeof(Detection) / sizeof(float);
    // Read the count once; it is stored as a float in the first slot.
    const float count = output[0];

    for (int i = 0; i < count; i++) {
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        res.push_back(det);
    }
}

void batch_decode(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size) {
    res_batch.resize(batch_size);
    for (int i = 0; i < batch_size; i++) {
        decode(res_batch[i], &output[i * output_size]);
    }
}

// Draws every detection of every image as a rectangle with its integer class
// id as the label.
//
// img_batch: images to draw on.
// res_batch: res_batch[k] holds the detections of img_batch[k]; it is indexed
//            with the image index, so it must be at least as long.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

    for (size_t n = 0; n < img_batch.size(); n++) {
        auto& detections = res_batch[n];
        // cv::Mat copy: a new header referring to the same pixel data.
        cv::Mat canvas = img_batch[n];
        for (auto& det : detections) {
            const cv::Rect box = get_rect(canvas, det.bbox);
            cv::rectangle(canvas, box, box_color, 2);
            const std::string label = std::to_string((int)det.class_id);
            cv::putText(canvas, label, cv::Point(box.x, box.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
        }
    }
}

// Draws the box, class id, visible keypoints and skeleton lines of every
// detection.
//
// img_batch: images to draw on.
// res_batch: res_batch[k] holds the detections of img_batch[k].
//
// NOTE: get_rect_adapt_landmark() rescales each detection's keypoints in
// place, so the detections in res_batch are modified by this call.
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    // Keypoint index pairs joined by a line. Built once instead of on every call.
    static const std::vector<std::pair<int, int>> skeleton_pairs = {
            {0, 1}, {0, 2},  {0, 5}, {0, 6},  {1, 2},   {1, 3},   {2, 4},   {5, 6},   {5, 7},  {5, 11},
            {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};

    for (size_t i = 0; i < img_batch.size(); i++) {
        auto& res = res_batch[i];
        cv::Mat img = img_batch[i];
        for (size_t j = 0; j < res.size(); j++) {
            cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints);
            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
                        cv::Scalar(0xFF, 0xFF, 0xFF), 2);

            // Each keypoint is a triplet; the third value is compared against 0.5
            // to decide whether the point is drawn.
            for (int k = 0; k < kNumberOfPoints * 3; k += 3) {
                if (res[j].keypoints[k + 2] > 0.5) {
                    cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3,
                               cv::Scalar(0, 0x27, 0xC1), -1);
                }
            }

            for (const auto& bone : skeleton_pairs) {
                // The table references indices up to 16; skip bones that do not
                // exist when the model is configured with fewer keypoints.
                if (bone.first >= kNumberOfPoints || bone.second >= kNumberOfPoints) {
                    continue;
                }
                int kp1_idx = bone.first * 3;
                int kp2_idx = bone.second * 3;
                if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) {
                    cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]);
                    cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]);
                    cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2);
                }
            }
        }
    }
}

// Cuts the non-padded region out of a network-input-sized mask and resizes it
// to the size of the original image.
//
// mask: mask in network-input space (the ROI is computed from kInputW/kInputH).
// img:  original image; only its size is used.
// Returns a new mask with img.size().
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    const float ratio_w = kInputW / (img.cols * 1.0);
    const float ratio_h = kInputH / (img.rows * 1.0);

    // Region of the mask that holds image content (i.e. without the padding).
    cv::Rect content;
    if (ratio_h > ratio_w) {
        content.width = kInputW;
        content.height = ratio_w * img.rows;
        content.x = 0;
        content.y = (kInputH - content.height) / 2;
    } else {
        content.width = ratio_h * img.cols;
        content.height = kInputH;
        content.x = (kInputW - content.width) / 2;
        content.y = 0;
    }

    cv::Mat scaled;
    cv::resize(mask(content), scaled, img.size());
    return scaled;
}

// Blends each detection's mask into the image and draws its box and label.
//
// img:        image to draw on (accessed as 3-channel 8-bit pixels).
// dets:       detections; dets[k] is paired with masks[k].
// masks:      per-detection masks in network-input space, read as float.
// labels_map: class id -> label text. Looked up with operator[], so a missing
//             id is inserted with an empty label.
//
// Only min(dets.size(), masks.size()) detections are drawn, so a short mask
// list can no longer cause an out-of-range read.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    const size_t count = std::min(dets.size(), masks.size());
    for (size_t i = 0; i < count; i++) {
        cv::Mat img_mask = scale_mask(masks[i], img);
        // Palette entries are 0xRRGGBB; unpack them in B, G, R order.
        auto color = colors[(int)dets[i].class_id % colors.size()];
        auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

        cv::Rect r = get_rect(img, dets[i].bbox);
        // Walk row by row so memory is accessed contiguously; every pixel is
        // independent, so the result does not depend on the traversal order.
        for (int y = r.y; y < r.y + r.height; y++) {
            for (int x = r.x; x < r.x + r.width; x++) {
                float val = img_mask.at<float>(y, x);
                if (val <= 0.5)
                    continue;
                // 50/50 blend of the pixel with the class color.
                cv::Vec3b& px = img.at<cv::Vec3b>(y, x);
                px[0] = px[0] / 2 + bgr[0] / 2;
                px[1] = px[1] / 2 + bgr[1] / 2;
                px[2] = px[2] / 2 + bgr[2] / 2;
            }
        }

        cv::rectangle(img, r, bgr, 2);

        // Build the label once; it is used for both measuring and drawing.
        const std::string label = labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf);
        cv::Size textSize = cv::getTextSize(label, cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);

        // Filled background behind the label, anchored at the box's top-left corner.
        cv::Point topLeft(r.x, r.y - textSize.height);
        cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
        cv::rectangle(img, topLeft, bottomRight, bgr, -1);

        cv::putText(img, label, cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
    }
}

// Computes the three covariance terms used by probiou() for a rotated box.
//
// res: detection whose bbox[2] / bbox[3] are used as width / height and whose
//      `angle` is passed to cos/sin (i.e. treated as radians).
// Returns (a, b, c) where, with va = w*w/12 and vb = h*h/12:
//   a = va*cos^2 + vb*sin^2,  b = va*sin^2 + vb*cos^2,  c = (va - vb)*cos*sin.
std::tuple<float, float, float> convariance_matrix(Detection res) {
    const float box_w = res.bbox[2];
    const float box_h = res.bbox[3];
    const float theta = res.angle;

    const float var_w = box_w * box_w / 12.0;
    const float var_h = box_h * box_h / 12.0;

    const float cos_t = std::cos(theta);
    const float sin_t = std::sin(theta);
    const float cos_sq = cos_t * cos_t;
    const float sin_sq = sin_t * sin_t;

    const float term_a = var_w * cos_sq + var_h * sin_sq;
    const float term_b = var_w * sin_sq + var_h * cos_sq;
    const float term_c = (var_w - var_h) * cos_t * sin_t;

    return std::make_tuple(term_a, term_b, term_c);
}

// Probabilistic IoU between two oriented boxes
// (https://arxiv.org/pdf/2106.06072v1.pdf).
//
// res1, res2: detections; bbox[0]/bbox[1] are used as the box centre and
//             convariance_matrix() supplies the covariance terms.
// eps:        small constant added to denominators and log/sqrt arguments.
// Returns 1 - Hellinger distance, computed from the Bhattacharyya distance
// clamped to [eps, 100].
static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) {
    // Unpack the covariance terms directly. The previous code first built
    // tuples from uninitialised floats, which reads indeterminate values.
    float a1, b1, c1, a2, b2, c2;
    std::tie(a1, b1, c1) = convariance_matrix(res1);
    std::tie(a2, b2, c2) = convariance_matrix(res2);

    float x1 = res1.bbox[0], y1 = res1.bbox[1];
    float x2 = res2.bbox[0], y2 = res2.bbox[1];

    // Determinant of the summed covariance; shared by all three terms.
    const auto det_sum = (a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2);

    float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) / (det_sum + eps);
    float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / (det_sum + eps);
    float t3 = std::log(
            det_sum /
                    (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) +
                     eps) +
            eps);

    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    bd = std::max(std::min(bd, 100.0f), eps);
    float hd = std::sqrt(1.0 - std::exp(-bd) + eps);

    return 1 - hd;
}

// Unpacks the raw OBB output of one image into Detection records.
//
// res:    detections are APPENDED to this vector (it is not cleared).
// output: output[0] holds the number of detections, followed by that many
//         records of sizeof(Detection) / sizeof(float) floats each.
//         A null pointer is treated as "no detections".
void decode_obb(std::vector<Detection>& res, float* output) {
    if (output == nullptr) {
        return;
    }

    const int det_size = sizeof(Detection) / sizeof(float);
    // Read the count once; it is stored as a float in the first slot.
    const float count = output[0];

    for (int i = 0; i < count; i++) {
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        res.push_back(det);
    }
}

void batch_decode_obb(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size) {
    res_batch.resize(batch_size);
    for (int i = 0; i < batch_size; i++) {
        decode_obb(res_batch[i], &output[i * output_size]);
    }
}

// Computes the four corner points of an oriented box in original-image
// coordinates.
//
// img: original image; used for coordinate mapping and for clamping.
// box: detection whose bbox is read as {center_x, center_y, width, height} in
//      network-input space and whose `angle` is converted from radians.
// Returns 4 points, each clamped to lie inside the image.
//
// NOTE: the three std::cout lines are diagnostic output kept from the
// original implementation; they print once per box.
static std::vector<cv::Point> get_corner(cv::Mat& img, const Detection& box) {
    float cos_value, sin_value;

    // Center point and width/height
    float x1 = box.bbox[0];
    float y1 = box.bbox[1];
    float w = box.bbox[2];
    float h = box.bbox[3];
    float angle = box.angle * 180.0f / CV_PI;  // Convert radians to degrees

    std::cout << "Original angle: " << angle << std::endl;

    // Make `w` the longer side; rotating by 90 degrees keeps the same rectangle.
    if (h >= w) {
        std::swap(w, h);
        angle = fmod(angle + 90.0f, 180.0f);
    }

    // Fold the angle towards the [0, 180] range.
    if (angle < 0) {
        angle += 360.0f;
    }
    if (angle > 180.0f) {
        angle -= 180.0f;
    }

    std::cout << "Adjusted angle: " << angle << std::endl;

    // Only used for the diagnostic print below.
    float normal_angle = fmod(angle, 180.0f);
    if (normal_angle < 0) {
        normal_angle += 180.0f;
    }

    std::cout << "Normal angle: " << normal_angle << std::endl;

    cos_value = std::cos(angle * CV_PI / 180.0f);  // Back to radians
    sin_value = std::sin(angle * CV_PI / 180.0f);

    // Axis-aligned extent of the unrotated box, in network-input space.
    float l = x1 - w / 2;
    float r = x1 + w / 2;
    float t = y1 - h / 2;
    float b = y1 + h / 2;

    // Map that extent to original-image coordinates.
    float bbox[4] = {l, t, r, b};
    cv::Rect rect = get_rect_obb(img, bbox);

    // Divide by 2.0f: the rect fields are ints, and integer division would
    // drop the half pixel of the centre for odd widths/heights.
    float x_ = (rect.x + rect.x + rect.width) / 2.0f;   // Center x
    float y_ = (rect.y + rect.y + rect.height) / 2.0f;  // Center y
    float width = rect.width;
    float height = rect.height;

    // Half-extent vectors along the rotated width and height axes.
    std::vector<cv::Point> corner_points(4);
    float vec1x = width / 2 * cos_value;
    float vec1y = width / 2 * sin_value;
    float vec2x = -height / 2 * sin_value;
    float vec2y = height / 2 * cos_value;

    corner_points[0] = cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y)));
    corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y)));
    corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y)));
    corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y)));

    // Clamp every corner into the image so drawing never goes out of bounds.
    for (auto& point : corner_points) {
        point.x = std::max(0, std::min(point.x, img.cols - 1));
        point.y = std::max(0, std::min(point.y, img.rows - 1));
    }

    return corner_points;
}

// Draws every oriented detection as a closed polygon with a "class:conf"
// label attached to its first corner.
//
// img_batch: images to draw on.
// res_batch: res_batch[k] holds the detections of img_batch[k].
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};

    for (size_t n = 0; n < img_batch.size(); n++) {
        auto& detections = res_batch[n];
        auto& canvas = img_batch[n];

        for (auto& det : detections) {
            // Palette entries are 0xRRGGBB; unpack them in B, G, R order.
            auto packed = colors[(int)det.class_id % colors.size()];
            auto bgr = cv::Scalar(packed & 0xFF, packed >> 8 & 0xFF, packed >> 16 & 0xFF);

            auto corners = get_corner(canvas, det);
            cv::polylines(canvas, std::vector<std::vector<cv::Point>>{corners}, true, bgr, 1);

            auto label = (std::to_string((int)(det.class_id)) + ":" + to_string_with_precision(det.conf));
            cv::Size label_size = cv::getTextSize(label, 0, 0.3, 1, nullptr);
            const int label_w = label_size.width;
            const int label_h = label_size.height;

            // Put the label above the anchor corner when there is room, else below it.
            const cv::Point anchor = corners[0];
            const bool above = anchor.y - label_h >= 3;

            cv::Point bg_start(anchor.x, anchor.y);
            cv::Point bg_end;
            bg_end.x = anchor.x + label_w;
            bg_end.y = above ? anchor.y - label_h - 3 : anchor.y + label_h + 3;
            cv::rectangle(canvas, bg_start, bg_end, bgr, -1, cv::LINE_AA);

            const int baseline_y = above ? anchor.y - 2 : anchor.y + label_h + 2;
            cv::putText(canvas, label, cv::Point(anchor.x, baseline_y), 0, 0.3, cv::Scalar::all(255), 1,
                        cv::LINE_AA);
        }
    }
}


================================================
FILE: yolo26/src/preprocess.cu
================================================
#include "cuda_utils.h"
#include "preprocess.h"

// Staging buffers shared by every cuda_preprocess() call in this file.
// Both are allocated in cuda_preprocess_init() (cudaMallocHost / cudaMalloc)
// and released in cuda_preprocess_destroy().
static uint8_t* img_buffer_host = nullptr;
static uint8_t* img_buffer_device = nullptr;

// Affine warp with bilinear sampling; one thread produces one destination pixel.
//
// src:            source pixels, 3 bytes per pixel, rows `src_line_size` bytes apart.
// src_width/height: source size in pixels.
// dst:            destination buffer written as three planes of
//                 dst_width * dst_height floats each.
// const_value_st: value used for samples that fall outside the source.
// d2s:            6 affine coefficients mapping destination (dx, dy) to source.
// edge:           number of destination pixels; extra threads exit early.
//
// Source channels 0 and 2 are swapped on output and every value is divided by 255.
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    const int pixel = blockDim.x * blockIdx.x + threadIdx.x;
    if (pixel >= edge)
        return;

    const int dx = pixel % dst_width;
    const int dy = pixel / dst_width;

    // Destination -> source coordinates.
    const float src_x = d2s.value[0] * dx + d2s.value[1] * dy + d2s.value[2] + 0.5f;
    const float src_y = d2s.value[3] * dx + d2s.value[4] * dy + d2s.value[5] + 0.5f;

    // Default: the whole sample lies outside the source.
    float c0 = const_value_st;
    float c1 = const_value_st;
    float c2 = const_value_st;

    const bool outside = src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height;
    if (!outside) {
        const int y_low = floorf(src_y);
        const int x_low = floorf(src_x);
        const int y_high = y_low + 1;
        const int x_high = x_low + 1;

        // Stand-in pixel for neighbours that fall outside the source.
        uint8_t border[] = {const_value_st, const_value_st, const_value_st};

        const float ly = src_y - y_low;
        const float lx = src_x - x_low;
        const float hy = 1 - ly;
        const float hx = 1 - lx;
        const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

        const bool top_ok = y_low >= 0;
        const bool bottom_ok = y_high < src_height;
        const bool left_ok = x_low >= 0;
        const bool right_ok = x_high < src_width;

        uint8_t* v1 = (top_ok && left_ok) ? src + y_low * src_line_size + x_low * 3 : border;
        uint8_t* v2 = (top_ok && right_ok) ? src + y_low * src_line_size + x_high * 3 : border;
        uint8_t* v3 = (bottom_ok && left_ok) ? src + y_high * src_line_size + x_low * 3 : border;
        uint8_t* v4 = (bottom_ok && right_ok) ? src + y_high * src_line_size + x_high * 3 : border;

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // Interleaved pixel -> planar layout, with channels 0 and 2 swapped
    // (bgr to rgb) and values scaled by 1/255.
    const int area = dst_width * dst_height;
    float* plane0 = dst + dy * dst_width + dx;
    float* plane1 = plane0 + area;
    float* plane2 = plane1 + area;
    *plane0 = c2 / 255.0f;
    *plane1 = c1 / 255.0f;
    *plane2 = c0 / 255.0f;
}

void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    int img_size = src_width * src_height * 3;
    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));

    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = ceil(jobs / (float)threads);
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
}

// Preprocesses a batch of images one after another into consecutive slots of `dst`.
//
// img_batch: source images.
// dst:       device buffer with room for img_batch.size() slots of
//            dst_width * dst_height * 3 floats.
// stream:    stream used for every image; it is synchronized after each one
//            (cuda_preprocess() stages each image through the same buffers).
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int slot_elems = dst_width * dst_height * 3;
    size_t slot = 0;
    for (cv::Mat& image : img_batch) {
        float* slot_dst = &dst[slot_elems * slot];
        cuda_preprocess(image.ptr(), image.cols, image.rows, slot_dst, dst_width, dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        ++slot;
    }
}

// Allocates the staging buffers used by cuda_preprocess().
//
// max_image_size: largest image, in pixels, that will be preprocessed; each
//                 buffer is sized at 3 bytes per pixel.
void cuda_preprocess_init(int max_image_size) {
    const int staging_bytes = max_image_size * 3;
    // Host side: allocated with cudaMallocHost.
    CUDA_CHECK(cudaMallocHost(reinterpret_cast<void**>(&img_buffer_host), staging_bytes));
    // Device side: allocated with cudaMalloc.
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&img_buffer_device), staging_bytes));
}

// Releases the staging buffers allocated by cuda_preprocess_init().
//
// The pointers are reset to nullptr afterwards, so calling this twice (or
// without a prior init) is harmless instead of freeing a stale pointer.
void cuda_preprocess_destroy() {
    if (img_buffer_device != nullptr) {
        CUDA_CHECK(cudaFree(img_buffer_device));
        img_buffer_device = nullptr;
    }
    if (img_buffer_host != nullptr) {
        CUDA_CHECK(cudaFreeHost(img_buffer_host));
        img_buffer_host = nullptr;
    }
}

================================================
FILE: yolo26/yolo26_cls.cpp
================================================
#include <fstream>
#include <iostream>
#include <numeric>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "types.h"
#include "utils.h"

#include "yololayer.h"

// Logger instance passed to createInferBuilder() / createInferRuntime() below.
Logger gLogger;
using namespace nvinfer1;
// Number of output floats per image: one value per class.
const static int kOutputSize = kClsNumClass;

// CPU preprocessing for classification: centre-crop each image to a square,
// resize it, convert BGR -> RGB, scale to [0, 1] and store it as planar CHW.
//
// imgs:       input images (converted with COLOR_BGR2RGB, so expected as BGR).
// output:     host buffer with room for imgs.size() * 3 * dst_height * dst_width floats.
// dst_width / dst_height: target size.
void batch_preprocess(std::vector<cv::Mat>& imgs, float* output, int dst_width = 224, int dst_height = 224) {
    const int plane = dst_height * dst_width;

    for (size_t b = 0; b < imgs.size(); b++) {
        // Largest centred square of the image.
        const cv::Mat& source = imgs[b];
        const int side = std::min(source.rows, source.cols);
        const cv::Rect crop((source.cols - side) / 2, (source.rows - side) / 2, side, side);

        cv::Mat img = source(crop);
        cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR);
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
        img.convertTo(img, CV_32F, 1 / 255.0);

        std::vector<cv::Mat> channels(3);
        cv::split(img, channels);

        // CHW format: each channel becomes one contiguous plane of this sample.
        float* sample = output + b * 3 * plane;
        for (int c = 0; c < 3; ++c) {
            float* dst_plane = sample + c * plane;
            for (int row = 0; row < dst_height; ++row) {
                const float* src_row = channels[c].ptr<float>(row);
                float* dst_row = dst_plane + row * dst_width;
                for (int col = 0; col < dst_width; ++col) {
                    dst_row[col] = src_row[col];
                }
            }
        }
    }
}

void serialize_engine(const std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels,
                      std::string& type) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine =
            buildEngineYolo26Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Allocates the device and host buffers for one batch of classification I/O.
//
// engine: asserted to have exactly two bindings, with the input at index 0
//         and the output at index 1.
// input_buffer_device / output_buffer_device: receive cudaMalloc'ed buffers.
// input_buffer_host / output_buffer_host:     receive new[]-allocated buffers.
// The caller releases all four.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** input_buffer_host, float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);
    // Binding indices are looked up by tensor name and must match the fixed
    // positions that infer() uses (buffers[0] = input, buffers[1] = output).
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);

    const size_t input_bytes = kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float);
    const size_t output_bytes = kBatchSize * kOutputSize * sizeof(float);

    // Device side
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, input_bytes));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, output_bytes));

    // Host side
    *input_buffer_host = new float[kBatchSize * 3 * kClsInputH * kClsInputW];
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}

// Runs one batch through the engine: upload input, enqueue, download output,
// then wait for the stream.
//
// context:   execution context to enqueue on.
// stream:    stream used for both copies and the inference.
// buffers:   device bindings; buffers[0] is the input, buffers[1] the output.
// input:     host input, batchSize * 3 * kClsInputH * kClsInputW floats.
// output:    host output, batchSize * kOutputSize floats.
//
// If the enqueue is rejected, an error is printed, the stream is drained and
// `output` is left untouched instead of being filled with stale device data.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output,
           int batchSize) {
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "enqueueV2 failed; inference output is not valid" << std::endl;
        CUDA_CHECK(cudaStreamSynchronize(stream));
        return;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                               stream));
    // Checked like every other CUDA call in this file.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Returns the indices of the k largest values of `vec`, largest first.
//
// vec: scores.
// k:   number of indices wanted; values below 0 are treated as 0 and values
//      above vec.size() are clamped to vec.size().
//
// Equal scores are ordered by ascending index, so the result is deterministic.
// Only the first k positions are sorted (std::partial_sort).
std::vector<int> topk(const std::vector<float>& vec, int k) {
    const int k_num = std::max(0, std::min<int>(vec.size(), k));

    std::vector<size_t> vec_index(vec.size());
    std::iota(vec_index.begin(), vec_index.end(), 0);

    std::partial_sort(vec_index.begin(), vec_index.begin() + k_num, vec_index.end(),
                      [&vec](size_t index_1, size_t index_2) {
                          if (vec[index_1] != vec[index_2]) {
                              return vec[index_1] > vec[index_2];
                          }
                          return index_1 < index_2;
                      });

    std::vector<int> topk_index;
    topk_index.reserve(k_num);
    for (int i = 0; i < k_num; ++i) {
        topk_index.push_back(static_cast<int>(vec_index[i]));
    }

    return topk_index;
}

int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string type;
    int model_bboxes = 0;
    float gd = 0, gw = 0;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolo26_cls -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolo26_cls -d [.engine] ../images  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, gd, gw, max_channels, type);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* input_buffer_host = nullptr;
    float* output_buffer_host = nullptr;
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &input_buffer_host, &output_buffer_host);

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // Read imagenet labels
    auto classes = read_classes("imagenet_classes.txt");

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }

        // Preprocess
        batch_preprocess(img_batch, input_buffer_host, kClsInputW, kClsInputH);

        std::ofstream p("engine_input.txt");
        if (!p) {
            std::cout << "could not open input file" << std::endl;
            assert(false);
        }
        for (int i = 0; i < kBatchSize * 3 * kClsInputH * kClsInputW; i++) {
            p << input_buffer_host[i] << "\n";
        }
        p.close();

        // Run inference
        auto start = std::chrono::system_clock::now();
        infer(*context, stream, (void**)device_buffers, input_buffer_host, output_buffer_host, kBatchSize);
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;

        // Postprocess and get top-k result
        for (size_t b = 0; b < img_name_batch.size(); b++) {
            float* p = &output_buffer_host[b * kOutputSize];
            std::vector<float> prob(p, p + kOutputSize);
            auto topk_idx = topk(prob, 3);
            std::cout << img_name_batch[b] << std::endl;
            for (auto idx : topk_idx) {
                std::cout << "  " << classes[idx] << " " << p[idx] << std::endl;
            }
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    delete[] input_buffer_host;
    delete[] output_buffer_host;
    delete context;
    delete engine;
    delete runtime;
    return 0;
}

================================================
FILE: yolo26/yolo26_det.cpp
================================================
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "types.h"
#include "utils.h"

#include "yololayer.h"

// Logger instance handed to the TensorRT builder/runtime factory calls in this file.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in one image's output slot: room for kMaxNumOutputBbox
// Detection records plus one extra float.
// NOTE(review): the extra float is presumably a leading detection count - confirm against the decode code.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

void serialize_engine(const std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels,
                      std::string& type) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine =
            buildEngineYolo26Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Allocates the device input/output buffers and the host output buffer for a
// full batch.
//   engine               - engine whose bindings are checked (exactly two: input at 0, output at 1)
//   input_buffer_device  - receives a device buffer of kBatchSize * 3 * kInputH * kInputW floats
//   output_buffer_device - receives a device buffer of kBatchSize * kOutputSize floats
//   output_buffer_host   - receives a new[]-allocated host buffer of kBatchSize * kOutputSize floats
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);

    // Look the tensors up by name and verify they sit at the binding slots the
    // rest of this file hard-codes (buffers[0] = input, buffers[1] = output).
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);

    const size_t input_bytes = kBatchSize * 3 * kInputH * kInputW * sizeof(float);
    const size_t output_bytes = kBatchSize * kOutputSize * sizeof(float);

    // Device-side buffers
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, input_bytes));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, output_bytes));

    // Host-side landing area for the copied-back output
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}

// Runs one batch through the engine and copies the raw output back to the host.
//   context      - execution context used to enqueue the batch
//   stream       - CUDA stream on which inference and the copy are queued
//   buffers      - binding array; buffers[1] is the device output buffer
//   output       - host destination, must hold batchsize * kOutputSize floats
//   batchsize    - number of output slots to copy back
//   model_bboxes - unused here; kept so existing callers keep compiling
// On return the stream has been synchronized, so `output` is valid.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           int model_bboxes) {
    (void)model_bboxes;

    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueueV2(buffers, stream, nullptr);

    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                               stream));

    // Both calls above only queue work. Wait for the stream to drain BEFORE
    // reading the clock, otherwise the reported time is just the enqueue
    // overhead rather than the actual inference + copy duration.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    auto end = std::chrono::system_clock::now();
    std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
              << "ms" << std::endl;
}

int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string type;
    int model_bboxes = 0;
    float gd = 0, gw = 0;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolo26_det -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolo26_det -d [.engine] ../images  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, gd, gw, max_channels, type);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;

    // WARN: If you change kMaxNumOutputBbox, it must be smaller than the value kMaxNumOutputBbox in config.h,
    // otherwise there will be memory overflow!
    // Or you should modify the config.h and recompile.
    setPluginDeviceParams(kConfThresh);

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host);

    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }

        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);

        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, model_bboxes);

        std::vector<std::vector<Detection>> res_batch;
        batch_decode(res_batch, output_buffer_host, kBatchSize, kOutputSize);

        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);

        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    return 0;
}

================================================
FILE: yolo26/yolo26_obb.cpp
================================================
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "types.h"
#include "utils.h"
#include "yololayer.h"

// Logger instance handed to the TensorRT builder/runtime factory calls in this file.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in one image's output slot: room for kMaxNumOutputBbox
// Detection records plus one extra float.
// NOTE(review): the extra float is presumably a leading detection count - confirm against the decode code.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

void serialize_engine(const std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels,
                      std::string& type) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine =
            buildEngineYolo26Obb(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Allocates the device input/output buffers and the host output buffer for a
// full batch of OBB inference.
//   engine               - engine whose bindings are checked (exactly two: input at 0, output at 1)
//   input_buffer_device  - receives a device buffer of kBatchSize * 3 * kObbInputH * kObbInputW floats
//   output_buffer_device - receives a device buffer of kBatchSize * kOutputSize floats
//   output_buffer_host   - receives a new[]-allocated host buffer of kBatchSize * kOutputSize floats
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);

    // Look the tensors up by name and verify they sit at the binding slots the
    // rest of this file hard-codes (buffers[0] = input, buffers[1] = output).
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);

    const size_t input_bytes = kBatchSize * 3 * kObbInputH * kObbInputW * sizeof(float);
    const size_t output_bytes = kBatchSize * kOutputSize * sizeof(float);

    // Device-side buffers
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, input_bytes));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, output_bytes));

    // Host-side landing area for the copied-back output
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}

// Runs one batch through the engine and copies the raw output back to the host.
//   context      - execution context used to enqueue the batch
//   stream       - CUDA stream on which inference and the copy are queued
//   buffers      - binding array; buffers[1] is the device output buffer
//   output       - host destination, must hold batchsize * kOutputSize floats
//   batchsize    - number of output slots to copy back
//   model_bboxes - unused here; kept so existing callers keep compiling
// On return the stream has been synchronized, so `output` is valid.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           int model_bboxes) {
    (void)model_bboxes;

    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueueV2(buffers, stream, nullptr);

    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                               stream));

    // Both calls above only queue work. Wait for the stream to drain BEFORE
    // reading the clock, otherwise the reported time is just the enqueue
    // overhead rather than the actual inference + copy duration.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    auto end = std::chrono::system_clock::now();
    std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
              << "ms" << std::endl;
}

// Entry point. Per the usage text below there are two modes:
//   -s : build the OBB engine from a .wts file, write it to disk and exit;
//   -d : load an engine, run inference on every image in a directory and
//        write each annotated image to "_<file name>" in the working directory.
// Returns 0 on success and -1 on bad arguments or an unreadable image directory.
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string type;
    // Assigned from the engine's output binding further down; only forwarded to infer().
    int model_bboxes;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolo26_obb -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolo26_obb -d [.engine] ../images  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, gd, gw, max_channels, type);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // First dimension of binding 1 (the output tensor, see prepare_buffer's index asserts).
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;

    setPluginDeviceParams(kConfThresh);

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        // NOTE(review): this early return skips the stream/engine/runtime cleanup at the end of main.
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images (the last batch may hold fewer than kBatchSize)
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            // NOTE(review): the result of cv::imread is not checked before being queued for preprocessing.
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kObbInputW, kObbInputH, stream);

        // Always copies back kBatchSize output slots, even for a partial last batch.
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, model_bboxes);

        // Decode only as many slots as images were loaded for this batch.
        std::vector<std::vector<Detection>> res_batch;
        batch_decode_obb(res_batch, output_buffer_host, img_batch.size(), kOutputSize);
        draw_bbox_obb(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    return 0;
}

================================================
FILE: yolop/CMakeLists.txt
================================================
# Build script for the YOLOP TensorRT sample: produces the `myplugins` shared
# library (YOLO layer plugin) and the `yolop` executable.
cmake_minimum_required(VERSION 2.6)

project(yolop)

add_definitions(-std=c++11)

# option() is option(<variable> "<help text>" [value]); without a help string
# the word OFF would be taken as the description instead of the value.
# Must be set before find_package(CUDA) so FindCUDA picks it up.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Release)

find_package(CUDA  REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)

find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

# cuda
# NOTE: hard-coded install location; adjust to the local CUDA toolkit path.
include_directories(/usr/local/cuda-10.2/include)
link_directories(/usr/local/cuda-10.2/lib64)
# tensorrt
# NOTE: hard-coded aarch64 (Jetson-style) paths; adjust on other platforms.
include_directories(/usr/include/aarch64-linux-gnu/)
link_directories(/usr/lib/aarch64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# to generate plugins
cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# to generate trt and test image dir
add_executable(yolop ${PROJECT_SOURCE_DIR}/yolop.cpp)
target_link_libraries(yolop nvinfer cudart myplugins ${OpenCV_LIBS})
add_definitions(-O3 -pthread)


================================================
FILE: yolop/README.md
================================================
YoloP
=====

The original pytorch model is from [hustvl/YOLOP](https://github.com/hustvl/YOLOP)

## Authors

<a href="https://github.com/ausk"><img src="https://avatars.githubusercontent.com/u/4545060?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/aliceint"><img src="https://avatars.githubusercontent.com/u/15520773?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/mantuoluozk"><img src="https://avatars.githubusercontent.com/u/43333969?v=4?s=48" width="40px;" alt=""/></a>

## 1. Prepare building environments

Make sure you have installed `c++` (with C++11 support), `cmake`, `opencv` (4.x), `cuda` (10.x), and `nvinfer` (7.x).


## 2. build yolop

Go to `yolop`.

```
mkdir build
cd build

cmake ..
make
```

Now you can get `yolop` and `libmyplugins.so`.


## 3. Test in C++

Go to `yolop/build`.

### 3.1 generate yolop.wts
Download/Clone [YOLOP](https://github.com/hustvl/YOLOP)

Edit `gen_wts.py` , change `YOLOP_BASE_DIR` to realpath of `YOLOP`.

```
# [WARN] Please download/clone YOLOP, then set YOLOP_BASE_DIR to the root of YOLOP
python3 ../gen_wts.py
```

### 3.2 generate yolop.trt
```
./yolop -s yolop.wts  yolop.trt
```

You should now have the following files: `libmyplugins.so yolop yolop.wts  yolop.trt`


### 3.3 test yolop.trt
```
mkdir ../results

YOLOP_BASE_DIR=/home/user/jetson/tmp/YOLOP
./yolop -d yolop.trt  $YOLOP_BASE_DIR/inference/images/
```

If successful, the output will look like the following (tested on `Jetson Xavier NX - Jetpack 4.4`):
```
1601ms # the first run is slow
26ms   # then it is faster
29ms
27ms
29ms
29ms
```

![](https://user-images.githubusercontent.com/4545060/197756635-38348dc5-d8e7-4ae3-be56-6b231dd2f5db.jpg)


## 4. Test in python3
Go to `yolop`.

Make sure you have installed `pycuda` and `tensorrt`, and modify `image_dir` to point to your image directory.

```
# usage: xxx <engine file> <plugin file> <image dir>

python3 yolop_trt.py  build/yolop.trt  build/libmyplugins.so /home/user/jetson/tmp/YOLOP/inference/images
```

If successful, the output will look like the following (tested on `Jetson Xavier NX - Jetpack 4.4`):
```
usage: xxx <engine file> <plugin file> <image dir>
[WARN] preaprea you image_dir, such as: samples, or /home/user/jetson/tmp/YOLOP/inference/images
bingding:  data (3, 384, 640)
bingding:  det (6001, 1, 1)
bingding:  seg (1, 360, 640)
bingding:  lane (1, 360, 640)
batch size is 1
warm_up->(384, 640, 3), time->1070.87ms
input->['/home/user/jetson/tmp/YOLOP/inference/images/3c0e7240-96e390d2.jpg'], time->25.94ms, saving into output/
input->['/home/user/jetson/tmp/YOLOP/inference/images/adb4871d-4d063244.jpg'], time->25.34ms, saving into output/
input->['/home/user/jetson/tmp/YOLOP/inference/images/8e1c1ab0-a8b92173.jpg'], time->25.03ms, saving into output/
input->['/home/user/jetson/tmp/YOLOP/inference/images/7dd9ef45-f197db95.jpg'], time->25.45ms, saving into output/
input->['/home/user/jetson/tmp/YOLOP/inference/images/9aa94005-ff1d4c9a.jpg'], time->24.93ms, saving into output/
input->['/home/user/jetson/tmp/YOLOP/inference/images/0ace96c3-48481887.jpg'], time->25.33ms, saving into output/
done!
```

![](https://user-images.githubusercontent.com/4545060/198003852-204f3bae-18ad-44fb-9ecd-4a2a07a726a3.jpg)


**Notice** : The results of c++ and python are not aligned for now!

----------------------------------------

```BibTeX
@misc{2108.11250,
Author = {Dong Wu and Manwen Liao and Weitian Zhang and Xinggang Wang},
Title = {YOLOP: You Only Look Once for Panoptic Driving Perception},
Year = {2021},
Eprint = {arXiv:2108.11250},
}
```


================================================
FILE: yolop/common.hpp
================================================
#pragma once

#include <fstream>
#include <map>
#include <sstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "yololayer.h"

using namespace nvinfer1;

// Maps a box from network-input coordinates back onto the original image.
//   img  - original image; only its cols/rows are read
//   bbox - {center x, center y, width, height} in network-input pixels
// The smaller of the two input/image ratios is used as the scale; along the
// other axis the letterbox offset is subtracted before scaling.
// Every intermediate edge is stored in an int, so values are truncated both
// before and after the division by the scale.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float ratio_w = Yolo::INPUT_W / (img.cols * 1.0);
    const float ratio_h = Yolo::INPUT_H / (img.rows * 1.0);

    // When ratio_h > ratio_w the width ratio is the scale and the offset is vertical;
    // otherwise the height ratio is the scale and the offset is horizontal.
    const bool use_width_ratio = ratio_h > ratio_w;
    const float scale = use_width_ratio ? ratio_w : ratio_h;
    const float offset_x = use_width_ratio ? 0.f : (Yolo::INPUT_W - ratio_h * img.cols) / 2;
    const float offset_y = use_width_ratio ? (Yolo::INPUT_H - ratio_w * img.rows) / 2 : 0.f;

    int left = bbox[0] - bbox[2] / 2.f - offset_x;
    int right = bbox[0] + bbox[2] / 2.f - offset_x;
    int top = bbox[1] - bbox[3] / 2.f - offset_y;
    int bottom = bbox[1] + bbox[3] / 2.f - offset_y;

    left = left / scale;
    right = right / scale;
    top = top / scale;
    bottom = bottom / scale;

    return cv::Rect(left, top, right - left, bottom - top);
}

// Intersection-over-union of two boxes given as {center x, center y, width, height}.
// Returns 0 when the boxes do not overlap.
float iou(float lbox[4], float rbox[4]) {
    // Edges of the intersection rectangle.
    const float left = (std::max)(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
    const float right = (std::min)(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
    const float top = (std::max)(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
    const float bottom = (std::min)(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);

    if (top > bottom || left > right) {
        return 0.0f;
    }

    const float inter_area = (right - left) * (bottom - top);
    return inter_area / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - inter_area);
}

// Ordering predicate for std::sort: places higher-confidence detections first.
bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
    const bool a_is_more_confident = b.conf < a.conf;
    return a_is_more_confident;
}

// Decodes the raw detection buffer and applies per-class non-maximum suppression.
//   res         - surviving detections are appended here
//   output      - output[0] is the record count, followed by packed Yolo::Detection records
//   conf_thresh - records whose float at index 4 is <= this value are skipped
//   nms_thresh  - a box is dropped when its IoU with an already kept box of the
//                 same class exceeds this value
void nms(std::vector<Yolo::Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) {
    const int floats_per_det = sizeof(Yolo::Detection) / sizeof(float);

    // Bucket the candidates by class id.
    std::map<float, std::vector<Yolo::Detection>> by_class;
    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
        float* record = &output[1 + floats_per_det * i];
        if (record[4] <= conf_thresh) continue;
        Yolo::Detection det;
        memcpy(&det, record, floats_per_det * sizeof(float));
        // operator[] creates the bucket on first use.
        by_class[det.class_id].push_back(det);
    }

    for (auto& bucket : by_class) {
        auto& dets = bucket.second;
        // Highest confidence first (see cmp).
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t keep = 0; keep < dets.size(); ++keep) {
            auto& kept = dets[keep];
            res.push_back(kept);
            // Remove every later box that overlaps the kept one too much.
            size_t probe = keep + 1;
            while (probe < dets.size()) {
                if (iou(kept.bbox, dets[probe].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + probe);
                } else {
                    ++probe;
                }
            }
        }
    }
}

// TensorRT weight files have a simple space delimited format:
//   <number of blobs>
//   <name> <element count> <element count x 32-bit values in hex>
//
// Parses such a file and returns a map from blob name to a Weights descriptor.
// Every blob is tagged DataType::kFLOAT; its values are malloc'ed here and are
// never freed by this function, so they stay valid for the caller.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. Size the allocation by the element type: the previous
        // sizeof(val) was the size of the pointer, which over-allocates
        // (twice the needed memory on 64-bit targets).
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer that applies batch norm with fixed statistics.
//   lname - weight-name prefix; reads "<lname>.weight", ".bias", ".running_mean", ".running_var"
//   eps   - added to the variance before the square root
// The computed scale/shift/power arrays are malloc'ed and stored back into
// weightMap under "<lname>.scale", ".shift" and ".power".
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float *gamma = (float*)weightMap[lname + ".weight"].values;
    float *beta = (float*)weightMap[lname + ".bias"].values;
    float *mean = (float*)weightMap[lname + ".running_mean"].values;
    const Weights& var_blob = weightMap[lname + ".running_var"];
    float *var = (float*)var_blob.values;
    int len = var_blob.count;

    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // One pass over the channels fills all three parameter arrays.
    for (int ch = 0; ch < len; ++ch) {
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    Weights scale{ DataType::kFLOAT, scval, len };
    Weights shift{ DataType::kFLOAT, shval, len };
    Weights power{ DataType::kFLOAT, pval, len };

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn_layer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    return bn_layer;
}

// Convolution (no bias) -> batch norm (eps 1e-3) -> hard-swish.
//   outch - output channels, ksize - square kernel size, s - stride, g - groups
//   lname - weight-name prefix; reads "<lname>.conv.weight" and the "<lname>.bn.*" blobs
// Padding is ksize / 2 on both axes. Returns the element-wise product layer.
ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
    Weights no_bias{ DataType::kFLOAT, nullptr, 0 };
    const int pad = ksize / 2;

    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], no_bias);
    assert(conv);
    conv->setStrideNd(DimsHW{ s, s });
    conv->setPaddingNd(DimsHW{ pad, pad });
    conv->setNbGroups(g);

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
    ITensor* normalized = bn->getOutput(0);

    // hard_swish(x) = x * hard_sigmoid(x), with hard_sigmoid alpha = 1/6 and beta = 0.5.
    // (A SiLU variant, x * sigmoid(x), is intentionally not used here.)
    auto hard_sigmoid = network->addActivation(*normalized, ActivationType::kHARD_SIGMOID);
    assert(hard_sigmoid);
    hard_sigmoid->setAlpha(1.0 / 6.0);
    hard_sigmoid->setBeta(0.5);

    auto hard_swish = network->addElementWise(*normalized, *hard_sigmoid->getOutput(0), ElementWiseOperation::kPROD);
    assert(hard_swish);
    return hard_swish;
}

// Focus block: takes four stride-2 slices of the input (one per 2x2 pixel
// offset), concatenates them and feeds the result through a stride-1 convBlock.
//   inch - channels per slice, outch/ksize - forwarded to convBlock
//   lname - weight-name prefix; the conv weights are read from "<lname>.conv"
ILayer* focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) {
    // {row offset, column offset} of each slice, in the original creation order.
    const int offsets[4][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } };

    ITensor* slices[4];
    for (int i = 0; i < 4; ++i) {
        ISliceLayer* slice = network->addSlice(input,
                                               Dims3{ 0, offsets[i][0], offsets[i][1] },
                                               Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 },
                                               Dims3{ 1, 2, 2 });
        slices[i] = slice->getOutput(0);
    }

    auto stacked = network->addConcatenation(slices, 4);
    return convBlock(network, weightMap, *stacked->getOutput(0), outch, ksize, 1, 1, lname + ".conv");
}

// Bottleneck: 1x1 convBlock to (int)(c2 * e) channels, then 3x3 convBlock to c2
// channels with g groups. When `shortcut` is set and c1 == c2 the input is
// added to the result; otherwise the second convBlock is returned directly.
ILayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
    const int hidden = (int)((float)c2 * e);
    auto reduce = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
    auto expand = convBlock(network, weightMap, *reduce->getOutput(0), c2, 3, 1, g, lname + ".cv2");

    const bool add_residual = shortcut && c1 == c2;
    if (!add_residual) {
        return expand;
    }
    return network->addElementWise(input, *expand->getOutput(0), ElementWiseOperation::kSUM);
}

// CSP bottleneck with two branches over the same input:
//   main: convBlock (cv1) -> n bottlenecks -> bias-free 1x1 conv (cv3)
//   side: bias-free 1x1 conv (cv2)
// The branches are concatenated (main first), passed through batch norm
// (eps 1e-4) and leaky ReLU (alpha 0.1), then a final 1x1 convBlock (cv4) to c2 channels.
// The hidden width is (int)(c2 * e). c1 is not used by this function.
ILayer* bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
    Weights no_bias{ DataType::kFLOAT, nullptr, 0 };
    const int hidden = (int)((float)c2 * e);

    auto main_entry = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
    auto side = network->addConvolutionNd(input, hidden, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], no_bias);

    // Chain the n bottlenecks on the main branch.
    ITensor *x = main_entry->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        auto block = bottleneck(network, weightMap, *x, hidden, hidden, shortcut, g, 1.0, lname + ".m." + std::to_string(idx));
        x = block->getOutput(0);
    }
    auto main_exit = network->addConvolutionNd(*x, hidden, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], no_bias);

    ITensor* branches[] = { main_exit->getOutput(0), side->getOutput(0) };
    auto merged = network->addConcatenation(branches, 2);

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *merged->getOutput(0), lname + ".bn", 1e-4);
    auto leaky = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky->setAlpha(0.1);

    return convBlock(network, weightMap, *leaky->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
}

// C3 block with two 1x1 convBlock branches over the same input:
//   main: cv1 -> n bottlenecks
//   side: cv2
// The branches are concatenated (main first) and fused by a 1x1 convBlock (cv3)
// to c2 channels. The hidden width is (int)(c2 * e). c1 is not used by this function.
ILayer* C3(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
    const int hidden = (int)((float)c2 * e);
    auto main_entry = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
    auto side = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv2");

    // Chain the n bottlenecks on the main branch.
    ITensor *x = main_entry->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        auto block = bottleneck(network, weightMap, *x, hidden, hidden, shortcut, g, 1.0, lname + ".m." + std::to_string(idx));
        x = block->getOutput(0);
    }

    ITensor* branches[] = { x, side->getOutput(0) };
    auto merged = network->addConcatenation(branches, 2);

    return convBlock(network, weightMap, *merged->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
}

// Spatial pyramid pooling: a 1x1 convBlock to c1 / 2 channels, three stride-1
// max-pools with kernel sizes k1, k2, k3 (padding k / 2) applied to that
// result, concatenation of the un-pooled and pooled tensors, and a final 1x1
// convBlock to c2 channels.
ILayer* SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) {
    const int hidden = c1 / 2;
    auto reduce = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
    ITensor* reduced = reduce->getOutput(0);

    // Builds one max-pool branch over the reduced tensor.
    auto max_pool_branch = [&](int k) -> ITensor* {
        auto pool = network->addPoolingNd(*reduced, PoolingType::kMAX, DimsHW{ k, k });
        pool->setPaddingNd(DimsHW{ k / 2, k / 2 });
        pool->setStrideNd(DimsHW{ 1, 1 });
        return pool->getOutput(0);
    };
    ITensor* pooled1 = max_pool_branch(k1);
    ITensor* pooled2 = max_pool_branch(k2);
    ITensor* pooled3 = max_pool_branch(k3);

    ITensor* branches[] = { reduced, pooled1, pooled2, pooled3 };
    auto merged = network->addConcatenation(branches, 4);

    return convBlock(network, weightMap, *merged->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
}

// In-network preprocessing: linearly resizes the input to 3 x IMG_H x IMG_W and
// then pads it symmetrically up to INPUT_H x INPUT_W.
// No normalization (division by 255) is performed here, and weightMap is not used.
// Returns the padding layer.
ILayer* preprocess_layer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
    // Resize
    auto resize = network->addResize(input);
    resize->setOutputDimensions(Dims3{ 3, Yolo::IMG_H, Yolo::IMG_W });
    resize->setResizeMode(ResizeMode::kLINEAR);

    // Pad: the same amount before and after on each axis.
    const int pad_h = (Yolo::INPUT_H - Yolo::IMG_H) / 2;
    const int pad_w = (Yolo::INPUT_W - Yolo::IMG_W) / 2;
    auto padded = network->addPaddingNd(*resize->getOutput(0), DimsHW{ pad_h, pad_w }, DimsHW{ pad_h, pad_w });

    assert(padded);
    return padded;
}

std::vector<float> getAnchors(std::map<std::string, Weights>& weightMap)
{
    std::vector<float> anchors_yolo;
    Weights Yolo_Anchors = weightMap["model.24.anchor_grid"];
    assert(Yolo_Anchors.count == 18);
    int each_yololayer_anchorsnum = Yolo_Anchors.count / 3;
    const float* tempAnchors = (const float*)(Yolo_Anchors.values);
    for (int i = 0; i < Yolo_Anchors.count; i++)
    {
        if (i < each_yololayer_anchorsnum)
        {
            anchors_yolo.push_back(const_cast<float*>(tempAnchors)[i]);
        }
        if ((i >= each_yololayer_anchorsnum) && (i < (2 * each_yololayer_anchorsnum)))
        {
            anchors_yolo.push_back(const_cast<float*>(tempAnchors)[i]);
        }
        if (i >= (2 * each_yololayer_anchorsnum))
        {
            anchors_yolo.push_back(const_cast<float*>(tempAnchors)[i]);
        }
    }

    return anchors_yolo;
}

// Creates the "YoloLayer_TRT" (version "1") plugin and attaches it to the three
// detection heads. The plugin receives four fields:
//   "netdata"       - class count, input width, input height, max output box count
//   "yolodata1..3"  - per scale (strides 8, 16, 32): grid width, grid height and six anchor values
// The heads are wired in the order det2, det1, det0.
IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, IConvolutionLayer* det0, IConvolutionLayer* det1, IConvolutionLayer* det2)
{
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    std::vector<float> anchors = getAnchors(weightMap);

    const int kNumScales = 3;
    PluginField fields[1 + kNumScales];

    // Network-wide parameters.
    // NOTE(review): four ints are stored but length is 3, and the int payloads are
    // tagged kFLOAT32 - presumably the plugin creator ignores both; confirm against yololayer.
    int net_info[4] = { Yolo::CLASS_NUM, Yolo::INPUT_W, Yolo::INPUT_H, Yolo::MAX_OUTPUT_BBOX_COUNT };
    fields[0].data = net_info;
    fields[0].length = 3;
    fields[0].name = "netdata";
    fields[0].type = PluginFieldType::kFLOAT32;

    // Per-scale parameters. The backing arrays live on this stack frame and must
    // outlive the createPlugin call below.
    const int strides[kNumScales] = { 8, 16, 32 };
    int scale_info[kNumScales][8];
    std::string field_names[kNumScales];
    for (int s = 0; s < kNumScales; ++s) {
        int* row = scale_info[s];
        row[0] = Yolo::INPUT_W / strides[s];
        row[1] = Yolo::INPUT_H / strides[s];
        for (int a = 0; a < 6; ++a) {
            row[2 + a] = int(anchors[s * 6 + a]);
        }

        field_names[s] = "yolodata" + std::to_string(s + 1);

        PluginField& field = fields[s + 1];
        field.data = row;
        field.length = 8;
        field.name = field_names[s].c_str();
        field.type = PluginFieldType::kFLOAT32;
    }

    PluginFieldCollection collection;
    collection.nbFields = 4;
    collection.fields = fields;
    IPluginV2 *plugin = creator->createPlugin("yololayer", &collection);

    ITensor* heads[] = { det2->getOutput(0), det1->getOutput(0), det0->getOutput(0) };
    auto yolo = network->addPluginV2(heads, 3, *plugin);
    return yolo;
}


================================================
FILE: yolop/cuda_utils.h
================================================
#pragma once
#include <cuda_runtime_api.h>

// CUDA_CHECK(call): evaluates a CUDA runtime call once and, when the returned status is not
// cudaSuccess, prints the numeric code, its cudaGetErrorString() description and the source
// location to std::cerr, then trips assert(0).
// NOTE: the expansion uses std::cerr and assert, which this header does not include itself;
// the including file has to provide <iostream> and <cassert>. When NDEBUG is defined,
// assert(0) expands to nothing and execution continues after the message.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = (callstr);\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) << ") at " << __FILE__ << ":" << __LINE__ << std::endl;\
            assert(0);\
        }\
    }
#endif  // CUDA_CHECK


================================================
FILE: yolop/gen_wts.py
================================================
import os, sys
import torch
import struct

# Converts the YOLOP checkpoint into the text file 'yolop.wts'.
#
# Output layout, as written below:
#   line 1      : number of tensors in the state dict
#   other lines : "<name> <element count> " followed by one " <hex>" token per element,
#                 where <hex> is the big-endian IEEE-754 float32 encoding of the value.
#
# YOLOP_BASE_DIR must be the root of a YOLOP checkout. It can be overridden with the
# YOLOP_BASE_DIR environment variable; otherwise the path below is used.
print("[WARN] Please download/clone YOLOP, then set YOLOP_BASE_DIR to the root of YOLOP")

#YOLOP_BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
YOLOP_BASE_DIR = os.environ.get("YOLOP_BASE_DIR", "/home/user/jetson/tmp/YOLOP")

# The YOLOP packages are importable only once its root is on sys.path.
sys.path.append(YOLOP_BASE_DIR)
from lib.models import get_net
from lib.config import cfg


# Initialize
device = torch.device('cpu')
# Load model
model = get_net(cfg)
checkpoint = torch.load(os.path.join(YOLOP_BASE_DIR, 'weights', 'End-to-end.pth'), map_location=device)
model.load_state_dict(checkpoint['state_dict'])
# load to FP32
model.float()
model.to(device).eval()

# Build the state dict once instead of once per use.
state_dict = model.state_dict()

# The context manager closes the file even if a tensor fails to convert.
with open('yolop.wts', 'w') as f:
    f.write('{}\n'.format(len(state_dict)))
    for k, v in state_dict.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

print("save as yolop.wts")

================================================
FILE: yolop/logging.h
================================================
// created by ausk(jinlj) 2022/10/25
#pragma once

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

// TRT_NOEXCEPT expands to `noexcept` when NV_TENSORRT_MAJOR >= 8 and to nothing otherwise.
// "macros.h", included above, carries the same definition of this macro.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#else
#define TRT_NOEXCEPT
#endif

using Severity = nvinfer1::ILogger::Severity;

// Minimal nvinfer1::ILogger implementation: a message is written to std::cout when its
// severity compares below Severity::kINFO; every other message is dropped.
class Logger : public nvinfer1::ILogger
{
public:
    // severity - level reported with the message.
    // msg      - text to print, followed by std::endl.
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        const bool printable = severity < Severity::kINFO;
        if (!printable) {
            return;
        }
        std::cout << msg << std::endl;
    }
};


================================================
FILE: yolop/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

// Version-dependent qualifiers for the plugin method signatures:
//   TRT_NOEXCEPT      - `noexcept` when NV_TENSORRT_MAJOR >= 8, otherwise empty.
//   TRT_CONST_ENQUEUE - `const` when NV_TENSORRT_MAJOR >= 8, otherwise empty.
// NOTE: this header includes nothing, so NV_TENSORRT_MAJOR has to be defined by whatever
// was included before it (presumably a TensorRT header - confirm). An undefined macro
// evaluates to 0 inside #if, which silently selects the empty definitions.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H

================================================
FILE: yolop/utils.h
================================================
#pragma once

#include <dirent.h>
#include <opencv2/opencv.hpp>

#include <iostream>
#include "common.hpp"

#define SHOW_IMG

// Letterboxes `img` into an input_w x input_h canvas and normalizes the result.
//
// The image is resized with its aspect ratio preserved (bilinear), centered on a canvas
// filled with (114, 114, 114), converted to float scaled by 1/255, and then per channel
// reduced by (0.485, 0.456, 0.406) and divided by (0.229, 0.224, 0.225).
// Returns the resulting CV_32FC3 matrix.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float ratio_w = input_w / (img.cols*1.0);
    const float ratio_h = input_h / (img.rows*1.0);

    // Start from the full target size; the tighter ratio shrinks one side and the
    // remaining space is split evenly as padding.
    int new_w = input_w;
    int new_h = input_h;
    int off_x = 0;
    int off_y = 0;
    if (ratio_h > ratio_w) {
        new_h = ratio_w * img.rows;
        off_y = (input_h - new_h) / 2;
    } else {
        new_w = ratio_h * img.cols;
        off_x = (input_w - new_w) / 2;
    }

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(114, 114, 114));
    resized.copyTo(canvas(cv::Rect(off_x, off_y, resized.cols, resized.rows)));

    cv::Mat tensor;
    canvas.convertTo(tensor, CV_32FC3, 1.f / 255.f);

    cv::subtract(tensor, cv::Scalar(0.485, 0.456, 0.406), tensor, cv::noArray(), -1);
    cv::divide(tensor, cv::Scalar(0.229, 0.224, 0.225), tensor, 1, -1);
    return tensor;
}

// Appends the name of every entry of directory `p_dir_name`, except "." and "..", to
// `file_names` (names only, in readdir() order).
// Returns 0 on success, or -1 when the directory cannot be opened; `file_names` is left
// untouched in that case.
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir = opendir(p_dir_name);
    if (dir == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.emplace_back(entry->d_name);
    }

    closedir(dir);
    return 0;
}


================================================
FILE: yolop/yololayer.cu
================================================
#include <assert.h>
#include <vector>
#include <iostream>
#include "yololayer.h"
#include "cuda_utils.h"

// Helpers that stream plain values through a raw byte buffer.
namespace Tn
{
    // Copies the bytes of `val` to `buffer` and advances `buffer` by sizeof(T).
    // memcpy is used rather than a typed store through reinterpret_cast so the
    // destination does not have to be suitably aligned for T. T is expected to be
    // trivially copyable.
    template<typename T>
    void write(char*& buffer, const T& val)
    {
        memcpy(buffer, &val, sizeof(T));
        buffer += sizeof(T);
    }

    // Copies sizeof(T) bytes from `buffer` into `val` and advances `buffer` by sizeof(T).
    // Same alignment reasoning as write().
    template<typename T>
    void read(const char*& buffer, T& val)
    {
        memcpy(&val, buffer, sizeof(T));
        buffer += sizeof(T);
    }
}

using namespace Yolo;

namespace nvinfer1
{
    // Builds the plugin from explicit parameters.
    // classCount / netWidth / netHeight / maxOut are stored as given, vYoloKernel is copied.
    // For every kernel a device buffer of CHECK_COUNT * 2 floats is allocated and filled
    // with that kernel's anchors; the device pointers are kept in mAnchor, an array
    // obtained from cudaMallocHost.
    YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel)
        : mKernelCount(static_cast<int>(vYoloKernel.size())),
          mClassCount(classCount),
          mNetWidth(netWidth),
          mNetHeight(netHeight),
          mMaxOutObject(maxOut),
          mYoloKernel(vYoloKernel)
    {
        const size_t anchorBytes = sizeof(float)* CHECK_COUNT * 2;
        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        for (int k = 0; k < mKernelCount; ++k)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[k], anchorBytes));
            CUDA_CHECK(cudaMemcpy(mAnchor[k], mYoloKernel[k].anchors, anchorBytes, cudaMemcpyHostToDevice));
        }
    }
    // Releases every per-kernel device anchor buffer, then the host array holding
    // their pointers.
    YoloLayerPlugin::~YoloLayerPlugin()
    {
        int k = 0;
        while (k < mKernelCount)
        {
            CUDA_CHECK(cudaFree(mAnchor[k]));
            ++k;
        }
        CUDA_CHECK(cudaFreeHost(mAnchor));
    }

    // Rebuilds the plugin at runtime from a byte stream laid out as serialize() writes it:
    // class count, thread count, kernel count, net width, net height, max output count,
    // followed by mKernelCount raw YoloKernel structs.
    // The anchors are uploaded to the GPU the same way the parameter constructor does it.
    // `length` is only used to assert that the whole stream was consumed.
    YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
    {
        using namespace Tn;
        const char* const begin = reinterpret_cast<const char *>(data);
        const char* cursor = begin;

        read(cursor, mClassCount);
        read(cursor, mThreadCount);
        read(cursor, mKernelCount);
        read(cursor, mNetWidth);
        read(cursor, mNetHeight);
        read(cursor, mMaxOutObject);

        mYoloKernel.resize(mKernelCount);
        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(mYoloKernel.data(), cursor, kernelBytes);
        cursor += kernelBytes;

        const size_t anchorBytes = sizeof(float)* CHECK_COUNT * 2;
        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        for (int k = 0; k < mKernelCount; ++k)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[k], anchorBytes));
            CUDA_CHECK(cudaMemcpy(mAnchor[k], mYoloKernel[k].anchors, anchorBytes, cudaMemcpyHostToDevice));
        }
        assert(cursor == begin + length);
    }

    // Writes the plugin state into `buffer`: the six integer settings in the order the
    // deserializing constructor reads them, followed by the raw YoloKernel array.
    // Asserts that exactly getSerializationSize() bytes were produced.
    void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT
    {
        using namespace Tn;
        char* const begin = static_cast<char*>(buffer);
        char* cursor = begin;

        write(cursor, mClassCount);
        write(cursor, mThreadCount);
        write(cursor, mKernelCount);
        write(cursor, mNetWidth);
        write(cursor, mNetHeight);
        write(cursor, mMaxOutObject);

        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(cursor, mYoloKernel.data(), kernelBytes);
        cursor += kernelBytes;

        assert(cursor == begin + getSerializationSize());
    }

    // Number of bytes serialize() writes: the six integer settings plus one YoloKernel
    // struct per entry of mYoloKernel.
    size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT
    {
        const size_t scalarBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount)
            + sizeof(mNetWidth) + sizeof(mNetHeight) + sizeof(mMaxOutObject);
        const size_t kernelBytes = sizeof(Yolo::YoloKernel) * mYoloKernel.size();
        return scalarBytes + kernelBytes;
    }

    // Nothing to set up here; always reports success (0).
    int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
        return 0;
    }

    // Output shape is (N + 1, 1, 1) floats, where N is the number of floats needed to hold
    // mMaxOutObject Detection structs; the extra element is the leading counter that
    // CalDetection increments. The arguments are not consulted.
    Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT
    {
        const int detectionFloats = mMaxOutObject * sizeof(Detection) / sizeof(float);
        return Dims3(detectionFloats + 1, 1, 1);
    }

    // Remembers the plugin namespace. Only the pointer is stored, the string is not copied,
    // so the caller's buffer has to stay valid for as long as it may be read back.
    void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
        mPluginNamespace = pluginNamespace;
    }

    // Returns the pointer last passed to setPluginNamespace().
    const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
        return mPluginNamespace;
    }

    // Data type of the plugin output at `index`: always DataType::kFLOAT, whatever the
    // input types are.
    DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
        return DataType::kFLOAT;
    }

    // Whether the output tensor is broadcast across a batch: never for this plugin.
    bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {
        return false;
    }

    // Whether the plugin can consume an input that is broadcast across the batch without
    // replication: never for this plugin.
    bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
        return false;
    }

    // Intentionally empty: the tensor descriptions are not inspected.
    void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT {}

    // Hook for attaching the plugin to an execution context and its resources.
    // Intentionally empty: none of the offered handles is used.
    void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}

    // Hook for detaching the plugin from its execution context. Intentionally empty.
    void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT
    {
    }

    // Plugin type string; identical to the name YoloPluginCreator::getPluginName() reports.
    const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
        return "YoloLayer_TRT";
    }

    // Plugin version string.
    const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
        return "1";
    }

    // Deletes this object; the instance must not be used after the call.
    void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
        delete this;
    }

    // Creates a new plugin through the parameter constructor with this object's class
    // count, network size, max output count and kernels, and gives it the same namespace
    // pointer. The caller receives ownership of the returned object.
    IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT
    {
        YoloLayerPlugin* copy = new YoloLayerPlugin(mClassCount, mNetWidth, mNetHeight, mMaxOutObject, mYoloKernel);
        copy->setPluginNamespace(mPluginNamespace);
        return copy;
    }

    // Logistic sigmoid: 1 / (1 + e^(-data)).
    __device__ float Logist(float data)
    {
        const float denom = 1.0f + expf(-data);
        return 1.0f / denom;
    }

    // Decodes one feature map into Detection records. One thread handles one grid cell of
    // one batch item and loops over the CHECK_COUNT anchors of that cell.
    //
    // input        - raw head output; per batch item it is read as CHECK_COUNT blocks of
    //                (5 + classes) planes, each plane holding yoloWidth * yoloHeight values
    //                (plane 0..3 = x, y, w, h; plane 4 = objectness; planes 5.. = classes).
    // output       - per batch item `outputElem` floats: a counter followed by Detection
    //                records.
    // noElements   - number of (cell, batch item) pairs; threads beyond it exit.
    // netwidth / netheight   - network input size used to scale box centers.
    // maxoutobject - maximum number of Detection records stored per batch item.
    // yoloWidth / yoloHeight - grid size of this feature map.
    // anchors      - CHECK_COUNT (width, height) pairs.
    // classes      - number of class scores per anchor.
    // outputElem   - stride, in floats, between batch items in `output`.
    __global__ void CalDetection(const float *input, float *output, int noElements,
        const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes, int outputElem)
    {

        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= noElements) return;

        int total_grid = yoloWidth * yoloHeight;
        int bnIdx = idx / total_grid;
        idx = idx - total_grid * bnIdx;
        int info_len_i = 5 + classes;
        const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);

        // Loop bound follows CHECK_COUNT so it stays in step with the input stride above
        // and with the size of `anchors`.
        for (int k = 0; k < CHECK_COUNT; ++k) {
            // First plane of anchor k at this cell; plane p is at cell[p * total_grid].
            const float* cell = curInput + idx + k * info_len_i * total_grid;

            float box_prob = Logist(cell[4 * total_grid]);
            if (box_prob < IGNORE_THRESH) continue;
            int class_id = 0;
            float max_cls_prob = 0.0;
            for (int i = 5; i < info_len_i; ++i) {
                float p = Logist(cell[i * total_grid]);
                if (p > max_cls_prob) {
                    max_cls_prob = p;
                    class_id = i - 5;
                }
            }
            // The first float of each batch item's output is a detection counter that is
            // bumped atomically; the value before the increment selects the record slot.
            float *res_count = output + bnIdx * outputElem;
            int count = (int)atomicAdd(res_count, 1.0f);
            // The counter keeps growing past the limit, so readers must clamp it.
            if (count >= maxoutobject) return;
            char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection);
            Detection* det = (Detection*)(data);

            int row = idx / yoloWidth;
            int col = idx % yoloWidth;

            // Box decoding, matching the PyTorch reference:
            //  y = x[i].sigmoid()
            //  y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
            //  y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            // Center: (2 * sigmoid(t) - 0.5 + cell index) * (network size / grid size)
            det->bbox[0] = (col - 0.5f + 2.0f * Logist(cell[0 * total_grid])) * netwidth / yoloWidth;
            det->bbox[1] = (row - 0.5f + 2.0f * Logist(cell[1 * total_grid])) * netheight / yoloHeight;

            // Size: (2 * sigmoid(t))^2 * anchor
            // v5: https://github.com/ultralytics/yolov5/issues/471
            det->bbox[2] = 2.0f * Logist(cell[2 * total_grid]);
            det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
            det->bbox[3] = 2.0f * Logist(cell[3 * total_grid]);
            det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
            det->conf = box_prob * max_cls_prob;
            det->class_id = class_id;
        }
    }

    // Runs the decode kernel for every YOLO feature map.
    //
    // inputs    - one device pointer per entry of mYoloKernel, in the same order.
    // output    - device buffer holding, per batch item, a counter float followed by up to
    //             mMaxOutObject Detection records.
    // stream    - CUDA stream on which the counter reset and the kernels are queued.
    // batchSize - number of batch items in `inputs` / `output`.
    //
    // All work is issued on `stream` so it is ordered with the rest of the enqueue
    // instead of going through the default stream.
    void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize)
    {
        int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
        // Reset the detection counter of every batch item.
        for (int idx = 0; idx < batchSize; ++idx) {
            CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
        }
        for (unsigned int i = 0; i < mYoloKernel.size(); ++i)
        {
            const auto& yolo = mYoloKernel[i];
            const int numElem = yolo.width*yolo.height*batchSize;
            // Nothing to decode; also avoids a zero block size / division by zero below.
            if (numElem <= 0) continue;

            // Per-launch block size: never more threads than elements. Kept local so a
            // small feature map does not shrink mThreadCount for later launches.
            const int threads = numElem < mThreadCount ? numElem : mThreadCount;
            const int blocks = (numElem + threads - 1) / threads;

            //printf("Net: %d  %d \n", mNetWidth, mNetHeight);
            CalDetection << < blocks, threads, 0, stream >> >
                (inputs[i], output, numElem, mNetWidth, mNetHeight, mMaxOutObject, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem);
        }
    }


    // Entry point for execution: forwards the inputs and the first output buffer,
    // reinterpreted as float data, to forwardGpu(). `workspace` is unused. Always returns 0.
    int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT
    {
        const float* const* typedInputs = (const float *const *)inputs;
        float* detections = (float*)outputs[0];
        forwardGpu(typedInputs, detections, stream, batchSize);
        return 0;
    }

    // Storage for the creator's static field description.
    PluginFieldCollection YoloPluginCreator::mFC{};
    std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

    // Publishes an empty attribute list: mFC is pointed at the cleared attribute vector.
    YoloPluginCreator::YoloPluginCreator()
    {
        mPluginAttributes.clear();

        mFC.fields = mPluginAttributes.data();
        mFC.nbFields = mPluginAttributes.size();
    }

    // Name under which this creator is looked up.
    const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
        return "YoloLayer_TRT";
    }

    // Version string of the plugin this creator builds.
    const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
        return "1";
    }

    // Returns the shared field collection filled in by the constructor.
    const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
        return &mFC;
    }

    // Builds a YoloLayerPlugin from a field collection.
    //
    // name - layer name; not used.
    // fc   - fields to parse:
    //          "netdata"      : 4 ints { class count, input width, input height,
    //                           max output boxes }
    //          "...yolodataK" : ints { grid width, grid height, anchors... }; the character
    //                           at position 8 of the name ('1'..'3') selects the kernel,
    //                           stored in reverse order (K=1 -> slot 2, K=3 -> slot 0).
    //        Payloads are read as ints although the fields are tagged kFLOAT32.
    // Returns a new plugin carrying this creator's namespace; the caller owns it.
    IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT
    {
        int class_count = -1;
        int input_w = -1;
        int input_h = -1;
        int max_output_object_count = -1;
        const int kernel_slots = 3;
        std::vector<Yolo::YoloKernel> yolo_kernels(kernel_slots);

        const PluginField* fields = fc->fields;
        for (int i = 0; i < fc->nbFields; i++) {
            if (strcmp(fields[i].name, "netdata") == 0) {
                assert(fields[i].type == PluginFieldType::kFLOAT32);
                // All four values are read regardless of the declared length, which keeps
                // callers that advertise a shorter length working.
                const int *tmp = (const int*)(fields[i].data);
                class_count = tmp[0];
                input_w = tmp[1];
                input_h = tmp[2];
                max_output_object_count = tmp[3];
            } else if (strstr(fields[i].name, "yolodata") != NULL) {
                assert(fields[i].type == PluginFieldType::kFLOAT32);
                const int *tmp = (const int*)(fields[i].data);
                // Zero-initialized so anchors not supplied by the field are well defined.
                YoloKernel kernel{};
                kernel.width = tmp[0];
                kernel.height = tmp[1];
                // Never copy more anchors than the kernel struct can hold.
                int anchor_values = fields[i].length - 2;
                if (anchor_values > CHECK_COUNT * 2) {
                    anchor_values = CHECK_COUNT * 2;
                }
                for (int j = 0; j < anchor_values; j++) {
                    kernel.anchors[j] = tmp[j + 2];
                }
                // Reject names whose index character does not map to a valid slot instead
                // of writing outside yolo_kernels.
                const int slot = (kernel_slots - 1) - (fields[i].name[8] - '1');
                assert(slot >= 0 && slot < kernel_slots);
                if (slot < 0 || slot >= kernel_slots) {
                    continue;
                }
                yolo_kernels[slot] = kernel;
            }
        }
        // The sentinels are -1, so "non-zero" would always pass; require real values.
        assert(class_count > 0 && input_w > 0 && input_h > 0 && max_output_object_count > 0);
        YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, yolo_kernels);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    // Recreates a plugin from serialized bytes and tags it with this creator's namespace.
    // `name` is not used. The returned object is deleted when the network is destroyed,
    // which calls YoloLayerPlugin::destroy().
    IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT
    {
        YoloLayerPlugin* plugin = new YoloLayerPlugin(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }
}


================================================
FILE: yolop/yololayer.h
================================================
#ifndef _YOLO_LAYER_H
#define _YOLO_LAYER_H

#include <vector>
#include <string>
#include "NvInfer.h"
#include "macros.h"

// Compile-time configuration and plain data structures shared by the YOLO plugin and
// the application code.
namespace Yolo
{
    // --- detection decoding ---
    static constexpr int CHECK_COUNT = 3;            // anchors evaluated per grid cell
    static constexpr float IGNORE_THRESH = 0.1f;     // boxes with lower objectness are skipped
    static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
    static constexpr int CLASS_NUM = 1;
    static constexpr int LOCATIONS = 4;              // floats per bounding box

    // --- sizes ---
    static constexpr int INPUT_H = 384;
    static constexpr int INPUT_W = 640;
    static constexpr int IMG_H = 360;
    static constexpr int IMG_W = 640;

    // Grid size and anchor values of one YOLO feature map.
    struct YoloKernel
    {
        int width;
        int height;
        float anchors[CHECK_COUNT * 2];
    };

    // One decoded detection, stored as consecutive floats.
    struct alignas(float) Detection {
        // center_x center_y w h
        float bbox[LOCATIONS];
        float conf;  // bbox_conf * cls_conf
        float class_id;
    };
}

namespace nvinfer1
{
    // TensorRT plugin that decodes the YOLO detection heads into Detection records.
    // The object owns GPU memory (one anchor buffer per kernel, tracked in mAnchor), so it
    // is not copyable; use clone() to obtain another instance.
    class YoloLayerPlugin : public IPluginV2IOExt
    {
    public:
        // Builds the plugin from explicit settings and uploads the anchors.
        YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel);
        // Rebuilds the plugin from the bytes produced by serialize().
        YoloLayerPlugin(const void* data, size_t length);
        ~YoloLayerPlugin();

        // Copying would make two objects free the same device buffers.
        YoloLayerPlugin(const YoloLayerPlugin&) = delete;
        YoloLayerPlugin& operator=(const YoloLayerPlugin&) = delete;

        // The plugin produces a single output tensor.
        int getNbOutputs() const TRT_NOEXCEPT override
        {
            return 1;
        }

        Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

        int initialize() TRT_NOEXCEPT override;

        // Nothing to release here; resources are freed by the destructor.
        virtual void terminate() TRT_NOEXCEPT override {}

        // No scratch memory is requested.
        virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

        virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

        virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

        virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

        // Only linear-format float tensors are accepted, for inputs and outputs alike.
        bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
            return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
        }

        const char* getPluginType() const TRT_NOEXCEPT override;

        const char* getPluginVersion() const TRT_NOEXCEPT override;

        void destroy() TRT_NOEXCEPT override;

        IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

        void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

        const char* getPluginNamespace() const TRT_NOEXCEPT override;

        DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;

        bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;

        bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

        void attachToContext(
            cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

        void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;

        using IPluginV2Ext::configurePlugin;

        void detachFromContext() TRT_NOEXCEPT override;

    private:
        // Launches the decode kernel once per entry of mYoloKernel.
        void forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize = 1);

        // Every member has a defined value before a constructor body runs, so accessors
        // such as getPluginNamespace() never return an indeterminate pointer.
        int mThreadCount = 256;                 // upper bound for the kernel block size
        const char* mPluginNamespace = "";      // not owned; set via setPluginNamespace()
        int mKernelCount = 0;                   // number of entries in mYoloKernel / mAnchor
        int mClassCount = 0;
        int mNetWidth = 0;
        int mNetHeight = 0;
        int mMaxOutObject = 0;
        std::vector<Yolo::YoloKernel> mYoloKernel;
        void** mAnchor = nullptr;               // array of per-kernel device anchor buffers
    };

    // Factory for YoloLayerPlugin objects: builds them from a field collection or from
    // serialized bytes and hands its own namespace to every plugin it creates.
    class YoloPluginCreator : public IPluginCreator
    {
    public:
        YoloPluginCreator();

        ~YoloPluginCreator() override = default;

        // --- identification ---
        const char* getPluginName() const TRT_NOEXCEPT override;

        const char* getPluginVersion() const TRT_NOEXCEPT override;

        const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

        // --- construction ---
        IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;

        IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;

        // --- namespace (stored as a copy of the given string) ---
        void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; }

        const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

    private:
        std::string mNamespace;
        // Shared by all creator instances; defined in the implementation file.
        static PluginFieldCollection mFC;
        static std::vector<PluginField> mPluginAttributes;
    };
    REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};

#endif


================================================
FILE: yolop/yolop.cpp
================================================
#include "yolop.hpp"


int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);

    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    if (!parse_args(argc, argv, wts_name, engine_name, img_dir)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolop -s [.wts] [.engine] // serialize model to plan file" << std::endl;
        std::cerr << "./yolop -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    if (!wts_name.empty()) {
        IHostMemory* modelStream{ nullptr };
        APIToModel(BATCH_SIZE, &modelStream, wts_name);
        assert(modelStream != nullptr);
        std::ofstream p(engine_name, std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }

    // deserialize the .engine and run inference
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        return -1;
    }
    char *trtModelStream = nullptr;
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    static int seg_out[BATCH_SIZE * IMG_H * IMG_W];
    static int lane_out[BATCH_SIZE * IMG_H * IMG_W];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    assert(engine->getNbBindings() == 4);
    void* buffers[4];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
    const int output_det_index = engine->getBindingIndex(OUTPUT_DET_NAME);
    const int output_seg_index = engine->getBindingIndex(OUTPUT_SEG_NAME);
    const int output_lane_index = engine->getBindingIndex(OUTPUT_LANE_NAME);
    assert(inputIndex == 0);
    assert(output_det_index == 1);
    assert(output_seg_index == 2);
    assert(output_lane_index == 3);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[output_det_index], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[output_seg_index], BATCH_SIZE * IMG_H * IMG_W * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&buffers[output_lane_index], BATCH_SIZE * IMG_H * IMG_W * sizeof(int)));
    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // store seg results
    cv::Mat tmp_seg(IMG_H, IMG_W, CV_32S, seg_out);
    // store lane results
    cv::Mat tmp_lane(IMG_H, IMG_W, CV_32S, lane_out);
    // PrintMat(tmp_seg);
    std::vector<cv::Vec3b> segColor;
    segColor.push_back(cv::Vec3b(0, 0, 0));
    segColor.push_back(cv::Vec3b(0, 255, 0));
    segColor.push_back(cv::Vec3b(255, 0, 0));

    std::vector<cv::Vec3b> laneColor;
    laneColor.push_back(cv::Vec3b(0, 0, 0));
    laneColor.push_back(cv::Vec3b(0, 0, 255));
    laneColor.push_back(cv::Vec3b(0, 0, 0));

    int fcount = 0;  // set for batch-inference
    for (int f = 0; f < (int)file_names.size(); f++) {
        fcount++;
        if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue;

        // preprocess ~3ms
        for (int b = 0; b < fcount; b++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);  // load image takes ~17ms
            if (img.empty()) continue;
            //cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
            cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); // letterbox
            int i = 0;
            // BGR to RGB and normalize
            for (int row = 0; row < INPUT_H; ++row) {
                float* uc_pixel = pr_img.ptr<float>(row);
                for (int col = 0; col < INPUT_W; ++col) {
                    data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[0];
                    data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1];
                    data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[2];
                    uc_pixel += 3;
                    ++i;
                }
            }
        }

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInferenceCpu(*context, stream, buffers, data, prob, seg_out, lane_out, BATCH_SIZE);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        // postprocess ~0ms
        std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH);
        }

        // show results
        for (int b = 0; b < fcount; ++b) {
            auto& res = batch_res[b];
            //std::cout << res.size() << std::endl;
            cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);

            // handling seg and lane results
            cv::Mat seg_res(img.rows, img.cols, CV_32S);
            cv::resize(tmp_seg, seg_res, seg_res.size(), 0, 0, cv::INTER_NEAREST);
            cv::Mat lane_res(img.rows, img.cols, CV_32S);
            cv::resize(tmp_lane, lane_res, lane_res.size(), 0, 0, cv::INTER_NEAREST);
            for (int row = 0; row < img.rows; ++row) {
                uchar* pdata = img.data + row * img.step;
                for (int col = 0; col < img.cols; ++col) {
                    int seg_idx = seg_res.at<int>(row, col);
                    int lane_idx = lane_res.at<int>(row, col);
                    //std::cout << "enter" << ix << std::endl;
                    for (int i = 0; i < 3; ++i) {
                        if (lane_idx) {
                            if (i != 2)
                                pdata[i] = pdata[i] / 2 + laneColor[lane_idx][i] / 2;
                        }
                        else if (seg_idx)
                            pdata[i] = pdata[i] / 2 + segColor[seg_idx][i] / 2;
                    }
                    pdata += 3;
                }
            }
            // handling det results

            for (size_t j = 0; j < res.size(); ++j) {
                cv::Rect r = get_rect(img, res[j].bbox);
                cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
                cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
            }
            cv::imwrite("../results/_" + file_names[f - fcount + 1 + b], img);
        }
        fcount = 0;
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[output_det_index]));
    CUDA_CHECK(cudaFree(buffers[output_seg_index]));
    CUDA_CHECK(cudaFree(buffers[output_lane_index]));
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    return 0;
}


================================================
FILE: yolop/yolop.hpp
================================================
#pragma once

#include <chrono>
#include "cuda_utils.h"
#include "logging.h"
#include "utils.h"

// Build-time / run-time settings of the yolop sample.
#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
#define DEVICE 0  // GPU id
#define NMS_THRESH 0.45
#define CONF_THRESH 0.25
#define BATCH_SIZE 1

// stuff we know about the network and the input/output blobs
// (sizes and class count are mirrored from the Yolo namespace)
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
static const int IMG_H = Yolo::IMG_H;
static const int IMG_W = Yolo::IMG_W;
static const int CLASS_NUM = Yolo::CLASS_NUM;
// Floats in the detection output per batch item: MAX_OUTPUT_BBOX_COUNT Detection records
// plus one leading element.
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1;  // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
// Tensor names of the network bindings.
// NOTE(review): these pointers are non-static definitions in a header, so including this
// header from more than one translation unit would define them twice.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_DET_NAME = "det";
const char* OUTPUT_SEG_NAME = "seg";
const char* OUTPUT_LANE_NAME = "lane";
// Logger instance handed to the TensorRT runtime.
static Logger gLogger;

// Builds the YOLOP TensorRT engine (backbone, detection head, drivable-area
// segmentation head and lane-line head) from a .wts weight file.
//
// Parameters:
//   maxBatchSize - maximum batch size passed to IBuilder::setMaxBatchSize
//   builder      - TensorRT builder used to create the network and the engine
//   config       - builder configuration (workspace size / FP16 flag are set here)
//   dt           - data type of the network input tensor
//   wts_name     - path of the .wts file handed to loadWeights()
//
// Returns the engine produced by buildEngineWithConfig(); the caller owns it
// (APIToModel asserts that it is non-null and destroys it).
//
// Fix over the previous version: an unconditional `assert(false);` placed
// right before the build step aborted every build compiled without NDEBUG,
// so no engine could ever be produced. It has been removed. The malloc()
// result for the shared upsample kernel is now asserted as well.
ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights(wts_name);
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    // yolop backbone
    // auto focus0 = focus(network, weightMap, *shuffle->getOutput(0), 3, 32, 3, "model.0");
    auto focus0 = focus(network, weightMap, *data, 3, 32, 3, "model.0");
    auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 64, 3, 2, 1, "model.1");
    auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 64, 64, 1, true, 1, 0.5, "model.2");
    auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 128, 3, 2, 1, "model.3");
    auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.4");
    auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 256, 3, 2, 1, "model.5");
    auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 256, 256, 3, true, 1, 0.5, "model.6");
    auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 512, 3, 2, 1, "model.7");
    auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 512, 512, 5, 9, 13, "model.8");

    // yolop head
    auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.9");
    auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 256, 1, 1, 1, "model.10");

    // All-ones 2x2 kernel shared by every upsample layer below. Each layer is a
    // depthwise (groups == channels) stride-2 deconvolution, so it replicates
    // every input value into a 2x2 output patch. The buffer is sized for the
    // widest layer (256 channels); narrower layers use a prefix of it.
    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
    assert(deval);
    for (int i = 0; i < 256 * 2 * 2; i++) {
        deval[i] = 1.0;
    }
    Weights deconvwts11{ DataType::kFLOAT, deval, 256 * 2 * 2 };
    IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 256, DimsHW{ 2, 2 }, deconvwts11, emptywts);
    deconv11->setStrideNd(DimsHW{ 2, 2 });
    deconv11->setNbGroups(256);
    // Register the shared buffer exactly once so the release loop at the end
    // of this function frees it together with the loaded weights.
    weightMap["deconv11"] = deconvwts11;

    ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
    auto cat12 = network->addConcatenation(inputTensors12, 2);
    auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 512, 256, 1, false, 1, 0.5, "model.13");
    auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 128, 1, 1, 1, "model.14");

    Weights deconvwts15{ DataType::kFLOAT, deval, 128 * 2 * 2 };
    IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts15, emptywts);
    deconv15->setStrideNd(DimsHW{ 2, 2 });
    deconv15->setNbGroups(128);

    ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
    auto cat16 = network->addConcatenation(inputTensors16, 2);
    auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 256, 128, 1, false, 1, 0.5, "model.17");
    IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);

    auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 128, 3, 2, 1, "model.18");
    ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
    auto cat19 = network->addConcatenation(inputTensors19, 2);
    auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 256, 256, 1, false, 1, 0.5, "model.20");
    IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);

    auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 256, 3, 2, 1, "model.21");
    ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
    auto cat22 = network->addConcatenation(inputTensors22, 2);
    auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.23");
    IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);

    // The three detection convolutions are merged by the yolo layer into one output tensor.
    auto detect24 = addYoLoLayer(network, weightMap, det0, det1, det2);
    detect24->getOutput(0)->setName(OUTPUT_DET_NAME);

    // ---- segmentation branch (model.25 .. model.33), fed from cat16 ----
    auto conv25 = convBlock(network, weightMap, *cat16->getOutput(0), 128, 3, 1, 1, "model.25");
    // upsample 26
    Weights deconvwts26{ DataType::kFLOAT, deval, 128 * 2 * 2 };
    IDeconvolutionLayer* deconv26 = network->addDeconvolutionNd(*conv25->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts26, emptywts);
    deconv26->setStrideNd(DimsHW{ 2, 2 });
    deconv26->setNbGroups(128);

    auto bottleneck_csp27 = bottleneckCSP(network, weightMap, *deconv26->getOutput(0), 128, 64, 1, false, 1, 0.5, "model.27");
    auto conv28 = convBlock(network, weightMap, *bottleneck_csp27->getOutput(0), 32, 3, 1, 1, "model.28");
    // upsample 29
    Weights deconvwts29{ DataType::kFLOAT, deval, 32 * 2 * 2 };
    IDeconvolutionLayer* deconv29 = network->addDeconvolutionNd(*conv28->getOutput(0), 32, DimsHW{ 2, 2 }, deconvwts29, emptywts);
    deconv29->setStrideNd(DimsHW{ 2, 2 });
    deconv29->setNbGroups(32);

    auto conv30 = convBlock(network, weightMap, *deconv29->getOutput(0), 16, 3, 1, 1, "model.30");
    auto bottleneck_csp31 = bottleneckCSP(network, weightMap, *conv30->getOutput(0), 16, 8, 1, false, 1, 0.5, "model.31");

    // upsample32
    Weights deconvwts32{ DataType::kFLOAT, deval, 8 * 2 * 2 };
    IDeconvolutionLayer* deconv32 = network->addDeconvolutionNd(*bottleneck_csp31->getOutput(0), 8, DimsHW{ 2, 2 }, deconvwts32, emptywts);
    deconv32->setStrideNd(DimsHW{ 2, 2 });
    deconv32->setNbGroups(8);

    auto conv33 = convBlock(network, weightMap, *deconv32->getOutput(0), 2, 3, 1, 1, "model.33");
    // segmentation output
    // Crop the vertical padding (INPUT_H -> IMG_H), then take the index of the
    // larger of the two channels (TopK k=1 along axis 0); output 1 holds the indices.
    ISliceLayer *slicelayer = network->addSlice(*conv33->getOutput(0), Dims3{ 0, (Yolo::INPUT_H - Yolo::IMG_H) / 2, 0 }, Dims3{ 2, Yolo::IMG_H, Yolo::IMG_W }, Dims3{ 1, 1, 1 });
    auto segout = network->addTopK(*slicelayer->getOutput(0), TopKOperation::kMAX, 1, 1);
    segout->getOutput(1)->setName(OUTPUT_SEG_NAME);

    // ---- lane-line branch (model.34 .. model.42), also fed from cat16 ----
    auto conv34 = convBlock(network, weightMap, *cat16->getOutput(0), 128, 3, 1, 1, "model.34");

    // upsample35
    Weights deconvwts35{ DataType::kFLOAT, deval, 128 * 2 * 2 };
    IDeconvolutionLayer* deconv35 = network->addDeconvolutionNd(*conv34->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts35, emptywts);
    deconv35->setStrideNd(DimsHW{ 2, 2 });
    deconv35->setNbGroups(128);

    auto bottleneck_csp36 = bottleneckCSP(network, weightMap, *deconv35->getOutput(0), 128, 64, 1, false, 1, 0.5, "model.36");
    auto conv37 = convBlock(network, weightMap, *bottleneck_csp36->getOutput(0), 32, 3, 1, 1, "model.37");

    // upsample38
    Weights deconvwts38{ DataType::kFLOAT, deval, 32 * 2 * 2 };
    IDeconvolutionLayer* deconv38 = network->addDeconvolutionNd(*conv37->getOutput(0), 32, DimsHW{ 2, 2 }, deconvwts38, emptywts);
    deconv38->setStrideNd(DimsHW{ 2, 2 });
    deconv38->setNbGroups(32);

    auto conv39 = convBlock(network, weightMap, *deconv38->getOutput(0), 16, 3, 1, 1, "model.39");
    auto bottleneck_csp40 = bottleneckCSP(network, weightMap, *conv39->getOutput(0), 16, 8, 1, false, 1, 0.5, "model.40");

    // upsample41
    Weights deconvwts41{ DataType::kFLOAT, deval, 8 * 2 * 2 };
    IDeconvolutionLayer* deconv41 = network->addDeconvolutionNd(*bottleneck_csp40->getOutput(0), 8, DimsHW{ 2, 2 }, deconvwts41, emptywts);
    deconv41->setStrideNd(DimsHW{ 2, 2 });
    deconv41->setNbGroups(8);

    auto conv42 = convBlock(network, weightMap, *deconv41->getOutput(0), 2, 3, 1, 1, "model.42");
    // lane-det output
    ISliceLayer *laneSlice = network->addSlice(*conv42->getOutput(0), Dims3{ 0, (Yolo::INPUT_H - Yolo::IMG_H) / 2, 0 }, Dims3{ 2, Yolo::IMG_H, Yolo::IMG_W }, Dims3{ 1, 1, 1 });
    auto laneout = network->addTopK(*laneSlice->getOutput(0), TopKOperation::kMAX, 1, 1);
    laneout->getOutput(1)->setName(OUTPUT_LANE_NAME);

    // detection output
    network->markOutput(*detect24->getOutput(0));
    // segmentation output
    network->markOutput(*segout->getOutput(1));
    // lane output
    network->markOutput(*laneout->getOutput(1));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(2L * (1L << 30));  // 2GB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    // (includes the shared upsample kernel registered under "deconv11")
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, std::string& wts_name) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

// Runs one batch through the engine and copies the three outputs to the host.
// The input is expected to be in buffers[0] already (no host-to-device copy
// is issued here).
//
// Parameters:
//   context     - execution context to enqueue on
//   stream      - CUDA stream used for the enqueue and the copies
//   buffers     - device bindings; indices 1, 2, 3 are read as det / seg / lane
//   det_output  - host buffer, batchSize * OUTPUT_SIZE floats
//   seg_output  - host buffer, batchSize * IMG_H * IMG_W ints
//   lane_output - host buffer, batchSize * IMG_H * IMG_W ints
//   batchSize   - number of images in the batch
//
// NOTE(review): the binding order is hard-coded as 1/2/3 here; confirm it
// matches the binding indices the caller looked up from the engine.
void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* det_output, int* seg_output, int* lane_output, int batchSize) {
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    // CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(det_output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(seg_output, buffers[2], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(lane_output, buffers[3], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work above are reported by the synchronize
    // call, so it is checked like every other CUDA call in this function.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Same as doInference(), but first uploads the host-side input batch
// (`input`, batchSize * 3 * INPUT_H * INPUT_W floats) into buffers[0].
//
// Parameters:
//   context     - execution context to enqueue on
//   stream      - CUDA stream used for the copies and the enqueue
//   buffers     - device bindings; index 0 is the input, 1/2/3 are det / seg / lane
//   input       - host buffer with the preprocessed input batch
//   det_output  - host buffer, batchSize * OUTPUT_SIZE floats
//   seg_output  - host buffer, batchSize * IMG_H * IMG_W ints
//   lane_output - host buffer, batchSize * IMG_H * IMG_W ints
//   batchSize   - number of images in the batch
void doInferenceCpu(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* det_output, int* seg_output, int* lane_output, int batchSize) {
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(det_output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(seg_output, buffers[2], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(lane_output, buffers[3], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work above are reported by the synchronize
    // call, so it is checked like every other CUDA call in this function.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Parses the command line.
//   -s <wts> <engine>      : fills `wts` and `engine`   (serialize mode)
//   -d <engine> <img_dir>  : fills `engine` and `img_dir` (inference mode)
// Returns true only for exactly one of these two forms; on failure the
// output strings are left untouched.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir) {
    // Both accepted forms consist of the program name plus exactly three arguments.
    if (argc != 4) {
        return false;
    }

    const std::string mode(argv[1]);
    if (mode == "-s") {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        return true;
    }
    if (mode == "-d") {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        return true;
    }
    return false;
}


================================================
FILE: yolop/yolop_trt.py
================================================
# 2022/10/26 by ausk
"""
An example that uses TensorRT's Python api to make yolop inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into lists of at most batch_size entries.
    param:
        batch_size: number of paths per batch
        img_dir:    root directory to scan
    return:
        a list of batches; the last batch may be shorter than batch_size
    """
    batches = []
    current = []
    for dirpath, _subdirs, filenames in os.walk(img_dir):
        for filename in filenames:
            # Close the running batch before adding the next path once it is full.
            if len(current) == batch_size:
                batches.append(current)
                current = []
            current.append(os.path.join(dirpath, filename))
    # Flush whatever is left over as the final batch.
    if len(current) > 0:
        batches.append(current)
    return batches

def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box (and optionally a filled label tag)
                 onto img in place; adapted from the YOLOv5 project.
    param:
        x:              box as [x1, y1, x2, y2]
        img:            image to draw on
        color:          drawing colour; a random one is picked when falsy
        label:          optional text drawn above the box
        line_thickness: line thickness; derived from the image size when falsy
    return:
        no return
    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The label background sits directly above the box's top-left corner.
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(img, label, (top_left[0], top_left[1] - 2), 0, font_scale, [225, 255, 255],
                thickness=font_thickness, lineType=cv2.LINE_AA)

class YolopTRT(object):
    """
    description: Wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Deserialize a TensorRT engine and allocate one page-locked
                     host buffer and one device buffer per engine binding.
        param:
            engine_file_path: path of the serialized engine file
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding: ', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Fixed network/input geometry. These assignments override the values
        # read from the input binding above.
        # NOTE(review): presumably these must match the constants the engine
        # was built with - confirm before using a differently sized engine.
        self.input_h = 384
        self.input_w = 640
        self.img_h = 360
        self.img_w = 640

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of BGR images, run the engine and draw
                     boxes, drivable area and lane lines onto the raw images
                     (the raw images are modified in place).
        param:
            raw_image_generator: iterable yielding at most batch_size BGR images
        return:
            batch_image_raw: list of the annotated raw images
            elapsed:         seconds spent on upload + inference + download
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that unused slots of a partial batch hold defined values.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            for i in range(len(host_outputs)):
                cuda.memcpy_dtoh_async(host_outputs[i], cuda_outputs[i], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in `finally` so a failure above cannot leave the context pushed.
            self.ctx.pop()

        # Outputs in order: detections (flat), segmentation map, lane map.
        detout = host_outputs[0]
        segout = host_outputs[1].reshape((self.batch_size, self.img_h, self.img_w))
        laneout = host_outputs[2].reshape((self.batch_size, self.img_h, self.img_w))

        # Per-image length of the detection output (one count value followed by 6-value rows).
        det_len = 6001

        # Do postprocess. Only the images actually supplied are processed: the
        # last batch of a directory can be smaller than batch_size, and indexing
        # the per-image lists by range(batch_size) would raise IndexError.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                detout[i * det_len: (i + 1) * det_len], batch_origin_h[i], batch_origin_w[i]
            )

            # Draw rectangles and labels on the original image
            img = batch_image_raw[i]
            nh = img.shape[0]
            nw = img.shape[1]
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                # `categories` is a module-level list defined by the script entry point.
                label = "{}:{:.2f}".format(categories[int(result_classid[j])], result_scores[j])
                plot_one_box(box, img, label=label)

            # Scale the class-index maps back to the raw image size.
            seg = cv2.resize(segout[i], (nw, nh), interpolation=cv2.INTER_NEAREST)
            lane = cv2.resize(laneout[i], (nw, nh), interpolation=cv2.INTER_NEAREST)
            color_area = np.zeros_like(img)
            color_area[seg == 1] = (0, 255, 0)
            color_area[lane == 1] = (0, 0, 255)
            color_mask = np.mean(color_area, 2)
            # Blend the overlay into the raw image in place (50/50) where any mask is set.
            img[color_mask != 0] = img[color_mask != 0] * 0.5 + color_area[color_mask != 0] * 0.5

        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context created in __init__ off the context stack.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Yield the image read by cv2.imread for every path in the batch.
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Yield batch_size all-zero images of the network input size
                     (used for warm-up); image_path_batch is ignored.
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert a BGR image to RGB, letterbox it to the network
                     input size, scale to [0, 1], apply per-channel mean/std
                     normalization and reorder to NCHW.
        param:
            raw_bgr_image: HxWx3 BGR image
        return:
            image:     the processed array of shape (1, 3, input_h, input_w)
            image_raw: the untouched input image
            h:         original height
            w:         original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (114,114,114)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (114, 114, 114)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # Per-channel mean/std normalization (RGB order)
        image = (image - (0.485, 0.456, 0.406)) / (0.229, 0.224, 0.225)
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right,
                        undoing the letterbox padding and scaling so the result is in original-image pixels
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded top/bottom: remove the vertical offset, then rescale.
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Image was padded left/right: remove the horizontal offset, then rescale.
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: Decode one image's flat detection output and run NMS on it
        param:
            output:     A numpy array: [num_boxes, then rows of 6 values
                        (center_x, center_y, w, h, conf, cls_id)]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes:   kept boxes, each row is [x1, y1, x2, y2]
            result_scores:  confidence of each kept box
            result_classid: class id of each kept box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, 6))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 treats coordinates as inclusive pixel indices)
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon avoids a division by zero.
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (center_x, center_y, w, h, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score >= conf_thres (boolean indexing returns a copy)
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            # (the top box always matches itself, so every iteration removes at least one row)
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


if __name__ == "__main__":
    # Defaults for the plugin library and the serialized engine.
    PLUGIN_LIBRARY = "build/libmyplugins.so"
    engine_file_path = "build/yolop.trt"

    print("usage: xxx <engine file> <plugin file> <image dir>")
    print("[WARN] preaprea you image_dir, such as: samples, or /home/user/jetson/tmp/YOLOP/inference/images")
    IMAGE_DIR =  "/home/user/jetson/tmp/YOLOP/inference/images"

    # Positional overrides: <engine file> <plugin file> <image dir>
    argc = len(sys.argv)
    if argc > 1:
        engine_file_path = sys.argv[1]
    if argc > 2:
        PLUGIN_LIBRARY = sys.argv[2]
    if argc > 3:
        IMAGE_DIR = sys.argv[3]

    # Load the custom plugin library before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # Class names indexed by class id; read by YolopTRT.infer when labelling boxes.
    categories = ["car"]

    # Start from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YolopTRT instance
    yolop_wrapper = YolopTRT(engine_file_path)

    try:
        print('batch size is', yolop_wrapper.batch_size)

        image_dir = IMAGE_DIR
        image_path_batches = get_img_path_batches(yolop_wrapper.batch_size, image_dir)

        # One warm-up pass on all-zero images.
        for i in range(1):
            warm_images = yolop_wrapper.get_raw_image_zeros()
            batch_image_raw, use_time = yolop_wrapper.infer(warm_images)
            print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))

        # Run every batch and write the annotated images under output/.
        for batch in image_path_batches:
            batch_image_raw, use_time = yolop_wrapper.infer(yolop_wrapper.get_raw_image(batch))
            for i, img_path in enumerate(batch):
                parent, filename = os.path.split(img_path)
                save_name = os.path.join('output', filename)
                # Save image
                cv2.imwrite(save_name, batch_image_raw[i])
            print('input->{}, time->{:.2f}ms, saving into output/'.format(batch, use_time * 1000))

    finally:
        # destroy the instance
        yolop_wrapper.destroy()

    print("done!")

================================================
FILE: yolov10/CMakeLists.txt
================================================
# Build script for the yolov10 sample: builds the `myplugins` shared library
# from plugin/yololayer.cu and the `yolov10_det` executable.
cmake_minimum_required(VERSION 3.10)

project(yolov10)

# Compile as C++11 and define API_EXPORTS for every target in this directory.
add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
# NOTE: the build type is forced to Debug here.
set(CMAKE_BUILD_TYPE Debug)

# Hard-coded nvcc location; adapt it if CUDA is installed elsewhere.
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  # aarch64 branch: only the CUDA target directories are added here.
  message("embed_platform on")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
  message("embed_platform off")

  # cuda
  include_directories(/usr/local/cuda/include)
  link_directories(/usr/local/cuda/lib64)

  # tensorrt
  include_directories(/workspace/shared/TensorRT-8.4.3.1/include)
  link_directories(/workspace/shared/TensorRT-8.4.3.1/lib)

  # include_directories(/home/lindsay/TensorRT-7.2.3.4/include)
  # link_directories(/home/lindsay/TensorRT-7.2.3.4/lib)
endif()

# Plugin library built from the yolo layer CUDA source.
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# NOTE(review): OpenCV is looked up without REQUIRED, so a missing OpenCV is
# not reported at this point - confirm whether that is intended.
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# All .cpp/.cu files under src/ are compiled into the executable together with yolov10_det.cpp.
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
add_executable(yolov10_det ${PROJECT_SOURCE_DIR}/yolov10_det.cpp ${SRCS})

target_link_libraries(yolov10_det nvinfer)
target_link_libraries(yolov10_det cudart)
target_link_libraries(yolov10_det myplugins)
target_link_libraries(yolov10_det ${OpenCV_LIBS})


================================================
FILE: yolov10/README.md
================================================
## Introduction

Yolov10 model supports TensorRT-8.

## Environment

CUDA: 11.8

CUDNN: 8.9.1.23

TensorRT: TensorRT-8.2.5.1   / GPU: GTX1650

TensorRT: TensorRT-8.4.3.1   / GPU: RTX4070

```
# faq
Error Code 1: Internal Error (Unsupported SM: 0x809)
Newer GPU architectures are not supported by older TensorRT versions;
if you hit this error, upgrade TensorRT to a version that supports your GPU.
```

## Support

* [x] YOLOv10-det supports FP32/FP16/INT8 and the Python/C++ APIs

## Config

* Choose the YOLOv10 sub-model n/s/m/b/l/x from command line arguments.
* Other configs please check [src/config.h](src/config.h)

## Build and Run

1. Generate the .wts file from the PyTorch .pt checkpoint, or download a .wts file from the model zoo

```shell
git clone https://github.com/THU-MIG/yolov10.git
cd yolov10/
wget https://github.com/THU-MIG/yolov10/releases/download/v1.1/yolov10n.pt

git clone https://github.com/wang-xinyu/tensorrtx.git
cp [PATH-TO-TENSORRTX]/yolov10/gen_wts.py .

python gen_wts.py -w yolov10n.pt -o yolov10n.wts
# A file 'yolov10n.wts' will be generated.
```

2. build tensorrtx/yolov10 and run

#### Detection

```shell
cd [PATH-TO-TENSORRTX]/yolov10

# add test images
mkdir images
cp [PATH-TO-TENSORRTX]/yolov3-spp/samples/*.jpg ./images

# Update kNumClass in src/config.h if your model is trained on custom dataset
mkdir build
cd build
cp [PATH-TO-yolov10]/yolov10n.wts .
cmake ..
make

# Build and serialize TensorRT engine
./yolov10_det -s yolov10n.wts yolov10n.engine [n/s/m/b/l/x]

# Run inference
./yolov10_det -d yolov10n.engine ../images
# The results are displayed in the console
```

3. Optional, load and run the tensorrt model in Python
```shell
# Install python-tensorrt, pycuda, etc.
# Make sure yolov10n.engine has been generated
python yolov10_det_trt.py ./build/yolov10n.engine ./build/libmyplugins.so
```

## INT8 Quantization
1. Prepare calibration images; you can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) (pwd: a9wh)
2. unzip it in yolov10/build
3. Set the macro `USE_INT8` in include/config.h and run `make` again
4. serialize the model and test

## More Information
See the README on the [home page](https://github.com/wang-xinyu/tensorrtx).


================================================
FILE: yolov10/gen_wts.py
================================================
# -*- coding: UTF-8 -*-
"""
  @Author: mpj
  @Date  : 2024/7/22 下午9:17
  @version V1.0
"""
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse the command line and resolve the input and output file paths.

    Returns:
        tuple: ``(weights_path, output_path)``. When ``--output`` is omitted
        the output path is the weights path with its extension replaced by
        ``.wts``; when ``--output`` names an existing directory the file is
        placed inside it using the weights file's base name.

    Raises:
        SystemExit: if the weights path does not point to an existing file.
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', default='./weights/yolov10n.pt',
                     help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output',
                     help='Output (.wts) file path (optional)')
    opts = cli.parse_args()

    src = opts.weights
    if not os.path.isfile(src):
        raise SystemExit('Invalid input file')

    dst = opts.output
    if not dst:
        # No output given: write next to the input file.
        return src, os.path.splitext(src)[0] + '.wts'
    if os.path.isdir(dst):
        # Output is a directory: derive the file name from the input.
        stem = os.path.splitext(os.path.basename(src))[0]
        return src, os.path.join(dst, stem + '.wts')
    return src, dst


pt_file, wts_file = parse_args()

# Announce which checkpoint is about to be read.
print(f'Loading {pt_file}')

# The export runs entirely on the CPU.
device = 'cpu'

# NOTE: weights_only=False unpickles arbitrary objects, so only load
# checkpoints that come from a trusted source.
checkpoint = torch.load(pt_file, map_location=device, weights_only=False)  # Load FP32 weights
# Prefer the 'ema' entry when the checkpoint has a truthy one, otherwise 'model'.
model = checkpoint['ema' if checkpoint.get('ema') else 'model'].float()

model.to(device).eval()

# File layout: first line is the number of tensors; then one line per tensor:
#   "<name> <element count> " followed by " <8 hex digits>" per value
#   (each value packed as a big-endian 32-bit float).
state = model.state_dict()
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(state)))
    for name, tensor in state.items():
        flat = tensor.reshape(-1).cpu().numpy()
        hex_words = ''.join(
            ' ' + struct.pack('>f', float(value)).hex() for value in flat)
        f.write('{} {} '.format(name, len(flat)) + hex_words + '\n')
print(f'success {wts_file}!!!')


================================================
FILE: yolov10/include/block.h
================================================
#pragma once

#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"

// Network-building helpers for the YOLOv10 TensorRT model. Only declarations live here; the
// definitions are in another translation unit, so the notes below describe the signatures only.
// Most helpers take a TensorRT network definition, a name -> Weights map, an input tensor and a
// layer-name string `lname` (presumably the key prefix used to look up weights — TODO confirm).
// NOTE(review): most helpers receive `weightMap` by value (the map is copied per call), whereas
// C2 takes it by reference — confirm whether the difference is intentional.

// Loads a name -> Weights map from the file path `file`.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

// Adds a batch-norm step expressed as a TensorRT scale layer; `eps` is presumably the
// batch-norm epsilon — TODO confirm against the definition.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps);

// Conv + BN + SiLU block (per the name). `ch`, `k`, `s`, `g` are presumably output channels,
// kernel size, stride and groups (g defaults to 1) — TODO confirm.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, int k, int s, std::string lname, int g = 1);

// C2F block builder; `c1`/`c2` presumably input/output channels, `n` a repeat count,
// `e` an expansion ratio — TODO confirm.
nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                 int c2, int n, bool shortcut, float e, std::string lname);

// C2 block builder. Unlike its siblings, this one takes `weightMap` by non-const reference.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname);

// SPPF block builder; `k` is presumably the pooling kernel size — TODO confirm.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname);

// DFL block builder returning a shuffle layer; `grid`, `k`, `s`, `p` semantics are defined by
// the implementation (presumably grid size, kernel, stride, padding — TODO confirm).
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);

// Adds the YOLO plugin layer on top of the detection layers in `dets`. `px_arry` points to
// `px_arry_num` ints (presumably per-head strides — verify against the caller).
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector<nvinfer1::ILayer*> dets,
                                       const int* px_arry, int px_arry_num);

// SCDown block builder (YOLOv10-specific, per the name).
nvinfer1::ILayer* SCDown(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, int k, int s, std::string lname);

// PSA block builder (YOLOv10-specific, per the name).
nvinfer1::ILayer* PSA(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                      nvinfer1::ITensor& input, int ch, std::string lname);

// C2fCIB block builder; `lk` is a boolean switch whose meaning is defined by the implementation
// (presumably "large kernel" — TODO confirm).
nvinfer1::ILayer* C2fCIB(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, bool lk, float e,
                         std::string lname);


================================================
FILE: yolov10/include/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
//! Only the declaration is visible here; the member functions are defined elsewhere.
//! The overrides below are the nvinfer1::IInt8EntropyCalibrator2 callbacks.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
   public:
    //! \param batchsize        calibration batch size (returned by getBatchSize(), presumably)
    //! \param input_w          network input width
    //! \param input_h          network input height
    //! \param img_dir          directory holding the calibration images
    //! \param calib_table_name file name used for the calibration cache
    //! \param input_blob_name  name of the network input tensor
    //! \param read_cache       whether an existing calibration cache may be read (default true)
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
                           const char* input_blob_name, bool read_cache = true);
    virtual ~Int8EntropyCalibrator2();
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

   private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;  // presumably the index of the next image to feed — TODO confirm
    std::string img_dir_;
    std::vector<std::string> img_files_;
    size_t input_count_;  // presumably the element count of one input batch — TODO confirm
    std::string calib_table_name_;
    // NOTE(review): raw pointer, not a copy — the caller's string must outlive this object.
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;  // presumably a device buffer for the input batch — TODO confirm
    std::vector<char> calib_cache_;
};

#endif  // ENTROPY_CALIBRATOR_H


================================================
FILE: yolov10/include/config.h
================================================
#ifndef TRTX_YOLOV10_CONFIG_H_
#define TRTX_YOLOV10_CONFIG_H_

// Build-time configuration for the YOLOv10 sample.
//
// The include guard matters: every constant below has internal linkage and is *defined* here,
// so including this header twice in one translation unit (for example directly and again
// through types.h) would otherwise be a redefinition error.

// Precision selection: leave exactly one of the following macros defined.
//#define USE_FP32
#define USE_FP16
// #define USE_INT8

// Names of the network input / output tensors.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";
// Number of classes; update this when the model is trained on a custom dataset.
const static int kNumClass = 80;
const static int kBatchSize = 1;
const static int kGpuId = 0;
// Network input resolution.
const static int kInputH = 640;
const static int kInputW = 640;
// Confidence threshold (presumably applied when filtering detections — see postprocess).
const static float kConfThresh = 0.5f;
// Upper bound, in pixels, on the size of an input image.
const static int kMaxInputImageSize = 3000 * 3000;
// Upper bound on the number of output bounding boxes.
const static int kMaxNumOutputBbox = 1000;
//Quantization input image folder path
const static char* kInputQuantizationFolder = "./coco_calib";

#endif  // TRTX_YOLOV10_CONFIG_H_


================================================
FILE: yolov10/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>
#include <cassert>
#include <cstdlib>
#include <iostream>

#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call and terminates the program if it did not return cudaSuccess.
//
// On failure the numeric error code, its cudaGetErrorString() description and the call site
// (file:line) are written to std::cerr. assert(0) is kept so debug builds still stop in the
// assertion handler; std::abort() guarantees that release builds (NDEBUG, where assert is a
// no-op) do not silently continue after a failed CUDA call.
//
// The macro expands to a brace-enclosed block, as before, so existing call sites are unaffected.
// <cassert>, <cstdlib> and <iostream> are included here because the macro body uses them.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                  \
            assert(0);                                                                         \
            std::abort();                                                                      \
        }                                                                                      \
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: yolov10/include/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, forwards its contents to an output stream,
//!  prefixed by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a caller-supplied prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    destination stream for emitted messages
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, buffered text is discarded instead of emitted
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Carries over the destination, prefix and logging flag. The buffered text itself is not
    //! transferred (the std::stringbuf base is default-constructed).
    //! All three members are initialized: leaving mShouldLog uninitialized would make
    //! putOutput() read an indeterminate value.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync() {
        putOutput();
        return 0;
    }

    //! Emits "[timestamp] <prefix><buffered text>" to the destination stream when logging is
    //! enabled, then empties the buffer.
    void putOutput() {
        if (mShouldLog) {
            // Format the timestamp into a local buffer so that no formatting state
            // (fill character, width) is changed on the destination stream.
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            char stamp[32] = "";
            if (tm_local != nullptr) {
                std::strftime(stamp, sizeof(stamp), "%m/%d/%Y-%H:%M:%S", tm_local);
            }
            // The timestamp goes to the same stream as the message (not unconditionally to
            // std::cout), so messages routed to std::cerr keep their timestamp.
            // std::stringbuf::str() gets the string contents of the buffer
            mOutput << "[" << stamp << "] " << mPrefix << str();
            // flush the stream
            mOutput.flush();
        }
        // set the buffer to empty, also when logging is disabled, so suppressed text
        // neither accumulates nor shows up later if logging gets enabled
        str("");
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
//! It exists only to own the buffer: as the first base of LogStreamConsumer it is constructed
//! before the std::ostream base that receives the buffer's address.
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog) {}

   protected:
    //! Buffer handed to std::ostream by the derived LogStreamConsumer.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!  A message is emitted when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Move constructor.
    //!  Builds a fresh buffer from the other object's severity and logging flag; text already
    //!  buffered in \p other is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer should log, given a new reportable severity,
    //!  and propagates the result to the underlying buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! \brief Selects std::cout for severities >= kINFO and std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Maps a severity to its bracketed one-letter prefix ("[F] ", "[E] ", "[W] ", "[I] ", "[V] ").
    //!  An unknown severity trips assert(0) and yields an empty prefix.
    static std::string severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    bool mShouldLog;     //!< Cached result of mSeverity <= reportable severity
    Severity mSeverity;  //!< Severity of every message written through this consumer
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //!
    //! \brief Creates a logger with the given reportable severity (kWARNING by default)
    //!
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! Each call builds a temporary LogStreamConsumer, so the message is prefixed with "[TRT] "
    //! and filtered against the current reportable severity.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // Private: only Logger (a friend) can create a TestAtom, via defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;         //!< Set to true by Logger::reportTestStart()
        std::string mName;     //!< Test name printed in result lines
        std::string mCmdline;  //!< Command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // Note: the RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Reports the test as passed and returns EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Reports the test as failed and returns EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Reports the test as waived and returns EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code
    //!
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently used for filtering messages
    //!
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    //! Not referenced by the other members of this class; log() relies on LogStreamConsumer
    //! for the prefix.
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints one line of the form "&&&& <RESULT> <test name> # <command line>" to the stream
    //! selected for kINFO severity.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  //!< Threshold passed to each LogStreamConsumer created by log()
};

namespace {

//!
//! \brief Builds a LogStreamConsumer that emits messages at severity kVERBOSE,
//!        filtered by the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Builds a LogStreamConsumer that emits messages at severity kINFO,
//!        filtered by the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Builds a LogStreamConsumer that emits messages at severity kWARNING,
//!        filtered by the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Builds a LogStreamConsumer that emits messages at severity kERROR,
//!        filtered by the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Builds a LogStreamConsumer that emits messages at severity kINTERNAL_ERROR
//!        ("fatal" severity), filtered by the logger's current reportable severity.
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: yolov10/include/macros.h
================================================
// Portability macros shared by the plugin and the application code.
// NOTE(review): identifiers containing a double underscore (such as this guard name) are
// reserved for the implementation in C++ — consider renaming the guard.
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

// API: symbol export/import decoration.
//  - API_EXPORTS defined:     dllexport on MSVC, default visibility elsewhere.
//  - API_EXPORTS not defined: dllimport on MSVC, empty elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` when NV_TENSORRT_MAJOR >= 8
// and to nothing otherwise, so the same override declarations can be used in both cases.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: yolov10/include/model.h
================================================
#pragma once

#include <assert.h>
#include <string>
#include "NvInfer.h"

// Engine builders, one per YOLOv10 detection variant (N, S, M, B/L, X per the function names).
// Only declarations are visible here. All five share the same signature:
//   builder / config : TensorRT builder objects used to construct the network
//   dt               : data type used when building the network
//   wts_path         : path to the .wts weights file
//   gd, gw           : taken by non-const reference; presumably the depth and width multiples
//                      of the chosen variant — TODO confirm against the definitions
//   max_channels     : taken by non-const reference; presumably the channel cap of the variant
// Each returns a pointer to nvinfer1::IHostMemory (presumably the serialized engine; who
// releases it is not visible from this header — verify against the caller).

nvinfer1::IHostMemory* buildEngineYolov10DetN(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels);

nvinfer1::IHostMemory* buildEngineYolov10DetS(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels);

nvinfer1::IHostMemory* buildEngineYolov10DetM(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels);

// NOTE(review): a single builder named "BL" — presumably shared by the b and l sub-models.
nvinfer1::IHostMemory* buildEngineYolov10DetBL(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                               int& max_channels);

nvinfer1::IHostMemory* buildEngineYolov10DetX(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels);


================================================
FILE: yolov10/include/postprocess.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Post-processing helpers; only declarations are visible here.

// Converts a 4-float box into a cv::Rect for the image `img` (presumably mapping from
// network-input coordinates back to the original image — TODO confirm in the definition).
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Draws the detections in res_batch[i] onto img_batch[i] (per the parameter names).
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

// Fills `res_batch` from the raw network `output` for `batch_size` images, each occupying
// `output_size` floats (presumably), keeping at most `topk` detections (default 300) and using
// `conf_thresh` as the confidence threshold — exact selection rules are in the definition.
void batch_topk(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
                float conf_thresh, int topk = 300);


================================================
FILE: yolov10/include/preprocess.h
================================================
#pragma once

#include <map>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// CUDA pre-processing helpers; only declarations are visible here.

// Initializes pre-processing state for images of up to `max_image_size`
// (presumably a pixel count used to size staging buffers — TODO confirm).
void cuda_preprocess_init(int max_image_size);

// Releases whatever cuda_preprocess_init() set up.
void cuda_preprocess_destroy();

// Pre-processes one source image (`src`, src_width x src_height) into `dst`
// (dst_width x dst_height floats per channel, presumably), issuing work on `stream`.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);

// Batch variant: pre-processes every image in `img_batch` into `dst` on `stream`.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);


================================================
FILE: yolov10/include/types.h
================================================
#pragma once
#include "config.h"

// One detection record: six floats (4 box values, confidence, class id), float-aligned.
struct alignas(float) Detection {
    //center_x center_y w h
    // NOTE(review): the bbox_element comment in this header lists the box as
    // left/top/right/bottom — confirm which layout the producer actually writes.
    float bbox[4];
    float conf;  // bbox_conf * cls_conf
    float class_id;  // class index stored as a float
};

// Six floats, presumably the coefficients of a 2x3 affine transform in row-major
// order — TODO confirm against the code that fills it.
struct AffineMatrix {
    float value[6];
};

// Number of floats per box record: the floats of a Detection (6) plus one extra slot (7 total).
const int bbox_element =
        sizeof(Detection) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag


================================================
FILE: yolov10/include/utils.h
================================================
#pragma once
#include <dirent.h>
#include <fstream>
#include <opencv2/opencv.hpp>

// Resizes `img` to fit inside an input_w x input_h canvas while keeping its aspect ratio,
// and centres it on a canvas filled with (128, 128, 128).
// Returns the new 8-bit, 3-channel canvas; `img` itself is not modified.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);

    // Start from "fills the whole canvas" and shrink/offset the axis that is not limiting.
    int new_w = input_w;
    int new_h = input_h;
    int off_x = 0;
    int off_y = 0;
    if (scale_h > scale_w) {
        // Width is the limiting side: pad above and below.
        new_h = scale_w * img.rows;
        off_y = (input_h - new_h) / 2;
    } else {
        // Height is the limiting side: pad left and right.
        new_w = scale_h * img.cols;
        off_x = (input_w - new_w) / 2;
    }

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(off_x, off_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Appends the name of every entry in directory `p_dir_name` (except "." and "..") to
// `file_names`. Only the bare entry names are stored, not paths joined with the directory.
// Returns 0 on success, or -1 if the directory cannot be opened (file_names is left untouched).
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir = opendir(p_dir_name);
    if (dir == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* entry_name = entry->d_name;
        // Skip the self and parent pseudo-entries.
        if (strcmp(entry_name, ".") == 0 || strcmp(entry_name, "..") == 0) {
            continue;
        }
        file_names.push_back(std::string(entry_name));
    }

    closedir(dir);
    return 0;
}

// Returns a copy of `str` with leading AND trailing whitespace removed (despite the name).
// Whitespace means space, tab, carriage return, newline, form feed and vertical tab, so a
// trailing '\r' left over from CRLF line endings is stripped as well.
// An empty or all-whitespace input yields an empty string.
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char* const kWhitespace = " \t\r\n\f\v";
    const size_t first = str.find_first_not_of(kWhitespace);
    if (std::string::npos == first) {
        // Nothing but whitespace (or empty): the trimmed result is empty.
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, (last - first + 1));
}

// Src: https://stackoverflow.com/questions/16605967
// Formats `a_value` in fixed-point notation with `n` digits after the decimal point (default 2).
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatter;
    formatter.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatter.precision(n);
    formatter << a_value;
    return formatter.str();
}

// Reads one label per line from `labels_filename` into `labels_map`, keyed by the zero-based
// line index. Each line is passed through trim_leading_whitespace() before being stored.
// Existing entries with the same index are overwritten; other entries are left alone.
// Returns 0 on success, or -1 if the file cannot be opened (labels_map is left untouched),
// matching the error convention of read_files_in_dir().
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        // Report the failure instead of silently returning an empty map as "success".
        return -1;
    }

    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        labels_map[index] = trim_leading_whitespace(line);
        ++index;
    }
    // The stream closes itself when `file` goes out of scope.
    return 0;
}


================================================
FILE: yolov10/plugin/yololayer.cu
================================================
#include <assert.h>
#include <math.h>
#include <string.h>
#include <iostream>
#include <vector>
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"

// Minimal helpers for (de)serialising trivially-copyable values to/from a raw byte buffer.
namespace Tn {
// Copies the object representation of `val` into `buffer` and advances `buffer` past it.
// memcpy is used instead of a reinterpret_cast store so the write is well-defined even
// when `buffer` is not suitably aligned for T.
template <typename T>
void write(char*& buffer, const T& val) {
    memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

// Copies sizeof(T) bytes from `buffer` into `val` and advances `buffer` past them.
// memcpy is used instead of a reinterpret_cast load for the same alignment/aliasing reason.
template <typename T>
void read(const char*& buffer, T& val) {
    memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}
}  // namespace Tn

// Device-side logistic function: 1 / (1 + e^(-x)).
__device__ float sigmoid(float x) {
    const float denom = 1.0f + exp(-x);
    return 1.0f / denom;
}

namespace nvinfer1 {
// Builds a plugin from explicit parameters.
//   classCount            number of classes stored in mClassCount
//   netWidth / netHeight  network input size stored in mYoloV10NetWidth / mYoloV10netHeight
//   maxOut                maximum number of detections kept per batch item
//   strides               array of `stridesLength` stride values; copied into an owned buffer
YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const int* strides,
                                 int stridesLength) {
    // Scalar configuration.
    mClassCount = classCount;
    mMaxOutObject = maxOut;
    mYoloV10NetWidth = netWidth;
    mYoloV10netHeight = netHeight;

    // Private copy of the stride table, released in the destructor.
    mStridesLength = stridesLength;
    mStrides = new int[stridesLength];
    for (int i = 0; i < stridesLength; ++i) {
        mStrides[i] = strides[i];
    }
}

// Releases the owned stride table.
YoloLayerPlugin::~YoloLayerPlugin() {
    // delete[] on a null pointer is a no-op, so no explicit null check is required.
    delete[] mStrides;
    mStrides = nullptr;
}

// Rebuilds a plugin from a buffer previously produced by serialize().
// Fields are read back in exactly the order serialize() writes them; `length` is only
// used (in assert-enabled builds) to verify that the whole buffer was consumed.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    const char* const begin = reinterpret_cast<const char*>(data);
    const char* cursor = begin;

    Tn::read(cursor, mClassCount);
    Tn::read(cursor, mThreadCount);
    Tn::read(cursor, mYoloV10NetWidth);
    Tn::read(cursor, mYoloV10netHeight);
    Tn::read(cursor, mMaxOutObject);
    Tn::read(cursor, mStridesLength);

    // The stride table follows the fixed-size header, one int per entry.
    mStrides = new int[mStridesLength];
    int i = 0;
    while (i < mStridesLength) {
        Tn::read(cursor, mStrides[i]);
        ++i;
    }

    assert(cursor == begin + length);
}

// Writes the plugin state into `buffer`: the five scalar fields, the stride count, then
// each stride value. The deserialising constructor reads the same layout back.
// In assert-enabled builds the number of bytes written is checked against
// getSerializationSize().
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    char* const begin = static_cast<char*>(buffer);
    char* cursor = begin;

    Tn::write(cursor, mClassCount);
    Tn::write(cursor, mThreadCount);
    Tn::write(cursor, mYoloV10NetWidth);
    Tn::write(cursor, mYoloV10netHeight);
    Tn::write(cursor, mMaxOutObject);
    Tn::write(cursor, mStridesLength);

    int i = 0;
    while (i < mStridesLength) {
        Tn::write(cursor, mStrides[i]);
        ++i;
    }

    assert(cursor == begin + getSerializationSize());
}

// Number of bytes serialize() writes: the six fixed-size fields plus one int per stride.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    const size_t headerBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV10netHeight) +
                               sizeof(mYoloV10NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength);
    const size_t strideBytes = sizeof(int) * mStridesLength;
    return headerBytes + strideBytes;
}

// No per-instance setup is performed; always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Reports the output shape as (1 + mMaxOutObject * sizeof(Detection) / sizeof(float), 1, 1):
// one leading float followed by room for mMaxOutObject Detection records expressed in
// floats. The arguments are not consulted.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    const int detectionFloats = mMaxOutObject * sizeof(Detection) / sizeof(float);
    const int floatsPerItem = detectionFloats + 1;
    return nvinfer1::Dims3(floatsPerItem, 1, 1);
}

// Stores the given pointer as-is. The string is NOT copied, so the caller must keep it
// alive for as long as getPluginNamespace() may be called on this plugin.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Returns the pointer last passed to setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// Every output is reported as kFLOAT, regardless of the output index or input types.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                                      int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// Always false: no output is reported as broadcast across the batch.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {
    return false;
}

// Always false: no input is reported as broadcastable across the batch.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
    return false;
}

// Intentionally a no-op: the tensor descriptions passed in are not used.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}

// Intentionally a no-op: none of the supplied context resources are used.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}

// Intentionally a no-op; attachToContext() acquires nothing that would need releasing.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Plugin type string; identical to the name returned by YoloPluginCreator::getPluginName().
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {

    return "YoloLayer_TRT";
}

// Plugin version string; identical to YoloPluginCreator::getPluginVersion().
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Deletes this object; the plugin must have been allocated with `new` and must not be
// used after this call.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Creates a heap-allocated copy configured with the same class count, network size,
// max-output count and stride table, and sharing the same namespace pointer.
// Note: mThreadCount is not forwarded; the copy starts with the class default.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    auto* copy = new YoloLayerPlugin(mClassCount, mYoloV10NetWidth, mYoloV10netHeight, mMaxOutObject, mStrides,
                                     mStridesLength);
    copy->setPluginNamespace(mPluginNamespace);
    return copy;
}

// Runs the detection decode for one batch on `stream`.
//
// `inputs` holds one device pointer per stride level and `outputs[0]` is the single
// output buffer; both are treated as float data. `workspace` is unused. Always returns 0.
//
// The parameter qualifiers mirror the declaration in yololayer.h (`const void* const*`
// inputs, `void* TRT_CONST_ENQUEUE*` outputs). The previous definition had the two
// qualifiers swapped, which only matches the declaration when TRT_CONST_ENQUEUE expands
// to `const`.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    forwardGpu(reinterpret_cast<const float* const*>(inputs), static_cast<float*>(outputs[0]), stream,
               mYoloV10netHeight, mYoloV10NetWidth, batchSize);
    return 0;
}

// Device-side logistic function: 1 / (1 + e^(-data)), evaluated in single precision.
__device__ float Logist(float data) {
    const float denom = 1.0f + expf(-data);
    return 1.0f / denom;
}

// Decodes one stride level of the network output into Detection records.
//
// One thread handles one (batch item, grid cell) pair; threads with index >= numElements
// exit immediately. For its cell a thread:
//   1. reads `classes` scores from channels 4 .. 4+classes-1 (channel c of cell e lives at
//      input[batch * total * (4 + classes) + e + c * total], total = grid_h * grid_w),
//      applies Logist() and keeps the best class;
//   2. drops the cell if the best score is below 0.1;
//   3. reserves a slot by atomically incrementing the float counter stored at the start
//      of this batch item's output region (output + batch * outputElem); the counter keeps
//      counting past `maxoutobject`, but nothing is written once the limit is reached;
//   4. writes a Detection right after the counter: conf, class_id and a box computed from
//      channels 0..3 as (cell centre -/+ value) * stride — presumably left/top/right/bottom
//      distances in grid units; confirm against the model head.
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
                             int grid_w, const int stride, int classes, int outputElem) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= numElements)
        return;

    const int cellsPerImage = grid_h * grid_w;
    const int channels = 4 + classes;
    const int batch = tid / cellsPerImage;
    const int cell = tid % cellsPerImage;
    const float* image = input + batch * cellsPerImage * channels;
    const int outBase = batch * outputElem;

    // Arg-max over the class scores after the logistic squashing.
    int bestClass = 0;
    float bestProb = 0.0;
    for (int c = 0; c < classes; c++) {
        const float prob = Logist(image[cell + (c + 4) * cellsPerImage]);
        if (prob > bestProb) {
            bestProb = prob;
            bestClass = c;
        }
    }

    if (bestProb < 0.1)
        return;

    // Reserve an output slot; the returned value is the counter before the increment.
    const int slot = (int)atomicAdd(output + outBase, 1);
    if (slot >= maxoutobject)
        return;

    char* raw = (char*)(output + outBase) + sizeof(float) + slot * sizeof(Detection);
    Detection* det = (Detection*)(raw);

    const int row = cell / grid_w;
    const int col = cell % grid_w;

    det->conf = bestProb;
    det->class_id = bestClass;
    det->bbox[0] = (col + 0.5f - image[cell + 0 * cellsPerImage]) * stride;
    det->bbox[1] = (row + 0.5f - image[cell + 1 * cellsPerImage]) * stride;
    det->bbox[2] = (col + 0.5f + image[cell + 2 * cellsPerImage]) * stride;
    det->bbox[3] = (row + 0.5f + image[cell + 3 * cellsPerImage]) * stride;
}

// Launches CalDetection once per stride level on `stream`.
//
//   inputs     one device pointer per entry of mStrides (inputs[i] pairs with mStrides[i])
//   output     device buffer holding, per batch item, one float counter followed by room
//              for mMaxOutObject Detection records
//   mYoloV10netHeight / mYoloV10NetWidth
//              network input size; these parameters shadow the members of the same name
//   batchSize  number of batch items
//
// The per-item counters are zeroed first, then each level is decoded with a grid of
// (height / stride) x (width / stride) cells per batch item.
//
// Compared with the previous version the launch block size is a local value: the member
// mThreadCount is no longer shrunk permanently when a level has fewer elements than
// threads, and levels with no elements are skipped instead of dividing by a zero block
// size. The per-call heap allocation for grid sizes is gone as well.
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV10netHeight,
                                 int mYoloV10NetWidth, int batchSize) {
    const int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    // Reset the detection counter at the head of every batch item's output region.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (int i = 0; i < mStridesLength; ++i) {
        const int stride = mStrides[i];
        const int grid_h = mYoloV10netHeight / stride;
        const int grid_w = mYoloV10NetWidth / stride;
        const int numElem = grid_h * grid_w * batchSize;
        if (numElem <= 0) {
            continue;  // nothing to decode at this level
        }

        // Never launch more threads per block than there are elements.
        const int threadCount = numElem < mThreadCount ? numElem : mThreadCount;
        const int blockCount = (numElem + threadCount - 1) / threadCount;

        CalDetection<<<blockCount, threadCount, 0, stream>>>(inputs[i], output, numElem, mMaxOutObject, grid_h,
                                                             grid_w, stride, mClassCount, outputElem);
    }
}

// Definitions of YoloPluginCreator's static members. mFC is value-initialised and
// mPluginAttributes starts empty; the creator's constructor points mFC at the vector.
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// Resets the shared attribute list and makes the static field collection describe it
// (which, after the clear, means zero fields).
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.fields = mPluginAttributes.data();
    mFC.nbFields = mPluginAttributes.size();
}

// Name under which this creator is looked up; matches YoloLayerPlugin::getPluginType().
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Version string; matches YoloLayerPlugin::getPluginVersion().
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Returns the static field collection set up by the constructor (it lists no fields).
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Creates a YoloLayerPlugin from a field collection.
//
// `fc` must contain exactly one field named "combinedInfo" whose data is an int array
// laid out as [classCount, inputW, inputH, maxOutputObjects, stride0, stride1, ...];
// the field's `length` is the total number of ints. `name` is unused.
//
// Returns a heap-allocated plugin with this creator's namespace applied, or nullptr when
// the field collection is malformed. The original asserts are kept so assert-enabled
// builds still stop at the same place; the explicit checks protect builds compiled with
// NDEBUG, where the asserts vanish and the old code would dereference unchecked pointers
// or pass a negative stride count to the plugin.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    const int netinfo_count = 4;  // classCount, inputW, inputH, maxOutputObjects

    if (fc == nullptr || fc->fields == nullptr) {
        std::cerr << "YoloPluginCreator::createPlugin: missing field collection" << std::endl;
        return nullptr;
    }
    assert(fc->nbFields == 1);
    if (fc->nbFields != 1 || fc->fields[0].name == nullptr) {
        std::cerr << "YoloPluginCreator::createPlugin: expected exactly one field" << std::endl;
        return nullptr;
    }
    assert(strcmp(fc->fields[0].name, "combinedInfo") == 0);
    if (strcmp(fc->fields[0].name, "combinedInfo") != 0) {
        std::cerr << "YoloPluginCreator::createPlugin: expected field 'combinedInfo'" << std::endl;
        return nullptr;
    }
    if (fc->fields[0].data == nullptr || fc->fields[0].length < netinfo_count) {
        std::cerr << "YoloPluginCreator::createPlugin: 'combinedInfo' is too short" << std::endl;
        return nullptr;
    }

    const int* combinedInfo = static_cast<const int*>(fc->fields[0].data);
    int class_count = combinedInfo[0];
    int input_w = combinedInfo[1];
    int input_h = combinedInfo[2];
    int max_output_object_count = combinedInfo[3];
    // Everything after the fixed header is the stride table.
    const int* px_arry = combinedInfo + netinfo_count;
    int px_arry_length = fc->fields[0].length - netinfo_count;

    YoloLayerPlugin* obj =
            new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, px_arry, px_arry_length);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

// Recreates a plugin from `serialLength` bytes at `serialData` (the layout written by
// YoloLayerPlugin::serialize) and applies this creator's namespace. `name` is unused.
// The returned object is heap-allocated and is released through
// YoloLayerPlugin::destroy().
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) TRT_NOEXCEPT {
    auto* plugin = new YoloLayerPlugin(serialData, serialLength);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}

}  // namespace nvinfer1


================================================
FILE: yolov10/plugin/yololayer.h
================================================
#pragma once

#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"

namespace nvinfer1 {
// TensorRT plugin that decodes the per-stride detection-head outputs into a flat list of
// Detection records (see yololayer.cu for the implementation).
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    // Builds a plugin from explicit parameters; `strides` (length `stridesLength`) is
    // copied into a buffer owned by the plugin.
    YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const int* strides, int stridesLength);

    // Rebuilds a plugin from `length` bytes previously produced by serialize().
    YoloLayerPlugin(const void* data, size_t length);

    // Releases the owned stride buffer.
    ~YoloLayerPlugin();

    // The plugin produces a single output tensor.
    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

    int initialize() TRT_NOEXCEPT override;

    // Nothing to release.
    virtual void terminate() TRT_NOEXCEPT override {}

    // No scratch workspace is requested.
    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

    virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;

    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

    // Only linear-format float tensors are accepted, for every input and output position.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }

    const char* getPluginType() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    void destroy() TRT_NOEXCEPT override;

    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

    // Stores the pointer without copying the string; the caller keeps ownership.
    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

    const char* getPluginNamespace() const TRT_NOEXCEPT override;

    // Marked `override` like every other virtual in this class so a signature drift
    // against the base interface is caught at compile time.
    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;

    void detachFromContext() TRT_NOEXCEPT override;

   private:
    // Zeroes the per-item counters and launches the decode kernel once per stride level.
    // The height/width parameters intentionally carry the same names as the members.
    void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV10netHeight,
                    int mYoloV10NetWidth, int batchSize);

    int mThreadCount = 256;             // upper bound on threads per CUDA block
    const char* mPluginNamespace = "";  // non-owning; never left dangling/uninitialised
    int mClassCount;
    int mYoloV10NetWidth;
    int mYoloV10netHeight;
    int mMaxOutObject;
    int* mStrides = nullptr;  // owned array of mStridesLength entries
    int mStridesLength;
};

// Factory that creates YoloLayerPlugin instances, either from a PluginFieldCollection
// (createPlugin) or from serialized bytes (deserializePlugin).
class API YoloPluginCreator : public IPluginCreator {
   public:
    YoloPluginCreator();

    ~YoloPluginCreator() override = default;

    const char* getPluginName() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

    nvinfer1::IPluginV2IOExt* createPlugin(const char* name,
                                           const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;

    nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                                size_t serialLength) TRT_NOEXCEPT override;

    // The namespace string is copied into mNamespace.
    void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; }

    // The returned pointer refers to mNamespace's storage and is valid only while this
    // creator is alive and its namespace is not changed.
    const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

   private:
    std::string mNamespace;
    // Shared by all creator instances (static); defined in yololayer.cu.
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};

// Registers YoloPluginCreator via TensorRT's registration macro so that it can be found
// through getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1").
// NOTE(review): this lives in a header, so presumably every translation unit including
// it performs the registration — confirm that duplicate registration is harmless.
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
}  // namespace nvinfer1


================================================
FILE: yolov10/src/block.cpp
================================================
#include "block.h"
#include <assert.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include "config.h"
#include "yololayer.h"

// Loads a text ".wts" weight file into a name -> Weights map.
//
// Expected layout: a decimal entry count, then for each entry a name, a decimal element
// count and that many hexadecimal 32-bit words (the raw bit patterns of the values).
// Every entry is returned as kFLOAT Weights whose `values` buffer is malloc'ed here and
// never freed by this function; the caller owns the buffers.
//
// Fix: the buffer was sized with sizeof(val) — the size of the pointer — instead of the
// element size, over-allocating every tensor (2x on 64-bit targets). `count` and `size`
// are also initialised so a failed extraction cannot leave them indeterminate.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> WeightMap;

    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        std::string name;
        input >> name >> std::dec >> size;
        wt.type = nvinfer1::DataType::kFLOAT;

        // One 32-bit word per element.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; x++) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        WeightMap[name] = wt;
    }
    return WeightMap;
}

// Adds an inference-time batch-norm to `network` as a per-channel scale layer.
//
// Reads `<lname>.weight`, `.bias`, `.running_mean` and `.running_var` from `weightMap`
// and folds them into
//     scale = gamma / sqrt(var + eps)
//     shift = beta - mean * gamma / sqrt(var + eps)
//     power = 1
// The three coefficient arrays are malloc'ed here and not freed by this function.
// `weightMap` is received by value, so the `.scale` / `.shift` / `.power` entries stored
// below land in this call's private copy of the map only.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps) {
    float* gamma = (float*)weightMap[lname + ".weight"].values;
    float* beta = (float*)weightMap[lname + ".bias"].values;
    float* mean = (float*)weightMap[lname + ".running_mean"].values;
    float* var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // The three coefficient arrays are independent, so they are filled in a single pass.
    for (int ch = 0; ch < len; ++ch) {
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    nvinfer1::IScaleLayer* bnLayer = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Conv -> BatchNorm -> SiLU block.
//
// Adds a bias-free k x k convolution with `ch` outputs, stride `s`, padding k / 2 and `g`
// groups (weights: `<lname>.conv.weight`), a batch-norm from `<lname>.bn` with eps 1e-3,
// and then SiLU expressed as x * sigmoid(x). Returns the final element-wise product layer.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, int k, int s, std::string lname, int g) {
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* conv =
            network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], noBias);
    assert(conv);
    const int pad = k / 2;
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{pad, pad});
    conv->setNbGroups(g);

    nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor& normalized = *bn->getOutput(0);

    // SiLU(x) = x * sigmoid(x)
    auto* gate = network->addActivation(normalized, nvinfer1::ActivationType::kSIGMOID);
    auto* silu = network->addElementWise(normalized, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Conv -> BatchNorm block without an activation.
//
// Adds a bias-free k x k convolution with `ch` outputs, stride `s`, padding k / 2 and `g`
// groups (weights: `<lname>.conv.weight`), followed by a batch-norm from `<lname>.bn`
// with eps 1e-3. Returns the batch-norm (scale) layer.
nvinfer1::ILayer* convBn(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, int k, int s, std::string lname, int g = 1) {
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* conv =
            network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], noBias);
    assert(conv);
    const int pad = k / 2;
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{pad, pad});
    conv->setNbGroups(g);

    return addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
}

// Bottleneck: two stacked 3x3 convBnSiLU blocks (`<lname>.cv1`, `<lname>.cv2`), each with
// `c2` output channels. When `shortcut` is set and c1 == c2 the block input is added to
// the result and the sum layer is returned; otherwise the second conv block is returned.
// `e` is not used.
nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, std::string lname) {
    auto* first = convBnSiLU(network, weightMap, input, c2, 3, 1, lname + ".cv1");
    auto* second = convBnSiLU(network, weightMap, *first->getOutput(0), c2, 3, 1, lname + ".cv2");

    const bool useResidual = shortcut && c1 == c2;
    if (!useResidual) {
        return second;
    }
    return network->addElementWise(input, *second->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// C2F block.
//
// A 1x1 convBnSiLU (`<lname>.cv1`) produces 2 * int(c2 * e) channels, which are split in
// half along axis 1. Both halves start a running concatenation; `n` bottlenecks
// (`<lname>.m.<i>`) are chained starting from the second half, and each bottleneck output
// is appended to the concatenation. A final 1x1 convBnSiLU (`<lname>.cv2`) maps the
// concatenation to `c2` channels and is returned. `c1` is not used.
nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                 int c2, int n, bool shortcut, float e, std::string lname) {
    const int hidden = (float)c2 * e;

    nvinfer1::IElementWiseLayer* stem = convBnSiLU(network, weightMap, input, 2 * hidden, 1, 1, lname + ".cv1");
    nvinfer1::ITensor* stemOut = stem->getOutput(0);
    const nvinfer1::Dims d = stemOut->getDimensions();

    // Split the stem output into two equal halves along the channel axis.
    const nvinfer1::Dims4 halfSize{d.d[0], d.d[1] / 2, d.d[2], d.d[3]};
    const nvinfer1::Dims4 unitStep{1, 1, 1, 1};
    nvinfer1::ISliceLayer* lower = network->addSlice(*stemOut, nvinfer1::Dims4{0, 0, 0, 0}, halfSize, unitStep);
    nvinfer1::ISliceLayer* upper =
            network->addSlice(*stemOut, nvinfer1::Dims4{0, d.d[1] / 2, 0, 0}, halfSize, unitStep);

    nvinfer1::ITensor* halves[] = {lower->getOutput(0), upper->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(halves, 2);

    nvinfer1::ITensor* chain = upper->getOutput(0);
    for (int i = 0; i < n; ++i) {
        auto* block = bottleneck(network, weightMap, *chain, hidden, hidden, shortcut, 1.0,
                                 lname + ".m." + std::to_string(i));
        chain = block->getOutput(0);

        // Grow the concatenation with this bottleneck's output.
        nvinfer1::ITensor* grown[] = {cat->getOutput(0), chain};
        cat = network->addConcatenation(grown, 2);
    }

    return convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, lname + ".cv2");
}

// C2 block.
//
// A 1x1 convBnSiLU (`<lname>.cv1`) produces 2 * int(c2 * e) channels, split in half along
// axis 1. The first half runs through `n` chained bottlenecks (`<lname>.m.<i>`); the
// result is concatenated with the untouched second half and passed through a final 1x1
// convBnSiLU (`<lname>.cv2`) with `c2` outputs, which is returned. `c1` is not used.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname) {
    assert(network != nullptr);
    const int hidden = static_cast<int>(c2 * e);

    nvinfer1::IElementWiseLayer* stem = convBnSiLU(network, weightMap, input, 2 * hidden, 1, 1, lname + ".cv1");
    nvinfer1::ITensor* stemOut = stem->getOutput(0);

    // Halve the stem output along the channel axis.
    const nvinfer1::Dims dims = stemOut->getDimensions();
    const nvinfer1::Dims4 halfSize{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]};
    const nvinfer1::Dims4 unitStep{1, 1, 1, 1};
    nvinfer1::ISliceLayer* lower = network->addSlice(*stemOut, nvinfer1::Dims4{0, 0, 0, 0}, halfSize, unitStep);
    nvinfer1::ISliceLayer* upper =
            network->addSlice(*stemOut, nvinfer1::Dims4{0, dims.d[1] / 2, 0, 0}, halfSize, unitStep);

    // Chain the bottlenecks on the first half.
    nvinfer1::ITensor* chain = lower->getOutput(0);
    for (int i = 0; i < n; ++i) {
        auto* block = bottleneck(network, weightMap, *chain, hidden, hidden, shortcut, 1.0,
                                 lname + ".m." + std::to_string(i));
        chain = block->getOutput(0);
    }

    nvinfer1::ITensor* merged[2] = {chain, upper->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(merged, 2);

    return convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, lname + ".cv2");
}

// SPPF block.
//
// A 1x1 convBnSiLU (`<lname>.cv1`) reduces to c1 / 2 channels; three k x k max-pools
// (stride 1, padding k / 2) are applied one after another; the conv output and the three
// pool outputs are concatenated and passed through a 1x1 convBnSiLU (`<lname>.cv2`) with
// `c2` outputs, which is returned.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname) {
    const int hidden = c1 / 2;
    nvinfer1::IElementWiseLayer* reduce = convBnSiLU(network, weightMap, input, hidden, 1, 1, lname + ".cv1");

    // Size-preserving k x k max-pool: stride 1, padding k / 2.
    auto addPool = [&](nvinfer1::ITensor& src) {
        nvinfer1::IPoolingLayer* pool =
                network->addPoolingNd(src, nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
        pool->setStrideNd(nvinfer1::DimsHW{1, 1});
        pool->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
        return pool;
    };

    nvinfer1::IPoolingLayer* p1 = addPool(*reduce->getOutput(0));
    nvinfer1::IPoolingLayer* p2 = addPool(*p1->getOutput(0));
    nvinfer1::IPoolingLayer* p3 = addPool(*p2->getOutput(0));

    nvinfer1::ITensor* pyramid[] = {reduce->getOutput(0), p1->getOutput(0), p2->getOutput(0), p3->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(pyramid, 4);
    return convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, lname + ".cv2");
}

// DFL head.
//
// Reshapes `input` to (kBatchSize, 4, 16, grid) and permutes it to
// (kBatchSize, 16, 4, grid), applies softmax over axis 1, runs a bias-free 1x1 convolution
// with a single output channel (weights: weightMap[lname], stride `s`, padding `p`), and
// reshapes the result to (kBatchSize, 4, grid). `ch` and `k` are not used.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
    // (B, 4 * 16, grid) -> (B, 4, 16, grid) -> (B, 16, 4, grid)
    nvinfer1::IShuffleLayer* toBins = network->addShuffle(input);
    toBins->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid});
    toBins->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3});

    // Softmax over axis 1 (the axes argument is a bit mask).
    nvinfer1::ISoftMaxLayer* binProbs = network->addSoftMax(*toBins->getOutput(0));
    binProbs->setAxes(1 << 1);

    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* project =
            network->addConvolutionNd(*binProbs->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], noBias);
    project->setStrideNd(nvinfer1::DimsHW{s, s});
    project->setPaddingNd(nvinfer1::DimsHW{p, p});

    nvinfer1::IShuffleLayer* flatten = network->addShuffle(*project->getOutput(0));
    flatten->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid});
    return flatten;
}

// Adds the "YoloLayer_TRT" (version "1") plugin to `network`.
//
//   dets         layers whose first output becomes a plugin input, in order
//   px_arry      array of `px_arry_num` stride values, one per entry of `dets`
//
// The plugin is configured through a single int field named "combinedInfo" laid out as
// [kNumClass, kInputW, kInputH, kMaxNumOutputBbox, px_arry...], which is the layout
// YoloPluginCreator::createPlugin() expects. Returns the plugin layer.
//
// The creator lookup, the plugin creation and the layer insertion are now asserted
// instead of being dereferenced unchecked; the comments no longer claim five header
// elements where the code uses four.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector<nvinfer1::ILayer*> dets,
                                       const int* px_arry, int px_arry_num) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    assert(creator && "YoloLayer_TRT plugin creator is not registered");

    const int netinfo_count = 4;  // kNumClass, kInputW, kInputH, kMaxNumOutputBbox
    const int total_count = netinfo_count + px_arry_num;

    std::vector<int> combinedInfo(total_count);
    combinedInfo[0] = kNumClass;
    combinedInfo[1] = kInputW;
    combinedInfo[2] = kInputH;
    combinedInfo[3] = kMaxNumOutputBbox;

    // The stride values follow the four header elements.
    std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);

    // combinedInfo must stay alive until createPlugin() returns: the field only points at it.
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";  // must match the name checked by the plugin creator
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = combinedInfo.size();

    nvinfer1::PluginFieldCollection pluginFieldCollection{};
    pluginFieldCollection.nbFields = 1;
    pluginFieldCollection.fields = &pluginField;

    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    assert(pluginObject && "failed to create YoloLayer_TRT plugin");

    // Each detection layer contributes its first output as one plugin input.
    std::vector<nvinfer1::ITensor*> inputTensors;
    inputTensors.reserve(dets.size());
    for (auto det : dets) {
        inputTensors.push_back(det->getOutput(0));
    }

    // NOTE(review): pluginObject is not destroyed here, matching the previous behaviour —
    // confirm whether the network keeps its own copy before adding a destroy() call.
    nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject);
    assert(yoloLayer);

    return yoloLayer;
}

// SCDown block.
//
// A 1x1 convBnSiLU (`<lname>.cv1`) with `ch` outputs is followed by a bias-free k x k
// convolution with `ch` outputs and `ch` groups (one group per channel), stride `s` and
// padding k / 2 (weights: `<lname>.cv2.conv.weight`), then a batch-norm from
// `<lname>.cv2.bn` with eps 1e-3 and no activation. Returns the batch-norm layer.
nvinfer1::ILayer* SCDown(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, int k, int s, std::string lname) {
    auto* pointwise = convBnSiLU(network, weightMap, input, ch, 1, 1, lname + ".cv1");

    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* depthwise = network->addConvolutionNd(
            *pointwise->getOutput(0), ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".cv2.conv.weight"], noBias);
    assert(depthwise);
    const int pad = k / 2;
    depthwise->setStrideNd(nvinfer1::DimsHW{s, s});
    depthwise->setPaddingNd(nvinfer1::DimsHW{pad, pad});
    depthwise->setNbGroups(ch);

    nvinfer1::IScaleLayer* norm =
            addBatchNorm2d(network, weightMap, *depthwise->getOutput(0), lname + ".cv2.bn", 1e-3);
    assert(norm);
    return norm;
}

// Multi-head self-attention block built from TensorRT layers.
//
//   input       tensor whose dimensions 0, 2 and 3 are read as B, H and W (so presumably
//               a 4-D NCHW tensor with static shape — TODO confirm)
//   dim         channel count of the block's output (and of the `.pe` / `.proj` convs)
//   num_heads   number of heads; must be non-zero because `dim / num_heads` is computed
//   attn_ratio  key_dim = int(head_dim * attn_ratio)
//   lname       weight-name prefix; uses `<lname>.qkv`, `<lname>.pe` and `<lname>.proj`
//
// Returns the final `.proj` conv+BN layer. The inline comments quote the reference
// PyTorch expression each group of layers implements.
nvinfer1::ILayer* Attention(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                            nvinfer1::ITensor& input, int dim, int num_heads, float attn_ratio, std::string lname) {
    int head_dim = dim / num_heads;
    int key_dim = head_dim * attn_ratio;
    float scale = pow(key_dim, -0.5);
    int nh_kd = key_dim * num_heads;
    // Total qkv channels: per head, key_dim for q, key_dim for k and head_dim for v.
    int h = dim + nh_kd * 2;

    auto d = input.getDimensions();
    int B = d.d[0];
    int H = d.d[2];
    int W = d.d[3];
    int N = H * W;
    auto* qkv = convBn(network, weightMap, input, h, 1, 1, lname + ".qkv");
    // qkv.view(B, self.num_heads, -1, N)
    auto shuffle = network->addShuffle(*qkv->getOutput(0));
    shuffle->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, -1, N});
    // q, k, v = .split([self.key_dim, self.key_dim, self.head_dim], dim=2)
    auto d1 = shuffle->getOutput(0)->getDimensions();
    auto q = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0},
                               nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    auto k = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim, 0},
                               nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    auto v = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim * 2, 0},
                               nvinfer1::Dims4{d1.d[0], d1.d[1], head_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    // attn = ((q.transpose(-2, -1) @ k) * self.scale)
    auto qT = network->addShuffle(*q->getOutput(0));
    qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});
    auto matmul = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0),
                                             nvinfer1::MatrixOperation::kNONE);
    // The multiplication by `scale` is done with a uniform scale layer (shift 0, power 1).
    // Known leak: the three one-element buffers below are malloc'ed and never freed here.
    float* scale_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    scale_val[0] = scale;
    nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1};
    float* shift_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    shift_val[0] = 0;
    nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1};
    float* power_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    power_val[0] = 1;
    nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1};
    nvinfer1::IScaleLayer* scaleLayer =
            network->addScale(*matmul->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w);
    // attn = attn.softmax(dim=-1)
    nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*scaleLayer->getOutput(0));
    // Axes are given as a bit mask: bit 3 selects the last of the four dimensions.
    softmax->setAxes(1 << 3);
    // x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) + self.pe(v.reshape(B, -1, H, W))
    auto attnT = network->addShuffle(*softmax->getOutput(0));
    attnT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});
    auto matmul2 = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attnT->getOutput(0),
                                              nvinfer1::MatrixOperation::kNONE);
    auto reshape = network->addShuffle(*matmul2->getOutput(0));
    reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W});
    auto v_reshape = network->addShuffle(*v->getOutput(0));
    v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W});
    // self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)
    auto pe = convBn(network, weightMap, *v_reshape->getOutput(0), dim, 3, 1, lname + ".pe", dim);
    auto sum = network->addElementWise(*reshape->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    // x = self.proj(x)
    // self.proj = Conv(dim, dim, 1, act=False)
    auto proj = convBn(network, weightMap, *sum->getOutput(0), dim, 1, 1, lname + ".proj");
    return proj;
}

// Adds the PSA block to `network` and returns its last layer.
//
// Structure (mirrors the Python-style notes kept inline below):
//   cv1 -> channel split into (a, b) -> b = b + attn(b) -> b = b + ffn(b) -> cv2(cat(a, b))
//
// network   : network definition that receives the new layers.
// weightMap : weights looked up by the helper builders via names derived from `lname`.
// input     : tensor fed into cv1.
// ch        : channel count handed to cv2; the split size is int(ch * 0.5).
// lname     : prefix used for weight names (".cv1", ".attn", ".ffn.0", ".ffn.1", ".cv2").
nvinfer1::ILayer* PSA(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                      nvinfer1::ITensor& input, int ch, std::string lname) {
    const int half_ch = int(ch * 0.5);
    auto cv1 = convBnSiLU(network, weightMap, input, half_ch * 2, 1, 1, lname + ".cv1");

    // a, b = split((self.c, self.c), dim=1)
    nvinfer1::ITensor* cv1_out = cv1->getOutput(0);
    auto cv1_dims = cv1_out->getDimensions();
    auto part_a = network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0},
                                    nvinfer1::Dims4{cv1_dims.d[0], half_ch, cv1_dims.d[2], cv1_dims.d[3]},
                                    nvinfer1::Dims4{1, 1, 1, 1});
    auto part_b = network->addSlice(*cv1_out, nvinfer1::Dims4{0, half_ch, 0, 0},
                                    nvinfer1::Dims4{cv1_dims.d[0], half_ch, cv1_dims.d[2], cv1_dims.d[3]},
                                    nvinfer1::Dims4{1, 1, 1, 1});
    nvinfer1::ITensor* b_tensor = part_b->getOutput(0);

    // b = b + self.attn(b)
    auto attn_out = Attention(network, weightMap, *b_tensor, half_ch, half_ch / 64, 0.5f, lname + ".attn");
    auto attn_residual =
            network->addElementWise(*b_tensor, *attn_out->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    nvinfer1::ITensor* attn_residual_out = attn_residual->getOutput(0);

    // b = b + self.ffn(b)
    // self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
    auto ffn_expand = convBnSiLU(network, weightMap, *attn_residual_out, half_ch * 2, 1, 1, lname + ".ffn.0");
    auto ffn_project = convBn(network, weightMap, *ffn_expand->getOutput(0), half_ch, 1, 1, lname + ".ffn.1");
    auto ffn_residual = network->addElementWise(*attn_residual_out, *ffn_project->getOutput(0),
                                                nvinfer1::ElementWiseOperation::kSUM);

    // self.cv2(torch.cat((a, b), 1))
    nvinfer1::ITensor* merged_inputs[] = {part_a->getOutput(0), ffn_residual->getOutput(0)};
    nvinfer1::IConcatenationLayer* merged = network->addConcatenation(merged_inputs, 2);
    return convBnSiLU(network, weightMap, *merged->getOutput(0), ch, 1, 1, lname + ".cv2");
}

// Adds the RepVGGDW block to `network`: two convBn branches over the same input (kernel 7 and kernel 3,
// both called with `ch` as the trailing group argument) are summed, then SiLU is applied as x * sigmoid(x).
//
// Reference (Python):
//   self.conv  = Conv(ed, ed, 7, 1, 3, g=ed, act=False)
//   self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)
//   return self.act(self.conv(x) + self.conv1(x))   # act = nn.SiLU()
//
// Returns the final element-wise product layer.
nvinfer1::ILayer* RepVGGDW(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                           nvinfer1::ITensor& input, int ch, std::string lname) {
    auto branch_k7 = convBn(network, weightMap, input, ch, 7, 1, lname + ".conv", ch);
    auto branch_k3 = convBn(network, weightMap, input, ch, 3, 1, lname + ".conv1", ch);

    auto branch_sum = network->addElementWise(*branch_k7->getOutput(0), *branch_k3->getOutput(0),
                                              nvinfer1::ElementWiseOperation::kSUM);
    nvinfer1::ITensor* pre_act = branch_sum->getOutput(0);

    // SiLU(x) = x * sigmoid(x), built from an activation layer plus an element-wise product.
    auto gate = network->addActivation(*pre_act, nvinfer1::ActivationType::kSIGMOID);
    auto silu = network->addElementWise(*pre_act, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Adds the CIB block to `network` and returns its last layer.
//
// Reference (Python):
//   self.cv1 = nn.Sequential(
//       Conv(c1, c1, 3, g=c1),
//       Conv(c1, 2 * c_, 1),
//       Conv(2 * c_, 2 * c_, 3, g=2 * c_) if not lk else RepVGGDW(2 * c_),
//       Conv(2 * c_, c2, 1),
//       Conv(c2, c2, 3, g=c2),
//   )
//
// c1, c2   : channel arguments for the first and last stages.
// shortcut : when true and c1 == c2, the block input is added to the output.
// e        : ratio used to derive the hidden width c_ = int(c2 * e).
// lk       : selects RepVGGDW instead of the plain 3x3 conv for the middle stage.
// lname    : weight-name prefix; stages use ".cv1.0" ... ".cv1.4".
nvinfer1::ILayer* CIB(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                      nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, bool lk, std::string lname) {
    const int hidden = (float)c2 * e;
    const int mid_ch = 2 * hidden;
    const std::string stage = lname + ".cv1.";

    auto* stage0 = convBnSiLU(network, weightMap, input, c1, 3, 1, stage + "0", c1);
    auto* stage1 = convBnSiLU(network, weightMap, *stage0->getOutput(0), mid_ch, 1, 1, stage + "1");

    nvinfer1::ILayer* stage2;
    if (lk) {
        stage2 = RepVGGDW(network, weightMap, *stage1->getOutput(0), mid_ch, stage + "2");
    } else {
        stage2 = convBnSiLU(network, weightMap, *stage1->getOutput(0), mid_ch, 3, 1, stage + "2", mid_ch);
    }

    auto* stage3 = convBnSiLU(network, weightMap, *stage2->getOutput(0), c2, 1, 1, stage + "3");
    auto* stage4 = convBnSiLU(network, weightMap, *stage3->getOutput(0), c2, 3, 1, stage + "4", c2);

    // Residual connection only when requested and the channel arguments match.
    const bool add_residual = shortcut && c1 == c2;
    if (!add_residual) {
        return stage4;
    }
    return network->addElementWise(input, *stage4->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// Adds the C2fCIB block to `network` and returns its final convBnSiLU layer.
//
// cv1 produces 2 * c_ channels (c_ = int(c2 * e)) which are sliced into two halves along axis 1. The second
// half is passed through `n` chained CIB blocks; both halves and every CIB output are concatenated and fed
// to cv2, which is called with `c2` as its channel argument.
//
// shortcut, lk : forwarded unchanged to every CIB.
// lname        : weight-name prefix (".cv1", ".m.<i>", ".cv2").
nvinfer1::ILayer* C2fCIB(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, bool lk, float e,
                         std::string lname) {
    const int hidden = (float)c2 * e;

    nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, 2 * hidden, 1, 1, lname + ".cv1");
    nvinfer1::ITensor* cv1_out = cv1->getOutput(0);
    nvinfer1::Dims cv1_dims = cv1_out->getDimensions();

    // Split cv1's output into two equal halves along the channel axis.
    nvinfer1::ISliceLayer* first_half = network->addSlice(
            *cv1_out, nvinfer1::Dims4{0, 0, 0, 0},
            nvinfer1::Dims4{cv1_dims.d[0], cv1_dims.d[1] / 2, cv1_dims.d[2], cv1_dims.d[3]},
            nvinfer1::Dims4{1, 1, 1, 1});
    nvinfer1::ISliceLayer* second_half = network->addSlice(
            *cv1_out, nvinfer1::Dims4{0, cv1_dims.d[1] / 2, 0, 0},
            nvinfer1::Dims4{cv1_dims.d[0], cv1_dims.d[1] / 2, cv1_dims.d[2], cv1_dims.d[3]},
            nvinfer1::Dims4{1, 1, 1, 1});

    nvinfer1::ITensor* halves[] = {first_half->getOutput(0), second_half->getOutput(0)};
    nvinfer1::IConcatenationLayer* gathered = network->addConcatenation(halves, 2);

    // Chain the CIB blocks, appending each block's output to the running concatenation.
    nvinfer1::ITensor* chain_in = second_half->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        const std::string block_name = lname + ".m." + std::to_string(idx);
        auto* block = CIB(network, weightMap, *chain_in, hidden, hidden, shortcut, 1.0f, lk, block_name);
        chain_in = block->getOutput(0);

        nvinfer1::ITensor* grown[] = {gathered->getOutput(0), chain_in};
        gathered = network->addConcatenation(grown, 2);
    }

    nvinfer1::IElementWiseLayer* cv2 =
            convBnSiLU(network, weightMap, *gathered->getOutput(0), c2, 1, 1, lname + ".cv2");
    return cv2;
}


================================================
FILE: yolov10/src/calibrator.cpp
================================================
#include "calibrator.h"
#include <fstream>
#include <iostream>
#include <iterator>
#include <opencv2/dnn/dnn.hpp>
#include "cuda_utils.h"
#include "utils.h"

// Sets up the calibrator: stores the configuration, allocates one device buffer large enough for a full
// batch of 3-channel float images, and collects the file names found in `img_dir`.
//
// batchsize        : number of images returned per getBatch() call.
// input_w/input_h  : width and height each image is preprocessed to.
// img_dir          : directory scanned for calibration images.
// calib_table_name : path used when reading/writing the calibration cache.
// input_blob_name  : name the first binding is expected to have (checked in getBatch()).
// read_cache       : when false, readCalibrationCache() never loads the file contents.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
    // Element count of one batch: 3 channels x width x height x batch size.
    input_count_ = 3 * input_w * input_h * batchsize;

    const size_t device_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMalloc(&device_input_, device_bytes));

    read_files_in_dir(img_dir, img_files_);
}

// Frees the device buffer that the constructor allocated with cudaMalloc.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given at construction, i.e. how many images each getBatch() call supplies.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Supplies the next calibration batch.
//
// Reads the next `batchsize_` files from `img_files_`, preprocesses each one to input_w_ x input_h_, packs
// them into a single blob scaled by 1/255 (with blobFromImages' swapRB flag on and crop flag off), copies
// the blob into the device buffer, and publishes that buffer as bindings[0].
//
// Returns false when fewer than `batchsize_` files remain or when an image cannot be read; true otherwise.
// The image cursor is advanced only after every image in the batch has loaded.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    const auto batch_end = img_idx_ + batchsize_;
    if (batch_end > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> batch_imgs;
    for (int idx = img_idx_; idx < batch_end; idx++) {
        const auto& file_name = img_files_[idx];
        std::cout << file_name << "  " << idx << std::endl;

        cv::Mat raw = cv::imread(img_dir_ + "/" + file_name);
        if (raw.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        batch_imgs.push_back(preprocess_img(raw, input_w_, input_h_));
    }
    img_idx_ = batch_end;

    const cv::Size target_size(input_w_, input_h_);
    cv::Mat blob = cv::dnn::blobFromImages(batch_imgs, 1.0 / 255.0, target_size, cv::Scalar(0, 0, 0), true, false);

    const size_t copy_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), copy_bytes, cudaMemcpyHostToDevice));

    // The first binding is expected to be the network input this calibrator was created for.
    assert(strcmp(names[0], input_blob_name_) == 0);
    bindings[0] = device_input_;
    return true;
}

// Loads the calibration table from `calib_table_name_` into `calib_cache_`.
//
// The file is only read when `read_cache_` is set and the stream opened in a good state; otherwise the
// cache stays empty. `length` receives the number of bytes held, and the return value is a pointer to
// those bytes, or nullptr when nothing was loaded.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream table_file(calib_table_name_, std::ios::binary);
    // Keep whitespace bytes: the char iterator would otherwise skip them.
    table_file >> std::noskipws;
    if (read_cache_ && table_file.good()) {
        std::istream_iterator<char> first(table_file);
        const std::istream_iterator<char> last;
        std::copy(first, last, std::back_inserter(calib_cache_));
    }

    length = calib_cache_.size();
    if (length == 0) {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes `length` bytes starting at `cache` to the file named by `calib_table_name_` (binary mode),
// after logging the target path and size. No error is reported if the file cannot be opened.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;

    const char* raw_bytes = static_cast<const char*>(cache);
    std::ofstream table_file(calib_table_name_, std::ios::binary);
    table_file.write(raw_bytes, length);
}


================================================
FILE: yolov10/src/model.cpp
================================================
#include <cmath>
#include <iostream>

#include "block.h"
#include "calibrator.h"
#include "config.h"
#include "model.h"

// Scales a nominal channel count by the width multiplier `gw`.
// `x` is first capped at `max_channels`; the scaled value is then rounded UP to the next multiple of
// `divisor` (default 8).
static int get_width(int x, float gw, int max_channels, int divisor = 8) {
    const int capped = (max_channels < x) ? max_channels : x;
    const int blocks = int(ceil((capped * gw) / divisor));
    return blocks * divisor;
}

// Scales a nominal repeat count by the depth multiplier `gd`.
// A count of exactly 1 is returned untouched. Otherwise x * gd is rounded with ties going to the even
// neighbour (round() rounds ties away from zero, so an exact .5 above an even integer is pulled back by
// one), and the result is clamped to at least 1.
static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }

    int rounded = round(x * gd);
    const int truncated = int(x * gd);
    const bool exact_half = (x * gd - truncated == 0.5);
    if (exact_half && (truncated % 2) == 0) {
        --rounded;
    }
    return rounded < 1 ? 1 : rounded;
}

// Fills strides[i] with reference_size divided (integer division) by dimension index 2 of the first
// output of conv_layers[i], for each of the `size` layers.
void calculateStrides(nvinfer1::ILayer* conv_layers[], int size, int reference_size, int strides[]) {
    int idx = 0;
    while (idx < size) {
        const nvinfer1::Dims out_dims = conv_layers[idx]->getOutput(0)->getDimensions();
        const int map_extent = out_dims.d[2];
        strides[idx] = reference_size / map_extent;
        ++idx;
    }
}

// Builds the YOLOv10-N detection network layer by layer and returns it as a serialized engine.
//
// builder      : TensorRT builder used to create the network and run the build.
// config       : builder configuration; workspace size and (depending on USE_FP16 / USE_INT8) precision
//                flags are set on it here.
// dt           : data type of the network input tensor.
// wts_path     : path of the weight file handed to loadWeights().
// gd, gw       : depth and width multipliers (read only) fed to get_depth() / get_width().
// max_channels : channel cap (read only) fed to get_width().
//
// Returns the result of IBuilder::buildSerializedNetwork(); this is nullptr when the build fails, in which
// case an error is printed instead of the success message. The returned memory is not released here.
// The network definition, the loaded weight buffers and (for INT8 builds) the calibrator are released
// before returning.
nvinfer1::IHostMemory* buildEngineYolov10DetN(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    // Explicit-batch network: the batch dimension is part of every tensor shape below.
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
    ******************************************  YOLOV10 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLOV10 BACKBONE  ********************************************
    *******************************************************************************************************/
    auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0");
    auto* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1");
    // 11233
    auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                      get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    auto* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3");
    // 22466
    auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                      get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5");
    // 22466
    auto* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                      get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7");
    // 11233
    auto* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                      get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8");
    auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                       get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10");
    /*******************************************************************************************************
    *********************************************  YOLOV10 HEAD  ********************************************
    *******************************************************************************************************/
    // Nearest-neighbour 2x upsampling on the last two axes; shared by both resize layers.
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0));
    assert(upsample11);
    upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample11->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);

    auto* conv13 = C2F(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels),
                       get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0));
    assert(upsample14);
    upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample14->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2);

    auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels),
                       get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16");
    auto* conv17 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17");
    nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2);
    auto* conv19 = C2F(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                       get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.19");
    auto* conv20 =
            SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20");
    nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2);
    auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                          get_width(1024, gw, max_channels), get_depth(3, gd), true, true, 0.5, "model.22");

    /*******************************************************************************************************
    *********************************************  YOLOV10 OUTPUT  ******************************************
    *******************************************************************************************************/
    // Channel widths of the two per-scale branches are derived from the channel count of conv16's output.
    auto d = conv16->getOutput(0)->getDimensions();
    assert(d.nbDims == 4);
    int ch_0 = d.d[1];
    int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4));
    int base_out_channel = std::max(ch_0, std::min(kNumClass, 100));

    // output0: one2one_cv2 branch (64 channels) and one2one_cv3 branch (kNumClass channels) on conv16
    auto* conv23_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.0");
    auto* conv23_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), base_in_channel, 3, 1,
                                      "model.23.one2one_cv2.0.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd(
            *conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.0.2.weight"],
            weightMap["model.23.one2one_cv2.0.2.bias"]);
    conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_0_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3,
                                        1, "model.23.one2one_cv3.0.0.0", get_width(256, gw, max_channels));
    auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.0.0.1");
    auto* conv23_cv3_0_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), base_out_channel, 3, 1,
                                        "model.23.one2one_cv3.0.1.0", base_out_channel);
    auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.0.1.1");
    auto* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                                     weightMap["model.23.one2one_cv3.0.2.weight"],
                                                     weightMap["model.23.one2one_cv3.0.2.bias"]);
    conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2);

    // output1: same two branches on conv19
    auto* conv23_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv19->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.0");
    auto* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), base_in_channel, 3, 1,
                                      "model.23.one2one_cv2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd(
            *conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.1.2.weight"],
            weightMap["model.23.one2one_cv2.1.2.bias"]);
    conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_1_0_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3,
                                        1, "model.23.one2one_cv3.1.0.0", get_width(512, gw, max_channels));
    auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.1.0.1");
    auto* conv23_cv3_1_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), base_out_channel, 3, 1,
                                        "model.23.one2one_cv3.1.1.0", base_out_channel);
    auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.1.1.1");
    auto* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                                     weightMap["model.23.one2one_cv3.1.2.weight"],
                                                     weightMap["model.23.one2one_cv3.1.2.bias"]);
    conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2);

    // output2: same two branches on conv22
    auto* conv23_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv22->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.0");
    auto* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), base_in_channel, 3, 1,
                                      "model.23.one2one_cv2.2.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd(
            *conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.2.2.weight"],
            weightMap["model.23.one2one_cv2.2.2.bias"]);
    // Explicit unit stride / zero padding, matching how the output0 and output1 heads are configured.
    conv23_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_2_0_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), 3,
                                        1, "model.23.one2one_cv3.2.0.0", get_width(1024, gw, max_channels));
    auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.2.0.1");
    auto* conv23_cv3_2_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), base_out_channel, 3, 1,
                                        "model.23.one2one_cv3.2.1.0", base_out_channel);
    auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.2.1.1");
    auto* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                                     weightMap["model.23.one2one_cv3.2.2.weight"],
                                                     weightMap["model.23.one2one_cv3.2.2.bias"]);
    conv23_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2);

    /*******************************************************************************************************
    *********************************************  YOLOV10 DETECT  ******************************************
    *******************************************************************************************************/

    // Stride of each detection scale = kInputH / (dimension 2 of the corresponding backbone layer output).
    nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // Scale 0: flatten the spatial axes, split off the first 64 channels for DFL, keep the class channels,
    // then re-join the DFL result with the class channels along axis 1.
    nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0));
    shuffle23_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split23_0_0 = network->addSlice(
            *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_0_1 =
            network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* dfl23_0 =
            DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 2);
    cat23_dfl_0->setAxis(1);

    // Scale 1: same processing with strides[1].
    nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0));
    shuffle23_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split23_1_0 = network->addSlice(
            *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_1_1 =
            network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_1 =
            DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 2);
    cat23_dfl_1->setAxis(1);

    // Scale 2: same processing with strides[2].
    nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0));
    shuffle23_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split23_2_0 = network->addSlice(
            *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_2_1 =
            network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_2 =
            DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 2);
    cat23_dfl_2->setAxis(1);

    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
            network, std::vector<nvinfer1::ILayer*>{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // 16 MiB of builder workspace.
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // buildSerializedNetwork() yields nullptr on failure; only claim success when it actually succeeded.
    if (serialized_model == nullptr) {
        std::cerr << "Build engine failed!" << std::endl;
    } else {
        std::cout << "Build engine successfully!" << std::endl;
    }

#if !defined(USE_FP16) && defined(USE_INT8)
    // The calibrator is only needed during the build. Detach it from the config before deleting it so the
    // config is not left holding a dangling pointer.
    config->setInt8Calibrator(nullptr);
    delete calibrator;
#endif

    delete network;

    // Release the host buffers owned by the weight map entries.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

/**
 * Builds the YOLOv10-S detection network layer by layer and serializes it.
 *
 * The function loads the weights from `wts_path`, creates an explicit-batch
 * network with a `kBatchSize x 3 x kInputH x kInputW` input, assembles the
 * backbone (model.0 - model.10), the neck (model.11 - model.22) and the
 * "one2one" detection head (model.23), appends the YOLO plugin layer and
 * finally asks the builder for a serialized engine.
 *
 * @param builder       TensorRT builder used to create and build the network.
 * @param config        Builder configuration; workspace size and the
 *                      FP16/INT8 flags are set on it here.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path of the weight file handed to `loadWeights`.
 * @param gd            Depth multiplier forwarded to `get_depth`.
 * @param gw            Width multiplier forwarded to `get_width`.
 * @param max_channels  Channel cap forwarded to `get_width`.
 * @return The serialized engine, or nullptr when the network could not be
 *         created or the build failed. The weight buffers and the network
 *         definition are released before returning in either case.
 */
nvinfer1::IHostMemory* buildEngineYolov10DetS(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);

    // Releases every host buffer referenced by the weight map.
    auto releaseWeights = [&weightMap]() {
        for (auto& mem : weightMap) {
            free((void*)(mem.second.values));
        }
    };

    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    if (network == nullptr) {
        std::cerr << "Failed to create network definition!" << std::endl;
        releaseWeights();
        return nullptr;
    }

    /*******************************************************************************************************
    ******************************************  YOLOV10 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLOV10 BACKBONE  ********************************************
    *******************************************************************************************************/
    auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0");
    auto* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1");
    // 11233
    auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                      get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    auto* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3");
    // 22466
    auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                      get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5");
    // 22466
    auto* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                      get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7");
    // 11233
    auto* conv8 = C2fCIB(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                         get_width(1024, gw, max_channels), get_depth(3, gd), true, true, 0.5, "model.8");
    auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                       get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10");

    /*******************************************************************************************************
    *********************************************  YOLOV10 HEAD  ********************************************
    *******************************************************************************************************/
    // Resize factors per dimension: only the last two dimensions are doubled.
    // Kept at function scope so the array stays alive until the build is done.
    float scale[] = {1.0, 1.0, 2.0, 2.0};

    // Adds a nearest-neighbour resize layer that applies `scale` to `in`.
    auto upsample2x = [&](nvinfer1::ITensor& in) -> nvinfer1::IResizeLayer* {
        nvinfer1::IResizeLayer* up = network->addResize(in);
        assert(up);
        up->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
        up->setScales(scale, 4);
        return up;
    };

    // Concatenates two tensors with the concatenation layer's default axis.
    auto concat2 = [&](nvinfer1::ITensor* first, nvinfer1::ITensor* second) -> nvinfer1::IConcatenationLayer* {
        nvinfer1::ITensor* inputs[] = {first, second};
        return network->addConcatenation(inputs, 2);
    };

    nvinfer1::IResizeLayer* upsample11 = upsample2x(*conv10->getOutput(0));
    nvinfer1::IConcatenationLayer* cat12 = concat2(upsample11->getOutput(0), conv6->getOutput(0));
    auto* conv13 = C2F(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels),
                       get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = upsample2x(*conv13->getOutput(0));
    nvinfer1::IConcatenationLayer* cat15 = concat2(upsample14->getOutput(0), conv4->getOutput(0));
    auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels),
                       get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16");

    auto* conv17 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17");
    nvinfer1::IConcatenationLayer* cat18 = concat2(conv17->getOutput(0), conv13->getOutput(0));
    auto* conv19 = C2F(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                       get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.19");

    auto* conv20 =
            SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20");
    nvinfer1::IConcatenationLayer* cat21 = concat2(conv20->getOutput(0), conv10->getOutput(0));
    auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                          get_width(1024, gw, max_channels), get_depth(3, gd), true, true, 0.5, "model.22");

    /*******************************************************************************************************
    *********************************************  YOLOV10 OUTPUT  ******************************************
    *******************************************************************************************************/
    auto d = conv16->getOutput(0)->getDimensions();
    assert(d.nbDims == 4);
    int ch_0 = d.d[1];
    int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4));
    int base_out_channel = std::max(ch_0, std::min(kNumClass, 100));

    // Builds the head for one scale: a 64-channel branch (one2one_cv2.<idx>) and a
    // kNumClass-channel branch (one2one_cv3.<idx>), concatenated in that order.
    // `width_base` is the unscaled channel count of `feat`; it is used both as the
    // output channel count and as the group count of the first cv3 convolution.
    auto buildOne2OneHead = [&](nvinfer1::ITensor& feat, int idx,
                                int width_base) -> nvinfer1::IConcatenationLayer* {
        const std::string cv2 = "model.23.one2one_cv2." + std::to_string(idx);
        const std::string cv3 = "model.23.one2one_cv3." + std::to_string(idx);

        auto* box0 = convBnSiLU(network, weightMap, feat, base_in_channel, 3, 1, cv2 + ".0");
        auto* box1 = convBnSiLU(network, weightMap, *box0->getOutput(0), base_in_channel, 3, 1, cv2 + ".1");
        nvinfer1::IConvolutionLayer* box2 =
                network->addConvolutionNd(*box1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                          weightMap[cv2 + ".2.weight"], weightMap[cv2 + ".2.bias"]);
        // Set explicitly for every scale (the original omitted this for scale 2).
        box2->setStrideNd(nvinfer1::DimsHW{1, 1});
        box2->setPaddingNd(nvinfer1::DimsHW{0, 0});

        auto* cls00 = convBnSiLU(network, weightMap, feat, get_width(width_base, gw, max_channels), 3, 1,
                                 cv3 + ".0.0", get_width(width_base, gw, max_channels));
        auto* cls01 = convBnSiLU(network, weightMap, *cls00->getOutput(0), base_out_channel, 1, 1, cv3 + ".0.1");
        auto* cls10 = convBnSiLU(network, weightMap, *cls01->getOutput(0), base_out_channel, 3, 1, cv3 + ".1.0",
                                 base_out_channel);
        auto* cls11 = convBnSiLU(network, weightMap, *cls10->getOutput(0), base_out_channel, 1, 1, cv3 + ".1.1");
        nvinfer1::IConvolutionLayer* cls2 =
                network->addConvolutionNd(*cls11->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                          weightMap[cv3 + ".2.weight"], weightMap[cv3 + ".2.bias"]);
        cls2->setStrideNd(nvinfer1::DimsHW{1, 1});
        cls2->setPaddingNd(nvinfer1::DimsHW{0, 0});

        return concat2(box2->getOutput(0), cls2->getOutput(0));
    };

    // All three heads are created before the decode stage so layers are added to
    // the network in the same order as before the refactoring.
    nvinfer1::ILayer* head_inputs[] = {conv16, conv19, conv22};
    const int head_width_bases[] = {256, 512, 1024};
    nvinfer1::IConcatenationLayer* raw_heads[3];
    for (int i = 0; i < 3; ++i) {
        raw_heads[i] = buildOne2OneHead(*head_inputs[i]->getOutput(0), i, head_width_bases[i]);
    }

    /*******************************************************************************************************
    *********************************************  YOLOV10 DETECT  ******************************************
    *******************************************************************************************************/
    nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // Reshapes one raw head to [kBatchSize, 64 + kNumClass, cells], slices it into the
    // first 64 channels and the remaining kNumClass channels, passes the former through
    // DFL and concatenates the DFL output with the latter along axis 1.
    auto decodeScale = [&](nvinfer1::IConcatenationLayer* raw, int stride) -> nvinfer1::IConcatenationLayer* {
        const int cells = (kInputH / stride) * (kInputW / stride);

        nvinfer1::IShuffleLayer* flat = network->addShuffle(*raw->getOutput(0));
        flat->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 64 + kNumClass, cells});

        nvinfer1::ISliceLayer* box_part =
                network->addSlice(*flat->getOutput(0), nvinfer1::Dims3{0, 0, 0},
                                  nvinfer1::Dims3{kBatchSize, 64, cells}, nvinfer1::Dims3{1, 1, 1});
        nvinfer1::ISliceLayer* cls_part =
                network->addSlice(*flat->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                                  nvinfer1::Dims3{kBatchSize, kNumClass, cells}, nvinfer1::Dims3{1, 1, 1});

        nvinfer1::IShuffleLayer* dfl = DFL(network, weightMap, *box_part->getOutput(0), 4, cells, 1, 1, 0,
                                           "model.23.dfl.conv.weight");

        nvinfer1::ITensor* parts[] = {dfl->getOutput(0), cls_part->getOutput(0)};
        nvinfer1::IConcatenationLayer* merged = network->addConcatenation(parts, 2);
        merged->setAxis(1);
        return merged;
    };

    nvinfer1::IConcatenationLayer* decoded[3];
    for (int i = 0; i < 3; ++i) {
        decoded[i] = decodeScale(raw_heads[i], strides[i]);
    }

    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
            network, std::vector<nvinfer1::ILayer*>{decoded[0], decoded[1], decoded[2]}, strides, stridesLength);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // 16 MiB workspace limit.
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is never deleted in this function; confirm whether
    // the caller is expected to own it or whether it should be freed after the build.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when the builder actually produced an engine.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;
    releaseWeights();
    return serialized_model;
}

/**
 * Builds the YOLOv10-M detection network layer by layer and serializes it.
 *
 * The function loads the weights from `wts_path`, creates an explicit-batch
 * network with a `kBatchSize x 3 x kInputH x kInputW` input, assembles the
 * backbone (model.0 - model.10), the neck (model.11 - model.22) and the
 * "one2one" detection head (model.23), appends the YOLO plugin layer and
 * finally asks the builder for a serialized engine. In this variant model.8,
 * model.19 and model.22 are all `C2fCIB` blocks called with flags `true, false`.
 *
 * @param builder       TensorRT builder used to create and build the network.
 * @param config        Builder configuration; workspace size and the
 *                      FP16/INT8 flags are set on it here.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path of the weight file handed to `loadWeights`.
 * @param gd            Depth multiplier forwarded to `get_depth`.
 * @param gw            Width multiplier forwarded to `get_width`.
 * @param max_channels  Channel cap forwarded to `get_width`.
 * @return The serialized engine, or nullptr when the network could not be
 *         created or the build failed. The weight buffers and the network
 *         definition are released before returning in either case.
 */
nvinfer1::IHostMemory* buildEngineYolov10DetM(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);

    // Releases every host buffer referenced by the weight map.
    auto releaseWeights = [&weightMap]() {
        for (auto& mem : weightMap) {
            free((void*)(mem.second.values));
        }
    };

    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    if (network == nullptr) {
        std::cerr << "Failed to create network definition!" << std::endl;
        releaseWeights();
        return nullptr;
    }

    /*******************************************************************************************************
    ******************************************  YOLOV10 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLOV10 BACKBONE  ********************************************
    *******************************************************************************************************/
    auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0");
    auto* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1");
    // 11233
    auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                      get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    auto* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3");
    // 22466
    auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                      get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5");
    // 22466
    auto* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                      get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7");
    // 11233
    auto* conv8 = C2fCIB(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                         get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.8");
    auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                       get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10");

    /*******************************************************************************************************
    *********************************************  YOLOV10 HEAD  ********************************************
    *******************************************************************************************************/
    // Resize factors per dimension: only the last two dimensions are doubled.
    // Kept at function scope so the array stays alive until the build is done.
    float scale[] = {1.0, 1.0, 2.0, 2.0};

    // Adds a nearest-neighbour resize layer that applies `scale` to `in`.
    auto upsample2x = [&](nvinfer1::ITensor& in) -> nvinfer1::IResizeLayer* {
        nvinfer1::IResizeLayer* up = network->addResize(in);
        assert(up);
        up->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
        up->setScales(scale, 4);
        return up;
    };

    // Concatenates two tensors with the concatenation layer's default axis.
    auto concat2 = [&](nvinfer1::ITensor* first, nvinfer1::ITensor* second) -> nvinfer1::IConcatenationLayer* {
        nvinfer1::ITensor* inputs[] = {first, second};
        return network->addConcatenation(inputs, 2);
    };

    nvinfer1::IResizeLayer* upsample11 = upsample2x(*conv10->getOutput(0));
    nvinfer1::IConcatenationLayer* cat12 = concat2(upsample11->getOutput(0), conv6->getOutput(0));
    auto* conv13 = C2F(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels),
                       get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = upsample2x(*conv13->getOutput(0));
    nvinfer1::IConcatenationLayer* cat15 = concat2(upsample14->getOutput(0), conv4->getOutput(0));
    auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels),
                       get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16");

    auto* conv17 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17");
    nvinfer1::IConcatenationLayer* cat18 = concat2(conv17->getOutput(0), conv13->getOutput(0));
    auto* conv19 = C2fCIB(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                          get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.19");

    auto* conv20 =
            SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20");
    nvinfer1::IConcatenationLayer* cat21 = concat2(conv20->getOutput(0), conv10->getOutput(0));
    auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                          get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.22");

    /*******************************************************************************************************
    *********************************************  YOLOV10 OUTPUT  ******************************************
    *******************************************************************************************************/
    auto d = conv16->getOutput(0)->getDimensions();
    assert(d.nbDims == 4);
    int ch_0 = d.d[1];
    int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4));
    int base_out_channel = std::max(ch_0, std::min(kNumClass, 100));

    // Builds the head for one scale: a 64-channel branch (one2one_cv2.<idx>) and a
    // kNumClass-channel branch (one2one_cv3.<idx>), concatenated in that order.
    // `width_base` is the unscaled channel count of `feat`; it is used both as the
    // output channel count and as the group count of the first cv3 convolution.
    auto buildOne2OneHead = [&](nvinfer1::ITensor& feat, int idx,
                                int width_base) -> nvinfer1::IConcatenationLayer* {
        const std::string cv2 = "model.23.one2one_cv2." + std::to_string(idx);
        const std::string cv3 = "model.23.one2one_cv3." + std::to_string(idx);

        auto* box0 = convBnSiLU(network, weightMap, feat, base_in_channel, 3, 1, cv2 + ".0");
        auto* box1 = convBnSiLU(network, weightMap, *box0->getOutput(0), base_in_channel, 3, 1, cv2 + ".1");
        nvinfer1::IConvolutionLayer* box2 =
                network->addConvolutionNd(*box1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                          weightMap[cv2 + ".2.weight"], weightMap[cv2 + ".2.bias"]);
        // Set explicitly for every scale (the original omitted this for scale 2).
        box2->setStrideNd(nvinfer1::DimsHW{1, 1});
        box2->setPaddingNd(nvinfer1::DimsHW{0, 0});

        auto* cls00 = convBnSiLU(network, weightMap, feat, get_width(width_base, gw, max_channels), 3, 1,
                                 cv3 + ".0.0", get_width(width_base, gw, max_channels));
        auto* cls01 = convBnSiLU(network, weightMap, *cls00->getOutput(0), base_out_channel, 1, 1, cv3 + ".0.1");
        auto* cls10 = convBnSiLU(network, weightMap, *cls01->getOutput(0), base_out_channel, 3, 1, cv3 + ".1.0",
                                 base_out_channel);
        auto* cls11 = convBnSiLU(network, weightMap, *cls10->getOutput(0), base_out_channel, 1, 1, cv3 + ".1.1");
        nvinfer1::IConvolutionLayer* cls2 =
                network->addConvolutionNd(*cls11->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                          weightMap[cv3 + ".2.weight"], weightMap[cv3 + ".2.bias"]);
        cls2->setStrideNd(nvinfer1::DimsHW{1, 1});
        cls2->setPaddingNd(nvinfer1::DimsHW{0, 0});

        return concat2(box2->getOutput(0), cls2->getOutput(0));
    };

    // All three heads are created before the decode stage so layers are added to
    // the network in the same order as before the refactoring.
    nvinfer1::ILayer* head_inputs[] = {conv16, conv19, conv22};
    const int head_width_bases[] = {256, 512, 1024};
    nvinfer1::IConcatenationLayer* raw_heads[3];
    for (int i = 0; i < 3; ++i) {
        raw_heads[i] = buildOne2OneHead(*head_inputs[i]->getOutput(0), i, head_width_bases[i]);
    }

    /*******************************************************************************************************
    *********************************************  YOLOV10 DETECT  ******************************************
    *******************************************************************************************************/
    nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // Reshapes one raw head to [kBatchSize, 64 + kNumClass, cells], slices it into the
    // first 64 channels and the remaining kNumClass channels, passes the former through
    // DFL and concatenates the DFL output with the latter along axis 1.
    auto decodeScale = [&](nvinfer1::IConcatenationLayer* raw, int stride) -> nvinfer1::IConcatenationLayer* {
        const int cells = (kInputH / stride) * (kInputW / stride);

        nvinfer1::IShuffleLayer* flat = network->addShuffle(*raw->getOutput(0));
        flat->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 64 + kNumClass, cells});

        nvinfer1::ISliceLayer* box_part =
                network->addSlice(*flat->getOutput(0), nvinfer1::Dims3{0, 0, 0},
                                  nvinfer1::Dims3{kBatchSize, 64, cells}, nvinfer1::Dims3{1, 1, 1});
        nvinfer1::ISliceLayer* cls_part =
                network->addSlice(*flat->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                                  nvinfer1::Dims3{kBatchSize, kNumClass, cells}, nvinfer1::Dims3{1, 1, 1});

        nvinfer1::IShuffleLayer* dfl = DFL(network, weightMap, *box_part->getOutput(0), 4, cells, 1, 1, 0,
                                           "model.23.dfl.conv.weight");

        nvinfer1::ITensor* parts[] = {dfl->getOutput(0), cls_part->getOutput(0)};
        nvinfer1::IConcatenationLayer* merged = network->addConcatenation(parts, 2);
        merged->setAxis(1);
        return merged;
    };

    nvinfer1::IConcatenationLayer* decoded[3];
    for (int i = 0; i < 3; ++i) {
        decoded[i] = decodeScale(raw_heads[i], strides[i]);
    }

    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
            network, std::vector<nvinfer1::ILayer*>{decoded[0], decoded[1], decoded[2]}, strides, stridesLength);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // 16 MiB workspace limit.
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is never deleted in this function; confirm whether
    // the caller is expected to own it or whether it should be freed after the build.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when the builder actually produced an engine.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;
    releaseWeights();
    return serialized_model;
}

/**
 * Builds the YOLOv10 detection network handled by the "BL" builder and returns it as a serialized engine.
 *
 * The network is assembled layer by layer from the weights stored in `wts_path`:
 * backbone (model.0 - model.10), head (model.11 - model.22), the three one2one
 * output branches of model.23, a DFL decode per branch, and finally the YoLo plugin layer.
 *
 * @param builder       Builder used to create the network definition and to build the engine.
 * @param config        Builder configuration; the workspace size and the FP16/INT8 flags are set on it here.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path handed to loadWeights().
 * @param gd            Depth factor forwarded to get_depth().
 * @param gw            Width factor forwarded to get_width().
 * @param max_channels  Channel cap forwarded to get_width().
 * @return The result of IBuilder::buildSerializedNetwork(); nullptr if the build failed.
 *         The pointer is handed to the caller and is not released here.
 */
nvinfer1::IHostMemory* buildEngineYolov10DetBL(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                               int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    assert(network);

    /*******************************************************************************************************
    ******************************************  YOLOV10 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLOV10 BACKBONE  ********************************************
    *******************************************************************************************************/
    auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0");
    auto* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1");
    // 11233
    auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                      get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    auto* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3");
    // 22466
    auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                      get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5");
    // 22466
    auto* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                      get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7");
    // 11233
    auto* conv8 = C2fCIB(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                         get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.8");
    auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                       get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10");

    /*******************************************************************************************************
    *********************************************  YOLOV10 HEAD  ********************************************
    *******************************************************************************************************/
    // Per-dimension resize factors: the first two dimensions are kept, the last two are doubled.
    float scale[] = {1.0, 1.0, 2.0, 2.0};

    // Adds a nearest-neighbour resize layer that applies `scale` to `in`.
    auto upsample2x = [&](nvinfer1::ITensor& in) -> nvinfer1::IResizeLayer* {
        nvinfer1::IResizeLayer* up = network->addResize(in);
        assert(up);
        up->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
        up->setScales(scale, 4);
        return up;
    };

    nvinfer1::IResizeLayer* upsample11 = upsample2x(*conv10->getOutput(0));

    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);

    auto* conv13 = C2fCIB(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels),
                          get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = upsample2x(*conv13->getOutput(0));

    nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2);

    auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels),
                       get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16");
    auto* conv17 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17");
    nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2);
    auto* conv19 = C2fCIB(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                          get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.19");
    auto* conv20 =
            SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20");
    nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2);
    auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                          get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.22");

    /*******************************************************************************************************
    *********************************************  YOLOV10 OUTPUT  ******************************************
    *******************************************************************************************************/
    auto d = conv16->getOutput(0)->getDimensions();
    assert(d.nbDims == 4);
    int ch_0 = d.d[1];
    // Hidden widths of the two sub-branches, both derived from the channel count of the first head input.
    int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4));
    int base_out_channel = std::max(ch_0, std::min(kNumClass, 100));

    // Adds a bare 1x1 convolution (weights "<prefix>.weight", bias "<prefix>.bias") with explicit
    // stride 1 and padding 0, so every level is configured the same way.
    auto addConv1x1 = [&](nvinfer1::ITensor& in, int outCh, const std::string& prefix) -> nvinfer1::IConvolutionLayer* {
        nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(
                in, outCh, nvinfer1::DimsHW{1, 1}, weightMap[prefix + ".weight"], weightMap[prefix + ".bias"]);
        assert(conv);
        conv->setStrideNd(nvinfer1::DimsHW{1, 1});
        conv->setPaddingNd(nvinfer1::DimsHW{0, 0});
        return conv;
    };

    // Builds one "model.23.one2one_*.<level>" output branch on top of `feat`:
    // a 64-channel cv2 sub-branch and a kNumClass-channel cv3 sub-branch, concatenated.
    // `featCh` is used as both the output channel count and the trailing (group) argument
    // of the first cv3 convolution.
    auto buildOne2OneBranch = [&](nvinfer1::ITensor& feat, int featCh, int level) -> nvinfer1::IConcatenationLayer* {
        const std::string idx = std::to_string(level);
        const std::string cv2 = "model.23.one2one_cv2." + idx;
        const std::string cv3 = "model.23.one2one_cv3." + idx;

        auto* box0 = convBnSiLU(network, weightMap, feat, base_in_channel, 3, 1, cv2 + ".0");
        auto* box1 = convBnSiLU(network, weightMap, *box0->getOutput(0), base_in_channel, 3, 1, cv2 + ".1");
        nvinfer1::IConvolutionLayer* box2 = addConv1x1(*box1->getOutput(0), 64, cv2 + ".2");

        auto* cls00 = convBnSiLU(network, weightMap, feat, featCh, 3, 1, cv3 + ".0.0", featCh);
        auto* cls01 = convBnSiLU(network, weightMap, *cls00->getOutput(0), base_out_channel, 1, 1, cv3 + ".0.1");
        auto* cls10 = convBnSiLU(network, weightMap, *cls01->getOutput(0), base_out_channel, 3, 1, cv3 + ".1.0",
                                 base_out_channel);
        auto* cls11 = convBnSiLU(network, weightMap, *cls10->getOutput(0), base_out_channel, 1, 1, cv3 + ".1.1");
        nvinfer1::IConvolutionLayer* cls2 = addConv1x1(*cls11->getOutput(0), kNumClass, cv3 + ".2");

        nvinfer1::ITensor* parts[] = {box2->getOutput(0), cls2->getOutput(0)};
        return network->addConcatenation(parts, 2);
    };

    // The branches are created in level order so that layers are added in the same sequence as before.
    nvinfer1::IConcatenationLayer* cat23_0 =
            buildOne2OneBranch(*conv16->getOutput(0), get_width(256, gw, max_channels), 0);
    nvinfer1::IConcatenationLayer* cat23_1 =
            buildOne2OneBranch(*conv19->getOutput(0), get_width(512, gw, max_channels), 1);
    nvinfer1::IConcatenationLayer* cat23_2 =
            buildOne2OneBranch(*conv22->getOutput(0), get_width(1024, gw, max_channels), 2);

    /*******************************************************************************************************
    *********************************************  YOLOV10 DETECT  ******************************************
    *******************************************************************************************************/

    nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // Reshapes a branch output to {batch, 64 + kNumClass, cells}, runs DFL on the first 64 channels
    // and concatenates the DFL result with the remaining kNumClass channels along axis 1.
    auto decodeBranch = [&](nvinfer1::IConcatenationLayer* raw, int stride) -> nvinfer1::IConcatenationLayer* {
        const int cells = (kInputH / stride) * (kInputW / stride);

        nvinfer1::IShuffleLayer* flat = network->addShuffle(*raw->getOutput(0));
        flat->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 64 + kNumClass, cells});

        nvinfer1::ISliceLayer* boxPart =
                network->addSlice(*flat->getOutput(0), nvinfer1::Dims3{0, 0, 0},
                                  nvinfer1::Dims3{kBatchSize, 64, cells}, nvinfer1::Dims3{1, 1, 1});
        nvinfer1::ISliceLayer* clsPart =
                network->addSlice(*flat->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                                  nvinfer1::Dims3{kBatchSize, kNumClass, cells}, nvinfer1::Dims3{1, 1, 1});

        nvinfer1::IShuffleLayer* dfl =
                DFL(network, weightMap, *boxPart->getOutput(0), 4, cells, 1, 1, 0, "model.23.dfl.conv.weight");

        nvinfer1::ITensor* parts[] = {dfl->getOutput(0), clsPart->getOutput(0)};
        nvinfer1::IConcatenationLayer* merged = network->addConcatenation(parts, 2);
        merged->setAxis(1);
        return merged;
    };

    nvinfer1::IConcatenationLayer* cat23_dfl_0 = decodeBranch(cat23_0, strides[0]);
    nvinfer1::IConcatenationLayer* cat23_dfl_1 = decodeBranch(cat23_1, strides[1]);
    nvinfer1::IConcatenationLayer* cat23_dfl_2 = decodeBranch(cat23_2, strides[2]);

    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
            network, std::vector<nvinfer1::ILayer*>{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // 16 MiB of builder workspace.
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is allocated here and never released in this function - confirm who owns it.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when the builder actually produced an engine.
    if (serialized_model == nullptr) {
        std::cerr << "Build engine failed!" << std::endl;
    } else {
        std::cout << "Build engine successfully!" << std::endl;
    }

    delete network;

    // Release the weight buffers held in the map.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

nvinfer1::IHostMemory* buildEngineYolov10DetX(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    //	nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
    ******************************************  YOLOV10 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLOV10 BACKBONE  ********************************************
    *******************************************************************************************************/
    auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0");
    auto* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1");
    // 11233
    auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                      get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    auto* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3");
    // 22466
    auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                      get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5");
    // 22466
    auto* conv6 = C2fCIB(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                         get_width(512, gw, max_channels), get_depth(6, gd), true, false, 0.5, "model.6");
    auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7");
    // 11233
    auto* conv8 = C2fCIB(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                         get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.8");
    auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                       get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10");
    /*******************************************************************************************************
    *********************************************  YOLOV10 HEAD  ********************************************
    *******************************************************************************************************/
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0));
    assert(upsample11);
    upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample11->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);

    auto* conv13 = C2fCIB(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels),
                          get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.13");

    nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0));
    assert(upsample14);
    upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample14->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2);

    auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels),
                       get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16");
    auto* conv17 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17");
    nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2);
    auto* conv19 = C2fCIB(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels),
                          get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.19");
    auto* conv20 =
            SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20");
    nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2);
    auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels),
                          get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.22");

    /*******************************************************************************************************
    *********************************************  YOLOV10 OUTPUT  ******************************************
    *******************************************************************************************************/
    auto d = conv16->getOutput(0)->getDimensions();
    assert(d.nbDims == 4);
    int ch_0 = d.d[1];
    int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4));
    int base_out_channel = std::max(ch_0, std::min(kNumClass, 100));

    // output0
    auto* conv23_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv16->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.0");
    auto* conv23_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), base_in_channel, 3, 1,
                                      "model.23.one2one_cv2.0.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd(
            *conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.0.2.weight"],
            weightMap["model.23.one2one_cv2.0.2.bias"]);
    conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_0_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3,
                                        1, "model.23.one2one_cv3.0.0.0", get_width(256, gw, max_channels));
    auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.0.0.1");
    auto* conv23_cv3_0_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), base_out_channel, 3, 1,
                                        "model.23.one2one_cv3.0.1.0", base_out_channel);
    auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.0.1.1");
    auto* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                                     weightMap["model.23.one2one_cv3.0.2.weight"],
                                                     weightMap["model.23.one2one_cv3.0.2.bias"]);
    conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2);

    // output1
    auto* conv23_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv19->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.0");
    auto* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), base_in_channel, 3, 1,
                                      "model.23.one2one_cv2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd(
            *conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.1.2.weight"],
            weightMap["model.23.one2one_cv2.1.2.bias"]);
    conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_1_0_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3,
                                        1, "model.23.one2one_cv3.1.0.0", get_width(512, gw, max_channels));
    auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.1.0.1");
    auto* conv23_cv3_1_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), base_out_channel, 3, 1,
                                        "model.23.one2one_cv3.1.1.0", base_out_channel);
    auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.1.1.1");
    auto* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                                     weightMap["model.23.one2one_cv3.1.2.weight"],
                                                     weightMap["model.23.one2one_cv3.1.2.bias"]);
    conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2);

    // output2
    auto* conv23_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv22->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.0");
    auto* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), base_in_channel, 3, 1,
                                      "model.23.one2one_cv2.2.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd(
            *conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.2.2.weight"],
            weightMap["model.23.one2one_cv2.2.2.bias"]);
    auto* conv23_cv3_2_0_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), 3,
                                        1, "model.23.one2one_cv3.2.0.0", get_width(1024, gw, max_channels));
    auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.2.0.1");
    auto* conv23_cv3_2_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), base_out_channel, 3, 1,
                                        "model.23.one2one_cv3.2.1.0", base_out_channel);
    auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), base_out_channel, 1, 1,
                                        "model.23.one2one_cv3.2.1.1");
    auto* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                                     weightMap["model.23.one2one_cv3.2.2.weight"],
                                                     weightMap["model.23.one2one_cv3.2.2.bias"]);
    nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2);

    /*******************************************************************************************************
    *********************************************  YOLOV10 DETECT  ******************************************
    *******************************************************************************************************/

    nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0));
    shuffle23_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split23_0_0 = network->addSlice(
            *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_0_1 =
            network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* dfl23_0 =
            DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 2);
    cat23_dfl_0->setAxis(1);

    nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0));
    shuffle23_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split23_1_0 = network->addSlice(
            *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_1_1 =
            network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_1 =
            DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 2);
    cat23_dfl_1->setAxis(1);

    nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0));
    shuffle23_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split23_2_0 = network->addSlice(
            *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_2_1 =
            network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_2 =
            DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 2);
    cat23_dfl_2->setAxis(1);

    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
            network, std::vector<nvinfer1::ILayer*>{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}


================================================
FILE: yolov10/src/postprocess.cpp
================================================
#include "postprocess.h"

// Maps a box given in network-input coordinates back onto `img`.
//
// The image is treated as having been scaled by a single factor and centred
// inside a kInputW x kInputH canvas; this undoes the centring offset on the
// padded axis, divides by the scale factor and clamps the result to the image.
//
//   img  : image the box should be expressed in (only cols/rows are read)
//   bbox : box corners in network-input pixels, ordered {left, top, right, bottom}
// Returns the box as an integer cv::Rect inside the image bounds.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    // Per-axis ratio between network input size and image size.
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);

    float l, r, t, b;
    if (r_h > r_w) {
        // Width is the limiting side: only the vertical axis carries padding.
        float pad = (kInputH - r_w * img.rows) / 2;
        l = bbox[0] / r_w;
        r = bbox[2] / r_w;
        t = (bbox[1] - pad) / r_w;
        b = (bbox[3] - pad) / r_w;
    } else {
        // Height is the limiting side: only the horizontal axis carries padding.
        float pad = (kInputW - r_h * img.cols) / 2;
        l = (bbox[0] - pad) / r_h;
        r = (bbox[2] - pad) / r_h;
        t = bbox[1] / r_h;
        b = bbox[3] / r_h;
    }

    // Clamp the top-left corner to the image, then limit the extent so the
    // rectangle never reaches past the right/bottom border.
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    const int left = int(round(l));
    const int top = int(round(t));
    const int width = std::max(0, std::min(int(round(r - l)), img.cols - left));
    const int height = std::max(0, std::min(int(round(b - t)), img.rows - top));

    return cv::Rect(left, top, width, height);
}

// Appends to `res` every detection in `output` whose confidence is not
// less than or equal to `conf_thresh`.
//
// Layout read from `output`: element 0 is the number of records, followed by
// that many records of sizeof(Detection)/sizeof(float) floats each; the float
// at offset 4 inside a record is compared against the threshold.
// `res` is appended to, not cleared. `tokp` is accepted but not used.
void get_topk(std::vector<Detection>& res, float* output, float conf_thresh, int tokp) {
    const int floats_per_det = sizeof(Detection) / sizeof(float);
    for (int idx = 0; idx < output[0]; ++idx) {
        const float* record = output + 1 + floats_per_det * idx;
        // Written as a negated "<=" so that the comparison outcome matches the
        // skip condition exactly for every float value.
        if (!(record[4] <= conf_thresh)) {
            Detection det{};
            memcpy(&det, record, floats_per_det * sizeof(float));
            res.push_back(det);
        }
    }
}

void batch_topk(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
                float conf_thresh, int topk) {
    res_batch.resize(batch_size);
    for (int i = 0; i < batch_size; i++) {
        get_topk(res_batch[i], &output[i * output_size], conf_thresh, topk);
    }
}

// Draws every detection of res_batch[i] onto img_batch[i]: a rectangle for the
// box plus the integer class id as text just above the box's top-left corner.
// The images are modified in place.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

    for (size_t img_idx = 0; img_idx < img_batch.size(); ++img_idx) {
        cv::Mat& canvas = img_batch[img_idx];
        for (auto& det : res_batch[img_idx]) {
            // Convert the box from network-input coordinates to image coordinates.
            const cv::Rect box = get_rect(canvas, det.bbox);
            cv::rectangle(canvas, box, box_color, 2);
            const std::string caption = std::to_string((int)det.class_id);
            cv::putText(canvas, caption, cv::Point(box.x, box.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
        }
    }
}


================================================
FILE: yolov10/src/preprocess.cu
================================================
#include "cuda_utils.h"
#include "preprocess.h"

// Host-side staging buffer for one source image; allocated with
// cudaMallocHost() in cuda_preprocess_init() and released in
// cuda_preprocess_destroy().
static uint8_t* img_buffer_host = nullptr;
// Device-side copy of the source image read by warpaffine_kernel; allocated
// with cudaMalloc() in cuda_preprocess_init() and released in
// cuda_preprocess_destroy().
static uint8_t* img_buffer_device = nullptr;

// Resamples a 3-channel 8-bit interleaved image into a planar float image
// through the 2x3 affine map `d2s` (destination pixel -> source position).
//
// Each thread handles the single destination pixel whose linear index is
// `position`; threads with position >= edge do nothing.
//
//   src            : source pixels, 3 bytes per pixel, rows `src_line_size` bytes apart
//   src_width/height : source size in pixels
//   dst            : output, three consecutive planes of dst_width * dst_height floats
//   const_value_st : value used for every channel where the source has no pixel
//   d2s            : affine coefficients, value[0..2] for x and value[3..5] for y
//   edge           : number of destination pixels to process
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Destination pixel coordinates recovered from the linear thread index.
    int dx = position % dst_width;
    int dy = position / dst_width;
    // Source position for this destination pixel (affine map plus a 0.5 offset).
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // Bilinear interpolation between the four source pixels around
        // (src_x, src_y).
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        // Weights of the top-left, top-right, bottom-left and bottom-right neighbours.
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Every neighbour starts as the fill colour and is replaced below only
        // when it lies inside the source image.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb
    // (swaps the first and third channel; assumes the source is stored as BGR)
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb
    // (each channel is written to its own plane of `area` floats)
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
    // Disabled debug override that would write a constant to every channel:
    //    *pdst_c0 = 0.1;
    //    *pdst_c1 = 0.1;
    //    *pdst_c2 = 0.1;
}

void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    int img_size = src_width * src_height * 3;
    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));

    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = ceil(jobs / (float)threads);
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
}

// Preprocesses every image of `img_batch` into consecutive slots of `dst`.
//
// Slot i starts at dst + i * (dst_width * dst_height * 3). The stream is
// synchronised after each image, so the shared staging buffers used by
// cuda_preprocess() are free again before the next image is copied in.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int floats_per_image = dst_width * dst_height * 3;
    size_t slot = 0;
    for (cv::Mat& img : img_batch) {
        float* slot_dst = dst + floats_per_image * slot;
        cuda_preprocess(img.ptr(), img.cols, img.rows, slot_dst, dst_width, dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        ++slot;
    }
}

// Allocates the staging buffers used by cuda_preprocess(): one pinned host
// buffer and one device buffer, each of max_image_size * 3 bytes.
// NOTE(review): max_image_size is presumably the largest supported pixel
// count (width * height) — confirm against the caller.
void cuda_preprocess_init(int max_image_size) {
    const int buffer_bytes = max_image_size * 3;
    // Pinned host memory that images are copied into before the upload.
    CUDA_CHECK(cudaMallocHost(reinterpret_cast<void**>(&img_buffer_host), buffer_bytes));
    // Device memory the kernel reads the source image from.
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&img_buffer_device), buffer_bytes));
}

// Releases the staging buffers allocated by cuda_preprocess_init().
//
// The pointers are reset to nullptr after being freed, so calling this
// function again (or before init) is a no-op instead of freeing a dangling
// pointer a second time.
void cuda_preprocess_destroy() {
    if (img_buffer_device != nullptr) {
        CUDA_CHECK(cudaFree(img_buffer_device));
        img_buffer_device = nullptr;
    }
    if (img_buffer_host != nullptr) {
        CUDA_CHECK(cudaFreeHost(img_buffer_host));
        img_buffer_host = nullptr;
    }
}


================================================
FILE: yolov10/yolov10_det.cpp
================================================
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger instance handed to the TensorRT builder and runtime in this file.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in one image's output block: room for kMaxNumOutputBbox
// Detection records plus one leading float (the post-processing code reads
// element 0 as the number of records).
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

// Builds a TensorRT engine for the requested model variant from a .wts file
// and writes the serialized plan to `engine_name`.
//
//   wts_name     : path of the weight file passed to the engine builder
//   engine_name  : path of the plan file to create
//   type         : model variant, one of "n", "s", "m", "b", "l", "x"
//   gd, gw       : depth / width multipliers forwarded to the builder
//   max_channels : channel cap forwarded to the builder
//
// Any failure (unknown variant, build failure, file cannot be opened or
// written) is reported on stderr and terminates the process with a non-zero
// exit status. The checks are explicit rather than assert-based so they stay
// active when NDEBUG is defined.
void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& type, float& gd, float& gw,
                      int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    if (builder == nullptr) {
        std::cerr << "could not create TensorRT builder" << std::endl;
        exit(1);
    }
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    // Releases everything created so far; deleting a null pointer is a no-op.
    auto release = [&]() {
        delete serialized_engine;
        delete config;
        delete builder;
    };

    if (type == "n") {
        serialized_engine = buildEngineYolov10DetN(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else if (type == "s") {
        serialized_engine = buildEngineYolov10DetS(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else if (type == "m") {
        serialized_engine = buildEngineYolov10DetM(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else if (type == "b" || type == "l") {
        serialized_engine = buildEngineYolov10DetBL(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else if (type == "x") {
        serialized_engine = buildEngineYolov10DetX(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else {
        std::cerr << "Unsupported type!" << std::endl;
        release();
        // Non-zero status: this is an error, not a successful run.
        exit(1);
    }

    if (serialized_engine == nullptr) {
        std::cerr << "could not build serialized engine" << std::endl;
        release();
        exit(1);
    }

    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "could not open plan output file" << std::endl;
        release();
        exit(1);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    p.close();
    if (!p) {
        // A short write (e.g. disk full) would otherwise leave a truncated plan unnoticed.
        std::cerr << "could not write plan output file" << std::endl;
        release();
        exit(1);
    }

    release();
}

// Loads a serialized TensorRT plan from `engine_name` and creates the runtime,
// engine and execution context for it.
//
//   engine_name : path of the plan file
//   runtime     : receives the created IRuntime
//   engine      : receives the deserialized ICudaEngine
//   context     : receives the IExecutionContext created from the engine
//
// The caller owns the three returned objects. Any failure (file unreadable or
// empty, runtime/engine/context creation failing) is reported on stderr and
// terminates the process with a non-zero exit status; the checks are explicit
// rather than assert-based so they stay active when NDEBUG is defined.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }

    // Determine the file size; tellg() yields -1 on failure.
    file.seekg(0, file.end);
    const std::streamoff length = file.tellg();
    file.seekg(0, file.beg);
    if (length <= 0) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }

    // The vector releases the plan bytes automatically on every exit path.
    std::vector<char> serialized_engine(static_cast<size_t>(length));
    file.read(serialized_engine.data(), length);
    if (!file) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }
    file.close();

    *runtime = createInferRuntime(gLogger);
    if (*runtime == nullptr) {
        std::cerr << "could not create TensorRT runtime" << std::endl;
        exit(1);
    }
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), serialized_engine.size());
    if (*engine == nullptr) {
        std::cerr << "could not deserialize engine " << engine_name << std::endl;
        exit(1);
    }
    *context = (*engine)->createExecutionContext();
    if (*context == nullptr) {
        std::cerr << "could not create execution context" << std::endl;
        exit(1);
    }
}

// Allocates the buffers needed to run the engine on one batch.
//
//   engine               : engine expected to expose exactly two bindings,
//                          input at index 0 and output at index 1 (asserted)
//   input_buffer_device  : receives device memory for kBatchSize * 3 * kInputH * kInputW floats
//   output_buffer_device : receives device memory for kBatchSize * kOutputSize floats
//   output_buffer_host   : receives a new[]-allocated host array of the same
//                          element count as the device output
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);
    // Look the bindings up by tensor name and check they sit at the positions
    // the rest of the program relies on.
    const int input_slot = engine->getBindingIndex(kInputTensorName);
    const int output_slot = engine->getBindingIndex(kOutputTensorName);
    assert(input_slot == 0);
    assert(output_slot == 1);

    // Device-side buffers.
    const size_t input_bytes = kBatchSize * 3 * kInputH * kInputW * sizeof(float);
    const size_t output_bytes = kBatchSize * kOutputSize * sizeof(float);
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(input_buffer_device), input_bytes));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(output_buffer_device), output_bytes));

    // Host-side buffer the results are copied back into.
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}

// Runs the engine on the current contents of the input binding and copies the
// results back to the host.
//
//   context   : execution context to run
//   stream    : stream the inference and the copy are enqueued on
//   buffers   : binding pointers; buffers[1] is read as the device output
//   output    : host destination for batchsize * kOutputSize floats
//   batchsize : number of images whose output is copied back
//
// The function returns only after the stream has been synchronised, so
// `output` is valid on return. The printed time covers enqueue, execution and
// the device-to-host copy: the clock is stopped after the synchronisation,
// because the enqueue and copy calls themselves return before the GPU work
// has finished.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize) {
    // steady_clock is monotonic, which is what an elapsed-time measurement needs.
    auto start = std::chrono::steady_clock::now();
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "enqueueV2 failed" << std::endl;
    }

    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                               stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    auto end = std::chrono::steady_clock::now();
    std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
              << "ms" << std::endl;
}

// Parses the command line into the output parameters.
//
// Accepted forms:
//   -s <wts> <engine> <variant>   (argc == 5 or argc == 7)
//       sets wts and engine, then selects the model variant from the FIRST
//       character of <variant> ('n', 's', 'm', 'b', 'l' or 'x') and fills
//       type, gd, gw and max_channels accordingly.
//   -d <engine> <img_dir>         (argc == 4)
//       sets engine and img_dir.
//
// Returns true when one of the forms matched. Note that for "-s" with an
// unknown variant, wts and engine have already been assigned when false is
// returned.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                float& gd, float& gw, int& max_channels) {
    // One row per supported variant; the multipliers are kept as double
    // literals and converted to float on assignment.
    struct Variant {
        char key;
        double depth;
        double width;
        int channels;
        const char* name;
    };
    static const Variant kVariants[] = {
            {'n', 0.33, 0.25, 1024, "n"}, {'s', 0.33, 0.50, 1024, "s"}, {'m', 0.67, 0.75, 768, "m"},
            {'b', 0.67, 1.0, 512, "b"},   {'l', 1.0, 1.0, 512, "l"},    {'x', 1.0, 1.25, 512, "x"},
    };

    if (argc < 4)
        return false;

    const std::string mode(argv[1]);

    if (mode == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string sub_type(argv[4]);

        for (const Variant& variant : kVariants) {
            if (sub_type[0] != variant.key)
                continue;
            gd = static_cast<float>(variant.depth);
            gw = static_cast<float>(variant.width);
            max_channels = variant.channels;
            type = variant.name;
            return true;
        }
        return false;
    }

    if (mode == "-d" && argc == 4) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        return true;
    }

    return false;
}

int main(int argc, char** argv) {
    // -s ../models/yolov10n.wts ../models/yolov10n.fp32.trt n
    // -d ../models/yolov10n.fp32.trt ../images
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string type = "";
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov10_det -s [.wts] [.engine] [n/s/m/b/l/x]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov10_det -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, type, gd, gw, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            if (img.empty()) {
                std::cerr << "Fatal error: image cannot open!" << std::endl;
                return -1;
            }
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize);
        // output_buffer_host保存前100个值到文件
        //        std::ofstream out_file("../output.txt");
        //        for (int i = 0; i < 100; i++) {
        //            out_file << output_buffer_host[i] << std::endl;
        //        }
        //        out_file.close();

        std::vector<std::vector<Detection>> res_batch;
        batch_topk(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh);

        // print results
        for (size_t j = 0; j < res_batch.size(); j++) {
            for (size_t k = 0; k < res_batch[j].size(); k++) {
                std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", "
                          << res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", "
                          << res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf
                          << ", class_id: " << res_batch[j][k].class_id << std::endl;
            }
        }

        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    return 0;
}


================================================
FILE: yolov10/yolov10_det_trt.py
================================================
# -*- coding: UTF-8 -*-
"""
  @Author: mpj
  @Date  : 2024/7/24 下午7:11
  @version V1.0
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Detection confidence threshold.
# NOTE(review): presumably applied during post-processing — the use site is
# not visible in this part of the file.
CONF_THRESH = 0.5
# IoU threshold.
# NOTE(review): no use is visible in this part of the file — confirm whether
# it is still needed.
IOU_THRESHOLD = 0.4
# Number of values that make up one detection record in the engine output.
DET_NUM = 6


def get_img_path_batches(batch_size, img_dir):
    """
    description: Collect the paths of all files below img_dir (recursively,
                 in os.walk order) and group them into batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory to scan
    return:
        a list of lists of paths; every batch holds batch_size paths
        except possibly the last one, which holds the remainder
    """
    all_paths = (
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(img_dir)
        for file_name in file_names
    )

    batches = []
    current = []
    for path in all_paths:
        # Close the running batch as soon as it is full, before adding more.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag
                 above its top-left corner, onto img (modified in place).
    param:
        x:      a box like [x1, y1, x2, y2]
        img:    an opencv image object
        color:  color of the rectangle, such as (0, 255, 0);
                a random color is picked when omitted
        label:  str, text of the tag; no tag is drawn when omitted
        line_thickness: int; derived from the image size when omitted
    return:
        no return
    """
    # Line thickness: caller's value, otherwise scaled with the image size.
    if line_thickness:
        thickness = line_thickness
    else:
        thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1

    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # Filled background for the text, sitting on top of the box corner.
    tag_corner = (top_left[0] + text_w, top_left[1] - text_h - 3)
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class Yolov10TRT(object):
    """
    description: A Yolov10 class that warps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, load the serialized
                     engine and allocate host/device buffers for every binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        raises:
            ValueError when a binding's first dimension is not 1
        """
        # Create a context on this device; it stays pushed until destroy().
        self.ctx = cuda.Device(0).make_context()
        cuda_stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        trt_runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as engine_file:
            trt_engine = trt_runtime.deserialize_cuda_engine(engine_file.read())
        exec_context = trt_engine.create_execution_context()

        in_host, in_device = [], []
        out_host, out_device = [], []
        binding_ptrs = []

        for binding in trt_engine:
            shape = trt_engine.get_binding_shape(binding)
            print('bingding:', binding, shape)
            self.batch_size = shape[0]
            if self.batch_size != 1:
                raise ValueError("Only support batch_size=1")
            element_count = trt.volume(shape)
            np_dtype = trt.nptype(trt_engine.get_binding_dtype(binding))
            # One page-locked host buffer and one device buffer per binding.
            host_buf = cuda.pagelocked_empty(element_count, np_dtype)
            device_buf = cuda.mem_alloc(host_buf.nbytes)
            # The engine is handed the device addresses in binding order.
            binding_ptrs.append(int(device_buf))
            if trt_engine.binding_is_input(binding):
                # The last two dimensions of the input binding are read as
                # height and width.
                self.input_w = shape[-1]
                self.input_h = shape[-2]
                in_host.append(host_buf)
                in_device.append(device_buf)
            else:
                out_host.append(host_buf)
                out_device.append(device_buf)

        # Store
        self.stream = cuda_stream
        self.context = exec_context
        self.engine = trt_engine
        self.host_inputs = in_host
        self.cuda_inputs = in_device
        self.host_outputs = out_host
        self.cuda_outputs = out_device
        self.bindings = binding_ptrs
        print('batch_size:', self.batch_size)
        self.det_output_length = out_host[0].shape[0]

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine on it and
                     draw the resulting detections onto the original images.
        param:
            raw_image_generator: iterable yielding BGR images (numpy arrays),
                                 one per batch slot
        return:
            batch_image_raw: list of the original images with boxes drawn on
            elapsed:         float, seconds spent on upload, inference and
                             download
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove the context from the top of the context stack again, even
            # when preprocessing or inference raised; otherwise every failed
            # call would leave one more context pushed.
            self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image.
            # `categories` is a module-level name defined outside this method
            # (presumably the list of class names — not visible here).
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert a BGR image to RGB, resize it with its aspect
                     ratio kept, pad it to the network input size with
                     (128,128,128), normalize to [0,1] and lay it out as NCHW.
        param:
            raw_bgr_image: numpy array of shape (h, w, c), BGR image
        return:
            image:  the processed image, float32, shape (1, 3, input_h, input_w)
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, _ = image_raw.shape
        rgb = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)

        # Work out the resized size and the padding on each side.
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width fills the input; pad top and bottom.
            tw = self.input_w
            th = int(r_w * h)
            pad_left = pad_right = 0
            pad_top = int((self.input_h - th) / 2)
            pad_bottom = self.input_h - th - pad_top
        else:
            # Height fills the input; pad left and right.
            tw = int(r_h * w)
            th = self.input_h
            pad_left = int((self.input_w - tw) / 2)
            pad_right = self.input_w - tw - pad_left
            pad_top = pad_bottom = 0

        # Resize while keeping the aspect ratio, then pad the short side.
        resized = cv2.resize(rgb, (tw, th))
        padded = cv2.copyMakeBorder(
            resized, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )

        tensor = padded.astype(np.float32)
        # Normalize to [0,1]
        tensor /= 255.0
        # HWC -> CHW -> NCHW, stored in C (row-major) order.
        tensor = np.transpose(tensor, [2, 0, 1])
        tensor = np.expand_dims(tensor, axis=0)
        tensor = np.ascontiguousarray(tensor)
        return tensor, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map boxes from network-input (letterboxed) coordinates back to
                        original-image coordinates. Despite the name, no centre/size
                        conversion happens here: columns 0 and 2 are treated as x
                        coordinates and columns 1 and 3 as y coordinates. The padding
                        offset is removed from the padded axis and everything is divided
                        by the resize ratio.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row holds [x, y, x, y] in input-space pixels
        return:
            y:          A boxes numpy with the same shape and dtype as x, in original-image pixels
        """
        scale_w = self.input_w / origin_w
        scale_h = self.input_h / origin_h
        restored = np.zeros_like(x)
        if scale_h > scale_w:
            # Image was fitted to the input width, so padding sits above and below
            pad = (self.input_h - scale_w * origin_h) / 2
            restored[:, [0, 2]] = x[:, [0, 2]]
            restored[:, [1, 3]] = x[:, [1, 3]] - pad
            restored /= scale_w
        else:
            # Image was fitted to the input height, so padding sits left and right
            pad = (self.input_w - scale_h * origin_w) / 2
            restored[:, [0, 2]] = x[:, [0, 2]] - pad
            restored[:, [1, 3]] = x[:, [1, 3]]
            restored /= scale_h

        return restored

    def post_process(self, output, origin_h, origin_w):
        """
        description: Decode one flat engine output buffer into final detections.
        param:
            output:     A flat numpy whose first element is the number of detections,
                        followed by DET_NUM values per detection
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: a numpy, each element is the score corresponding to a box
            result_classid: a numpy, each element is the class id corresponding to a box
            (all three are empty arrays when nothing survives NMS)
        """
        # The first slot stores how many detection records are valid
        detection_count = int(output[0])
        # View the remainder as one row per detection and drop the unused tail
        detections = np.reshape(output[1:], (-1, DET_NUM))[:detection_count, :]
        kept = self.non_max_suppression(
            detections, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD
        )
        if len(kept):
            return kept[:, :4], kept[:, 4], kept[:, 5]
        return np.array([]), np.array([]), np.array([])

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU between boxes, using the inclusive-pixel
                     convention (width = x2 - x1 + 1, height = y2 - y1 + 1)
        param:
            box1: 2-D array of boxes, rows are (x1, y1, x2, y2) or (cx, cy, w, h)
            box2: 2-D array of boxes in the same format; broadcast against box1
            x1y1x2y2: True for corner format, False for centre/size format
        return:
            iou: array of IoU values
        """
        if x1y1x2y2:
            # Corner format: the columns already are the box edges
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
        else:
            # Centre/size format: edges are the centre plus/minus half the extent
            half_w1, half_h1 = box1[:, 2] / 2, box1[:, 3] / 2
            half_w2, half_h2 = box2[:, 2] / 2, box2[:, 3] / 2
            b1_x1, b1_x2 = box1[:, 0] - half_w1, box1[:, 0] + half_w1
            b1_y1, b1_y2 = box1[:, 1] - half_h1, box1[:, 1] + half_h1
            b2_x1, b2_x2 = box2[:, 0] - half_w2, box2[:, 0] + half_w2
            b2_y1, b2_y2 = box2[:, 1] - half_h2, box2[:, 1] + half_h2

        # Extent of the overlap along each axis; negative extents mean "no overlap"
        overlap_w = np.clip(np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1) + 1, 0, None)
        overlap_h = np.clip(np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1) + 1, 0, None)
        inter_area = overlap_w * overlap_h

        area1 = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        area2 = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The tiny epsilon guards against a zero denominator
        union_area = area1 + area2 - inter_area + 1e-16
        return inter_area / union_area

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """Worker thread that runs inference on one batch of image paths and writes
    the images returned by the wrapper into the 'output' directory."""

    def __init__(self, yolov8_wrapper, image_path_batch):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov8_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(raw_images)
        for index, img_path in enumerate(self.image_path_batch):
            # Keep only the file name so every result lands directly in output/
            save_name = os.path.join('output', os.path.basename(img_path))
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """Worker thread that pushes one batch of all-zero images through the wrapper
    and reports how long the call took."""

    def __init__(self, yolov8_wrapper):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        dummy_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(dummy_images)
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Script entry point.
    # Usage: <script> [engine_file_path] [plugin_library_path]
    # Both arguments are optional and fall back to the defaults below.
    PLUGIN_LIBRARY = "build/libmyplugins.so"
    engine_file_path = "yolov8s.engine"

    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]

    # Load the custom plugin shared library into the process before the engine is created
    ctypes.CDLL(PLUGIN_LIBRARY)

    # COCO class names, indexed by class id.
    # NOTE(review): `categories` is not read anywhere in this block; presumably it is
    # consumed as a module-level global by code outside this section - confirm.

    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
                  "traffic light",
                  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
                  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
                  "frisbee",
                  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
                  "surfboard",
                  "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                  "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
                  "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
                  "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
                  "teddy bear",
                  "hair drier", "toothbrush"]

    # Start from an empty output directory: any previous results are deleted
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a Yolov10TRT instance (the variable keeps its historical "yolov8" name)
    yolov8_wrapper = Yolov10TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        # Input images are read from this directory and grouped into batches
        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)

        # Ten warm-up passes, run one after another (each thread is joined before the next starts)
        for i in range(10):
            # create a new thread to do warm_up
            thread1 = warmUpThread(yolov8_wrapper)
            thread1.start()
            thread1.join()
        # Real inference, again strictly one batch at a time
        for batch in image_path_batches:
            # create a new thread to do inference
            thread1 = inferThread(yolov8_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance, even when warm-up or inference raised
        yolov8_wrapper.destroy()


================================================
FILE: yolov12/CMakeLists.txt
================================================
# Build script for the yolov12 TensorRT sample:
#   - myplugins  : shared library with the custom YOLO layer plugin
#   - yolo12_det : detection executable linking TensorRT, CUDA, OpenCV and myplugins
cmake_minimum_required(VERSION 3.10)

project(yolov12)

add_definitions(-std=c++11)
# API_EXPORTS is tested by include/macros.h
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
# NOTE(review): the build type is forced to Debug, overriding any -DCMAKE_BUILD_TYPE
# given on the command line - confirm this is intended.
set(CMAKE_BUILD_TYPE Debug)

# Hard-coded nvcc location; adapt it if CUDA is installed elsewhere
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message("embed_platform on")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
  message("embed_platform off")

  # cuda
  include_directories(/usr/local/cuda/include)
  link_directories(/usr/local/cuda/lib64)

  # tensorrt
  include_directories(/workspace/shared/TensorRT-8.6.1.6/include)
  link_directories(/workspace/shared/TensorRT-8.6.1.6/lib)
endif()

# Custom TensorRT plugin, built as a shared library
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# OpenCV is linked into the executable below, so fail at configure time with a
# clear message when it cannot be found instead of failing later at compile/link time.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

# Every .cpp / .cu file under src/ is compiled into the executable
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)

add_executable(yolo12_det ${PROJECT_SOURCE_DIR}/yolo12_det.cpp ${SRCS})
target_link_libraries(yolo12_det nvinfer)
target_link_libraries(yolo12_det cudart)
target_link_libraries(yolo12_det myplugins)
target_link_libraries(yolo12_det ${OpenCV_LIBS})


================================================
FILE: yolov12/gen_wts.py
================================================
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse the command line of the .pt -> .wts converter.

    The output path defaults to the weights path with its extension replaced by
    '.wts'. When --output names an existing directory, the file is placed inside
    it under the weights file's base name.

    Returns:
        (weights_path, output_path, model_type)

    Raises:
        SystemExit: when the weights path is not an existing file (argparse also
        exits on invalid arguments).
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', required=True,
                     help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output', help='Output (.wts) file path (optional)')
    cli.add_argument('-t', '--type', type=str, default='detect',
                     choices=['detect', 'cls', 'seg', 'pose', 'obb'],
                     help='determines the model is detection/classification')
    opts = cli.parse_args()

    weights_path = opts.weights
    if not os.path.isfile(weights_path):
        raise SystemExit('Invalid input file')

    output_path = opts.output
    if not output_path:
        # No explicit output: write next to the weights file
        output_path = os.path.splitext(weights_path)[0] + '.wts'
    elif os.path.isdir(output_path):
        # Output is a directory: reuse the weights file's base name inside it
        stem = os.path.splitext(os.path.basename(weights_path))[0]
        output_path = os.path.join(output_path, stem + '.wts')
    return weights_path, output_path, opts.type


pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')
print(f'Loading {pt_file}')

# Everything is loaded onto, and exported from, the CPU
device = 'cpu'

# NOTE(security): weights_only=False makes torch.load unpickle arbitrary Python
# objects from the checkpoint; only run this script on checkpoints you trust.
model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float()  # load to FP32

if m_type in ('detect', 'seg', 'pose', 'obb'):
    head = model.model[-1]
    # NOTE(review): anchor_grid is computed but never used below - confirm it can go.
    anchor_grid = head.anchors * head.stride[..., None, None]

    delattr(head, 'anchors')

model.to(device).eval()

# Output format: first line is the number of tensors, then one line per tensor:
# "<name> <count> " followed by " <hex>" for every value, where <hex> is the
# big-endian IEEE-754 float32 encoding of the value.
weights = model.state_dict()
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(weights.keys())))
    for name, tensor in weights.items():
        values = tensor.reshape(-1).cpu().numpy()
        hex_words = ''.join(' ' + struct.pack('>f', float(val)).hex() for val in values)
        f.write('{} {} '.format(name, len(values)) + hex_words + '\n')


================================================
FILE: yolov12/include/block.h
================================================
#pragma once

#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"

// Declarations of the network-building helpers; the definitions are not in this header.
//
// Apart from loadWeights() and addYoLoLayer(), every function takes the network being
// built, a map from weight name to nvinfer1::Weights, and the input tensor, and returns
// a pointer to a TensorRT layer.
//
// NOTE(review): most declarations take `weightMap` by value, which copies the std::map on
// every call; only C2, C3K2 and C2PSA take it by reference. Unifying this needs a matching
// change in the definitions.
// NOTE(review): `lname` is presumably the weight-name prefix of the block inside
// `weightMap` - confirm against the definitions.

// Returns a name -> nvinfer1::Weights map built from `file` (presumably the path of a
// .wts file - confirm against the definition).
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

// Returns an IScaleLayer; `eps` is presumably the batch-norm epsilon - confirm.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps);

// `ch`, `k` and `s` are presumably output channels, kernel size and stride - confirm.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname);

nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                 int c2, int n, bool shortcut, float e, std::string lname);

// Takes weightMap by reference (no copy).
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname);

nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname);

// Returns an IShuffleLayer.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);

// Returns a plugin layer built from the concatenation layers in `dets`.
// `px_arry` points to `px_arry_num` ints; their meaning is not visible here.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb);

// Takes weightMap by reference (no copy).
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int n, bool c3k, bool shortcut, float e, std::string lname);

// Takes weightMap by reference (no copy).
nvinfer1::ILayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap,
                        nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname);

nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname);

nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int c1, int c2, int n, bool a2, int area, bool residual,
                        float mlp_ratio, float e, int g, bool shortcut, std::string lname);

nvinfer1::ILayer* ABlock(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int dim, int num_heads, float mlp_ratio, int area,
                         std::string lname);

nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int dim, int num_heads, float mlp_ratio, int area, std::string lname);


================================================
FILE: yolov12/include/config.h
================================================
// Compile-time configuration of the yolov12 sample.
//
// The include guard makes it safe to include this header more than once in a
// translation unit; without it the constants below would be redefined.
#ifndef TRTX_YOLOV12_CONFIG_H_
#define TRTX_YOLOV12_CONFIG_H_

// Precision selection: exactly one of the following macros should be defined
#define USE_FP16
// #define USE_FP32
// #define USE_INT8

// Tensor names
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";
const static char* kProtoTensorName = "proto";
// Detection model's number of classes
const static int kNumClass = 80;
// Pose model's number of classes
const static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;  // number of keypoints total
// obb model's number of classes
constexpr static int kObbNumClass = 15;
const static int kObbNe = 1;  // number of extra parameters
const static int kBatchSize = 1;
const static int kGpuId = 0;
// Network input size
const static int kInputH = 640;
const static int kInputW = 640;
// obb model's input size
const static int kObbInputH = 1024;
const static int kObbInputW = 1024;
// Post-processing thresholds
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
// Upper bounds (in pixels / boxes)
const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 1000;
// Quantization input image folder path
const static char* kInputQuantizationFolder = "./coco_calib";

// Classification model's number of classes
constexpr static int kClsNumClass = 1000;
// Classification model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

#endif  // TRTX_YOLOV12_CONFIG_H_


================================================
FILE: yolov12/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

// The macro below uses std::cerr and std::abort; include them here so the header
// does not depend on what the including file happened to include first.
#include <cstdlib>
#include <iostream>

// CUDA_CHECK(call): evaluates a CUDA runtime call once and, if it did not return
// cudaSuccess, prints the error code, its description and the source location to
// stderr, then aborts the process.
//
//  - Wrapped in do { ... } while (0) so that `CUDA_CHECK(x);` is a single statement
//    and is safe inside an unbraced if/else.
//  - Uses std::abort() rather than assert(0), so a failure is not silently ignored
//    when the code is compiled with NDEBUG.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)                                                                   \
    do {                                                                                      \
        cudaError_t error_code = (callstr);                                                   \
        if (error_code != cudaSuccess) {                                                      \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                 \
            std::abort();                                                                     \
        }                                                                                     \
    } while (0)
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: yolov12/include/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes a timestamp, a fixed prefix and its
//!  buffered contents to a target stream (only when logging is enabled).
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted output
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Copies the target stream, the prefix and the logging flag from \p other.
    //! The buffered characters of \p other are not transferred: the new object starts
    //! with an empty buffer. All three members are initialized here; leaving the flag
    //! uninitialized would make putOutput() read an indeterminate value.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] " + prefix + buffered text to the target stream,
    //! empties the buffer and flushes the stream. Does nothing when logging is disabled
    //! (the buffered text is then kept).
    void putOutput() {
        if (mShouldLog) {
            // NOTE: std::localtime returns a pointer to shared static storage and is not
            // guaranteed to be thread-safe.
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            // Format the timestamp in a local stream so that the sticky fill character
            // is not left set on the target stream, and so that the timestamp goes to
            // the same stream as the message (not unconditionally to std::cout).
            std::ostringstream stamp;
            stamp << "[";
            stamp << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            stamp << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            stamp << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            stamp << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            stamp << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            stamp << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // target stream
    std::string mPrefix;    // text written before every message
    bool mShouldLog;        // whether putOutput() emits anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
//! It only owns the buffer. LogStreamConsumer lists this class as its first base so that
//! mBuffer is constructed before its address is handed to the std::ostream base.
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog) {}

   protected:
    // Protected so that the derived LogStreamConsumer can pass &mBuffer to std::ostream.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!  A message is logged when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Move constructor.
    //!  Builds a fresh buffer from the severity and logging flag of \p other; it does not
    //!  go through LogStreamConsumerBuffer's move constructor, and text still buffered in
    //!  \p other is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer logs, using a new reportable severity,
    //!  and forwards the result to the underlying buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! \brief Picks the output stream: std::cout when severity >= Severity::kINFO,
    //!  std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Returns the bracketed one-letter tag written before every message of the given severity.
    static std::string severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                // Unknown severity: trips the assert in debug builds, empty prefix otherwise
                assert(0);
                return "";
        }
    }

    bool mShouldLog;     // cached result of mSeverity <= reportable severity
    Severity mSeverity;  // severity of every message written through this consumer
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Creates a logger; the reportable severity defaults to Severity::kWARNING.
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! A temporary LogStreamConsumer is created for every message; the message is prefixed
    //! with "[TRT] " and terminated with std::endl.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!                 (LogStreamConsumer implements this as: message severity <= reportable severity.)
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // Private: only Logger (a friend) can create a TestAtom, via defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;         // set to true by Logger::reportTestStart()
        std::string mName;     // test name printed in result lines
        std::string mCmdline;  // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    //! Note: the RUNNING line is printed before the precondition is asserted.
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the current reportable severity.
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints one line of the form "&&&& <RESULT> <name> # <cmdline>" to the stream
    //! selected for Severity::kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  // threshold passed to every LogStreamConsumer created in log()
};

namespace {

//!
//! \brief builds a LogStreamConsumer for messages of severity kVERBOSE
//!
//! The consumer is constructed from the logger's current reportable severity and the message severity.
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const auto reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kVERBOSE);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const auto reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINFO);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const auto reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kWARNING);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const auto reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kERROR);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const auto reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: yolov12/include/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: yolov12/include/model.h
================================================
#pragma once

#include <assert.h>
#include <string>
#include "NvInfer.h"

nvinfer1::IHostMemory* buildEngineYolo12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type);


================================================
FILE: yolov12/include/postprocess.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Preprocessing functions
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Processing functions
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch);

void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch);

void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count);

void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count);

// NMS functions
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);

void batch_nms(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh = 0.5);

void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);

void batch_nms_obb(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh = 0.5);

// CUDA-related functions
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream);

void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream);

void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

// Drawing functions
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map);


================================================
FILE: yolov12/include/preprocess.h
================================================
#pragma once

#include <map>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

void cuda_preprocess_init(int max_image_size);

void cuda_preprocess_destroy();

void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);

void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);


================================================
FILE: yolov12/include/types.h
================================================
#pragma once
#include "config.h"

// Fixed-layout record for one detection. Every member is a float, and the YoloLayer plugin sizes
// its output buffer as sizeof(Detection) / sizeof(float) floats per record, so changing the
// members changes that output layout.
struct alignas(float) Detection {
    // Four box values. The YoloLayer kernel in this project writes left, top, right, bottom for
    // regular detections and center_x, center_y, w, h when its OBB branch is active.
    // NOTE(review): the previous comment here said "center_x center_y w h" unconditionally - confirm
    // what the host-side post-processing expects.
    float bbox[4];
    float conf;  // confidence score; the YoloLayer kernel stores the best class probability here
    float class_id;
    // 32 mask values; written only by the kernel's segmentation branch.
    float mask[32];
    // One (x, y, confidence) triplet per keypoint; kNumberOfPoints is not defined in this header
    // (presumably it comes from config.h - confirm).
    float keypoints[kNumberOfPoints * 3];
    float angle;                           // obb angle
};

// Plain container of six floats.
// NOTE(review): presumably the coefficients of a 2x3 affine transform used by the preprocessing
// code - confirm against the code that fills it.
struct AffineMatrix {
    float value[6];
};

// Number of floats in one decoded box record. The expression evaluates to 7: the six floats of
// AffineMatrix plus one, which matches the seven fields listed in the trailing comment.
// NOTE(review): the value is derived from sizeof(AffineMatrix), not from the listed fields, so
// the two only agree by construction of AffineMatrix - confirm this coupling is intended.
const int bbox_element =
        sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag


================================================
FILE: yolov12/include/utils.h
================================================
#pragma once
#include <dirent.h>
#include <fstream>
#include <opencv2/opencv.hpp>

// Letterboxes `img` into an input_w x input_h BGR canvas: the image is resized with its aspect
// ratio preserved, centred on the canvas, and the remaining border is filled with grey (128).
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);

    // When the width ratio is the smaller one, the width fills the canvas and the height is
    // scaled by the same ratio; otherwise the height fills the canvas.
    const bool width_limited = scale_h > scale_w;
    const int scaled_w = width_limited ? input_w : static_cast<int>(scale_h * img.cols);
    const int scaled_h = width_limited ? static_cast<int>(scale_w * img.rows) : input_h;

    // The offset along the filled axis works out to zero.
    const int offset_x = (input_w - scaled_w) / 2;
    const int offset_y = (input_h - scaled_h) / 2;

    cv::Mat resized(scaled_h, scaled_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(offset_x, offset_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Appends the name of every entry in directory `p_dir_name` (except "." and "..") to
// `file_names`. Only the bare entry names are stored, not full paths.
// Returns 0 on success and -1 when the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const char* entry_name = entry->d_name;
        // Skip the self and parent directory entries.
        if (strcmp(entry_name, ".") == 0 || strcmp(entry_name, "..") == 0) {
            continue;
        }
        file_names.push_back(std::string(entry_name));
    }

    closedir(dir_handle);
    return 0;
}

// Returns a copy of `str` with leading and trailing whitespace removed.
//
// Whitespace means space, tab, newline, vertical tab, form feed and carriage return, so a line
// read with std::getline from a CRLF file loses its trailing '\r' as well. A string made up
// entirely of whitespace (or an empty string) yields an empty string.
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char kWhitespace[] = " \t\n\v\f\r";

    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        // Nothing but whitespace: the trimmed result is empty.
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}

// Formats `a_value` in fixed-point notation with `n` digits after the decimal point.
// Src: https://stackoverflow.com/questions/16605967
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatter;
    formatter.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatter.precision(n);
    formatter << a_value;
    return formatter.str();
}

// Reads a labels file with one label per line into `labels_map`, keyed by zero-based line index.
// Each line is passed through trim_leading_whitespace() before it is stored.
//
// Returns 0 on success and -1 when the file cannot be opened (in which case `labels_map` is left
// untouched), mirroring the error convention of read_files_in_dir().
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        return -1;
    }

    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        labels_map[index] = trim_leading_whitespace(line);
        index++;
    }
    // The stream closes itself when `file` goes out of scope.
    return 0;
}


================================================
FILE: yolov12/plugin/yololayer.cu
================================================
#include <assert.h>
#include <math.h>
#include <string.h>
#include <iostream>
#include <vector>
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"

namespace Tn {
// Copies the object representation of `val` into `buffer` and advances `buffer` past it.
// memcpy is used instead of a reinterpret_cast store because the serialization buffer is a
// packed byte stream: the write position is generally not aligned for T.
template <typename T>
void write(char*& buffer, const T& val) {
    memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

// Copies sizeof(T) bytes from `buffer` into `val` and advances `buffer` past them.
// Counterpart of write(); safe for unaligned read positions for the same reason.
template <typename T>
void read(const char*& buffer, T& val) {
    memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}
}  // namespace Tn

// Device-side logistic function: 1 / (1 + e^(-x)).
__device__ float sigmoid(float x) {
    const auto denominator = 1.0f + exp(-x);
    return 1.0f / denominator;
}

namespace nvinfer1 {
// Builds a plugin from explicit parameters. The `strides` array (stridesLength ints) is copied
// into storage owned by the plugin, so the caller keeps ownership of its own array.
// The plugin namespace is not set here; callers assign it through setPluginNamespace().
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth,
                                 int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb,
                                 const int* strides, int stridesLength)
    : mClassCount(classCount),
      mNumberofpoints(numberofpoints),
      mConfthreshkeypoints(confthreshkeypoints),
      mYoloV8NetWidth(netWidth),
      mYoloV8netHeight(netHeight),
      mMaxOutObject(maxOut),
      is_segmentation_(is_segmentation),
      is_pose_(is_pose),
      is_obb_(is_obb),
      mStrides(new int[stridesLength]),
      mStridesLength(stridesLength) {
    for (int i = 0; i < stridesLength; ++i) {
        mStrides[i] = strides[i];
    }
}

// Releases the stride array owned by the plugin.
YoloLayerPlugin::~YoloLayerPlugin() {
    // delete[] on a null pointer is a no-op, so no explicit null check is needed.
    delete[] mStrides;
    mStrides = nullptr;
}

// Rebuilds a plugin from the byte stream produced by serialize().
//
// data   - start of the serialized plugin state
// length - number of bytes the state is expected to occupy
//
// Fields are read in exactly the order serialize() writes them; the two functions must be kept
// in sync. The final assert checks that precisely `length` bytes were consumed, but it is
// compiled out under NDEBUG, so no bounds check guards the reads in release builds.
// The plugin namespace is not part of the stream and is not set here.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    using namespace Tn;
    // d is the moving read cursor, a remembers the start for the final size check.
    const char *d = reinterpret_cast<const char*>(data), *a = d;
    read(d, mClassCount);
    read(d, mNumberofpoints);
    read(d, mConfthreshkeypoints);
    read(d, mThreadCount);
    read(d, mYoloV8NetWidth);
    read(d, mYoloV8netHeight);
    read(d, mMaxOutObject);
    read(d, mStridesLength);
    // The stride count precedes the stride values so the array can be allocated first.
    mStrides = new int[mStridesLength];
    for (int i = 0; i < mStridesLength; ++i) {
        read(d, mStrides[i]);
    }
    read(d, is_segmentation_);
    read(d, is_pose_);
    read(d, is_obb_);

    assert(d == a + length);
}

// Writes the plugin state into `buffer` as a packed byte stream.
//
// The field order here defines the format that the deserializing constructor reads back, and
// the total must equal getSerializationSize() (checked by the final assert, which is compiled
// out under NDEBUG). The plugin namespace is not serialized.
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {

    using namespace Tn;
    // d is the moving write cursor, a remembers the start for the final size check.
    char *d = static_cast<char*>(buffer), *a = d;
    write(d, mClassCount);
    write(d, mNumberofpoints);
    write(d, mConfthreshkeypoints);
    write(d, mThreadCount);
    write(d, mYoloV8NetWidth);
    write(d, mYoloV8netHeight);
    write(d, mMaxOutObject);
    // Stride count first, then the stride values themselves.
    write(d, mStridesLength);
    for (int i = 0; i < mStridesLength; ++i) {
        write(d, mStrides[i]);
    }
    write(d, is_segmentation_);
    write(d, is_pose_);
    write(d, is_obb_);

    assert(d == a + getSerializationSize());
}

// Returns the number of bytes serialize() writes: the size of every serialized scalar member
// plus one int per stride.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    size_t total = 0;

    // Scalar configuration values.
    total += sizeof(mClassCount);
    total += sizeof(mNumberofpoints);
    total += sizeof(mConfthreshkeypoints);
    total += sizeof(mThreadCount);
    total += sizeof(mYoloV8netHeight);
    total += sizeof(mYoloV8NetWidth);
    total += sizeof(mMaxOutObject);

    // Stride count followed by the stride values.
    total += sizeof(mStridesLength);
    total += sizeof(int) * mStridesLength;

    // Task flags.
    total += sizeof(is_segmentation_);
    total += sizeof(is_pose_);
    total += sizeof(is_obb_);

    return total;
}

// Performs no work and always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Reports the output shape as (N, 1, 1), where N is one leading float plus the number of floats
// needed for mMaxOutObject Detection records. The arguments are not consulted.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    const int detection_floats = mMaxOutObject * sizeof(Detection) / sizeof(float);
    // The extra element is the leading slot that precedes the detection records.
    const int leading_dim = detection_floats + 1;
    return nvinfer1::Dims3(leading_dim, 1, 1);
}

// Stores the namespace pointer as-is; the string is not copied.
// NOTE(review): the caller's string must therefore outlive this plugin (the creator passes
// mNamespace.c_str()) - confirm that lifetime holds in every code path.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Returns the pointer last passed to setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The output is always reported as kFLOAT, regardless of the index or the input types.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                                      int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// Always answers false: the output is never reported as broadcast across the batch.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {

    return false;
}

// Always answers false: no input is reported as broadcastable across the batch.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {

    return false;
}

// Intentionally empty: the tensor descriptions passed in are not used.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}

// Intentionally empty: none of the supplied context handles are used.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}

// Intentionally empty: attachToContext() acquires nothing, so there is nothing to release.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Returns the plugin type name; it is the same string YoloPluginCreator::getPluginName() returns.
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {

    return "YoloLayer_TRT";
}

// Returns the plugin version string; it is the same string YoloPluginCreator::getPluginVersion() returns.
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Deletes this object; the instance must have been allocated with `new` and must not be used afterwards.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Creates a new heap-allocated plugin with the same configuration, namespace pointer and a
// private copy of the stride array. The copy is built through the parameter constructor, so
// mThreadCount starts from its default rather than from this object's current value.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    YoloLayerPlugin* copy =
            new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight,
                                mMaxOutObject, is_segmentation_, is_pose_, is_obb_, mStrides, mStridesLength);
    copy->setPluginNamespace(mPluginNamespace);
    return copy;
}

// Runs the detection decode for one batch on `stream`.
//
// batchSize - number of images in the batch
// inputs    - one device pointer per stride/scale, each read as float data
// outputs   - outputs[0] receives the detection counter and Detection records
// workspace - unused (getWorkspaceSize() reports 0)
// Returns 0 unconditionally.
//
// The parameter types match the declaration in yololayer.h: `inputs` is always
// `const void* const*`, and TRT_CONST_ENQUEUE applies to `outputs`. With the macro on `inputs`
// instead, the definition only matched the declaration when TRT_CONST_ENQUEUE expands to `const`.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
    return 0;
}

// Device-side logistic function computed in single precision: 1 / (1 + e^(-data)).
__device__ float Logist(float data) {
    const float denominator = 1.0f + expf(-data);
    return 1.0f / denominator;
}

// Decodes one feature map into Detection records; each thread handles one grid cell of one image.
//
// input          - feature map for this scale; the value of channel c at cell e of an image is
//                  read from curInput[e + c * total_grid]
// output         - per image: one float used as a detection counter, followed by Detection records
// numElements    - total number of threads that have work (threads beyond it exit immediately)
// maxoutobject   - maximum number of Detection records stored per image
// grid_h, grid_w - grid size of this feature map
// stride         - factor used to scale grid coordinates
// classes        - number of class score channels (channels 4 .. 4 + classes - 1)
// nk             - number of keypoints (pose only)
// confkeypoints  - keypoints whose confidence is below this value are marked invalid
// outputElem     - number of floats reserved per image in `output`
// is_segmentation / is_pose / is_obb - select which extra channels are decoded
//
// NOTE(review): the extra-channel offsets below are only mutually consistent when at most one
// of the three flags is set (the mask offset skips pose/obb channels, the keypoint offset skips
// mask/obb channels, the angle offset skips mask/pose channels) - confirm that is the intended use.
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
                             int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem,
                             bool is_segmentation, bool is_pose, bool is_obb) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= numElements)
        return;

    const int N_kpts = nk;
    int total_grid = grid_h * grid_w;
    // Number of channels per cell for the active task.
    int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0);
    // Split the flat thread index into (image in batch, cell in grid).
    int batchIdx = idx / total_grid;
    int elemIdx = idx % total_grid;
    const float* curInput = input + batchIdx * total_grid * info_len;
    int outputIdx = batchIdx * outputElem;

    // Pick the class with the highest logistic score.
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 4; i < 4 + classes; i++) {
        float p = Logist(curInput[elemIdx + i * total_grid]);
        if (p > max_cls_prob) {
            max_cls_prob = p;
            class_id = i - 4;
        }
    }

    // Hard-coded pre-filter: cells whose best class score is below 0.1 produce no record.
    if (max_cls_prob < 0.1)
        return;

    // The first float of the image's output slot is an atomically incremented counter. It is
    // incremented before the capacity check, so it can end up larger than maxoutobject.
    int count = (int)atomicAdd(output + outputIdx, 1);
    if (count >= maxoutobject)
        return;
    // Records start right after the counter float.
    char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
    Detection* det = (Detection*)(data);

    int row = elemIdx / grid_w;
    int col = elemIdx % grid_w;

    det->conf = max_cls_prob;
    det->class_id = class_id;
    // Channels 0..3 are subtracted from / added to the cell centre (col + 0.5, row + 0.5) and
    // scaled by the stride, giving left, top, right, bottom.
    det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
    det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
    det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
    det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;

    if (is_segmentation) {
        // Copy the 32 mask values unchanged.
        for (int k = 0; k < 32; ++k) {
            det->mask[k] =
                    curInput[elemIdx + (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid];
        }
    }

    if (is_pose) {
        for (int kpt = 0; kpt < N_kpts; kpt++) {
            // Each keypoint occupies three consecutive channels: x, y, confidence.
            int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid;
            int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid;
            int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 2) * total_grid;

            float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);

            float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
            float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;

            bool is_within_bbox =
                    kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];

            // Low-confidence keypoints and keypoints outside the box are stored as (-1, -1, -1).
            if (kpt_confidence < confkeypoints || !is_within_bbox) {
                det->keypoints[kpt * 3] = -1;
                det->keypoints[kpt * 3 + 1] = -1;
                det->keypoints[kpt * 3 + 2] = -1;
            } else {
                det->keypoints[kpt * 3] = kpt_x;
                det->keypoints[kpt * 3 + 1] = kpt_y;
                det->keypoints[kpt * 3 + 2] = kpt_confidence;
            }
        }
    }

    if (is_obb) {
        double pi = CV_PI;
        auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) +
                                             0) * total_grid];
        // Maps the raw angle channel into the range (-pi/4, 3*pi/4).
        auto angle = (sigmoid(angle_inx) - 0.25f) * pi;

        auto cos1 = cos(angle);
        auto sin1 = sin(angle);
        // Half the difference of opposite distances is the centre offset from the cell centre,
        // which is then rotated by the angle.
        auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2;
        auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2;

        auto x = xf * cos1 - yf * sin1;
        auto y = xf * sin1 + yf * cos1;

        float cx = (col + 0.5f + x) * stride;
        float cy = (row + 0.5f + y) * stride;

        float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride;
        float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride;
        // For OBB the box is overwritten with centre, size and angle instead of corner coordinates.
        det->bbox[0] = cx;
        det->bbox[1] = cy;
        det->bbox[2] = w1;
        det->bbox[3] = h1;
        det->angle = angle;
    }
}

// Launches the decode kernel once per stride/scale for the whole batch.
//
// inputs           - one device pointer per stride, in the same order as mStrides
// output           - device buffer holding, per image, a counter float followed by Detection records
// stream           - CUDA stream used for the memsets and kernel launches
// mYoloV8netHeight - network input height (parameter; shadows the member of the same name)
// mYoloV8NetWidth  - network input width (parameter; shadows the member of the same name)
// batchSize        - number of images in the batch
//
// The launch block size is computed per scale in a local variable. The member mThreadCount is
// left untouched: it is part of the serialized state, and shrinking it permanently would make
// every later launch use the reduced size (and would divide by zero once it reached 0).
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                                 int mYoloV8NetWidth, int batchSize) {
    const int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    // Reset the detection counter at the start of every image's output slot.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (int i = 0; i < mStridesLength; ++i) {
        const int stride = mStrides[i];
        const int grid_h = mYoloV8netHeight / stride;
        const int grid_w = mYoloV8NetWidth / stride;
        const int numElem = grid_h * grid_w * batchSize;
        if (numElem <= 0) {
            // Nothing to decode for this scale; a zero-sized launch would be invalid.
            continue;
        }

        // Never launch more threads per block than there are elements.
        const int threads = numElem < mThreadCount ? numElem : mThreadCount;
        const int blocks = (numElem + threads - 1) / threads;

        CalDetection<<<blocks, threads, 0, stream>>>(inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w,
                                                     stride, mClassCount, mNumberofpoints, mConfthreshkeypoints,
                                                     outputElem, is_segmentation_, is_pose_, is_obb_);
    }
}

PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// Resets the shared (static) attribute list to empty and points the static field collection at
// it, so getFieldNames() reports zero fields.
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

// Returns the plugin name; it is the same string YoloLayerPlugin::getPluginType() returns.
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Returns the plugin version; it is the same string YoloLayerPlugin::getPluginVersion() returns.
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Returns the address of the static field collection (left empty by the constructor).
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Creates a YoloLayerPlugin from a field collection holding exactly one field named
// "combinedInfo", whose data is an int array laid out as:
//   [0] class count        [1] number of keypoints   [2] keypoint confidence threshold
//   [3] input width        [4] input height          [5] max output object count
//   [6] is_segmentation    [7] is_pose               [8] is_obb
//   [9..length-1] strides
//
// Returns nullptr when the collection does not have that shape. The checks are done at run
// time rather than with assert(), so builds with NDEBUG no longer read past a short field or
// compute a negative stride count.
//
// NOTE(review): the keypoint confidence threshold arrives as an int, so a fractional threshold
// cannot survive the trip through combinedInfo - confirm how the caller encodes it.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    const int netinfo_count = 9;

    if (fc == nullptr || fc->nbFields != 1 || fc->fields == nullptr) {
        return nullptr;
    }
    const PluginField& field = fc->fields[0];
    if (field.name == nullptr || strcmp(field.name, "combinedInfo") != 0) {
        return nullptr;
    }
    if (field.data == nullptr || field.length < netinfo_count) {
        return nullptr;
    }

    const int* combinedInfo = static_cast<const int*>(field.data);
    int class_count = combinedInfo[0];
    int numberofpoints = combinedInfo[1];
    float confthreshkeypoints = combinedInfo[2];
    int input_w = combinedInfo[3];
    int input_h = combinedInfo[4];
    int max_output_object_count = combinedInfo[5];
    bool is_segmentation = combinedInfo[6];
    bool is_pose = combinedInfo[7];
    bool is_obb = combinedInfo[8];

    // Everything after the fixed header is the stride list.
    const int* px_arry = combinedInfo + netinfo_count;
    int px_arry_length = field.length - netinfo_count;

    YoloLayerPlugin* obj =
            new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h,
                                max_output_object_count, is_segmentation, is_pose, is_obb, px_arry, px_arry_length);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

// Reconstructs a plugin from serialized state and tags it with this creator's namespace.
// The returned object is heap-allocated; it is released through YoloLayerPlugin::destroy()
// (per the original note, when the network is destroyed).
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) TRT_NOEXCEPT {
    YoloLayerPlugin* restored = new YoloLayerPlugin(serialData, serialLength);
    restored->setPluginNamespace(mNamespace.c_str());
    return restored;
}

}  // namespace nvinfer1


================================================
FILE: yolov12/plugin/yololayer.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"

namespace nvinfer1 {
// TensorRT plugin that decodes raw detection-head feature maps into Detection records on the GPU.
//
// Every data member has a default initializer so that no member is left indeterminate by a
// constructor that does not assign it; in particular getPluginNamespace() returns an empty
// string rather than an indeterminate pointer before setPluginNamespace() is called.
//
// NOTE(review): the class owns mStrides through a raw pointer and does not define copy
// operations; instances must be duplicated with clone(), never copied by value.
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    // Builds a plugin from explicit parameters; `strides` (stridesLength ints) is copied.
    YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight,
                    int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength);

    // Rebuilds a plugin from the byte stream written by serialize().
    YoloLayerPlugin(const void* data, size_t length);

    ~YoloLayerPlugin();

    // The plugin produces a single output tensor.
    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

    int initialize() TRT_NOEXCEPT override;

    virtual void terminate() TRT_NOEXCEPT override {}

    // No scratch workspace is requested.
    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

    virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;

    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

    // Only linear-format float tensors are accepted, for inputs and outputs alike.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }

    const char* getPluginType() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    void destroy() TRT_NOEXCEPT override;

    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

    // Stores the pointer without copying the string; the caller keeps it alive.
    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

    const char* getPluginNamespace() const TRT_NOEXCEPT override;

    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;

    void detachFromContext() TRT_NOEXCEPT override;

   private:
    // Launches the decode kernel for every stride; called from enqueue().
    void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                    int mYoloV8NetWidth, int batchSize);

    int mThreadCount = 256;             // upper bound on threads per block for kernel launches
    const char* mPluginNamespace = "";  // not owned; see setPluginNamespace()
    int mClassCount = 0;
    int mNumberofpoints = 0;
    float mConfthreshkeypoints = 0.0f;
    int mYoloV8NetWidth = 0;
    int mYoloV8netHeight = 0;
    int mMaxOutObject = 0;
    bool is_segmentation_ = false;
    bool is_pose_ = false;
    bool is_obb_ = false;
    int* mStrides = nullptr;  // owned array of mStridesLength ints, released in the destructor
    int mStridesLength = 0;
};

// Plugin creator that builds YoloLayerPlugin instances, either from a field collection
// (createPlugin) or from serialized state (deserializePlugin).
class API YoloPluginCreator : public IPluginCreator {
   public:
    YoloPluginCreator();

    ~YoloPluginCreator() override = default;

    const char* getPluginName() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

    nvinfer1::IPluginV2IOExt* createPlugin(const char* name,
                                           const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;

    nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                                size_t serialLength) TRT_NOEXCEPT override;

    // The namespace is copied into an owned std::string.
    void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; }

    // The returned pointer refers to the owned string and is invalidated when it changes.
    const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

   private:
    std::string mNamespace;
    // Shared by all creator instances (static); defined in the plugin source file.
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};

REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
}  // namespace nvinfer1


================================================
FILE: yolov12/readme.md
================================================
## Introduction

The YOLO12 model is supported on TensorRT 8.

Training code [link](https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.3.38.zip)

## Environment

* cuda 11.8
* cudnn 8.9.1.23
* tensorrt 8.6.1.6
* opencv 4.8.0
* ultralytics 8.3.0

## Support

* [x] YOLO12-det supports FP32/FP16 and the C++ API


## Config

* Choose the YOLO12 sub-model n/s/m/l/x from command line arguments.
* For other configuration options, see [src/config.h](src/config.h)

## Build and Run

1. Generate the .wts file from the PyTorch .pt checkpoint, or download a ready-made .wts from the model zoo

```shell
# Download ultralytics
wget https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.3.119.zip -O ultralytics-8.3.119.zip
# Unzip ultralytics
unzip ultralytics-8.3.119.zip
cd ultralytics-8.3.119
# Download models
wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo12n.pt -O yolo12n.pt # to download other models, replace 'yolo12n.pt' with 'yolo12s.pt', 'yolo12m.pt', 'yolo12l.pt' or 'yolo12x.pt'
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolov12/gen_wts.py .
python gen_wts.py -w yolo12n.pt -o yolo12n.wts -t detect
# A file 'yolo12n.wts' will be generated.
```

2. Build tensorrtx/yolov12 and run it
```shell
cd [PATH-TO-TENSORRTX]/yolov12
mkdir build
cd build
cmake ..
make
```

### Detection
```shell
cp [PATH-TO-ultralytics]/yolo12n.wts .
# Build and serialize TensorRT engine
./yolo12_det -s yolo12n.wts yolo12n.engine [n/s/m/l/x]
# Run inference
./yolo12_det -d yolo12n.engine ../images [c/g]
# results saved in build directory
```

## More Information
See the README on the [home page](https://github.com/wang-xinyu/tensorrtx).


================================================
FILE: yolov12/src/block.cpp
================================================
#include "block.h"
#include <assert.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include "config.h"
#include "model.h"
#include "yololayer.h"

// Loads a .wts weight file into a name -> Weights map.
//
// File format as parsed here: a decimal entry count, then for each entry a name, a decimal
// element count and that many hexadecimal 32-bit words. The words are stored verbatim and
// exposed as kFLOAT weights.
//
// Each value buffer is allocated with malloc and is NOT freed by this function; the returned
// map's Weights point into those buffers, so the caller owns them.
//
// Failure to open the file or a non-positive entry count trips an assertion; when assertions
// are compiled out the function returns an empty map instead of looping on a garbage count.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> WeightMap;

    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Initialised so a failed read cannot leave an indeterminate loop bound.
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    for (int32_t entry = 0; entry < count; ++entry) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        std::string name;
        input >> name >> std::dec >> size;

        // One 32-bit word per element (the element size, not the pointer size).
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; x++) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        WeightMap[name] = wt;
    }
    return WeightMap;
}

// Adds an inference-time batch-norm as a per-channel scale layer:
//   scale = gamma / sqrt(var + eps),  shift = beta - mean * gamma / sqrt(var + eps),  power = 1
// The parameters are looked up in `weightMap` under lname + ".weight", ".bias",
// ".running_mean" and ".running_var".
//
// The three coefficient buffers are malloc'ed here and never freed by this function.
// Note that `weightMap` is received by value, so the ".scale"/".shift"/".power" entries stored
// below land in this function's private copy of the map, not in the caller's map.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // Fill all three per-channel coefficient arrays in a single pass.
    for (int ch = 0; ch < len; ++ch) {
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    nvinfer1::IScaleLayer* bn_layer = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    return bn_layer;
}

// Adds a bias-free convolution, batch norm and SiLU activation (x * sigmoid(x)) to `network`.
//
// ch    - number of output channels
// k     - kernel size as {height, width}; padding is k / 2 along each axis
// s     - stride, applied to both axes
// lname - weight-name prefix; ".conv.weight" and ".bn.*" entries are looked up under it
// Returns the element-wise product layer that forms the SiLU output.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname) {
    nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv_layer = network->addConvolutionNd(
            input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], no_bias);
    assert(conv_layer);
    conv_layer->setStrideNd(nvinfer1::DimsHW{s, s});
    // "Auto" padding: half the kernel size on each axis.
    conv_layer->setPaddingNd(nvinfer1::DimsHW{k[0] / 2, k[1] / 2});

    nvinfer1::IScaleLayer* bn_layer =
            addBatchNorm2d(network, weightMap, *conv_layer->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor& normalized = *bn_layer->getOutput(0);

    // SiLU is built from two layers: sigmoid(x), then x * sigmoid(x).
    nvinfer1::IActivationLayer* gate = network->addActivation(normalized, nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* silu =
            network->addElementWise(normalized, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Two stacked convBnSiLU blocks ("<lname>.cv1" then "<lname>.cv2") with an optional residual sum.
//
// c1, c2   : nominal input / output channel counts; the residual is only added when they are equal
// shortcut : requests the residual connection
// k1, k2   : kernel sizes of the first and second convolution
// e        : ratio applied to c2 to obtain the channel count of the first convolution
//
// Returns the element-wise sum layer when the residual is used, otherwise the second conv block.
static nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int c1, int c2, bool shortcut, std::vector<int> k1, std::vector<int> k2, float e,
                                    std::string lname) {
    const int hidden = (int)((float)c2 * e);
    auto* first = convBnSiLU(network, weightMap, input, hidden, k1, 1, lname + ".cv1");
    auto* second = convBnSiLU(network, weightMap, *first->getOutput(0), c2, k2, 1, lname + ".cv2");

    const bool useResidual = shortcut && c1 == c2;
    if (!useResidual) {
        return second;
    }
    return network->addElementWise(input, *second->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// Builds the SPPF block: a 1x1 convBnSiLU ("<lname>.cv1") producing c1 / 2 channels, three chained
// max-pool layers (window k x k, stride 1, padding k / 2), a concatenation of the conv output with
// all three pool outputs, and a closing 1x1 convBnSiLU ("<lname>.cv2") producing c2 channels.
//
// Returns the closing convBnSiLU layer.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname) {
    const int hidden = c1 / 2;
    auto* reduce = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv1");

    // branches[0] is the cv1 output; every later entry is the max-pool of the entry before it.
    nvinfer1::ITensor* branches[4] = {reduce->getOutput(0), nullptr, nullptr, nullptr};
    for (int stage = 1; stage < 4; ++stage) {
        auto* pool = network->addPoolingNd(*branches[stage - 1], nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
        pool->setStrideNd(nvinfer1::DimsHW{1, 1});
        pool->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
        branches[stage] = pool->getOutput(0);
    }

    auto* merged = network->addConcatenation(branches, 4);
    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

// Builds the DFL head: reshape -> softmax -> 1x1 convolution -> reshape.
//
// input : tensor reshaped to (kBatchSize, 4, 16, grid) and then permuted to (kBatchSize, 16, 4, grid)
// grid  : extent of the last axis in both reshapes
// s, p  : stride and padding of the convolution (both spatial axes)
// lname : key of the convolution kernel in weightMap (used as-is, no suffix)
// ch, k : not used by this function
//
// Returns the final shuffle layer, whose reshape target is (kBatchSize, 4, grid).
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
    auto* toBins = network->addShuffle(input);
    toBins->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid});
    toBins->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3});

    auto* probabilities = network->addSoftMax(*toBins->getOutput(0));
    // Bit 1 of the axes mask selects axis 1, i.e. the 16-wide axis after the transpose above.
    probabilities->setAxes(1 << 1);

    // Single-output 1x1 convolution without bias.
    const nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* project = network->addConvolutionNd(*probabilities->getOutput(0), 1, nvinfer1::DimsHW{1, 1},
                                              weightMap[lname], noBias);
    project->setStrideNd(nvinfer1::DimsHW{s, s});
    project->setPaddingNd(nvinfer1::DimsHW{p, p});

    auto* flatten = network->addShuffle(*project->getOutput(0));
    flatten->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid});
    return flatten;
}

// Creates the "YoloLayer_TRT" (version "1") plugin and appends it to `network`.
//
// network         : network definition that receives the plugin layer
// dets            : concatenation layers whose first outputs become the plugin inputs (in order)
// px_arry         : px_arry_num integers appended after the fixed header of the plugin field
// px_arry_num     : number of entries in px_arry (must be >= 0)
// is_segmentation : forwarded to the plugin as header entry 6
// is_pose         : forwarded as header entry 7; also selects kPoseNumClass as the class count
// is_obb          : forwarded as header entry 8; also selects kObbNumClass / kObbInputW / kObbInputH
//
// The plugin receives a single INT32 field named "combinedInfo" laid out as:
//   [0] class count  [1] kNumberOfPoints  [2] kConfThreshKeypoints  [3] input width  [4] input height
//   [5] kMaxNumOutputBbox  [6] is_segmentation  [7] is_pose  [8] is_obb  [9..] px_arry
//
// Returns the plugin layer added to the network.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb) {
    assert(px_arry_num >= 0);
    assert(px_arry != nullptr || px_arry_num == 0);

    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    // The registry returns null when the plugin has not been registered; stop here instead of
    // dereferencing a null creator further down.
    assert(creator);

    const int netinfo_count = 9;                          // number of fixed header entries written below
    const int total_count = netinfo_count + px_arry_num;  // header plus the px_arry payload

    std::vector<int> combinedInfo(total_count);
    int class_num = kNumClass;
    if (is_pose)
        class_num = kPoseNumClass;
    else if (is_obb)
        class_num = kObbNumClass;
    int input_w = kInputW;
    if (is_obb)
        input_w = kObbInputW;
    int input_h = kInputH;
    if (is_obb)
        input_h = kObbInputH;

    // Fixed header (netinfo_count entries).
    combinedInfo[0] = class_num;
    combinedInfo[1] = kNumberOfPoints;
    // NOTE(review): the value is stored into an int slot; if kConfThreshKeypoints is a fractional
    // floating-point threshold it is truncated here - confirm against config.h and the plugin.
    combinedInfo[2] = kConfThreshKeypoints;
    combinedInfo[3] = input_w;
    combinedInfo[4] = input_h;
    combinedInfo[5] = kMaxNumOutputBbox;
    combinedInfo[6] = is_segmentation;
    combinedInfo[7] = is_pose;
    combinedInfo[8] = is_obb;

    // Payload: px_arry follows directly after the header.
    std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);

    // Single plugin field carrying the combined integer array.
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = combinedInfo.size();

    nvinfer1::PluginFieldCollection pluginFieldCollection;
    pluginFieldCollection.nbFields = 1;
    pluginFieldCollection.fields = &pluginField;

    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    // createPlugin may fail; *pluginObject is dereferenced when the layer is added below.
    assert(pluginObject);

    // Collect output 0 of every detection branch as plugin input.
    std::vector<nvinfer1::ITensor*> inputTensors;
    inputTensors.reserve(dets.size());
    for (auto det : dets) {
        inputTensors.push_back(det->getOutput(0));
    }

    nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject);
    assert(yoloLayer);

    return yoloLayer;
}

// Builds a C3k block: two parallel 1x1 convBnSiLU branches ("<lname>.cv1", "<lname>.cv2"), a chain of
// n bottlenecks ("<lname>.m.<i>") on the cv1 branch, a concatenation of both branches and a closing
// 1x1 convBnSiLU ("<lname>.cv3") with c2 output channels.
//
// c1       : not used by this function
// e        : ratio applied to c2 to obtain the width of both branches
// shortcut : forwarded to every bottleneck
// k1, k2   : kernel sizes forwarded to every bottleneck
//
// Returns the closing convBnSiLU layer.
static nvinfer1::ILayer* C3k(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, std::vector<int> k1,
                             std::vector<int> k2, float e, std::string lname) {
    const int hidden = (int)((float)c2 * e);
    auto* mainBranch = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv1");
    auto* sideBranch = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv2");

    // Thread the main branch through the bottleneck chain.
    nvinfer1::ITensor* chained = mainBranch->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        const std::string blockName = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* block =
                bottleneck(network, weightMap, *chained, hidden, hidden, shortcut, k1, k2, 1.0, blockName);
        chained = block->getOutput(0);
    }

    nvinfer1::ITensor* parts[] = {chained, sideBranch->getOutput(0)};
    auto* merged = network->addConcatenation(parts, 2);

    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv3");
}

// Builds a C3K2 block.
//
// A 1x1 convBnSiLU ("<lname>.cv1") produces 2 * (c2 * e) channels, which are sliced into two halves
// along axis 1. The second half is passed through n stages ("<lname>.m.<i>"), each either a C3k block
// (c3k == true) or a plain bottleneck. The output of every stage is appended to a running
// concatenation that starts with both halves. A closing 1x1 convBnSiLU ("<lname>.cv2") maps the
// concatenation to c2 channels.
//
// c1       : not used by this function
// shortcut : forwarded to every stage
//
// Returns the closing convBnSiLU layer.
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int n, bool c3k, bool shortcut, float e, std::string lname) {
    const int hidden = static_cast<int>((float)c2 * e);

    auto* expand = convBnSiLU(network, weightMap, input, 2 * hidden, {1, 1}, 1, lname + ".cv1");
    nvinfer1::ITensor* expanded = expand->getOutput(0);
    const nvinfer1::Dims shape = expanded->getDimensions();

    // Both slices cover half of axis 1 and the full extent of every other axis.
    const auto halfChannels = shape.d[1] / 2;
    const nvinfer1::Dims4 halfExtent{shape.d[0], halfChannels, shape.d[2], shape.d[3]};
    const nvinfer1::Dims4 unitStride{1, 1, 1, 1};

    auto* lowerHalf = network->addSlice(*expanded, nvinfer1::Dims4{0, 0, 0, 0}, halfExtent, unitStride);
    auto* upperHalf = network->addSlice(*expanded, nvinfer1::Dims4{0, halfChannels, 0, 0}, halfExtent, unitStride);

    nvinfer1::ITensor* seed[] = {lowerHalf->getOutput(0), upperHalf->getOutput(0)};
    nvinfer1::IConcatenationLayer* merged = network->addConcatenation(seed, 2);

    nvinfer1::ITensor* current = upperHalf->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        const std::string stageName = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* stage =
                c3k ? C3k(network, weightMap, *current, hidden, hidden, 2, shortcut, {3, 3}, {3, 3}, 0.5, stageName)
                    : bottleneck(network, weightMap, *current, hidden, hidden, shortcut, {3, 3}, {3, 3}, 0.5,
                                 stageName);
        current = stage->getOutput(0);

        // Grow the running concatenation by the output of this stage.
        nvinfer1::ITensor* grown[] = {merged->getOutput(0), current};
        merged = network->addConcatenation(grown, 2);
    }

    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

// Appends a k x k convolution followed by a batch-norm scale layer (no activation).
//
// ch    : number of convolution output maps
// k     : square kernel size; padding is k / 2 on both axes
// s     : stride on both axes
// lname : weight-name prefix; the layers are named "<lname>.conv" and "<lname>.bn"
// g     : number of convolution groups
//
// The convolution bias "<lname>.conv.bias" is looked up only when lname contains ".pe"; every other
// layer is built with an empty bias.
//
// Returns the batch-norm scale layer.
static nvinfer1::ILayer* convBn(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int ch,
                                int k, int s, std::string lname, int g = 1) {
    const bool hasBias = lname.find(".pe") != std::string::npos;
    const nvinfer1::Weights bias =
            hasBias ? weightMap[lname + ".conv.bias"] : nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0};

    nvinfer1::IConvolutionLayer* convLayer =
            network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias);
    assert(convLayer);

    const int pad = k / 2;
    convLayer->setStrideNd(nvinfer1::DimsHW{s, s});
    convLayer->setPaddingNd(nvinfer1::DimsHW{pad, pad});
    convLayer->setNbGroups(g);
    convLayer->setName((lname + ".conv").c_str());

    nvinfer1::IScaleLayer* bnLayer =
            addBatchNorm2d(network, weightMap, *convLayer->getOutput(0), lname + ".bn", 1e-3);
    bnLayer->setName((lname + ".bn").c_str());
    return bnLayer;
}

// Builds the multi-head self-attention sub-network used by PSABlock.
//
// dim        : channel count of the block; also the output width of the ".pe" and ".proj" convolutions
// num_heads  : number of attention heads (dim / num_heads is the per-head value width)
// attn_ratio : ratio between the per-head key width and the per-head value width
// lname      : weight-name prefix; uses "<lname>.qkv", "<lname>.pe" and "<lname>.proj"
//
// Layer sequence (PyTorch reference lines kept as comments inside the body):
//   qkv conv -> reshape to (B, num_heads, -1, N) -> slice q / k / v on axis 2 ->
//   softmax((q^T @ k) * scale) -> v @ attn^T -> reshape to (B, -1, H, W) -> + pe(v) -> proj
//
// Returns the ".proj" convBn layer.
static nvinfer1::ILayer* Attention(nvinfer1::INetworkDefinition* network,
                                   std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                   int dim, int num_heads, float attn_ratio, std::string lname) {
    const int head_dim = dim / num_heads;
    const int key_dim = static_cast<int>(head_dim * attn_ratio);
    const float scale = pow(key_dim, -0.5);
    // q and k contribute key_dim channels per head, v contributes dim channels in total.
    const int qkvChannels = dim + (key_dim * num_heads) * 2;

    const auto inShape = input.getDimensions();
    const int B = inShape.d[0];
    const int H = inShape.d[2];
    const int W = inShape.d[3];
    const int N = H * W;

    auto* qkv = convBn(network, weightMap, input, qkvChannels, 1, 1, lname + ".qkv");

    // qkv.view(B, self.num_heads, -1, N)
    auto* perHead = network->addShuffle(*qkv->getOutput(0));
    perHead->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, -1, N});
    nvinfer1::ITensor* heads = perHead->getOutput(0);

    // q, k, v = .split([self.key_dim, self.key_dim, self.head_dim], dim=2)
    const auto headShape = heads->getDimensions();
    const nvinfer1::Dims4 unitStride{1, 1, 1, 1};
    const nvinfer1::Dims4 keyExtent{headShape.d[0], headShape.d[1], key_dim, headShape.d[3]};
    const nvinfer1::Dims4 valueExtent{headShape.d[0], headShape.d[1], head_dim, headShape.d[3]};
    auto* q = network->addSlice(*heads, nvinfer1::Dims4{0, 0, 0, 0}, keyExtent, unitStride);
    auto* k = network->addSlice(*heads, nvinfer1::Dims4{0, 0, key_dim, 0}, keyExtent, unitStride);
    auto* v = network->addSlice(*heads, nvinfer1::Dims4{0, 0, key_dim * 2, 0}, valueExtent, unitStride);

    // attn = ((q.transpose(-2, -1) @ k) * self.scale)
    auto* qT = network->addShuffle(*q->getOutput(0));
    qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});
    auto* scores = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0),
                                              nvinfer1::MatrixOperation::kNONE);

    // One-element weight buffers for the uniform scale layer. They are heap-allocated and not
    // freed in this function (acknowledged leak in the original code).
    auto scalarWeights = [](float value) {
        float* storage = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
        storage[0] = value;
        return nvinfer1::Weights{nvinfer1::DataType::kFLOAT, storage, 1};
    };
    const nvinfer1::Weights scaleW = scalarWeights(scale);
    const nvinfer1::Weights shiftW = scalarWeights(0);
    const nvinfer1::Weights powerW = scalarWeights(1);
    nvinfer1::IScaleLayer* scaled =
            network->addScale(*scores->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, shiftW, scaleW, powerW);

    // attn = attn.softmax(dim=-1)
    nvinfer1::ISoftMaxLayer* attn = network->addSoftMax(*scaled->getOutput(0));
    attn->setAxes(1 << 3);

    // x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) + self.pe(v.reshape(B, -1, H, W))
    auto* attnT = network->addShuffle(*attn->getOutput(0));
    attnT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});
    auto* weighted = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE,
                                                *attnT->getOutput(0), nvinfer1::MatrixOperation::kNONE);
    auto* weightedMap = network->addShuffle(*weighted->getOutput(0));
    weightedMap->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W});
    auto* valueMap = network->addShuffle(*v->getOutput(0));
    valueMap->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W});

    // self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)
    auto* pe = convBn(network, weightMap, *valueMap->getOutput(0), dim, 3, 1, lname + ".pe", dim);
    auto* combined = network->addElementWise(*weightedMap->getOutput(0), *pe->getOutput(0),
                                             nvinfer1::ElementWiseOperation::kSUM);

    // x = self.proj(x)
    // self.proj = Conv(dim, dim, 1, act=False)
    return convBn(network, weightMap, *combined->getOutput(0), dim, 1, 1, lname + ".proj");
}

// Builds one PSA block: an Attention stage followed by a two-layer feed-forward stage, each wrapped
// in a residual sum when `shortcut` is set.
//
// dim        : channel count of the block; the feed-forward stage expands to dim * 2 and back to dim
// attn_ratio : forwarded to Attention
// num_heads  : forwarded to Attention
// lname      : weight-name prefix; uses "<lname>.attn", "<lname>.ffn.0" and "<lname>.ffn.1"
//
// Returns the last layer of the block (the residual sum when shortcut is set, else "<lname>.ffn.1").
static nvinfer1::ILayer* PSABlock(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int dim,
                                  float attn_ratio, int num_heads, bool shortcut, std::string lname) {
    // x = x + self.attn(x) if self.add else self.attn(x)
    nvinfer1::ILayer* attnStage = Attention(network, weightMap, input, dim, num_heads, attn_ratio, lname + ".attn");
    if (shortcut) {
        attnStage = network->addElementWise(input, *attnStage->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    }
    nvinfer1::ITensor* attnOut = attnStage->getOutput(0);

    // self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False))
    auto* expand = convBnSiLU(network, weightMap, *attnOut, dim * 2, {1, 1}, 1, lname + ".ffn.0");
    auto* project = convBn(network, weightMap, *expand->getOutput(0), dim, 1, 1, lname + ".ffn.1");

    // x = x + self.ffn(x) if self.add else self.ffn(x)
    if (!shortcut) {
        return project;
    }
    return network->addElementWise(*attnOut, *project->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// Builds a C2PSA block.
//
// A 1x1 convBnSiLU ("<lname>.cv1") produces 2 * (c1 * e) channels, which are sliced into two halves
// along axis 1. The second half runs through n PSABlocks ("<lname>.m.<i>", attn_ratio 0.5,
// num_heads = (c1 * e) / 64, residuals enabled). The untouched first half is concatenated with the
// PSABlock output and mapped to c2 channels by a closing 1x1 convBnSiLU ("<lname>.cv2").
//
// Returns the closing convBnSiLU layer.
nvinfer1::ILayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap,
                        nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname) {
    assert(network != nullptr);
    const int hidden = static_cast<int>(c1 * e);

    auto* expand = convBnSiLU(network, weightMap, input, 2 * hidden, {1, 1}, 1, lname + ".cv1");
    nvinfer1::ITensor* expanded = expand->getOutput(0);

    // Slice the cv1 output into two equal halves along axis 1.
    const nvinfer1::Dims shape = expanded->getDimensions();
    const auto halfChannels = shape.d[1] / 2;
    const nvinfer1::Dims4 halfExtent{shape.d[0], halfChannels, shape.d[2], shape.d[3]};
    const nvinfer1::Dims4 unitStride{1, 1, 1, 1};
    auto* passThrough = network->addSlice(*expanded, nvinfer1::Dims4{0, 0, 0, 0}, halfExtent, unitStride);
    auto* attended = network->addSlice(*expanded, nvinfer1::Dims4{0, halfChannels, 0, 0}, halfExtent, unitStride);

    // Chain the PSABlocks on the second half.
    nvinfer1::ITensor* current = attended->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        const std::string blockName = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* block = PSABlock(network, weightMap, *current, hidden, 0.5, hidden / 64, true, blockName);
        current = block->getOutput(0);
    }

    // First half followed by the attended second half.
    nvinfer1::ITensor* parts[2] = {passThrough->getOutput(0), current};
    auto* merged = network->addConcatenation(parts, 2);

    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

// Same stack as convBnSiLU (bias-free convolution -> batch norm -> SiLU), but the convolution is
// grouped with ch groups, i.e. the group count equals the number of output maps.
//
// ch    : number of convolution output maps and number of groups
// k     : kernel size as {k[0], k[1]}; padding is half of each entry
// s     : stride on both spatial axes
// lname : weight-name prefix; reads "<lname>.conv.weight" and forwards "<lname>.bn" to addBatchNorm2d
//
// Returns the element-wise product layer computing bn(x) * sigmoid(bn(x)).
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname) {
    const nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    const nvinfer1::DimsHW kernel{k[0], k[1]};
    // "auto pad": half of the kernel extent per axis.
    const nvinfer1::DimsHW padding{k[0] / 2, k[1] / 2};

    auto* convLayer = network->addConvolutionNd(input, ch, kernel, weightMap[lname + ".conv.weight"], noBias);
    assert(convLayer);
    convLayer->setStrideNd(nvinfer1::DimsHW{s, s});
    convLayer->setNbGroups(ch);
    convLayer->setPaddingNd(padding);

    auto* bnLayer = addBatchNorm2d(network, weightMap, *convLayer->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor* normalized = bnLayer->getOutput(0);

    auto* gate = network->addActivation(*normalized, nvinfer1::ActivationType::kSIGMOID);
    auto* silu = network->addElementWise(*normalized, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Builds an A2C2f block.
//
// A 1x1 convBnSiLU ("<lname>.cv1") produces 2 * c channels with c = c2 * e. Then:
//   a2 == true  : four ABlocks are chained ("<lname>.m.0.0", ".m.0.1", ".m.1.0", ".m.1.1") and the
//                 cv1 output is concatenated with the outputs of the 2nd and 4th ABlock;
//   a2 == false : one C3k block ("<lname>.m.0", 2 bottlenecks, 3x3 kernels, ratio 0.5) is applied and
//                 the cv1 output is concatenated with its output.
// A closing 1x1 convBnSiLU ("<lname>.cv2") maps the concatenation to c2 channels.
//
// area, mlp_ratio : forwarded to every ABlock
// shortcut        : forwarded to C3k
// c1, n, residual, g : not used by this function (the block counts above are fixed)
//
// Returns the closing convBnSiLU layer.
nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int c1, int c2, int n, bool a2, int area, bool residual,
                        float mlp_ratio, float e, int g, bool shortcut, std::string lname) {
    const int c = (int)(((float)c2) * e);
    const int num_heads = c / 32 * 2;

    auto* stem = convBnSiLU(network, weightMap, input, c * 2, {1, 1}, 1, lname + ".cv1");
    nvinfer1::ITensor* stemOut = stem->getOutput(0);

    nvinfer1::IConcatenationLayer* merged = nullptr;
    if (a2) {
        static const char* const kBlockSuffixes[4] = {".m.0.0", ".m.0.1", ".m.1.0", ".m.1.1"};

        // Each ABlock consumes the output of the previous one, starting from the cv1 output.
        nvinfer1::ITensor* blockOut[4] = {nullptr, nullptr, nullptr, nullptr};
        nvinfer1::ITensor* cursor = stemOut;
        for (int idx = 0; idx < 4; ++idx) {
            nvinfer1::ILayer* block =
                    ABlock(network, weightMap, *cursor, c, num_heads, mlp_ratio, area, lname + kBlockSuffixes[idx]);
            cursor = block->getOutput(0);
            blockOut[idx] = cursor;
        }

        // Only the end of each ABlock pair joins the concatenation.
        nvinfer1::ITensor* parts[] = {stemOut, blockOut[1], blockOut[3]};
        merged = network->addConcatenation(parts, 3);
    } else {
        nvinfer1::ILayer* c3kBlock =
                C3k(network, weightMap, *stemOut, c * 2, c * 2, 2, shortcut, {3, 3}, {3, 3}, 0.5, lname + ".m.0");

        nvinfer1::ITensor* parts[] = {stemOut, c3kBlock->getOutput(0)};
        merged = network->addConcatenation(parts, 2);
    }

    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

// Builds one ABlock: x = x + AAttn(x), followed by x = x + mlp(x).
//
// dim       : nominal channel count; the MLP output convolution produces dim * 2 channels
// num_heads : forwarded to AAttn
// mlp_ratio : hidden-width ratio of the MLP (hidden conv produces (dim * mlp_ratio) * 2 channels);
//             also forwarded to AAttn
// area      : forwarded to AAttn
// lname     : weight-name prefix; uses "<lname>.attn", "<lname>.mlp.0" and "<lname>.mlp.1"
//
// Returns the second residual sum layer.
nvinfer1::ILayer* ABlock(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int dim, int num_heads, float mlp_ratio, int area,
                         std::string lname) {
    const int mlp_hidden_dim = (int)(dim * mlp_ratio);

    // Attention stage with residual.
    auto* attn = AAttn(network, weightMap, input, dim, num_heads, mlp_ratio, area, lname + ".attn");
    auto* afterAttn = network->addElementWise(input, *attn->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    nvinfer1::ITensor* residual = afterAttn->getOutput(0);

    // MLP stage: activated expansion conv followed by a conv without activation, with residual.
    auto* mlpHidden = convBnSiLU(network, weightMap, *residual, mlp_hidden_dim * 2, {1, 1}, 1, lname + ".mlp.0");
    auto* mlpOut = convBn(network, weightMap, *mlpHidden->getOutput(0), dim * 2, 1, 1, lname + ".mlp.1");

    return network->addElementWise(*residual, *mlpOut->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// Builds the area-attention sub-network used by ABlock.
//
// dim       : nominal channel count; head_dim = dim / num_heads
// num_heads : number of attention heads
// mlp_ratio : not used by this function
// area      : when > 1, the batch axis is multiplied by `area` and the token axis divided by it for
//             the attention computation, and restored afterwards
// lname     : weight-name prefix; uses "<lname>.qkv", "<lname>.pe" and "<lname>.proj"
//
// Layer sequence:
//   qkv conv -> (B, -1, N) transposed -> (B, N, heads, head_dim * 6) permuted to {0, 2, 3, 1} ->
//   slice q / k / v as thirds of axis 2 -> softmax((q^T @ k) * scale) on the last axis ->
//   v @ attn^T -> back to a (B, H, W, -1) layout and permuted with {0, 3, 1, 2} ->
//   + pe(v) (7x7 grouped convBn) -> proj (1x1 convBn)
//
// Returns the ".proj" convBn layer.
nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int dim, int num_heads, float mlp_ratio, int area,
                        std::string lname) {
    const int head_dim = (int)(dim / num_heads);
    const int all_head_dim = head_dim * num_heads;
    //TODO: SCALE IS STATIC, CONVERT TO DYNAMIC!
    const float scale = 0.176777;

    const auto inShape = input.getDimensions();
    int B = inShape.d[0];
    const int C = inShape.d[1];
    const int H = inShape.d[2];
    const int W = inShape.d[3];
    int N = H * W;

    auto* qkv = convBn(network, weightMap, input, all_head_dim * 3 * 2, 1, 1, lname + ".qkv");

    // Flatten the spatial axes and move the token axis in front of the channel axis.
    auto* tokens = network->addShuffle(*qkv->getOutput(0));
    tokens->setReshapeDimensions(nvinfer1::Dims3{B, -1, N});
    tokens->setSecondTranspose(nvinfer1::Permutation{0, 2, 1});

    // Fold the areas into the batch axis for the attention computation.
    if (area > 1) {
        B = B * area;
        N = (H * W) / area;
    }

    auto* perHead = network->addShuffle(*tokens->getOutput(0));
    perHead->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim * 3 * 2});
    perHead->setSecondTranspose(nvinfer1::Permutation{0, 2, 3, 1});

    // q, k and v each take one third of axis 2.
    nvinfer1::ITensor* heads = perHead->getOutput(0);
    const auto headShape = heads->getDimensions();
    const auto third = headShape.d[2] / 3;
    const nvinfer1::Dims4 sliceExtent{headShape.d[0], headShape.d[1], third, headShape.d[3]};
    const nvinfer1::Dims4 unitStride{1, 1, 1, 1};

    nvinfer1::ISliceLayer* q = network->addSlice(*heads, nvinfer1::Dims4{0, 0, 0, 0}, sliceExtent, unitStride);
    nvinfer1::ISliceLayer* k = network->addSlice(*heads, nvinfer1::Dims4{0, 0, third, 0}, sliceExtent, unitStride);
    nvinfer1::ISliceLayer* v =
            network->addSlice(*heads, nvinfer1::Dims4{0, 0, 2 * headShape.d[2] / 3, 0}, sliceExtent, unitStride);

    auto* qT = network->addShuffle(*q->getOutput(0));
    qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});

    auto* scores = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0),
                                              nvinfer1::MatrixOperation::kNONE);

    // One-element weight buffers for the uniform scale layer; heap-allocated and not freed here.
    auto scalarWeights = [](float value) {
        float* storage = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
        storage[0] = value;
        return nvinfer1::Weights{nvinfer1::DataType::kFLOAT, storage, 1};
    };
    const nvinfer1::Weights scaleW = scalarWeights(scale);
    const nvinfer1::Weights shiftW = scalarWeights(0);
    const nvinfer1::Weights powerW = scalarWeights(1);
    nvinfer1::IScaleLayer* scaled =
            network->addScale(*scores->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, shiftW, scaleW, powerW);

    auto* attn = network->addSoftMax(*scaled->getOutput(0));
    attn->setAxes(1 << 3);

    auto* attnT = network->addShuffle(*attn->getOutput(0));
    attnT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});

    auto* weighted = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE,
                                                *attnT->getOutput(0), nvinfer1::MatrixOperation::kNONE);

    auto* weightedPerm = network->addShuffle(*weighted->getOutput(0));
    weightedPerm->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2});

    // Undo the area folding before going back to the spatial layout.
    if (area > 1) {
        B = B / area;
        N = N * area;
    }

    auto* weightedGrid = network->addShuffle(*weightedPerm->getOutput(0));
    weightedGrid->setReshapeDimensions(nvinfer1::Dims4{B, H, W, -1});

    auto* weightedMap = network->addShuffle(*weightedGrid->getOutput(0));
    weightedMap->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2});

    // Same layout conversion for v, which feeds the positional-encoding convolution.
    auto* valuePerm = network->addShuffle(*v->getOutput(0));
    valuePerm->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2});

    auto* valueGrid = network->addShuffle(*valuePerm->getOutput(0));
    valueGrid->setReshapeDimensions(nvinfer1::Dims4{B, H, W, C});

    auto* valueMap = network->addShuffle(*valueGrid->getOutput(0));
    valueMap->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2});

    auto* pe = convBn(network, weightMap, *valueMap->getOutput(0), all_head_dim * 2, 7, 1, lname + ".pe",
                      all_head_dim * 2);

    auto* combined = network->addElementWise(*pe->getOutput(0), *weightedMap->getOutput(0),
                                             nvinfer1::ElementWiseOperation::kSUM);

    return convBn(network, weightMap, *combined->getOutput(0), all_head_dim * 2, 1, 1, lname + ".proj");
}


================================================
FILE: yolov12/src/model.cpp
================================================
#include <math.h>
#include <iostream>

#include "block.h"
//#include "calibrator.h"
#include "config.h"
#include "model.h"

// Scales a nominal channel count by the width multiplier `gw`.
//
// x is first capped at max_channels, then multiplied by gw and rounded UP to the next multiple of
// `divisor`.
static int get_width(int x, float gw, int max_channels, int divisor = 8) {
    const int capped = std::min(x, max_channels);
    const int multiples = int(ceil((capped * gw) / divisor));
    return multiples * divisor;
}

// Scales a nominal repeat count by the depth multiplier `gd`.
//
// x == 1 is returned unchanged. Otherwise x * gd is rounded to the nearest integer, with exact .5
// ties whose truncated value is even rounded down instead of up, and the result is clamped to at
// least 1.
static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }
    const float scaled = x * gd;
    const int whole = int(scaled);
    int rounded = round(scaled);
    // round() sends .5 ties away from zero; pull the tie back when the truncated value is even.
    const bool exactHalf = (scaled - whole == 0.5);
    if (exactHalf && (whole % 2) == 0) {
        --rounded;
    }
    return std::max<int>(rounded, 1);
}

// Fills strides[0..size) with reference_size divided by dimension index 2 of
// each layer's first output tensor.
// NOTE(review): presumably dimension 2 is the feature-map height of an NCHW
// tensor and must be a positive static extent - confirm against callers.
void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) {
    int idx = 0;
    while (idx < size) {
        nvinfer1::ILayer* current = conv_layers[idx];
        const nvinfer1::Dims out_dims = current->getOutput(0)->getDimensions();
        const int map_extent = out_dims.d[2];
        strides[idx] = reference_size / map_extent;
        ++idx;
    }
}

// Builds the YOLO12 detection network layer by layer through the TensorRT
// network-definition API and returns the serialized engine.
//
// Parameters:
//   builder      - TensorRT builder used to create the network and build the engine.
//   config       - builder configuration; this function sets the workspace limit
//                  and the FP16 flag on it.
//   dt           - data type of the network input tensor.
//   wts_path     - path handed to loadWeights() to obtain the weight map.
//   gd, gw       - depth / width multipliers forwarded to get_depth() / get_width().
//   max_channels - channel cap forwarded to get_width().
//   type         - model variant; "m", "l" and "x" enable the c3k option of the
//                  backbone C3K2 blocks.
//
// Returns the serialized engine produced by buildSerializedNetwork(), or nullptr
// when the build fails (a failure message is printed to stderr in that case).
// The network definition and the loaded weight buffers are released before returning.
nvinfer1::IHostMemory* buildEngineYolo12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    //	nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
    ******************************************  YOLO12 INPUT  **********************************************
    *******************************************************************************************************/

    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLO12 BACKBONE  ********************************************
    *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");

    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0),
                                                    get_width(128, gw, max_channels), {3, 3}, 2, "model.1");

    // Only the m / l / x variants switch the backbone C3K2 blocks to their c3k form.
    bool c3k = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k = true;
    }

    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2");

    nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                    get_width(256, gw, max_channels), {3, 3}, 2, "model.3");

    nvinfer1::IElementWiseLayer* conv4 =
            C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4");

    nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0),
                                                    get_width(512, gw, max_channels), {3, 3}, 2, "model.5");

    nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                    get_width(512, gw, max_channels), 4, true, 4, true, 2.0, 0.25, 1, true, "model.6");

    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0),
                                                    get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");

    nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                                    get_width(1024, gw, max_channels), 4, true, 1, true, 2.0, 0.25, 1, true, "model.8");

    /*******************************************************************************************************
    *********************************************  YOLO12 HEAD  ********************************************
    *******************************************************************************************************/

    // Nearest-neighbour x2 upsampling on the last two axes, shared by both resize layers.
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample9 = network->addResize(*conv8->getOutput(0));
    assert(upsample9);
    upsample9->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample9->setScales(scale, 4);

    nvinfer1::ITensor* inputTensors10[] = {upsample9->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat10 = network->addConcatenation(inputTensors10, 2);

    nvinfer1::ILayer* conv11 =
            A2C2f(network, weightMap, *cat10->getOutput(0), get_width(1024, gw, max_channels),
                  get_width(512, gw, max_channels), 4, false, 1, true, 2.0, 0.25, 1, true, "model.11");

    nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0));
    assert(upsample12);
    upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample12->setScales(scale, 4);
    nvinfer1::ITensor* inputTensors13[] = {upsample12->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat13 = network->addConcatenation(inputTensors13, 2);
    nvinfer1::ILayer* conv14 =
            A2C2f(network, weightMap, *cat13->getOutput(0), get_width(256, gw, max_channels),
                  get_width(256, gw, max_channels), 4, false, 1, true, 2.0, 0.25, 1, true, "model.14");

    nvinfer1::IElementWiseLayer* conv15 = convBnSiLU(network, weightMap, *conv14->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.15");
    nvinfer1::ITensor* inputTensors16[] = {conv15->getOutput(0), conv11->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensors16, 2);

    nvinfer1::ILayer* conv17 =
            A2C2f(network, weightMap, *cat16->getOutput(0), get_width(512, gw, max_channels),
                  get_width(512, gw, max_channels), 4, false, 1, true, 2.0, 0.25, 1, true, "model.17");

    nvinfer1::IElementWiseLayer* conv18 = convBnSiLU(network, weightMap, *conv17->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.18");
    nvinfer1::ITensor* inputTensors19[] = {conv18->getOutput(0), conv8->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat19 = network->addConcatenation(inputTensors19, 2);

    nvinfer1::IElementWiseLayer* conv20 =
            C3K2(network, weightMap, *cat19->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.20");

    /*******************************************************************************************************
    *********************************************  YOLO12 OUTPUT  ******************************************
    *******************************************************************************************************/

    // c2: hidden width of the cv2 branches (64-channel output);
    // c3: hidden width of the cv3 branches (kNumClass-channel output).
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100));

    // output 0 (fed by conv14)
    nvinfer1::IElementWiseLayer* conv21_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv14->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv21_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.1");

    nvinfer1::IConvolutionLayer* conv21_cv2_0_2 =
            network->addConvolutionNd(*conv21_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv2.0.2.weight"], weightMap["model.21.cv2.0.2.bias"]);
    conv21_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    auto* conv21_cv3_0_0_0 = DWConv(network, weightMap, *conv14->getOutput(0), get_width(256, gw, max_channels), {3, 3},
                                    1, "model.21.cv3.0.0.0");
    auto* conv21_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.0.1");
    auto* conv21_cv3_0_1_0 =
            DWConv(network, weightMap, *conv21_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.0.1.0");
    auto* conv21_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv3_0_2 =
            network->addConvolutionNd(*conv21_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv3.0.2.weight"], weightMap["model.21.cv3.0.2.bias"]);
    conv21_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    nvinfer1::ITensor* inputTensor21_0[] = {conv21_cv2_0_2->getOutput(0), conv21_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21_0 = network->addConcatenation(inputTensor21_0, 2);

    // output 1 (fed by conv17)
    nvinfer1::IElementWiseLayer* conv21_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv17->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv21_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv2_1_2 =
            network->addConvolutionNd(*conv21_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv2.1.2.weight"], weightMap["model.21.cv2.1.2.bias"]);
    conv21_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv21_cv3_1_0_0 = DWConv(network, weightMap, *conv17->getOutput(0), get_width(512, gw, max_channels), {3, 3},
                                    1, "model.21.cv3.1.0.0");
    auto* conv21_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.0.1");
    auto* conv21_cv3_1_1_0 =
            DWConv(network, weightMap, *conv21_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.1.1.0");
    auto* conv21_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv3_1_2 =
            network->addConvolutionNd(*conv21_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv3.1.2.weight"], weightMap["model.21.cv3.1.2.bias"]);
    conv21_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor21_1[] = {conv21_cv2_1_2->getOutput(0), conv21_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21_1 = network->addConcatenation(inputTensor21_1, 2);

    // output 2 (fed by conv20)
    nvinfer1::IElementWiseLayer* conv21_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv20->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv21_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv21_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv21_cv2_2_2 =
            network->addConvolutionNd(*conv21_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv2.2.2.weight"], weightMap["model.21.cv2.2.2.bias"]);
    conv21_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv21_cv3_2_0_0 = DWConv(network, weightMap, *conv20->getOutput(0), get_width(1024, gw, max_channels),
                                    {3, 3}, 1, "model.21.cv3.2.0.0");
    auto* conv21_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.0.1");
    auto* conv21_cv3_2_1_0 =
            DWConv(network, weightMap, *conv21_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.2.1.0");
    auto* conv21_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv3_2_2 =
            network->addConvolutionNd(*conv21_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv3.2.2.weight"], weightMap["model.21.cv3.2.2.bias"]);
    conv21_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor21_2[] = {conv21_cv2_2_2->getOutput(0), conv21_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21_2 = network->addConcatenation(inputTensor21_2, 2);

    /*******************************************************************************************************
    *********************************************  YOLO12 DETECT  ******************************************
    *******************************************************************************************************/

    // Strides are derived from the output sizes of conv3 / conv5 / conv7 relative to kInputH.
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // Scale 0: flatten the spatial axes, split off the first 64 channels for DFL,
    // then re-join the DFL output with the kNumClass class channels.
    nvinfer1::IShuffleLayer* shuffle21_0 = network->addShuffle(*cat21_0->getOutput(0));
    shuffle21_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split21_0_0 = network->addSlice(
            *shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split21_0_1 =
            network->addSlice(*shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* dfl21_0 =
            DFL(network, weightMap, *split21_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.21.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl21_0->getOutput(0), split21_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2);
    cat22_dfl_0->setAxis(1);

    // Scale 1: same reshape / split / DFL / concat sequence.
    nvinfer1::IShuffleLayer* shuffle21_1 = network->addShuffle(*cat21_1->getOutput(0));
    shuffle21_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split21_1_0 = network->addSlice(
            *shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split21_1_1 =
            network->addSlice(*shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl21_1 =
            DFL(network, weightMap, *split21_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.21.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl21_1->getOutput(0), split21_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2);
    cat22_dfl_1->setAxis(1);

    // Scale 2: same reshape / split / DFL / concat sequence.
    nvinfer1::IShuffleLayer* shuffle21_2 = network->addShuffle(*cat21_2->getOutput(0));
    shuffle21_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split21_2_0 = network->addSlice(
            *shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split21_2_1 =
            network->addSlice(*shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl21_2 =
            DFL(network, weightMap, *split21_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.21.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl21_2->getOutput(0), split21_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2);
    cat22_dfl_2->setAxis(1);

    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2},
                         strides, stridesLength, false, false, false);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // 16 MiB workspace limit; FP16 is enabled unconditionally.
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));
    config->setFlag(nvinfer1::BuilderFlag::kFP16);

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // buildSerializedNetwork() signals failure with a null pointer; do not claim
    // success in that case. The null pointer is still returned to the caller.
    if (serialized_model == nullptr) {
        std::cerr << "Build engine failed!" << std::endl;
    } else {
        std::cout << "Build engine successfully!" << std::endl;
    }

    delete network;

    // Release the weight buffers held by the weight map.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}


================================================
FILE: yolov12/src/postprocess.cpp
================================================
#include "postprocess.h"
#include "utils.h"

// Maps a box given as {left, top, right, bottom} in network-input coordinates
// (kInputW x kInputH) back onto `img`, removing the centring offset on one axis
// and dividing by the smaller of the two resize ratios. The result is clamped so
// it starts at >= 0 and does not extend past the image.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float r_w = kInputW / (img.cols * 1.0);
    const float r_h = kInputH / (img.rows * 1.0);

    float left, right, top, bottom;
    if (r_h > r_w) {
        // Width ratio is the limiting one: the offset is applied vertically.
        const float pad_y = (kInputH - r_w * img.rows) / 2;
        left = bbox[0] / r_w;
        right = bbox[2] / r_w;
        top = (bbox[1] - pad_y) / r_w;
        bottom = (bbox[3] - pad_y) / r_w;
    } else {
        // Height ratio is the limiting one: the offset is applied horizontally.
        const float pad_x = (kInputW - r_h * img.cols) / 2;
        left = (bbox[0] - pad_x) / r_h;
        right = (bbox[2] - pad_x) / r_h;
        top = bbox[1] / r_h;
        bottom = bbox[3] / r_h;
    }

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Same mapping as get_rect(), but relative to the OBB network input size
// (kObbInputW x kObbInputH): converts {left, top, right, bottom} from
// network-input coordinates to a clamped rectangle on `img`.
cv::Rect get_rect_obb(cv::Mat& img, float bbox[4]) {
    const float r_w = kObbInputW / (img.cols * 1.0);
    const float r_h = kObbInputH / (img.rows * 1.0);

    float left, right, top, bottom;
    if (r_h > r_w) {
        // Width ratio is the limiting one: the offset is applied vertically.
        const float pad_y = (kObbInputH - r_w * img.rows) / 2;
        left = bbox[0] / r_w;
        right = bbox[2] / r_w;
        top = (bbox[1] - pad_y) / r_w;
        bottom = (bbox[3] - pad_y) / r_w;
    } else {
        // Height ratio is the limiting one: the offset is applied horizontally.
        const float pad_x = (kObbInputW - r_h * img.cols) / 2;
        left = (bbox[0] - pad_x) / r_h;
        right = (bbox[2] - pad_x) / r_h;
        top = bbox[1] / r_h;
        bottom = bbox[3] / r_h;
    }

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Maps a box {left, top, right, bottom} from network-input coordinates onto
// `img` and applies the same transform IN PLACE to the landmark array `lmk`,
// which is read as kNumberOfPoints triples; only the first two values of each
// triple are rewritten, the third is left untouched.
// Returns the clamped rectangle on `img`.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
    const float r_w = kInputW / (img.cols * 1.0);
    const float r_h = kInputH / (img.rows * 1.0);

    float left, right, top, bottom;
    if (r_h > r_w) {
        // Width ratio is the limiting one: the offset is applied vertically.
        const float pad_y = (kInputH - r_w * img.rows) / 2;
        left = bbox[0] / r_w;
        right = bbox[2] / r_w;
        top = (bbox[1] - pad_y) / r_w;
        bottom = (bbox[3] - pad_y) / r_w;
        for (int p = 0; p < kNumberOfPoints; ++p) {
            float* point = lmk + 3 * p;
            point[0] /= r_w;
            point[1] = (point[1] - pad_y) / r_w;
        }
    } else {
        // Height ratio is the limiting one: the offset is applied horizontally.
        const float pad_x = (kInputW - r_h * img.cols) / 2;
        left = (bbox[0] - pad_x) / r_h;
        right = (bbox[2] - pad_x) / r_h;
        top = bbox[1] / r_h;
        bottom = bbox[3] / r_h;
        for (int p = 0; p < kNumberOfPoints; ++p) {
            float* point = lmk + 3 * p;
            point[0] = (point[0] - pad_x) / r_h;
            point[1] /= r_h;
        }
    }

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Intersection-over-union of two axis-aligned boxes stored as
// {x_min, y_min, x_max, y_max}.
// Returns 0 when the boxes do not overlap, and also when the union area is not
// positive (e.g. two zero-area boxes), which would otherwise yield 0/0 = NaN.
static float iou(float lbox[4], float rbox[4]) {
    const float left = (std::max)(lbox[0], rbox[0]);
    const float right = (std::min)(lbox[2], rbox[2]);
    const float top = (std::max)(lbox[1], rbox[1]);
    const float bottom = (std::min)(lbox[3], rbox[3]);

    if (top > bottom || left > right)
        return 0.0f;

    const float interArea = (right - left) * (bottom - top);
    const float unionArea =
            (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interArea;

    // Guard against division by zero for degenerate boxes.
    if (!(unionArea > 0.0f))
        return 0.0f;

    return interArea / unionArea;
}

// Sort predicate for detections: higher confidence first; detections with equal
// confidence are ordered by ascending bbox[0].
static bool cmp(const Detection& a, const Detection& b) {
    const bool same_conf = (a.conf == b.conf);
    return same_conf ? (a.bbox[0] < b.bbox[0]) : (a.conf > b.conf);
}

void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4]))
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

// Runs nms() once per batch element. Element b reads its records starting at
// output + b * output_size and writes its survivors to res_batch[b];
// res_batch is resized to batch_size first.
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    float* batch_output = output;
    for (int b = 0; b < batch_size; ++b, batch_output += output_size) {
        nms(res_batch[b], batch_output, conf_thresh, nms_thresh);
    }
}

// Converts decoded records into Detection objects.
// Record i starts at decode_ptr_host[1 + i * bbox_element]; within a record the
// values at offsets 0-3 go to bbox, 4 to conf, 5 to class_id, and offset 6 is a
// keep flag: only records whose flag truncates to 1 are appended to `res`.
// `img` is not used by this function.
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count) {
    Detection det;
    for (int idx = 0; idx < count; ++idx) {
        const int base = 1 + idx * bbox_element;
        const int keep_flag = decode_ptr_host[base + 6];
        if (keep_flag != 1) {
            continue;
        }
        for (int c = 0; c < 4; ++c) {
            det.bbox[c] = decode_ptr_host[base + c];
        }
        det.conf = decode_ptr_host[base + 4];
        det.class_id = decode_ptr_host[base + 5];
        res.push_back(det);
    }
}

void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}

// Draws every detection of res_batch[i] onto img_batch[i]: a rectangle obtained
// from get_rect() plus the integer class id as text just above its top-left corner.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

    for (size_t b = 0; b < img_batch.size(); ++b) {
        cv::Mat& img = img_batch[b];
        for (Detection& det : res_batch[b]) {
            const cv::Rect box = get_rect(img, det.bbox);
            cv::rectangle(img, box, box_color, 2);
            const std::string label = std::to_string((int)det.class_id);
            cv::putText(img, label, cv::Point(box.x, box.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
        }
    }
}

// Draws boxes, class ids, keypoints and skeleton lines for every detection.
// get_rect_adapt_landmark() is called on each detection, so the keypoints stored
// in res_batch are rewritten in place to image coordinates as a side effect.
// A keypoint is drawn only when the third value of its triple exceeds 0.5, and a
// bone only when both of its end points pass that test.
// NOTE(review): the skeleton table indexes keypoints 0..16, so it presumably
// expects at least 17 keypoints per detection - confirm against kNumberOfPoints.
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const std::vector<std::pair<int, int>> skeleton_pairs = {
            {0, 1}, {0, 2},  {0, 5}, {0, 6},  {1, 2},   {1, 3},   {2, 4},   {5, 6},   {5, 7},  {5, 11},
            {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);
    const cv::Scalar joint_color(0, 0x27, 0xC1);

    for (size_t b = 0; b < img_batch.size(); ++b) {
        cv::Mat& img = img_batch[b];
        for (Detection& det : res_batch[b]) {
            const cv::Rect box = get_rect_adapt_landmark(img, det.bbox, det.keypoints);
            cv::rectangle(img, box, box_color, 2);
            cv::putText(img, std::to_string((int)det.class_id), cv::Point(box.x, box.y - 1), cv::FONT_HERSHEY_PLAIN,
                        1.2, text_color, 2);

            // Keypoint dots.
            for (int p = 0; p < kNumberOfPoints; ++p) {
                const int base = p * 3;
                if (det.keypoints[base + 2] > 0.5) {
                    cv::circle(img, cv::Point((int)det.keypoints[base], (int)det.keypoints[base + 1]), 3, joint_color,
                               -1);
                }
            }

            // Skeleton segments between pairs of visible keypoints.
            for (const auto& bone : skeleton_pairs) {
                const int from = bone.first * 3;
                const int to = bone.second * 3;
                const bool both_visible = det.keypoints[from + 2] > 0.5 && det.keypoints[to + 2] > 0.5;
                if (!both_visible) {
                    continue;
                }
                const cv::Point p1((int)det.keypoints[from], (int)det.keypoints[from + 1]);
                const cv::Point p2((int)det.keypoints[to], (int)det.keypoints[to + 1]);
                cv::line(img, p1, p2, joint_color, 2);
            }
        }
    }
}

// Crops from `mask` the region that corresponds to `img` inside the
// kInputW x kInputH network input (centred along the padded axis) and resizes
// that crop to the size of `img`.
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    const float r_w = kInputW / (img.cols * 1.0);
    const float r_h = kInputH / (img.rows * 1.0);

    cv::Rect roi;
    if (r_h > r_w) {
        // Full width is used; the image occupies a vertically centred band.
        const int band_h = r_w * img.rows;
        roi = cv::Rect(0, (kInputH - band_h) / 2, kInputW, band_h);
    } else {
        // Full height is used; the image occupies a horizontally centred band.
        const int band_w = r_h * img.cols;
        roi = cv::Rect((kInputW - band_w) / 2, 0, band_w, kInputH);
    }

    cv::Mat scaled;
    cv::resize(mask(roi), scaled, img.size());
    return scaled;
}

// Overlays instance masks, boxes and labels for `dets` on `img`.
// For detection i, masks[i] is rescaled with scale_mask(); inside the detection's
// rectangle every pixel whose mask value exceeds 0.5 is blended 50/50 with a
// colour picked from a fixed palette by class id. A box outline, a filled label
// background and the text "<label> <conf>" are then drawn.
// Labels are looked up with labels_map[...], so a class id missing from the map
// is inserted with an empty label.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < dets.size(); i++) {
        cv::Mat img_mask = scale_mask(masks[i], img);
        auto color = colors[(int)dets[i].class_id % colors.size()];
        // Palette entries are 0xRRGGBB; OpenCV expects blue-green-red order.
        auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

        cv::Rect r = get_rect(img, dets[i].bbox);
        // Row-major traversal; each pixel is blended independently.
        for (int y = r.y; y < r.y + r.height; y++) {
            for (int x = r.x; x < r.x + r.width; x++) {
                float val = img_mask.at<float>(y, x);
                if (val <= 0.5)
                    continue;
                cv::Vec3b& pixel = img.at<cv::Vec3b>(y, x);
                pixel[0] = pixel[0] / 2 + bgr[0] / 2;
                pixel[1] = pixel[1] / 2 + bgr[1] / 2;
                pixel[2] = pixel[2] / 2 + bgr[2] / 2;
            }
        }

        cv::rectangle(img, r, bgr, 2);

        // Build the caption once; it is used for both measuring and drawing.
        const std::string caption =
                labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf);

        // Get the size of the text
        cv::Size textSize = cv::getTextSize(caption, cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);
        // Set the top left corner of the rectangle
        cv::Point topLeft(r.x, r.y - textSize.height);

        // Set the bottom right corner of the rectangle
        cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);

        // Draw the filled label background on the image
        cv::rectangle(img, topLeft, bottomRight, bgr, -1);

        cv::putText(img, caption, cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
    }
}

// Oriented-box variant of process_decode_ptr_host().
// Record i starts at decode_ptr_host[1 + i * bbox_element]; offsets 0-3 go to
// bbox, 4 to conf, 5 to class_id and 7 to angle. Offset 6 is a keep flag: only
// records whose flag truncates to 1 are appended to `res`.
// `img` is not used by this function.
void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count) {
    Detection det;
    for (int idx = 0; idx < count; ++idx) {
        const int base = 1 + idx * bbox_element;
        const int keep_flag = decode_ptr_host[base + 6];
        if (keep_flag != 1) {
            continue;
        }
        for (int c = 0; c < 4; ++c) {
            det.bbox[c] = decode_ptr_host[base + c];
        }
        det.conf = decode_ptr_host[base + 4];
        det.class_id = decode_ptr_host[base + 5];
        det.angle = decode_ptr_host[base + 7];
        res.push_back(det);
    }
}

void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}

// Returns the three terms (a, b, c) of the 2x2 covariance matrix used by
// probiou() for a rotated box: bbox[2] and bbox[3] are read as the two side
// lengths, each side's variance is side^2 / 12, and the result is rotated by
// `angle`.
// NOTE(review): `angle` is passed straight to cos/sin, so it is presumably in radians.
std::tuple<float, float, float> convariance_matrix(Detection res) {
    const float side_w = res.bbox[2];
    const float side_h = res.bbox[3];
    const float var_w = side_w * side_w / 12.0;
    const float var_h = side_h * side_h / 12.0;

    const float theta = res.angle;
    const float cos_t = std::cos(theta);
    const float sin_t = std::sin(theta);
    const float cos_sq = cos_t * cos_t;
    const float sin_sq = sin_t * sin_t;

    const float term_a = var_w * cos_sq + var_h * sin_sq;
    const float term_b = var_w * sin_sq + var_h * cos_sq;
    const float term_c = (var_w - var_h) * cos_t * sin_t;

    return std::make_tuple(term_a, term_b, term_c);
}

/**
 * Probabilistic IoU between two oriented boxes
 * (https://arxiv.org/pdf/2106.06072v1.pdf).
 *
 * Each box is summarised by its centre (bbox[0], bbox[1]) and the covariance
 * terms returned by convariance_matrix(). The result is `1 - hd`, where `hd`
 * is derived from a clamped Bhattacharyya-style distance `bd`.
 *
 * @param res1, res2  Boxes to compare.
 * @param eps         Small constant added to denominators / log / sqrt arguments.
 * @return similarity value computed as 1 - hd.
 */
static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) {
    // Unpack directly; the previous version first built tuples from
    // uninitialised floats, which reads indeterminate values.
    float a1, b1, c1, a2, b2, c2;
    std::tie(a1, b1, c1) = convariance_matrix(res1);
    std::tie(a2, b2, c2) = convariance_matrix(res2);

    float x1 = res1.bbox[0], y1 = res1.bbox[1];
    float x2 = res2.bbox[0], y2 = res2.bbox[1];

    float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) /
               ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps);
    float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps);
    // max(..., 0) guards the square roots against slightly negative determinants.
    float t3 = std::log(
            ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2)) /
                    (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) +
                     eps) +
            eps);

    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    // Clamp the distance to [eps, 100] before exponentiating.
    bd = std::max(std::min(bd, 100.0f), eps);
    float hd = std::sqrt(1.0 - std::exp(-bd) + eps);

    return 1 - hd;
}

void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {

        if (output[1 + det_size * i + 4] <= conf_thresh)
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (probiou(item, dets[n]) >= nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

/**
 * Runs nms_obb on every image of a batch.
 *
 * @param res_batch    Resized to `batch_size`; entry b receives image b's detections.
 * @param output       Concatenated raw outputs, `output_size` floats per image.
 * @param batch_size   Number of images to process.
 * @param output_size  Stride, in floats, between consecutive images in `output`.
 * @param conf_thresh  Forwarded to nms_obb.
 * @param nms_thresh   Forwarded to nms_obb.
 */
void batch_nms_obb(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = &output[b * output_size];
        nms_obb(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}

// Computes the four corner points of an oriented box for drawing on `img`.
//
// The box is read as centre (bbox[0], bbox[1]), size (bbox[2], bbox[3]) and a
// rotation `angle` in radians. The axis-aligned extent is passed through
// get_rect_obb() (presumably mapping network coordinates to image coordinates —
// confirm against its definition), then rotated around the mapped centre.
// Returned points are clamped to the image bounds.
//
// NOTE(review): this function prints three debug lines to std::cout for every
// box it processes.
static std::vector<cv::Point> get_corner(cv::Mat& img, const Detection& box) {
    float cos_value, sin_value;

    // Read centre point, width/height and rotation
    float x1 = box.bbox[0];
    float y1 = box.bbox[1];
    float w = box.bbox[2];
    float h = box.bbox[3];
    float angle = box.angle * 180.0f / CV_PI;  // Convert radians to degrees

    // Print original angle
    std::cout << "Original angle: " << angle << std::endl;

    // Make `w` the longer side: swap width and height if height >= width
    if (h >= w) {
        std::swap(w, h);
        // Rotate by 90 degrees to compensate for the swap. fmod keeps the sign of
        // its first argument, so the result can still be negative here.
        angle = fmod(angle + 90.0f, 180.0f);
    }

    // Bring the angle towards the [0, 180] range
    if (angle < 0) {
        angle += 360.0f;  // Convert to positive value
    }
    if (angle > 180.0f) {
        angle -= 180.0f;  // Subtract 180 from angles greater than 180
    }

    // Print adjusted angle
    std::cout << "Adjusted angle: " << angle << std::endl;

    // Angle folded into [0, 180); only used for the debug print below
    float normal_angle = fmod(angle, 180.0f);
    if (normal_angle < 0) {
        normal_angle += 180.0f;  // Ensure it's a positive value
    }

    // Print normal angle value
    std::cout << "Normal angle: " << normal_angle << std::endl;

    cos_value = std::cos(angle * CV_PI / 180.0f);  // Convert to radians
    sin_value = std::sin(angle * CV_PI / 180.0f);

    // Axis-aligned extent of the (unrotated) box around its centre
    float l = x1 - w / 2;  // Left boundary
    float r = x1 + w / 2;  // Right boundary
    float t = y1 - h / 2;  // Top boundary
    float b = y1 + h / 2;  // Bottom boundary

    // Map the extent with get_rect_obb (presumably into original-image coordinates)
    float bbox[4] = {l, t, r, b};
    cv::Rect rect = get_rect_obb(img, bbox);

    // cv::Rect holds integers, so these divisions are integer divisions
    float x_ = (rect.x + rect.x + rect.width) / 2;   // Center x
    float y_ = (rect.y + rect.y + rect.height) / 2;  // Center y
    float width = rect.width;                        // Width
    float height = rect.height;                      // Height

    // Half-extent vectors along the rotated width (vec1) and height (vec2) axes
    std::vector<cv::Point> corner_points(4);
    float vec1x = width / 2 * cos_value;
    float vec1y = width / 2 * sin_value;
    float vec2x = -height / 2 * sin_value;
    float vec2y = height / 2 * cos_value;

    // Corners are centre +/- vec1 +/- vec2, listed in order around the box
    corner_points[0] = cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y)));  // centre + vec1 + vec2
    corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y)));  // centre + vec1 - vec2
    corner_points[2] =
            cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y)));  // centre - vec1 - vec2
    corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y)));  // centre - vec1 + vec2

    // Clamp every corner so it lies inside the image
    for (auto& point : corner_points) {
        point.x = std::max(0, std::min(point.x, img.cols - 1));
        point.y = std::max(0, std::min(point.y, img.rows - 1));
    }

    return corner_points;
}

/**
 * Draws every oriented detection onto its image.
 *
 * For each box the outline is drawn as a closed polyline through the corners
 * returned by get_corner(), followed by a filled label background and the text
 * "<class_id>:<confidence>" anchored at the first corner. The label is placed
 * above the anchor when there is room, otherwise below it.
 *
 * @param img_batch  Images to draw on (modified in place).
 * @param res_batch  Detections; entry i belongs to img_batch[i].
 */
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    // Palette stored as 0xRRGGBB, indexed by class id modulo its size.
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};

    for (size_t idx = 0; idx < img_batch.size(); ++idx) {
        std::vector<Detection>& detections = res_batch[idx];
        cv::Mat& canvas = img_batch[idx];

        for (auto& det : detections) {
            const uint32_t rgb = colors[(int)det.class_id % colors.size()];
            // OpenCV expects channel order B, G, R.
            const cv::Scalar bgr(rgb & 0xFF, rgb >> 8 & 0xFF, rgb >> 16 & 0xFF);

            std::vector<cv::Point> corners = get_corner(canvas, det);
            cv::polylines(canvas, std::vector<std::vector<cv::Point>>{corners}, true, bgr, 1);

            const std::string label =
                    (std::to_string((int)(det.class_id)) + ":" + to_string_with_precision(det.conf));
            const cv::Size label_size = cv::getTextSize(label, 0, 0.3, 1, nullptr);
            const int label_w = label_size.width;
            const int label_h = label_size.height;

            const cv::Point anchor(corners[0].x, corners[0].y);
            // Draw the label above the anchor only if it stays at least 3 px inside the image.
            const bool above = anchor.y - label_h >= 3;

            cv::Point opposite;
            opposite.x = anchor.x + label_w;
            opposite.y = above ? anchor.y - label_h - 3 : anchor.y + label_h + 3;
            cv::rectangle(canvas, anchor, opposite, bgr, -1, cv::LINE_AA);

            const int text_y = above ? anchor.y - 2 : anchor.y + label_h + 2;
            cv::putText(canvas, label, cv::Point(anchor.x, text_y), 0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA);
        }
    }
}


================================================
FILE: yolov12/src/postprocess.cu
================================================
//
// Created by lindsay on 23-7-17.
//
#include "postprocess.h"
#include "types.h"

// Copies oriented-box candidates whose confidence passes the threshold from the
// raw network output into the compact `parray` buffer.
//
// predict : [count, Detection, Detection, ...]; one thread handles one record.
// parray  : [n, record, record, ...] with `bbox_element` floats per record,
//           written as [cx, cy, w, h, conf, label, keep_flag, angle].
//           parray[0] is used as an atomic counter of reserved slots.
// num_bboxes is not used inside the kernel (it only sizes the launch grid).
static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                         int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    // Filter on confidence BEFORE reserving an output slot. Reserving first lets
    // rejected candidates consume slots, which can push valid detections past
    // max_objects and leaves empty holes in the output buffer.
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    //[center_x center_y w h conf class_id  mask[32] keypoints[51] angle]
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];
    float label = pitem[5];
    float angle = pitem[89];  // offset of `angle` per the layout comment above

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = cx;
    *pout_item++ = cy;
    *pout_item++ = width;
    *pout_item++ = height;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
    *pout_item++ = angle;
}

// Copies axis-aligned box candidates whose confidence passes the threshold from
// the raw network output into the compact `parray` buffer.
//
// predict : [count, Detection, Detection, ...]; one thread handles one record.
// parray  : [n, record, record, ...] with `bbox_element` floats per record,
//           written as [v0, v1, v2, v3, conf, label, keep_flag] where v0..v3 are
//           the first four floats of the Detection.
//           parray[0] is used as an atomic counter of reserved slots.
// num_bboxes is not used inside the kernel (it only sizes the launch grid).
static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                     int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    // Filter on confidence BEFORE reserving an output slot. Reserving first lets
    // rejected candidates consume slots, which can push valid detections past
    // max_objects and leaves empty holes in the output buffer.
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}

// Intersection-over-union of two boxes given by their four edge coordinates.
// Returns 0 when the boxes do not overlap.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
                                float bright, float bbottom) {
    // Edges of the intersection rectangle.
    const float ix0 = max(aleft, bleft);
    const float iy0 = max(atop, btop);
    const float ix1 = min(aright, bright);
    const float iy1 = min(abottom, bbottom);

    const float inter = max(ix1 - ix0, 0.0f) * max(iy1 - iy0, 0.0f);
    if (inter == 0.0f)
        return 0.0f;

    const float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter / (area_a + area_b - inter);
}

// Parallel NMS over the compact buffer produced by decode_kernel.
// Each thread owns one record and clears its keep flag (element 6) when a record
// of the same class with higher confidence — or equal confidence and a higher
// index — overlaps it with IoU above `threshold`.
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {
    const int position = (blockDim.x * blockIdx.x + threadIdx.x);
    const int count = min(static_cast<int>(bboxes[0]), max_objects);
    if (position >= count)
        return;

    float* self = bboxes + 1 + position * bbox_element;
    for (int i = 0; i < count; ++i) {
        float* other = bboxes + 1 + i * bbox_element;

        // Skip the record itself and records of another class.
        if (i == position || self[5] != other[5])
            continue;
        // Only records that are at least as confident can suppress this one.
        if (!(other[4] >= self[4]))
            continue;
        // Tie-break equal scores by index so exactly one of the two survives.
        if (other[4] == self[4] && i < position)
            continue;

        const float iou = box_iou(self[0], self[1], self[2], self[3], other[0], other[1], other[2], other[3]);
        if (iou > threshold) {
            self[6] = 0;
            return;
        }
    }
}

// Device-side covariance terms of an oriented box for ProbIoU.
// w, h : box size; r : rotation passed to cosf/sinf (radians).
// Results are written to the reference parameters a, b and c.
static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) {
    const float var_w = w * w / 12.0f;
    const float var_h = h * h / 12.0f;
    const float cos_r = cosf(r);
    const float sin_r = sinf(r);

    a = var_w * cos_r * cos_r + var_h * sin_r * sin_r;
    b = var_w * sin_r * sin_r + var_h * cos_r * cos_r;
    c = (var_w - var_h) * sin_r * cos_r;
}

// Device-side probabilistic IoU between two oriented boxes,
// https://arxiv.org/pdf/2106.06072v1.pdf.
// Each box is given as centre (cx, cy), size (w, h) and rotation r.
// Returns 1 - hd, where hd is derived from the clamped distance bd.
static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2,
                                    float h2, float r2, float eps = 1e-7) {
    float a1, b1, c1;
    float a2, b2, c2;
    convariance_matrix(w1, h1, r1, a1, b1, c1);
    convariance_matrix(w2, h2, r2, a2, b2, c2);

    const float num1 = (a1 + a2) * powf(cy1 - cy2, 2) + (b1 + b2) * powf(cx1 - cx2, 2);
    const float t1 = num1 / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);

    const float num2 = (c1 + c2) * (cx2 - cx1) * (cy1 - cy2);
    const float t2 = num2 / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);

    // fmaxf(..., 0) guards the square roots against slightly negative determinants.
    const float root1 = sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f));
    const float root2 = sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f));
    const float t3 = logf(((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2)) / (4 * root1 * root2 + eps) + eps);

    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    // Clamp the distance to [eps, 100] before exponentiating.
    bd = fmaxf(fminf(bd, 100.0f), eps);
    const float hd = sqrtf(1.0f - expf(-bd) + eps);
    return 1 - hd;
}

// Parallel NMS for oriented boxes over the buffer produced by decode_kernel_obb.
// Same suppression rule as nms_kernel, but overlap is measured with box_probiou
// using the angle stored in element 7 of every record.
static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) {
    const int position = (blockDim.x * blockIdx.x + threadIdx.x);
    const int count = min(static_cast<int>(bboxes[0]), max_objects);
    if (position >= count)
        return;

    float* self = bboxes + 1 + position * bbox_element;
    for (int i = 0; i < count; ++i) {
        float* other = bboxes + 1 + i * bbox_element;

        // Skip the record itself and records of another class.
        if (i == position || self[5] != other[5])
            continue;
        // Only records that are at least as confident can suppress this one.
        if (!(other[4] >= self[4]))
            continue;
        // Tie-break equal scores by index so exactly one of the two survives.
        if (other[4] == self[4] && i < position)
            continue;

        const float iou = box_probiou(self[0], self[1], self[2], self[3], self[7], other[0], other[1], other[2],
                                      other[3], other[7]);
        if (iou > threshold) {
            self[6] = 0;
            return;
        }
    }
}

// Launches decode_kernel on `stream` with one thread per candidate box
// (256 threads per block, enough blocks to cover num_bboxes).
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream) {
    const int threads_per_block = 256;
    const int num_blocks = ceil(num_bboxes / (float)threads_per_block);
    decode_kernel<<<num_blocks, threads_per_block, 0, stream>>>((float*)predict, num_bboxes, confidence_threshold,
                                                                parray, max_objects);
}

// Launches nms_kernel on `stream` with one thread per output slot
// (at most 256 threads per block, enough blocks to cover max_objects).
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    const int threads_per_block = max_objects < 256 ? max_objects : 256;
    const int num_blocks = ceil(max_objects / (float)threads_per_block);
    nms_kernel<<<num_blocks, threads_per_block, 0, stream>>>(parray, max_objects, nms_threshold);
}

// Launches decode_kernel_obb on `stream` with one thread per candidate box
// (256 threads per block, enough blocks to cover num_bboxes).
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream) {
    const int threads_per_block = 256;
    const int num_blocks = ceil(num_bboxes / (float)threads_per_block);
    decode_kernel_obb<<<num_blocks, threads_per_block, 0, stream>>>((float*)predict, num_bboxes,
                                                                    confidence_threshold, parray, max_objects);
}

// Launches nms_kernel_obb on `stream` with one thread per output slot
// (at most 256 threads per block, enough blocks to cover max_objects).
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    const int threads_per_block = max_objects < 256 ? max_objects : 256;
    const int num_blocks = ceil(max_objects / (float)threads_per_block);
    nms_kernel_obb<<<num_blocks, threads_per_block, 0, stream>>>(parray, max_objects, nms_threshold);
}


================================================
FILE: yolov12/src/preprocess.cu
================================================
#include "cuda_utils.h"
#include "preprocess.h"

// Staging buffers shared by all cuda_preprocess() calls in this file.
// Allocated by cuda_preprocess_init() and released by cuda_preprocess_destroy().
static uint8_t* img_buffer_host = nullptr;    // pinned host memory (cudaMallocHost)
static uint8_t* img_buffer_device = nullptr;  // device memory (cudaMalloc)

// Resamples an interleaved 3-channel 8-bit image into a planar float image.
//
// One thread produces one destination pixel: it maps (dx, dy) back to source
// coordinates with the 2x3 matrix `d2s`, samples the source bilinearly (using
// `const_value_st` for taps outside the image), swaps channels 0 and 2, divides
// by 255 and writes the three channels into separate planes of `dst`.
//
// src           : source pixels, 3 bytes per pixel
// src_line_size : bytes per source row
// edge          : number of destination pixels (threads beyond it exit)
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    // Unpack the destination-to-source affine matrix (row-major 2x3).
    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Destination pixel handled by this thread and its source-space position.
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // The four integer neighbours surrounding the sample position.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        // Bilinear weights: l* is the fractional part, h* its complement.
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Every tap defaults to the fill colour and is redirected to real pixel
        // data only when that neighbour lies inside the source image.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

// Uploads one interleaved 3-channel 8-bit image and letterbox-resizes it on the
// GPU into `dst` (planar float, dst_width x dst_height) via warpaffine_kernel.
//
// The image is staged through the file-level pinned/device buffers, so
// cuda_preprocess_init() must have been called first. The source rows are
// assumed to be tightly packed (src_width * 3 bytes per row).
// All GPU work is enqueued on `stream`; the function does not synchronise.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    // An empty image would make the scale below a division by zero and hand a
    // null pointer to memcpy; leave `dst` untouched in that case.
    if (src == nullptr || src_width <= 0 || src_height <= 0)
        return;

    int img_size = src_width * src_height * 3;
    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

    // Source-to-destination transform: uniform scale that fits the image inside
    // the destination, centred in both directions.
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
    // Both Mats wrap the AffineMatrix storage, so the inverse is normally written
    // straight into d2s.value.
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    // Copy back only if OpenCV reallocated the output; copying a buffer onto
    // itself with memcpy is undefined behaviour.
    if (m2x3_d2s.ptr<float>(0) != d2s.value) {
        memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
    }

    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = ceil(jobs / (float)threads);
    // 128 is the fill value for destination pixels that map outside the source.
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
}

// Preprocesses every image of a batch into consecutive slots of `dst`.
// Each slot holds 3 * dst_width * dst_height floats. The stream is synchronised
// after every image because cuda_preprocess reuses the same staging buffers.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int floats_per_image = dst_width * dst_height * 3;
    for (size_t idx = 0; idx < img_batch.size(); ++idx) {
        cv::Mat& image = img_batch[idx];
        float* slot = &dst[floats_per_image * idx];
        cuda_preprocess(image.ptr(), image.cols, image.rows, slot, dst_width, dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

// Allocates the staging buffers used by cuda_preprocess().
// max_image_size is the largest supported image in pixels; three bytes are
// reserved per pixel in both pinned host memory and device memory.
void cuda_preprocess_init(int max_image_size) {
    // pinned host staging buffer
    CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
    // matching device buffer
    CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
}

// Releases the staging buffers allocated by cuda_preprocess_init().
// The pointers are reset afterwards so that calling this function twice, or
// without a prior init, neither double-frees nor leaves dangling pointers.
void cuda_preprocess_destroy() {
    if (img_buffer_device != nullptr) {
        CUDA_CHECK(cudaFree(img_buffer_device));
        img_buffer_device = nullptr;
    }
    if (img_buffer_host != nullptr) {
        CUDA_CHECK(cudaFreeHost(img_buffer_host));
        img_buffer_host = nullptr;
    }
}


================================================
FILE: yolov12/yolo12_det.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger instance handed to the TensorRT builder and runtime below.
Logger gLogger;
using namespace nvinfer1;
// Floats per image in the raw network output: one leading count value followed
// by kMaxNumOutputBbox Detection records.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

void serialize_engine(std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels,
                      std::string& type) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    serialized_engine = buildEngineYolo12Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}

// Loads a serialized plan file and creates the runtime, engine and execution
// context from it. The three objects are returned through the out-pointers and
// must be deleted by the caller.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }

    // Determine the file size by seeking to the end, then rewind.
    file.seekg(0, file.end);
    size_t size = file.tellg();
    file.seekg(0, file.beg);

    char* plan_bytes = new char[size];
    assert(plan_bytes);
    file.read(plan_bytes, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(plan_bytes, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);

    // The engine keeps its own copy; the file buffer is no longer needed.
    delete[] plan_bytes;
}

// Allocates the device input/output buffers and the host-side buffers needed by
// the selected post-processing mode.
//
// cuda_post_process == "c": allocates *output_buffer_host for CPU post-processing.
// cuda_post_process == "g": allocates *decode_ptr_host / *decode_ptr_device for GPU
//                           decode + NMS (only supported for kBatchSize == 1).
// Any other value allocates only the device input/output buffers.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            // This is an error path: report failure to the calling process
            // instead of exiting with a success status.
            exit(1);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}

// Runs one inference on `stream` and brings the results back to the host.
//
// "c" mode: copies the raw network output into `output` for CPU post-processing.
// "g" mode: runs cuda_decode + cuda_nms on the device and copies the compact
//           result into `decode_ptr_host`.
// The function returns only after the stream has been synchronised.
//
// The reported time now includes GPU execution: the stream is synchronised
// before the end timestamp is taken. Previously only the time needed to enqueue
// the asynchronous work was measured.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueueV2(buffers, stream, nullptr);
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        // Wait for the GPU so the measurement covers the actual work.
        CUDA_CHECK(cudaStreamSynchronize(stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    } else if (cuda_post_process == "g") {
        // Clear the counter and all keep flags before decoding.
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        // Wait for the GPU so the measurement covers the actual work.
        CUDA_CHECK(cudaStreamSynchronize(stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference and gpu postprocess time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Still needed when neither mode matched; a no-op otherwise.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Parses the command line.
//
//   -s <wts> <engine> <n|s|m|l|x>   serialize: fills wts, engine, type, gd, gw, max_channels
//   -d <engine> <img_dir> <c|g>     deserialize + infer: fills engine, img_dir, cuda_post_process
//
// Returns false for any other argument combination, including an unknown model
// sub-type or a post-process mode other than "c" (CPU) or "g" (GPU). The mode is
// validated here because the rest of the program silently allocates no result
// buffers for unknown modes.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;
    if (std::string(argv[1]) == "-s" && (argc == 5)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        auto sub_type = std::string(argv[4]);

        // Only the first character of the sub-type selects the scaling factors.
        if (sub_type[0] == 'n') {
            gd = 0.50;
            gw = 0.25;
            max_channels = 1024;
            type = "n";
        } else if (sub_type[0] == 's') {
            gd = 0.50;
            gw = 0.50;
            max_channels = 1024;
            type = "s";
        } else if (sub_type[0] == 'm') {
            gd = 0.50;
            gw = 1.00;
            max_channels = 512;
            type = "m";
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
            type = "l";
        } else if (sub_type[0] == 'x') {
            gd = 1.0;
            gw = 1.50;
            max_channels = 512;
            type = "x";
        } else {
            return false;
        }
    } else if (std::string(argv[1]) == "-d" && argc == 5) {
        const std::string mode(argv[4]);
        // Reject unsupported post-process modes up front.
        if (mode != "c" && mode != "g")
            return false;
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = mode;
    } else {
        return false;
    }
    return true;
}

int main(int argc, char** argv) {
    // yolo12_det -s ../models/yolo12n.wts ../models/yolo12n.fp32.trt n
    // yolo12_det -d ../models/yolo12n.fp32.trt ../images c
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string cuda_post_process;
    std::string type;
    int model_bboxes;
    float gd = 0, gw = 0;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolo12_det -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolo12_det -d [.engine] ../images  [c/g]// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, gd, gw, max_channels, type);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        // 保存output_buffer_host的前100个值，一行一个
        //        std::ofstream out("../models/output.txt");
        //        for (int j = 0; j < 100; j++) {
        //            out << output_buffer_host[j] << std::endl;
        //        }
        //        out.close();
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            //Process gpu decode and nms results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov12-tubro/CMakeLists.txt
================================================
# Build script for the YOLOv12 TensorRT samples: one plugin library (myplugins)
# and three executables (detection, segmentation, classification).
cmake_minimum_required(VERSION 3.10)

project(yolov12)

# C++11 is requested both as a raw compiler flag and through CMAKE_CXX_STANDARD.
add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
# The build type is forced to Debug regardless of what is passed on the command line.
set(CMAKE_BUILD_TYPE Debug)

# Set CUDA compiler - use find_package or environment variable
if(NOT DEFINED CMAKE_CUDA_COMPILER)
  find_program(
    CMAKE_CUDA_COMPILER nvcc
    HINTS ENV CUDA_HOME
    PATH_SUFFIXES bin)
endif()
enable_language(CUDA)

# Project headers and the plugin sources' headers.
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt
# Use CUDA_TOOLKIT_ROOT_DIR or CUDA_HOME environment variable
if(NOT DEFINED CUDA_TOOLKIT_ROOT_DIR)
  if(DEFINED ENV{CUDA_HOME})
    set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_HOME})
  else()
    set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
  endif()
endif()

# Use TENSORRT_DIR environment variable or default path
if(NOT DEFINED TENSORRT_DIR)
  if(DEFINED ENV{TENSORRT_DIR})
    set(TENSORRT_DIR $ENV{TENSORRT_DIR})
  else()
    set(TENSORRT_DIR "/opt/TensorRT-8.6.1.6")
  endif()
endif()

# On aarch64 only the CUDA target directories are added; TENSORRT_DIR is not
# used in that branch.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message("embed_platform on")
  include_directories(
    ${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/include)
  link_directories(
    ${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/lib)
else()
  message("embed_platform off")

  # cuda
  include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
  link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)

  # tensorrt
  include_directories(${TENSORRT_DIR}/include)
  link_directories(${TENSORRT_DIR}/lib)
endif()

# Shared library containing the custom TensorRT plugin layer.
add_library(
  myplugins SHARED
  ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# NOTE(review): OpenCV is not marked REQUIRED, so a missing installation is
# only detected later, at compile or link time.
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# All sources under src/ are compiled into every executable below.
file(
  GLOB_RECURSE SRCS
  ${PROJECT_SOURCE_DIR}/src/*.cpp
  ${PROJECT_SOURCE_DIR}/src/*.cu)

# Detection sample
add_executable(
  yolov12-det
  ${PROJECT_SOURCE_DIR}/yolov12_det.cpp
  ${SRCS})
target_link_libraries(
  yolov12-det nvinfer cudart myplugins ${OpenCV_LIBS})

# Segmentation sample
add_executable(
  yolov12-seg
  ${PROJECT_SOURCE_DIR}/yolov12_seg.cpp
  ${SRCS})
target_link_libraries(
  yolov12-seg nvinfer cudart myplugins ${OpenCV_LIBS})

# Classification sample
add_executable(
  yolov12-cls
  ${PROJECT_SOURCE_DIR}/yolov12_cls.cpp
  ${SRCS})
target_link_libraries(
  yolov12-cls nvinfer cudart myplugins ${OpenCV_LIBS})


================================================
FILE: yolov12-tubro/gen_wts.py
================================================
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse the command line and resolve the input/output paths.

    Returns:
        A ``(weights_path, output_path, model_type)`` tuple.

        * ``weights_path`` is the ``-w/--weights`` value, unchanged.
        * ``output_path`` is derived as follows: when ``-o/--output`` is
          missing or empty, the weights path with its extension replaced by
          ``.wts``; when it names an existing directory, ``<weights stem>.wts``
          inside that directory; otherwise the value as given.
        * ``model_type`` is one of ``detect``, ``cls``, ``seg``, ``pose``,
          ``obb`` (default ``detect``).

    Raises:
        SystemExit: if the weights path is not an existing file (argparse also
            exits on malformed arguments).
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument(
        '-w', '--weights', required=True,
        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output',
        help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect',
        choices=['detect', 'cls', 'seg', 'pose', 'obb'],
        help='determines the model is detection/classification')
    args = parser.parse_args()

    weights_path = args.weights
    if not os.path.isfile(weights_path):
        raise SystemExit('Invalid input file')

    output_path = args.output
    if not output_path:
        # No explicit output: write next to the weights file.
        output_path = os.path.splitext(weights_path)[0] + '.wts'
    elif os.path.isdir(output_path):
        # Output is a directory: reuse the weights file's base name inside it.
        stem = os.path.splitext(os.path.basename(weights_path))[0]
        output_path = os.path.join(output_path, stem + '.wts')

    # Keep the namespace in sync with the resolved value, as before.
    args.output = output_path
    return weights_path, output_path, args.type


pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')

# Everything is loaded and kept on the CPU.
device = 'cpu'

# NOTE(security): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only run this script on checkpoints from a trusted source.
checkpoint = torch.load(pt_file, map_location=device, weights_only=False)
model = checkpoint['model'].float()  # load to FP32

if m_type in ('detect', 'seg', 'pose', 'obb'):
    # For these model types the last module's `anchors` attribute is read and
    # then removed before the weights are exported.
    head = model.model[-1]
    # NOTE(review): anchor_grid is not used anywhere in the visible script.
    anchor_grid = head.anchors * head.stride[..., None, None]

    delattr(head, 'anchors')

model.to(device).eval()

# Output format: first line is the number of state_dict entries, then one line
# per entry: "<name> <element count> " followed by " <hex>" for every element,
# where <hex> is the big-endian float32 encoding of the value.
with open(wts_file, 'w') as f:
    weights = model.state_dict()
    f.write('{}\n'.format(len(weights)))
    for name, tensor in weights.items():
        flat = tensor.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(name, len(flat)))
        f.write(''.join(' ' + struct.pack('>f', float(x)).hex() for x in flat))
        f.write('\n')


================================================
FILE: yolov12-tubro/include/block.h
================================================
#pragma once

#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"

// Declarations of the network building blocks. Only the signatures are visible
// in this header; the notes below describe the interface and mark anything
// that is inferred from names as an assumption.
//
// NOTE(review): a using-directive in a header leaks `std` into every file that
// includes it. It is kept because other files may depend on it.
using namespace std;

// Declared to return a map from a name to nvinfer1::Weights for the given file.
// NOTE(review): `file` is passed by value; presumably it is a path to a .wts
// file as written by gen_wts.py - confirm against the implementation.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

// Adds to `network` and returns an IScaleLayer for `input`.
// `lname` is presumably the key prefix used to look entries up in `weightMap`,
// and `eps` the batch-norm epsilon - confirm against the implementation.
// NOTE(review): `weightMap` is taken by value here and in most functions below,
// so the whole map is copied on every call.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps);

// Returns an IElementWiseLayer. Presumably convolution + batch norm + SiLU with
// `ch` output channels, kernel `k`, stride `s`, padding `p`, groups `g` and
// dilation `d` - TODO confirm parameter meanings.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname, int p = 0, int g = 1,
                                        int d = 1);

// Returns a generic ILayer. Defaults: k = 1, s = 1, padding = 0, g = 1,
// act = true. Presumably `act` toggles the activation - TODO confirm.
nvinfer1::ILayer* Conv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                       nvinfer1::ITensor& input, int c_out, std::string lname, int k = 1, int s = 1, int padding = 0,
                       int g = 1, bool act = true);

// Returns an IShuffleLayer. NOTE(review): the name suggests the distribution
// focal loss head of YOLO models; parameter meanings are not visible here.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);

// Returns an IPluginV2Layer built from the concatenation layers in `dets`.
// `px_arry` points to `px_arry_num` ints; the three flags select the
// segmentation / pose / obb variants (meaning defined by the implementation).
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb);

// Returns an IElementWiseLayer. Defaults: n = 1, shortcut = true, g = 1,
// e = 0.5, k = 3.
nvinfer1::IElementWiseLayer* C3k(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c2,
                                 std::string lname, int n = 1, bool shortcut = true, int g = 1, float e = 0.5,
                                 int k = 3);

// Returns an IElementWiseLayer. Unlike its siblings this function takes
// `weightMap` by non-const reference. Defaults: c3k = false, e = 0.5, g = 1,
// shortcut = true.
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2,
                                  int n, std::string lname, bool c3k = false, float e = 0.5, int g = 1,
                                  bool shortcut = true);

// Returns a generic ILayer. NOTE(review): presumably the area-attention module
// of YOLOv12 with `num_heads` heads over `dim` channels - confirm. Default
// area = 1.
nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int dim, int num_heads, std::string lname, int area = 1);

// Returns a generic ILayer. NOTE(review): presumably a depthwise convolution
// with `ch` channels, kernel `k` and stride `s` - confirm.
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname);

// Returns an IElementWiseLayer. Defaults: mlp_ratio = 1.2, area = 1.
nvinfer1::IElementWiseLayer* ABlock(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int dim, int num_heads, std::string lname, float mlp_ratio = 1.2, int area = 1);

// Returns a generic ILayer. The weight-map parameter is unnamed in this
// declaration. Defaults: a2 = true, area = 1, residual = false,
// mlp_ratio = 2.0, e = 0.5, g = 1, shortcut = true.
nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>,
                        nvinfer1::ITensor& input, int c2, int n, std::string lname, bool a2 = true, int area = 1,
                        bool residual = false, float mlp_ratio = 2.0, float e = 0.5, int g = 1, bool shortcut = true);

// NOTE(review): presumably a debugging helper that prints the dimensions of
// `input` - confirm against the implementation.
void cout_dim(nvinfer1::ITensor& input);


================================================
FILE: yolov12-tubro/include/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
   public:
    // Stores the calibration configuration. `read_cache` defaults to true.
    // NOTE(review): presumably `img_dir` is the folder of calibration images and
    // `calib_table_name` the cache file - confirm against the implementation.
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
                           const char* input_blob_name, bool read_cache = true);
    virtual ~Int8EntropyCalibrator2();

    // Overrides of the nvinfer1::IInt8EntropyCalibrator2 interface. Their
    // behavior is defined in the implementation file, which is not visible here.
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

   private:
    // Member declaration order also fixes the initialization order; do not
    // reorder without checking the constructor's initializer list.
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;  // presumably the index of the next image to read - confirm
    std::string img_dir_;
    std::vector<std::string> img_files_;
    size_t input_count_;
    std::string calib_table_name_;
    // Raw pointer, not a copy: the caller's string must outlive this object.
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;  // presumably a device buffer owned by this object - confirm
    std::vector<char> calib_cache_;
};

#endif  // ENTROPY_CALIBRATOR_H


================================================
FILE: yolov12-tubro/include/config.h
================================================
// Compile-time configuration shared by the yolov12 sources.
//
// The include guard matters: every constant below is a definition, and this
// header is also pulled in indirectly through types.h. Without the guard a
// translation unit that sees the header twice fails with redefinition errors.
#ifndef TRTX_YOLOV12_CONFIG_H_
#define TRTX_YOLOV12_CONFIG_H_

// Precision selection: enable exactly one of the three macros.
#define USE_FP16
// #define USE_FP32
// #define USE_INT8

// Engine tensor names
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";
const static char* kProtoTensorName = "proto";
// Detection / segmentation model's number of classes
const static int kNumClass = 4;
// Pose model's number of classes
const static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;  // number of keypoints total
// obb model's number of classes
constexpr static int kObbNumClass = 15;
const static int kObbNe = 1;  // number of extra parameters
const static int kBatchSize = 1;
const static int kGpuId = 0;
// Input shape of the detection / segmentation models
const static int kInputH = 640;
const static int kInputW = 640;
// Input shape of the obb model
const static int kObbInputH = 1024;
const static int kObbInputW = 1024;
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
// Largest supported input image, in pixels
const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 1000;
// Quantization input image folder path
const static char* kInputQuantizationFolder = "./coco_calib";

// Classification model's number of classes
constexpr static int kClsNumClass = 5;
// Classification model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

#endif  // TRTX_YOLOV12_CONFIG_H_


================================================
FILE: yolov12-tubro/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

// The macro below uses std::cerr and std::abort, so this header includes what
// it needs instead of relying on the including file.
#include <cstdlib>
#include <iostream>

#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call once. If the call does not return cudaSuccess,
// the numeric error code, its description, and the source location are printed
// to std::cerr and the process is aborted.
//
// std::abort() is used rather than assert(0): assert is compiled out when
// NDEBUG is defined, which would let a failed CUDA call go unnoticed in release
// builds.
//
// The macro expands to a braced block, so `CUDA_CHECK(call);` works as a
// statement; avoid using it as the unbraced body of an if/else.
#define CUDA_CHECK(callstr)                                                                       \
    {                                                                                             \
        cudaError_t error_code = callstr;                                                         \
        if (error_code != cudaSuccess) {                                                          \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code)    \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                     \
            std::abort();                                                                         \
        }                                                                                         \
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: yolov12-tubro/include/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream, prefixed with a timestamp and a fixed
//!  prefix, whenever it is synchronized or destroyed while holding pending characters.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    Stream that receives the formatted messages.
    //! \param prefix    Text written between the timestamp and the message.
    //! \param shouldLog When false, messages are discarded instead of being written.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Takes over the configuration of \p other. All three members are initialized: leaving mShouldLog
    //! uninitialized would make every later read of it undefined behavior, and an empty prefix would lose the
    //! severity tag. Characters already buffered in \p other are not transferred (the std::stringbuf base is
    //! default-constructed).
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the two differ there is pending output that has not been synchronized yet
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    //! Synchronizes the stream buffer: emits the pending contents and resets the buffer.
    //! \return 0 (success)
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><contents>" to the output stream when logging is enabled, then
    //! empties the buffer in either case.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp
            // NOTE: std::localtime returns a pointer to shared static storage and is not thread-safe.
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            // The timestamp goes to the same stream as the message, so that a message routed to std::cerr is
            // not split between stdout and stderr. The fill character is sticky, so it is restored afterwards
            // to avoid changing the formatting of the caller's later output on that stream.
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            mOutput.fill(previousFill);
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // flush the stream
            mOutput.flush();
        }
        // Always empty the buffer: suppressed messages must not pile up in memory or be emitted later when
        // logging is switched on.
        str("");
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
   public:
    // Forwards all three arguments unchanged to the LogStreamConsumerBuffer member.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog) {}

   protected:
    // Protected so that the derived LogStreamConsumer can hand its address to std::ostream.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Builds a stream that logs messages of level \p severity.
    //!  The message is emitted only when \p severity <= \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Creates a fresh consumer with the severity and logging flag of \p other.
    //!  A new buffer is constructed; nothing buffered in \p other is carried over.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the already-constructed buffer to the stream
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this stream logs, given a new threshold, and tells the buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! \brief kINFO and numerically larger severities go to std::cout, everything else to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Maps a severity to its bracketed tag; unknown values assert and yield an empty string.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Creates a logger; the default threshold is Severity::kWARNING.
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is written through a temporary LogStreamConsumer, prefixed with "[TRT] ".
    //! \p msg is converted to std::string, so it must not be null.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // Private: instances are created only by Logger (see defineTest()).
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // The RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED for the test and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED for the test and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED for the test and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() when \p pass is true, otherwise to reportFail().
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    //! NOTE(review): nothing inside this class calls this function in the visible code.
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints "&&&& <RESULT> <name> # <cmdline>" to the stream selected for Severity::kINFO.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief shared factory for the LOG_* helpers below: builds a LogStreamConsumer for the given severity using
//!        the logger's current reportable severity
//!
inline LogStreamConsumer makeLogStream(const Logger& logger, Severity severity) {
    return LogStreamConsumer(logger.getReportableSeverity(), severity);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return makeLogStream(logger, Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return makeLogStream(logger, Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return makeLogStream(logger, Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return makeLogStream(logger, Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return makeLogStream(logger, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: yolov12-tubro/include/macros.h
================================================
// Portability macros: symbol visibility (API) and TensorRT-version-dependent
// qualifiers (TRT_NOEXCEPT, TRT_CONST_ENQUEUE).
// NOTE(review): identifiers containing a double underscore, such as the guard
// name below, are reserved for the implementation in C++.
#ifndef __MACROS_H
#define __MACROS_H

// Provides NV_TENSORRT_MAJOR, which is tested below.
#include "NvInfer.h"

// API: export the symbol when API_EXPORTS is defined, otherwise import it
// (MSVC) or expand to nothing (other compilers).
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// For TensorRT 8 and later the macros expand to `noexcept` and `const`;
// for older versions they expand to nothing.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: yolov12-tubro/include/model.h
================================================
#pragma once

#include <assert.h>
#include <string>
#include "NvInfer.h"

// Engine builders, one per task. Each is declared to return an
// nvinfer1::IHostMemory*.
// NOTE(review): presumably this is the serialized engine, `gd`/`gw` are the
// depth/width multipliers and `type` the model-size tag - confirm against the
// implementation. The ownership of the returned pointer is not visible here.

// Detection model. `gd`, `gw`, `max_channels` and `type` are non-const
// references, so the implementation is allowed to modify them.
nvinfer1::IHostMemory* buildEngineYolov12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels, std::string& type);

// Segmentation model. Same parameter list as the detection builder.
nvinfer1::IHostMemory* buildEngineYolov12Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels, std::string& type);

// Classification model. Note the different tail of the parameter list:
// `type` comes before `max_channels`, and `max_channels` is passed by value.
nvinfer1::IHostMemory* buildEngineYolov12Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             std::string& type, int max_channels);


================================================
FILE: yolov12-tubro/include/postprocess.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Only declarations are visible in this header; descriptions inferred from
// names are marked as assumptions.

// Geometry helper
// Returns a cv::Rect for `bbox` relative to `img`.
// NOTE(review): presumably maps a box from network-input coordinates back to
// the original image - confirm against the implementation.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Processing functions
// These take a host pointer `decode_ptr_host` and fill `res_batch` / `res`.
// NOTE(review): presumably the buffer is the output of the cuda_decode /
// cuda_nms functions declared below, with `bbox_element` floats per box - confirm.
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch);

void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch);

void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count);

void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count);

// NMS functions
// The results are written to `res` / `batch_res`. `nms_thresh` defaults to 0.5
// in all four functions.
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);

void batch_nms(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh = 0.5);

void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);

void batch_nms_obb(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh = 0.5);

// CUDA-related functions
// All of them take the cudaStream_t to work on.
// NOTE(review): presumably `predict` and `parray` are device pointers and the
// work is enqueued asynchronously on `stream` - confirm.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream);

void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream);

void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

// Drawing functions
// The images are passed by non-const reference.
// NOTE(review): presumably they are drawn on in place - confirm.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

// `labels_map` maps an int to a label string; presumably the key is the class id - confirm.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map);


================================================
FILE: yolov12-tubro/include/preprocess.h
================================================
#pragma once

#include <map>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Only declarations are visible here; descriptions inferred from names are
// marked as assumptions.

// NOTE(review): presumably allocates the buffers used by the functions below,
// sized for images of up to `max_image_size` pixels; must be paired with
// cuda_preprocess_destroy() - confirm.
void cuda_preprocess_init(int max_image_size);

// NOTE(review): presumably releases what cuda_preprocess_init() allocated - confirm.
void cuda_preprocess_destroy();

// Takes one source image (`src`, `src_width` x `src_height`) and a destination
// float buffer for a `dst_width` x `dst_height` result, on `stream`.
// NOTE(review): channel order, normalization and whether `src`/`dst` are host
// or device pointers are defined by the implementation - confirm.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);

// Batch variant taking a vector of cv::Mat images and a single destination
// buffer. NOTE(review): presumably images are written to `dst` back to back - confirm.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);


================================================
FILE: yolov12-tubro/include/types.h
================================================
#pragma once
#include "config.h"

// One detection result. Every member is a float or a float array and the
// struct is aligned like float.
// NOTE(review): presumably this lets raw float output buffers be reinterpreted
// as Detection records, in which case member order and sizes must not change -
// confirm against the plugin / post-processing code.
struct alignas(float) Detection {
    //center_x center_y w h
    float bbox[4];
    float conf;  // bbox_conf * cls_conf
    float class_id;  // class index, stored as float
    float mask[32];  // presumably segmentation mask coefficients - confirm
    float keypoints[kNumberOfPoints * 3];  // 17*3 keypoints
    float angle;                           // obb angle
};

// Six floats. NOTE(review): presumably the row-major 2x3 matrix of an affine
// transform - confirm against the preprocessing code.
struct AffineMatrix {
    float value[6];
};

// Number of floats per decoded box. It evaluates to 6 + 1 = 7, matching the
// seven fields listed below; the value is derived from sizeof(AffineMatrix)
// rather than from the field list itself.
const int bbox_element =
        sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag


================================================
FILE: yolov12-tubro/include/utils.h
================================================
#pragma once
#include <dirent.h>
#include <fstream>
#include <opencv2/opencv.hpp>

// Letterboxes `img` into an input_w x input_h canvas: the image is resized with its aspect ratio
// preserved (bilinear), centred, and the remaining border is filled with grey (128, 128, 128).
// Returns the new canvas; `img` itself is not modified.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);

    // Start from "fills the whole canvas" and shrink the dimension that is not the limiting one.
    int new_w = input_w;
    int new_h = input_h;
    int off_x = 0;
    int off_y = 0;
    if (scale_h > scale_w) {
        // Width is the limiting dimension: pad top and bottom.
        new_h = scale_w * img.rows;
        off_y = (input_h - new_h) / 2;
    } else {
        // Height is the limiting dimension: pad left and right.
        new_w = scale_h * img.cols;
        off_x = (input_w - new_w) / 2;
    }

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect roi(off_x, off_y, resized.cols, resized.rows);
    resized.copyTo(canvas(roi));
    return canvas;
}

// Appends the name of every entry of directory `p_dir_name` (except "." and "..") to `file_names`.
// Only the bare entry name is stored, not the full path.
// Returns 0 on success, -1 if the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.emplace_back(entry->d_name);
    }

    closedir(dir_handle);
    return 0;
}

// Returns `str` with leading AND trailing whitespace removed (despite the name, both ends are trimmed).
// Whitespace means space, tab, CR, LF, form feed and vertical tab, so a line read from a file with
// Windows line endings loses its trailing '\r'. A string made only of whitespace yields "".
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char* const kWhitespace = " \t\r\n\f\v";

    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        // Nothing but whitespace (or empty input).
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}

// Src: https://stackoverflow.com/questions/16605967
// Formats `a_value` in fixed-point notation with `n` digits after the decimal point.
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatter;
    formatter.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatter.precision(n);
    formatter << a_value;
    return formatter.str();
}

// Reads one label per line from `labels_filename` into `labels_map`, keyed by the zero-based line
// number. Each line is passed through trim_leading_whitespace() before it is stored.
// Returns 0 on success, -1 if the file cannot be opened (labels_map is left untouched in that case).
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        // Report the failure instead of silently returning success with an empty map.
        return -1;
    }

    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        labels_map[index] = trim_leading_whitespace(line);
        index++;
    }
    // The stream closes itself when it goes out of scope.
    return 0;
}


================================================
FILE: yolov12-tubro/plugin/yololayer.cu
================================================
#include <assert.h>
#include <math.h>
#include <string.h>
#include <iostream>
#include <vector>
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"

namespace Tn {
// Copies the object representation of `val` to `buffer` and advances `buffer` past it.
// memcpy is used rather than a store through reinterpret_cast<T*> so the write stays well-defined
// when `buffer` is not suitably aligned for T (the serialized record mixes 4-byte and 1-byte fields).
template <typename T>
void write(char*& buffer, const T& val) {
    memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

// Counterpart of write(): fills `val` from the bytes at `buffer` and advances `buffer` past them.
template <typename T>
void read(const char*& buffer, T& val) {
    memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}
}  // namespace Tn

// Device-side logistic function: 1 / (1 + e^-x).
__device__ float sigmoid(float x) {
    const float denom = 1.0f + exp(-x);
    return 1.0f / denom;
}

namespace nvinfer1 {
// Builds a plugin from explicit parameters. The stride table is copied into a buffer owned by the
// plugin (released in the destructor), so the caller's `strides` array need not outlive this call.
// mThreadCount keeps its in-class default; the plugin namespace is not set here.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth,
                                 int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb,
                                 const int* strides, int stridesLength)
    : mClassCount(classCount),
      mNumberofpoints(numberofpoints),
      mConfthreshkeypoints(confthreshkeypoints),
      mYoloV8NetWidth(netWidth),
      mYoloV8netHeight(netHeight),
      mMaxOutObject(maxOut),
      is_segmentation_(is_segmentation),
      is_pose_(is_pose),
      is_obb_(is_obb),
      mStrides(new int[stridesLength]),
      mStridesLength(stridesLength) {
    memcpy(mStrides, strides, stridesLength * sizeof(int));
}

// Releases the stride table allocated by the constructors.
YoloLayerPlugin::~YoloLayerPlugin() {
    // delete[] on a null pointer is a no-op, so no explicit guard is needed.
    delete[] mStrides;
    mStrides = nullptr;
}

// Rebuilds a plugin from the byte stream produced by serialize(). Fields are read back in exactly
// the order they were written; in debug builds the consumed size is checked against `length`.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    const char* const begin = reinterpret_cast<const char*>(data);
    const char* cursor = begin;

    // Fixed-size scalar header.
    Tn::read(cursor, mClassCount);
    Tn::read(cursor, mNumberofpoints);
    Tn::read(cursor, mConfthreshkeypoints);
    Tn::read(cursor, mThreadCount);
    Tn::read(cursor, mYoloV8NetWidth);
    Tn::read(cursor, mYoloV8netHeight);
    Tn::read(cursor, mMaxOutObject);
    Tn::read(cursor, mStridesLength);

    // Variable-length stride table.
    mStrides = new int[mStridesLength];
    int i = 0;
    while (i < mStridesLength) {
        Tn::read(cursor, mStrides[i]);
        ++i;
    }

    // Task flags.
    Tn::read(cursor, is_segmentation_);
    Tn::read(cursor, is_pose_);
    Tn::read(cursor, is_obb_);

    assert(cursor == begin + length);
}

// Writes the plugin state to `buffer` as a flat byte stream: scalar header, stride table, task flags.
// The deserializing constructor reads the fields back in the same order. In debug builds the number
// of bytes written is checked against getSerializationSize().
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    char* const begin = static_cast<char*>(buffer);
    char* cursor = begin;

    Tn::write(cursor, mClassCount);
    Tn::write(cursor, mNumberofpoints);
    Tn::write(cursor, mConfthreshkeypoints);
    Tn::write(cursor, mThreadCount);
    Tn::write(cursor, mYoloV8NetWidth);
    Tn::write(cursor, mYoloV8netHeight);
    Tn::write(cursor, mMaxOutObject);
    Tn::write(cursor, mStridesLength);

    int i = 0;
    while (i < mStridesLength) {
        Tn::write(cursor, mStrides[i]);
        ++i;
    }

    Tn::write(cursor, is_segmentation_);
    Tn::write(cursor, is_pose_);
    Tn::write(cursor, is_obb_);

    assert(cursor == begin + getSerializationSize());
}

// Number of bytes serialize() writes: the scalar fields, one int per stride, and the three flags.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    size_t bytes = 0;

    // Scalar header.
    bytes += sizeof(mClassCount);
    bytes += sizeof(mNumberofpoints);
    bytes += sizeof(mConfthreshkeypoints);
    bytes += sizeof(mThreadCount);
    bytes += sizeof(mYoloV8netHeight);
    bytes += sizeof(mYoloV8NetWidth);
    bytes += sizeof(mMaxOutObject);
    bytes += sizeof(mStridesLength);

    // Stride table.
    bytes += sizeof(int) * mStridesLength;

    // Task flags.
    bytes += sizeof(is_segmentation_);
    bytes += sizeof(is_pose_);
    bytes += sizeof(is_obb_);

    return bytes;
}

// No per-engine resources are set up here; always reports success (0).
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Output shape per image: one leading float (the detection counter written by the kernel) followed
// by room for mMaxOutObject Detection records, expressed in floats. The arguments are not used.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    const int detection_floats = mMaxOutObject * sizeof(Detection) / sizeof(float);
    return nvinfer1::Dims3(detection_floats + 1, 1, 1);
}

// Stores the namespace pointer as-is; the string is NOT copied, so the caller must keep it alive for
// as long as this plugin may hand it out via getPluginNamespace().
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Returns the pointer most recently passed to setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The single output is always FP32, regardless of the input types.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                                      int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// The output is never broadcast across the batch.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {

    return false;
}

// No input may be broadcast across the batch.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {

    return false;
}

// Nothing to configure: the plugin does not inspect the tensor descriptions it is given.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}

// The plugin does not use the cuDNN/cuBLAS handles or the allocator offered by the execution context.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}

// Nothing was attached, so there is nothing to release.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Plugin type string; identical to the name reported by YoloPluginCreator::getPluginName().
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {

    return "YoloLayer_TRT";
}

// Plugin version string; identical to the version reported by YoloPluginCreator::getPluginVersion().
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Self-destruction: the object deletes itself, so it must have been allocated with `new`.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Returns a heap-allocated copy built through the parameter constructor (which duplicates the stride
// table) and sharing this plugin's namespace pointer. mThreadCount is not forwarded, so the copy
// starts with the in-class default value.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    auto* copy = new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth,
                                     mYoloV8netHeight, mMaxOutObject, is_segmentation_, is_pose_, is_obb_, mStrides,
                                     mStridesLength);
    copy->setPluginNamespace(mPluginNamespace);
    return copy;
}

// Runs the decode kernels for one batch on `stream`. `inputs` holds one device pointer per stride
// level and outputs[0] receives the packed detections; `workspace` is unused. Always returns 0.
//
// The parameter qualifiers mirror the declaration in yololayer.h: `inputs` is always
// `const void* const*` and TRT_CONST_ENQUEUE applies to `outputs`. Previously the macro sat on
// `inputs`, which made the definition differ from the declaration whenever the macro expands to
// nothing (presumably TensorRT versions before 8 -- the macro is defined in macros.h, not shown here).
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
    return 0;
}

// Device-side logistic function using the single-precision expf: 1 / (1 + e^-data).
__device__ float Logist(float data) {
    const float denom = 1.0f + expf(-data);
    return 1.0f / denom;
}

// Decodes one stride level of the network output into Detection records.
//
// One thread handles one (image, grid cell) pair. For each cell it picks the best class, drops the
// cell if that class probability is below a fixed 0.1, atomically reserves a slot in the image's
// output slice and fills a Detection there.
//
//   input         : per image, info_len channels of grid_h * grid_w floats each (channel-major);
//                   channels 0..3 are box values, followed by `classes` class scores, followed by the
//                   optional mask / keypoint / angle channels selected by the flags
//   output        : per image, outputElem floats: [0] is the detection counter (a float), the
//                   Detection records follow immediately after it
//   numElements   : number of threads that have work (grid cells * batch size)
//   maxoutobject  : maximum number of Detection records stored per image
//   nk            : number of keypoints per detection (pose only)
//   confkeypoints : keypoints with a confidence below this value are marked invalid
//
// Fields of Detection that do not apply to the enabled task (mask, keypoints, angle) are not written.
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
                             int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem,
                             bool is_segmentation, bool is_pose, bool is_obb) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= numElements)
        return;

    const int N_kpts = nk;
    int total_grid = grid_h * grid_w;
    // Number of channels per image for the enabled task combination.
    int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0);
    // Split the flat thread index into image index and cell index within that image.
    int batchIdx = idx / total_grid;
    int elemIdx = idx % total_grid;
    const float* curInput = input + batchIdx * total_grid * info_len;
    int outputIdx = batchIdx * outputElem;

    // Best class: the class scores are raw values, turned into probabilities with Logist().
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 4; i < 4 + classes; i++) {
        float p = Logist(curInput[elemIdx + i * total_grid]);
        if (p > max_cls_prob) {
            max_cls_prob = p;
            class_id = i - 4;
        }
    }

    // Hard-coded pre-filter threshold.
    if (max_cls_prob < 0.1)
        return;

    // Reserve a slot. The counter is incremented before the capacity check, so its final value can
    // exceed maxoutobject; only the first maxoutobject records are actually stored.
    int count = (int)atomicAdd(output + outputIdx, 1);
    if (count >= maxoutobject)
        return;
    // Records start right after the counter float.
    char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
    Detection* det = (Detection*)(data);

    int row = elemIdx / grid_w;
    int col = elemIdx % grid_w;

    det->conf = max_cls_prob;
    det->class_id = class_id;
    // Channels 0..3 are subtracted from / added to the cell centre (col + 0.5, row + 0.5) and scaled
    // by the stride, giving left, top, right, bottom.
    det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
    det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
    det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
    det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;

    if (is_segmentation) {
        // Copy the 32 mask values; they are indexed after the box, class, keypoint and angle channels.
        for (int k = 0; k < 32; ++k) {
            det->mask[k] =
                    curInput[elemIdx + (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid];
        }
    }

    if (is_pose) {
        for (int kpt = 0; kpt < N_kpts; kpt++) {
            // Each keypoint occupies three consecutive channels: x, y, confidence.
            int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid;
            int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid;
            int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 2) * total_grid;

            float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);

            // Keypoint position: (raw * 2 + cell index) * stride.
            float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
            float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;

            bool is_within_bbox =
                    kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];

            // Keypoints that are low-confidence or fall outside the box are marked with -1.
            if (kpt_confidence < confkeypoints || !is_within_bbox) {
                det->keypoints[kpt * 3] = -1;
                det->keypoints[kpt * 3 + 1] = -1;
                det->keypoints[kpt * 3 + 2] = -1;
            } else {
                det->keypoints[kpt * 3] = kpt_x;
                det->keypoints[kpt * 3 + 1] = kpt_y;
                det->keypoints[kpt * 3 + 2] = kpt_confidence;
            }
        }
    }

    if (is_obb) {
        // NOTE: pi is a double, so the angle and the trigonometry below are evaluated in double precision.
        double pi = CV_PI;
        auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) +
                                             0) * total_grid];
        // Maps the raw angle value to (sigmoid - 0.25) * pi, i.e. the range (-pi/4, 3*pi/4).
        auto angle = (sigmoid(angle_inx) - 0.25f) * pi;

        auto cos1 = cos(angle);
        auto sin1 = sin(angle);
        // Offset of the box centre from the cell centre, then rotated by the angle.
        auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2;
        auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2;

        auto x = xf * cos1 - yf * sin1;
        auto y = xf * sin1 + yf * cos1;

        float cx = (col + 0.5f + x) * stride;
        float cy = (row + 0.5f + y) * stride;

        float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride;
        float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride;
        // For OBB the box written above is replaced by centre, size and angle.
        det->bbox[0] = cx;
        det->bbox[1] = cy;
        det->bbox[2] = w1;
        det->bbox[3] = h1;
        det->angle = angle;
    }
}

// Launches CalDetection once per stride level on `stream`.
//
//   inputs  : one device pointer per stride level (inputs[i] belongs to mStrides[i])
//   output  : device buffer holding, per image, 1 counter float followed by mMaxOutObject Detections
//   mYoloV8netHeight / mYoloV8NetWidth : network input size (these parameters shadow the members of
//                                        the same name)
//   batchSize : number of images in the batch
//
// Differences from the previous implementation, none of which change the decoded results:
//   * the block size is a local value, so mThreadCount is no longer permanently lowered (it is part
//     of the serialized state) when a small grid is processed;
//   * a level with no elements is skipped instead of producing a zero block size and a division by zero;
//   * the grid sizes are computed in place instead of through a temporary heap array;
//   * the extra unchecked reset of output[0] is gone -- the loop below already resets it for image 0.
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                                 int mYoloV8NetWidth, int batchSize) {
    const int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    // Reset the detection counter (first float) of every image's output slice.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (int i = 0; i < mStridesLength; ++i) {
        const int stride = mStrides[i];
        const int grid_h = mYoloV8netHeight / stride;
        const int grid_w = mYoloV8NetWidth / stride;
        const int numElem = grid_h * grid_w * batchSize;  // one thread per (image, grid cell)
        if (numElem <= 0) {
            continue;
        }

        // Never launch more threads per block than there are elements.
        const int threads = numElem < mThreadCount ? numElem : mThreadCount;
        const int blocks = (numElem + threads - 1) / threads;

        CalDetection<<<blocks, threads, 0, stream>>>(inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride,
                                                     mClassCount, mNumberofpoints, mConfthreshkeypoints, outputElem,
                                                     is_segmentation_, is_pose_, is_obb_);
    }
}

// Static storage for the field collection advertised by getFieldNames().
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// The creator advertises no fields: the attribute list is emptied and mFC is pointed at it
// (nbFields becomes 0).
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

// Name under which the plugin is registered; matches YoloLayerPlugin::getPluginType().
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Version string; matches YoloLayerPlugin::getPluginVersion().
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Returns the (empty) static field collection set up by the constructor.
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Creates a plugin from a field collection that holds exactly one field named "combinedInfo": an
// int array whose first 9 entries are
//   [0] class count          [1] keypoint count       [2] keypoint confidence threshold
//   [3] network input width  [4] network input height [5] max detections per image
//   [6] is_segmentation      [7] is_pose              [8] is_obb
// followed by the stride values (field length minus 9 of them). `name` is not used.
//
// Returns nullptr when the collection does not have that shape. Previously this was only guarded by
// assert(), so a release build would read past the field data or dereference a null pointer.
//
// NOTE(review): combinedInfo is an int array, so the keypoint confidence threshold arrives here as an
// integer value; a fractional threshold cannot survive that encoding -- confirm how the caller packs it.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    const int netinfo_count = 9;

    if (fc == nullptr || fc->fields == nullptr || fc->nbFields != 1) {
        return nullptr;
    }
    const PluginField& field = fc->fields[0];
    if (field.name == nullptr || strcmp(field.name, "combinedInfo") != 0 || field.data == nullptr ||
        field.length < netinfo_count) {
        return nullptr;
    }

    const int* combinedInfo = static_cast<const int*>(field.data);
    int class_count = combinedInfo[0];
    int numberofpoints = combinedInfo[1];
    float confthreshkeypoints = combinedInfo[2];
    int input_w = combinedInfo[3];
    int input_h = combinedInfo[4];
    int max_output_object_count = combinedInfo[5];
    bool is_segmentation = combinedInfo[6];
    bool is_pose = combinedInfo[7];
    bool is_obb = combinedInfo[8];

    // Everything after the fixed header is the stride table.
    const int* px_arry = combinedInfo + netinfo_count;
    int px_arry_length = field.length - netinfo_count;

    YoloLayerPlugin* obj =
            new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h,
                                max_output_object_count, is_segmentation, is_pose, is_obb, px_arry, px_arry_length);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

// Recreates a plugin from serialized engine data and tags it with this creator's namespace.
// `name` is not used. The returned object is released through YoloLayerPlugin::destroy(), which is
// called when the network is destroyed.
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) TRT_NOEXCEPT {
    auto* plugin = new YoloLayerPlugin(serialData, serialLength);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}

}  // namespace nvinfer1


================================================
FILE: yolov12-tubro/plugin/yololayer.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"

namespace nvinfer1 {
// TensorRT plugin that decodes the raw YOLO head outputs (one input tensor per stride level) into a
// single float tensor per image: a detection counter followed by up to mMaxOutObject Detection
// records. Detection, segmentation, pose and OBB heads are selected through the three flags.
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    // Builds the plugin from explicit parameters; `strides` (stridesLength entries) is copied.
    YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight,
                    int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength);

    // Rebuilds the plugin from the bytes produced by serialize().
    YoloLayerPlugin(const void* data, size_t length);

    ~YoloLayerPlugin();

    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

    int initialize() TRT_NOEXCEPT override;

    virtual void terminate() TRT_NOEXCEPT override {}

    // No scratch memory is needed.
    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

    virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;

    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

    // Only linear-layout FP32 tensors are accepted, for inputs and outputs alike.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }

    const char* getPluginType() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    void destroy() TRT_NOEXCEPT override;

    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

    // Stores the pointer without copying the string; the caller keeps ownership.
    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

    const char* getPluginNamespace() const TRT_NOEXCEPT override;

    // Marked `override` like every other interface method so a signature drift is caught at compile time.
    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;

    void detachFromContext() TRT_NOEXCEPT override;

   private:
    // Launches the decode kernel once per stride level.
    void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                    int mYoloV8NetWidth, int batchSize);

    int mThreadCount = 256;              // threads per block used for the kernel launches
    const char* mPluginNamespace = "";   // non-owning; never left indeterminate before setPluginNamespace()
    int mClassCount;                     // number of classes
    int mNumberofpoints;                 // keypoints per detection (pose)
    float mConfthreshkeypoints;          // keypoint confidence threshold (pose)
    int mYoloV8NetWidth;                 // network input width
    int mYoloV8netHeight;                // network input height
    int mMaxOutObject;                   // maximum detections stored per image
    bool is_segmentation_;
    bool is_pose_;
    bool is_obb_;
    int* mStrides = nullptr;             // owned array of mStridesLength stride values
    int mStridesLength = 0;
};

// Factory registered with TensorRT (see REGISTER_TENSORRT_PLUGIN below) that builds YoloLayerPlugin
// objects, either from a PluginFieldCollection or from serialized engine data.
class API YoloPluginCreator : public IPluginCreator {
   public:
    YoloPluginCreator();

    ~YoloPluginCreator() override = default;

    const char* getPluginName() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

    // Builds a plugin from the field collection `fc`.
    nvinfer1::IPluginV2IOExt* createPlugin(const char* name,
                                           const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;

    // Builds a plugin from `serialLength` bytes of serialized plugin state.
    nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                                size_t serialLength) TRT_NOEXCEPT override;

    // The namespace is copied into mNamespace.
    void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; }

    // The returned pointer refers to mNamespace's storage, so it is invalidated by a later
    // setPluginNamespace() call or by destruction of the creator.
    const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

   private:
    std::string mNamespace;
    // Shared by all creator instances; defined in yololayer.cu.
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};

REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
}  // namespace nvinfer1


================================================
FILE: yolov12-tubro/readme.md
================================================
## Introduction

Yolov12 model supports TensorRT-8.

- Detection training code: [link](https://github.com/sunsmarterjie/yolov12/releases/tag/turbo)
- Segment training code: [link](https://github.com/sunsmarterjie/yolov12/releases/tag/seg)
- Classify training code: [link](https://github.com/sunsmarterjie/yolov12/releases/tag/cls)

## Environment

* cuda 11.6
* cudnn 8.9.1.23
* tensorrt 8.6.1.6
* opencv 4.8.0
* ultralytics 8.3.63

## Support

* [x] YOLO12-det support FP32/FP16 and C++ API
* [x] YOLO12-seg support FP32/FP16 and C++ API
* [x] YOLO12-cls support FP32/FP16 and C++ API


## Config

* Choose the YOLO12 sub-model n/s/m/l/x from command line arguments.
* Other configs please check [src/config.h](src/config.h)

## Build and Run (Detection)

1. generate .wts from pytorch with .pt, or download .wts from model zoo

```shell
# You are supposed to train your own models instead of using the pre-trained models
# to download other models, replace 'yolov12n.pt' with 'yolov12s.pt', 'yolov12m.pt', 'yolov12l.pt' or 'yolov12x.pt'
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolov12/gen_wts.py .
python gen_wts.py -w yolov12n.pt -o yolov12n.wts -t detect
# A file 'yolov12n.wts' will be generated.
```

2. build tensorrtx/yolov12 and run
```shell
cd [PATH-TO-TENSORRTX]/yolov12
mkdir build
cd build
cmake ..
make
```


## Build and Run (Segment)

1. generate .wts from pytorch with .pt, or download .wts from model zoo

```shell
# You are supposed to train your own models instead of using the pre-trained models
# to download other models, replace 'yolov12n-seg.pt' with 'yolov12s-seg.pt', 'yolov12m-seg.pt', 'yolov12l-seg.pt' or 'yolov12x-seg.pt'
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolov12/gen_wts.py .
python gen_wts.py -w yolov12n-seg.pt -o yolov12n-seg.wts -t seg
# A file 'yolov12n-seg.wts' will be generated.
```

2. build tensorrtx/yolov12 and run
```shell
cd [PATH-TO-TENSORRTX]/yolov12
mkdir build
cd build
cmake ..
make
```

## Build and Run (Classify)

1. generate .wts from pytorch with .pt, or download .wts from model zoo

```shell
# Download ultralytics

# You are supposed to train your own models instead of using the pre-trained models
# to download other models, replace 'yolov12n-cls.pt' with 'yolov12s-cls.pt', 'yolov12m-cls.pt', 'yolov12l-cls.pt' or 'yolov12x-cls.pt'
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolov12/gen_wts.py .
python gen_wts.py -w yolov12n-cls.pt -t cls -o yolov12n-cls.wts
# A file 'yolov12n-cls.wts' will be generated.
```

2. build tensorrtx/yolov12 and run
```shell
cd [PATH-TO-TENSORRTX]/yolov12
mkdir build
cd build
cmake ..
make
```

### Detection
```shell
cp [PATH-TO-ultralytics]/yolov12n.wts .
# Build and serialize TensorRT engine
./yolov12_det -s yolov12n.wts yolov12n.engine [n/s/m/l/x]
# Run inference
./yolov12_det -d yolov12n.engine ../images [c/g]
# results saved in build directory
```

### Segment
```shell
cp [PATH-TO-ultralytics]/yolov12n-seg.wts .
# Build and serialize TensorRT engine
./yolov12-seg -s yolov12n-seg.wts yolov12n-seg.engine [n/s/m/l/x]
# Run inference
./yolov12-seg -d yolov12n-seg.engine ../images
# results saved in build directory
```


### Classify
```shell
cp [PATH-TO-ultralytics]/yolov12n-cls.wts .
# Build and serialize TensorRT engine
./yolov12-cls -s yolov12n-cls.wts yolov12n-cls.engine [n/s/m/l/x]
# Run inference
./yolov12-cls -d yolov12n-cls.engine ../images
# results saved in build directory
```

## More Information
See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: yolov12-tubro/src/block.cpp
================================================
#include "block.h"
#include <assert.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include "config.h"
#include "model.h"
#include "yololayer.h"

// Parses a .wts text file into a name -> Weights map.
//
// File layout as read here: a decimal entry count, then for every entry a name, a decimal element
// count and that many hexadecimal 32-bit words. The words are stored untouched and tagged as kFLOAT.
// Each entry's buffer is allocated with malloc and is not freed by this function; whoever owns the
// returned map is responsible for releasing the `values` pointers.
//
// A missing file or a non-positive entry count is only caught by assert (debug builds).
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> WeightMap;

    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        std::string name;
        input >> name >> std::dec >> size;

        // Size the buffer by the element type. The previous sizeof(val) measured the pointer itself,
        // which over-allocates (twice the needed memory on 64-bit targets).
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; x++) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        WeightMap[name] = wt;
    }
    return WeightMap;
}

// Adds an inference-time BatchNorm as a per-channel scale layer computing (x * scale + shift) ^ power
// with
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The parameters are looked up in weightMap under lname + ".weight" / ".bias" / ".running_mean" /
// ".running_var".
//
// NOTE: weightMap is received by value, so the ".scale" / ".shift" / ".power" entries stored below
// end up in this function's private copy only; the three malloc'ed buffers are not released here.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps) {
    float* gamma = (float*)weightMap[lname + ".weight"].values;
    float* beta = (float*)weightMap[lname + ".bias"].values;
    float* mean = (float*)weightMap[lname + ".running_mean"].values;
    float* var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // Fill all three per-channel tables in a single pass.
    for (int c = 0; c < len; ++c) {
        scval[c] = gamma[c] / sqrt(var[c] + eps);
        shval[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        pval[c] = 1.0;
    }

    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    nvinfer1::IScaleLayer* bn_layer = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    return bn_layer;
}

// Conv2d (no bias) -> BatchNorm (eps = 1e-3) -> SiLU, where SiLU is built as x * sigmoid(x).
//
//   ch    : number of output channels
//   k     : kernel size {kH, kW}
//   s     : stride, applied to both dimensions
//   lname : weight-name prefix; uses lname + ".conv.weight" and the lname + ".bn" parameters
//   g     : number of convolution groups
//   p, d  : not used by this implementation -- padding is always k / 2 per dimension
//
// Returns the element-wise product layer that forms the SiLU output.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname, int p, int g, int d) {

    nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]},
                                                                  weightMap[lname + ".conv.weight"], bias_empty);
    // Check the layer before touching it (the check used to come after the first dereference).
    assert(conv);

    conv->setNbGroups(g);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    // auto pad
    int p0 = k[0] / 2;
    int p1 = k[1] / 2;
    conv->setPaddingNd(nvinfer1::DimsHW{p0, p1});
    nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);

    nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
    assert(sigmoid);
    nvinfer1::IElementWiseLayer* ew =
            network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(ew);
    return ew;
}

// Bottleneck block: two convBnSiLU stages (lname + ".cv1", lname + ".cv2"), optionally followed
// by an element-wise sum with the block input.
//
//   c1, c2   - input / output channel counts; the sum is only added when they are equal
//   shortcut - requests the residual sum
//   k1, k2   - kernel sizes of the first and second stage
//   e        - ratio applied to c2 to obtain the channel count of the first stage
//   g        - group count of the second stage
// Returns the sum layer when the residual is applied, otherwise the second convBnSiLU layer.
static nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int c1, int c2, bool shortcut, std::vector<int> k1, std::vector<int> k2, float e,
                                    int g, std::string lname) {
    const int hidden = static_cast<int>(static_cast<float>(c2) * e);

    nvinfer1::IElementWiseLayer* first = convBnSiLU(network, weightMap, input, hidden, k1, 1, lname + ".cv1");
    nvinfer1::ITensor& first_out = *first->getOutput(0);
    nvinfer1::IElementWiseLayer* second = convBnSiLU(network, weightMap, first_out, c2, k2, 1, lname + ".cv2", 0, g);

    const bool add_residual = shortcut && (c1 == c2);
    if (!add_residual) {
        return second;
    }
    return network->addElementWise(input, *second->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// DFL head: reshape -> transpose -> softmax -> 1x1 convolution -> reshape.
//
// The input is reshaped to (kBatchSize, 4, 16, grid) and permuted with {0, 2, 1, 3}; softmax
// runs over axis 1 of the permuted tensor; a single-output 1x1 convolution using
// weightMap[lname] follows, and the result is reshaped to (kBatchSize, 4, grid).
//
//   grid  - last dimension of both reshapes
//   s, p  - stride and padding of the convolution (applied to both spatial dimensions)
//   lname - full key of the convolution weights in weightMap
//   ch, k - not used by this implementation
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
    nvinfer1::IShuffleLayer* to_bins = network->addShuffle(input);
    to_bins->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid});
    to_bins->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3});

    nvinfer1::ISoftMaxLayer* bin_probs = network->addSoftMax(*to_bins->getOutput(0));
    bin_probs->setAxes(1 << 1);

    const nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* project =
            network->addConvolutionNd(*bin_probs->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], no_bias);
    project->setStrideNd(nvinfer1::DimsHW{s, s});
    project->setPaddingNd(nvinfer1::DimsHW{p, p});

    nvinfer1::IShuffleLayer* flattened = network->addShuffle(*project->getOutput(0));
    flattened->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid});
    return flattened;
}

// Adds the "YoloLayer_TRT" plugin (version "1") to the network, fed by the outputs of `dets`.
//
// The plugin receives a single INT32 field named "combinedInfo" laid out as:
//   [0] class count   [1] kNumberOfPoints   [2] kConfThreshKeypoints
//   [3] input width   [4] input height      [5] kMaxNumOutputBbox
//   [6] is_segmentation   [7] is_pose       [8] is_obb
//   [9 ...] the px_arry_num values of px_arry
//
// Parameters:
//   network         - network definition the plugin layer is appended to
//   dets            - concatenation layers whose first outputs become the plugin inputs
//   px_arry         - array of px_arry_num ints appended after the nine header entries
//   px_arry_num     - number of entries in px_arry
//   is_segmentation - forwarded to the plugin
//   is_pose         - selects kPoseNumClass and is forwarded to the plugin
//   is_obb          - selects kObbNumClass / kObbInputW / kObbInputH and is forwarded to the plugin
// Returns the plugin layer. Asserts if the plugin creator is not registered or creation fails.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    // A missing registration would otherwise surface as a null dereference below.
    assert(creator && "YoloLayer_TRT plugin creator is not registered");

    const int netinfo_count = 9;                          // Number of fixed header entries listed above.
    const int total_count = netinfo_count + px_arry_num;  // Header entries plus px_arry.

    std::vector<int> combinedInfo(total_count);
    int class_num = kNumClass;
    if (is_pose)
        class_num = kPoseNumClass;
    else if (is_obb)
        class_num = kObbNumClass;
    const int input_w = is_obb ? kObbInputW : kInputW;
    const int input_h = is_obb ? kObbInputH : kInputH;

    // Fill in the nine header entries.
    combinedInfo[0] = class_num;
    combinedInfo[1] = kNumberOfPoints;
    // NOTE(review): the value is stored into an int slot; if kConfThreshKeypoints is a
    // floating-point constant it is truncated here - confirm against the plugin.
    combinedInfo[2] = kConfThreshKeypoints;
    combinedInfo[3] = input_w;
    combinedInfo[4] = input_h;
    combinedInfo[5] = kMaxNumOutputBbox;
    combinedInfo[6] = is_segmentation;
    combinedInfo[7] = is_pose;
    combinedInfo[8] = is_obb;

    // Append px_arry after the header entries.
    std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);

    // Describe the combined array as one plugin field.
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";  // This can be any name that the plugin will recognize
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = static_cast<int>(combinedInfo.size());

    nvinfer1::PluginFieldCollection pluginFieldCollection;
    pluginFieldCollection.nbFields = 1;  // We have just one field, but it's a combined array
    pluginFieldCollection.fields = &pluginField;

    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    assert(pluginObject && "failed to create the YoloLayer_TRT plugin");

    // Collect the first output of every detection branch as plugin input.
    std::vector<nvinfer1::ITensor*> inputTensors;
    inputTensors.reserve(dets.size());
    for (auto det : dets) {
        inputTensors.push_back(det->getOutput(0));
    }

    nvinfer1::IPluginV2Layer* yoloLayer =
            network->addPluginV2(inputTensors.data(), static_cast<int>(inputTensors.size()), *pluginObject);
    assert(yoloLayer);

    return yoloLayer;
}

// Square-kernel convolution followed by BatchNorm2d and, optionally, SiLU.
//
//   c_out   - number of convolution output maps
//   lname   - weight-name prefix (lname + ".conv.weight", lname + ".bn")
//   k, s    - kernel size and stride, applied to both spatial dimensions
//   padding - not read here; the padding is always k / 2
//   g       - number of convolution groups
//   act     - when true the SiLU (x * sigmoid(x)) is appended
// Returns the SiLU product layer when act is set, otherwise the BatchNorm scale layer.
nvinfer1::ILayer* Conv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                       nvinfer1::ITensor& input, int c_out, std::string lname, int k, int s, int padding, int g,
                       bool act) {
    const nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    const int half_kernel = k / 2;  // auto pad

    nvinfer1::IConvolutionLayer* conv_layer = network->addConvolutionNd(
            input, c_out, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], no_bias);
    assert(conv_layer);
    conv_layer->setStrideNd(nvinfer1::DimsHW{s, s});
    conv_layer->setPaddingNd(nvinfer1::DimsHW{half_kernel, half_kernel});
    conv_layer->setNbGroups(g);

    nvinfer1::IScaleLayer* norm = addBatchNorm2d(network, weightMap, *conv_layer->getOutput(0), lname + ".bn", 1e-3);
    if (!act) {
        return norm;
    }

    nvinfer1::ITensor& normalized = *norm->getOutput(0);
    nvinfer1::IActivationLayer* gate = network->addActivation(normalized, nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* silu =
            network->addElementWise(normalized, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Convolution whose group count equals its output-map count, followed by BatchNorm2d and SiLU.
//
//   ch    - number of output maps, also used as the number of groups
//   k     - kernel size; k[0] and k[1] are the two DimsHW components, padding is k / 2
//   s     - stride, applied to both spatial dimensions
//   lname - weight-name prefix (lname + ".conv.weight", lname + ".bn")
// Returns the element-wise product layer implementing x * sigmoid(x).
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname) {
    const nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    const nvinfer1::DimsHW kernel{k[0], k[1]};

    nvinfer1::IConvolutionLayer* depthwise =
            network->addConvolutionNd(input, ch, kernel, weightMap[lname + ".conv.weight"], no_bias);
    assert(depthwise);
    depthwise->setStrideNd(nvinfer1::DimsHW{s, s});
    depthwise->setNbGroups(ch);
    // auto pad
    depthwise->setPaddingNd(nvinfer1::DimsHW{k[0] / 2, k[1] / 2});

    nvinfer1::IScaleLayer* norm = addBatchNorm2d(network, weightMap, *depthwise->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor& normalized = *norm->getOutput(0);

    nvinfer1::IActivationLayer* gate = network->addActivation(normalized, nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* silu =
            network->addElementWise(normalized, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// C3k block: two parallel 1x1 convBnSiLU branches (".cv1", ".cv2"); the first branch runs
// through n bottlenecks (".m.<i>") with k x k kernels, the two branches are concatenated and
// fused by a final 1x1 convBnSiLU (".cv3") with c2 output maps.
//
//   c2       - output channel count
//   n        - number of chained bottlenecks
//   shortcut - forwarded to every bottleneck
//   g        - group count forwarded to every bottleneck
//   e        - ratio applied to c2 to obtain the hidden channel count
//   k        - bottleneck kernel size (used for both kernels, both dimensions)
nvinfer1::IElementWiseLayer* C3k(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c2,
                                 std::string lname, int n, bool shortcut, int g, float e, int k) {
    const int hidden = static_cast<int>(c2 * e);
    const std::vector<int> pointwise{1, 1};
    const std::vector<int> kernel{k, k};

    nvinfer1::IElementWiseLayer* main_branch = convBnSiLU(network, weightMap, input, hidden, pointwise, 1, lname + ".cv1");
    nvinfer1::IElementWiseLayer* side_branch = convBnSiLU(network, weightMap, input, hidden, pointwise, 1, lname + ".cv2");

    nvinfer1::ITensor* chained = main_branch->getOutput(0);
    int idx = 0;
    while (idx < n) {
        chained = bottleneck(network, weightMap, *chained, hidden, hidden, shortcut, kernel, kernel, 1.0, g,
                             lname + ".m." + std::to_string(idx))
                          ->getOutput(0);
        ++idx;
    }

    nvinfer1::ITensor* branches[] = {chained, side_branch->getOutput(0)};
    nvinfer1::IConcatenationLayer* merged = network->addConcatenation(branches, 2);
    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, pointwise, 1, lname + ".cv3");
}

// C3K2 block.
//
// A 1x1 Conv (".cv1") produces 2 * c maps which are sliced into two halves along dimension 1.
// Both halves are concatenated, then the second half is passed through n sub-blocks
// (".m.<i>"); after each sub-block its output is concatenated onto the running result. A final
// 1x1 convBnSiLU (".cv2") with c2 output maps closes the block.
//
//   c2       - output channel count
//   n        - number of sub-blocks
//   c3k      - true: each sub-block is a C3k; false: each sub-block is a 3x3/3x3 bottleneck
//   e        - ratio applied to c2 to obtain c
//   g        - group count forwarded to the sub-blocks
//   shortcut - forwarded to the sub-blocks
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2,
                                  int n, std::string lname, bool c3k, float e, int g, bool shortcut) {
    const int c = int(c2 * float(e));
    nvinfer1::ILayer* stem = Conv(network, weightMap, input, 2 * c, lname + ".cv1", 1, 1);

    nvinfer1::ITensor& stem_out = *stem->getOutput(0);
    const nvinfer1::Dims stem_dims = stem_out.getDimensions();
    const auto half_channels = stem_dims.d[1] / 2;
    const nvinfer1::Dims4 half_extent{stem_dims.d[0], half_channels, stem_dims.d[2], stem_dims.d[3]};
    const nvinfer1::Dims4 unit_stride{1, 1, 1, 1};

    nvinfer1::ISliceLayer* lower_half = network->addSlice(stem_out, nvinfer1::Dims4{0, 0, 0, 0}, half_extent, unit_stride);
    nvinfer1::ISliceLayer* upper_half =
            network->addSlice(stem_out, nvinfer1::Dims4{0, half_channels, 0, 0}, half_extent, unit_stride);

    nvinfer1::ITensor* halves[] = {lower_half->getOutput(0), upper_half->getOutput(0)};
    nvinfer1::IConcatenationLayer* merged = network->addConcatenation(halves, 2);

    nvinfer1::ITensor* latest = upper_half->getOutput(0);
    for (int idx = 0; idx != n; ++idx) {
        const std::string sub_name = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* sub_block =
                c3k ? static_cast<nvinfer1::ILayer*>(C3k(network, weightMap, *latest, c, sub_name, 2, shortcut, g))
                    : bottleneck(network, weightMap, *latest, c, c, shortcut, {3, 3}, {3, 3}, 0.5, g, sub_name);
        latest = sub_block->getOutput(0);

        // Grow the running concatenation with the newest sub-block output.
        nvinfer1::ITensor* grown[] = {merged->getOutput(0), sub_block->getOutput(0)};
        merged = network->addConcatenation(grown, 2);
    }

    return convBnSiLU(network, weightMap, *merged->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

// Area-attention block built from TensorRT primitives.
//
// Steps performed below:
//   1. ".qk" and ".v" 1x1 Conv projections (no activation) of the input, each flattened to
//      (B, N, channels) with N = H * W.
//   2. ".pe" 5x5 Conv with `dim` groups applied to the un-flattened ".v" output.
//   3. When area > 1, the batch is multiplied by `area` and N divided by `area` via reshapes.
//   4. q and k are the two halves of the qk tensor along its last dimension.
//   5. Scaled q * k product, a softmax over the last axis assembled from
//      max / sub / exp / sum / div layers, and multiplication with v.
//   6. The result is reshaped back to (B, C, H, W), summed with the ".pe" output and passed
//      through the ".proj" 1x1 Conv, which is returned.
//
// Parameters:
//   input     - assumed to be a 4-D tensor; its dimensions are read as B, C, H, W
//   dim       - channel count used for ".pe" and ".proj"
//   num_heads - number of attention heads; head_dim = dim / num_heads
//   lname     - weight-name prefix
//   area      - number of areas the N positions are split into (no split when <= 1)
// NOTE(review): the reshapes use C (input channels) while the projections use dim-derived
// sizes, so this presumably requires C == head_dim * num_heads - confirm with the callers.
nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int dim, int num_heads, std::string lname, int area) {

    nvinfer1::Dims d_input = input.getDimensions();
    int B = d_input.d[0];
    int C = d_input.d[1];
    int H = d_input.d[2];
    int W = d_input.d[3];
    int N = W * H;
    int head_dim = dim / num_heads;
    int all_head_dim = head_dim * num_heads;

    // qk projection, reshaped to (B, -1, N) and transposed to (B, N, 2 * all_head_dim)
    nvinfer1::ILayer* qk = Conv(network, weightMap, input, all_head_dim * 2, lname + ".qk", 1, 1, 0, 1, false);
    nvinfer1::IShuffleLayer* qk_flatten_t = network->addShuffle(*qk->getOutput(0));
    qk_flatten_t->setReshapeDimensions(nvinfer1::Dims3{B, -1, N});
    qk_flatten_t->setSecondTranspose(nvinfer1::Permutation{0, 2, 1});

    nvinfer1::ILayer* v = Conv(network, weightMap, input, all_head_dim, lname + ".v", 1, 1, 0, 1, false);
    nvinfer1::IShuffleLayer* v_flatten_t = network->addShuffle(*v->getOutput(0));
    v_flatten_t->setReshapeDimensions(nvinfer1::Dims3{B, -1, N});
    v_flatten_t->setSecondTranspose(nvinfer1::Permutation{0, 2, 1});  // (B, N, all_head_dim), e.g. (1, 6400, 64)

    // Positional branch: 5x5 Conv with `dim` groups on the un-flattened v output.
    nvinfer1::ILayer* pe = Conv(network, weightMap, *v->getOutput(0), dim, lname + ".pe", 5, 1, 2, dim, false);

    nvinfer1::ITensor* q_k = qk_flatten_t->getOutput(0);
    nvinfer1::ITensor* v_ = v_flatten_t->getOutput(0);
    if (area > 1) {
        // Fold the areas into the batch dimension; undone after the attention product.
        B = B * area;
        N = N / area;

        nvinfer1::IShuffleLayer* qk_reshape = network->addShuffle(*qk_flatten_t->getOutput(0));
        qk_reshape->setReshapeDimensions(nvinfer1::Dims3{B, N, C * 2});
        nvinfer1::IShuffleLayer* v_reshape = network->addShuffle(*v_flatten_t->getOutput(0));
        v_reshape->setReshapeDimensions(nvinfer1::Dims3{B, N, C});

        q_k = qk_reshape->getOutput(0);
        v_ = v_reshape->getOutput(0);
    }
    // Split the last dimension in half: first half is q, second half is k.
    nvinfer1::Dims q_k_dim = q_k->getDimensions();
    nvinfer1::ISliceLayer* q =
            network->addSlice(*q_k, nvinfer1::Dims3{0, 0, 0},
                              nvinfer1::Dims3{q_k_dim.d[0], q_k_dim.d[1], q_k_dim.d[2] / 2}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* k =
            network->addSlice(*q_k, nvinfer1::Dims3{0, 0, q_k_dim.d[2] / 2},
                              nvinfer1::Dims3{q_k_dim.d[0], q_k_dim.d[1], q_k_dim.d[2] / 2}, nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* q_reshape = network->addShuffle(*q->getOutput(0));
    q_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim});
    nvinfer1::IShuffleLayer* k_reshape = network->addShuffle(*k->getOutput(0));
    k_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim});
    nvinfer1::IShuffleLayer* v_reshape = network->addShuffle(*v_);
    v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim});

    // (B, N, num_head, head_dim)--->(B, num_head, head_dim, N)
    nvinfer1::IShuffleLayer* q_t_view = network->addShuffle(*q_reshape->getOutput(0));
    q_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1});

    nvinfer1::IShuffleLayer* k_t_view = network->addShuffle(*k_reshape->getOutput(0));
    k_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1});
    nvinfer1::IShuffleLayer* v_t_view = network->addShuffle(*v_reshape->getOutput(0));
    v_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1});

    nvinfer1::IShuffleLayer* q_T = network->addShuffle(*q_t_view->getOutput(0));
    q_T->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});  // (B, num_head, N, head_dim)
    // q^T * k: (B, num_head, N, head_dim) x (B, num_head, head_dim, N) -> (B, num_head, N, N)
    nvinfer1::IMatrixMultiplyLayer* q_mul_k =
            network->addMatrixMultiply(*q_T->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k_t_view->getOutput(0),
                                       nvinfer1::MatrixOperation::kNONE);

    // Uniform scale by 1 / sqrt(head_dim) with zero shift and unit power.
    // NOTE(review): the three single-float buffers below are malloc'd here, are not freed in
    // this function and are not stored in weightMap - confirm who releases them.
    float scale = 1.0 / sqrt(head_dim);
    float* scale_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    scale_val[0] = scale;
    nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1};  // scale
    float* shift_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    shift_val[0] = 0;
    nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1};  // shift
    float* power_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    power_val[0] = 1;
    nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1};  // power
    nvinfer1::IScaleLayer* q_mul_k_scale =
            network->addScale(*q_mul_k->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w);

    // Softmax over axis 3 written out by hand: exp(x - max(x)) / sum(exp(x - max(x))).
    nvinfer1::IReduceLayer* attn_max =
            network->addReduce(*q_mul_k_scale->getOutput(0), nvinfer1::ReduceOperation::kMAX, 1 << 3, true);

    nvinfer1::IElementWiseLayer* attn_sub = network->addElementWise(
            *q_mul_k_scale->getOutput(0), *attn_max->getOutput(0), nvinfer1::ElementWiseOperation::kSUB);
    nvinfer1::IUnaryLayer* attn_exp = network->addUnary(*attn_sub->getOutput(0), nvinfer1::UnaryOperation::kEXP);
    nvinfer1::IReduceLayer* attn_sum =
            network->addReduce(*attn_exp->getOutput(0), nvinfer1::ReduceOperation::kSUM, 1 << 3, true);

    nvinfer1::IElementWiseLayer* attn_div = network->addElementWise(*attn_exp->getOutput(0), *attn_sum->getOutput(0),
                                                                    nvinfer1::ElementWiseOperation::kDIV);

    // Swap the last two axes of the attention weights before multiplying with v.
    nvinfer1::IShuffleLayer* attn_t = network->addShuffle(*attn_div->getOutput(0));
    attn_t->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});

    // v * attn^T: (B, num_head, head_dim, N) x (B, num_head, N, N) -> (B, num_head, head_dim, N)
    nvinfer1::IMatrixMultiplyLayer* attn_v =
            network->addMatrixMultiply(*v_t_view->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attn_t->getOutput(0),
                                       nvinfer1::MatrixOperation::kNONE);

    // (B, num_head, head_dim, N) ---> (B, N, num_head, head_dim)
    nvinfer1::IShuffleLayer* attn_v_t = network->addShuffle(*attn_v->getOutput(0));
    attn_v_t->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2});
    nvinfer1::ITensor* attn_temp = attn_v_t->getOutput(0);
    if (area > 1) {
        // Undo the area folding applied before the attention product.
        B = B / area;
        N = N * area;

        nvinfer1::IShuffleLayer* attn_v_t_r = network->addShuffle(*attn_v_t->getOutput(0));
        attn_v_t_r->setReshapeDimensions(nvinfer1::Dims3{B, N, C});
        attn_temp = attn_v_t_r->getOutput(0);
    }
    // Reshape to (B, H, W, C) and permute to (B, C, H, W) so it can be summed with the pe branch.
    nvinfer1::IShuffleLayer* attn_x = network->addShuffle(*attn_temp);
    attn_x->setReshapeDimensions(nvinfer1::Dims4{B, H, W, C});
    attn_x->setSecondTranspose(nvinfer1::Permutation{0, 3, 1, 2});
    nvinfer1::IElementWiseLayer* x_add_pp =
            network->addElementWise(*attn_x->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    nvinfer1::ILayer* proj = Conv(network, weightMap, *x_add_pp->getOutput(0), dim, lname + ".proj", 1, 1, 0, 1, false);

    return proj;
}

// Attention block with two residual connections:
//   x = input + AAttn(input)                      (weights under lname + ".attn")
//   y = x + mlp_1(mlp_0(x))                       (weights under lname + ".mlp.0" / ".mlp.1")
// mlp_0 is a 1x1 Conv with activation producing int(dim * mlp_ratio) maps; mlp_1 is a 1x1 Conv
// without activation producing `dim` maps. Returns the layer computing y.
nvinfer1::IElementWiseLayer* ABlock(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int dim, int num_heads, std::string lname, float mlp_ratio, int area) {
    const nvinfer1::ElementWiseOperation sum_op = nvinfer1::ElementWiseOperation::kSUM;

    // x = x + self.attn(x)
    nvinfer1::ILayer* attention = AAttn(network, weightMap, input, dim, num_heads, lname + ".attn", area);
    nvinfer1::IElementWiseLayer* attn_residual = network->addElementWise(input, *attention->getOutput(0), sum_op);
    nvinfer1::ITensor& x = *attn_residual->getOutput(0);

    const int hidden_dim = static_cast<int>(dim * mlp_ratio);
    nvinfer1::ILayer* expand = Conv(network, weightMap, x, hidden_dim, lname + ".mlp.0", 1, 1, 0, 1, true);
    nvinfer1::ILayer* project =
            Conv(network, weightMap, *expand->getOutput(0), dim, lname + ".mlp.1", 1, 1, 0, 1, false);

    return network->addElementWise(x, *project->getOutput(0), sum_op);
}

// A2C2f block.
//
// A 1x1 Conv (".cv1") yields c_ = int(c2 * e) maps. n stages (".m.<i>") are chained on top of
// it: with a2 set, each stage is two consecutive ABlocks (".0", ".1"); otherwise each stage is
// a C3k. The ".cv1" output and every stage output are concatenated along axis 1 and fused by
// a 1x1 Conv (".cv2") with c2 output maps.
//
// When both a2 and residual are set, the ".cv2" output is multiplied by the per-channel
// constant weightMap[lname + ".gamma"] (shape {1, c2, 1, 1}) and added to the block input.
//
//   area, mlp_ratio - forwarded to the ABlocks
//   g, shortcut     - forwarded to the C3k stages
// Asserts that c_ is a multiple of 32; the head count is c_ / 32.
nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int c2, int n, std::string lname, bool a2, int area, bool residual,
                        float mlp_ratio, float e, int g, bool shortcut) {
    const int c_ = static_cast<int>(c2 * e);
    assert(c_ % 32 == 0 && "Dimension of ABlock must be a multiple of 32");
    const int heads = c_ / 32;

    nvinfer1::ILayer* stem = Conv(network, weightMap, input, c_, lname + ".cv1", 1, 1);
    nvinfer1::ITensor* latest = stem->getOutput(0);

    std::vector<nvinfer1::ITensor*> collected;
    collected.push_back(latest);

    for (int idx = 0; idx != n; ++idx) {
        const std::string stage_name = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* stage = nullptr;
        if (!a2) {
            // C3k
            stage = C3k(network, weightMap, *latest, c_, stage_name, 2, shortcut, g);
        } else {
            nvinfer1::ILayer* first =
                    ABlock(network, weightMap, *latest, c_, heads, stage_name + ".0", mlp_ratio, area);
            stage = ABlock(network, weightMap, *first->getOutput(0), c_, heads, stage_name + ".1", mlp_ratio, area);
        }
        latest = stage->getOutput(0);
        collected.push_back(latest);
    }

    nvinfer1::IConcatenationLayer* merged =
            network->addConcatenation(collected.data(), static_cast<int>(collected.size()));
    merged->setAxis(1);
    nvinfer1::ILayer* fused = Conv(network, weightMap, *merged->getOutput(0), c2, lname + ".cv2", 1, 1);

    const bool gated_residual = a2 && residual;
    if (!gated_residual) {
        return fused;
    }

    // result = input + gamma * cv2(...)
    nvinfer1::Weights gamma = weightMap[lname + ".gamma"];
    nvinfer1::IConstantLayer* gamma_const = network->addConstant(nvinfer1::Dims4{1, c2, 1, 1}, gamma);
    nvinfer1::IElementWiseLayer* scaled = network->addElementWise(*gamma_const->getOutput(0), *fused->getOutput(0),
                                                                  nvinfer1::ElementWiseOperation::kPROD);
    return network->addElementWise(input, *scaled->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}


================================================
FILE: yolov12-tubro/src/calibrator.cpp
================================================
#include "calibrator.h"
#include <fstream>
#include <iostream>
#include <iterator>
#include <opencv2/dnn/dnn.hpp>
#include "cuda_utils.h"
#include "utils.h"

// Stores the calibration settings, allocates the device input buffer and lists the image
// directory.
//
//   batchsize        - number of images per calibration batch
//   input_w, input_h - network input width and height
//   img_dir          - directory whose file names are collected into img_files_
//   calib_table_name - path of the calibration cache file
//   input_blob_name  - name the first binding is expected to have
//   read_cache       - whether readCalibrationCache may load an existing cache file
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
    // Three channels per pixel, one float per value, for a whole batch.
    input_count_ = 3 * input_w * input_h * batchsize;
    const size_t input_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMalloc(&device_input_, input_bytes));

    read_files_in_dir(img_dir, img_files_);
}

// Releases the device buffer allocated with cudaMalloc in the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size passed to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next batchsize_ images, preprocesses them and uploads them to the device buffer.
//
//   bindings   - bindings[0] is set to the device input buffer
//   names      - names[0] is asserted to equal the configured input blob name
//   nbBindings - not read
// Returns false when fewer than batchsize_ images remain or when an image cannot be read;
// returns true once the batch has been copied to the device.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    const auto batch_end = img_idx_ + batchsize_;
    if (batch_end > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> batch_images;
    for (int idx = img_idx_; idx < batch_end; ++idx) {
        const auto& file_name = img_files_[idx];
        std::cout << file_name << "  " << idx << std::endl;

        cv::Mat raw = cv::imread(img_dir_ + "/" + file_name);
        if (raw.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        batch_images.push_back(preprocess_img(raw, input_w_, input_h_));
    }
    img_idx_ += batchsize_;

    // Scale to [0, 1], swap the R and B channels, no cropping.
    const cv::Size target(input_w_, input_h_);
    cv::Mat blob = cv::dnn::blobFromImages(batch_images, 1.0 / 255.0, target, cv::Scalar(0, 0, 0), true, false);

    const size_t input_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_bytes, cudaMemcpyHostToDevice));

    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Loads the calibration cache file into calib_cache_.
//
// The file is only read when read_cache_ is set and the stream opened successfully; bytes are
// read without skipping whitespace. `length` receives the number of cached bytes.
// Returns a pointer to the cached bytes, or nullptr when nothing was loaded.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream table(calib_table_name_, std::ios::binary);
    table >> std::noskipws;

    const bool may_load = read_cache_ && table.good();
    if (may_load) {
        using ByteIter = std::istream_iterator<char>;
        calib_cache_.insert(calib_cache_.end(), ByteIter(table), ByteIter());
    }

    length = calib_cache_.size();
    if (length == 0) {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes `length` bytes starting at `cache` to the calibration cache file in binary mode,
// after logging the file name and size.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;

    const char* bytes = static_cast<const char*>(cache);
    std::ofstream table(calib_table_name_, std::ios::binary);
    table.write(bytes, length);
}


================================================
FILE: yolov12-tubro/src/model.cpp
================================================
#include <math.h>
#include <iostream>

#include "block.h"
//#include "calibrator.h"
#include "config.h"
#include "model.h"

// Scales a channel count by the width multiplier and rounds it up to a multiple of `divisor`.
//
//   x            - nominal channel count
//   gw           - width multiplier
//   max_channels - upper bound applied to x before scaling
//   divisor      - the result is a multiple of this value
static int get_width(int x, float gw, int max_channels, int divisor = 8) {
    const int capped = (max_channels < x) ? max_channels : x;
    const int groups = int(ceil((capped * gw) / divisor));
    return groups * divisor;
}

// Scales a repeat count by the depth multiplier.
//
// x == 1 is returned unchanged. Otherwise x * gd is rounded to the nearest integer, with exact
// halves resolved towards the even neighbour, and the result is never smaller than 1.
static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }

    const float scaled = x * gd;
    const int truncated = int(scaled);
    int depth = round(scaled);

    // round() moves halves away from zero; step back when the even neighbour is the lower one.
    const bool half_above_even = (scaled - truncated == 0.5) && (truncated % 2 == 0);
    if (half_above_even) {
        depth -= 1;
    }
    return depth > 1 ? depth : 1;
}
// Conv2d -> BatchNorm2d -> SiLU stack with explicitly named layers.
//
// Layer names: lname + ".conv", ".bn", ".sigmoid" and ".silu".
//
// Parameters:
//   network   - network definition the layers are appended to
//   weightMap - weights keyed by name; lname + ".conv.weight" is read here and the map is
//               forwarded to addBatchNorm2d with the prefix lname + ".bn"
//   input     - tensor fed to the convolution
//   ch        - number of convolution output maps
//   k, s, p   - kernel size, stride and padding, each applied to both spatial dimensions
//   lname     - weight-name and layer-name prefix
// Returns the element-wise product layer implementing x * sigmoid(x).
static nvinfer1::IElementWiseLayer* convBnSiLUProto(nvinfer1::INetworkDefinition* network,
                                                    std::map<std::string, nvinfer1::Weights> weightMap,
                                                    nvinfer1::ITensor& input, int ch, int k, int s, int p,
                                                    std::string lname) {
    nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv =
            network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty);
    assert(conv);

    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{p, p});
    conv->setName((lname + ".conv").c_str());

    nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
    assert(bn);
    bn->setName((lname + ".bn").c_str());

    // Tensor consumed by both the sigmoid and the product.
    nvinfer1::ITensor* act_input = bn->getOutput(0);

#if defined(USE_INT8)
    // This single-input concat is not used for calculation; it is inserted to prevent the
    // operator-fusion error seen when quantizing to int8:
    // Error Code 10: Internal Error (Could not find any implementation for node
    // model.22.proto.cv3.conv + model.22.proto.cv3.sigmoid + PWN(PWN((Unnamed Layer* 353) [Activation]), PWN(model.22.proto.cv3.silu)).)
    nvinfer1::ITensor* inputTensors[] = {bn->getOutput(0)};
    auto concat = network->addConcatenation(inputTensors, 1);
    assert(concat);
    act_input = concat->getOutput(0);
#endif

    nvinfer1::IActivationLayer* sigmoid = network->addActivation(*act_input, nvinfer1::ActivationType::kSIGMOID);
    assert(sigmoid);
    // Name the sigmoid layer itself, so the BatchNorm layer keeps its ".bn" name.
    sigmoid->setName((lname + ".sigmoid").c_str());

    nvinfer1::IElementWiseLayer* ew =
            network->addElementWise(*act_input, *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(ew);
    ew->setName((lname + ".silu").c_str());
    return ew;
}

// Proto head: 3x3 convBnSiLU (".cv1") -> 2x2 stride-2 deconvolution (".upsample") ->
// 3x3 convBnSiLU (".cv2") -> 1x1 convBnSiLUProto (".cv3") with 32 output maps.
//
// Parameters:
//   network      - network definition the layers are appended to
//   weightMap    - weights keyed by name; the deconvolution reads lname + ".upsample.weight"
//                  and lname + ".upsample.bias"
//   input        - input tensor
//   lname        - weight-name prefix
//   gw           - width multiplier used to size the intermediate layers
//   max_channels - channel cap forwarded to get_width
// Returns the final ".cv3" layer.
static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network,
                                          std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
                                          std::string lname, float gw, int max_channels) {
    int mid_channel = get_width(256, gw, max_channels);
    auto cv1 = convBnSiLU(network, weightMap, input, mid_channel, {3, 3}, 1, lname + ".cv1");
    assert(cv1);

    auto convTranpsose =
            network->addDeconvolutionNd(*cv1->getOutput(0), mid_channel, nvinfer1::DimsHW{2, 2},
                                        weightMap[lname + ".upsample.weight"], weightMap[lname + ".upsample.bias"]);
    assert(convTranpsose);
    convTranpsose->setStrideNd(nvinfer1::DimsHW{2, 2});
    // Use the Nd setter like the rest of the network code; the 2-D setPadding is deprecated.
    convTranpsose->setPaddingNd(nvinfer1::DimsHW{0, 0});

    auto cv2 = convBnSiLU(network, weightMap, *convTranpsose->getOutput(0), mid_channel, {3, 3}, 1, lname + ".cv2");
    assert(cv2);
    auto cv3 = convBnSiLUProto(network, weightMap, *cv2->getOutput(0), 32, 1, 1, 0, lname + ".cv3");
    assert(cv3);
    return cv3;
}

// Auxiliary head branch: two 3x3 convBnSiLU stages (".0", ".1"), a biased 1x1 convolution
// (".2") and a reshape to (kBatchSize, nm_nk, grid_shape).
//
// The output channel count nm_nk depends on algo_type:
//   "seg"  -> 32
//   "pose" -> kNumberOfPoints * 3
// The two convBnSiLU stages use max(get_width(256, gw, max_channels) / 4, nm_nk) maps. For
// any other algo_type both counts stay 0.
static nvinfer1::IShuffleLayer* cv4_conv_combined(nvinfer1::INetworkDefinition* network,
                                                  std::map<std::string, nvinfer1::Weights>& weightMap,
                                                  nvinfer1::ITensor& input, std::string lname, int grid_shape, float gw,
                                                  const std::string& algo_type, int max_channels) {
    int nm_nk = 0;
    int c4 = 0;
    const bool is_seg = (algo_type == "seg");
    const bool is_pose = !is_seg && (algo_type == "pose");
    if (is_seg || is_pose) {
        nm_nk = is_seg ? 32 : kNumberOfPoints * 3;
        c4 = std::max(get_width(256, gw, max_channels) / 4, nm_nk);
    }

    auto stage0 = convBnSiLU(network, weightMap, input, c4, {3, 3}, 1, lname + ".0");
    auto stage1 = convBnSiLU(network, weightMap, *stage0->getOutput(0), c4, {3, 3}, 1, lname + ".1");

    // Re-wrap the stored bias as float weights with the same values pointer and count.
    const nvinfer1::Weights stored_bias = weightMap[lname + ".2" + ".bias"];
    const int bias_count = stored_bias.count;
    nvinfer1::Weights head_bias{nvinfer1::DataType::kFLOAT, stored_bias.values, bias_count};

    auto head = network->addConvolutionNd(*stage1->getOutput(0), nm_nk, nvinfer1::DimsHW{1, 1},
                                          weightMap[lname + ".2" + ".weight"], head_bias);
    head->setStrideNd(nvinfer1::DimsHW{1, 1});

    nvinfer1::IShuffleLayer* flattened = network->addShuffle(*head->getOutput(0));
    flattened->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, nm_nk, grid_shape});
    return flattened;
}

// For each of the `size` layers, writes reference_size divided by dimension 2 of the layer's
// first output into strides[i] (integer division).
void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) {
    for (int idx = 0; idx != size; ++idx) {
        nvinfer1::ILayer* current = conv_layers[idx];
        const nvinfer1::Dims out_dims = current->getOutput(0)->getDimensions();
        strides[idx] = reference_size / out_dims.d[2];
    }
}

// ILayer overload: for each of the `size` layers, writes reference_size divided by dimension 2
// of the layer's first output into strides[i] (integer division).
void calculateStrides(nvinfer1::ILayer* conv_layers[], int size, int reference_size, int strides[]) {
    for (int idx = 0; idx != size; ++idx) {
        nvinfer1::ILayer* current = conv_layers[idx];
        const nvinfer1::Dims out_dims = current->getOutput(0)->getDimensions();
        strides[idx] = reference_size / out_dims.d[2];
    }
}

// Builds and serializes the YOLOv12 classification network.
//
// The network is assembled from the weights in `wts_path`: backbone layers "model.0" to
// "model.8", then the classification head "model.9" (1x1 Conv to 1280 maps, global average
// pooling, linear layer with kClsNumClass outputs). The single output tensor is named
// kOutputTensorName.
//
// Parameters:
//   builder      - TensorRT builder used to create and build the network
//   config       - builder configuration; workspace limit and precision flags are set here
//   dt           - data type of the input tensor
//   wts_path     - path of the weight file passed to loadWeights
//   gd, gw       - depth and width multipliers
//   type         - model size tag; "m"/"l"/"x" enable C3k inside C3K2, "l"/"x" change mlp_ratio
//   max_channels - channel cap forwarded to get_width
// Returns the serialized engine, or nullptr if the build failed. The network definition and
// the buffers held in the weight map are released before returning.
nvinfer1::IHostMemory* buildEngineYolov12Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             std::string& type, int max_channels) {

    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    assert(network);

    nvinfer1::ITensor* data =
            network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kClsInputH, kClsInputW});
    assert(data);

    nvinfer1::ILayer* conv0 = Conv(network, weightMap, *data, get_width(64, gw, max_channels), "model.0", 3, 2);
    nvinfer1::ILayer* conv1 =
            Conv(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), "model.1", 3, 2, 1, 2);

    bool c3k2 = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k2 = true;
    }
    float mlp_ratio = 2.0;
    bool residual = true;
    if (type == "l" || type == "x") {
        // Use 1.5 here when loading the official pretrained model:
        //mlp_ratio = 1.5;
        mlp_ratio = 1;  // value for self-trained models
    }

    nvinfer1::ILayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(256, gw, max_channels),
                                   get_depth(2, gd), "model.2", c3k2, 0.25);
    nvinfer1::ILayer* conv3 =
            Conv(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), "model.3", 3, 2, 1, 4);
    nvinfer1::ILayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(512, gw, max_channels),
                                   get_depth(2, gd), "model.4", c3k2, 0.25);
    nvinfer1::ILayer* conv5 =
            Conv(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), "model.5", 3, 2);
    nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                    get_depth(4, gd), "model.6", true, 1, residual, mlp_ratio);
    nvinfer1::ILayer* conv7 =
            Conv(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), "model.7", 3, 2);
    nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                                    get_depth(4, gd), "model.8", true, 1, residual, mlp_ratio);

    // Classification head: 1x1 Conv -> global average pool -> flatten -> linear.
    nvinfer1::ILayer* conv_class = Conv(network, weightMap, *conv8->getOutput(0), 1280, "model.9.conv");
    nvinfer1::Dims dim = conv_class->getOutput(0)->getDimensions();
    assert(dim.nbDims == 4);
    // Pooling window equal to the full spatial extent.
    nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), nvinfer1::PoolingType::kAVERAGE,
                                                           nvinfer1::DimsHW{dim.d[2], dim.d[3]});

    nvinfer1::IShuffleLayer* shuffle_0 = network->addShuffle(*pool2->getOutput(0));
    shuffle_0->setReshapeDimensions(nvinfer1::Dims2{kBatchSize, 1280});
    auto linear_weight = weightMap["model.9.linear.weight"];
    auto constant_weight = network->addConstant(nvinfer1::Dims2{kClsNumClass, 1280}, linear_weight);
    // The bias constant is declared as {1, kClsNumClass} and broadcast over the batch by the
    // element-wise sum, so its element count does not depend on kBatchSize.
    auto constant_bias = network->addConstant(nvinfer1::Dims2{1, kClsNumClass}, weightMap["model.9.linear.bias"]);
    // (kBatchSize, 1280) x (kClsNumClass, 1280)^T -> (kBatchSize, kClsNumClass)
    auto linear_matrix_multipy =
            network->addMatrixMultiply(*shuffle_0->getOutput(0), nvinfer1::MatrixOperation::kNONE,
                                       *constant_weight->getOutput(0), nvinfer1::MatrixOperation::kTRANSPOSE);
    auto yolo = network->addElementWise(*linear_matrix_multipy->getOutput(0), *constant_bias->getOutput(0),
                                        nvinfer1::ElementWiseOperation::kSUM);
    assert(yolo);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Limit the builder workspace to 16 MiB
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

    // Configuration according to the precision mode being used
#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    // NOTE(review): this branch needs the Int8EntropyCalibrator2 declaration, and the
    // "calibrator.h" include at the top of this file is commented out - confirm it is
    // reachable through another header before enabling USE_INT8.
    std::cout << "Your platform supports int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(kBatchSize, kClsInputW, kClsInputH, kInputQuantizationFolder,
                                                  "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    // Begin building the engine; this may take a while
    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    if (serialized_model) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Cleanup the network definition and allocated weights
    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

/**
 * Builds the YOLOv12 detection network layer by layer and returns the serialized engine.
 *
 * The function loads the weights from `wts_path`, creates an explicit-batch network, wires up
 * backbone -> neck -> detection head, attaches the YOLO decode plugin layer as the only network
 * output, configures workspace/precision on `config`, and serializes the network.
 *
 * @param builder       TensorRT builder used to create and serialize the network.
 * @param config        Builder configuration; the workspace limit and (depending on USE_FP16 /
 *                      USE_INT8) precision flags and calibrator are set on it here.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path handed to loadWeights().
 * @param gd            Depth multiple, forwarded to get_depth().
 * @param gw            Width multiple, forwarded to get_width().
 * @param max_channels  Channel cap, forwarded to get_width().
 * @param type          Model scale string; "m"/"l"/"x" enable c3k in the early C3K2 blocks and
 *                      "l"/"x" additionally switch the A2C2f residual/mlp_ratio settings.
 * @return The value returned by IBuilder::buildSerializedNetwork() (not checked for null here).
 *
 * The network definition is deleted and every weight buffer in the weight map is released with
 * free() before returning.
 */
nvinfer1::IHostMemory* buildEngineYolov12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels, std::string& type) {

    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    // =====================   input   ===================================================
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    // =====================   backbone   ===================================================
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0),
                                                    get_width(128, gw, max_channels), {3, 3}, 2, "model.1", 1, 2);

    // Scale-dependent block options.
    bool c3k2 = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k2 = true;
    }
    float mlp_ratio = 2.0;
    bool residual = false;
    if (type == "l" || type == "x") {
        mlp_ratio = 1.5;  // see the yolov12-seg/ultralytics/nn/tasks.py/parse_model()
        residual = true;
    }

    // C3K2 call order used below: (network, weightMap, input, c2, n, lname, c3k, e, ...).
    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *conv1->getOutput(0), get_width(256, gw, max_channels), get_depth(2, gd),
                 "model.2", c3k2, 0.25);

    nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                    get_width(256, gw, max_channels), {3, 3}, 2, "model.3", 1, 4);
    nvinfer1::IElementWiseLayer* conv4 =
            C3K2(network, weightMap, *conv3->getOutput(0), get_width(512, gw, max_channels), get_depth(2, gd),
                 "model.4", c3k2, 0.25);
    nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0),
                                                    get_width(512, gw, max_channels), {3, 3}, 2, "model.5");

    // A2C2f call order used below: (network, weightMap, input, c2, n, lname, a2, area, residual, mlp_ratio, ...).
    nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                    get_depth(4, gd), "model.6", true, 4, residual, mlp_ratio);

    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0),
                                                    get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");
    nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                                    get_depth(4, gd), "model.8", true, 1, residual, mlp_ratio);

    // =========================  neck ====================================================================
    // Resize scales per axis: the last two axes are doubled, the first two are left unchanged.
    float scale[] = {1.0, 1.0, 2.0, 2.0};

    // Top-down path: upsample conv8 and concatenate with conv6.
    nvinfer1::IResizeLayer* upsample9 = network->addResize(*conv8->getOutput(0));
    upsample9->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample9->setScales(scale, 4);
    nvinfer1::ITensor* inputTensors10[] = {upsample9->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat10 = network->addConcatenation(inputTensors10, 2);
    nvinfer1::ILayer* conv11 = A2C2f(network, weightMap, *cat10->getOutput(0), get_width(512, gw, max_channels),
                                     get_depth(2, gd), "model.11", false, -1, residual, mlp_ratio);

    // Upsample conv11 and concatenate with conv4.
    nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0));
    upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample12->setScales(scale, 4);
    nvinfer1::ITensor* inputTensors13[] = {upsample12->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat13 = network->addConcatenation(inputTensors13, 2);
    nvinfer1::ILayer* conv14 = A2C2f(network, weightMap, *cat13->getOutput(0), get_width(256, gw, max_channels),
                                     get_depth(2, gd), "model.14", false, -1, residual, mlp_ratio);

    // Bottom-up path: stride-2 conv on conv14, concatenate with conv11.
    nvinfer1::IElementWiseLayer* conv15 = convBnSiLU(network, weightMap, *conv14->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.15");
    nvinfer1::ITensor* inputTensors16[] = {conv15->getOutput(0), conv11->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensors16, 2);
    nvinfer1::ILayer* conv17 = A2C2f(network, weightMap, *cat16->getOutput(0), get_width(512, gw, max_channels),
                                     get_depth(2, gd), "model.17", false, -1, residual, mlp_ratio);

    // Stride-2 conv on conv17, concatenate with conv8.
    nvinfer1::IElementWiseLayer* conv18 = convBnSiLU(network, weightMap, *conv17->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.18");
    nvinfer1::ITensor* inputTensors19[] = {conv18->getOutput(0), conv8->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat19 = network->addConcatenation(inputTensors19, 2);
    nvinfer1::IElementWiseLayer* conv20 = C3K2(network, weightMap, *cat19->getOutput(0),
                                               get_width(1024, gw, max_channels), get_depth(2, gd), "model.20", true);

    // =============================== output ===================================================================
    // Hidden widths of the head branches: c2 for the location (cv2) branch, c3 for the class (cv3) branch.
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100));

    // Each of the three scales (conv14, conv17, conv20) gets:
    //   location branch: convBnSiLU 3x3 -> convBnSiLU 3x3 -> 1x1 conv with 64 outputs
    //   class branch:    DWConv 3x3 -> convBnSiLU 1x1 -> DWConv 3x3 -> convBnSiLU 1x1 -> 1x1 conv with kNumClass outputs
    // and the two branch outputs are concatenated.

    // output0   location
    nvinfer1::IElementWiseLayer* conv21_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv14->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv21_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv21_cv2_0_2 =
            network->addConvolutionNd(*conv21_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv2.0.2.weight"], weightMap["model.21.cv2.0.2.bias"]);
    conv21_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    // output0 classes
    auto* conv21_cv3_0_0_0 = DWConv(network, weightMap, *conv14->getOutput(0), get_width(256, gw, max_channels), {3, 3},
                                    1, "model.21.cv3.0.0.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.0.1");

    auto* conv21_cv3_0_1_0 =
            DWConv(network, weightMap, *conv21_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.0.1.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv3_0_1_2 =
            network->addConvolutionNd(*conv21_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv3.0.2.weight"], weightMap["model.21.cv3.0.2.bias"]);
    conv21_cv3_0_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv21_cv3_0_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});

    nvinfer1::ITensor* inputTensors21_0[] = {conv21_cv2_0_2->getOutput(0), conv21_cv3_0_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21_0 = network->addConcatenation(inputTensors21_0, 2);

    // out1 location
    nvinfer1::IElementWiseLayer* conv21_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv17->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv21_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv2_1_2 =
            network->addConvolutionNd(*conv21_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv2.1.2.weight"], weightMap["model.21.cv2.1.2.bias"]);
    conv21_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    // out1 classes
    auto* conv21_cv3_1_0_0 = DWConv(network, weightMap, *conv17->getOutput(0), get_width(512, gw, max_channels), {3, 3},
                                    1, "model.21.cv3.1.0.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.0.1");
    auto* conv21_cv3_1_1_0 =
            DWConv(network, weightMap, *conv21_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.1.1.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv3_1_1_2 =
            network->addConvolutionNd(*conv21_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv3.1.2.weight"], weightMap["model.21.cv3.1.2.bias"]);
    conv21_cv3_1_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv21_cv3_1_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});

    nvinfer1::ITensor* inputTensors21_1[] = {conv21_cv2_1_2->getOutput(0), conv21_cv3_1_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21_1 = network->addConcatenation(inputTensors21_1, 2);

    // out2 location
    nvinfer1::IElementWiseLayer* conv21_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv20->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv21_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv21_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv21_cv2_2_2 =
            network->addConvolutionNd(*conv21_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv2.2.2.weight"], weightMap["model.21.cv2.2.2.bias"]);
    conv21_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    // out2 classes
    auto* conv21_cv3_2_0_0 = DWConv(network, weightMap, *conv20->getOutput(0), get_width(1024, gw, max_channels),
                                    {3, 3}, 1, "model.21.cv3.2.0.0");
    // The 1x1 conv must consume the depthwise conv above (as in the other two scales); feeding it
    // conv20 directly would leave model.21.cv3.2.0.0 out of the graph.
    nvinfer1::IElementWiseLayer* conv21_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.0.1");
    auto* conv21_cv3_2_1_0 =
            DWConv(network, weightMap, *conv21_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.2.1.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv3_2_1_2 =
            network->addConvolutionNd(*conv21_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv3.2.2.weight"], weightMap["model.21.cv3.2.2.bias"]);
    conv21_cv3_2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv3_2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    nvinfer1::ITensor* inputTensor21_2[] = {conv21_cv2_2_2->getOutput(0), conv21_cv3_2_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21_2 = network->addConcatenation(inputTensor21_2, 2);

    // ============================================ yolov12  detect =========================================
    // Strides of the three head scales are obtained from calculateStrides() using conv3/conv5/conv7 and kInputH.
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // For each scale: reshape to {batch, 64 + kNumClass, cells}, slice off the first 64 channels
    // (location) and the following kNumClass channels (classes), run DFL on the location slice,
    // and concatenate the DFL output with the class slice along axis 1.

    // ---- scale 0 ----
    nvinfer1::IShuffleLayer* shuffle21_0 = network->addShuffle(*cat21_0->getOutput(0));
    shuffle21_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split21_0_0 = network->addSlice(
            *shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split21_0_1 =
            network->addSlice(*shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* dfl21_0 =
            DFL(network, weightMap, *split21_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.21.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl21_0->getOutput(0), split21_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2);
    cat22_dfl_0->setAxis(1);

    // ---- scale 1 ----
    nvinfer1::IShuffleLayer* shuffle21_1 = network->addShuffle(*cat21_1->getOutput(0));
    shuffle21_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split21_1_0 = network->addSlice(
            *shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split21_1_1 =
            network->addSlice(*shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl21_1 =
            DFL(network, weightMap, *split21_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.21.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl21_1->getOutput(0), split21_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2);
    cat22_dfl_1->setAxis(1);

    // ---- scale 2 ----
    nvinfer1::IShuffleLayer* shuffle21_2 = network->addShuffle(*cat21_2->getOutput(0));
    shuffle21_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split21_2_0 = network->addSlice(
            *shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split21_2_1 =
            network->addSlice(*shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl21_2 =
            DFL(network, weightMap, *split21_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.21.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl21_2->getOutput(0), split21_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2);
    cat22_dfl_2->setAxis(1);

    // The YOLO plugin layer consumes the three per-scale tensors; its output is the network output.
    // NOTE(review): the trailing (true, false, false) flags are defined by addYoLoLayer - confirm their meaning there.
    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2},
                         strides, stridesLength, true, false, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Workspace memory pool limit: 64 MiB.
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 64 * (1 << 20));

    // Precision mode is selected at compile time.
#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // The calibrator is handed to the config and is not deleted in this function.
    auto* calibrator = new Int8EntropyCalibrator2(kBatchSize, kInputW, kInputH, kInputQuantizationFolder,
                                                  "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // The network definition is no longer needed once serialization has been attempted.
    delete network;

    // Release the weight buffers held by the weight map.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

nvinfer1::IHostMemory* buildEngineYolov12Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels, std::string& type) {

    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    // =====================   input   ===================================================
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    // =====================   backbone   ===================================================
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0),
                                                    get_width(128, gw, max_channels), {3, 3}, 2, "model.1", 1, 2);

    bool c3k2 = false;
    if (type == "m" || type == "l" || type == "x") {
        c3k2 = true;
    }
    float mlp_ratio = 2.0;
    bool residual = true;
    if (type == "l" || type == "x") {
        mlp_ratio = 1;  // see the yolov12-seg/ultralytics/nn/tasks.py/parse_model()
        // residual = true;
    }
    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *conv1->getOutput(0), get_width(256, gw, max_channels), get_depth(2, gd),
                 "model.2", c3k2, 0.25);

    nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                    get_width(256, gw, max_channels), {3, 3}, 2, "model.3", 1, 4);
    nvinfer1::IElementWiseLayer* conv4 =
            C3K2(network, weightMap, *conv3->getOutput(0), get_width(512, gw, max_channels), get_depth(2, gd),
                 "model.4", c3k2, 0.25);
    nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0),
                                                    get_width(512, gw, max_channels), {3, 3}, 2, "model.5");
    nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                    get_depth(4, gd), "model.6", true, 4, residual, mlp_ratio);
    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0),
                                                    get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");
    nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                                    get_depth(4, gd), "model.8", true, 1, residual, mlp_ratio);

    // =========================  neck ====================================================================
    float scale[] = {1.0, 1.0, 2.0, 2.0};

    nvinfer1::IResizeLayer* upsample9 = network->addResize(*conv8->getOutput(0));
    upsample9->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample9->setScales(scale, 4);
    nvinfer1::ITensor* inputTensors10[] = {upsample9->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat10 = network->addConcatenation(inputTensors10, 2);
    nvinfer1::ILayer* conv11 = A2C2f(network, weightMap, *cat10->getOutput(0), get_width(512, gw, max_channels),
                                     get_depth(2, gd), "model.11", false, -1, residual, mlp_ratio);

    nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0));
    upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample12->setScales(scale, 4);
    nvinfer1::ITensor* inputTensors13[] = {upsample12->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat13 = network->addConcatenation(inputTensors13, 2);
    nvinfer1::ILayer* conv14 = A2C2f(network, weightMap, *cat13->getOutput(0), get_width(256, gw, max_channels),
                                     get_depth(2, gd), "model.14", false, -1, residual, mlp_ratio);

    nvinfer1::IElementWiseLayer* conv15 = convBnSiLU(network, weightMap, *conv14->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.15");
    nvinfer1::ITensor* inputTensors16[] = {conv15->getOutput(0), conv11->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensors16, 2);
    nvinfer1::ILayer* conv17 = A2C2f(network, weightMap, *cat16->getOutput(0), get_width(512, gw, max_channels),
                                     get_depth(2, gd), "model.17", false, -1, residual, mlp_ratio);

    nvinfer1::IElementWiseLayer* conv18 = convBnSiLU(network, weightMap, *conv17->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.18");
    nvinfer1::ITensor* inputTensors19[] = {conv18->getOutput(0), conv8->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat19 = network->addConcatenation(inputTensors19, 2);
    nvinfer1::IElementWiseLayer* conv20 = C3K2(network, weightMap, *cat19->getOutput(0),
                                               get_width(1024, gw, max_channels), get_depth(2, gd), "model.20", true);

    // =============================== output ===================================================================
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100));

    // output0   location
    nvinfer1::IElementWiseLayer* conv21_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv14->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv21_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv21_cv2_0_2 =
            network->addConvolutionNd(*conv21_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv2.0.2.weight"], weightMap["model.21.cv2.0.2.bias"]);
    conv21_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    // output0 classes
    auto* conv21_cv3_0_0_0 = DWConv(network, weightMap, *conv14->getOutput(0), get_width(256, gw, max_channels), {3, 3},
                                    1, "model.21.cv3.0.0.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.0.1");

    auto* conv21_cv3_0_1_0 =
            DWConv(network, weightMap, *conv21_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.0.1.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv3_0_1_2 =
            network->addConvolutionNd(*conv21_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv3.0.2.weight"], weightMap["model.21.cv3.0.2.bias"]);
    conv21_cv3_0_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv21_cv3_0_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});

    nvinfer1::ITensor* inputTensors21_0[] = {conv21_cv2_0_2->getOutput(0), conv21_cv3_0_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21_0 = network->addConcatenation(inputTensors21_0, 2);

    // out1 location
    nvinfer1::IElementWiseLayer* conv21_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv17->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv21_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv2_1_2 =
            network->addConvolutionNd(*conv21_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv2.1.2.weight"], weightMap["model.21.cv2.1.2.bias"]);
    conv21_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    // out1 classes
    auto* conv21_cv3_1_0_0 = DWConv(network, weightMap, *conv17->getOutput(0), get_width(512, gw, max_channels), {3, 3},
                                    1, "model.21.cv3.1.0.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.0.1");
    auto* conv21_cv3_1_1_0 =
            DWConv(network, weightMap, *conv21_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.1.1.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv3_1_1_2 =
            network->addConvolutionNd(*conv21_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv3.1.2.weight"], weightMap["model.21.cv3.1.2.bias"]);
    conv21_cv3_1_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv21_cv3_1_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});

    nvinfer1::ITensor* inputTensors21_1[] = {conv21_cv2_1_2->getOutput(0), conv21_cv3_1_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21_1 = network->addConcatenation(inputTensors21_1, 2);

    // out2 location
    nvinfer1::IElementWiseLayer* conv21_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv20->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv21_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv21_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv21_cv2_2_2 =
            network->addConvolutionNd(*conv21_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv2.2.2.weight"], weightMap["model.21.cv2.2.2.bias"]);
    conv21_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    // out2 classes
    auto* conv21_cv3_2_0_0 = DWConv(network, weightMap, *conv20->getOutput(0), get_width(1024, gw, max_channels),
                                    {3, 3}, 1, "model.21.cv3.2.0.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.0.1");
    auto* conv21_cv3_2_1_0 =
            DWConv(network, weightMap, *conv21_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.2.1.0");
    nvinfer1::IElementWiseLayer* conv21_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv21_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.1.1");
    nvinfer1::IConvolutionLayer* conv21_cv3_2_1_2 =
            network->addConvolutionNd(*conv21_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.21.cv3.2.2.weight"], weightMap["model.21.cv3.2.2.bias"]);
    conv21_cv3_2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv21_cv3_2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    nvinfer1::ITensor* inputTensor21_2[] = {conv21_cv2_2_2->getOutput(0), conv21_cv3_2_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21_2 = network->addConcatenation(inputTensor21_2, 2);

    // ============================================ yolov12  detect =========================================
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    nvinfer1::IShuffleLayer* shuffle21_0 = network->addShuffle(*cat21_0->getOutput(0));
    shuffle21_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split21_0_0 = network->addSlice(
            *shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split21_0_1 =
            network->addSlice(*shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* dfl21_0 =
            DFL(network, weightMap, *split21_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.21.dfl.conv.weight");
    auto proto_coef_0 = cv4_conv_combined(network, weightMap, *conv14->getOutput(0), "model.21.cv4.0",
                                          (kInputH / strides[0]) * (kInputW / strides[0]), gw, "seg", max_channels);
    nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl21_0->getOutput(0), split21_0_1->getOutput(0),
                                                proto_coef_0->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3);
    cat22_dfl_0->setAxis(1);

    nvinfer1::IShuffleLayer* shuffle21_1 = network->addShuffle(*cat21_1->getOutput(0));
    shuffle21_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split21_1_0 = network->addSlice(
            *shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split21_1_1 =
            network->addSlice(*shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl21_1 =
            DFL(network, weightMap, *split21_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.21.dfl.conv.weight");
    auto proto_coef_1 = cv4_conv_combined(network, weightMap, *conv17->getOutput(0), "model.21.cv4.1",
                                          (kInputH / strides[1]) * (kInputW / strides[1]), gw, "seg", max_channels);
    nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl21_1->getOutput(0), split21_1_1->getOutput(0),
                                                proto_coef_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3);
    cat22_dfl_1->setAxis(1);

    nvinfer1::IShuffleLayer* shuffle21_2 = network->addShuffle(*cat21_2->getOutput(0));
    shuffle21_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split21_2_0 = network->addSlice(
            *shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split21_2_1 =
            network->addSlice(*shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl21_2 =
            DFL(network, weightMap, *split21_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.21.dfl.conv.weight");
    auto proto_coef_2 = cv4_conv_combined(network, weightMap, *conv20->getOutput(0), "model.21.cv4.2",
                                          (kInputH / strides[2]) * (kInputW / strides[2]), gw, "seg", max_channels);
    nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl21_2->getOutput(0), split21_2_1->getOutput(0),
                                                proto_coef_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3);
    cat22_dfl_2->setAxis(1);

    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2},
                         strides, stridesLength, true, false, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    auto proto = Proto(network, weightMap, *conv14->getOutput(0), "model.21.proto", gw, max_channels);
    proto->getOutput(0)->setName(kProtoTensorName);
    network->markOutput(*proto->getOutput(0));

    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(kBatchSize, kInputW, kInputH, kInputQuantizationFolder,
                                                  "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}


================================================
FILE: yolov12-tubro/src/postprocess.cpp
================================================
#include "postprocess.h"
#include "utils.h"

// Converts a box from network-input coordinates into a rectangle on the original image.
//   img  : original image; only its cols/rows are read here.
//   bbox : {left, top, right, bottom} expressed in the kInputW x kInputH input frame.
// Returns the box in image pixels. The origin is clamped to >= 0 and the size is
// limited so the rectangle does not extend past the right/bottom image edge.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float ratio_w = kInputW / (img.cols * 1.0);
    const float ratio_h = kInputH / (img.rows * 1.0);

    float left = bbox[0];
    float top = bbox[1];
    float right = bbox[2];
    float bottom = bbox[3];
    float scale;

    if (ratio_h > ratio_w) {
        // Width ratio is the smaller one: remove half of the unused input height from y.
        scale = ratio_w;
        top -= (kInputH - ratio_w * img.rows) / 2;
        bottom -= (kInputH - ratio_w * img.rows) / 2;
    } else {
        // Height ratio is the smaller one: remove half of the unused input width from x.
        scale = ratio_h;
        left -= (kInputW - ratio_h * img.cols) / 2;
        right -= (kInputW - ratio_h * img.cols) / 2;
    }

    // Undo the resize factor.
    left /= scale;
    right /= scale;
    top /= scale;
    bottom /= scale;

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Same mapping as get_rect, but relative to the kObbInputW x kObbInputH input frame.
//   img  : original image; only its cols/rows are read here.
//   bbox : {left, top, right, bottom} in input-frame coordinates.
// Returns the box in image pixels with the origin clamped to >= 0 and the size
// limited to what fits inside the image.
cv::Rect get_rect_obb(cv::Mat& img, float bbox[4]) {
    const float ratio_w = kObbInputW / (img.cols * 1.0);
    const float ratio_h = kObbInputH / (img.rows * 1.0);

    float left = bbox[0];
    float top = bbox[1];
    float right = bbox[2];
    float bottom = bbox[3];
    float scale;

    if (ratio_h > ratio_w) {
        // Width ratio is the smaller one: remove half of the unused input height from y.
        scale = ratio_w;
        top -= (kObbInputH - ratio_w * img.rows) / 2;
        bottom -= (kObbInputH - ratio_w * img.rows) / 2;
    } else {
        // Height ratio is the smaller one: remove half of the unused input width from x.
        scale = ratio_h;
        left -= (kObbInputW - ratio_h * img.cols) / 2;
        right -= (kObbInputW - ratio_h * img.cols) / 2;
    }

    // Undo the resize factor.
    left /= scale;
    right /= scale;
    top /= scale;
    bottom /= scale;

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Maps a box AND its landmarks from the kInputW x kInputH input frame to image pixels.
//   img  : original image; only its cols/rows are read here.
//   bbox : {left, top, right, bottom} in input-frame coordinates (not modified).
//   lmk  : kNumberOfPoints triples; the first two values of each triple are rewritten
//          in place with image coordinates, the third value is left untouched.
// Returns the box in image pixels, clamped the same way as get_rect.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
    const float ratio_w = kInputW / (img.cols * 1.0);
    const float ratio_h = kInputH / (img.rows * 1.0);

    float left, top, right, bottom;
    if (ratio_h > ratio_w) {
        // Half of the input height that the scaled image does not cover.
        const auto pad_y = (kInputH - ratio_w * img.rows) / 2;
        left = bbox[0] / ratio_w;
        right = bbox[2] / ratio_w;
        top = (bbox[1] - pad_y) / ratio_w;
        bottom = (bbox[3] - pad_y) / ratio_w;
        for (int p = 0; p < kNumberOfPoints; ++p) {
            float* point = lmk + p * 3;
            point[0] /= ratio_w;
            point[1] = (point[1] - pad_y) / ratio_w;
            // point[2] is passed through unchanged
        }
    } else {
        // Half of the input width that the scaled image does not cover.
        const auto pad_x = (kInputW - ratio_h * img.cols) / 2;
        left = (bbox[0] - pad_x) / ratio_h;
        right = (bbox[2] - pad_x) / ratio_h;
        top = bbox[1] / ratio_h;
        bottom = bbox[3] / ratio_h;
        for (int p = 0; p < kNumberOfPoints; ++p) {
            float* point = lmk + p * 3;
            point[0] = (point[0] - pad_x) / ratio_h;
            point[1] /= ratio_h;
            // point[2] is passed through unchanged
        }
    }

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Intersection-over-union of two axis-aligned boxes given as {x1, y1, x2, y2}.
// Returns 0 when the boxes do not overlap, and also when the union area is not
// positive (e.g. two coincident zero-area boxes), so the result is never NaN
// from a 0/0 division.
static float iou(float lbox[4], float rbox[4]) {
    const float inter_left = (std::max)(lbox[0], rbox[0]);
    const float inter_right = (std::min)(lbox[2], rbox[2]);
    const float inter_top = (std::max)(lbox[1], rbox[1]);
    const float inter_bottom = (std::min)(lbox[3], rbox[3]);

    if (inter_top > inter_bottom || inter_left > inter_right)
        return 0.0f;

    const float inter_area = (inter_right - inter_left) * (inter_bottom - inter_top);
    const float union_area =
            (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - inter_area;

    // Negated comparison so that a NaN union is rejected as well.
    if (!(union_area > 0.0f))
        return 0.0f;

    return inter_area / union_area;
}

// Sort predicate for detections: higher confidence first; when confidences are
// equal, the detection with the smaller bbox[0] comes first.
static bool cmp(const Detection& a, const Detection& b) {
    if (a.conf != b.conf) {
        return a.conf > b.conf;
    }
    return a.bbox[0] < b.bbox[0];
}

void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4]))
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

// Runs nms once per image of a batch.
//   res_batch   : resized to batch_size; entry b receives the detections of image b.
//   output      : batch_size consecutive buffers of output_size floats each.
//   conf_thresh / nms_thresh : forwarded unchanged to nms.
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = output + b * output_size;
        nms(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}

// Collects the boxes flagged as "keep" from a decoded buffer.
//   res             : kept detections are appended here.
//   decode_ptr_host : element 0 is skipped; records of bbox_element floats follow,
//                     laid out as {bbox[0..3], conf, class_id, keep_flag, ...}.
//   bbox_element    : number of floats per record.
//   img             : not used by this function.
//   count           : number of records to inspect.
// Only bbox, conf and class_id are filled in; every other Detection member is
// zero-initialised rather than left indeterminate.
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count) {
    for (int i = 0; i < count; i++) {
        const float* item = decode_ptr_host + 1 + i * bbox_element;
        const int keep_flag = item[6];
        if (keep_flag != 1)
            continue;

        Detection det{};  // value-initialise so unset members are not garbage
        det.bbox[0] = item[0];
        det.bbox[1] = item[1];
        det.bbox[2] = item[2];
        det.bbox[3] = item[3];
        det.conf = item[4];
        det.class_id = item[5];
        res.push_back(det);
    }
}

void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}

// Draws every detection of every image: a rectangle plus the integer class id
// as text one pixel above the rectangle's top-left corner.
//   img_batch : images to draw on (modified in place; cv::Mat copies share pixel data).
//   res_batch : res_batch[i] holds the detections for img_batch[i].
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

    for (size_t i = 0; i < img_batch.size(); ++i) {
        auto& dets = res_batch[i];
        cv::Mat img = img_batch[i];
        for (auto& det : dets) {
            const cv::Rect r = get_rect(img, det.bbox);
            cv::rectangle(img, r, box_color, 2);

            const std::string label = std::to_string((int)det.class_id);
            cv::putText(img, label, cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
        }
    }
}

// Draws boxes, keypoints and the lines connecting keypoint pairs for every image.
//   img_batch : images to draw on (modified in place; cv::Mat copies share pixel data).
//   res_batch : res_batch[i] holds the detections for img_batch[i]; each detection's
//               bbox is read and its keypoints are rewritten to image coordinates by
//               get_rect_adapt_landmark.
// A keypoint is drawn only when the third value of its triple is > 0.5; a line is
// drawn only when both of its endpoints pass that test.
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    // Pairs of keypoint indices that are joined by a line.
    const std::vector<std::pair<int, int>> limbs = {
            {0, 1}, {0, 2},  {0, 5}, {0, 6},  {1, 2},   {1, 3},   {2, 4},   {5, 6},   {5, 7},  {5, 11},
            {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};

    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);
    const cv::Scalar joint_color(0, 0x27, 0xC1);

    for (size_t i = 0; i < img_batch.size(); ++i) {
        auto& dets = res_batch[i];
        cv::Mat img = img_batch[i];
        for (auto& det : dets) {
            const cv::Rect r = get_rect_adapt_landmark(img, det.bbox, det.keypoints);
            cv::rectangle(img, r, box_color, 2);
            cv::putText(img, std::to_string((int)det.class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
                        text_color, 2);

            const float* kpts = det.keypoints;
            auto is_visible = [kpts](int p) { return kpts[p * 3 + 2] > 0.5; };
            auto pixel_of = [kpts](int p) { return cv::Point((int)kpts[p * 3], (int)kpts[p * 3 + 1]); };

            for (int p = 0; p < kNumberOfPoints; ++p) {
                if (is_visible(p)) {
                    cv::circle(img, pixel_of(p), 3, joint_color, -1);
                }
            }

            for (const auto& limb : limbs) {
                if (is_visible(limb.first) && is_visible(limb.second)) {
                    cv::line(img, pixel_of(limb.first), pixel_of(limb.second), joint_color, 2);
                }
            }
        }
    }
}

// Crops the part of an input-sized mask that corresponds to the image content and
// resizes that crop to the image size.
//   mask : mask in the kInputW x kInputH frame.
//   img  : original image; only its size is used.
// Returns the resized crop.
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    const float ratio_w = kInputW / (img.cols * 1.0);
    const float ratio_h = kInputH / (img.rows * 1.0);

    cv::Rect roi;
    if (ratio_h > ratio_w) {
        // Full input width is used; the crop is centred vertically.
        roi.width = kInputW;
        roi.height = ratio_w * img.rows;
        roi.x = 0;
        roi.y = (kInputH - roi.height) / 2;
    } else {
        // Full input height is used; the crop is centred horizontally.
        roi.width = ratio_h * img.cols;
        roi.height = kInputH;
        roi.x = (kInputW - roi.width) / 2;
        roi.y = 0;
    }

    cv::Mat scaled;
    cv::resize(mask(roi), scaled, img.size());
    return scaled;
}

// Overlays instance masks, boxes and text labels on an image.
//   img        : image to draw on (modified in place).
//   dets       : detections; dets[i] is paired with masks[i].
//   masks      : one mask per detection, read as float after scale_mask.
//   labels_map : class id -> label text (operator[] is used, so a missing id
//                inserts and prints an empty label).
// Inside each box, pixels whose mask value is > 0.5 are blended 50/50 with the
// class colour; the label is "<name> <confidence>".
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < dets.size(); i++) {
        cv::Mat img_mask = scale_mask(masks[i], img);
        auto color = colors[(int)dets[i].class_id % colors.size()];
        // Palette entries are 0xRRGGBB; OpenCV wants B, G, R.
        auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

        cv::Rect r = get_rect(img, dets[i].bbox);
        // Row-major traversal: y outer, x inner.
        for (int y = r.y; y < r.y + r.height; y++) {
            for (int x = r.x; x < r.x + r.width; x++) {
                if (img_mask.at<float>(y, x) <= 0.5)
                    continue;
                cv::Vec3b& px = img.at<cv::Vec3b>(y, x);
                px[0] = px[0] / 2 + bgr[0] / 2;
                px[1] = px[1] / 2 + bgr[1] / 2;
                px[2] = px[2] / 2 + bgr[2] / 2;
            }
        }

        cv::rectangle(img, r, bgr, 2);

        // Build the label once and reuse it for measuring and drawing.
        const std::string label = labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf);
        cv::Size textSize = cv::getTextSize(label, cv::FONT_HERSHEY_PLAIN, 1.2, 2, nullptr);

        // Filled background behind the label, anchored at the box's top-left corner.
        cv::Point topLeft(r.x, r.y - textSize.height);
        cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
        cv::rectangle(img, topLeft, bottomRight, bgr, -1);

        cv::putText(img, label, cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
    }
}

// Collects the oriented boxes flagged as "keep" from a decoded buffer.
//   res             : kept detections are appended here.
//   decode_ptr_host : element 0 is skipped; records of bbox_element floats follow,
//                     laid out as {bbox[0..3], conf, class_id, keep_flag, angle, ...}.
//   bbox_element    : number of floats per record.
//   img             : not used by this function.
//   count           : number of records to inspect.
// Only bbox, conf, class_id and angle are filled in; every other Detection member
// is zero-initialised rather than left indeterminate.
void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count) {
    for (int i = 0; i < count; i++) {
        const float* item = decode_ptr_host + 1 + i * bbox_element;
        const int keep_flag = item[6];
        if (keep_flag != 1)
            continue;

        Detection det{};  // value-initialise so unset members are not garbage
        det.bbox[0] = item[0];
        det.bbox[1] = item[1];
        det.bbox[2] = item[2];
        det.bbox[3] = item[3];
        det.conf = item[4];
        det.class_id = item[5];
        det.angle = item[7];
        res.push_back(det);
    }
}

void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}

// Computes the three terms (a, b, c) that probiou uses for one oriented box.
// Reads bbox[2] and bbox[3] as width and height and `angle` as the rotation
// passed to cos/sin.
//   a = w^2/12 * cos^2 + h^2/12 * sin^2
//   b = w^2/12 * sin^2 + h^2/12 * cos^2
//   c = (w^2/12 - h^2/12) * cos * sin
std::tuple<float, float, float> convariance_matrix(Detection res) {
    const float width = res.bbox[2];
    const float height = res.bbox[3];
    const float theta = res.angle;

    const float var_w = width * width / 12.0;
    const float var_h = height * height / 12.0;

    const float cos_t = std::cos(theta);
    const float sin_t = std::sin(theta);
    const float cos_sq = cos_t * cos_t;
    const float sin_sq = sin_t * sin_t;

    const float term_a = var_w * cos_sq + var_h * sin_sq;
    const float term_b = var_w * sin_sq + var_h * cos_sq;
    const float term_c = (var_w - var_h) * cos_t * sin_t;

    return std::make_tuple(term_a, term_b, term_c);
}

// Probabilistic IoU between two oriented boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
//   res1, res2 : boxes; bbox[0], bbox[1] are read as the centre, and the
//                covariance terms come from convariance_matrix.
//   eps        : small constant added to denominators / log / sqrt arguments.
// Returns 1 - hd, where hd is derived from the distance term bd, which is
// clamped to [eps, 100].
static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) {
    // Unpack the covariance terms directly; nothing is read before it is assigned.
    float a1, b1, c1, a2, b2, c2;
    std::tie(a1, b1, c1) = convariance_matrix(res1);
    std::tie(a2, b2, c2) = convariance_matrix(res2);

    float x1 = res1.bbox[0], y1 = res1.bbox[1];
    float x2 = res2.bbox[0], y2 = res2.bbox[1];

    float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) /
               ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps);
    float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps);
    float t3 = std::log(
            ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2)) /
                    (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) +
                     eps) +
            eps);

    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    bd = std::max(std::min(bd, 100.0f), eps);
    float hd = std::sqrt(1.0 - std::exp(-bd) + eps);

    return 1 - hd;
}

void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {

        if (output[1 + det_size * i + 4] <= conf_thresh)
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (probiou(item, dets[n]) >= nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

// Runs nms_obb once per image of a batch.
//   res_batch   : resized to batch_size; entry b receives the detections of image b.
//   output      : batch_size consecutive buffers of output_size floats each.
//   conf_thresh / nms_thresh : forwarded unchanged to nms_obb.
void batch_nms_obb(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = output + b * output_size;
        nms_obb(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}

// Computes the four corner pixels of an oriented box on the original image.
//   img : original image; passed to get_rect_obb and used for the final clamp.
//   box : bbox is read as {center_x, center_y, w, h}; angle is read in radians.
// The axis-aligned extent of the box is first mapped to image pixels with
// get_rect_obb, then the corners are rotated around that rectangle's centre.
// Each returned point is clamped to lie inside the image.
// Note: three diagnostic lines are written to std::cout on every call.
static std::vector<cv::Point> get_corner(cv::Mat& img, const Detection& box) {
    // Centre point and width/height in network-input coordinates
    float x1 = box.bbox[0];
    float y1 = box.bbox[1];
    float w = box.bbox[2];
    float h = box.bbox[3];
    float angle = box.angle * 180.0f / CV_PI;  // Convert radians to degrees

    // Print original angle
    std::cout << "Original angle: " << angle << std::endl;

    // Swap width and height if height is greater than or equal to width
    if (h >= w) {
        std::swap(w, h);
        angle = fmod(angle + 90.0f, 180.0f);  // Rotate by 90 degrees to match the swapped sides
    }

    // Bring the angle back towards the 0..180 degree range
    if (angle < 0) {
        angle += 360.0f;  // Convert to positive value
    }
    if (angle > 180.0f) {
        angle -= 180.0f;  // Subtract 180 from angles greater than 180
    }

    // Print adjusted angle
    std::cout << "Adjusted angle: " << angle << std::endl;

    // Normalised angle; only used for the diagnostic print below
    float normal_angle = fmod(angle, 180.0f);
    if (normal_angle < 0) {
        normal_angle += 180.0f;  // Ensure it's a positive value
    }

    // Print normal angle value
    std::cout << "Normal angle: " << normal_angle << std::endl;

    const float cos_value = std::cos(angle * CV_PI / 180.0f);  // Convert to radians
    const float sin_value = std::sin(angle * CV_PI / 180.0f);

    // Axis-aligned extent of the (unrotated) box
    float l = x1 - w / 2;  // Left boundary
    float r = x1 + w / 2;  // Right boundary
    float t = y1 - h / 2;  // Top boundary
    float b = y1 + h / 2;  // Bottom boundary

    // Use get_rect_obb to scale the coordinates to the original image
    float bbox[4] = {l, t, r, b};
    cv::Rect rect = get_rect_obb(img, bbox);

    // Centre of the scaled rectangle. Divide in floating point: the rectangle
    // members are ints, and an integer division would drop the half pixel.
    float x_ = rect.x + rect.width / 2.0f;   // Center x
    float y_ = rect.y + rect.height / 2.0f;  // Center y
    float width = rect.width;                // Width
    float height = rect.height;              // Height

    // Half-extent vectors along the rotated width and height axes
    std::vector<cv::Point> corner_points(4);
    float vec1x = width / 2 * cos_value;
    float vec1y = width / 2 * sin_value;
    float vec2x = -height / 2 * sin_value;
    float vec2y = height / 2 * cos_value;

    corner_points[0] = cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y)));
    corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y)));
    corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y)));
    corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y)));

    // Clamp every corner so it stays inside the image
    for (auto& point : corner_points) {
        point.x = std::max(0, std::min(point.x, img.cols - 1));
        point.y = std::max(0, std::min(point.y, img.rows - 1));
    }

    return corner_points;
}

// Draws oriented boxes with a "<class id>:<confidence>" tag for every image.
//   img_batch : images to draw on (modified in place).
//   res_batch : res_batch[i] holds the detections for img_batch[i].
// The tag is anchored at the first corner returned by get_corner. It is placed
// above that corner when at least 3 pixels remain above the text, otherwise below.
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < img_batch.size(); ++i) {
        auto& dets = res_batch[i];
        cv::Mat& img = img_batch[i];
        for (auto& obj : dets) {
            // Palette entries are 0xRRGGBB; OpenCV wants B, G, R.
            const auto rgb = colors[(int)obj.class_id % colors.size()];
            const auto bgr = cv::Scalar(rgb & 0xFF, rgb >> 8 & 0xFF, rgb >> 16 & 0xFF);

            const auto corner_points = get_corner(img, obj);
            cv::polylines(img, std::vector<std::vector<cv::Point>>{corner_points}, true, bgr, 1);

            const auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf));
            const cv::Size text_size = cv::getTextSize(text, 0, 0.3, 1, nullptr);
            const int text_w = text_size.width;
            const int text_h = text_size.height;

            const cv::Point anchor(corner_points[0].x, corner_points[0].y);
            const bool outside = anchor.y - text_h >= 3;

            // Opposite corner of the filled tag background.
            cv::Point tag_corner;
            tag_corner.x = anchor.x + text_w;
            tag_corner.y = outside ? anchor.y - text_h - 3 : anchor.y + text_h + 3;
            cv::rectangle(img, anchor, tag_corner, bgr, -1, cv::LINE_AA);

            const int baseline_y = outside ? anchor.y - 2 : anchor.y + text_h + 2;
            cv::putText(img, text, cv::Point(anchor.x, baseline_y), 0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA);
        }
    }
}


================================================
FILE: yolov12-tubro/src/postprocess.cu
================================================
//
// Created by lindsay on 23-7-17.
//
#include "postprocess.h"
#include "types.h"

// Copies oriented-box candidates that pass the confidence threshold into a compact
// output array. One thread handles one candidate.
//   predict              : predict[0] is the candidate count, followed by Detection
//                          records stored as floats.
//   num_bboxes           : not read by the kernel.
//   confidence_threshold : candidates with a lower confidence are skipped.
//   parray               : parray[0] is incremented atomically to hand out output
//                          slots; records of bbox_element floats follow.
//   max_objects          : candidates that receive a slot index >= max_objects are
//                          dropped.
// Output record: {cx, cy, width, height, confidence, label, keep flag (1), angle}.
static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                         int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    // Test the confidence BEFORE reserving a slot, so rejected candidates do not
    // consume output capacity or inflate the counter in parray[0].
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    //[center_x center_y w h conf class_id  mask[32] keypoints[51] angle]
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];
    float label = pitem[5];
    float angle = pitem[89];  // 4 + 1 + 1 + 32 + 51 floats precede the angle in the layout above

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = cx;
    *pout_item++ = cy;
    *pout_item++ = width;
    *pout_item++ = height;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
    *pout_item++ = angle;
}

// Copies box candidates that pass the confidence threshold into a compact output
// array. One thread handles one candidate.
//   predict              : predict[0] is the candidate count, followed by Detection
//                          records stored as floats.
//   num_bboxes           : not read by the kernel.
//   confidence_threshold : candidates with a lower confidence are skipped.
//   parray               : parray[0] is incremented atomically to hand out output
//                          slots; records of bbox_element floats follow.
//   max_objects          : candidates that receive a slot index >= max_objects are
//                          dropped.
// Output record: {left, top, right, bottom, confidence, label, keep flag (1)}.
static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                     int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    // Test the confidence BEFORE reserving a slot, so rejected candidates do not
    // consume output capacity or inflate the counter in parray[0].
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}

// Intersection-over-union of two axis-aligned boxes, each given as
// left/top/right/bottom. Returns 0 when the intersection area is 0.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
                                float bright, float bbottom) {
    // Edges of the overlap region.
    const float inter_left = max(aleft, bleft);
    const float inter_top = max(atop, btop);
    const float inter_right = min(aright, bright);
    const float inter_bottom = min(abottom, bbottom);

    const float inter_area = max(inter_right - inter_left, 0.0f) * max(inter_bottom - inter_top, 0.0f);
    if (inter_area == 0.0f)
        return 0.0f;

    const float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter_area / (area_a + area_b - inter_area);
}

// Non-maximum suppression over the compact box array; one thread per box.
//   bboxes      : bboxes[0] is the box count, followed by records of bbox_element
//                 floats: {left, top, right, bottom, confidence, label, keep flag, ...}.
//   max_objects : upper bound applied to the box count.
//   threshold   : IoU above which the current box is suppressed.
// A thread clears its own keep flag (record[6] = 0) when another box of the same
// label with higher confidence - or equal confidence and a larger index -
// overlaps it by more than `threshold`.
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {
    const int self_idx = (blockDim.x * blockIdx.x + threadIdx.x);
    const int total = min(static_cast<int>(bboxes[0]), max_objects);
    if (self_idx >= total)
        return;

    float* self = bboxes + 1 + self_idx * bbox_element;
    for (int other_idx = 0; other_idx < total; ++other_idx) {
        if (other_idx == self_idx)
            continue;

        float* other = bboxes + 1 + other_idx * bbox_element;
        if (self[5] != other[5])
            continue;  // different label

        // Only a box with confidence >= ours can suppress us (negated so NaN never can).
        if (!(other[4] >= self[4]))
            continue;

        // On a confidence tie, the box with the lower index wins.
        if (other[4] == self[4] && other_idx < self_idx)
            continue;

        const float overlap = box_iou(self[0], self[1], self[2], self[3], other[0], other[1], other[2], other[3]);
        if (overlap > threshold) {
            self[6] = 0;
            return;
        }
    }
}

// Device-side computation of the three terms used by box_probiou for one box.
//   w, h    : box width and height.
//   r       : rotation passed to cosf/sinf.
//   a, b, c : outputs:
//             a = w^2/12 * cos^2 + h^2/12 * sin^2
//             b = w^2/12 * sin^2 + h^2/12 * cos^2
//             c = (w^2/12 - h^2/12) * sin * cos
static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) {
    const float var_w = w * w / 12.0f;
    const float var_h = h * h / 12.0f;
    const float cos_t = cosf(r);
    const float sin_t = sinf(r);

    a = var_w * cos_t * cos_t + var_h * sin_t * sin_t;
    b = var_w * sin_t * sin_t + var_h * cos_t * cos_t;
    c = (var_w - var_h) * sin_t * cos_t;
}

// Device-side probabilistic IoU between two oriented boxes,
// https://arxiv.org/pdf/2106.06072v1.pdf.
//   cx*, cy* : box centres;  w*, h* : sizes;  r* : rotations.
//   eps      : small constant added to denominators / log / sqrt arguments.
// Returns 1 - hd, where hd is derived from the distance term bd, which is
// clamped to [eps, 100].
static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2,
                                    float h2, float r2, float eps = 1e-7) {
    float a1, b1, c1, a2, b2, c2;
    convariance_matrix(w1, h1, r1, a1, b1, c1);
    convariance_matrix(w2, h2, r2, a2, b2, c2);

    // Sums of the per-box terms and the determinant-like quantity shared by t1, t2, t3.
    const float sum_a = a1 + a2;
    const float sum_b = b1 + b2;
    const float sum_c = c1 + c2;
    const float det_sum = sum_a * sum_b - powf(sum_c, 2);

    const float t1 = (sum_a * powf(cy1 - cy2, 2) + sum_b * powf(cx1 - cx2, 2)) / (det_sum + eps);
    const float t2 = (sum_c * (cx2 - cx1) * (cy1 - cy2)) / (det_sum + eps);
    const float t3 =
            logf(det_sum / (4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) * sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) + eps) +
                 eps);

    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    bd = fmaxf(fminf(bd, 100.0f), eps);
    const float hd = sqrtf(1.0f - expf(-bd) + eps);
    return 1 - hd;
}

// Non-maximum suppression for rotated boxes, one thread per candidate box.
//   bboxes      : bboxes[0] holds the number of stored boxes; records of `bbox_element` floats follow.
//                 Per record, slots 0..3 and 7 are handed to box_probiou() as (cx, cy, w, h) and angle,
//                 slot 4 is compared as the score, slot 5 must match for two boxes to compete
//                 (presumably the class id - confirm against the decode kernel), slot 6 is cleared to 0
//                 when the box is suppressed.
//   max_objects : upper bound on the number of records considered
//   threshold   : a box is suppressed when its probiou with a stronger box exceeds this value
static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) {
    const int self_idx = (blockDim.x * blockIdx.x + threadIdx.x);
    const int num_boxes = min(static_cast<int>(bboxes[0]), max_objects);
    if (self_idx >= num_boxes)
        return;

    float* self_box = bboxes + 1 + self_idx * bbox_element;
    for (int other_idx = 0; other_idx < num_boxes; ++other_idx) {
        if (other_idx == self_idx)
            continue;

        float* other_box = bboxes + 1 + other_idx * bbox_element;
        // Boxes only compete when slot 5 matches.
        if (self_box[5] != other_box[5])
            continue;
        // Only a box whose score is at least ours can suppress us.
        if (!(other_box[4] >= self_box[4]))
            continue;
        // Equal scores: a candidate with a lower index does not suppress this box.
        if (other_box[4] == self_box[4] && other_idx < self_idx)
            continue;

        const float overlap = box_probiou(self_box[0], self_box[1], self_box[2], self_box[3], self_box[7],
                                          other_box[0], other_box[1], other_box[2], other_box[3], other_box[7]);
        if (overlap > threshold) {
            self_box[6] = 0;
            return;
        }
    }
}

// Launches decode_kernel on `stream` with 256 threads per block and enough blocks to cover
// `num_bboxes` threads. The launch is asynchronous; this function does not synchronize.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream) {
    const int threads_per_block = 256;
    const int num_blocks = ceil(num_bboxes / static_cast<float>(threads_per_block));
    decode_kernel<<<num_blocks, threads_per_block, 0, stream>>>(static_cast<float*>(predict), num_bboxes,
                                                                confidence_threshold, parray, max_objects);
}

// Launches nms_kernel on `stream` with one thread per candidate slot (up to `max_objects`).
//   parray        : device buffer laid out as [count, box0, box1, ...]
//   nms_threshold : overlap above which a weaker box is suppressed
//   max_objects   : capacity of `parray` in boxes; nothing is launched when it is <= 0
// The launch is asynchronous; this function does not synchronize.
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    // A non-positive capacity would yield a zero-sized block and a 0/0 grid computation.
    if (max_objects <= 0)
        return;

    const int block = max_objects < 256 ? max_objects : 256;
    // Integer ceil-division: exact for every positive int, unlike the float round trip.
    const int grid = (max_objects + block - 1) / block;
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}

// Launches decode_kernel_obb on `stream` with 256 threads per block and enough blocks to cover
// `num_bboxes` threads. The launch is asynchronous; this function does not synchronize.
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream) {
    const int threads_per_block = 256;
    const int num_blocks = ceil(num_bboxes / static_cast<float>(threads_per_block));
    decode_kernel_obb<<<num_blocks, threads_per_block, 0, stream>>>(static_cast<float*>(predict), num_bboxes,
                                                                    confidence_threshold, parray, max_objects);
}

// Launches nms_kernel_obb on `stream` with one thread per candidate slot (up to `max_objects`).
//   parray        : device buffer laid out as [count, box0, box1, ...]
//   nms_threshold : probiou above which a weaker box is suppressed
//   max_objects   : capacity of `parray` in boxes; nothing is launched when it is <= 0
// The launch is asynchronous; this function does not synchronize.
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    // A non-positive capacity would yield a zero-sized block and a 0/0 grid computation.
    if (max_objects <= 0)
        return;

    const int block = max_objects < 256 ? max_objects : 256;
    // Integer ceil-division: exact for every positive int, unlike the float round trip.
    const int grid = (max_objects + block - 1) / block;
    nms_kernel_obb<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}


================================================
FILE: yolov12-tubro/src/preprocess.cu
================================================
#include "cuda_utils.h"
#include "preprocess.h"

// Staging buffers shared by every cuda_preprocess() call in this translation unit.
// Allocated by cuda_preprocess_init() and released by cuda_preprocess_destroy().
static uint8_t* img_buffer_host = nullptr;    // host side, allocated with cudaMallocHost
static uint8_t* img_buffer_device = nullptr;  // device side, allocated with cudaMalloc

// Resamples an interleaved 3-channel 8-bit image into a planar float image, one thread per
// destination pixel.
//   src            : source image, 3 bytes per pixel
//   src_line_size  : number of bytes between the starts of consecutive source rows
//   src_width/height : source size in pixels
//   dst            : output, three consecutive planes of dst_width * dst_height floats
//   const_value_st : fill value used wherever the sample falls outside the source image
//   d2s            : 2x3 affine matrix (row-major in d2s.value) mapping destination to source coordinates
//   edge           : number of destination pixels; threads with a larger index exit immediately
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    // Unpack the two rows of the destination-to-source affine matrix.
    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Destination pixel handled by this thread, and the source location it maps to.
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // Bilinear interpolation over the 2x2 neighbourhood around (src_x, src_y).
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Each neighbour defaults to the fill colour and is redirected into `src` only when in bounds.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb (swap the first and third channel)
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization (scale byte values by 1/255)
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb (interleaved input, planar output)
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

// Uploads one interleaved 3-channel 8-bit image and letterboxes it into `dst` on the GPU.
//   src                  : host pointer to the image, src_width * src_height * 3 bytes
//   src_width/src_height : source size in pixels
//   dst                  : device pointer receiving 3 planes of dst_width * dst_height floats
//   dst_width/dst_height : target size in pixels
//   stream               : stream on which the upload and the kernel are queued (no synchronization here)
// The image is scaled uniformly to fit the target and centered; uncovered pixels are filled with 128.
// Requires cuda_preprocess_init() to have been called with a large enough `max_image_size`.
// Invalid input (null pointers or non-positive sizes, e.g. an image that failed to load) is
// ignored and leaves `dst` untouched instead of producing a NaN transform.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    if (src == nullptr || dst == nullptr || src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0)
        return;

    int img_size = src_width * src_height * 3;
    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

    // Source-to-destination transform: uniform scale plus centering translation.
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    // The kernel needs the inverse mapping (destination pixel -> source location).
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    // m2x3_d2s normally still wraps d2s.value, in which case the result is already in place;
    // copy only if OpenCV reallocated the output (memcpy on identical ranges is undefined).
    const float* inverted = m2x3_d2s.ptr<float>(0);
    if (inverted != d2s.value)
        memcpy(d2s.value, inverted, sizeof(d2s.value));

    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = ceil(jobs / (float)threads);
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
}

// Preprocesses every image of `img_batch` into consecutive slots of `dst`
// (3 * dst_width * dst_height floats per image), synchronizing `stream` after each image.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int floats_per_image = dst_width * dst_height * 3;
    size_t slot = 0;
    for (cv::Mat& image : img_batch) {
        float* slot_dst = &dst[floats_per_image * slot];
        cuda_preprocess(image.ptr(), image.cols, image.rows, slot_dst, dst_width, dst_height, stream);
        // The staging buffers are shared, so finish this image before the next one overwrites them.
        CUDA_CHECK(cudaStreamSynchronize(stream));
        ++slot;
    }
}

// Allocates the shared staging buffers, each `max_image_size * 3` bytes
// (max_image_size is a pixel count; 3 bytes are reserved per pixel).
void cuda_preprocess_init(int max_image_size) {
    const int buffer_bytes = max_image_size * 3;
    // host-side staging buffer (cudaMallocHost)
    CUDA_CHECK(cudaMallocHost(reinterpret_cast<void**>(&img_buffer_host), buffer_bytes));
    // device-side staging buffer (cudaMalloc)
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&img_buffer_device), buffer_bytes));
}

// Releases the staging buffers allocated by cuda_preprocess_init().
// Safe to call more than once, or without a prior init: each pointer is freed only when set
// and reset to nullptr afterwards so it can never be freed twice or used while dangling.
void cuda_preprocess_destroy() {
    if (img_buffer_device != nullptr) {
        CUDA_CHECK(cudaFree(img_buffer_device));
        img_buffer_device = nullptr;
    }
    if (img_buffer_host != nullptr) {
        CUDA_CHECK(cudaFreeHost(img_buffer_host));
        img_buffer_host = nullptr;
    }
}


================================================
FILE: yolov12-tubro/yolov12_cls.cpp
================================================
#include "calibrator.h"
#include "config.h"
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "utils.h"

#include <chrono>
#include <cmath>
#include <iostream>
#include <numeric>
#include <opencv2/opencv.hpp>

using namespace nvinfer1;

// Logger instance handed to the TensorRT builder and runtime created in this file.
static Logger gLogger;
// Number of floats the network outputs per image (one value per class).
const static int kOutputSize = kClsNumClass;

// CPU preprocessing for classification: for every image, crop the centered square, resize it to
// dst_width x dst_height, convert BGR to RGB, scale to [0, 1] and write it to `output` in planar
// (channel, row, column) order. Image b occupies 3 * dst_height * dst_width floats starting at
// output + b * 3 * dst_height * dst_width.
void batch_preprocess(std::vector<cv::Mat>& imgs, float* output, int dst_width = 224, int dst_height = 224) {
    const int plane_size = dst_height * dst_width;
    for (size_t b = 0; b < imgs.size(); b++) {
        // Centered square crop whose side is the shorter image dimension.
        const int side = std::min(imgs[b].rows, imgs[b].cols);
        const int crop_top = (imgs[b].rows - side) / 2;
        const int crop_left = (imgs[b].cols - side) / 2;
        cv::Mat img = imgs[b](cv::Rect(crop_left, crop_top, side, side));

        cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR);
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
        img.convertTo(img, CV_32F, 1 / 255.0);

        std::vector<cv::Mat> planes(3);
        cv::split(img, planes);

        // CHW format: one full plane per channel, rows stored back to back.
        float* image_out = output + b * 3 * plane_size;
        for (int c = 0; c < 3; ++c) {
            float* plane_out = image_out + c * plane_size;
            for (int row = 0; row < dst_height; ++row) {
                for (int col = 0; col < dst_width; ++col) {
                    plane_out[row * dst_width + col] = planes[c].at<float>(row, col);
                }
            }
        }
    }
}

// Numerically stable softmax over the first `n` values of `prob`.
//   prob : pointer to `n` raw scores (logits)
//   n    : number of scores
// Returns a vector of `n` probabilities that sum to 1; empty when `prob` is null or n <= 0.
// The maximum score is subtracted before exponentiating so that large logits cannot overflow
// expf() and turn the whole result into NaN.
std::vector<float> softmax(float* prob, int n) {
    std::vector<float> res;
    if (prob == nullptr || n <= 0) {
        return res;
    }
    res.reserve(n);

    float max_logit = prob[0];
    for (int i = 1; i < n; i++) {
        if (prob[i] > max_logit) {
            max_logit = prob[i];
        }
    }

    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        // Shifted exponent is always <= 0, so every term is in (0, 1] and the sum is >= 1.
        float t = expf(prob[i] - max_logit);
        res.push_back(t);
        sum += t;
    }
    for (int i = 0; i < n; i++) {
        res[i] /= sum;
    }
    return res;
}

// Returns the indices of the (at most) `k` largest values of `vec`, best first.
// Fewer than `k` indices are returned when `vec` is shorter than `k`.
std::vector<int> topk(const std::vector<float>& vec, int k) {
    // Sort a list of positions by the value they refer to, descending.
    std::vector<size_t> order(vec.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&vec](size_t lhs, size_t rhs) { return vec[lhs] > vec[rhs]; });

    const int keep = std::min<int>(vec.size(), k);

    std::vector<int> best;
    if (keep > 0) {
        best.assign(order.begin(), order.begin() + keep);
    }
    return best;
}

// Reads a text file and returns its lines (one class name per line, in file order).
// If the file cannot be opened an error is printed and assert(0) fires; in builds where
// asserts are disabled an empty list is returned.
std::vector<std::string> read_classes(std::string file_name) {
    std::ifstream ifs(file_name, std::ios::in);
    if (!ifs.is_open()) {
        std::cerr << file_name << " is not found, pls refer to README and download it." << std::endl;
        assert(0);
    }

    std::vector<std::string> classes;
    for (std::string line; std::getline(ifs, line);) {
        classes.push_back(line);
    }
    ifs.close();
    return classes;
}

// Parses the command line into the output parameters. Two forms are accepted:
//   -s [.wts] [.engine] [n/s/m/l/x] : build mode; fills wts, engine, gd, gw, max_channels and type
//                                     from the first letter of the model-size argument
//   -d [.engine] [image dir]        : inference mode; fills engine and img_dir
// Returns true when the arguments match one of the two forms, false otherwise.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw,
                std::string& img_dir, std::string& type, int& max_channels) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);

    if (mode == "-s" && argc == 5) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string net(argv[4]);
        // Only the first character of the size argument selects the model variant.
        switch (net[0]) {
            case 'n':
                gd = 0.50;
                gw = 0.25;
                max_channels = 1024;
                type = "n";
                break;
            case 's':
                gd = 0.50;
                gw = 0.50;
                max_channels = 1024;
                type = "s";
                break;
            case 'm':
                gd = 0.50;
                gw = 1.00;
                max_channels = 512;
                type = "m";
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                max_channels = 512;
                type = "l";
                break;
            case 'x':
                gd = 1.0;
                gw = 1.50;
                max_channels = 512;
                type = "x";
                break;
            default:
                return false;
        }
        return true;
    }

    if (mode == "-d" && argc == 4) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        return true;
    }

    return false;
}

// Allocates the device and host buffers for one batch of classification inference.
//   engine             : engine expected to expose exactly two bindings (input at 0, output at 1)
//   gpu_input_buffer   : receives a device buffer of kBatchSize * 3 * kClsInputH * kClsInputW floats
//   gpu_output_buffer  : receives a device buffer of kBatchSize * kOutputSize floats
//   cpu_input_buffer   : receives a new[]-allocated host buffer matching the input size
//   output_buffer_host : receives a new[]-allocated host buffer matching the output size
// The caller owns all four buffers (cudaFree for the device ones, delete[] for the host ones).
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer,
                     float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);
    // Binding indices are looked up by tensor name; the rest of the program relies on
    // input == 0 and output == 1.
    const int input_binding = engine->getBindingIndex(kInputTensorName);
    const int output_binding = engine->getBindingIndex(kOutputTensorName);
    assert(input_binding == 0);
    assert(output_binding == 1);

    // Device-side buffers
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(gpu_input_buffer),
                          kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(gpu_output_buffer), kBatchSize * kOutputSize * sizeof(float)));

    // Host-side buffers
    *cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW];
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}

// Runs one batch through the engine and blocks until the results are on the host.
//   context   : execution context of the deserialized engine
//   stream    : CUDA stream used for the copies and the inference
//   buffers   : device bindings; buffers[0] is the input, buffers[1] the output
//   input     : host input, batchSize * 3 * kClsInputH * kClsInputW floats
//   output    : host output, batchSize * kOutputSize floats
//   batchSize : number of images to copy in and out
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output,
           int batchSize) {
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    // enqueueV2 reports failure through its return value; surface it instead of silently
    // returning whatever happens to be in the output buffer.
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "enqueueV2 failed, output is not valid." << std::endl;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                               stream));
    // Asynchronous errors are reported at synchronization, so check it like every other CUDA call.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

void serialize_engine(float& gd, float& gw, std::string& wts_name, std::string& engine_name, std::string& type,
                      int max_channels) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    // Create model to populate the network, then set the outputs and create an engine
    IHostMemory* serialized_engine = nullptr;
    serialized_engine = buildEngineYolov12Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw, type, max_channels);
    assert(serialized_engine);
    // Save engine to file
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "Could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    // Close everything down
    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

int main(int argc, char** argv) {
    // yolov12-cls -s ../models/yolov12n-cls.wts ../models/yolov12-cls.fp32.trt n
    // yolov12-cls -d ../models/yolov12n-cls.fp32.trt ../images
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    float gd = 0.0f, gw = 0.0f;
    std::string img_dir;
    std::string type;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir, type, max_channels)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov12-cls -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov12-cls -d [.engine] ../images  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(gd, gw, wts_name, engine_name, type, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* cpu_input_buffer = nullptr;
    float* output_buffer_host = nullptr;
    prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host);

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // Read imagenet labels
    auto classes = read_classes("imagenet_classes.txt");

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }

        // Preprocess
        batch_preprocess(img_batch, cpu_input_buffer);

        // Run inference
        auto start = std::chrono::system_clock::now();
        infer(*context, stream, (void**)device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize);
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;

        // Postprocess and get top-k result
        for (size_t b = 0; b < img_name_batch.size(); b++) {
            float* p = &output_buffer_host[b * kOutputSize];
            auto res = softmax(p, kOutputSize);
            auto topk_idx = topk(res, 3);
            std::cout << img_name_batch[b] << std::endl;
            for (auto idx : topk_idx) {
                std::cout << "  " << classes[idx] << " " << res[idx] << std::endl;
            }
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    delete[] cpu_input_buffer;
    delete[] output_buffer_host;
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
    return 0;
}


================================================
FILE: yolov12-tubro/yolov12_cls_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import os
import shutil
import sys
import threading
import time
import cv2
import numpy as np
import torch
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into
                 consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory to walk
    return:
        list of lists of file paths; the last batch may hold fewer than
        batch_size paths
    """
    batches = []
    current = []
    for dirpath, _dirnames, filenames in os.walk(img_dir):
        for filename in filenames:
            # Close the running batch once it is full, before adding the next path.
            if len(current) == batch_size:
                batches.append(current)
                current = []
            current.append(os.path.join(dirpath, filename))
    if current:
        batches.append(current)
    return batches


class YoLov12TRT(object):
    """
    description: A YOLOv12 classification class that wraps TensorRT ops,
                 preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and allocate
                     page-locked host buffers plus device buffers for every binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []
        self.mean = (0.485, 0.456, 0.406)
        self.std = (0.229, 0.224, 0.225)

        for binding in engine:
            print('binding:', binding, engine.get_binding_shape(binding))
            # The leading dimension of the binding shape is used as the batch size.
            self.batch_size = engine.get_binding_shape(binding)[0]
            size = trt.volume(engine.get_binding_shape(
                binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine and draw the
                     predicted class name on each image.
        param:
            raw_image_generator: iterable yielding at most batch_size BGR images
        return:
            batch_image_raw: list of the input images, annotated in place
            use_time:        float, seconds spent on transfers and inference
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess. Slots not filled by a partial batch stay zero.
            batch_image_raw = []
            batch_input_image = np.zeros(
                shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32)
            for i, image_raw in enumerate(raw_image_generator):
                batch_image_raw.append(image_raw)
                input_image = self.preprocess_cls_image(image_raw)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it,
            # even when preprocessing or inference raised.
            self.ctx.pop()
        output = host_outputs[0]
        # Do postprocess once for the whole batch, then annotate only the images that
        # were actually supplied (the last batch may be smaller than batch_size).
        classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls(output)
        for i, image_raw in enumerate(batch_image_raw):
            cv2.putText(image_raw, str(
                classes_ls[i]), (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
            print(classes_ls[i], predicted_conf_ls[i])
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context created in __init__.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of image file paths
        return:
            generator yielding the result of cv2.imread for each path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        param:
            image_path_batch: unused
        return:
            generator yielding batch_size all-zero uint8 images of the input size
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_cls_image(self, raw_bgr_image, dst_width=224, dst_height=224):

        """
            description: Convert BGR image to RGB,
                         crop the center square frame,
                         resize it to target size, normalize to [0,1],
                         transform to NCHW format.
            param:
                raw_bgr_image: numpy array, raw BGR image
                dst_width: int, target image width
                dst_height: int, target image height
            return:
                batch_data: float32 numpy array of shape
                            (1, 1, 3, dst_height, dst_width) holding the processed
                            image (two leading singleton axes are added)
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        # Crop the center square frame
        m = min(h, w)
        top = (h - m) // 2
        left = (w - m) // 2
        image = raw_bgr_image[top:top + m, left:left + m]

        # Resize the square crop to the target size
        image = cv2.resize(image, (dst_width, dst_height), interpolation=cv2.INTER_LINEAR)

        # Convert BGR to RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Normalize to [0,1]
        image = image.astype(np.float32) / 255.0

        # HWC to CHW format
        image = image.transpose(2, 0, 1)

        # CHW to NCHW format (add batch dimension)
        image = np.expand_dims(image, axis=0)

        # Convert the image to row-major order, also known as "C order"
        image = np.ascontiguousarray(image)

        # A second leading axis is added here; callers copy the result into a
        # (3, H, W) slot, which drops the leading singleton axes.
        batch_data = np.expand_dims(image, axis=0)

        return batch_data

    def postprocess_cls(self, output_data):
        """
        description: Turn the raw network output into the best class per image.
        param:
            output_data: flat numpy array holding batch_size rows of class scores
        return:
            classes_ls:        list of class names (looked up in the module-level `classes`)
            predicted_conf_ls: list of softmax confidences of the best class
            category_id_ls:    list of best class ids
        """
        classes_ls = []
        predicted_conf_ls = []
        category_id_ls = []
        output_data = output_data.reshape(self.batch_size, -1)
        output_data = torch.Tensor(output_data)
        p = torch.nn.functional.softmax(output_data, dim=1)
        # Ask for at most as many entries as there are classes so that models with
        # fewer than three classes do not make topk fail.
        score, index = torch.topk(p, min(3, p.shape[1]))
        for ind in range(index.shape[0]):
            input_category_id = index[ind][0].item()  # 716
            category_id_ls.append(input_category_id)
            predicted_confidence = score[ind][0].item()
            predicted_conf_ls.append(predicted_confidence)
            classes_ls.append(classes[input_category_id])
        return classes_ls, predicted_conf_ls, category_id_ls


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and writes
                 each returned image into the 'output' directory under its
                 original file name.
    """

    def __init__(self, yolov12_wrapper, image_path_batch):
        super().__init__()
        self.yolov12_wrapper = yolov12_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov12_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(raw_images)
        for idx in range(len(self.image_path_batch)):
            filename = os.path.basename(self.image_path_batch[idx])
            save_name = os.path.join('output', filename)
            # Save image
            cv2.imwrite(save_name, batch_image_raw[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(
            self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference pass on all-zero images and prints
                 the image shape and the elapsed time.
    """

    def __init__(self, yolov12_wrapper):
        super().__init__()
        self.yolov12_wrapper = yolov12_wrapper

    def run(self):
        wrapper = self.yolov12_wrapper
        zero_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(zero_images)
        first_shape = batch_image_raw[0].shape
        print(
            'warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


# Class names indexed by the category id predicted by the network; read by
# YoLov12TRT.postprocess_cls. The list must have one entry per engine output class.
# To use labels stored in a file (e.g. ImageNet) instead, load them like this:
# with open("imagenet_classes.txt") as f:
#     classes = [line.strip() for line in f.readlines()]

classes = ["daisy", "dandelion", "rose", "sunflower", "tulip"]


if __name__ == "__main__":
    # Engine to load; the first command-line argument overrides the default path.
    engine_file_path = "build/yolov12n-cls-5.engine"
    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]

    # Start from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov12TRT instance
    yolov12_wrapper = YoLov12TRT(engine_file_path)
    try:
        print('batch size is', yolov12_wrapper.batch_size)

        image_dir = "images"
        image_path_batches = get_img_path_batches(
            yolov12_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread and run to completion.
        for i in range(10):
            thread1 = warmUpThread(yolov12_wrapper)
            thread1.start()
            thread1.join()

        # One inference thread per batch, run one after another.
        for batch in image_path_batches:
            thread1 = inferThread(yolov12_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance
        yolov12_wrapper.destroy()


================================================
FILE: yolov12-tubro/yolov12_det.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger instance handed to the TensorRT builder and runtime created in this file.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in the network output per image: kMaxNumOutputBbox Detection records
// expressed in floats, plus one extra float (presumably the detection count - confirm
// against the output plugin).
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

void serialize_engine(std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels,
                      std::string& type) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    serialized_engine = buildEngineYolov12Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Allocates the buffers needed for detection inference.
//   engine               : engine expected to expose exactly two bindings (input at 0, output at 1)
//   input_buffer_device  : receives a device buffer of kBatchSize * 3 * kInputH * kInputW floats
//   output_buffer_device : receives a device buffer of kBatchSize * kOutputSize floats
//   output_buffer_host   : ("c" mode only) receives a new[]-allocated host copy of the output
//   decode_ptr_host      : ("g" mode only) receives a new[]-allocated host buffer for decoded boxes
//   decode_ptr_device    : ("g" mode only) receives the matching device buffer
//   cuda_post_process    : "c" for CPU post-processing, "g" for GPU post-processing
// GPU post-processing supports a batch size of 1 only; any other batch size terminates the
// program with a failure exit status.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            // This is an error path: report failure to the calling process, not success.
            exit(1);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}

/**
 * Runs one inference on the given stream and copies the results back to the host.
 *
 * @param context            execution context used to enqueue the inference
 * @param stream             CUDA stream on which all work is queued and then synchronized
 * @param buffers            engine bindings; buffers[1] is read as the raw network output
 * @param output             host destination for batchsize * kOutputSize floats ("c" mode)
 * @param batchsize          number of images whose output is copied back ("c" mode)
 * @param decode_ptr_host    host destination for the decoded/NMS-filtered boxes ("g" mode)
 * @param decode_ptr_device  device scratch buffer for cuda_decode / cuda_nms ("g" mode)
 * @param model_bboxes       number of candidate boxes handed to cuda_decode ("g" mode)
 * @param cuda_post_process  "c" for CPU post-processing, "g" for GPU post-processing
 *
 * The function returns only after the stream has been synchronized, so the host buffers
 * are valid on return. For any other mode value only the inference itself is run.
 */
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    const bool cpu_post_process = (cuda_post_process == "c");
    const bool gpu_post_process = (cuda_post_process == "g");

    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::steady_clock::now();
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        // Previously ignored: make a failed enqueue visible instead of silently producing stale output.
        std::cerr << "enqueueV2 failed, inference results are not valid" << std::endl;
    }
    if (cpu_post_process) {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
    } else if (gpu_post_process) {
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
    }

    // Everything above is only queued on the stream. Wait for completion BEFORE reading the
    // clock, otherwise the reported time covers just the enqueue calls.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto end = std::chrono::steady_clock::now();
    const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

    if (cpu_post_process) {
        std::cout << "inference time: " << elapsed_ms << "ms" << std::endl;
    } else if (gpu_post_process) {
        std::cout << "inference and gpu postprocess time: " << elapsed_ms << "ms" << std::endl;
    }
}

/**
 * Parses the command line of yolov12_det.
 *
 * Two invocations are accepted, both with exactly four arguments after the program name:
 *   -s <wts> <engine> <n|s|m|l|x>   build an engine; only the first character of the
 *                                   size argument is inspected
 *   -d <engine> <image_dir> <c|g>   run inference with CPU ("c") or GPU ("g") post-processing
 *
 * @param argc, argv          arguments as received by main
 * @param wts                 set to the weights path in -s mode
 * @param engine              set to the engine path in both modes
 * @param img_dir             set to the image directory in -d mode
 * @param type                set to the one-letter model size in -s mode
 * @param cuda_post_process   set to the post-processing mode in -d mode
 * @param gd, gw              set to the depth / width multipliers of the chosen size in -s mode
 * @param max_channels        set to the channel cap of the chosen size in -s mode
 * @return true when the arguments form one of the two supported invocations
 */
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    // Both supported forms have the same length, so anything else can be rejected up front.
    if (argc != 5)
        return false;

    const std::string mode(argv[1]);
    if (mode == "-s") {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        auto sub_type = std::string(argv[4]);

        if (sub_type[0] == 'n') {
            gd = 0.50;
            gw = 0.25;
            max_channels = 1024;
            type = "n";
        } else if (sub_type[0] == 's') {
            gd = 0.50;
            gw = 0.50;
            max_channels = 1024;
            type = "s";
        } else if (sub_type[0] == 'm') {
            gd = 0.50;
            gw = 1.00;
            max_channels = 512;
            type = "m";
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
            type = "l";
        } else if (sub_type[0] == 'x') {
            gd = 1.0;
            gw = 1.50;
            max_channels = 512;
            type = "x";
        } else {
            return false;
        }
        return true;
    }

    if (mode == "-d") {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
        // Only "c" and "g" are handled by the inference path; reject anything else here so
        // the caller prints the usage text instead of running with unallocated buffers.
        return cuda_post_process == "c" || cuda_post_process == "g";
    }

    return false;
}

int main(int argc, char** argv) {
    // yolov12_det -s ../models/yolov12n.wts ../models/yolov12n.fp32.trt n
    // yolov12_det -d ../models/yolov12n.fp32.trt ../images c
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string cuda_post_process;
    std::string type;
    int model_bboxes;
    float gd = 0, gw = 0;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov12_det -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov12_det -d [.engine] ../images  [c/g]// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, gd, gw, max_channels, type);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        // Save the first 100 values of output_buffer_host, one per line
        //        std::ofstream out("../models/output.txt");
        //        for (int j = 0; j < 100; j++) {
        //            out << output_buffer_host[j] << std::endl;
        //        }
        //        out.close();
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            //Process gpu decode and nms results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov12-tubro/yolov12_det_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Minimum confidence a detection needs to survive filtering
# (passed as conf_thres to YoLo12TRT.non_max_suppression).
CONF_THRESH = 0.5
# IoU above which a lower-scored box is suppressed (passed as nms_thres).
IOU_THRESHOLD = 0.4
# Number of values each section contributes to one detection row of the engine output.
# post_process() reshapes the output with a row width equal to the sum of the four.
POSE_NUM = 17 * 3  # presumably 17 keypoints x 3 values each -- confirm against the C++ Detection struct
DET_NUM = 6  # post_process reads columns 0-3 as the box, 4 as the confidence, 5 as the class id
SEG_NUM = 32  # presumably the mask coefficients -- confirm against the C++ Detection struct
OBB_NUM = 1  # presumably the rotation angle -- confirm against the C++ Detection struct


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into
                 consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory to walk
    return:
        a list of lists of paths; every batch holds batch_size paths except
        possibly the last one, and no empty trailing batch is produced
    """
    # Lazily produce every file path in the order os.walk reports them.
    every_path = (
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(img_dir)
        for file_name in file_names
    )

    batches = []
    current = []
    for path in every_path:
        # Close the running batch only when another path actually arrives,
        # so a full final batch is flushed once, after the loop.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag
                 above its top-left corner, directly onto img.
    param:
        x:      a box like [x1,y1,x2,y2]; each value is truncated to int
        img:    an opencv image object, modified in place
        color:  color of the rectangle, such as (0,255,0); a random color
                is picked when omitted
        label:  str, text drawn above the box; nothing is drawn when empty
        line_thickness: int; derived from the image size when omitted
    return:
        no return

    """
    # line/font thickness
    if line_thickness:
        stroke = line_thickness
    else:
        stroke = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=stroke, lineType=cv2.LINE_AA)

    if not label:
        return

    font_stroke = max(stroke - 1, 1)  # font thickness
    text_size = cv2.getTextSize(label, 0, fontScale=stroke / 3, thickness=font_stroke)[0]
    # The tag grows upwards and to the right from the box's top-left corner.
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        stroke / 3,
        [225, 255, 255],
        thickness=font_stroke,
        lineType=cv2.LINE_AA,
    )


class YoLo12TRT(object):
    """
    description: A YOLOv12 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the engine and
                     allocate one page-locked host buffer and one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                binding_shape = engine.get_binding_shape(binding)
                print('bingding:', binding, binding_shape)
                # Overwritten on every iteration: the value of the last binding wins.
                self.batch_size = binding_shape[0]
                size = trt.volume(binding_shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    self.input_w = binding_shape[-1]
                    self.input_h = binding_shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)

            # Store
            self.stream = stream
            self.context = context
            self.engine = engine
            self.host_inputs = host_inputs
            self.cuda_inputs = cuda_inputs
            self.host_outputs = host_outputs
            self.cuda_outputs = cuda_outputs
            self.bindings = bindings
            self.det_output_length = host_outputs[0].shape[0]
        except Exception:
            # make_context() made the new context current; since destroy() will never be
            # called on a half-built instance, release it here before propagating.
            self.ctx.pop()
            raise

    def infer(self, raw_image_generator):
        """
        description: Preprocess the given images, run the engine once and draw the
                     detections onto the original images.
        param:
            raw_image_generator: iterable of BGR images, at most batch_size of them
        return:
            batch_image_raw: list of the input images with boxes drawn on them
            use_time:        seconds spent on host-to-device copy, inference and
                             device-to-host copy
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that the slots of a partially filled batch hold defined values.
            batch_input_image = np.zeros(
                shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32
            )
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in `finally` so a failure above cannot leave the stack unbalanced.
            self.ctx.pop()
        output = host_outputs[0]
        # Do postprocess, only for the images that were actually supplied: the last
        # batch of a directory may hold fewer than batch_size images.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Release the CUDA context created in __init__.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read the images of a batch one by one
        param:
            image_path_batch: iterable of image paths
        return:
            a generator yielding the result of cv2.imread for every path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup: batch_size all-black images of the
                     network input size. image_path_batch is ignored.
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: the image as a BGR array of shape (h, w, 3)
        return:
            image:  the processed image, float32 with shape (1, 3, input_h, input_w)
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w = image_raw.shape[:2]
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: fill it and pad top/bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: fill it and pad left/right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map nx4 boxes from network-input coordinates back to the original
                        image. Despite the name no center/size conversion happens: columns
                        0 and 2 are handled as x coordinates and columns 1 and 3 as
                        y coordinates; the padding offset added by preprocess_image is
                        removed and the result is divided by the resize ratio.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [x1, y1, x2, y2] in input coordinates
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2] in original coordinates
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded top/bottom: only the y coordinates carry an offset.
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Image was padded left/right: only the x coordinates carry an offset.
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array: the number of boxes followed by one fixed-width
                        row per box, whose first six values are the box (4), conf and cls_id
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a numpy, each element is the score corresponding to box
            result_classid: finally classid, a numpy, each element is the classid corresponding to box
        """
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the num of boxes detected
        num = int(output[0])
        print("There are {} detections in the picture!!!".format(num))
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 counts both edge pixels as inside the box)
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The tiny epsilon guards against a division by zero.
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, one row each; columns 0-3 are the box in network-input
                        coordinates, column 4 the confidence, column 5 the class id.
                        Further columns are carried along untouched.
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, rows laid out like the input with the box mapped to
                   original-image coordinates; an empty array when nothing is kept
        """
        # Get the boxes that score >= conf_thres
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the boxes from network-input coordinates to original-image coordinates
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            # The class id lives in column 5. Rows handed over by post_process are wider
            # than six values, so the last column is NOT the class id.
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and writes
                 every resulting image into the 'output' directory under its
                 original file name.
    """

    def __init__(self, yolo11_wrapper, image_path_batch):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolo11_wrapper
        paths = self.image_path_batch
        frames = wrapper.get_raw_image(paths)
        annotated, elapsed = wrapper.infer(frames)
        for idx in range(len(paths)):
            # Keep only the file name; the directory part of the source path is dropped.
            target = os.path.join('output', os.path.basename(paths[idx]))
            # Save image
            cv2.imwrite(target, annotated[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(paths, elapsed * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs a single inference on the wrapper's all-zero
                 warm-up images and prints the shape of the first result and
                 the time taken.
    """

    def __init__(self, yolo11_wrapper):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper

    def run(self):
        wrapper = self.yolo11_wrapper
        blank_frames = wrapper.get_raw_image_zeros()
        images, elapsed = wrapper.infer(blank_frames)
        first_shape = images[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed * 1000))


if __name__ == "__main__":
    # Usage: python yolov12_det_trt.py [engine_file] [plugin_library]
    # Both arguments are optional and fall back to the build/ defaults below.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "build/yolov12n-det.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "build/libmyplugins.so"

    # The custom plugin library has to be loaded before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # Class names indexed by class id; read as a module global by YoLo12TRT.infer.
    # Only a single class is configured here. The COCO names are kept below for
    # engines trained on COCO:
    categories = ["object"]

    # categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    #               "traffic light",
    #               "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
    #               "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
    #               "frisbee",
    #               "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
    #               "surfboard",
    #               "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    #               "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    #               "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
    #               "cell phone",
    #               "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
    #               "teddy bear",
    #               "hair drier", "toothbrush"]

    # Start from an empty output directory on every run.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLo12TRT instance
    yolov12_wrapper = YoLo12TRT(engine_file_path)
    try:
        print('batch size is', yolov12_wrapper.batch_size)

        image_dir = "images"
        image_path_batches = get_img_path_batches(yolov12_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread and run to completion.
        for _ in range(10):
            warm_up = warmUpThread(yolov12_wrapper)
            warm_up.start()
            warm_up.join()

        # One inference thread per batch, run one after another.
        for batch in image_path_batches:
            worker = inferThread(yolov12_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov12_wrapper.destroy()


================================================
FILE: yolov12-tubro/yolov12_seg.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger handed to the TensorRT builder and runtime created in this file.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in the detection output of one image: one leading float plus
// kMaxNumOutputBbox entries, each the size of a Detection without 51 of its floats.
// NOTE(review): the 51 excluded floats are presumably the pose keypoints (17 x 3),
// which the segmentation output does not carry -- confirm against the Detection struct.
const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 51) / sizeof(float) + 1;
// Number of floats in the "proto" output of one image: 32 planes at a quarter of the
// input resolution in each dimension.
const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4);

/**
 * Converts a box given in network-input coordinates into an integer rectangle on a
 * grid that is `scale` times coarser.
 *
 * bbox is read as [left, top, width, height]. The left/top edges are clamped to 0 and
 * the right/bottom edges to kInputW/kInputH before the division; all four values are
 * truncated towards zero when the rectangle is built.
 */
static cv::Rect get_downscale_rect(float bbox[4], float scale) {
    float x_min = bbox[0];
    float y_min = bbox[1];
    float x_max = bbox[0] + bbox[2];
    float y_max = bbox[1] + bbox[3];

    // Clamp to the network input area.
    if (x_min < 0) {
        x_min = 0;
    }
    if (y_min < 0) {
        y_min = 0;
    }
    if (x_max > kInputW) {
        x_max = kInputW;
    }
    if (y_max > kInputH) {
        y_max = kInputH;
    }

    // Move to the coarser grid.
    x_min /= scale;
    y_min /= scale;
    x_max /= scale;
    y_max /= scale;

    const int rect_x = int(x_min);
    const int rect_y = int(y_min);
    const int rect_w = int(x_max - x_min);
    const int rect_h = int(y_max - y_min);
    return cv::Rect(rect_x, rect_y, rect_w, rect_h);
}

std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets) {

    std::vector<cv::Mat> masks;
    for (size_t i = 0; i < dets.size(); i++) {

        cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1);
        auto r = get_downscale_rect(dets[i].bbox, 4);

        for (int x = r.x; x < r.x + r.width; x++) {
            for (int y = r.y; y < r.y + r.height; y++) {
                float e = 0.0f;
                for (int j = 0; j < 32; j++) {
                    e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
                }
                e = 1.0f / (1.0f + expf(-e));
                mask_mat.at<float>(y, x) = e;
            }
        }
        cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH));
        masks.push_back(mask_mat);
    }
    return masks;
}

void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& type, float& gd, float& gw,
                      int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    serialized_engine = buildEngineYolov12Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

/**
 * Allocates the device buffers for the engine input and its two outputs, plus the host
 * (and, for GPU post-processing, device) buffers that receive the results.
 *
 * @param engine                    engine whose three bindings (input 0, detection output 1,
 *                                  "proto" output 2) are checked
 * @param input_buffer_device       receives a device buffer of kBatchSize * 3 * kInputH * kInputW floats
 * @param output_buffer_device      receives a device buffer of kBatchSize * kOutputSize floats
 * @param output_seg_buffer_device  receives a device buffer of kBatchSize * kOutputSegSize floats
 * @param output_buffer_host        receives a host buffer of kBatchSize * kOutputSize floats ("c" mode only)
 * @param output_seg_buffer_host    receives a host buffer of kBatchSize * kOutputSegSize floats ("c" mode only)
 * @param decode_ptr_host           receives a host buffer of 1 + kMaxNumOutputBbox * bbox_element floats
 *                                  ("g" mode only)
 * @param decode_ptr_device         receives a device buffer of the same size as decode_ptr_host ("g" mode only)
 * @param cuda_post_process         "c" for CPU post-processing, "g" for GPU post-processing
 *
 * The process is terminated with a failure status when the mode is unknown or when GPU
 * post-processing is requested with kBatchSize > 1. Pointers that do not belong to the
 * selected mode are left untouched.
 */
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host,
                    float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) {
    assert(engine->getNbBindings() == 3);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    const int outputIndex_seg = engine->getBindingIndex("proto");

    assert(inputIndex == 0);
    assert(outputIndex == 1);
    assert(outputIndex_seg == 2);
    // The indices are only consumed by the asserts above; keep NDEBUG builds warning-free.
    (void)inputIndex;
    (void)outputIndex;
    (void)outputIndex_seg;

    // Validate the requested mode before allocating anything, so that a bad request
    // fails immediately instead of leaving the caller with null result buffers.
    const bool cpu_post_process = (cuda_post_process == "c");
    const bool gpu_post_process = (cuda_post_process == "g");
    if (!cpu_post_process && !gpu_post_process) {
        std::cerr << "Unknown post process mode '" << cuda_post_process << "', expected 'c' or 'g'" << std::endl;
        exit(EXIT_FAILURE);
    }
    if (gpu_post_process && kBatchSize > 1) {
        std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
        // This is an error path: report failure to the calling shell.
        exit(EXIT_FAILURE);
    }

    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float)));

    if (cpu_post_process) {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
        *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize];
    } else {
        // Allocate memory for decode_ptr_host and its device counterpart
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}

// Runs one inference pass on the given stream and fetches the results.
//
// Parameters:
//   context            execution context used to enqueue the network
//   stream             CUDA stream every operation is issued on
//   buffers            binding array; buffers[1] is read as the detection output, buffers[2] as the seg output
//   output, output_seg host destinations for the two engine outputs (used in "c" mode only)
//   batchsize          number of samples whose outputs are copied back in "c" mode
//   decode_ptr_host / decode_ptr_device
//                      host/device buffers for decoded boxes (used in "g" mode only)
//   model_bboxes       number of candidate boxes passed to cuda_decode
//   cuda_post_process  "c" copies the raw outputs to the host, "g" decodes and runs NMS on the GPU;
//                      any other value only enqueues the network
//
// The function returns after the stream has been synchronized. The elapsed time is printed for the
// "c" and "g" modes.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg,
           int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes,
           std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueueV2(buffers, stream, nullptr);

    // Label of the timing line to print once the stream has drained; stays null for unknown modes,
    // which print nothing.
    const char* timing_label = nullptr;
    if (cuda_post_process == "c") {

        std::cout << "kOutputSize:" << kOutputSize << std::endl;
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        std::cout << "kOutputSegSize:" << kOutputSegSize << std::endl;
        CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream));
        timing_label = "inference time: ";
    } else if (cuda_post_process == "g") {
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        timing_label = "inference and gpu postprocess time: ";
    }

    // Everything above was issued on the stream and may still be in flight. Wait for it before
    // taking the end timestamp, otherwise the reported time excludes the queued GPU work.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (timing_label != nullptr) {
        auto end = std::chrono::system_clock::now();
        std::cout << timing_label << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    }
}

// Parses the command line.
//
// Two invocations are accepted:
//   -s <wts> <engine> <n|s|m|l|x>            fills wts, engine, type, gd, gw and max_channels
//   -d <engine> <img_dir> <c|g> <labels>     fills engine, img_dir, cuda_post_process and labels_filename
//
// Only the first character of the model sub-type is inspected. Returns true when the arguments match
// one of the two forms, false otherwise. When the sub-type is not recognised, wts and engine have
// already been assigned by the time false is returned.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                std::string& cuda_post_process, std::string& labels_filename, float& gd, float& gw, int& max_channels) {
    if (argc < 4) {
        return false;
    }

    const std::string mode(argv[1]);

    if (mode == "-s" && argc == 5) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string sub_type(argv[4]);

        // Depth multiple, width multiple and channel cap for each model scale.
        switch (sub_type[0]) {
            case 'n':
                gd = 0.50;
                gw = 0.25;
                max_channels = 1024;
                type = "n";
                break;
            case 's':
                gd = 0.50;
                gw = 0.50;
                max_channels = 1024;
                type = "s";
                break;
            case 'm':
                gd = 0.50;
                gw = 1.00;
                max_channels = 512;
                type = "m";
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                max_channels = 512;
                type = "l";
                break;
            case 'x':
                gd = 1.0;
                gw = 1.50;
                max_channels = 512;
                type = "x";
                break;
            default:
                return false;
        }
        return true;
    }

    if (mode == "-d" && argc == 6) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
        labels_filename = std::string(argv[5]);
        return true;
    }

    return false;
}

int main(int argc, char** argv) {
    // yolo11_seg -s ../models/yolo11n-seg.wts ../models/yolo11n-seg.fp32.trt n
    // yolo11_seg -d ../models/yolo11n-seg.fp32.trt ../images c coco.txt
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string type;
    std::string cuda_post_process;
    std::string labels_filename = "coco.txt";
    int model_bboxes;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, labels_filename, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolo11_seg -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolo11_seg -d [.engine] ../images  [c/g] coco_file// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, type, gd, gw, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[3];
    float* output_buffer_host = nullptr;
    float* output_seg_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    std::unordered_map<int, std::string> labels_map;
    read_labels(labels_filename, labels_map);
    assert(kNumClass == labels_map.size());

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host,
                   &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process);

    // // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize,
              decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
            for (size_t b = 0; b < img_batch.size(); b++) {
                auto& res = res_batch[b];
                cv::Mat img = img_batch[b];
                auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res);
                draw_mask_bbox(img, res, masks, labels_map);
                cv::imwrite("_" + img_name_batch[b], img);
            }
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            // batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
            // todo seg in gpu
            std::cerr << "seg_postprocess is not support in gpu right now" << std::endl;
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(device_buffers[2]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    delete[] output_seg_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    // std::cout << "\nOutput:\n\n";
    // for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    // std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov12-tubro/yolov12_seg_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into
                 consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory to scan
    return:
        a list of lists of paths; the last list holds the left-over paths
    """
    found = [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    ]

    grouped = []
    pending = []
    for path in found:
        # Close the current batch once it is full, then start a new one.
        if len(pending) == batch_size:
            grouped.append(pending)
            pending = []
        pending.append(path)
    if pending:
        grouped.append(pending)
    return grouped


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw a single bounding box, and optionally a filled label tag
                 above its top-left corner, directly onto img.
    param:
        x:      box given as [x1, y1, x2, y2]
        img:    an opencv image object, modified in place
        color:  rectangle color such as (0,255,0); a random color is used when omitted
        label:  str, text written above the box; nothing is written when omitted
        line_thickness: int, derived from the image size when omitted
    return:
        no return

    """
    rows, cols = img.shape[0], img.shape[1]
    # line/font thickness
    thickness = line_thickness or round(0.002 * (rows + cols) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # Opposite corner of the tag that backs the text, placed above the box.
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLo12TRT(object):
    """
    description: A YOLOv12 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            self.batch_size = engine.get_binding_shape(binding)[0]
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings

        # Data length
        self.det_output_length = host_outputs[0].shape[0]
        self.seg_output_length = host_outputs[1].shape[0]
        self.seg_w = int(self.input_w / 4)
        self.seg_h = int(self.input_h / 4)
        self.seg_c = int(self.seg_output_length / (self.seg_w * self.seg_w))
        self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + OBB_NUM

        # Draw mask
        self.colors_obj = Colors()

    def infer(self, raw_image_generator):
        threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data  to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)

        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        output_proto_mask = host_outputs[1]
        # Do postprocess
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid, result_proto_coef = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )

            if result_proto_coef.shape[0] == 0:
                continue
            result_masks = self.process_mask(output_proto_mask, result_proto_coef, result_boxes, batch_origin_h[i],
                                             batch_origin_w[i])

            self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid],
                           im_src=batch_image_raw[i])

            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            input_image_path: str, image path
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate widht and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a numpy, each element is the score correspoing to box
            result_classid: finally classid, a numpy, each element is the classid correspoing to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        print("There are {} detections ".format(num))
        # Reshape to a two dimentional ndarray
        pred = np.reshape(output[1:], (-1, self.det_row_output_length))[:num, :]

        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        result_proto_coef = boxes[:, DET_NUM:int(DET_NUM + SEG_NUM)] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid, result_proto_coef

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def scale_mask(self, mask, ih, iw):
        mask = cv2.resize(mask, (self.input_w, self.input_h))
        r_w = self.input_w / (iw * 1.0)
        r_h = self.input_h / (ih * 1.0)
        if r_h > r_w:
            w = self.input_w
            h = int(r_w * ih)
            x = 0
            y = int((self.input_h - h) / 2)
        else:
            w = int(r_h * iw)
            h = self.input_h
            x = int((self.input_w - w) / 2)
            y = 0
        crop = mask[y:y + h, x:x + w]
        crop = cv2.resize(crop, (iw, ih))
        return crop

    def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
        """
        description: Mask pred by yolo11 instance segmentation ,
        param:
            output_proto_mask: prototype mask e.g. (32, 160, 160) for 640x640 input
            result_proto_coef: prototype mask coefficients (n, 32), n represents n results
            result_boxes     :
            ih: rows of original image
            iw: cols of original image
        return:
            mask_result: (n, ih, iw)
        """
        result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
        c, mh, mw = result_proto_masks.shape
        print(result_proto_masks.shape)
        print(result_proto_coef.shape)
        masks = self.sigmoid((result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1))).reshape(-1, mh,
                                                                                                                 mw)

        mask_result = []
        for mask, box in zip(masks, result_boxes):
            mask_s = np.zeros((ih, iw))
            crop_mask = self.scale_mask(mask, ih, iw)
            x1 = int(box[0])
            y1 = int(box[1])
            x2 = int(box[2])
            y2 = int(box[3])
            crop = crop_mask[y1:y2, x1:x2]
            crop = np.where(crop >= 0.5, 1, 0)
            crop = crop.astype(np.uint8)
            mask_s[y1:y2, x1:x2] = crop

            mask_result.append(mask_s)
        mask_result = np.array(mask_result)
        return mask_result

    def draw_mask(self, masks, colors_, im_src, alpha=0.5):
        """
        description: Draw mask on image ,
        param:
            masks  : result_mask
            colors_: color to draw mask
            im_src : original image
            alpha  : scale between original  image and mask
        return:
            no return
        """
        if len(masks) == 0:
            return
        masks = np.asarray(masks, dtype=np.uint8)
        masks = np.ascontiguousarray(masks.transpose(1, 2, 0))
        masks = np.asarray(masks, dtype=np.float32)
        colors_ = np.asarray(colors_, dtype=np.float32)
        s = masks.sum(2, keepdims=True).clip(0, 1)
        masks = (masks @ colors_).clip(0, 255)
        im_src[:] = masks * alpha + im_src * (1 - s * alpha)


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and saves
                 each annotated image under the 'output' directory.
    """

    def __init__(self, yolo11_wrapper, image_path_batch):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolo11_wrapper
        paths = self.image_path_batch
        annotated, elapsed = wrapper.infer(wrapper.get_raw_image(paths))
        for index, img_path in enumerate(paths):
            # Keep only the file name; every result goes into the same directory
            target = os.path.join('output', os.path.basename(img_path))
            # Save image
            cv2.imwrite(target, annotated[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(paths, elapsed * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference pass on all-zero images to warm up
                 the engine, and reports how long it took.
    """

    def __init__(self, yolo11_wrapper):
        super().__init__()
        self.yolo11_wrapper = yolo11_wrapper

    def run(self):
        wrapper = self.yolo11_wrapper
        frames, elapsed = wrapper.infer(wrapper.get_raw_image_zeros())
        first_shape = frames[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed * 1000))


class Colors:
    """Fixed palette of 20 colours, addressed by an index that wraps around."""

    def __init__(self):
        hex_codes = ('FF95C8', 'FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A',
                     '92CC17', '3DDB86', '1A9334', '00D4BB', '2C99A8', '00C2FF',
                     '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF',
                     'FF37C7')
        # Stored as (r, g, b) integer tuples.
        self.palette = [self.hex2rgb('#' + code) for code in hex_codes]
        self.n = len(self.palette)

    def __call__(self, i, bgr=False):
        """Return palette entry ``int(i) % n``; channel order is reversed when ``bgr`` is true."""
        rgb = self.palette[int(i) % self.n]
        if bgr:
            return rgb[::-1]
        return rgb

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        """Convert a '#RRGGBB' string into an (r, g, b) tuple of ints."""
        return tuple(int(h[start:start + 2], 16) for start in (1, 3, 5))


if __name__ == "__main__":
    # Default locations of the custom plugin library and the serialized engine.
    PLUGIN_LIBRARY = 'build/libmyplugins.so'
    engine_file_path = "build/yolov12n-seg-4.engine"

    # Optional overrides: argv[1] = engine file, argv[2] = plugin library.
    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]

    # Load the plugin shared library before the engine is opened below.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # Class labels: a custom 4-class set is active; the 80-class COCO list is kept
    # commented out below for reference.
    categories = ["QT", "CT", "VT", "XT"]

    # categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    #               "traffic light",
    #               "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
    #               "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
    #               "frisbee",
    #               "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
    #               "surfboard",
    #               "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    #               "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    #               "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
    #               "cell phone",
    #               "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
    #               "teddy bear",
    #               "hair drier", "toothbrush"]

    # Start from an empty 'output/' directory: any previous results are deleted.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLo12TRT instance
    yolov12_wrapper = YoLo12TRT(engine_file_path)
    try:
        print('batch size is', yolov12_wrapper.batch_size)

        # Input images are read from ./images and grouped into batches of the wrapper's batch size.
        image_dir = "images"
        image_path_batches = get_img_path_batches(yolov12_wrapper.batch_size, image_dir)

        # Ten warm-up runs, executed one after another (each thread is joined before the next starts).
        for i in range(10):
            # create a new thread to do warm_up
            thread1 = warmUpThread(yolov12_wrapper)
            thread1.start()
            thread1.join()
        # One inference thread per batch, also strictly sequential.
        for batch in image_path_batches:
            # create a new thread to do inference
            thread1 = inferThread(yolov12_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance, even when warm-up or inference raised
        yolov12_wrapper.destroy()


================================================
FILE: yolov13/CMakeLists.txt
================================================


cmake_minimum_required(VERSION 3.10)

project(yolov13)

# Set up environment-based paths for CUDA and TensorRT.
# The CUDA_HOME and TENSORRT_DIR environment variables override the defaults below.
if(DEFINED ENV{CUDA_HOME})
  set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_HOME})
else()
  set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
endif()

if(DEFINED ENV{TENSORRT_DIR})
  set(TENSORRT_ROOT $ENV{TENSORRT_DIR})
else()
  set(TENSORRT_ROOT "/opt/TensorRT-8.6.1.6")
endif()

message(STATUS "Using CUDA from: ${CUDA_TOOLKIT_ROOT_DIR}")
message(STATUS "Using TensorRT from: ${TENSORRT_ROOT}")

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
# Debug stays the default, but a build type given on the command line
# (e.g. -DCMAKE_BUILD_TYPE=Release) is no longer overwritten.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# CUDA configuration: only the CUDA header/library locations differ per platform.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message("embed_platform on")
  include_directories(${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/include)
  link_directories(${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/lib)
else()
  message("embed_platform off")
  include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
  link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
endif()

# TensorRT configuration: identical on every platform, added after the CUDA paths.
include_directories(${TENSORRT_ROOT}/include)
link_directories(${TENSORRT_ROOT}/lib)

# Plugin library holding the custom YOLO layer.
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)

add_executable(yolov13-det ${PROJECT_SOURCE_DIR}/yolov13_det.cpp ${SRCS})
target_link_libraries(yolov13-det nvinfer)
target_link_libraries(yolov13-det cudart)
target_link_libraries(yolov13-det myplugins)
target_link_libraries(yolov13-det ${OpenCV_LIBS})


================================================
FILE: yolov13/gen_wts.py
================================================
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse the command line and return ``(weights_path, output_path)``.

    The weights file must exist, otherwise the process exits with 'Invalid input file'.
    Without ``--output`` the result is written next to the weights with a ``.wts``
    extension; when ``--output`` names an existing directory, the file is placed inside
    it under the weights' base name.
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output', help='Output (.wts) file path (optional)')
    opts = cli.parse_args()

    if not os.path.isfile(opts.weights):
        raise SystemExit('Invalid input file')

    if not opts.output:
        # Same location and stem as the input, different extension.
        return opts.weights, os.path.splitext(opts.weights)[0] + '.wts'

    if os.path.isdir(opts.output):
        stem = os.path.splitext(os.path.basename(opts.weights))[0]
        return opts.weights, os.path.join(opts.output, stem + '.wts')

    return opts.weights, opts.output


# Resolve the checkpoint path and the .wts destination from the command line.
pt_file, wts_file = parse_args()

print('Generating .wts for detection model')

print(f'Loading {pt_file}')

# All work is done on the CPU.
device = 'cpu'

# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects,
# so only run this on checkpoints from a trusted source.
checkpoint = torch.load(pt_file, map_location=device, weights_only=False)
model = checkpoint['model'].float()  # load to FP32

# Anchor handling for detection model: the last module's 'anchors' attribute is
# removed before the state dict is read below.
detect_head = model.model[-1]
anchor_grid = detect_head.anchors * detect_head.stride[..., None, None]
delattr(detect_head, 'anchors')

model.to(device).eval()

# File layout: first line = number of tensors; then one line per tensor holding
# "<name> <count> " followed by " <hex>" for every value (big-endian float32).
with open(wts_file, 'w') as f:
    weights = model.state_dict()
    f.write('{}\n'.format(len(weights.keys())))
    for name, tensor in weights.items():
        flat = tensor.reshape(-1).cpu().numpy()
        hex_words = ''.join(' ' + struct.pack('>f', float(val)).hex() for val in flat)
        f.write('{} {} '.format(name, len(flat)) + hex_words + '\n')

# python3 gen_wts.py -w your_model.pt -o output_name.wts


================================================
FILE: yolov13/include/block.h
================================================
#pragma once

#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"

// NOTE(review): a using-directive at header scope is visible in every file that includes block.h.
using namespace std;
// Every function in this header is only declared here; the definitions are not visible
// from this file. Returns a map from a string key to nvinfer1::Weights; the file path is
// passed by value.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

// NOTE(review): except for C3K2 below, the builders in this group take weightMap by value,
// so each call copies the whole map — consider const-reference if that is not intended.

// Returns an IScaleLayer; eps has no default and must be supplied by the caller.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps);

// Defaults: p = 0, g = 1, d = 1. k is a vector of ints
// (presumably kernel height/width — confirm against the definition).
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname, int p = 0, int g = 1,
                                        int d = 1);

// Defaults: k = 1, s = 1, padding = 0, g = 1, act = true.
nvinfer1::ILayer* Conv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                       nvinfer1::ITensor& input, int c_out, std::string lname, int k = 1, int s = 1, int padding = 0,
                       int g = 1, bool act = true);

// Returns an IShuffleLayer; no parameter has a default.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);

// Takes the concatenation layers in dets plus an int array and its element count;
// returns a plugin layer. Does not take a weight map.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num);

// Defaults: n = 1, shortcut = true, g = 1, e = 0.5, k = 3.
nvinfer1::IElementWiseLayer* C3k(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c2,
                                 std::string lname, int n = 1, bool shortcut = true, int g = 1, float e = 0.5,
                                 int k = 3);

// The only builder here that takes weightMap by (non-const) reference.
// Parameter order differs from C3k: n comes before lname. Defaults: c3k = false, e = 0.5, g = 1, shortcut = true.
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2,
                                  int n, std::string lname, bool c3k = false, float e = 0.5, int g = 1,
                                  bool shortcut = true);

// The functions below are only declared in this header; comments describe the signatures.
// weightMap is taken by value throughout, so each call copies the map.

// Default: area = 1.
nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int dim, int num_heads, std::string lname, int area = 1);

// k is a vector of ints; no parameter has a default.
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname);

// Defaults: mlp_ratio = 1.2, area = 1.
nvinfer1::IElementWiseLayer* ABlock(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int dim, int num_heads, std::string lname, float mlp_ratio = 1.2, int area = 1);

// The weight-map parameter is unnamed in this declaration.
// Defaults: a2 = true, area = 1, residual = false, mlp_ratio = 2.0, e = 0.5, g = 1, shortcut = true.
nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>,
                        nvinfer1::ITensor& input, int c2, int n, std::string lname, bool a2 = true, int area = 1,
                        bool residual = false, float mlp_ratio = 2.0, float e = 0.5, int g = 1, bool shortcut = true);

// Defaults: k = 3, s = 1, p = 0, d = 1, bias = false.
nvinfer1::IElementWiseLayer* DSConv(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int c_in, int c_out, std::string lname, int k = 3, int s = 1, int p = 0, int d = 1,
                                    bool bias = false);

// Defaults: shortcut = true, e = 0.5, k1 = 3, k2 = 5, d2 = 1.
nvinfer1::ILayer* DSBottleneck(nvinfer1::INetworkDefinition* network,
                               std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                               int c2, std::string lname, bool shortcut = true, float e = 0.5, int k1 = 3, int k2 = 5,
                               int d2 = 1);

// Defaults: shortcut = true, g = 1, e = 0.5, k1 = 3, k2 = 5, d2 = 1.
nvinfer1::ILayer* DSC3k(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int c2, int n, std::string lname, bool shortcut = true, int g = 1,
                        float e = 0.5, int k1 = 3, int k2 = 5, int d2 = 1);

// Note the default k2 = 7 here, while DSC3k and DSBottleneck default k2 to 5.
// Other defaults: n = 1, dsc3k = false, e = 0.5, g = 1, shortcut = true, k1 = 3, d2 = 1.
nvinfer1::ILayer* DSC3K2(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int c2, std::string lname, int n = 1, bool dsc3k = false,
                         float e = 0.5, int g = 1, bool shortcut = true, int k1 = 3, int k2 = 7, int d2 = 1);

// Takes several input tensors as a vector passed by non-const reference.
nvinfer1::ILayer* FuseModule(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             std::vector<nvinfer1::ITensor*>& input, int c_in, bool channel_adjust, std::string lname);

// Earlier variant of the declaration above that took the input vector by value; kept for reference.
// nvinfer1::ILayer* FuseModule(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
//                              std::vector<nvinfer1::ITensor*>input, int c_in, bool channel_adjust, std::string lname);

// The functions below are only declared in this header; comments describe the signatures.
// weightMap is taken by value throughout, so each call copies the map.

// Returns a softmax layer. Defaults: num_heads = 4, context = "both".
nvinfer1::ISoftMaxLayer* AdaHyperedgeGen(nvinfer1::INetworkDefinition* network,
                                         std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                         int node_dim, int num_hyperedges, std::string lname, int num_heads = 4,
                                         std::string context = "both");

// Takes no weight map. Presumably builds a GELU activation out of primitive layers — confirm in the definition.
nvinfer1::IElementWiseLayer* GELU(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input);

// Defaults: num_hyperedges = 16, num_heads = 4, context = "both".
nvinfer1::IElementWiseLayer* AdaHGConv(nvinfer1::INetworkDefinition* network,
                                       std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                       int embed_dim, std::string lname, int num_hyperedges = 16, int num_heads = 4,
                                       std::string context = "both");

// Note the default num_heads = 8 here, while AdaHGConv and AdaHyperedgeGen default it to 4.
// Other defaults: num_hyperedges = 16, context = "both".
nvinfer1::IShuffleLayer* AdaHGComputation(nvinfer1::INetworkDefinition* network,
                                          std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                          int embed_dim, std::string lname, int num_hyperedges = 16, int num_heads = 8,
                                          std::string context = "both");

// Defaults: e = 1.0, num_hyperedges = 8, context = "both".
nvinfer1::ILayer* C3AH(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                       nvinfer1::ITensor& input, int c2, std::string lname, float e = 1.0, int num_hyperedges = 8,
                       std::string context = "both");

// Takes several input tensors as a vector passed by value.
// Defaults: n = 1, num_hyperedges = 8, dsc3k = false, shortcut = false, e1 = 0.5, e2 = 1,
// context = "both", channel_adjust = true.
nvinfer1::ILayer* HyperACE(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                           std::vector<nvinfer1::ITensor*> input, int c1, int c2, std::string lname, int n = 1,
                           int num_hyperedges = 8, bool dsc3k = false, bool shortcut = false, float e1 = 0.5,
                           float e2 = 1, std::string context = "both", bool channel_adjust = true);

// Takes several input tensors as a vector passed by value; returns an element-wise layer.
nvinfer1::IElementWiseLayer* FullPad_Tunnel(nvinfer1::INetworkDefinition* network,
                                            std::map<std::string, nvinfer1::Weights> weightMap,
                                            std::vector<nvinfer1::ITensor*> input, std::string lname);

// Default: channel_adjust = true.
nvinfer1::ILayer* DownsampleConv(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                 int in_channels, std::string lname, bool channel_adjust = true);

// Debug helper; presumably prints the dimensions of the tensor — confirm in the definition.
void cout_dim(nvinfer1::ITensor& input);


================================================
FILE: yolov13/include/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
   public:
    //! \brief Only the declaration is visible here; the notes below describe the signature.
    //! \param batchsize        calibration batch size
    //! \param input_w          input width
    //! \param input_h          input height
    //! \param img_dir          directory holding the calibration images
    //! \param calib_table_name name of the calibration table (presumably a file path — confirm)
    //! \param input_blob_name  name of the input tensor
    //! \param read_cache       defaults to true (presumably enables reuse of an existing table — confirm)
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
                           const char* input_blob_name, bool read_cache = true);
    virtual ~Int8EntropyCalibrator2();
    // Overrides of the nvinfer1::IInt8EntropyCalibrator2 interface.
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

   private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;  // presumably the index of the next image to read — confirm in the .cpp
    std::string img_dir_;
    std::vector<std::string> img_files_;
    size_t input_count_;  // presumably the number of elements in one input batch — confirm in the .cpp
    std::string calib_table_name_;
    // NOTE(review): stored as a raw pointer, not a copy; the caller's string must outlive this object.
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;  // presumably a device buffer allocated and freed by this class — confirm in the .cpp
    std::vector<char> calib_cache_;
};

#endif  // ENTROPY_CALIBRATOR_H


================================================
FILE: yolov13/include/config.h
================================================
// Include guard: without it, including this header twice in one translation unit
// redefines every constant below.
#ifndef TRTX_YOLOV13_CONFIG_H_
#define TRTX_YOLOV13_CONFIG_H_

// Precision switch; FP16 is the one currently enabled.
#define USE_FP16
// #define USE_FP32
// #define USE_INT8

// Tensor names.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";
// Model and runtime settings.
const static int kNumClass = 80;
const static int kBatchSize = 1;
const static int kGpuId = 0;
const static int kInputH = 640;
const static int kInputW = 640;
// Post-processing thresholds.
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
// Size limits: 3000 * 3000 for an input image, 1000 output boxes.
const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 1000;
// Folder holding the input images used for quantization.
const static char* kInputQuantizationFolder = "./tensorrtx-int8calib-data/coco_calib";

#endif  // TRTX_YOLOV13_CONFIG_H_


================================================
FILE: yolov13/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

#include <cassert>
#include <cstdlib>
#include <iostream>

#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call once and stops the program when it does not return cudaSuccess.
// The message carries the numeric code, the text from cudaGetErrorString, and the call site.
// assert(0) is kept for debug builds; std::abort() follows so that a build with NDEBUG
// (where assert expands to nothing) no longer continues after a failed call.
// The brace form is kept so existing call sites keep compiling unchanged.
#define CUDA_CHECK(callstr)                                                                          \
    {                                                                                                \
        cudaError_t error_code = (callstr);                                                          \
        if (error_code != cudaSuccess) {                                                             \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code)      \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                       \
            assert(0);                                                                               \
            std::abort();                                                                            \
        }                                                                                            \
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: yolov13/include/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//! Stream buffer that collects text and, when flushed, writes it to a destination stream
//! preceded by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix.
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    destination the buffered text is written to
    //! \param prefix    text placed between the timestamp and the message
    //! \param shouldLog when false, putOutput() writes nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Takes over the destination, prefix and enabled flag of \p other.
    //! Previously only the destination was copied, which left the prefix empty and
    //! mShouldLog uninitialized. Text still buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if they differ there is unflushed text, so emit it before the buffer goes away
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the destination, then empties the buffer
    //! and flushes the destination. Does nothing (and keeps the buffered text) when logging is disabled.
    void putOutput() {
        if (!mShouldLog) {
            return;
        }
        // Format the timestamp into a local array instead of using setw/setfill, so no
        // formatting state is left behind on any stream. It stays empty if localtime fails.
        char stamp[32] = "";
        const std::time_t now = std::time(nullptr);
        const std::tm* local = std::localtime(&now);
        if (local != nullptr) {
            std::strftime(stamp, sizeof(stamp), "%m/%d/%Y-%H:%M:%S", local);
        }
        // The timestamp goes to the same stream as the message (it used to go to std::cout
        // unconditionally, which split error lines between stdout and stderr).
        // std::stringbuf::str() gets the string contents of the buffer
        mOutput << "[" << stamp << "] " << mPrefix << str();
        // set the buffer to empty
        str("");
        // flush the stream
        mOutput.flush();
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments unchanged to the owned LogStreamConsumerBuffer.
    LogStreamConsumerBase(std::ostream& target, const std::string& tag, bool enabled)
        : mBuffer(target, tag, enabled) {}

   protected:
    //! Buffer owned by this base so that it is constructed before any later base class.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Builds a stream that logs at \p severity.
    //!  Output is enabled only when \p severity <= \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the buffer built by the first base class
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a fresh buffer from the severity and enabled flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the buffer built by the first base class
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this stream emits output and passes the result on to the buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! kINFO and anything comparing greater go to std::cout, everything else to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Four-character tag for the severity; an unknown value trips the assert and yields "".
    static std::string severityPrefix(Severity severity) {
        if (severity == Severity::kINTERNAL_ERROR) {
            return "[F] ";
        }
        if (severity == Severity::kERROR) {
            return "[E] ";
        }
        if (severity == Severity::kWARNING) {
            return "[W] ";
        }
        if (severity == Severity::kINFO) {
            return "[I] ";
        }
        if (severity == Severity::kVERBOSE) {
            return "[V] ";
        }
        assert(0);
        return "";
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Creates a logger with the given reportable severity (kWARNING by default).
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief States in which a test can be reported
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is in progress
        kPASSED,   //!< The test succeeded
        kFAILED,   //!< The test did not succeed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Access to this object through the nvinfer1::ILogger interface
    //! \return A reference to this Logger, typed as nvinfer1::ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief nvinfer1::ILogger::log() implementation
    //!
    //! Streams "[TRT] " followed by \p msg through a LogStreamConsumer created for \p severity.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer sink(mReportableSeverity, severity);
        sink << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Changes the reportable severity
    //!
    //! \param severity The new threshold stored in this logger.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Handle describing one test (started flag, name, command line)
    //!
    //! Instances can only be created by Logger (the constructor is private and Logger is a friend);
    //! obtain one from Logger::defineTest() and pass it to Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Creates a not-yet-started TestAtom
    //!
    //! \param[in] name The name of the test
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() that builds the command line from (argc, argv)
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        const std::string joined = genCmdlineString(argc, argv);
        return defineTest(name, joined);
    }

    //!
    //! \brief Reports a test as RUNNING and marks the atom as started
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!      (checked by an assert after the RUNNING line has been written)
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Reports the final result of a test
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result Any TestResult except kRUNNING
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED and returns EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED and returns EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED and returns EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() or reportFail() depending on \p pass
    static int reportTest(const TestAtom& testAtom, bool pass) {
        if (pass) {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current reportable severity
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief Four-character tag for the severity; an unknown value trips the assert and yields ""
    //!
    static const char* severityPrefix(Severity severity) {
        if (severity == Severity::kINTERNAL_ERROR) {
            return "[F] ";
        }
        if (severity == Severity::kERROR) {
            return "[E] ";
        }
        if (severity == Severity::kWARNING) {
            return "[W] ";
        }
        if (severity == Severity::kINFO) {
            return "[I] ";
        }
        if (severity == Severity::kVERBOSE) {
            return "[V] ";
        }
        assert(0);
        return "";
    }

    //!
    //! \brief Upper-case name of a test result; an unknown value trips the assert and yields ""
    //!
    static const char* testResultString(TestResult result) {
        if (result == TestResult::kRUNNING) {
            return "RUNNING";
        }
        if (result == TestResult::kPASSED) {
            return "PASSED";
        }
        if (result == TestResult::kFAILED) {
            return "FAILED";
        }
        if (result == TestResult::kWAIVED) {
            return "WAIVED";
        }
        assert(0);
        return "";
    }

    //!
    //! \brief std::cout for kINFO and anything comparing greater, std::cerr otherwise
    //!
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief Writes one "&&&& <RESULT> <name> # <cmdline>" line to the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief Joins the argv entries with single spaces
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream joined;
        for (int idx = 0; idx < argc; ++idx) {
            joined << (idx > 0 ? " " : "") << argv[idx];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief builds a LogStreamConsumer for messages of severity kVERBOSE, using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kINFO, using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kWARNING, using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kERROR, using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief builds a LogStreamConsumer for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        using the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const auto threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: yolov13/include/macros.h
================================================
// Shared export and TensorRT-version compatibility macros.
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

// API: symbol-visibility decoration.
//   API_EXPORTS defined     -> export (dllexport on MSVC, default visibility elsewhere)
//   API_EXPORTS not defined -> dllimport on MSVC, empty elsewhere
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` when NV_TENSORRT_MAJOR >= 8
// and to nothing otherwise, so one declaration can be written for both cases.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: yolov13/include/model.h
================================================
#pragma once

#include <assert.h>
#include <string>
#include "NvInfer.h"

// Declaration only; the definition is not in this header.
// Returns a pointer to nvinfer1::IHostMemory.
// NOTE(review): presumably builds and serializes the detection engine from the .wts file at `wts_path`;
// gd/gw/max_channels/type are taken by non-const reference — confirm in the definition whether they are written.
nvinfer1::IHostMemory* buildEngineYolov13Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels, std::string& type);

// Declaration only; same parameter list and return type as buildEngineYolov13Det.
// NOTE(review): presumably a debugging variant of the engine builder — confirm how it differs in the definition.
nvinfer1::IHostMemory* buildEngineYolov13Det_debug(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                                   nvinfer1::DataType dt, const std::string& wts_path, float& gd,
                                                   float& gw, int& max_channels, std::string& type);


================================================
FILE: yolov13/include/postprocess.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Preprocessing functions
// Returns a cv::Rect for a 4-float box and an image.
// NOTE(review): presumably maps network-space box coordinates back onto `img` — confirm in the definition.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Processing functions
// Fills `res_batch` (one detection list per image) from a host buffer.
// NOTE(review): presumably `decode_ptr_host` holds GPU-decoded records of `bbox_element` floats each — confirm.
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch);

// Single-image counterpart of batch_process; `count` is presumably the number of records to read — confirm.
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count);

// NMS functions
// nms_thresh defaults to 0.5 when omitted.
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);

// Batched variant; nms_thresh defaults to 0.5 when omitted.
void batch_nms(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh = 0.5);

// CUDA-related functions
// Both functions take a cudaStream_t.
// NOTE(review): pointer arguments are presumably device memory — confirm in the .cu definitions.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream);

void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

// Drawing functions
// NOTE(review): presumably draws each image's detections onto the matching cv::Mat in place — confirm.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);


================================================
FILE: yolov13/include/preprocess.h
================================================
#pragma once

#include <map>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Declarations only; definitions live elsewhere.
// NOTE(review): init/destroy presumably allocate and release staging buffers sized by `max_image_size` — confirm.
void cuda_preprocess_init(int max_image_size);

void cuda_preprocess_destroy();

// Converts one source image of src_width x src_height into a float buffer of dst_width x dst_height on `stream`.
// NOTE(review): memory space and pixel layout of `src`/`dst` are not visible here — confirm in the definition.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);

// Batched variant that takes the images as cv::Mat objects.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);


================================================
FILE: yolov13/include/types.h
================================================
#pragma once
#include "config.h"

// One detected object: six consecutive floats (4 box values, confidence, class id), float-aligned.
struct alignas(float) Detection {
    //center_x center_y w h
    // NOTE(review): the label above may be stale — the CalDetection kernel in plugin/yololayer.cu writes
    // (col + 0.5 -/+ distance) * stride into bbox[0..3], which looks like corner coordinates. Confirm.
    float bbox[4];
    float conf;  // bbox_conf * cls_conf
    float class_id;  // class index stored as a float
};

// Coefficients of an affine transform (presumably the 2x3 matrix used by the GPU preprocessing — confirm there).
// Six floats are required: with only three, bbox_element below evaluated to 4 and no longer matched
// the seven-field record layout it documents.
struct AffineMatrix {
    float value[6];
};

// Number of floats in one decoded box record:
// left, top, right, bottom, confidence, class, keepflag  (= 6 + 1 = 7)
const int bbox_element = sizeof(AffineMatrix) / sizeof(float) + 1;


================================================
FILE: yolov13/include/utils.h
================================================
#pragma once
#include <dirent.h>
#include <fstream>
#include <opencv2/opencv.hpp>

// Letterbox `img` into an input_w x input_h canvas: scale by the smaller of the two axis ratios,
// centre the result, and fill the remaining border with grey (128, 128, 128).
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float r_w = input_w / (img.cols * 1.0);
    const float r_h = input_h / (img.rows * 1.0);

    // Width is the limiting axis when its scale factor is the smaller one.
    const bool width_limited = r_h > r_w;

    int scaled_w = 0;
    int scaled_h = 0;
    int offset_x = 0;
    int offset_y = 0;
    if (width_limited) {
        scaled_w = input_w;
        scaled_h = r_w * img.rows;
        offset_y = (input_h - scaled_h) / 2;
    } else {
        scaled_w = r_h * img.cols;
        scaled_h = input_h;
        offset_x = (input_w - scaled_w) / 2;
    }

    cv::Mat resized(scaled_h, scaled_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(offset_x, offset_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Appends the name (not the full path) of every entry in `p_dir_name`, except "." and "..",
// to `file_names`. Returns 0 on success and -1 when the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir = opendir(p_dir_name);
    if (dir == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* entry_name = entry->d_name;
        const bool is_self_or_parent = strcmp(entry_name, ".") == 0 || strcmp(entry_name, "..") == 0;
        if (is_self_or_parent) {
            continue;
        }
        // Only the bare entry name is stored; callers prepend the directory themselves.
        file_names.push_back(std::string(entry_name));
    }

    closedir(dir);
    return 0;
}

// Returns `str` with leading and trailing whitespace removed.
// Whitespace means space, tab, CR, LF, form feed and vertical tab, so a trailing '\r' left by
// std::getline on CRLF files is stripped as well. A string made only of whitespace yields "".
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char* const kWhitespace = " \t\r\n\f\v";

    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        // Nothing but whitespace (or empty): the trimmed result is empty.
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}

// Src: https://stackoverflow.com/questions/16605967
// Formats `a_value` in fixed notation with `n` digits after the decimal point (default 2).
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatter;
    formatter.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatter.precision(n);
    formatter << a_value;
    return formatter.str();
}

// Reads `labels_filename` line by line and stores each trimmed line in `labels_map`, keyed by its
// zero-based line number. Always returns 0; an unreadable file simply leaves the map untouched.
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);

    int index = 0;
    for (std::string line; std::getline(file, line); ++index) {
        // Key is the line number; value is the line with surrounding whitespace removed.
        labels_map[index] = trim_leading_whitespace(line);
    }

    file.close();
    return 0;
}


================================================
FILE: yolov13/plugin/geluKernel.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cuda.h>
#if CUDA_VERSION >= 10010

#include <cstring>
#include <vector>

#include "NvInfer.h"
#include "common/bertCommon.h"
#include "common/common.cuh"
#include "common/serialize.hpp"
#include "geluPlugin.h"

using namespace nvinfer1;

namespace nvinfer1 {
namespace plugin {
namespace bert {

// constants for approximating the normal cdf
// The kernels below evaluate gelu(x) as x * (A + A * tanh(x * (C * x * x + B))).
constexpr float A = 0.5f;
constexpr float B = 0.7978845608028654f;    // sqrt(2.0/M_PI)
constexpr float C = 0.035677408136300125f;  // 0.044715 * sqrt(2.0/M_PI)

// Element-wise GELU: output[i] = x * (a + a * tanh(x * (c * x * x + b))) for i in [0, n).
// One thread handles one element; TPB must equal the launch block size.
template <typename T, unsigned TPB>
__global__ void geluKernel(const T a, const T b, const T c, int n, const T* input, T* output) {
    const int tid = blockIdx.x * TPB + threadIdx.x;
    if (tid >= n) {
        return;  // threads past the end of the data do nothing
    }

    const T x = input[tid];
    const T cdf = a + a * tanh(x * (c * x * x + b));
    output[tid] = x * cdf;
}

// Launches the float GELU kernel over `n` elements on `stream`.
// Returns 0; a launch error is handled by PLUGIN_CHECK.
int computeGelu(cudaStream_t stream, int n, const float* input, float* output) {
    constexpr int kThreadsPerBlock = 256;
    // Round up so a final partial block is still launched.
    const int numBlocks = (n + kThreadsPerBlock - 1) / kThreadsPerBlock;

    geluKernel<float, kThreadsPerBlock><<<numBlocks, kThreadsPerBlock, 0, stream>>>(A, B, C, n, input, output);

    PLUGIN_CHECK(cudaPeekAtLastError());
    return 0;
}

// Launches the half-precision GELU kernel over `n` elements on `stream`.
// Even `n` is processed two elements at a time as half2; odd `n` falls back to scalar half.
// Returns 0; a launch error is handled by PLUGIN_CHECK.
int computeGelu(cudaStream_t stream, int n, const half* input, half* output) {
    constexpr int kThreadsPerBlock = 256;

    const bool oddCount = (n & 1) != 0;
    if (oddCount) {
        const int numBlocks = (n + kThreadsPerBlock - 1) / kThreadsPerBlock;
        geluKernel<half, kThreadsPerBlock><<<numBlocks, kThreadsPerBlock, 0, stream>>>(A, B, C, n, input, output);
    } else {
        // Reinterpret the buffers as packed pairs and halve the element count.
        const int pairCount = n / 2;
        const int numBlocks = (pairCount + kThreadsPerBlock - 1) / kThreadsPerBlock;

        const half2 A2 = __floats2half2_rn(A, A);
        const half2 B2 = __floats2half2_rn(B, B);
        const half2 C2 = __floats2half2_rn(C, C);
        const half2* pairedIn = reinterpret_cast<const half2*>(input);
        half2* pairedOut = reinterpret_cast<half2*>(output);

        geluKernel<half2, kThreadsPerBlock>
                <<<numBlocks, kThreadsPerBlock, 0, stream>>>(A2, B2, C2, pairCount, pairedIn, pairedOut);
    }

    PLUGIN_CHECK(cudaPeekAtLastError());
    return 0;
}

// GELU with a bias added first: output[j] = gelu(input[j] + bias[k]), where j = blockIdx.x * ld + k.
// Each block walks one run of `ld` elements in steps of TPB; `bias` is indexed by the position in the run.
template <typename T, int TPB>
__global__ void geluBiasKernel(const T a, const T b, const T c, T* output, const T* input, const T* bias,
                               const int ld) {
    const int rowStart = blockIdx.x * ld;

    for (int k = threadIdx.x; k < ld; k += TPB) {
        const int j = rowStart + k;
        const T x = input[j] + bias[k];
        const T cdf = a + a * tanh(x * (c * x * x + b));
        output[j] = x * cdf;
    }
}

// Launches the float bias+GELU kernel with `cols` blocks of 256 threads, each block covering `ld` elements.
// Returns the value of cudaPeekAtLastError() after the launch.
int computeGeluBias(float* output, const float* input, const float* bias, const int ld, const int cols,
                    cudaStream_t stream) {
    constexpr int kThreadsPerBlock = 256;
    geluBiasKernel<float, kThreadsPerBlock><<<cols, kThreadsPerBlock, 0, stream>>>(A, B, C, output, input, bias, ld);
    return cudaPeekAtLastError();
}

// Half-precision bias+GELU launch with `cols` blocks of 256 threads.
// Even `ld` is processed as packed half2 pairs (ld / 2 per block); odd `ld` uses scalar half.
// Returns the value of cudaPeekAtLastError() after the launch.
int computeGeluBias(half* output, const half* input, const half* bias, const int ld, const int cols,
                    cudaStream_t stream) {
    constexpr int kThreadsPerBlock = 256;

    const bool evenLd = (ld & 1) == 0;
    if (evenLd) {
        const half2 A2 = __floats2half2_rn(A, A);
        const half2 B2 = __floats2half2_rn(B, B);
        const half2 C2 = __floats2half2_rn(C, C);
        const int pairedLd = ld / 2;
        const half2* pairedIn = reinterpret_cast<const half2*>(input);
        const half2* pairedBias = reinterpret_cast<const half2*>(bias);
        half2* pairedOut = reinterpret_cast<half2*>(output);
        geluBiasKernel<half2, kThreadsPerBlock>
                <<<cols, kThreadsPerBlock, 0, stream>>>(A2, B2, C2, pairedOut, pairedIn, pairedBias, pairedLd);
    } else {
        geluBiasKernel<half, kThreadsPerBlock>
                <<<cols, kThreadsPerBlock, 0, stream>>>(A, B, C, output, input, bias, ld);
    }

    return cudaPeekAtLastError();
}

}  // namespace bert
}  // namespace plugin
}  // namespace nvinfer1
#endif  // CUDA_VERSION >= 10010


================================================
FILE: yolov13/plugin/yololayer.cu
================================================
#include <assert.h>
#include <math.h>
#include <cstring>
#include <iostream>
#include <vector>
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"

namespace Tn {
// Serialization helpers: copy the raw bytes of a value to or from a byte cursor and advance the
// cursor by sizeof(T). std::memcpy is used instead of a reinterpret_cast load/store because the
// cursor is not guaranteed to be aligned for T.

// Writes `val` at `buffer` and moves `buffer` past it.
template <typename T>
void write(char*& buffer, const T& val) {
    std::memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

// Reads a T from `buffer` into `val` and moves `buffer` past it.
template <typename T>
void read(const char*& buffer, T& val) {
    std::memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}
}  // namespace Tn

// Device-side logistic function: 1 / (1 + e^-x).
__device__ float sigmoid(float x) {
    const float denom = 1.0f + exp(-x);
    return 1.0f / denom;
}

namespace nvinfer1 {
// Builds a plugin from explicit parameters. The `stridesLength` ints at `strides` are copied into
// an array owned by this object, so the caller's buffer need not outlive the call.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const int* strides,
                                 int stridesLength)
    : mClassCount(classCount),
      mYoloV13netHeight(netHeight),
      mYoloV13NetWidth(netWidth),
      mMaxOutObject(maxOut),
      mStrides(new int[stridesLength]),
      mStridesLength(stridesLength) {
    memcpy(mStrides, strides, stridesLength * sizeof(int));
}

// Releases the owned stride array (delete[] on a null pointer is a no-op).
YoloLayerPlugin::~YoloLayerPlugin() {
    delete[] mStrides;
    mStrides = nullptr;
}

// Rebuilds a plugin from a buffer produced by serialize(). Fields are read in the order they were
// written: class count, thread count, net width, net height, max objects, stride count, strides.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    const char* const begin = reinterpret_cast<const char*>(data);
    const char* cursor = begin;

    Tn::read(cursor, mClassCount);
    Tn::read(cursor, mThreadCount);
    Tn::read(cursor, mYoloV13NetWidth);
    Tn::read(cursor, mYoloV13netHeight);
    Tn::read(cursor, mMaxOutObject);
    Tn::read(cursor, mStridesLength);

    mStrides = new int[mStridesLength];
    for (int s = 0; s < mStridesLength; ++s) {
        Tn::read(cursor, mStrides[s]);
    }

    // The whole buffer must have been consumed, no more and no less.
    assert(cursor == begin + length);
}

// Writes the plugin state into `buffer` in the order the deserializing constructor reads it:
// class count, thread count, net width, net height, max objects, stride count, strides.
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    char* const begin = static_cast<char*>(buffer);
    char* cursor = begin;

    Tn::write(cursor, mClassCount);
    Tn::write(cursor, mThreadCount);
    Tn::write(cursor, mYoloV13NetWidth);
    Tn::write(cursor, mYoloV13netHeight);
    Tn::write(cursor, mMaxOutObject);
    Tn::write(cursor, mStridesLength);
    for (int s = 0; s < mStridesLength; ++s) {
        Tn::write(cursor, mStrides[s]);
    }

    // Exactly getSerializationSize() bytes must have been produced.
    assert(cursor == begin + getSerializationSize());
}

// Number of bytes serialize() writes: six scalar fields plus one int per stride.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    const size_t scalarBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV13netHeight) +
                               sizeof(mYoloV13NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength);
    const size_t strideBytes = sizeof(int) * mStridesLength;
    return scalarBytes + strideBytes;
}

// Nothing is set up here; always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Output shape is (1 + floats needed for mMaxOutObject Detection records, 1, 1); the extra leading
// float is the slot CalDetection uses as the detection counter. The arguments are not consulted.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    const int detectionFloats = mMaxOutObject * sizeof(Detection) / sizeof(float);
    const int withCounter = detectionFloats + 1;
    return nvinfer1::Dims3(withCounter, 1, 1);
}

// Stores the pointer itself (no copy), so the string must stay alive for as long as this plugin uses it.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Returns the pointer last passed to setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The output is always reported as kFLOAT; the arguments are ignored.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                                      int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// Always false: the output is never reported as broadcast across the batch.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {
    return false;
}

// Always false, for every input index.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
    return false;
}

// Intentionally empty: the tensor descriptions are not inspected.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{};

// Intentionally empty: none of the supplied contexts or the allocator is used.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{};

// Intentionally empty: attachToContext() acquires nothing, so there is nothing to release.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Same string that YoloPluginCreator::getPluginName() returns.
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Same string that YoloPluginCreator::getPluginVersion() returns.
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Deletes this object; it must have been allocated with `new` and must not be used afterwards.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Returns a heap-allocated copy of this plugin with its own stride array.
// The launch thread count is copied explicitly because the parameter constructor does not take it;
// without that a clone of a deserialized plugin could silently fall back to the default.
// The namespace pointer is shared with the original, not duplicated.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV13NetWidth, mYoloV13netHeight, mMaxOutObject, mStrides,
                                             mStridesLength);
    p->mThreadCount = mThreadCount;
    p->setPluginNamespace(mPluginNamespace);
    return p;
}

// Runs the decode kernels for `batchSize` images on `stream`; all detections go to outputs[0].
// The parameter qualifiers mirror the in-class declaration (inputs is always `const void* const*`,
// TRT_CONST_ENQUEUE applies to outputs) so the definition matches it whether or not
// TRT_CONST_ENQUEUE expands to `const`. `workspace` is unused. Always returns 0.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    forwardGpu(reinterpret_cast<const float* const*>(inputs), static_cast<float*>(outputs[0]), stream,
               mYoloV13netHeight, mYoloV13NetWidth, batchSize);
    return 0;
}

// Device-side logistic function, single precision: 1 / (1 + e^-data).
__device__ float Logist(float data) {
    const float denom = 1.0f + expf(-data);
    return 1.0f / denom;
}

// One thread per (image, grid cell). For its cell the thread picks the class with the highest
// logistic score, drops the cell when that score is below 0.1, and otherwise claims the next
// Detection slot of its image through an atomic counter kept in the first float of that image's
// output segment. Input is read channel-major: channel c of a cell is at cell + c * (grid_h * grid_w),
// with channels 0..3 holding box values and 4.. holding class scores.
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
                             int grid_w, const int stride, int classes, int outputElem) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= numElements)
        return;

    const int cellsPerImage = grid_h * grid_w;
    const int channels = 4 + classes;
    const int image = tid / cellsPerImage;
    const int cell = tid % cellsPerImage;
    const float* imageInput = input + image * cellsPerImage * channels;
    float* imageOutput = output + image * outputElem;

    // Arg-max over class scores; ties keep the earliest class.
    int bestClass = 0;
    float bestProb = 0.0;
    for (int c = 0; c < classes; c++) {
        const float p = Logist(imageInput[cell + (c + 4) * cellsPerImage]);
        if (p > bestProb) {
            bestProb = p;
            bestClass = c;
        }
    }

    if (bestProb < 0.1)
        return;

    // The counter keeps growing past maxoutobject; slots beyond the limit are simply not written.
    const int slot = (int)atomicAdd(imageOutput, 1);
    if (slot >= maxoutobject)
        return;

    char* slotBytes = (char*)(imageOutput) + sizeof(float) + slot * sizeof(Detection);
    Detection* det = (Detection*)(slotBytes);

    const int row = cell / grid_w;
    const int col = cell % grid_w;

    det->conf = bestProb;
    det->class_id = bestClass;
    // Cell centre minus/plus the predicted distances, scaled by the stride.
    det->bbox[0] = (col + 0.5f - imageInput[cell + 0 * cellsPerImage]) * stride;
    det->bbox[1] = (row + 0.5f - imageInput[cell + 1 * cellsPerImage]) * stride;
    det->bbox[2] = (col + 0.5f + imageInput[cell + 2 * cellsPerImage]) * stride;
    det->bbox[3] = (row + 0.5f + imageInput[cell + 3 * cellsPerImage]) * stride;
}

// Launches CalDetection once per stride level on `stream`.
//   inputs  - one pointer per stride level (inputs[i] pairs with mStrides[i])
//   output  - batchSize consecutive segments of (1 + mMaxOutObject * floats-per-Detection) floats;
//             the first float of each segment is the detection counter
// The height/width parameters shadow the members of the same name and are what the grid sizes are
// derived from. The launch block size is clamped per level in a local variable, so a small grid no
// longer shrinks mThreadCount for later levels or later calls, and an empty grid is skipped instead
// of producing a zero block size.
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV13netHeight,
                                 int mYoloV13NetWidth, int batchSize) {
    const int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    // Reset every image's detection counter before any kernel increments it.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (int i = 0; i < mStridesLength; ++i) {
        const int stride = mStrides[i];
        const int grid_h = mYoloV13netHeight / stride;
        const int grid_w = mYoloV13NetWidth / stride;
        const int numElem = grid_h * grid_w * batchSize;
        if (numElem <= 0) {
            continue;  // nothing to decode at this level
        }

        const int threads = numElem < mThreadCount ? numElem : mThreadCount;
        const int blocks = (numElem + threads - 1) / threads;
        CalDetection<<<blocks, threads, 0, stream>>>(inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride,
                                                     mClassCount, outputElem);
    }
}

PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// Leaves the shared attribute list empty and points the shared field collection at it.
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();

    mFC.fields = mPluginAttributes.data();
    mFC.nbFields = mPluginAttributes.size();
}

// Same string that YoloLayerPlugin::getPluginType() returns.
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Same string that YoloLayerPlugin::getPluginVersion() returns.
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Returns the address of the static field collection, which the constructor leaves empty.
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Creates a YoloLayerPlugin from a single plugin field named "combinedInfo".
// The field data is an int array: [class_num, input_w, input_h, max_out, stride_0, stride_1, ...];
// field.length is the total number of ints, so the stride count is length - 4.
// Returns nullptr (after logging to stderr) when the field collection is malformed. The checks are
// real runtime checks rather than asserts so that builds with NDEBUG neither dereference a null
// pointer nor pass a negative stride count to new[] inside this noexcept function.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    const int stride_offset = 4;

    if (fc == nullptr || fc->nbFields != 1 || fc->fields == nullptr) {
        std::cerr << "YoloLayer_TRT: expected exactly one plugin field" << std::endl;
        return nullptr;
    }

    const PluginField& field = fc->fields[0];
    if (field.name == nullptr || strcmp(field.name, "combinedInfo") != 0) {
        std::cerr << "YoloLayer_TRT: plugin field must be named \"combinedInfo\"" << std::endl;
        return nullptr;
    }
    if (field.data == nullptr || field.length < stride_offset) {
        std::cerr << "YoloLayer_TRT: \"combinedInfo\" must hold at least " << stride_offset << " ints" << std::endl;
        return nullptr;
    }

    const int* combinedInfo = static_cast<const int*>(field.data);

    // Clean packed layout: class_num, input_w, input_h, max_out
    int class_count = combinedInfo[0];
    int input_w = combinedInfo[1];
    int input_h = combinedInfo[2];
    int max_output_object_count = combinedInfo[3];

    const int* px_arry = combinedInfo + stride_offset;
    int px_arry_length = field.length - stride_offset;

    YoloLayerPlugin* obj =
            new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, px_arry, px_arry_length);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

// Rebuilds a plugin from serialized bytes and gives it this creator's namespace.
// The plugin keeps the pointer returned by mNamespace.c_str(); it does not copy the string.
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) TRT_NOEXCEPT {
    YoloLayerPlugin* restored = new YoloLayerPlugin(serialData, serialLength);
    const char* ns = mNamespace.c_str();
    restored->setPluginNamespace(ns);
    return restored;
}

}  // namespace nvinfer1


================================================
FILE: yolov13/plugin/yololayer.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"

namespace nvinfer1 {
// TensorRT plugin that decodes the detection-head outputs into a flat list of Detection records.
// It takes one input tensor per stride level and produces a single FP32 output laid out, per image,
// as [count, Detection_0, Detection_1, ...] with room for mMaxOutObject records.
// The object owns the array behind mStrides; the namespace string is borrowed, not owned.
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    // Builds the plugin from explicit parameters; the `stridesLength` ints at `strides` are copied.
    YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const int* strides, int stridesLength);

    // Rebuilds the plugin from `length` bytes previously produced by serialize().
    YoloLayerPlugin(const void* data, size_t length);

    ~YoloLayerPlugin();

    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

    int initialize() TRT_NOEXCEPT override;

    virtual void terminate() TRT_NOEXCEPT override {}

    // No scratch memory is requested.
    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

    virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;

    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

    // Only linear-format FP32 tensors are accepted, for every input and output position.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }

    const char* getPluginType() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    void destroy() TRT_NOEXCEPT override;

    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

    // Stores the pointer without copying; the string must outlive the plugin's use of it.
    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

    const char* getPluginNamespace() const TRT_NOEXCEPT override;

    // Marked override like the other interface methods so a signature drift is caught at compile time.
    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;

    void detachFromContext() TRT_NOEXCEPT override;

   private:
    // Launches the decode kernel once per stride level; see the definition for the output layout.
    void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV13netHeight,
                    int mYoloV13NetWidth, int batchSize);

    int mThreadCount = 256;  // upper bound on the CUDA block size used for the decode kernel
    // Borrowed pointer; starts as an empty string so getPluginNamespace() and clone() never hand out
    // an indeterminate pointer before setPluginNamespace() has been called.
    const char* mPluginNamespace = "";
    int mClassCount;
    // Removed non-detection members
    int mYoloV13netHeight;
    int mYoloV13NetWidth;
    int mMaxOutObject;
    int* mStrides;  // owned array of mStridesLength entries
    int mStridesLength;
};

// Factory for YoloLayerPlugin, registered below through REGISTER_TENSORRT_PLUGIN.
// Creates plugins either from a plugin field collection (createPlugin) or from serialized bytes
// (deserializePlugin).
class API YoloPluginCreator : public IPluginCreator {
   public:
    YoloPluginCreator();

    ~YoloPluginCreator() override = default;

    const char* getPluginName() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

    nvinfer1::IPluginV2IOExt* createPlugin(const char* name,
                                           const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;

    nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                                size_t serialLength) TRT_NOEXCEPT override;

    // The namespace is copied into an owned std::string.
    void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; }

    // The returned pointer refers to the owned string and changes if the namespace is set again.
    const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

   private:
    std::string mNamespace;
    // Shared by all creator instances (static).
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};

REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
}  // namespace nvinfer1


================================================
FILE: yolov13/readme.md
================================================
## Introduction

Yolov13 model supports TensorRT-8.

Detection training code [link](https://github.com/iMoonLab/yolov13/releases/tag/yolov13)


## Environment

* cuda 11.6
* cudnn 8.9.1.23
* tensorrt 8.6.1.6
* opencv 4.8.0
* ultralytics 8.3.63

## Support

* [x] YOLOV13-det support FP32/FP16/INT8 and C++ API


## Config

* Choose the YOLOV13 sub-model n/s/l/x from command line arguments.
* Other configs please check [include/config.h](include/config.h)

## Build and Run (Detection)

1. generate .wts from pytorch with .pt, or download .wts from model zoo

```shell
# Download ultralytics
wget https://github.com/iMoonLab/yolov13/releases/tag/yolov13 -O ultralytics-8.3.63.zip
# Unzip ultralytics
unzip ultralytics-8.3.63.zip
cd ultralytics-8.3.63
# Train your own model, or download a pretrained one.
# To use other models, replace 'yolov13n.pt' with 'yolov13s.pt', 'yolov13l.pt', or 'yolov13x.pt'
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolov13/gen_wts.py .
python3 gen_wts.py -w yolov13n.pt -o yolov13n.wts
# A file 'yolov13n.wts' will be generated.
```

2. build tensorrtx/yolov13 and run
```shell
cd [PATH-TO-TENSORRTX]/yolov13
mkdir build
cd build
cmake ..
make
```


### Detection
```shell
cp [PATH-TO-ultralytics]/yolov13n.wts .
# Build and serialize TensorRT engine
./yolov13-det -s yolov13n.wts yolov13n-det.engine [n/s/l/x]
# Run inference
./yolov13-det -d yolov13n-det.engine ../images [c/g]
# results saved in build directory
```

## INT8 Quantization
1. Prepare calibration images; you can randomly select about 1000 images from your training set.
     For coco, you can also download the calibration images `coco_calib` from
     [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing)
     or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh
2. unzip it in [PATH-TO-TENSORRTX]/yolov13/build
3. set the macro `USE_INT8` in include/config.h and make again
4. serialize the model and test
... build successfully in my 4060 ...

## More Information
See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: yolov13/src/block.cpp
================================================
#include "block.h"
#include <assert.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include "config.h"
#include "model.h"
#include "yololayer.h"

// Parses a .wts text file into a name -> Weights map.
// File layout: a blob count, then for every blob "<name> <element count> <hex word> ...".
// Every blob is stored as 32-bit words and tagged kFLOAT. The value buffers are malloc'ed and never
// freed here: the returned Weights only point at them, so they must stay alive while in use.
// An unopenable file or a non-positive count trips an assert in debug builds; with asserts disabled
// the function returns whatever was parsed (an empty map for an unreadable file).
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> WeightMap;

    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Initialised so a failed read cannot leave an indeterminate loop bound.
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // A counted loop (rather than `while (count--)`) cannot run away on a negative count.
    for (int32_t blob = 0; blob < count; ++blob) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        std::string name;
        input >> name >> std::dec >> size;
        wt.type = nvinfer1::DataType::kFLOAT;

        // One 32-bit word per element: sizeof(*val), not sizeof(val), which is the pointer size.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        for (uint32_t x = 0, y = size; x < y; x++) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        WeightMap[name] = wt;
    }
    return WeightMap;
}

// Adds an inference-time BatchNorm as a per-channel scale layer:
//   scale = gamma / sqrt(var + eps),  shift = beta - mean * gamma / sqrt(var + eps),  power = 1
// Parameters are looked up as lname + ".weight" / ".bias" / ".running_mean" / ".running_var".
// The three coefficient buffers are malloc'ed and deliberately not freed here, because the layer
// refers to them after this function returns.
// `weightMap` is a by-value copy, so the former stores of scale/shift/power into it were never
// visible to the caller and have been dropped.
nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                      std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                      std::string lname, float eps) {
    // Qualified with std:: so this does not depend on a using-directive from some header.
    std::cout << "BatchNorm's name :             " << lname << std::endl;
    float* gamma = (float*)weightMap[lname + ".weight"].values;
    float* beta = (float*)weightMap[lname + ".bias"].values;
    float* mean = (float*)weightMap[lname + ".running_mean"].values;
    float* var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
        pval[i] = 1.0;
    }

    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};

    nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(output);
    return output;
}

// Conv (no bias) -> BatchNorm (eps 1e-3) -> SiLU, where SiLU(x) is built as x * sigmoid(x).
//   ch - output channels, k - {kernel_h, kernel_w}, s - stride on both axes, g - groups
//   Weights are read from lname + ".conv.weight" and lname + ".bn.*".
// Padding is always k/2 per axis ("auto pad"); the `p` and `d` arguments are accepted but not used.
// Returns the element-wise product layer that yields the activated output.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, std::vector<int> k, int s, std::string lname, int p, int g, int d) {

    nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]},
                                                                  weightMap[lname + ".conv.weight"], bias_empty);
    // Check the layer before the first use, not after it.
    assert(conv);

    conv->setNbGroups(g);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    // auto pad
    int p0 = k[0] / 2;
    int p1 = k[1] / 2;
    conv->setPaddingNd(nvinfer1::DimsHW{p0, p1});
    nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);

    nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* ew =
            network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(ew);
    return ew;
}

// Two stacked convBnSiLU modules ("<lname>.cv1" then "<lname>.cv2") with an optional
// residual connection.
//
// The hidden width is int(c2 * e). cv1 uses kernel k1 and stride 1; cv2 uses kernel k2,
// stride 1 and `g` groups. When `shortcut` is set and c1 == c2, `input` is added
// element-wise to the cv2 output and that sum layer is returned; otherwise the cv2
// layer itself is returned.
static nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int c1, int c2, bool shortcut, std::vector<int> k1, std::vector<int> k2, float e,
                                    int g, std::string lname) {
    const int hidden = static_cast<int>(static_cast<float>(c2) * e);

    nvinfer1::IElementWiseLayer* first = convBnSiLU(network, weightMap, input, hidden, k1, 1, lname + ".cv1");
    nvinfer1::IElementWiseLayer* second =
            convBnSiLU(network, weightMap, *first->getOutput(0), c2, k2, 1, lname + ".cv2", 0, g);

    const bool addResidual = shortcut && c1 == c2;
    if (!addResidual) {
        return second;
    }
    return network->addElementWise(input, *second->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// Distribution-focal-loss style decoding head.
//
// The input is reshaped to (kBatchSize, 4, 16, grid) and transposed to
// (kBatchSize, 16, 4, grid); a softmax is taken over dimension 1 (the 16-wide axis),
// a bias-free 1x1 convolution with a single output map (weights looked up as
// weightMap[lname]) collapses that axis, and the result is reshaped to
// (kBatchSize, 4, grid).
//
// `s` and `p` are the convolution stride and padding (applied to both spatial
// dimensions). `ch` and `k` are accepted but not read by this implementation.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
    // (B, 4 * 16, grid) -> (B, 4, 16, grid) -> (B, 16, 4, grid)
    nvinfer1::IShuffleLayer* toBins = network->addShuffle(input);
    toBins->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid});
    toBins->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3});

    // Axes are given as a bitmask: bit 1 selects dimension 1.
    nvinfer1::ISoftMaxLayer* binProb = network->addSoftMax(*toBins->getOutput(0));
    binProb->setAxes(1 << 1);

    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* project =
            network->addConvolutionNd(*binProb->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], noBias);
    project->setStrideNd(nvinfer1::DimsHW{s, s});
    project->setPaddingNd(nvinfer1::DimsHW{p, p});

    nvinfer1::IShuffleLayer* flattened = network->addShuffle(*project->getOutput(0));
    flattened->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid});
    return flattened;
}

// Creates the "YoloLayer_TRT" (version "1") plugin and attaches it to `network`.
//
// The plugin receives a single INT32 field named "combinedInfo" laid out as
//   [kNumClass, kInputW, kInputH, kMaxNumOutputBbox, px_arry[0], ..., px_arry[px_arry_num - 1]].
//
// Parameters:
//   network     - network definition that receives the plugin layer.
//   dets        - concatenation layers whose first outputs become the plugin inputs.
//   px_arry     - array of `px_arry_num` integers appended after the four network constants.
//   px_arry_num - number of entries in `px_arry`; must not be negative.
//
// Returns the plugin layer added to the network.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num) {
    assert(network);
    assert(px_arry_num >= 0);
    assert(px_arry != nullptr || px_arry_num == 0);

    // The lookup returns nullptr when the plugin has not been registered; fail here
    // instead of dereferencing a null creator below.
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    assert(creator && "YoloLayer_TRT plugin creator is not registered");

    // Packing: class_num, input_w, input_h, max_out
    const int netinfo_count = 4;
    const int total_count = netinfo_count + px_arry_num;

    std::vector<int> combinedInfo(total_count);

    // Fill in the first 4 elements
    combinedInfo[0] = kNumClass;
    combinedInfo[1] = kInputW;
    combinedInfo[2] = kInputH;
    combinedInfo[3] = kMaxNumOutputBbox;

    // Copy the contents of px_arry into the combinedInfo vector
    std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);

    // PluginField describing the packed integer block
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = static_cast<int>(combinedInfo.size());

    // Collection with exactly that one field
    nvinfer1::PluginFieldCollection pluginFieldCollection;
    pluginFieldCollection.nbFields = 1;
    pluginFieldCollection.fields = &pluginField;

    // Create the plugin object
    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    assert(pluginObject && "failed to create the yololayer plugin");

    // Prepare input tensors for the YOLO layer: first output of every detection branch
    std::vector<nvinfer1::ITensor*> inputTensors;
    inputTensors.reserve(dets.size());
    for (auto det : dets) {
        assert(det);
        inputTensors.push_back(det->getOutput(0));
    }

    // Add the plugin to the network
    nvinfer1::IPluginV2Layer* yoloLayer =
            network->addPluginV2(inputTensors.data(), static_cast<int>(inputTensors.size()), *pluginObject);
    assert(yoloLayer);

    return yoloLayer;
}

// Bias-free k x k convolution followed by batch-norm (eps = 1e-3) and, when `act` is
// true, a SiLU activation built as x * sigmoid(x).
//
// Parameters:
//   c_out   - number of convolution output maps.
//   lname   - weight prefix; "<lname>.conv.weight" and the "<lname>.bn" batch-norm are used.
//   k, s    - square kernel size and stride.
//   padding - not consulted: the padding actually applied is k / 2 on both axes.
//   g       - number of groups.
//   act     - when false the batch-norm scale layer is returned without activation.
//
// Prints "Conv name: <lname>" to stdout before adding the convolution.
nvinfer1::ILayer* Conv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                       nvinfer1::ITensor& input, int c_out, std::string lname, int k, int s, int padding, int g,
                       bool act) {
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    cout << "Conv name: " << lname << endl;

    nvinfer1::IConvolutionLayer* convLayer = network->addConvolutionNd(
            input, c_out, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], noBias);
    assert(convLayer);
    convLayer->setStrideNd(nvinfer1::DimsHW{s, s});
    // auto pad: half the kernel size on both spatial axes
    const int pad = k / 2;
    convLayer->setPaddingNd(nvinfer1::DimsHW{pad, pad});
    convLayer->setNbGroups(g);

    nvinfer1::IScaleLayer* norm = addBatchNorm2d(network, weightMap, *convLayer->getOutput(0), lname + ".bn", 1e-3);
    if (!act) {
        return norm;
    }

    // SiLU(x) = x * sigmoid(x)
    nvinfer1::ITensor& normOut = *norm->getOutput(0);
    nvinfer1::IActivationLayer* gate = network->addActivation(normOut, nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* silu =
            network->addElementWise(normOut, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Depth-wise style convolution module: a bias-free convolution whose group count is
// set to `ch` (the number of output maps), followed by batch-norm (eps = 1e-3) and
// SiLU (x * sigmoid(x)).
//
// `k` holds the kernel size as {k[0], k[1]}, `s` is the stride for both axes, and the
// padding is k / 2 per axis. Weights are read from "<lname>.conv.weight" and the
// "<lname>.bn" batch-norm entries.
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname) {
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* depthwise = network->addConvolutionNd(
            input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], noBias);
    assert(depthwise);

    depthwise->setStrideNd(nvinfer1::DimsHW{s, s});
    depthwise->setNbGroups(ch);
    // auto pad
    const int padH = k[0] / 2;
    const int padW = k[1] / 2;
    depthwise->setPaddingNd(nvinfer1::DimsHW{padH, padW});

    nvinfer1::IScaleLayer* norm = addBatchNorm2d(network, weightMap, *depthwise->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor& normOut = *norm->getOutput(0);

    // SiLU(x) = x * sigmoid(x)
    nvinfer1::IActivationLayer* gate = network->addActivation(normOut, nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* silu =
            network->addElementWise(normOut, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// C3k module: two parallel 1x1 convBnSiLU branches ("<lname>.cv1", "<lname>.cv2") of
// width int(c2 * e); the cv1 branch is passed through `n` chained bottlenecks
// ("<lname>.m.<i>", kernel {k, k} for both convolutions, expansion 1.0, `g` groups),
// then both branches are concatenated and fused by a 1x1 convBnSiLU "<lname>.cv3"
// with c2 output maps, which is the layer returned.
nvinfer1::IElementWiseLayer* C3k(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c2,
                                 std::string lname, int n, bool shortcut, int g, float e, int k) {
    const int hidden = static_cast<int>(c2 * e);
    const std::vector<int> pointwise{1, 1};
    const std::vector<int> blockKernel{k, k};

    nvinfer1::IElementWiseLayer* mainBranch = convBnSiLU(network, weightMap, input, hidden, pointwise, 1, lname + ".cv1");
    nvinfer1::IElementWiseLayer* sideBranch = convBnSiLU(network, weightMap, input, hidden, pointwise, 1, lname + ".cv2");

    // Chain the bottlenecks on the main branch.
    nvinfer1::ITensor* chained = mainBranch->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        nvinfer1::ILayer* block = bottleneck(network, weightMap, *chained, hidden, hidden, shortcut, blockKernel,
                                             blockKernel, 1.0f, g, lname + ".m." + std::to_string(idx));
        chained = block->getOutput(0);
    }

    nvinfer1::ITensor* merged[] = {chained, sideBranch->getOutput(0)};
    nvinfer1::IConcatenationLayer* joined = network->addConcatenation(merged, 2);

    return convBnSiLU(network, weightMap, *joined->getOutput(0), c2, pointwise, 1, lname + ".cv3");
}

// C3K2 module.
//
// "<lname>.cv1" (1x1 Conv, 2 * c output maps with c = int(c2 * e)) is split in two
// halves along dimension 1. The second half is fed through `n` chained blocks
// "<lname>.m.<i>" - either C3k (with 2 inner bottlenecks) when `c3k` is true, or a
// 3x3/3x3 bottleneck with expansion 0.5 otherwise. Both halves and every block output
// are concatenated (one extra concatenation per block) and fused by the 1x1
// convBnSiLU "<lname>.cv2" with c2 output maps, which is returned.
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2,
                                  int n, std::string lname, bool c3k, float e, int g, bool shortcut) {
    const int c = int(c2 * float(e));
    nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, 2 * c, lname + ".cv1", 1, 1);

    // Split the cv1 output into two equal halves along the channel dimension.
    nvinfer1::ITensor& stem = *cv1->getOutput(0);
    const nvinfer1::Dims shape = stem.getDimensions();
    const auto half = shape.d[1] / 2;
    const nvinfer1::Dims4 halfSize{shape.d[0], half, shape.d[2], shape.d[3]};
    const nvinfer1::Dims4 unitStride{1, 1, 1, 1};

    nvinfer1::ISliceLayer* lower = network->addSlice(stem, nvinfer1::Dims4{0, 0, 0, 0}, halfSize, unitStride);
    nvinfer1::ISliceLayer* upper = network->addSlice(stem, nvinfer1::Dims4{0, half, 0, 0}, halfSize, unitStride);

    nvinfer1::ITensor* halves[] = {lower->getOutput(0), upper->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(halves, 2);
    nvinfer1::ITensor* current = upper->getOutput(0);

    for (int idx = 0; idx < n; ++idx) {
        const std::string blockName = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* block = nullptr;
        if (c3k) {
            block = C3k(network, weightMap, *current, c, blockName, 2, shortcut, g);
        } else {
            block = bottleneck(network, weightMap, *current, c, c, shortcut, {3, 3}, {3, 3}, 0.5, g, blockName);
        }
        current = block->getOutput(0);

        // Grow the running concatenation with this block's output.
        nvinfer1::ITensor* grown[] = {cat->getOutput(0), current};
        cat = network->addConcatenation(grown, 2);
    }

    return convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

// Debug helper: prints the dimensions of `input` to stdout, one extent per line,
// framed by two separator lines.
//
// Exactly `nbDims` extents are printed. The previous version always printed d[0..3],
// which showed unrelated entries for tensors of rank < 4 and dropped entries for
// rank > 4. Output for rank-4 tensors is unchanged.
void cout_dim(nvinfer1::ITensor& input) {
    const nvinfer1::Dims d = input.getDimensions();

    std::cout << "======================= Dimensions =================================" << std::endl;
    // A non-positive nbDims simply prints no extent lines.
    for (int i = 0; i < d.nbDims; ++i) {
        std::cout << "          " << d.d[i] << std::endl;
    }
    std::cout << "======================================================================" << std::endl;
}

// Area-attention block built from TensorRT primitives.
//
// `input` is read as a 4-D tensor (B, C, H, W). The function
//   1. computes qk (2 * all_head_dim maps) and v (all_head_dim maps) with 1x1 Conv
//      modules without activation, and a positional term `pe` as a 5x5 Conv with
//      `dim` groups applied to v;
//   2. flattens qk and v to (B, N, channels) with N = H * W and, when area > 1,
//      regroups them to (B * area, N / area, channels);
//   3. splits qk into q and k, reshapes q/k/v into heads and computes
//      softmax(q^T k / sqrt(head_dim)) explicitly (max-subtract, exp, sum, divide)
//      over the last dimension;
//   4. applies the attention weights to v, restores the (B, C, H, W) layout, adds `pe`
//      and runs the 1x1 "<lname>.proj" Conv without activation.
//
// Parameters:
//   network   - network definition that receives the layers.
//   weightMap - weight table forwarded to Conv for ".qk", ".v", ".pe" and ".proj".
//   input     - 4-D input tensor.
//   dim       - channel width of the block; head_dim = dim / num_heads.
//   num_heads - number of attention heads.
//   lname     - weight-name prefix.
//   area      - number of areas the token axis is divided into (no regrouping when <= 1).
//
// Returns the ".proj" Conv layer.
//
// NOTE(review): the area reshapes use C (input channels) while the convolutions use
// all_head_dim; this presumably requires dim == C and dim % num_heads == 0 - confirm.
// NOTE(review): the three malloc'ed scalar buffers below are never freed in this
// function; they are referenced by the scale layer, so presumably they must stay
// valid until the engine is built - confirm ownership.
nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int dim, int num_heads, std::string lname, int area) {

    nvinfer1::Dims d_input = input.getDimensions();
    int B = d_input.d[0];
    int C = d_input.d[1];
    int H = d_input.d[2];
    int W = d_input.d[3];
    int N = W * H;  // number of spatial positions (tokens)
    int head_dim = dim / num_heads;
    int all_head_dim = head_dim * num_heads;

    // qk: 1x1 Conv -> (B, 2 * all_head_dim, H, W) -> (B, 2 * all_head_dim, N) -> (B, N, 2 * all_head_dim)
    nvinfer1::ILayer* qk = Conv(network, weightMap, input, all_head_dim * 2, lname + ".qk", 1, 1, 0, 1, false);
    nvinfer1::IShuffleLayer* qk_flatten_t = network->addShuffle(*qk->getOutput(0));
    qk_flatten_t->setReshapeDimensions(nvinfer1::Dims3{B, -1, N});
    qk_flatten_t->setSecondTranspose(nvinfer1::Permutation{0, 2, 1});

    // v: 1x1 Conv -> (B, all_head_dim, H, W) -> (B, all_head_dim, N) -> (B, N, all_head_dim)
    nvinfer1::ILayer* v = Conv(network, weightMap, input, all_head_dim, lname + ".v", 1, 1, 0, 1, false);
    nvinfer1::IShuffleLayer* v_flatten_t = network->addShuffle(*v->getOutput(0));
    v_flatten_t->setReshapeDimensions(nvinfer1::Dims3{B, -1, N});
    v_flatten_t->setSecondTranspose(nvinfer1::Permutation{0, 2, 1});  // (B, N, all_head_dim)

    // Positional term: 5x5 Conv with `dim` groups on the un-flattened v, no activation.
    nvinfer1::ILayer* pe = Conv(network, weightMap, *v->getOutput(0), dim, lname + ".pe", 5, 1, 2, dim, false);

    nvinfer1::ITensor* q_k = qk_flatten_t->getOutput(0);
    nvinfer1::ITensor* v_ = v_flatten_t->getOutput(0);
    if (area > 1) {
        // Fold the areas into the batch axis: (B, N, ch) -> (B * area, N / area, ch).
        B = B * area;
        N = N / area;

        nvinfer1::IShuffleLayer* qk_reshape = network->addShuffle(*qk_flatten_t->getOutput(0));
        qk_reshape->setReshapeDimensions(nvinfer1::Dims3{B, N, C * 2});
        nvinfer1::IShuffleLayer* v_reshape = network->addShuffle(*v_flatten_t->getOutput(0));
        v_reshape->setReshapeDimensions(nvinfer1::Dims3{B, N, C});

        q_k = qk_reshape->getOutput(0);
        v_ = v_reshape->getOutput(0);
    }
    // Split the last axis of qk in two halves: first half is q, second half is k.
    nvinfer1::Dims q_k_dim = q_k->getDimensions();
    nvinfer1::ISliceLayer* q =
            network->addSlice(*q_k, nvinfer1::Dims3{0, 0, 0},
                              nvinfer1::Dims3{q_k_dim.d[0], q_k_dim.d[1], q_k_dim.d[2] / 2}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* k =
            network->addSlice(*q_k, nvinfer1::Dims3{0, 0, q_k_dim.d[2] / 2},
                              nvinfer1::Dims3{q_k_dim.d[0], q_k_dim.d[1], q_k_dim.d[2] / 2}, nvinfer1::Dims3{1, 1, 1});

    // Expose the head axis: (B, N, ch) -> (B, N, num_heads, head_dim)
    nvinfer1::IShuffleLayer* q_reshape = network->addShuffle(*q->getOutput(0));
    q_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim});
    nvinfer1::IShuffleLayer* k_reshape = network->addShuffle(*k->getOutput(0));
    k_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim});
    nvinfer1::IShuffleLayer* v_reshape = network->addShuffle(*v_);
    v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim});

    // (B, N, num_head, head_dim)--->(B, num_head, head_dim, N)
    nvinfer1::IShuffleLayer* q_t_view = network->addShuffle(*q_reshape->getOutput(0));
    q_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1});

    nvinfer1::IShuffleLayer* k_t_view = network->addShuffle(*k_reshape->getOutput(0));
    k_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1});
    nvinfer1::IShuffleLayer* v_t_view = network->addShuffle(*v_reshape->getOutput(0));
    v_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1});

    // q^T k: (B, num_head, N, head_dim) x (B, num_head, head_dim, N) -> (B, num_head, N, N)
    nvinfer1::IShuffleLayer* q_T = network->addShuffle(*q_t_view->getOutput(0));
    q_T->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});  // (B, num_head, N, head_dim)
    nvinfer1::IMatrixMultiplyLayer* q_mul_k =
            network->addMatrixMultiply(*q_T->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k_t_view->getOutput(0),
                                       nvinfer1::MatrixOperation::kNONE);

    // Uniform scale layer multiplying the scores by 1 / sqrt(head_dim) (shift 0, power 1).
    float scale = 1.0 / sqrt(head_dim);
    float* scale_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    scale_val[0] = scale;
    nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1};  // scale
    float* shift_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    shift_val[0] = 0;
    nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1};  // shift
    float* power_val = reinterpret_cast<float*>(malloc(sizeof(float) * 1));
    power_val[0] = 1;
    nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1};  // power
    nvinfer1::IScaleLayer* q_mul_k_scale =
            network->addScale(*q_mul_k->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w);

    // Explicit softmax over dimension 3 (axes bitmask 1 << 3, keepDims = true):
    // exp(x - max(x)) / sum(exp(x - max(x)))
    nvinfer1::IReduceLayer* attn_max =
            network->addReduce(*q_mul_k_scale->getOutput(0), nvinfer1::ReduceOperation::kMAX, 1 << 3, true);

    nvinfer1::IElementWiseLayer* attn_sub = network->addElementWise(
            *q_mul_k_scale->getOutput(0), *attn_max->getOutput(0), nvinfer1::ElementWiseOperation::kSUB);
    nvinfer1::IUnaryLayer* attn_exp = network->addUnary(*attn_sub->getOutput(0), nvinfer1::UnaryOperation::kEXP);
    nvinfer1::IReduceLayer* attn_sum =
            network->addReduce(*attn_exp->getOutput(0), nvinfer1::ReduceOperation::kSUM, 1 << 3, true);

    nvinfer1::IElementWiseLayer* attn_div = network->addElementWise(*attn_exp->getOutput(0), *attn_sum->getOutput(0),
                                                                    nvinfer1::ElementWiseOperation::kDIV);
    // Debug print of the attention tensor's dimensions.
    cout_dim(*attn_div->getOutput(0));

    // Transpose the last two axes of the attention weights before applying them to v.
    nvinfer1::IShuffleLayer* attn_t = network->addShuffle(*attn_div->getOutput(0));
    attn_t->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2});

    // v x attn^T: (B, num_head, head_dim, N) x (B, num_head, N, N) -> (B, num_head, head_dim, N)
    nvinfer1::IMatrixMultiplyLayer* attn_v =
            network->addMatrixMultiply(*v_t_view->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attn_t->getOutput(0),
                                       nvinfer1::MatrixOperation::kNONE);

    // (B, num_head, head_dim, N) -> (B, N, num_head, head_dim)
    nvinfer1::IShuffleLayer* attn_v_t = network->addShuffle(*attn_v->getOutput(0));
    attn_v_t->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2});
    nvinfer1::ITensor* attn_temp = attn_v_t->getOutput(0);
    if (area > 1) {
        // Undo the area folding: back to the original batch size and token count.
        B = B / area;
        N = N * area;

        nvinfer1::IShuffleLayer* attn_v_t_r = network->addShuffle(*attn_v_t->getOutput(0));
        attn_v_t_r->setReshapeDimensions(nvinfer1::Dims3{B, N, C});
        attn_temp = attn_v_t_r->getOutput(0);
    }
    // Restore the image layout: reshape to (B, H, W, C), then permute to (B, C, H, W).
    nvinfer1::IShuffleLayer* attn_x = network->addShuffle(*attn_temp);
    attn_x->setReshapeDimensions(nvinfer1::Dims4{B, H, W, C});
    attn_x->setSecondTranspose(nvinfer1::Permutation{0, 3, 1, 2});
    // Add the positional term and project with a 1x1 Conv (no activation).
    nvinfer1::IElementWiseLayer* x_add_pp =
            network->addElementWise(*attn_x->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    nvinfer1::ILayer* proj = Conv(network, weightMap, *x_add_pp->getOutput(0), dim, lname + ".proj", 1, 1, 0, 1, false);

    return proj;
}

// Attention block with two residual connections:
//   y   = x + AAttn(x)                       ("<lname>.attn")
//   out = y + mlp.1(mlp.0(y))
// where mlp.0 is a 1x1 Conv with int(dim * mlp_ratio) output maps and activation,
// and mlp.1 is a 1x1 Conv back to `dim` maps without activation.
// Returns the final element-wise sum layer.
nvinfer1::IElementWiseLayer* ABlock(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int dim, int num_heads, std::string lname, float mlp_ratio, int area) {
    // First residual: x + attn(x)
    nvinfer1::ILayer* attention = AAttn(network, weightMap, input, dim, num_heads, lname + ".attn", area);
    nvinfer1::IElementWiseLayer* afterAttn =
            network->addElementWise(input, *attention->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    nvinfer1::ITensor& attnOut = *afterAttn->getOutput(0);

    // Second residual: y + mlp(y)
    const int hidden = int(dim * mlp_ratio);
    nvinfer1::ILayer* expand = Conv(network, weightMap, attnOut, hidden, lname + ".mlp.0", 1, 1, 0, 1, true);
    nvinfer1::ILayer* project =
            Conv(network, weightMap, *expand->getOutput(0), dim, lname + ".mlp.1", 1, 1, 0, 1, false);

    return network->addElementWise(attnOut, *project->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// A2C2f module.
//
// "<lname>.cv1" (1x1 Conv, c_ = int(c2 * e) maps) feeds a chain of `n` stages. Each
// stage "<lname>.m.<i>" is either two stacked ABlocks (".0" and ".1", with c_ / 32
// heads) when `a2` is true, or a C3k with 2 inner bottlenecks otherwise. The cv1
// output and every stage output are concatenated on axis 1 and fused by the 1x1 Conv
// "<lname>.cv2" with c2 maps.
//
// When both `a2` and `residual` are set the result is
//   input + gamma * cv2(...)
// with gamma read from "<lname>.gamma" and shaped (1, c2, 1, 1); otherwise the cv2
// layer is returned directly. c_ must be a multiple of 32 (asserted).
nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int c2, int n, std::string lname, bool a2, int area, bool residual,
                        float mlp_ratio, float e, int g, bool shortcut) {
    const int c_ = static_cast<int>(c2 * e);
    assert(c_ % 32 == 0 && "Dimension of ABlock must be a multiple of 32");
    const int num_heads = c_ / 32;

    nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, c_, lname + ".cv1", 1, 1);
    nvinfer1::ITensor* current = cv1->getOutput(0);
    std::vector<nvinfer1::ITensor*> y{current};

    for (int idx = 0; idx < n; ++idx) {
        const std::string stage = lname + ".m." + std::to_string(idx);
        if (a2) {
            // Two attention blocks back to back.
            nvinfer1::ILayer* first = ABlock(network, weightMap, *current, c_, num_heads, stage + ".0", mlp_ratio, area);
            nvinfer1::ILayer* second =
                    ABlock(network, weightMap, *first->getOutput(0), c_, num_heads, stage + ".1", mlp_ratio, area);
            current = second->getOutput(0);
        } else {
            // C3k
            nvinfer1::ILayer* block = C3k(network, weightMap, *current, c_, stage, 2, shortcut, g);
            current = block->getOutput(0);
        }
        y.push_back(current);
    }

    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(y.data(), static_cast<int>(y.size()));
    cat->setAxis(1);
    nvinfer1::ILayer* cv2 = Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv2", 1, 1);

    if (!(a2 && residual)) {
        return cv2;
    }

    std::cout << lname << " applying residual connection with gamma" << std::endl;

    // input + gamma * cv2, with gamma broadcast over batch and spatial dimensions
    nvinfer1::Weights gamma = weightMap[lname + ".gamma"];
    nvinfer1::IConstantLayer* gammaConst = network->addConstant(nvinfer1::Dims4{1, c2, 1, 1}, gamma);
    nvinfer1::IElementWiseLayer* scaled = network->addElementWise(*gammaConst->getOutput(0), *cv2->getOutput(0),
                                                                  nvinfer1::ElementWiseOperation::kPROD);
    return network->addElementWise(input, *scaled->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// Depth-wise separable convolution module:
//   dw: k x k convolution with c_in output maps and c_in groups, stride s, dilation d
//       ("<lname>.dw.weight", no bias)
//   pw: 1x1 convolution to c_out maps ("<lname>.pw.weight", no bias)
//   followed by batch-norm "<lname>.bn" (eps = 1e-3) and SiLU (x * sigmoid(x)).
//
// A padding argument of 0 is treated as "derive it": p = d * (k - 1) / 2.
// `bias` is accepted but not read by this implementation.
nvinfer1::IElementWiseLayer* DSConv(nvinfer1::INetworkDefinition* network,
                                    std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                    int c_in, int c_out, std::string lname, int k, int s, int p, int d, bool bias) {
    const int pad = (p == 0) ? (d * (k - 1)) / 2 : p;
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};

    // Depth-wise stage
    nvinfer1::IConvolutionLayer* depthwise =
            network->addConvolutionNd(input, c_in, nvinfer1::DimsHW{k, k}, weightMap[lname + ".dw.weight"], noBias);
    depthwise->setStrideNd(nvinfer1::DimsHW{s, s});
    depthwise->setPaddingNd(nvinfer1::DimsHW{pad, pad});
    depthwise->setNbGroups(c_in);
    depthwise->setDilationNd(nvinfer1::DimsHW{d, d});

    // Point-wise stage
    nvinfer1::IConvolutionLayer* pointwise = network->addConvolutionNd(
            *depthwise->getOutput(0), c_out, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".pw.weight"], noBias);
    pointwise->setStrideNd(nvinfer1::DimsHW{1, 1});
    pointwise->setPaddingNd(nvinfer1::DimsHW{0, 0});
    pointwise->setNbGroups(1);
    pointwise->setDilationNd(nvinfer1::DimsHW{1, 1});

    nvinfer1::IScaleLayer* norm = addBatchNorm2d(network, weightMap, *pointwise->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor& normOut = *norm->getOutput(0);

    // SiLU(x) = x * sigmoid(x)
    nvinfer1::IActivationLayer* gate = network->addActivation(normOut, nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* silu =
            network->addElementWise(normOut, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Bottleneck made of two DSConv modules.
//
// "<lname>.cv1": c1 -> int(e * c2) maps, kernel k1, dilation 1.
// "<lname>.cv2": int(e * c2) -> c2 maps, kernel k2, dilation d2.
// Both use stride 1 and derived padding. When c1 == c2 and `shortcut` is set, the
// input is added element-wise to the cv2 output and the sum layer is returned;
// otherwise the cv2 layer is returned.
nvinfer1::ILayer* DSBottleneck(nvinfer1::INetworkDefinition* network,
                               std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                               int c2, std::string lname, bool shortcut, float e, int k1, int k2, int d2) {
    const int hidden = static_cast<int>(e * c2);

    nvinfer1::IElementWiseLayer* first =
            DSConv(network, weightMap, input, c1, hidden, lname + ".cv1", k1, 1, 0, 1, false);
    nvinfer1::IElementWiseLayer* second =
            DSConv(network, weightMap, *first->getOutput(0), hidden, c2, lname + ".cv2", k2, 1, 0, d2, false);

    const bool addResidual = (c1 == c2) && shortcut;
    if (!addResidual) {
        return second;
    }
    return network->addElementWise(input, *second->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// DSC3k module: two parallel Conv branches ("<lname>.cv1", "<lname>.cv2") with
// int(e * c2) maps each; the cv1 branch runs through `n` chained DSBottlenecks
// ("<lname>.m.<i>", expansion 1.0, kernels k1/k2, dilation d2). Both branches are
// concatenated and fused by the Conv "<lname>.cv3" with c2 maps, which is returned.
// `g` is accepted but not read by this implementation.
nvinfer1::ILayer* DSC3k(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int c2, int n, std::string lname, bool shortcut, int g, float e,
                        int k1, int k2, int d2) {
    const int hidden = static_cast<int>(e * c2);

    nvinfer1::ILayer* mainBranch = Conv(network, weightMap, input, hidden, lname + ".cv1", 1, 1);
    nvinfer1::ILayer* sideBranch = Conv(network, weightMap, input, hidden, lname + ".cv2", 1, 1);

    // Chain the bottlenecks on the main branch.
    nvinfer1::ITensor* chained = mainBranch->getOutput(0);
    for (int idx = 0; idx < n; ++idx) {
        nvinfer1::ILayer* block = DSBottleneck(network, weightMap, *chained, hidden, hidden,
                                               lname + ".m." + std::to_string(idx), shortcut, 1.0, k1, k2, d2);
        chained = block->getOutput(0);
    }

    nvinfer1::ITensor* merged[] = {chained, sideBranch->getOutput(0)};
    nvinfer1::IConcatenationLayer* joined = network->addConcatenation(merged, 2);

    return Conv(network, weightMap, *joined->getOutput(0), c2, lname + ".cv3", 1, 1);
}

// DSC3K2 module.
//
// "<lname>.cv1" (Conv with 2 * c maps, c = int(e * c2)) is split in two halves along
// dimension 1. The second half feeds `n` chained blocks "<lname>.m.<i>": DSC3k (with
// 2 inner bottlenecks, expansion 1.0) when `dsc3k` is true, DSBottleneck (expansion
// 1.0) otherwise. Both halves and every block output are concatenated once and fused
// by the Conv "<lname>.cv2" with c2 maps, which is returned.
nvinfer1::ILayer* DSC3K2(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int c2, std::string lname, int n, bool dsc3k, float e, int g,
                         bool shortcut, int k1, int k2, int d2) {
    const int c = static_cast<int>(e * c2);
    nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, 2 * c, lname + ".cv1");

    // Split the cv1 output into two equal halves along the channel dimension.
    nvinfer1::ITensor& stem = *cv1->getOutput(0);
    const nvinfer1::Dims shape = stem.getDimensions();
    const auto half = shape.d[1] / 2;
    const nvinfer1::Dims4 halfSize{shape.d[0], half, shape.d[2], shape.d[3]};
    const nvinfer1::Dims4 unitStride{1, 1, 1, 1};

    nvinfer1::ISliceLayer* lower = network->addSlice(stem, nvinfer1::Dims4{0, 0, 0, 0}, halfSize, unitStride);
    nvinfer1::ISliceLayer* upper = network->addSlice(stem, nvinfer1::Dims4{0, half, 0, 0}, halfSize, unitStride);

    std::vector<nvinfer1::ITensor*> y = {lower->getOutput(0), upper->getOutput(0)};
    nvinfer1::ITensor* current = upper->getOutput(0);

    for (int idx = 0; idx < n; ++idx) {
        const std::string blockName = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* block = nullptr;
        if (dsc3k) {
            block = DSC3k(network, weightMap, *current, c, 2, blockName, shortcut, g, 1.0, k1, k2, d2);
        } else {
            block = DSBottleneck(network, weightMap, *current, c, c, blockName, shortcut, 1.0, k1, k2, d2);
        }
        current = block->getOutput(0);
        y.push_back(current);
    }

    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(y.data(), y.size());
    return Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv2");
}

// Fuses three feature maps into one.
//
//   input[0] is down-sampled with a 2x2 average pooling (stride 2, no padding),
//   input[1] is used as is,
//   input[2] is up-sampled by 2 in the last two dimensions (nearest neighbour),
// then the three tensors are concatenated on axis 1 and passed through the Conv
// module "<lname>.conv_out" with `c_in` output maps, which is returned.
//
// Parameters:
//   input          - must hold at least three non-null tensors (asserted).
//   c_in           - number of output maps of the fusing Conv.
//   channel_adjust - accepted but not read by this implementation.
//   lname          - weight-name prefix.
nvinfer1::ILayer* FuseModule(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             std::vector<nvinfer1::ITensor*>& input, int c_in, bool channel_adjust, std::string lname) {
    (void)channel_adjust;

    // The three entries are indexed unconditionally below; reject short or null input
    // up front instead of reading past the end of the vector.
    assert(input.size() >= 3 && "FuseModule expects three input tensors");
    assert(input[0] && input[1] && input[2]);

    nvinfer1::IPoolingLayer* x1_ds =
            network->addPoolingNd(*input[0], nvinfer1::PoolingType::kAVERAGE, nvinfer1::DimsHW{2, 2});
    assert(x1_ds);
    x1_ds->setStrideNd(nvinfer1::DimsHW{2, 2});
    x1_ds->setPaddingNd(nvinfer1::DimsHW{0, 0});

    nvinfer1::IResizeLayer* x3_up = network->addResize(*input[2]);
    assert(x3_up);
    // Scale factors per dimension: batch and channel untouched, height and width doubled.
    float scale[] = {1, 1, 2, 2};
    x3_up->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    x3_up->setScales(scale, 4);

    nvinfer1::ITensor* inputTensor[] = {x1_ds->getOutput(0), input[1], x3_up->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor, 3);
    assert(cat);
    cat->setAxis(1);
    nvinfer1::ILayer* conv_out = Conv(network, weightMap, *cat->getOutput(0), c_in, lname + ".conv_out");
    return conv_out;
}

// Adaptive hyperedge generator built from TensorRT primitives.
//
// `input` is read as a 3-D tensor (B, N, D). The function
//   1. builds a context vector by reducing dimension 1 with mean ("mean"), max ("max")
//      or both concatenated (any other `context` value);
//   2. maps the context through the fully connected layer "<lname>.context_net" to
//      num_hyperedges * node_dim values, reshapes them to (B, num_hyperedges, D) and
//      adds the constant "<lname>.prototype_base" (1, num_hyperedges, node_dim);
//   3. projects the input with "<lname>.pre_head_proj", splits input and prototypes
//      into `num_heads` heads of size node_dim / num_heads and multiplies them;
//   4. divides the logits by sqrt(head_dim), averages over the head dimension and
//      applies a softmax over dimension 1.
//
// Parameters:
//   network        - network definition that receives the layers.
//   weightMap      - weight table (by-value copy) holding the entries named above.
//   input          - 3-D input tensor.
//   node_dim       - feature width of the prototypes / projection.
//   num_hyperedges - number of hyperedge prototypes.
//   lname          - weight-name prefix.
//   num_heads      - number of heads.
//   context        - "mean", "max", or anything else for mean and max combined.
//
// Returns the final softmax layer.
//
// NOTE(review): the reshape to (B, num_hyperedges, D) after a fully connected layer with
// num_hyperedges * node_dim outputs presumably requires D == node_dim - confirm.
nvinfer1::ISoftMaxLayer* AdaHyperedgeGen(nvinfer1::INetworkDefinition* network,
                                         std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                         int node_dim, int num_hyperedges, std::string lname, int num_heads,
                                         std::string context) {

    nvinfer1::Dims dim_input = input.getDimensions();
    int B = dim_input.d[0];
    int N = dim_input.d[1];
    int D = dim_input.d[2];
    int head_dim = node_dim / num_heads;
    // Context vector: reduce dimension 1 (axes bitmask 1 << 1) without keeping it.
    nvinfer1::ITensor* context_cat = nullptr;
    if (context == "mean") {
        nvinfer1::IReduceLayer* context_mean =
                network->addReduce(input, nvinfer1::ReduceOperation::kAVG, 1 << 1, false);
        context_cat = context_mean->getOutput(0);
    } else if (context == "max") {
        nvinfer1::IReduceLayer* context_max = network->addReduce(input, nvinfer1::ReduceOperation::kMAX, 1 << 1, false);
        context_cat = context_max->getOutput(0);
    } else {
        // Any other value: concatenate the mean and max contexts.
        nvinfer1::IReduceLayer* context_mean =
                network->addReduce(input, nvinfer1::ReduceOperation::kAVG, 1 << 1, false);
        nvinfer1::IReduceLayer* context_max = network->addReduce(input, nvinfer1::ReduceOperation::kMAX, 1 << 1, false);
        nvinfer1::ITensor* inputTensor[] = {context_mean->getOutput(0), context_max->getOutput(0)};
        nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor, 2);
        // setAxis takes an axis index, not a bitmask; `1 << 0` evaluates to 1.
        cat->setAxis(1 << 0);
        context_cat = cat->getOutput(0);
    }

    // Append two unit dimensions so the context can feed the fully connected layer.
    nvinfer1::IShuffleLayer* context_cat_dim4 = network->addShuffle(*context_cat);
    context_cat_dim4->setReshapeDimensions(
            nvinfer1::Dims4{context_cat->getDimensions().d[0], context_cat->getDimensions().d[1], 1, 1});
    nvinfer1::IFullyConnectedLayer* prototypes_offsets_ = network->addFullyConnected(
            *context_cat_dim4->getOutput(0), num_hyperedges * node_dim, weightMap[lname + ".context_net.weight"],
            weightMap[lname + ".context_net.bias"]);
    nvinfer1::IShuffleLayer* prototypes_offsets = network->addShuffle(*prototypes_offsets_->getOutput(0));
    prototypes_offsets->setReshapeDimensions(nvinfer1::Dims3{B, num_hyperedges, D});
    // prototype_offsets = self.context_net(context_cat).view(B, self.num_hyperedges, D)

    nvinfer1::Weights prototype_base_wts = weightMap[lname + ".prototype_base"];
    nvinfer1::IConstantLayer* prototype_base =
            network->addConstant(nvinfer1::Dims3{1, num_hyperedges, node_dim}, prototype_base_wts);
    nvinfer1::IElementWiseLayer* prototypes = network->addElementWise(
            *prototype_base->getOutput(0), *prototypes_offsets->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    // prototypes = self.prototype_base.unsqueeze(0) + prototype_offsets

    // Flatten batch and token axes so every token goes through the projection separately.
    nvinfer1::IShuffleLayer* input_dim4 = network->addShuffle(input);
    input_dim4->setReshapeDimensions(nvinfer1::Dims4{B * N, D, 1, 1});
    nvinfer1::IFullyConnectedLayer* X_proj =
            network->addFullyConnected(*input_dim4->getOutput(0), node_dim, weightMap[lname + ".pre_head_proj.weight"],
                                       weightMap[lname + ".pre_head_proj.bias"]);
    // X_proj = self.pre_head_proj(X)

    nvinfer1::IShuffleLayer* X_heads = network->addShuffle(*X_proj->getOutput(0));
    X_heads->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim});
    X_heads->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3});
    // X_heads = X_proj.view(B, N, self.num_heads, self.head_dim).transpose(1, 2)

    nvinfer1::IShuffleLayer* proto_heads = network->addShuffle(*prototypes->getOutput(0));
    proto_heads->setReshapeDimensions(nvinfer1::Dims4{B, num_hyperedges, num_heads, head_dim});
    proto_heads->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3});
    // proto_heads = prototypes.view(B, self.num_hyperedges, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

    nvinfer1::IShuffleLayer* X_heads_flat = network->addShuffle(*X_heads->getOutput(0));
    X_heads_flat->setReshapeDimensions(nvinfer1::Dims3{B * num_heads, N, head_dim});
    // X_heads_flat = X_heads.reshape(B * self.num_heads, N, self.head_dim)

    nvinfer1::IShuffleLayer* proto_heads_flat = network->addShuffle(*proto_heads->getOutput(0));
    proto_heads_flat->setReshapeDimensions(nvinfer1::Dims3{B * num_heads, num_hyperedges, head_dim});
    proto_heads_flat->setSecondTranspose(nvinfer1::Permutation{0, 2, 1});
    // proto_heads_flat = proto_heads.reshape(B * self.num_heads, self.num_hyperedges, self.head_dim).transpose(1, 2)

    // (B * heads, N, head_dim) x (B * heads, head_dim, num_hyperedges) -> (B * heads, N, num_hyperedges)
    nvinfer1::IMatrixMultiplyLayer* logits =
            network->addMatrixMultiply(*X_heads_flat->getOutput(0), nvinfer1::MatrixOperation::kNONE,
                                       *proto_heads_flat->getOutput(0), nvinfer1::MatrixOperation::kNONE);
    // Scalar divisor sqrt(head_dim), held in a heap buffer that is not freed in this function.
    float* scales_ptr = reinterpret_cast<float*>(malloc(sizeof(float)));
    *scales_ptr = sqrt(static_cast<float>(head_dim));
    nvinfer1::Weights scale_wts{nvinfer1::DataType::kFLOAT, scales_ptr, 1};
    nvinfer1::IConstantLayer* scale_layer = network->addConstant(nvinfer1::Dims3{1, 1, 1}, scale_wts);
    // keep weight alive during build
    // NOTE(review): weightMap is a by-value parameter, so this insert only changes the local
    // copy and is not visible to the caller; the buffer stays valid only because it is never freed.
    weightMap[lname + ".scaling"] = scale_wts;
    nvinfer1::IElementWiseLayer* logits_scale = network->addElementWise(
            *logits->getOutput(0), *scale_layer->getOutput(0), nvinfer1::ElementWiseOperation::kDIV);
    // logits = torch.bmm(X_heads_flat, proto_heads_flat) / self.scaling

    // Un-flatten the head axis and average over it (dimension 1, not kept).
    nvinfer1::IShuffleLayer* logits_scale_view = network->addShuffle(*logits_scale->getOutput(0));
    logits_scale_view->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, N, num_hyperedges});
    nvinfer1::IReduceLayer* logits_scale_view_mean =
            network->addReduce(*logits_scale_view->getOutput(0), nvinfer1::ReduceOperation::kAVG, 1 << 1, false);

    // Softmax over dimension 1 of the (B, N, num_hyperedges) result (axes bitmask 1 << 1).
    nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*logits_scale_view_mean->getOutput(0));
    softmax->setAxes(1 << 1);

    return softmax;
}

// Appends the tanh approximation of GELU to `network`:
//     0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// and returns the last element-wise layer of the chain.
// The scalar constants are added with shape {1, 1, 1, 1}; presumably `input` is a
// 4-D tensor so that they broadcast against it -- confirm at the call sites.
nvinfer1::IElementWiseLayer* GELU(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input) {
    // Function-local statics: the Weights structs below only store pointers to these
    // values, so they must stay valid after this function returns.
    static float sqrt_2_over_pi = 0.797885f;  // 0.7978845608
    static float kappa = 0.044715f;
    static float one = 1.0f;
    static float half = 0.5f;

    using Op = nvinfer1::ElementWiseOperation;

    // Adds a single-element FLOAT constant layer and returns its output tensor.
    auto scalar = [network](float& value) -> nvinfer1::ITensor* {
        nvinfer1::Weights w{nvinfer1::DataType::kFLOAT, &value, 1};
        return network->addConstant(nvinfer1::Dims4{1, 1, 1, 1}, w)->getOutput(0);
    };

    // x^2, then x^3
    nvinfer1::ITensor* squared = network->addElementWise(input, input, Op::kPROD)->getOutput(0);
    nvinfer1::ITensor* cubed = network->addElementWise(*squared, input, Op::kPROD)->getOutput(0);

    // kappa * x^3
    nvinfer1::ITensor* kappa_t = scalar(kappa);
    nvinfer1::IElementWiseLayer* cubic_term = network->addElementWise(*cubed, *kappa_t, Op::kPROD);

    // x + kappa * x^3
    nvinfer1::IElementWiseLayer* polynomial = network->addElementWise(input, *cubic_term->getOutput(0), Op::kSUM);

    // sqrt(2/pi) * (x + kappa * x^3)
    nvinfer1::ITensor* sqrt_t = scalar(sqrt_2_over_pi);
    nvinfer1::IElementWiseLayer* tanh_arg = network->addElementWise(*polynomial->getOutput(0), *sqrt_t, Op::kPROD);

    nvinfer1::IActivationLayer* tanh_out =
            network->addActivation(*tanh_arg->getOutput(0), nvinfer1::ActivationType::kTANH);

    // 1 + tanh(...)
    nvinfer1::ITensor* one_t = scalar(one);
    nvinfer1::IElementWiseLayer* shifted = network->addElementWise(*tanh_out->getOutput(0), *one_t, Op::kSUM);

    // x * (1 + tanh(...))
    nvinfer1::IElementWiseLayer* weighted = network->addElementWise(input, *shifted->getOutput(0), Op::kPROD);

    // 0.5 * x * (1 + tanh(...))
    nvinfer1::ITensor* half_t = scalar(half);
    return network->addElementWise(*weighted->getOutput(0), *half_t, Op::kPROD);
}

// Adaptive hypergraph convolution over a token tensor.
//   1. AdaHyperedgeGen produces the participation matrix A ({B, N, num_hyperedges}).
//   2. He = A^T x input aggregates the tokens into hyperedge features.
//   3. He goes through "<lname>.edge_proj.0" (fully connected) + GELU.
//   4. A x He scatters the hyperedge features back to the tokens, followed by
//      "<lname>.node_proj.0" (fully connected) + GELU.
//   5. The result is added to `input` (residual) and that sum layer is returned.
// NOTE(review): the reshapes below swap/merge the leading dimensions without a transpose
// and the final reshape hard-codes a leading 1, so this looks like it assumes a batch
// size of 1 -- confirm before using a larger batch.
nvinfer1::IElementWiseLayer* AdaHGConv(nvinfer1::INetworkDefinition* network,
                                       std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                       int embed_dim, std::string lname, int num_hyperedges, int num_heads,
                                       std::string context) {
    using MatOp = nvinfer1::MatrixOperation;

    // {B, N, num_hyperedges}
    nvinfer1::ISoftMaxLayer* A = AdaHyperedgeGen(network, weightMap, input, embed_dim, num_hyperedges,
                                                 lname + ".edge_generator", num_heads, context);
    nvinfer1::ITensor* A_out = A->getOutput(0);

    // Hyperedge features: A^T x input
    nvinfer1::IMatrixMultiplyLayer* He =
            network->addMatrixMultiply(*A_out, MatOp::kTRANSPOSE, input, MatOp::kNONE);  // 486 layer
    nvinfer1::ITensor* He_out = He->getOutput(0);
    nvinfer1::IShuffleLayer* He_dim4 = network->addShuffle(*He_out);
    const nvinfer1::Dims he_dims = He_out->getDimensions();
    // First two dimensions exchanged by a plain reshape, trailing 1 appended for the FC layer.
    He_dim4->setReshapeDimensions(nvinfer1::Dims4{he_dims.d[1], he_dims.d[0], he_dims.d[2], 1});

    // Edge projection + GELU
    nvinfer1::IFullyConnectedLayer* edge_fc =
            network->addFullyConnected(*He_dim4->getOutput(0), embed_dim, weightMap[lname + ".edge_proj.0.weight"],
                                       weightMap[lname + ".edge_proj.0.bias"]);
    nvinfer1::IElementWiseLayer* edge_act = GELU(network, *edge_fc->getOutput(0));
    nvinfer1::ITensor* edge_act_out = edge_act->getOutput(0);
    nvinfer1::IShuffleLayer* edge_2d = network->addShuffle(*edge_act_out);
    const nvinfer1::Dims edge_dims = edge_act_out->getDimensions();
    edge_2d->setReshapeDimensions(nvinfer1::Dims2{edge_dims.d[0], edge_dims.d[1]});

    // Flatten A to two dimensions: (d[1] * d[0]) x d[2]
    nvinfer1::IShuffleLayer* A_dim2 = network->addShuffle(*A_out);
    const nvinfer1::Dims a_dims = A_out->getDimensions();
    A_dim2->setReshapeDimensions(nvinfer1::Dims2{a_dims.d[1] * a_dims.d[0],  // keep the batch information
                                                 a_dims.d[2]});

    // Scatter back to the tokens: A x He
    nvinfer1::IMatrixMultiplyLayer* scattered = network->addMatrixMultiply(
            *A_dim2->getOutput(0), MatOp::kNONE, *edge_2d->getOutput(0), MatOp::kNONE);
    nvinfer1::ITensor* scattered_out = scattered->getOutput(0);
    nvinfer1::IShuffleLayer* scattered_4d = network->addShuffle(*scattered_out);
    const nvinfer1::Dims scattered_dims = scattered_out->getDimensions();
    scattered_4d->setReshapeDimensions(nvinfer1::Dims4{scattered_dims.d[0], scattered_dims.d[1], 1, 1});

    // Node projection + GELU
    nvinfer1::IFullyConnectedLayer* node_fc =
            network->addFullyConnected(*scattered_4d->getOutput(0), embed_dim,
                                       weightMap[lname + ".node_proj.0.weight"], weightMap[lname + ".node_proj.0.bias"]);
    nvinfer1::IElementWiseLayer* node_act = GELU(network, *node_fc->getOutput(0));
    nvinfer1::ITensor* node_act_out = node_act->getOutput(0);
    nvinfer1::IShuffleLayer* node_3d = network->addShuffle(*node_act_out);
    const nvinfer1::Dims node_dims = node_act_out->getDimensions();
    node_3d->setReshapeDimensions(nvinfer1::Dims3{1, node_dims.d[0], node_dims.d[1]});

    // Residual connection with the original tokens
    return network->addElementWise(*node_3d->getOutput(0), input, nvinfer1::ElementWiseOperation::kSUM);
}

// Wraps AdaHGConv for a 4-D feature map: the {B, C, H, W} input is reshaped to
// {B, C, H*W} and transposed to {B, H*W, C} tokens, run through AdaHGConv
// ("<lname>.hgnn"), then transposed back and reshaped to {B, C, H, W}.
// Returns the final shuffle layer.
nvinfer1::IShuffleLayer* AdaHGComputation(nvinfer1::INetworkDefinition* network,
                                          std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                          int embed_dim, std::string lname, int num_hyperedges, int num_heads,
                                          std::string context) {
    const nvinfer1::Dims in_dims = input.getDimensions();
    const int batch = in_dims.d[0];
    const int channels = in_dims.d[1];
    const int height = in_dims.d[2];
    const int width = in_dims.d[3];
    const nvinfer1::Permutation swap_last_two{0, 2, 1};

    // {B, C, H, W} -> {B, C, H*W} -> {B, H*W, C}
    nvinfer1::IShuffleLayer* to_tokens = network->addShuffle(input);
    to_tokens->setReshapeDimensions(nvinfer1::Dims3{batch, channels, height * width});
    to_tokens->setSecondTranspose(swap_last_two);

    nvinfer1::IElementWiseLayer* conv_out = AdaHGConv(network, weightMap, *to_tokens->getOutput(0), embed_dim,
                                                      lname + ".hgnn", num_hyperedges, num_heads, context);

    // {B, H*W, C} -> {B, C, H*W} -> {B, C, H, W}
    nvinfer1::IShuffleLayer* to_map = network->addShuffle(*conv_out->getOutput(0));
    to_map->setFirstTranspose(swap_last_two);
    to_map->setReshapeDimensions(nvinfer1::Dims4{batch, channels, height, width});

    return to_map;
}

// C3AH block: two parallel convolutions ("<lname>.cv1", "<lname>.cv2") with
// c_ = e * c2 output channels each; the cv1 branch additionally goes through
// AdaHGComputation ("<lname>.m"). Both branches are concatenated and fused by
// "<lname>.cv3" into c2 channels. Returns the cv3 layer.
// The number of attention heads is c_ / 16, so c_ must be a positive multiple of 16.
nvinfer1::ILayer* C3AH(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                       nvinfer1::ITensor& input, int c2, std::string lname, float e, int num_hyperedges,
                       std::string context) {
    int c_ = float(e) * c2;
    // c_ == 0 would satisfy the modulo test but yield num_heads == 0, so require c_ > 0 too.
    assert(c_ > 0 && c_ % 16 == 0 && "Dimension of AdaHGComputation should be a positive multiple of 16");
    int num_heads = c_ / 16;
    nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, c_, lname + ".cv1");
    nvinfer1::ILayer* cv2 = Conv(network, weightMap, input, c_, lname + ".cv2");

    nvinfer1::IShuffleLayer* m = AdaHGComputation(network, weightMap, *cv1->getOutput(0), c_, lname + ".m",
                                                  num_hyperedges, num_heads, context);
    // Concatenate the hypergraph branch with the plain cv2 branch.
    nvinfer1::ITensor* inputTensor[] = {m->getOutput(0), cv2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor, 2);
    nvinfer1::ILayer* cv3 = Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv3");
    return cv3;
}

// HyperACE block.
//   * FuseModule ("<lname>.fuse") merges the input tensors, "<lname>.cv1" expands the
//     result to 3 * c channels (c = int(c2 * e1)), and the output is split into three
//     equal chunks along dimension 1.
//   * The middle chunk feeds both C3AH branches ("<lname>.branch1", "<lname>.branch2").
//   * The last chunk feeds a chain of `n` DSC3k / DSBottleneck stages ("<lname>.m.<i>");
//     every stage output is kept.
//   * Concatenation order: chunk0, branch1 output, chunk2, stage outputs..., branch2 output.
//   * "<lname>.cv2" maps the concatenation to c2 channels; that layer is returned.
nvinfer1::ILayer* HyperACE(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                           std::vector<nvinfer1::ITensor*> input, int c1, int c2, std::string lname, int n,
                           int num_hyperedges, bool dsc3k, bool shortcut, float e1, float e2, std::string context,
                           bool channel_adjust) {
    const int c = int(c2 * e1);

    nvinfer1::ILayer* fuse = FuseModule(network, weightMap, input, c1, channel_adjust, lname + ".fuse");
    nvinfer1::ILayer* cv1 = Conv(network, weightMap, *fuse->getOutput(0), 3 * c, lname + ".cv1");

    // Split cv1's output into three equally sized chunks along dimension 1.
    const nvinfer1::Dims d_cv1 = cv1->getOutput(0)->getDimensions();
    const auto chunk = d_cv1.d[1] / 3;
    std::vector<nvinfer1::ITensor*> y;
    for (int part = 0; part < 3; ++part) {
        nvinfer1::ISliceLayer* piece = network->addSlice(
                *cv1->getOutput(0), nvinfer1::Dims4{0, chunk * part, 0, 0},
                nvinfer1::Dims4{d_cv1.d[0], chunk, d_cv1.d[2], d_cv1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
        y.push_back(piece->getOutput(0));
    }

    // Both hypergraph branches read the same (middle) chunk.
    nvinfer1::ILayer* out1 = C3AH(network, weightMap, *y[1], c, lname + ".branch1", e2, num_hyperedges, context);
    nvinfer1::ILayer* out2 = C3AH(network, weightMap, *y[1], c, lname + ".branch2", e2, num_hyperedges, context);

    // Sequential stages starting from the last chunk; each output is collected.
    nvinfer1::ITensor* current = y[2];
    for (int i = 0; i < n; i++) {
        const std::string stage_name = lname + ".m." + std::to_string(i);
        nvinfer1::ILayer* stage = nullptr;
        if (dsc3k) {
            stage = DSC3k(network, weightMap, *current, c, 2, stage_name, shortcut, 1, 0.5, 3, 7, 1);
        } else {
            stage = DSBottleneck(network, weightMap, *current, c, c, stage_name, shortcut);
        }
        current = stage->getOutput(0);
        y.push_back(current);
    }

    // The raw middle chunk is replaced by branch1's output; branch2's output goes last.
    y[1] = out1->getOutput(0);
    y.push_back(out2->getOutput(0));

    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(y.data(), y.size());
    return Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv2");
}

// Halves the spatial resolution with a 2x2 average pooling (stride 2, no padding).
// When `channel_adjust` is true the pooled tensor is additionally passed through
// Conv "<lname>.channel_adjust" with in_channels * 2 output channels and that layer
// is returned; otherwise the pooling layer itself is returned.
nvinfer1::ILayer* DownsampleConv(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                 int in_channels, std::string lname, bool channel_adjust) {
    nvinfer1::IPoolingLayer* pooled =
            network->addPoolingNd(input, nvinfer1::PoolingType::kAVERAGE, nvinfer1::DimsHW{2, 2});
    pooled->setStrideNd(nvinfer1::DimsHW{2, 2});
    pooled->setPaddingNd(nvinfer1::DimsHW{0, 0});

    if (!channel_adjust) {
        return pooled;
    }
    return Conv(network, weightMap, *pooled->getOutput(0), in_channels * 2, lname + ".channel_adjust");
}

// Gated residual fusion: returns the layer computing input[0] + gate * input[1],
// where `gate` is the weight stored under "<lname>.gate", added as a {1, 1, 1, 1}
// constant (presumably a single learned scalar -- confirm against the exported weights).
nvinfer1::IElementWiseLayer* FullPad_Tunnel(nvinfer1::INetworkDefinition* network,
                                            std::map<std::string, nvinfer1::Weights> weightMap,
                                            std::vector<nvinfer1::ITensor*> input, std::string lname) {
    using Op = nvinfer1::ElementWiseOperation;

    nvinfer1::ITensor* gate_tensor =
            network->addConstant(nvinfer1::Dims4{1, 1, 1, 1}, weightMap[lname + ".gate"])->getOutput(0);

    // gate * input[1]
    nvinfer1::IElementWiseLayer* gated = network->addElementWise(*input[1], *gate_tensor, Op::kPROD);

    // input[0] + gate * input[1]
    return network->addElementWise(*input[0], *gated->getOutput(0), Op::kSUM);
}


================================================
FILE: yolov13/src/calibrator.cpp
================================================
#include "calibrator.h"
#include <fstream>
#include <iostream>
#include <iterator>
#include <opencv2/dnn/dnn.hpp>
#include "cuda_utils.h"
#include "utils.h"

// Sets up an INT8 calibrator.
//   batchsize        - number of images per calibration batch
//   input_w/input_h  - network input width / height used for preprocessing
//   img_dir          - directory the calibration images are read from
//   calib_table_name - path of the calibration cache file
//   input_blob_name  - expected name of the network input binding
//   read_cache       - whether readCalibrationCache() may load an existing cache file
// Allocates a device buffer large enough for one batch of 3-channel float images and
// collects the calibration file names via read_files_in_dir().
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
    // Element count of one batch: 3 channels x width x height x batch size.
    input_count_ = 3 * input_w * input_h * batchsize;
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    // NOTE(review): any status returned by read_files_in_dir is not checked here.
    read_files_in_dir(img_dir, img_files_);
}

// Releases the device input buffer held in device_input_.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the number of images supplied per calibration batch.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next `batchsize_` calibration images, preprocesses them, uploads the
// resulting blob to the device buffer and publishes that buffer as bindings[0].
// Returns false when fewer than `batchsize_` images remain or when an image cannot
// be read (in the latter case the image cursor is not advanced); true otherwise.
// `nbBindings` is not used; names[0] is asserted to equal the configured input name.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    const auto batch_end = img_idx_ + batchsize_;
    if (batch_end > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> batch_images;
    for (int idx = img_idx_; idx < batch_end; ++idx) {
        std::cout << img_files_[idx] << "  " << idx << std::endl;
        cv::Mat raw = cv::imread(img_dir_ + "/" + img_files_[idx]);
        if (raw.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        batch_images.push_back(preprocess_img(raw, input_w_, input_h_));
    }
    img_idx_ = batch_end;

    // Scale pixels by 1/255, swap the R and B channels, no cropping.
    const cv::Size target_size(input_w_, input_h_);
    cv::Mat blob =
            cv::dnn::blobFromImages(batch_images, 1.0 / 255.0, target_size, cv::Scalar(0, 0, 0), true, false);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));

    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Loads the calibration cache file into calib_cache_ when cache reading is enabled
// and the file could be opened. `length` receives the number of bytes loaded.
// Returns a pointer to the cached bytes, or nullptr when nothing was loaded.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream cache_file(calib_table_name_, std::ios::binary);
    if (read_cache_ && cache_file.good()) {
        // Stream-buffer iterators copy every byte verbatim, whitespace included.
        calib_cache_.insert(calib_cache_.end(), std::istreambuf_iterator<char>(cache_file),
                            std::istreambuf_iterator<char>());
    }

    length = calib_cache_.size();
    if (length == 0) {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes the `length` bytes starting at `cache` to the calibration cache file,
// replacing any previous content.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    const char* bytes = static_cast<const char*>(cache);
    std::ofstream cache_file(calib_table_name_, std::ios::binary);
    cache_file.write(bytes, length);
}


================================================
FILE: yolov13/src/model.cpp
================================================
#include <math.h>
#include <iostream>

#include "block.h"
#include "calibrator.h"
#include "config.h"
#include "model.h"

// Scales a nominal channel count: `x` is first capped at `max_channels`, multiplied by
// the width factor `gw`, then rounded up to the next multiple of `divisor`.
static int get_width(int x, float gw, int max_channels, int divisor = 8) {
    const int capped = std::min(x, max_channels);
    const int groups = int(ceil((capped * gw) / divisor));
    return groups * divisor;
}

// Scales a nominal repeat count `x` by the depth factor `gd`.
// x == 1 is returned unchanged. Otherwise x * gd is rounded to the nearest integer,
// with exact .5 ties going to the even neighbour, and the result is at least 1.
static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }

    const float scaled = x * gd;
    const int truncated = int(scaled);
    int depth = round(scaled);

    // round() pushes ties away from zero; undo that when the lower neighbour is even.
    const bool exact_half = (scaled - truncated == 0.5);
    if (exact_half && (truncated % 2) == 0) {
        depth -= 1;
    }

    return depth < 1 ? 1 : depth;
}
// Unused functions removed: convBnSiLUProto, Proto, cv4_conv_combined

// For each of the `size` layers, stores reference_size divided by dimension 2 of the
// layer's first output (integer division) into `strides`.
void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) {
    for (int idx = 0; idx < size; ++idx) {
        const nvinfer1::Dims out_dims = conv_layers[idx]->getOutput(0)->getDimensions();
        const int feature_map_size = out_dims.d[2];
        strides[idx] = reference_size / feature_map_size;
    }
}

// Overload for generic layers: for each of the `size` layers, stores reference_size
// divided by dimension 2 of the layer's first output (integer division) into `strides`.
void calculateStrides(nvinfer1::ILayer* conv_layers[], int size, int reference_size, int strides[]) {
    for (int idx = 0; idx < size; ++idx) {
        nvinfer1::ITensor* out = conv_layers[idx]->getOutput(0);
        const int feature_map_size = out->getDimensions().d[2];
        strides[idx] = reference_size / feature_map_size;
    }
}

nvinfer1::IHostMemory* buildEngineYolov13Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels, std::string& type) {

    std::cout << "The number of the KNumClass is " << kNumClass << std::endl;
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    // =====================   input   ===================================================
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    // =====================   backbone   ===================================================
    nvinfer1::ILayer* conv0 = Conv(network, weightMap, *data, get_width(64, gw, max_channels), "model.0", 3, 2);
    nvinfer1::ILayer* conv1 =
            Conv(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), "model.1", 3, 2, 1, 2);

    bool dsc3k = false;
    float mlp_ratio = 2.0;
    bool residual = false;
    bool channel_adjust = true;
    if (type == "l" || type == "x") {
        mlp_ratio = 1.5;
        residual = true;
        dsc3k = true;
        channel_adjust = false;
    }
    nvinfer1::ILayer* conv2 = DSC3K2(network, weightMap, *conv1->getOutput(0), get_width(256, gw, max_channels),
                                     "model.2", get_depth(2, gd), dsc3k, 0.25);
    nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                    get_width(256, gw, max_channels), {3, 3}, 2, "model.3", 1, 4);
    nvinfer1::ILayer* conv4 = DSC3K2(network, weightMap, *conv3->getOutput(0), get_width(512, gw, max_channels),
                                     "model.4", get_depth(2, gd), dsc3k, 0.25);
    nvinfer1::IElementWiseLayer* conv5 =
            DSConv(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels),
                   get_width(512, gw, max_channels), "model.5", 3, 2);
    nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                    get_depth(4, gd), "model.6", true, 4, residual, mlp_ratio);
    nvinfer1::IElementWiseLayer* conv7 =
            DSConv(network, weightMap, *conv6->getOutput(0), get_width(512, gw, max_channels),
                   get_width(1024, gw, max_channels), "model.7", 3, 2);

    nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                                    get_depth(4, gd), "model.8", true, 1, residual, mlp_ratio);

    //=========================  neck ====================================================================
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    int num_hyperedges = 8;
    if (type == "n") {
        num_hyperedges *= 0.5;
    } else if (type == "x") {
        num_hyperedges *= 1.5;
    }

    nvinfer1::ILayer* conv9 =
            HyperACE(network, weightMap, {conv4->getOutput(0), conv6->getOutput(0), conv8->getOutput(0)},
                     get_width(512, gw, max_channels), get_width(512, gw, max_channels), "model.9", get_depth(2, gd),
                     num_hyperedges, true, true, 0.5, 1, "both", channel_adjust);

    auto input_dims = conv9->getOutput(0)->getDimensions();
    nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0));
    assert(upsample10);
    upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample10->setOutputDimensions(
            nvinfer1::Dims4{input_dims.d[0], input_dims.d[1], input_dims.d[2] * 2, input_dims.d[3] * 2});

    nvinfer1::ILayer* downsample11 = DownsampleConv(network, weightMap, *conv9->getOutput(0),
                                                    get_width(512, gw, max_channels), "model.11", channel_adjust);

    nvinfer1::IElementWiseLayer* conv12 =  // conv6:(1, 128, 40, 40) conv9: (1, 128, 40, 40)
            FullPad_Tunnel(network, weightMap, {conv6->getOutput(0), conv9->getOutput(0)}, "model.12");
    nvinfer1::IElementWiseLayer* conv13 =
            FullPad_Tunnel(network, weightMap, {conv4->getOutput(0), upsample10->getOutput(0)}, "model.13");

    nvinfer1::IElementWiseLayer* conv14 =
            FullPad_Tunnel(network, weightMap, {conv8->getOutput(0), downsample11->getOutput(0)}, "model.14");

    nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0));
    assert(upsample15);
    upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample15->setScales(scale, 4);
    nvinfer1::ITensor* inputTensors16[] = {upsample15->getOutput(0), conv12->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensors16, 2);
    nvinfer1::ILayer* conv17 = DSC3K2(network, weightMap, *cat16->getOutput(0), get_width(512, gw, max_channels),
                                      "model.17", get_depth(2, gd), true);

    nvinfer1::IElementWiseLayer* conv18 =
            FullPad_Tunnel(network, weightMap, {conv17->getOutput(0), conv9->getOutput(0)}, "model.18");

    nvinfer1::IResizeLayer* upsample19 = network->addResize(*conv17->getOutput(0));
    assert(upsample19);
    upsample19->setScales(scale, 4);
    upsample19->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    nvinfer1::ITensor* inputTensors20[] = {upsample19->getOutput(0), conv13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensors20, 2);
    nvinfer1::ILayer* conv21 = DSC3K2(network, weightMap, *cat20->getOutput(0), get_width(256, gw, max_channels),
                                      "model.21", get_depth(2, gd), true);

    nvinfer1::ILayer* conv22 =
            Conv(network, weightMap, *upsample10->getOutput(0), get_width(256, gw, max_channels), "model.22");
    nvinfer1::IElementWiseLayer* conv23 =
            FullPad_Tunnel(network, weightMap, {conv21->getOutput(0), conv22->getOutput(0)}, "model.23");

    nvinfer1::ILayer* conv24 =
            Conv(network, weightMap, *conv23->getOutput(0), get_width(256, gw, max_channels), "model.24", 3, 2);
    nvinfer1::ITensor* inputTensors25[] = {conv24->getOutput(0), conv18->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat25 = network->addConcatenation(inputTensors25, 2);
    nvinfer1::ILayer* conv26 = DSC3K2(network, weightMap, *cat25->getOutput(0), get_width(512, gw, max_channels),
                                      "model.26", get_depth(2, gd), true);
    nvinfer1::IElementWiseLayer* conv27 =
            FullPad_Tunnel(network, weightMap, {conv26->getOutput(0), conv9->getOutput(0)}, "model.27");

    nvinfer1::ILayer* conv28 =
            Conv(network, weightMap, *conv26->getOutput(0), get_width(512, gw, max_channels), "model.28", 3, 2);
    nvinfer1::ITensor* inputTensors29[] = {conv28->getOutput(0), conv14->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat29 = network->addConcatenation(inputTensors29, 2);
    nvinfer1::ILayer* conv30 = DSC3K2(network, weightMap, *cat29->getOutput(0), get_width(1024, gw, max_channels),
                                      "model.30", get_depth(2, gd), true);
    nvinfer1::IElementWiseLayer* conv31 =
            FullPad_Tunnel(network, weightMap, {conv30->getOutput(0), downsample11->getOutput(0)}, "model.31");

    // =============================== output ===================================================================
    int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4);
    int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100));

    // output0   location
    nvinfer1::IElementWiseLayer* conv32_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv23->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv32_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv32_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv32_cv2_0_2 =
            network->addConvolutionNd(*conv32_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.32.cv2.0.2.weight"], weightMap["model.32.cv2.0.2.bias"]);
    conv32_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv32_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    // output0 classes
    auto* conv32_cv3_0_0_0 = DWConv(network, weightMap, *conv23->getOutput(0), get_width(256, gw, max_channels), {3, 3},
                                    1, "model.32.cv3.0.0.0");
    nvinfer1::IElementWiseLayer* conv32_cv3_0_0_1 =
            convBnSiLU(network, weightMap, *conv32_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.0.0.1");

    auto* conv32_cv3_0_1_0 =
            DWConv(network, weightMap, *conv32_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.32.cv3.0.1.0");
    nvinfer1::IElementWiseLayer* conv32_cv3_0_1_1 =
            convBnSiLU(network, weightMap, *conv32_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.0.1.1");
    nvinfer1::IConvolutionLayer* conv32_cv3_0_1_2 =
            network->addConvolutionNd(*conv32_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.32.cv3.0.2.weight"], weightMap["model.32.cv3.0.2.bias"]);
    conv32_cv3_0_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv32_cv3_0_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});

    nvinfer1::ITensor* inputTensors32_0[] = {conv32_cv2_0_2->getOutput(0), conv32_cv3_0_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat32_0 = network->addConcatenation(inputTensors32_0, 2);

    // out1 location
    nvinfer1::IElementWiseLayer* conv32_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv27->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv32_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv32_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv32_cv2_1_2 =
            network->addConvolutionNd(*conv32_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.32.cv2.1.2.weight"], weightMap["model.32.cv2.1.2.bias"]);
    conv32_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv32_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    // out1 classes
    auto* conv32_cv3_1_0_0 = DWConv(network, weightMap, *conv27->getOutput(0), get_width(512, gw, max_channels), {3, 3},
                                    1, "model.32.cv3.1.0.0");
    nvinfer1::IElementWiseLayer* conv32_cv3_1_0_1 =
            convBnSiLU(network, weightMap, *conv32_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.1.0.1");
    auto* conv32_cv3_1_1_0 =
            DWConv(network, weightMap, *conv32_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.32.cv3.1.1.0");
    nvinfer1::IElementWiseLayer* conv32_cv3_1_1_1 =
            convBnSiLU(network, weightMap, *conv32_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.1.1.1");
    nvinfer1::IConvolutionLayer* conv32_cv3_1_1_2 =
            network->addConvolutionNd(*conv32_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.32.cv3.1.2.weight"], weightMap["model.32.cv3.1.2.bias"]);
    conv32_cv3_1_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    conv32_cv3_1_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});

    nvinfer1::ITensor* inputTensors32_1[] = {conv32_cv2_1_2->getOutput(0), conv32_cv3_1_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat32_1 = network->addConcatenation(inputTensors32_1, 2);

    // out2 location
    nvinfer1::IElementWiseLayer* conv32_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv31->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv32_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv32_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv32_cv2_2_2 =
            network->addConvolutionNd(*conv32_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.32.cv2.2.2.weight"], weightMap["model.32.cv2.2.2.bias"]);
    conv32_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv32_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    // out2 classes
    auto* conv32_cv3_2_0_0 = DWConv(network, weightMap, *conv31->getOutput(0), get_width(1024, gw, max_channels),
                                    {3, 3}, 1, "model.32.cv3.2.0.0");
    nvinfer1::IElementWiseLayer* conv32_cv3_2_0_1 =
            convBnSiLU(network, weightMap, *conv32_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.2.0.1");
    auto* conv32_cv3_2_1_0 =
            DWConv(network, weightMap, *conv32_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.32.cv3.2.1.0");
    nvinfer1::IElementWiseLayer* conv32_cv3_2_1_1 =
            convBnSiLU(network, weightMap, *conv32_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.2.1.1");
    nvinfer1::IConvolutionLayer* conv32_cv3_2_1_2 =
            network->addConvolutionNd(*conv32_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.32.cv3.2.2.weight"], weightMap["model.32.cv3.2.2.bias"]);
    conv32_cv3_2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv32_cv3_2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    nvinfer1::ITensor* inputTensor32_2[] = {conv32_cv2_2_2->getOutput(0), conv32_cv3_2_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat32_2 = network->addConcatenation(inputTensor32_2, 2);

    // ============================================ yolov13  detect =========================================
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    nvinfer1::IShuffleLayer* shuffle32_0 = network->addShuffle(*cat32_0->getOutput(0));
    shuffle32_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split32_0_0 = network->addSlice(
            *shuffle32_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split32_0_1 =
            network->addSlice(*shuffle32_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});

    nvinfer1::IShuffleLayer* dfl32_0 =
            DFL(network, weightMap, *split32_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.32.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor32_dfl_0[] = {dfl32_0->getOutput(0), split32_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat32_dfl_0 = network->addConcatenation(inputTensor32_dfl_0, 2);
    cat32_dfl_0->setAxis(1);

    nvinfer1::IShuffleLayer* shuffle32_1 = network->addShuffle(*cat32_1->getOutput(0));
    shuffle32_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split32_1_0 = network->addSlice(
            *shuffle32_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split32_1_1 =
            network->addSlice(*shuffle32_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl32_1 =
            DFL(network, weightMap, *split32_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.32.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor32_dfl_1[] = {dfl32_1->getOutput(0), split32_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat32_dfl_1 = network->addConcatenation(inputTensor32_dfl_1, 2);
    cat32_dfl_1->setAxis(1);

    nvinfer1::IShuffleLayer* shuffle32_2 = network->addShuffle(*cat32_2->getOutput(0));
    shuffle32_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split32_2_0 = network->addSlice(
            *shuffle32_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split32_2_1 =
            network->addSlice(*shuffle32_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl32_2 =
            DFL(network, weightMap, *split32_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.32.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor32_dfl_2[] = {dfl32_2->getOutput(0), split32_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat32_dfl_2 = network->addConcatenation(inputTensor32_dfl_2, 2);
    cat32_dfl_2->setAxis(1);
    std::cout << " There are  " << weightMap.size() << "  layers parameters in the network!!!" << endl;

    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat32_dfl_0, cat32_dfl_1, cat32_dfl_2},
                         strides, stridesLength);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(kBatchSize, kInputW, kInputH, kInputQuantizationFolder,
                                                  "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}


================================================
FILE: yolov13/src/postprocess.cpp
================================================
#include "postprocess.h"
#include "utils.h"

// Converts a box given in network-input coordinates ({x1, y1, x2, y2}) into a
// cv::Rect in the coordinates of `img`, undoing the aspect-preserving resize
// and the centred padding, and clamping the result to the image.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float scale_w = kInputW / (img.cols * 1.0);
    const float scale_h = kInputH / (img.rows * 1.0);

    float left = bbox[0];
    float top = bbox[1];
    float right = bbox[2];
    float bottom = bbox[3];

    // The smaller of the two ratios is the one that was applied to the image;
    // the other axis received symmetric padding that has to be removed first.
    const bool width_limited = scale_h > scale_w;
    const float scale = width_limited ? scale_w : scale_h;
    if (width_limited) {
        const float pad_y = (kInputH - scale_w * img.rows) / 2;
        top = top - pad_y;
        bottom = bottom - pad_y;
    } else {
        const float pad_x = (kInputW - scale_h * img.cols) / 2;
        left = left - pad_x;
        right = right - pad_x;
    }
    left = left / scale;
    right = right / scale;
    top = top / scale;
    bottom = bottom / scale;

    // Clamp the origin to the image, then limit the extent so the rectangle
    // never reaches past the right/bottom border (and is never negative).
    left = std::max(0.0f, left);
    top = std::max(0.0f, top);
    const int x0 = int(round(left));
    const int y0 = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x0));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y0));

    return cv::Rect(x0, y0, width, height);
}

// Intersection-over-union of two axis-aligned boxes stored as {x1, y1, x2, y2}.
// Returns 0 when the boxes do not overlap, and also when the union area is not
// positive (e.g. two identical zero-area boxes), which would otherwise produce
// a 0/0 = NaN result.
static float iou(float lbox[4], float rbox[4]) {
    const float inter_x1 = (std::max)(lbox[0], rbox[0]);
    const float inter_x2 = (std::min)(lbox[2], rbox[2]);
    const float inter_y1 = (std::max)(lbox[1], rbox[1]);
    const float inter_y2 = (std::min)(lbox[3], rbox[3]);

    if (inter_y1 > inter_y2 || inter_x1 > inter_x2)
        return 0.0f;

    const float inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1);
    const float union_area =
            (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - inter_area;
    // Degenerate boxes: avoid dividing by zero.
    if (union_area <= 0.0f)
        return 0.0f;
    return inter_area / union_area;
}

// Ordering predicate used when sorting detections: higher confidence first;
// detections with equal confidence are ordered by ascending bbox[0].
static bool cmp(const Detection& a, const Detection& b) {
    return a.conf == b.conf ? a.bbox[0] < b.bbox[0] : a.conf > b.conf;
}

void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4]))
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

// Runs nms() independently on each image of a batch. The raw output of image
// `b` starts at output + b * output_size; results land in res_batch[b].
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = output + b * output_size;
        nms(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}

// Collects the detections whose keep flag is 1 from a decoded buffer.
// Record i starts at decode_ptr_host[1 + i * bbox_element] and is laid out as
// {bbox[0..3], conf, class_id, keep_flag}. `img` is not used here.
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count) {
    Detection det;
    for (int i = 0; i < count; i++) {
        const float* record = decode_ptr_host + 1 + i * bbox_element;
        const int keep_flag = record[6];
        if (keep_flag != 1)
            continue;
        for (int k = 0; k < 4; ++k) {
            det.bbox[k] = record[k];
        }
        det.conf = record[4];
        det.class_id = record[5];
        res.push_back(det);
    }
}

void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}

// Draws every detection of res_batch[i] onto img_batch[i]: a rectangle plus
// the numeric class id just above its top-left corner.
// Only images that have a matching entry in res_batch are processed, so a
// shorter (or empty) res_batch is never indexed out of range.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const size_t n_images = std::min(img_batch.size(), res_batch.size());
    for (size_t i = 0; i < n_images; i++) {
        // Copying a cv::Mat copies the header only; drawing on `img` is
        // expected to modify the pixels of img_batch[i].
        cv::Mat img = img_batch[i];
        for (Detection& det : res_batch[i]) {
            cv::Rect r = get_rect(img, det.bbox);
            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(img, std::to_string((int)det.class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
                        cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
    }
}


================================================
FILE: yolov13/src/postprocess.cu
================================================
//
// Created by lindsay on 23-7-17.
//
#include "postprocess.h"
#include "types.h"

// Copies candidate detections from the raw prediction buffer into the compact
// decode buffer consumed by nms_kernel and by the host. One thread per candidate.
//   predict : predict[0] is the candidate count; records of
//             sizeof(Detection)/sizeof(float) floats follow
//   parray  : parray[0] is a counter incremented atomically; output records of
//             bbox_element floats follow, written as
//             {left, top, right, bottom, confidence, label, keep_flag}
//             (assumes bbox_element >= 7 - TODO confirm against its definition)
//   num_bboxes is not read inside the kernel.
static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                     int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    // The output slot is reserved before the confidence test, so parray[0]
    // also counts candidates that are rejected below; their slots are left
    // untouched by this kernel.
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];

    // parray is a float pointer: skip the counter, then index whole records.
    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}

// Device helper: intersection-over-union of box A and box B, each given by its
// left/top/right/bottom edges. Returns 0 when the boxes do not overlap.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
                                float bright, float bbottom) {
    const float inter_left = max(aleft, bleft);
    const float inter_top = max(atop, btop);
    const float inter_right = min(aright, bright);
    const float inter_bottom = min(abottom, bbottom);

    const float inter_w = max(inter_right - inter_left, 0.0f);
    const float inter_h = max(inter_bottom - inter_top, 0.0f);
    const float inter_area = inter_w * inter_h;
    if (inter_area == 0.0f)
        return 0.0f;

    const float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter_area / (area_a + area_b - inter_area);
}

// One thread per decoded box. A thread clears its own keep flag (record[6])
// when another box of the same label, with confidence at least as high,
// overlaps it by more than `threshold`. For equal confidence the box with the
// lower index wins.
//   bboxes : bboxes[0] is the record count; records of bbox_element floats follow
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {
    const int self = (blockDim.x * blockIdx.x + threadIdx.x);
    const int total = min((int)*bboxes, max_objects);
    if (self >= total)
        return;

    float* mine = bboxes + 1 + self * bbox_element;
    for (int other = 0; other < total; ++other) {
        float* rival = bboxes + 1 + other * bbox_element;

        // Never compare a box with itself or with a box of another label.
        if (other == self || mine[5] != rival[5])
            continue;
        // Only a rival that is at least as confident can suppress this box.
        if (!(rival[4] >= mine[4]))
            continue;
        // Tie on confidence: the earlier rival does not suppress this box.
        if (rival[4] == mine[4] && other < self)
            continue;

        const float overlap = box_iou(mine[0], mine[1], mine[2], mine[3], rival[0], rival[1], rival[2], rival[3]);
        if (overlap > threshold) {
            mine[6] = 0;
            return;
        }
    }
}

// Launches decode_kernel on `stream` with enough 256-thread blocks to provide
// at least num_bboxes threads.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream) {
    const int threads_per_block = 256;
    const int num_blocks = ceil(num_bboxes / (float)threads_per_block);
    decode_kernel<<<num_blocks, threads_per_block, 0, stream>>>(predict, num_bboxes, confidence_threshold, parray,
                                                               max_objects);
}

// Launches nms_kernel on `stream` with one thread per possible box
// (max_objects), using blocks of at most 256 threads.
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    // With no capacity there is nothing to suppress; returning early also
    // avoids a zero-sized block and the 0/0 grid computation below.
    if (max_objects <= 0)
        return;

    int block = max_objects < 256 ? max_objects : 256;
    int grid = ceil(max_objects / (float)block);
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}


================================================
FILE: yolov13/src/preprocess.cu
================================================
#include "cuda_utils.h"
#include "preprocess.h"

// Host staging buffer for the raw source image; allocated with cudaMallocHost
// in cuda_preprocess_init() and released in cuda_preprocess_destroy().
static uint8_t* img_buffer_host = nullptr;
// Device copy of the raw source image read by warpaffine_kernel; allocated with
// cudaMalloc in cuda_preprocess_init() and released in cuda_preprocess_destroy().
static uint8_t* img_buffer_device = nullptr;

// Resamples a 3-channel 8-bit interleaved image into a planar float image
// through an affine transform. One thread produces one destination pixel.
//   src            : source pixels, 3 bytes per pixel
//   src_line_size  : bytes per source row
//   src_width/height : source size in pixels
//   dst            : destination buffer, three planes of dst_width * dst_height floats
//   const_value_st : fill value for samples that fall outside the source
//   d2s            : 2x3 matrix (row-major in d2s.value) mapping destination
//                    pixel coordinates to source coordinates
//   edge           : number of destination pixels; threads at or beyond it exit
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    // Unpack the two rows of the destination-to-source matrix.
    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Destination pixel handled by this thread and its position in the source.
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // Entirely outside the source image: use the fill value for all channels.
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // Bilinear interpolation over the four neighbouring source pixels.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        // Weights of the top-left, top-right, bottom-left and bottom-right neighbours.
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Neighbours default to the fill value and are replaced below only
        // when they lie inside the source image.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // Swap the first and third channel (BGR source -> RGB output).
    float t = c2;
    c2 = c0;
    c0 = t;

    // Scale 8-bit values into [0, 1].
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // Interleaved to planar: each channel is written to its own plane of
    // dst_width * dst_height floats.
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

// Uploads one 3-channel 8-bit image and launches warpaffine_kernel on `stream`
// to write its resized, centred (fill value 128) planar float version to `dst`.
// The image is staged through img_buffer_host / img_buffer_device, which must
// have been allocated by cuda_preprocess_init(). No synchronization is done here.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    const int src_bytes = src_width * src_height * 3;
    // Stage in the host buffer first, then start the asynchronous upload.
    memcpy(img_buffer_host, src, src_bytes);
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, src_bytes, cudaMemcpyHostToDevice, stream));

    // Uniform scale that fits the source inside the destination.
    const float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    // Source-to-destination transform: scale, then centre in the destination.
    AffineMatrix src_to_dst, dst_to_src;
    src_to_dst.value[0] = scale;
    src_to_dst.value[1] = 0;
    src_to_dst.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    src_to_dst.value[3] = 0;
    src_to_dst.value[4] = scale;
    src_to_dst.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel needs the inverse mapping (destination -> source).
    cv::Mat forward(2, 3, CV_32F, src_to_dst.value);
    cv::Mat inverse(2, 3, CV_32F, dst_to_src.value);
    cv::invertAffineTransform(forward, inverse);
    memcpy(dst_to_src.value, inverse.ptr<float>(0), sizeof(dst_to_src.value));

    const int dst_pixels = dst_height * dst_width;
    const int threads_per_block = 256;
    const int num_blocks = ceil(dst_pixels / (float)threads_per_block);
    warpaffine_kernel<<<num_blocks, threads_per_block, 0, stream>>>(img_buffer_device, src_width * 3, src_width,
                                                                   src_height, dst, dst_width, dst_height, 128,
                                                                   dst_to_src, dst_pixels);
}

// Preprocesses every image of the batch into consecutive slots of `dst`
// (dst_width * dst_height * 3 floats each). The stream is synchronized after
// each image.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int floats_per_image = dst_width * dst_height * 3;
    size_t slot = 0;
    for (cv::Mat& img : img_batch) {
        float* slot_dst = &dst[floats_per_image * slot];
        cuda_preprocess(img.ptr(), img.cols, img.rows, slot_dst, dst_width, dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        ++slot;
    }
}

// Allocates the host and device staging buffers used by cuda_preprocess().
// Each holds max_image_size * 3 bytes, i.e. max_image_size pixels of a
// 3-channel 8-bit image.
void cuda_preprocess_init(int max_image_size) {
    const int buffer_bytes = max_image_size * 3;
    // Host side, allocated through cudaMallocHost.
    CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, buffer_bytes));
    // Device side.
    CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, buffer_bytes));
}

// Releases the staging buffers allocated by cuda_preprocess_init() and resets
// the file-level pointers so they do not dangle after the memory is freed.
void cuda_preprocess_destroy() {
    CUDA_CHECK(cudaFree(img_buffer_device));
    img_buffer_device = nullptr;
    CUDA_CHECK(cudaFreeHost(img_buffer_host));
    img_buffer_host = nullptr;
}


================================================
FILE: yolov13/yolov13_det.cpp
================================================
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#if defined(_WIN32)
#include <direct.h>
#include <io.h>
#include <windows.h>
#else
#include <sys/stat.h>
#include <unistd.h>
#endif
#include <climits>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger instance passed to createInferBuilder() and createInferRuntime() below.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in one image's raw network output: room for
// kMaxNumOutputBbox packed Detection records plus one extra float
// (post-processing reads element 0 as the detection count).
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

// Returns the directory that contains the running executable, without a
// trailing separator. Falls back to "." when the executable path cannot be
// determined or contains no directory separator.
static std::string get_executable_dir() {
    const std::string fallback(".");
    std::string exe_path;
#if defined(_WIN32)
    const char* separators = "\\/";
    char raw[MAX_PATH];
    const DWORD written = GetModuleFileNameA(NULL, raw, MAX_PATH);
    // 0 signals failure; MAX_PATH means the path did not fit in the buffer.
    if (written == 0 || written == MAX_PATH)
        return fallback;
    exe_path.assign(raw, raw + written);
#else
    const char* separators = "/";
    char raw[PATH_MAX];
    // readlink() does not null-terminate, so one byte is reserved for that.
    const ssize_t written = readlink("/proc/self/exe", raw, sizeof(raw) - 1);
    if (written == -1)
        return fallback;
    raw[written] = '\0';
    exe_path = raw;
#endif
    const size_t last_sep = exe_path.find_last_of(separators);
    return last_sep == std::string::npos ? fallback : exe_path.substr(0, last_sep);
}

void serialize_engine(std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels,
                      std::string& type) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    serialized_engine = buildEngineYolov13Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}

// Loads a serialized engine file into memory and creates the runtime, engine
// and execution context from it. The three objects are returned through the
// out-parameters; the caller deletes them.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream plan_file(engine_name, std::ios::binary);
    if (!plan_file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }

    // Determine the file size by seeking to the end, then rewind and read it all.
    plan_file.seekg(0, plan_file.end);
    size_t plan_size = plan_file.tellg();
    plan_file.seekg(0, plan_file.beg);

    char* plan_bytes = new char[plan_size];
    assert(plan_bytes);
    plan_file.read(plan_bytes, plan_size);
    plan_file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(plan_bytes, plan_size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);

    // The temporary host copy of the file is released once the engine exists.
    delete[] plan_bytes;
}

// Allocates the buffers needed for inference.
//   - device input  : kBatchSize * 3 * kInputH * kInputW floats
//   - device output : kBatchSize * kOutputSize floats
//   - "c" mode      : host buffer for the raw output
//   - "g" mode      : host + device buffers for the decoded boxes
//                     (1 + kMaxNumOutputBbox * bbox_element floats); exits the
//                     process when kBatchSize > 1
// Any other mode allocates only the two device buffers.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    // The engine is expected to expose exactly one input (binding 0) and one
    // output (binding 1), looked up by tensor name.
    assert(engine->getNbBindings() == 2);
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);

    // Device-side input and raw output.
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));

    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
        return;
    }
    if (cuda_post_process != "g") {
        return;
    }

    if (kBatchSize > 1) {
        std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
        exit(0);
    }
    // One counter followed by kMaxNumOutputBbox decoded records.
    *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
    CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
}

// Runs one inference on `stream` and brings the result to the host.
//   "c" mode : copies the raw network output (buffers[1]) into `output`
//   "g" mode : decodes and suppresses boxes on the GPU, then copies the
//              decoded buffer into decode_ptr_host
// The stream is always synchronized before returning. The reported time is
// taken after that synchronization, so it covers the queued GPU work and the
// copy rather than only the time needed to enqueue them.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    auto start = std::chrono::system_clock::now();
    // enqueueV2 only queues the work; report a failure instead of ignoring it.
    if (!context.enqueueV2(buffers, stream, nullptr)) {
        std::cerr << "enqueueV2 failed" << std::endl;
    }

    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    } else if (cuda_post_process == "g") {
        // Clear the counter and all keep flags before decoding into the buffer.
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference and gpu postprocess time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    } else {
        // Unknown mode: nothing is copied, but still wait for the inference.
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

// Parses the command line. Two forms are accepted, each with exactly four
// arguments after the program name:
//   -s <wts> <engine> <n|s|l|x>     build mode; sets wts, engine, type and the
//                                   scaling parameters gd, gw, max_channels
//                                   (only the first character of the model
//                                   type is examined)
//   -d <engine> <image dir> <c|g>   inference mode; sets engine, img_dir and
//                                   cuda_post_process
// Returns false for anything else, including a post-processing mode other
// than "c" or "g", which the inference path has no handling for.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc != 5)
        return false;

    const std::string mode(argv[1]);
    if (mode == "-s") {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string sub_type(argv[4]);
        const char variant = sub_type.empty() ? '\0' : sub_type[0];

        switch (variant) {
            case 'n':
                gd = 0.50;
                gw = 0.25;
                max_channels = 1024;
                break;
            case 's':
                gd = 0.50;
                gw = 0.50;
                max_channels = 1024;
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                max_channels = 512;
                break;
            case 'x':
                gd = 1.0;
                gw = 1.50;
                max_channels = 512;
                break;
            default:
                return false;
        }
        type = std::string(1, variant);
        return true;
    }

    if (mode == "-d") {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
        // Only CPU ("c") and GPU ("g") post-processing exist.
        return cuda_post_process == "c" || cuda_post_process == "g";
    }

    return false;
}

int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name;
    std::string engine_name;
    std::string img_dir;
    std::string cuda_post_process;
    std::string type;
    int model_bboxes;
    float gd = 0, gw = 0;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov13-det -s [.wts] [.engine] [n/s/l/x]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov13-det -d [.engine] ../images  [c/g]// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, gd, gw, max_channels, type);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        // Save the first 100 values of output_buffer_host, one per line
        //        std::ofstream out("../models/output.txt");
        //        for (int j = 0; j < 100; j++) {
        //            out << output_buffer_host[j] << std::endl;
        //        }
        //        out.close();
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            //Process gpu decode and nms results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);
#if 0
        // legacy: save under a "build" subfolder of the working directory
        const std::string out_dir = "build";
#else
        // Save results to the directory where the executable resides
        const std::string exe_dir = get_executable_dir();
        const std::string out_dir = exe_dir;
#endif
#if defined(_WIN32)
        if (_access(out_dir.c_str(), 0) != 0) {
            if (_mkdir(out_dir.c_str()) != 0) {
                std::cerr << "Warning: create directory failed: " << out_dir << std::endl;
            }
        }
#else
        if (access(out_dir.c_str(), F_OK) != 0) {
            if (mkdir(out_dir.c_str(), 0755) != 0) {
                std::cerr << "Warning: create directory failed: " << out_dir << std::endl;
            }
        }
#endif
        for (size_t j = 0; j < img_batch.size(); j++) {
            std::string out_path = out_dir + "/_" + img_name_batch[j];
            if (cv::imwrite(out_path, img_batch[j])) {
                std::cout << "Saved: " << out_path << std::endl;
            } else {
                std::cerr << "Failed to save: " << out_path << std::endl;
            }
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov13/yolov13_det_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Minimum confidence a detection needs to be kept; post_process passes it to
# non_max_suppression as conf_thres.
CONF_THRESH = 0.5
# IoU above which a lower-scored box of the same class is suppressed; passed to
# non_max_suppression as nms_thres.
IOU_THRESHOLD = 0.4
# Number of values per detection record; post_process reshapes the flat engine
# output into rows of this width.
DET_NUM = 6


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into consecutive batches of at most batch_size entries.
    param:
        batch_size: int, maximum number of paths per batch
        img_dir:    str, root directory to walk
    return:
        a list of batches (lists of paths); the last batch may be shorter
    """
    batches = []
    current = []
    for dirpath, _subdirs, filenames in os.walk(img_dir):
        for filename in filenames:
            # Flush the running batch once it has reached batch_size
            if len(current) == batch_size:
                batches.append(current)
                current = []
            current.append(os.path.join(dirpath, filename))
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box (and optionally a filled label tag)
                 on img in place.
    param:
        x:      a box like [x1,y1,x2,y2]
        img:    an opencv image object
        color:  color to draw the rectangle, such as (0,255,0);
                a random color is picked when omitted
        label:  str, text drawn above the box when given
        line_thickness: int, derived from the image size when omitted
    return:
        no return

    """
    # line/font thickness
    if line_thickness:
        tl = line_thickness
    else:
        tl = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]
    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=tl, lineType=cv2.LINE_AA)
    if not label:
        return
    tf = max(tl - 1, 1)  # font thickness
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
    # The label tag sits directly above the box's top-left corner
    tag_corner = (top_left[0] + text_w, top_left[1] - text_h - 3)
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        tl / 3,
        [225, 255, 255],
        thickness=tf,
        lineType=cv2.LINE_AA,
    )


class YoLov13TRT(object):
    """
    description: A YOLOv13 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Deserialize a TensorRT engine and allocate one page-locked host
                     buffer and one device buffer per engine binding.
        param:
            engine_file_path: str, path of the serialized engine file
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('bingding:', binding, shape)
                # The leading dimension of each binding is taken as the batch size
                self.batch_size = shape[0]
                size = trt.volume(shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except BaseException:
            # make_context() leaves the new context pushed; do not leak it
            # on the context stack when engine setup fails.
            self.ctx.pop()
            raise

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        # Per-image output length. The host buffer holds the output of the whole
        # batch (the binding shape includes the batch dimension), so divide by
        # batch_size; infer() slices the buffer in steps of this length.
        self.det_output_length = host_outputs[0].shape[0] // self.batch_size

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine, post-process the
                     detections and draw them on the original images.
        param:
            raw_image_generator: iterable of BGR images, at most batch_size of them
        return:
            batch_image_raw: list of the original images with boxes drawn on them
            use_time: seconds spent on host-to-device copy, execution and
                      device-to-host copy
        """
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess. Slots that receive no image (partial last
            # batch) stay zero-filled instead of holding uninitialised memory.
            batch_input_image = np.zeros(
                shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32
            )
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in `finally` so a failure above cannot leave the context pushed.
            self.ctx.pop()
        output = host_outputs[0]
        # Do postprocess, only for the images that were actually supplied: the
        # last batch of a directory may hold fewer than batch_size images.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the context that __init__ created (make_context leaves it pushed).
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read images one by one from the given image paths
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Yield batch_size all-zero images of the network input size, for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: the original BGR image (H x W x C array)
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map nx4 boxes from network-input coordinates back to original-image
                        coordinates by removing the letterbox padding and undoing the resize.
                        NOTE: despite the name, no center/size to corner conversion is done
                        here; columns 0/2 are treated as x values and columns 1/3 as y values.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box in network-input pixels
                        (presumably [x1, y1, x2, y2] as emitted by the engine - confirm)
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2] in original pixels
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded top/bottom: shift y values by the top padding
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Image was padded left/right: shift x values by the left padding
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array [num_boxes, box0 (DET_NUM values), box1, ...];
                        columns 0-3 of each box are coordinates, 4 is conf, 5 is cls_id
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        num_values_per_detection = DET_NUM
        # Get the num of boxes detected
        num = int(output[0])
        print("There are {} detections in the picture!!!".format(num))
        # Reshape to a two dimensional ndarray
        # pred = np.reshape(output[1:], (-1, 38))[:num, :]
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 treats coordinates as inclusive pixel indices)
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # 1e-16 guards against a zero denominator
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, one row per box: 4 coordinates in network-input pixels, conf, cls_id
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id),
                   or an empty array when nothing survives
        """
        # Get the boxes that score >= conf_thres (boolean indexing returns a copy)
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the box coordinates back to the original image
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            # (this always includes boxes[0] itself, so the loop shrinks every pass)
            invalid = large_overlap & label_match
            keep_boxes.append(boxes[0])
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    description: Thread that runs one batch of image files through the wrapper
                 and writes each annotated image into the 'output' directory.
    """

    def __init__(self, yolov13_wrapper, image_path_batch):
        super().__init__()
        self.yolov13_wrapper = yolov13_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov13_wrapper
        image_source = wrapper.get_raw_image(self.image_path_batch)
        annotated, use_time = wrapper.infer(image_source)
        for idx, src_path in enumerate(self.image_path_batch):
            # Keep the original file name, drop its directory
            target = os.path.join('output', os.path.basename(src_path))
            # Save image
            cv2.imwrite(target, annotated[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on all-zero images to warm the engine up.
    """

    def __init__(self, yolov13_wrapper):
        super().__init__()
        self.yolov13_wrapper = yolov13_wrapper

    def run(self):
        wrapper = self.yolov13_wrapper
        blank_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(blank_images)
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Defaults for the custom plugin library and the serialized engine;
    # both can be overridden from the command line: <engine> [<plugin>]
    PLUGIN_LIBRARY = "build/libmyplugins.so"
    engine_file_path = "build/yolov13n-det.engine"
    # engine_file_path = "build/yolov13n-det-int8.engine"

    cli_args = sys.argv[1:]
    if len(cli_args) >= 1:
        engine_file_path = cli_args[0]
    if len(cli_args) >= 2:
        PLUGIN_LIBRARY = cli_args[1]

    # Load the plugin library before the engine is deserialized
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels (module-level name, read by YoLov13TRT.infer)
    # categories = ["object"]

    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
        "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
        "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
        "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
        "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
        "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]

    # Start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov13TRT instance
    yolov13_wrapper = YoLov13TRT(engine_file_path)
    try:
        print('batch size is', yolov13_wrapper.batch_size)

        image_dir = "images"
        image_path_batches = get_img_path_batches(yolov13_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each in its own thread, executed one after another
        for _ in range(10):
            warmup_worker = warmUpThread(yolov13_wrapper)
            warmup_worker.start()
            warmup_worker.join()
        # One inference thread per batch, also executed one after another
        for path_batch in image_path_batches:
            infer_worker = inferThread(yolov13_wrapper, path_batch)
            infer_worker.start()
            infer_worker.join()
    finally:
        # destroy the instance
        yolov13_wrapper.destroy()


================================================
FILE: yolov3/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.6)

project(yolov3)

add_definitions(-std=c++11)

# option() takes (<variable> "<help text>" [value]); the help text was missing,
# so "OFF" was being used as the description. The default stays OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Shared library holding the custom yolo layer plugin (CUDA source)
#cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu)
cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
target_link_libraries(yololayer nvinfer cudart ${OpenCV_LIBS})

# Main executable: INT8 calibrator + network definition / inference driver
add_executable(yolov3 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/yolov3.cpp)
target_link_libraries(yolov3 nvinfer)
target_link_libraries(yolov3 cudart)
target_link_libraries(yolov3 yololayer)
target_link_libraries(yolov3 ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: yolov3/README.md
================================================
# yolov3

The PyTorch implementation is [ultralytics/yolov3 archive branch](https://github.com/ultralytics/yolov3/tree/archive). It provides two sets of trained weights for yolov3: `yolov3.weights` and `yolov3.pt`.

This branch uses the TensorRT 7 API. There is also a yolov3 implementation using the TensorRT 4 API in [branch trt4/yolov3](https://github.com/wang-xinyu/tensorrtx/tree/trt4/yolov3), which is based on [ayooshkathuria/pytorch-yolo-v3](https://github.com/ayooshkathuria/pytorch-yolo-v3).

## Config

- Input shape defined in yololayer.h
- Number of classes defined in yololayer.h
- INT8/FP16/FP32 can be selected by the macro in yolov3.cpp
- GPU id can be selected by the macro in yolov3.cpp
- NMS thresh in yolov3.cpp
- BBox confidence thresh in yolov3.cpp

## How to run

1. generate yolov3.wts from pytorch implementation with yolov3.cfg and yolov3.weights, or download .wts from model zoo

```
git clone https://github.com/wang-xinyu/tensorrtx.git
git clone -b archive https://github.com/ultralytics/yolov3.git
// download its weights 'yolov3.pt' or 'yolov3.weights'
cp {tensorrtx}/yolov3/gen_wts.py {ultralytics/yolov3/}
cd {ultralytics/yolov3/}
python gen_wts.py yolov3.weights
// a file 'yolov3.wts' will be generated.
// the master branch of yolov3 should work, if not, you can checkout cf7a4d31d37788023a9186a1a143a2dab0275ead
```

2. put yolov3.wts into tensorrtx/yolov3, build and run

```
mv yolov3.wts {tensorrtx}/yolov3/
cd {tensorrtx}/yolov3
mkdir build
cd build
cmake ..
make
sudo ./yolov3 -s                          // serialize model to plan file i.e. 'yolov3.engine'
sudo ./yolov3 -d ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed.
```

3. check the generated images, `_zidane.jpg` and `_bus.jpg`; they should look like the examples shown below.

# INT8 Quantization

1. Prepare calibration images. You can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh

2. unzip it in yolov3/build

3. set the macro `USE_INT8` in yolov3.cpp and make

4. serialize the model and test

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg">
</p>

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247970-60b27c00-751e-11ea-88df-41473fed4823.jpg">
</p>

## More Information

See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: yolov3/calibrator.cpp
================================================
#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/dnn/dnn.hpp>
#include "calibrator.h"
#include "cuda_runtime_api.h"
#include "utils.h"

// Stores the calibration settings, allocates one device buffer large enough for a
// whole batch of 3-channel float images (3 * input_w * input_h * batchsize floats)
// and collects the file names found in img_dir into img_files_.
// Note: input_blob_name is kept as a raw pointer (not copied), so the caller's
// string must outlive this object.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)
    , img_dir_(img_dir)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache)
{
    // Number of floats in one batch of input images
    input_count_ = 3 * input_w * input_h * batchsize;
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    read_files_in_dir(img_dir, img_files_);
}

// Releases the device buffer allocated by the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given to the constructor; getBatch() loads this many
// images per call.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT
{
    return batchsize_;
}

// Loads the next batchsize_ calibration images, preprocesses them, uploads them to
// the device buffer and publishes that buffer as binding 0.
// Returns false when fewer than batchsize_ images remain or when an image cannot
// be read; in both cases img_idx_ is left unchanged.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT
{
    if (img_idx_ + batchsize_ > (int)img_files_.size()) {
        return false;
    }

    // Tolerate an image directory given without a trailing path separator:
    // plain concatenation would otherwise glue directory and file name together
    // and every imread() would fail.
    std::string dir_prefix = img_dir_;
    if (!dir_prefix.empty() && dir_prefix.back() != '/' && dir_prefix.back() != '\\') {
        dir_prefix += '/';
    }

    std::vector<cv::Mat> input_imgs;
    input_imgs.reserve(batchsize_);
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        const std::string img_path = dir_prefix + img_files_[i];
        cv::Mat temp = cv::imread(img_path);
        if (temp.empty()){
            // Name the offending file so a broken calibration set can be fixed
            std::cerr << "Fatal error: image cannot open!" << " (" << img_path << ")" << std::endl;
            return false;
        }
        cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
        input_imgs.push_back(pr_img);
    }
    img_idx_ += batchsize_;
    // Scale to [0,1], swap R and B channels, no cropping; result is an NCHW float blob
    cv::Mat blob = cv::dnn::blobFromImages(input_imgs, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);

    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Reads the calibration table file into calib_cache_ when reading the cache is
// enabled and the file could be opened. Reports the number of bytes through
// `length` and returns a pointer to them, or nullptr when nothing was read.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream cache_file(calib_table_name_, std::ios::binary);
    // Keep whitespace bytes: the cache must be read verbatim
    cache_file >> std::noskipws;
    const bool can_read = read_cache_ && cache_file.good();
    if (can_read)
    {
        std::istream_iterator<char> first(cache_file);
        std::istream_iterator<char> last;
        for (; first != last; ++first)
        {
            calib_cache_.push_back(*first);
        }
    }

    length = calib_cache_.size();
    if (length == 0)
    {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes `length` bytes of calibration data to the calibration table file,
// replacing any previous content.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;

    const char* cache_bytes = static_cast<const char*>(cache);
    std::ofstream cache_file(calib_table_name_, std::ios::binary);
    cache_file.write(cache_bytes, length);
}


================================================
FILE: yolov3/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include "NvInfer.h"
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
    // batchsize:        number of images fed per getBatch() call
    // input_w/input_h:  network input size the images are preprocessed to
    // img_dir:          directory holding the calibration images
    // calib_table_name: file used to read/write the calibration cache
    // input_blob_name:  expected name of the network input binding (pointer is stored, not copied)
    // read_cache:       when false, readCalibrationCache() never loads the cache file
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);

    virtual ~Int8EntropyCalibrator2();
    // Returns the batch size passed to the constructor.
    int getBatchSize() const TRT_NOEXCEPT override;
    // Loads the next batch of images into device memory; returns false when no full batch is left.
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    // Returns the cached calibration table (or nullptr) and its size through `length`.
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    // Persists the calibration table to calib_table_name_.
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
    
private:
    int batchsize_;                       // images per calibration batch
    int input_w_;                         // network input width
    int input_h_;                         // network input height
    int img_idx_;                         // index of the next image in img_files_ to load
    std::string img_dir_;                 // directory prefix prepended to each file name
    std::vector<std::string> img_files_;  // file names found in img_dir_
    size_t input_count_;                  // number of floats in one input batch
    std::string calib_table_name_;        // calibration cache file path
    const char* input_blob_name_;         // non-owning; checked against names[0] in getBatch()
    bool read_cache_;                     // whether readCalibrationCache() may load the file
    void* device_input_;                  // device buffer allocated in the constructor, freed in the destructor
    std::vector<char> calib_cache_;       // bytes handed out by readCalibrationCache()
};

#endif // ENTROPY_CALIBRATOR_H


================================================
FILE: yolov3/gen_wts.py
================================================
import struct
import sys
import torch
from models import *  # noqa: F403
from utils.utils import *  # noqa: F403

# Build the Darknet model from the yolov3 cfg and load the weights file given
# as the first command-line argument (PyTorch '.pt' or darknet format).
model = Darknet('cfg/yolov3.cfg', (608, 608))  # noqa: F405
weights = sys.argv[1]
device = torch_utils.select_device('0')  # noqa: F405
if not weights.endswith('.pt'):  # darknet format
    load_darknet_weights(model, weights)  # noqa: F405
else:  # pytorch format
    # NOTE(review): weights_only=False unpickles arbitrary objects; only load trusted files.
    checkpoint = torch.load(weights, map_location=device, weights_only=False)
    model.load_state_dict(checkpoint['model'])
model = model.eval()

# Dump every tensor as: "<name> <count>  <hex> <hex> ...", where each value is a
# big-endian float32 written as hex; the first line holds the number of tensors.
state = model.state_dict()
with open('yolov3.wts', 'w') as f:
    f.write('{}\n'.format(len(state.keys())))
    for name, tensor in state.items():
        flat = tensor.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(name, len(flat)))
        f.write(''.join(' ' + struct.pack('>f', float(value)).hex() for value in flat))
        f.write('\n')


================================================
FILE: yolov3/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#else
#define TRT_NOEXCEPT
#endif

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, on sync or destruction, writes its contents to an output
//!  stream preceded by a timestamp (printed on std::cout) and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream stream the prefixed buffer contents are written to
    //! \param prefix text inserted before every flushed message
    //! \param shouldLog when false, putOutput() does nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Takes over the output stream, prefix and logging flag of \p other.
    //! All three members must be initialized here: leaving mShouldLog uninitialized
    //! would make the destructor branch on an indeterminate value, and an empty
    //! mPrefix would silently drop the severity prefix.
    //! Buffered characters of \p other are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] " to std::cout, then prefix + buffer contents to the
    //! output stream, and empties the buffer. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
protected:
    //! Stream buffer owned by this base so that it is constructed before any later base class
    //! or member of a derived type.
    LogStreamConsumerBuffer mBuffer;

public:
    //! \param stream    destination stream forwarded to the buffer
    //! \param prefix    text forwarded to the buffer as its message prefix
    //! \param shouldLog initial logging-enabled flag forwarded to the buffer
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
private:
    //! True when a message of \p severity passes the \p reportableSeverity threshold.
    static bool passesThreshold(Severity severity, Severity reportableSeverity)
    {
        return severity <= reportableSeverity;
    }

    //! Chooses the destination stream: std::cout for kINFO and above in enum order, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed one-letter tag; unknown values assert and yield "".
    static std::string severityPrefix(Severity severity)
    {
        if (severity == Severity::kINTERNAL_ERROR)
        {
            return "[F] ";
        }
        if (severity == Severity::kERROR)
        {
            return "[E] ";
        }
        if (severity == Severity::kWARNING)
        {
            return "[W] ";
        }
        if (severity == Severity::kINFO)
        {
            return "[I] ";
        }
        if (severity == Severity::kVERBOSE)
        {
            return "[V] ";
        }
        assert(0);
        return "";
    }

public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(
            severityOstream(severity), severityPrefix(severity), passesThreshold(severity, reportableSeverity))
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(passesThreshold(severity, reportableSeverity))
        , mSeverity(severity)
    {
    }

    //! \brief Move constructor: builds a fresh buffer configured from the source's severity and
    //!  logging flag. Text still pending in the source's buffer is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity,
    //!  and pushes the result down to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = passesThreshold(mSeverity, reportableSeverity);
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger with the given reportable severity (kWARNING when omitted).
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity Severity of the message; compared against the reportable severity.
    //! \param msg      NUL-terminated message text. A null pointer is logged as an empty message
    //!                 instead of being dereferenced.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        // Stream the C string directly: no temporary std::string allocation inside a noexcept function.
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << (msg != nullptr ? msg : "") << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        // Only Logger (a friend) can create atoms, via defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set by reportTestStart(), checked by reportTestEnd()
        std::string mName;    //!< Test name printed in result lines
        std::string mCmdline; //!< Command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Check the precondition before producing any output.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED for the test and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED for the test and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED for the test and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the severity threshold currently used to filter messages.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits a single line of the form "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; //!< Threshold used by log() to filter messages
};

namespace
{

//!
//! \brief Builds a stream-style log sink for messages of severity kVERBOSE.
//!
//! The sink is filtered by the logger's current reportable severity. Example:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Builds a stream-style log sink for messages of severity kINFO.
//!
//! The sink is filtered by the logger's current reportable severity. Example:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Builds a stream-style log sink for messages of severity kWARNING.
//!
//! The sink is filtered by the logger's current reportable severity. Example:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Builds a stream-style log sink for messages of severity kERROR.
//!
//! The sink is filtered by the logger's current reportable severity. Example:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Builds a stream-style log sink for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity).
//!
//! The sink is filtered by the logger's current reportable severity. Example:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: yolov3/macros.h
================================================
// Compatibility macros that let the same plugin sources carry different method qualifiers
// depending on the TensorRT major version being built against.
#ifndef __MACROS_H
#define __MACROS_H

// NV_TENSORRT_MAJOR is not defined in this header; it has to be visible before this point
// (presumably via the TensorRT headers - TODO confirm include order at each use site).
// If it is undefined, the preprocessor treats it as 0 and the #else branch is taken.
#if NV_TENSORRT_MAJOR >= 8
// Major version 8 and later: methods are declared noexcept and enqueue()'s output array is const.
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
// Earlier versions: both macros expand to nothing.
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: yolov3/utils.h
================================================
#ifndef __TRT_UTILS_H_
#define __TRT_UTILS_H_

#include <algorithm>
#include <cstring>
#include <iostream>
#include <vector>

#include <cudnn.h>
#include <dirent.h>
#include <opencv2/opencv.hpp>

#ifndef CUDA_CHECK

// Evaluates a CUDA runtime call once and, on any result other than cudaSuccess, prints the
// numeric code, its description from cudaGetErrorString() and the call site to std::cerr,
// then trips assert(0). With NDEBUG defined the assert is a no-op and execution continues.
// The expansion is a plain brace block, so invoke it as a stand-alone statement.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                  \
            assert(0);                                                                         \
        }                                                                                      \
    }

#endif

// Minimal helpers for packing/unpacking plain values into a raw byte buffer.
namespace Tn
{
    // Copies the object representation of `val` to `buffer` and advances `buffer` by sizeof(T).
    // Bytes are copied with memcpy, so `buffer` does not have to be aligned for T.
    // T is expected to be trivially copyable; the caller guarantees sizeof(T) writable bytes.
    template<typename T>
    void write(char*& buffer, const T& val)
    {
        std::memcpy(buffer, &val, sizeof(T));
        buffer += sizeof(T);
    }

    // Fills `val` from the next sizeof(T) bytes at `buffer` and advances `buffer` by sizeof(T).
    // Bytes are copied with memcpy, so `buffer` does not have to be aligned for T.
    // T is expected to be trivially copyable; the caller guarantees sizeof(T) readable bytes.
    template<typename T>
    void read(const char*& buffer, T& val)
    {
        std::memcpy(&val, buffer, sizeof(T));
        buffer += sizeof(T);
    }
}

// Letterboxes `img` into an input_w x input_h 3-channel 8-bit image: the source is scaled by the
// smaller of the two axis ratios (aspect ratio kept), centred, and the remaining border is filled
// with the value 128 in every channel.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float ratio_w = input_w / (img.cols * 1.0);
    const float ratio_h = input_h / (img.rows * 1.0);

    // When the width ratio is the smaller one the image fills the full width and is padded
    // top/bottom; otherwise it fills the full height and is padded left/right.
    const bool fill_width = ratio_h > ratio_w;
    const int scaled_w = fill_width ? input_w : static_cast<int>(ratio_h * img.cols);
    const int scaled_h = fill_width ? static_cast<int>(ratio_w * img.rows) : input_h;
    const int offset_x = fill_width ? 0 : (input_w - scaled_w) / 2;
    const int offset_y = fill_width ? (input_h - scaled_h) / 2 : 0;

    cv::Mat resized(scaled_h, scaled_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(offset_x, offset_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Appends the name of every entry of directory `p_dir_name` (excluding "." and "..") to
// `file_names`. Only the bare entry names are stored, not full paths.
// Returns 0 on success, or -1 if the directory cannot be opened (file_names is then untouched).
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent *entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.emplace_back(entry->d_name);
    }

    closedir(dir_handle);
    return 0;
}

#endif


================================================
FILE: yolov3/yololayer.cu
================================================
#include "yololayer.h"
#include "utils.h"
#include <assert.h>

using namespace Yolo;

namespace nvinfer1
{
    YoloLayerPlugin::YoloLayerPlugin()
    {
        // Compile-time configuration: class count plus the three predefined YoloKernel entries.
        mClassCount = CLASS_NUM;
        mYoloKernel = {yolo1, yolo2, yolo3};

        // Cached element count; this is the value written out by serialize().
        mKernelCount = mYoloKernel.size();
    }
    
    YoloLayerPlugin::~YoloLayerPlugin()
    {
        // Intentionally empty: the members clean up after themselves.
    }

    // create the plugin at runtime from a byte stream
    YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
    {
        // Layout mirrors serialize(): class count, thread count, kernel count, then the raw
        // YoloKernel array.
        const char* const begin = static_cast<const char*>(data);
        const char* cursor = begin;

        Tn::read(cursor, mClassCount);
        Tn::read(cursor, mThreadCount);
        Tn::read(cursor, mKernelCount);

        mYoloKernel.resize(mKernelCount);
        const size_t kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(mYoloKernel.data(), cursor, kernelBytes);
        cursor += kernelBytes;

        // The whole blob must have been consumed (checked in debug builds only).
        assert(cursor == begin + length);
    }

    void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT
    {
        // Writes: class count, thread count, kernel count, then the raw YoloKernel array.
        char* const begin = static_cast<char*>(buffer);
        char* cursor = begin;

        Tn::write(cursor, mClassCount);
        Tn::write(cursor, mThreadCount);
        Tn::write(cursor, mKernelCount);

        const size_t kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(cursor, mYoloKernel.data(), kernelBytes);
        cursor += kernelBytes;

        // Bytes written must match the advertised size (checked in debug builds only).
        assert(cursor == begin + getSerializationSize());
    }
    
    size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT
    {
        // Three scalar header fields followed by one YoloKernel record per stored kernel.
        const size_t headerBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount);
        const size_t kernelBytes = sizeof(Yolo::YoloKernel) * mYoloKernel.size();
        return headerBytes + kernelBytes;
    }

    int YoloLayerPlugin::initialize() TRT_NOEXCEPT
    {
        // Nothing to set up; always returns 0.
        return 0;
    }
    
    Dims YoloLayerPlugin::getOutputDimensions(int /*index*/, const Dims* /*inputs*/, int /*nbInputDims*/) TRT_NOEXCEPT
    {
        // The output is laid out along the first dimension: one float holding the detection
        // counter followed by room for MAX_OUTPUT_BBOX_COUNT Detection records.
        const int detectionFloats = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
        return Dims3(1 + detectionFloats, 1, 1);
    }

    // Set plugin namespace
    void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT
    {
        // Only the pointer is stored (no copy), so the caller's string must outlive this plugin.
        this->mPluginNamespace = pluginNamespace;
    }

    const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT
    {
        // Returns the pointer last handed to setPluginNamespace().
        return this->mPluginNamespace;
    }

    // Return the DataType of the plugin output at the requested index
    DataType YoloLayerPlugin::getOutputDataType(int /*index*/, const nvinfer1::DataType* /*inputTypes*/, int /*nbInputs*/) const TRT_NOEXCEPT
    {
        // The output is always FP32, whatever the input types are.
        return DataType::kFLOAT;
    }

    // Return true if output tensor is broadcast across a batch.
    bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int /*outputIndex*/, const bool* /*inputIsBroadcasted*/, int /*nbInputs*/) const TRT_NOEXCEPT
    {
        // Always reports that the output is not broadcast across the batch.
        return false;
    }

    // Return true if plugin can use input that is broadcast across batch without replication.
    bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int /*inputIndex*/) const TRT_NOEXCEPT
    {
        // Always reports that broadcast inputs are not supported.
        return false;
    }

    void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* /*in*/, int /*nbInput*/, const PluginTensorDesc* /*out*/, int /*nbOutput*/) TRT_NOEXCEPT
    {
        // Intentionally empty: the tensor descriptions are not used.
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT
    {
    }

    // Detach the plugin object from its execution context.
    void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT
    {
        // Intentionally empty: attachToContext() keeps nothing that would need releasing.
    }

    const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT
    {
        // Same string that YoloPluginCreator::getPluginName() returns.
        return "YoloLayer_TRT";
    }

    const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT
    {
        // Same string that YoloPluginCreator::getPluginVersion() returns.
        return "1";
    }

    void YoloLayerPlugin::destroy() TRT_NOEXCEPT
    {
        // The object deletes itself; it must have been created with `new` and must not be
        // used after this call.
        delete this;
    }

    // Clone the plugin
    // Creates a heap-allocated copy of this plugin; the caller takes ownership of the result.
    // The copy carries over this instance's configuration (class count, thread count, kernels)
    // rather than the compile-time defaults, so a plugin restored from serialized data clones
    // faithfully. For a default-constructed plugin the copied values equal the defaults.
    IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT
    {
        YoloLayerPlugin *p = new YoloLayerPlugin();
        p->mClassCount = mClassCount;
        p->mThreadCount = mThreadCount;
        p->mKernelCount = mKernelCount;
        p->mYoloKernel = mYoloKernel;
        p->setPluginNamespace(mPluginNamespace);
        return p;
    }

    __device__ float Logist(float data)
    {
        // Logistic sigmoid: 1 / (1 + e^(-data)).
        const float decay = expf(-data);
        return 1.0f / (1.0f + decay);
    }

    // Decodes one YOLO output map into Detection records.
    //
    // One thread handles one (batch item, grid cell) pair and loops over the CHECK_COUNT anchors.
    //   input       - raw network output for this map; per batch item it is indexed as
    //                 [anchor][field][cell] with (5 + classes) fields per anchor
    //   output      - per batch item: one float detection counter followed by Detection records
    //   noElements  - total number of (batch item, cell) pairs, i.e. the number of useful threads
    //   yoloWidth/yoloHeight - grid size of this map
    //   anchors     - CHECK_COUNT (w, h) pairs, read on the device
    //   classes     - number of class scores per anchor
    //   outputElem  - number of floats reserved per batch item in `output`
    __global__ void CalDetection(const float *input, float *output,int noElements, 
            int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) {
 
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= noElements) return;

        const int total_grid = yoloWidth * yoloHeight;
        const int bnIdx = idx / total_grid;   // batch item
        idx = idx - total_grid*bnIdx;         // cell index within the grid
        const int info_len_i = 5 + classes;
        const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);

        // Iterate over every anchor; the bound matches the CHECK_COUNT used for the input stride
        // and the anchors array instead of a separate literal.
        for (int k = 0; k < CHECK_COUNT; ++k) {
            // First field of anchor k for this cell; field f lives at anchorInput[f * total_grid].
            const float* anchorInput = curInput + idx + k * info_len_i * total_grid;

            // Best class score (after sigmoid) and its index.
            int class_id = 0;
            float max_cls_prob = 0.0;
            for (int i = 5; i < info_len_i; ++i) {
                float p = Logist(anchorInput[i * total_grid]);
                if (p > max_cls_prob) {
                    max_cls_prob = p;
                    class_id = i - 5;
                }
            }
            float box_prob = Logist(anchorInput[4 * total_grid]);
            if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue;

            // Reserve a slot through the per-item float counter. The counter keeps growing even
            // when the slots are exhausted, so readers must clamp it to MAX_OUTPUT_BBOX_COUNT.
            float *res_count = output + bnIdx*outputElem;
            int count = (int)atomicAdd(res_count, 1);
            if (count >= MAX_OUTPUT_BBOX_COUNT) return;
            char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection);
            Detection* det =  (Detection*)(data);

            int row = idx / yoloWidth;
            int col = idx % yoloWidth;

            //Location: centre x/y scaled to network input pixels, width/height scaled by the anchor
            det->bbox[0] = (col + Logist(anchorInput[0 * total_grid])) * INPUT_W / yoloWidth;
            det->bbox[1] = (row + Logist(anchorInput[1 * total_grid])) * INPUT_H / yoloHeight;
            det->bbox[2] = expf(anchorInput[2 * total_grid]) * anchors[2*k];
            det->bbox[3] = expf(anchorInput[3 * total_grid]) * anchors[2*k + 1];
            det->det_confidence = box_prob;
            det->class_id = class_id;
            det->class_confidence = max_cls_prob;
        }
    }

    void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
        void* devAnchor;
        size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
        CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));

        int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);

        for(int idx = 0 ; idx < batchSize; ++idx) {
            CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float)));
        }
        int numElem = 0;
        for (unsigned int i = 0;i< mYoloKernel.size();++i)
        {
            const auto& yolo = mYoloKernel[i];
            numElem = yolo.width*yolo.height*batchSize;
            if (numElem < mThreadCount)
                mThreadCount = numElem;
            CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
            CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>
                (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount ,outputElem);
        }

        CUDA_CHECK(cudaFree(devAnchor));
    }


    int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT
    {
        // All tensors are handled as float; the single output is outputs[0].
        // `workspace` is unused (getWorkspaceSize() reports 0).
        const float* const* typedInputs = reinterpret_cast<const float* const*>(inputs);
        float* detections = static_cast<float*>(outputs[0]);

        forwardGpu(typedInputs, detections, stream, batchSize);

        // Always reports success.
        return 0;
    }

    // Out-of-class definitions of YoloPluginCreator's static members.
    // mFC is value-initialized here; its fields are filled in by the YoloPluginCreator constructor.
    PluginFieldCollection YoloPluginCreator::mFC{};
    // Backing storage for the field descriptions referenced by mFC (left empty by the constructor).
    std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

    YoloPluginCreator::YoloPluginCreator()
    {
        // This plugin exposes no creation-time fields: empty the shared attribute list and make
        // the shared field collection describe it.
        mPluginAttributes.clear();

        mFC.fields = mPluginAttributes.data();
        mFC.nbFields = mPluginAttributes.size();
    }

    const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT
    {
        // Same string that YoloLayerPlugin::getPluginType() returns.
        return "YoloLayer_TRT";
    }

    const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT
    {
        // Same string that YoloLayerPlugin::getPluginVersion() returns.
        return "1";
    }

    const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT
    {
        // Address of the static collection filled in by the constructor.
        return &mFC;
    }

    IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* /*name*/, const PluginFieldCollection* /*fc*/) TRT_NOEXCEPT
    {
        // Neither the name nor the field collection is consulted: the plugin is always built
        // with its default configuration and tagged with this creator's namespace.
        YoloLayerPlugin* plugin = new YoloLayerPlugin();
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

    IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* /*name*/, const void* serialData, size_t serialLength) TRT_NOEXCEPT
    {
        // Rebuild the plugin from its serialized bytes and tag it with this creator's namespace.
        // The returned object is heap-allocated; it is released through YoloLayerPlugin::destroy().
        YoloLayerPlugin* plugin = new YoloLayerPlugin(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

}


================================================
FILE: yolov3/yololayer.h
================================================
#ifndef _YOLO_LAYER_H
#define _YOLO_LAYER_H

#include <iostream>
#include <vector>
#include "NvInfer.h"
#include "macros.h"


// Compile-time configuration of the YOLO detection layer.
namespace Yolo
{
    // Number of anchors evaluated per grid cell.
    static constexpr int CHECK_COUNT = 3;
    // Scores below this threshold are discarded inside the detection kernel.
    static constexpr float IGNORE_THRESH = 0.1f;
    // Capacity, in Detection records, of the output buffer per batch item.
    static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
    // Number of object classes.
    static constexpr int CLASS_NUM = 80;
    // Network input resolution in pixels.
    static constexpr int INPUT_H = 608;
    static constexpr int INPUT_W = 608;

    // Description of one output map: its grid size and CHECK_COUNT anchor (w, h) pairs.
    struct YoloKernel
    {
        int width;
        int height;
        float anchors[CHECK_COUNT*2];
    };

    // Grid at 1/32 of the input resolution.
    static constexpr YoloKernel yolo1 = {
        INPUT_W / 32,
        INPUT_H / 32,
        {116.0f, 90.0f,  156.0f, 198.0f,  373.0f, 326.0f}
    };
    // Grid at 1/16 of the input resolution.
    static constexpr YoloKernel yolo2 = {
        INPUT_W / 16,
        INPUT_H / 16,
        {30.0f, 61.0f,  62.0f, 45.0f,  59.0f, 119.0f}
    };
    // Grid at 1/8 of the input resolution.
    static constexpr YoloKernel yolo3 = {
        INPUT_W / 8,
        INPUT_H / 8,
        {10.0f, 13.0f,  16.0f, 30.0f,  33.0f, 23.0f}
    };

    // Number of box coordinates stored per detection.
    static constexpr int LOCATIONS = 4;

    // One decoded detection; every field is a float so the record can live inside a float buffer.
    struct alignas(float) Detection{
        float bbox[LOCATIONS];    // x, y, w, h
        float det_confidence;     // box confidence
        float class_id;           // index of the best class, stored as float
        float class_confidence;   // score of the best class
    };
}

namespace nvinfer1
{
    // TensorRT plugin that decodes the raw YOLO output maps into a flat list of detections.
    // It produces a single FP32 output in linear format.
    class YoloLayerPlugin: public IPluginV2IOExt
    {
        public:
            // Builds the plugin with its default configuration.
            explicit YoloLayerPlugin();
            // Rebuilds the plugin from `length` bytes previously produced by serialize().
            YoloLayerPlugin(const void* data, size_t length);

            ~YoloLayerPlugin();

            // The plugin has exactly one output tensor.
            int getNbOutputs() const TRT_NOEXCEPT override
            {
                return 1;
            }

            Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

            int initialize() TRT_NOEXCEPT override;

            // Nothing to release.
            virtual void terminate() TRT_NOEXCEPT override {}

            // No scratch workspace is requested.
            virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0;}

            virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

            virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

            virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

            // Every input/output must be FP32 in linear format.
            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const TRT_NOEXCEPT override;

            const char* getPluginVersion() const TRT_NOEXCEPT override;

            void destroy() TRT_NOEXCEPT override;

            IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

            void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

            const char* getPluginNamespace() const TRT_NOEXCEPT override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;

            bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;

            bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

            void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;

            void detachFromContext() TRT_NOEXCEPT override;

        private:
            // Launches the detection kernel for every configured YoloKernel.
            void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1);
            // Number of object classes; assigned by both constructors.
            int mClassCount = 0;
            // Number of entries in mYoloKernel as stored in the serialized form.
            int mKernelCount = 0;
            // One entry per output map.
            std::vector<Yolo::YoloKernel> mYoloKernel;
            // Upper bound on threads per block for the detection kernel.
            int mThreadCount = 256;
            // Non-owning pointer set by setPluginNamespace(). It starts as the empty string so
            // that getPluginNamespace() and clone() never read an indeterminate pointer.
            const char* mPluginNamespace = "";
    };

    // Factory through which YoloLayerPlugin instances are created or deserialized.
    class YoloPluginCreator : public IPluginCreator
    {
        private:
            // Namespace handed to every plugin this creator produces.
            std::string mNamespace;
            // Field description shared by all creator instances.
            static PluginFieldCollection mFC;
            static std::vector<PluginField> mPluginAttributes;

        public:
            YoloPluginCreator();

            ~YoloPluginCreator() override = default;

            const char* getPluginName() const TRT_NOEXCEPT override;

            const char* getPluginVersion() const TRT_NOEXCEPT override;

            const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

            // Creates a plugin with the default configuration.
            IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;

            // Recreates a plugin from serialized bytes.
            IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;

            // Stores a copy of the namespace string.
            void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override
            {
                mNamespace = libNamespace;
            }

            // Pointer into the stored copy; it stays valid until the namespace is changed again
            // or the creator is destroyed.
            const char* getPluginNamespace() const TRT_NOEXCEPT override
            {
                return mNamespace.c_str();
            }
    };
    REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};

#endif 


================================================
FILE: yolov3/yolov3.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "utils.h"
#include "logging.h"
#include "yololayer.h"
#include "calibrator.h"

// Build-time configuration of the sample.
#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
#define DEVICE 0  // GPU id
// Default IoU threshold argument of nms().
#define NMS_THRESH 0.4
// Detections whose box confidence is at or below this value are dropped by nms().
#define BBOX_CONF_THRESH 0.5

using namespace nvinfer1;

// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
// Size of one Detection record, in floats.
static const int DETECTION_SIZE = sizeof(Yolo::Detection) / sizeof(float);
// Floats per batch item in the output blob: one leading float plus MAX_OUTPUT_BBOX_COUNT records.
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DETECTION_SIZE + 1;  // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
// Blob names (presumably the engine's input and output tensor names - confirm at the use sites).
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// File-local logger instance.
static Logger gLogger;

// Maps a box predicted in network-input coordinates back onto the original image.
//
// img  : the original (un-resized) image; only its cols/rows are read.
// bbox : {center_x, center_y, width, height} in INPUT_W x INPUT_H coordinates.
//
// The image is assumed to have been scaled by the smaller of the two ratios
// INPUT_W/cols and INPUT_H/rows and centred in the input, so the unused margin
// on the other axis is removed before dividing by the scale.
// Returns the box as a cv::Rect(x, y, width, height) in image pixels.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float scale_w = INPUT_W / (img.cols * 1.0);
    const float scale_h = INPUT_H / (img.rows * 1.0);

    // The smaller ratio is the one that was actually applied to the image.
    const bool margin_is_vertical = scale_h > scale_w;
    const float scale = margin_is_vertical ? scale_w : scale_h;

    // Margin on the axis that was not filled; zero on the other axis.
    const float margin_x = margin_is_vertical ? 0.f : (INPUT_W - scale_h * img.cols) / 2;
    const float margin_y = margin_is_vertical ? (INPUT_H - scale_w * img.rows) / 2 : 0.f;

    // Edges are truncated to int before rescaling, then truncated again after.
    int left = bbox[0] - bbox[2] / 2.f - margin_x;
    int right = bbox[0] + bbox[2] / 2.f - margin_x;
    int top = bbox[1] - bbox[3] / 2.f - margin_y;
    int bottom = bbox[1] + bbox[3] / 2.f - margin_y;

    left = left / scale;
    right = right / scale;
    top = top / scale;
    bottom = bottom / scale;

    return cv::Rect(left, top, right - left, bottom - top);
}

// Intersection-over-union of two boxes given as {center_x, center_y, width, height}.
//
// Returns 0 when the boxes do not overlap, and also when the union area is not
// positive (e.g. two zero-sized boxes), which would otherwise produce 0/0 = NaN.
float iou(float lbox[4], float rbox[4]) {
    // Edges of the intersection rectangle.
    const float left = std::max(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
    const float right = std::min(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
    const float top = std::max(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
    const float bottom = std::min(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);

    if (top > bottom || left > right)
        return 0.0f;

    const float interArea = (right - left) * (bottom - top);
    const float unionArea = lbox[2] * lbox[3] + rbox[2] * rbox[3] - interArea;

    // Degenerate boxes: avoid dividing by zero.
    if (unionArea <= 0.0f)
        return 0.0f;

    return interArea / unionArea;
}

// Ordering predicate for std::sort: true when `a` has a strictly higher
// det_confidence than `b`, so sorting yields the most confident box first.
bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
    return b.det_confidence < a.det_confidence;
}

// Per-class non-maximum suppression over the raw network output.
//
// res        : receives the kept detections (appended, not cleared).
// output     : output[0] is the number of detections, followed by that many
//              Yolo::Detection records stored as consecutive floats.
// nms_thresh : a box is dropped when its IoU with an already kept box of the
//              same class exceeds this value.
//
// Detections whose confidence (the float at offset 4 of each record) is
// <= BBOX_CONF_THRESH are ignored.
void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
    // Record stride in floats, derived from the struct instead of a literal 7.
    const int det_floats = sizeof(Yolo::Detection) / sizeof(float);

    // Group the confident detections by class id.
    std::map<float, std::vector<Yolo::Detection>> per_class;
    // Never read past the capacity the output buffer was sized for.
    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
        const float* record = &output[1 + det_floats * i];
        if (record[4] <= BBOX_CONF_THRESH) continue;
        Yolo::Detection det;
        memcpy(&det, record, sizeof(Yolo::Detection));
        per_class[det.class_id].push_back(det);
    }

    for (auto& entry : per_class) {
        auto& dets = entry.second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t keep = 0; keep < dets.size(); ++keep) {
            // Erasing only touches elements after `keep`, so this reference stays valid.
            auto& item = dets[keep];
            res.push_back(item);
            for (size_t n = keep + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;  // re-examine the element that slid into slot n
                }
            }
        }
    }
}

// Loads a .wts weight file into a name -> Weights map.
//
// File layout as read below (whitespace separated):
//   <number of blobs>
//   <name> <size> <size values, each a 32-bit word in hex>   (one entry per blob)
//
// Every blob is stored as DataType::kFLOAT; the hex words are the raw bit
// patterns of the values. The value buffers are malloc'ed and owned by the
// caller, which must free() each `values` pointer.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // `> 0` keeps a bogus negative count from looping when asserts are compiled out.
    while (count-- > 0)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. Allocate by element size (not by pointer size).
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds an inference-time batch-norm to `network` as a per-channel scale layer.
//
// The layer computes (x * scale + shift) ^ power per channel with
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// where gamma/beta/mean/var are read from weightMap under
// lname + ".weight" / ".bias" / ".running_mean" / ".running_var".
//
// The three derived buffers are malloc'ed here and stored back into weightMap
// under lname + ".scale" / ".shift" / ".power".
// Returns the created scale layer.
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int channels = weightMap[lname + ".running_var"].count;

    float* scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float* shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
    float* powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));

    // Fold the batch-norm statistics into one multiply-add per channel.
    for (int c = 0; c < channels; ++c) {
        scaleVals[c] = gamma[c] / sqrt(var[c] + eps);
        shiftVals[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        powerVals[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scaleVals, channels};
    Weights shift{DataType::kFLOAT, shiftVals, channels};
    Weights power{DataType::kFLOAT, powerVals, channels};

    // Keep the buffers reachable through the map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Adds one convolution -> batch-norm -> leaky-ReLU unit to `network`.
//
// outch : number of output channels of the convolution
// ksize : square kernel size
// s, p  : stride and padding (same value on both axes)
// linx  : layer index; weights are looked up under "module_list.<linx>.*"
//
// The convolution has no bias, the batch-norm uses eps = 1e-5 and the
// activation slope is 0.1. Returns the activation layer.
ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, int ksize, int s, int p, int linx) {
    const std::string prefix = "module_list." + std::to_string(linx);
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[prefix + ".Conv2d.weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), prefix + ".BatchNorm2d", 1e-5);

    auto leaky = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky->setAlpha(0.1);

    return leaky;
}

// Create the engine using only the API and not any parser.
//
// Builds the whole YOLOv3 graph layer by layer, loads the weights from
// "../yolov3.wts", attaches the "YoloLayer_TRT" plugin to the three detection
// heads and builds a CUDA engine from it.
//
// maxBatchSize : forwarded to builder->setMaxBatchSize()
// builder      : TensorRT builder used to create the network and the engine
// config       : builder config; workspace size and precision flags are set here
// dt           : data type of the network input tensor
//
// Returns the result of buildEngineWithConfig(). The network and all host
// weight buffers held in weightMap are released before returning.
// NOTE(review): the "Build engine successfully!" message is printed without
// checking the returned engine pointer; callers must check it themselves.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../yolov3.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // The complete darknet architecture is written out explicitly.
    // Naming: lrN is a conv+bn+leaky unit, ewN an element-wise sum (shortcut);
    // N is the layer index, which is also the "module_list.N" key in the weight file.
    auto lr0 = convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0);
    auto lr1 = convBnLeaky(network, weightMap, *lr0->getOutput(0), 64, 3, 2, 1, 1);
    auto lr2 = convBnLeaky(network, weightMap, *lr1->getOutput(0), 32, 1, 1, 0, 2);
    auto lr3 = convBnLeaky(network, weightMap, *lr2->getOutput(0), 64, 3, 1, 1, 3);
    auto ew4 = network->addElementWise(*lr3->getOutput(0), *lr1->getOutput(0), ElementWiseOperation::kSUM);
    auto lr5 = convBnLeaky(network, weightMap, *ew4->getOutput(0), 128, 3, 2, 1, 5);
    auto lr6 = convBnLeaky(network, weightMap, *lr5->getOutput(0), 64, 1, 1, 0, 6);
    auto lr7 = convBnLeaky(network, weightMap, *lr6->getOutput(0), 128, 3, 1, 1, 7);
    auto ew8 = network->addElementWise(*lr7->getOutput(0), *lr5->getOutput(0), ElementWiseOperation::kSUM);
    auto lr9 = convBnLeaky(network, weightMap, *ew8->getOutput(0), 64, 1, 1, 0, 9);
    auto lr10 = convBnLeaky(network, weightMap, *lr9->getOutput(0), 128, 3, 1, 1, 10);
    auto ew11 = network->addElementWise(*lr10->getOutput(0), *ew8->getOutput(0), ElementWiseOperation::kSUM);
    auto lr12 = convBnLeaky(network, weightMap, *ew11->getOutput(0), 256, 3, 2, 1, 12);
    auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 128, 1, 1, 0, 13);
    auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 256, 3, 1, 1, 14);
    auto ew15 = network->addElementWise(*lr14->getOutput(0), *lr12->getOutput(0), ElementWiseOperation::kSUM);
    auto lr16 = convBnLeaky(network, weightMap, *ew15->getOutput(0), 128, 1, 1, 0, 16);
    auto lr17 = convBnLeaky(network, weightMap, *lr16->getOutput(0), 256, 3, 1, 1, 17);
    auto ew18 = network->addElementWise(*lr17->getOutput(0), *ew15->getOutput(0), ElementWiseOperation::kSUM);
    auto lr19 = convBnLeaky(network, weightMap, *ew18->getOutput(0), 128, 1, 1, 0, 19);
    auto lr20 = convBnLeaky(network, weightMap, *lr19->getOutput(0), 256, 3, 1, 1, 20);
    auto ew21 = network->addElementWise(*lr20->getOutput(0), *ew18->getOutput(0), ElementWiseOperation::kSUM);
    auto lr22 = convBnLeaky(network, weightMap, *ew21->getOutput(0), 128, 1, 1, 0, 22);
    auto lr23 = convBnLeaky(network, weightMap, *lr22->getOutput(0), 256, 3, 1, 1, 23);
    auto ew24 = network->addElementWise(*lr23->getOutput(0), *ew21->getOutput(0), ElementWiseOperation::kSUM);
    auto lr25 = convBnLeaky(network, weightMap, *ew24->getOutput(0), 128, 1, 1, 0, 25);
    auto lr26 = convBnLeaky(network, weightMap, *lr25->getOutput(0), 256, 3, 1, 1, 26);
    auto ew27 = network->addElementWise(*lr26->getOutput(0), *ew24->getOutput(0), ElementWiseOperation::kSUM);
    auto lr28 = convBnLeaky(network, weightMap, *ew27->getOutput(0), 128, 1, 1, 0, 28);
    auto lr29 = convBnLeaky(network, weightMap, *lr28->getOutput(0), 256, 3, 1, 1, 29);
    auto ew30 = network->addElementWise(*lr29->getOutput(0), *ew27->getOutput(0), ElementWiseOperation::kSUM);
    auto lr31 = convBnLeaky(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31);
    auto lr32 = convBnLeaky(network, weightMap, *lr31->getOutput(0), 256, 3, 1, 1, 32);
    auto ew33 = network->addElementWise(*lr32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM);
    auto lr34 = convBnLeaky(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34);
    auto lr35 = convBnLeaky(network, weightMap, *lr34->getOutput(0), 256, 3, 1, 1, 35);
    // ew36 is reused later as the skip input of the third detection branch (cat98).
    auto ew36 = network->addElementWise(*lr35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM);
    auto lr37 = convBnLeaky(network, weightMap, *ew36->getOutput(0), 512, 3, 2, 1, 37);
    auto lr38 = convBnLeaky(network, weightMap, *lr37->getOutput(0), 256, 1, 1, 0, 38);
    auto lr39 = convBnLeaky(network, weightMap, *lr38->getOutput(0), 512, 3, 1, 1, 39);
    auto ew40 = network->addElementWise(*lr39->getOutput(0), *lr37->getOutput(0), ElementWiseOperation::kSUM);
    auto lr41 = convBnLeaky(network, weightMap, *ew40->getOutput(0), 256, 1, 1, 0, 41);
    auto lr42 = convBnLeaky(network, weightMap, *lr41->getOutput(0), 512, 3, 1, 1, 42);
    auto ew43 = network->addElementWise(*lr42->getOutput(0), *ew40->getOutput(0), ElementWiseOperation::kSUM);
    auto lr44 = convBnLeaky(network, weightMap, *ew43->getOutput(0), 256, 1, 1, 0, 44);
    auto lr45 = convBnLeaky(network, weightMap, *lr44->getOutput(0), 512, 3, 1, 1, 45);
    auto ew46 = network->addElementWise(*lr45->getOutput(0), *ew43->getOutput(0), ElementWiseOperation::kSUM);
    auto lr47 = convBnLeaky(network, weightMap, *ew46->getOutput(0), 256, 1, 1, 0, 47);
    auto lr48 = convBnLeaky(network, weightMap, *lr47->getOutput(0), 512, 3, 1, 1, 48);
    auto ew49 = network->addElementWise(*lr48->getOutput(0), *ew46->getOutput(0), ElementWiseOperation::kSUM);
    auto lr50 = convBnLeaky(network, weightMap, *ew49->getOutput(0), 256, 1, 1, 0, 50);
    auto lr51 = convBnLeaky(network, weightMap, *lr50->getOutput(0), 512, 3, 1, 1, 51);
    auto ew52 = network->addElementWise(*lr51->getOutput(0), *ew49->getOutput(0), ElementWiseOperation::kSUM);
    auto lr53 = convBnLeaky(network, weightMap, *ew52->getOutput(0), 256, 1, 1, 0, 53);
    auto lr54 = convBnLeaky(network, weightMap, *lr53->getOutput(0), 512, 3, 1, 1, 54);
    auto ew55 = network->addElementWise(*lr54->getOutput(0), *ew52->getOutput(0), ElementWiseOperation::kSUM);
    auto lr56 = convBnLeaky(network, weightMap, *ew55->getOutput(0), 256, 1, 1, 0, 56);
    auto lr57 = convBnLeaky(network, weightMap, *lr56->getOutput(0), 512, 3, 1, 1, 57);
    auto ew58 = network->addElementWise(*lr57->getOutput(0), *ew55->getOutput(0), ElementWiseOperation::kSUM);
    auto lr59 = convBnLeaky(network, weightMap, *ew58->getOutput(0), 256, 1, 1, 0, 59);
    auto lr60 = convBnLeaky(network, weightMap, *lr59->getOutput(0), 512, 3, 1, 1, 60);
    // ew61 is reused later as the skip input of the second detection branch (cat86).
    auto ew61 = network->addElementWise(*lr60->getOutput(0), *ew58->getOutput(0), ElementWiseOperation::kSUM);
    auto lr62 = convBnLeaky(network, weightMap, *ew61->getOutput(0), 1024, 3, 2, 1, 62);
    auto lr63 = convBnLeaky(network, weightMap, *lr62->getOutput(0), 512, 1, 1, 0, 63);
    auto lr64 = convBnLeaky(network, weightMap, *lr63->getOutput(0), 1024, 3, 1, 1, 64);
    auto ew65 = network->addElementWise(*lr64->getOutput(0), *lr62->getOutput(0), ElementWiseOperation::kSUM);
    auto lr66 = convBnLeaky(network, weightMap, *ew65->getOutput(0), 512, 1, 1, 0, 66);
    auto lr67 = convBnLeaky(network, weightMap, *lr66->getOutput(0), 1024, 3, 1, 1, 67);
    auto ew68 = network->addElementWise(*lr67->getOutput(0), *ew65->getOutput(0), ElementWiseOperation::kSUM);
    auto lr69 = convBnLeaky(network, weightMap, *ew68->getOutput(0), 512, 1, 1, 0, 69);
    auto lr70 = convBnLeaky(network, weightMap, *lr69->getOutput(0), 1024, 3, 1, 1, 70);
    auto ew71 = network->addElementWise(*lr70->getOutput(0), *ew68->getOutput(0), ElementWiseOperation::kSUM);
    auto lr72 = convBnLeaky(network, weightMap, *ew71->getOutput(0), 512, 1, 1, 0, 72);
    auto lr73 = convBnLeaky(network, weightMap, *lr72->getOutput(0), 1024, 3, 1, 1, 73);
    auto ew74 = network->addElementWise(*lr73->getOutput(0), *ew71->getOutput(0), ElementWiseOperation::kSUM);
    auto lr75 = convBnLeaky(network, weightMap, *ew74->getOutput(0), 512, 1, 1, 0, 75);
    auto lr76 = convBnLeaky(network, weightMap, *lr75->getOutput(0), 1024, 3, 1, 1, 76);
    auto lr77 = convBnLeaky(network, weightMap, *lr76->getOutput(0), 512, 1, 1, 0, 77);
    auto lr78 = convBnLeaky(network, weightMap, *lr77->getOutput(0), 1024, 3, 1, 1, 78);
    auto lr79 = convBnLeaky(network, weightMap, *lr78->getOutput(0), 512, 1, 1, 0, 79);
    auto lr80 = convBnLeaky(network, weightMap, *lr79->getOutput(0), 1024, 3, 1, 1, 80);
    // First detection head: plain 1x1 convolution with bias, 3 * (CLASS_NUM + 5) output channels.
    IConvolutionLayer* conv81 = network->addConvolutionNd(*lr80->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.81.Conv2d.weight"], weightMap["module_list.81.Conv2d.bias"]);
    assert(conv81);
    // 82 is yolo
    // Layer 83 simply continues from the output of layer 79.
    auto l83 = lr79;
    auto lr84 = convBnLeaky(network, weightMap, *l83->getOutput(0), 256, 1, 1, 0, 84);

    // Layer 85: a 2x2, stride-2 deconvolution with one group per channel and
    // all-ones weights, so every input value is copied into a 2x2 output patch.
    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
    for (int i = 0; i < 256 * 2 * 2; i++) {
        deval[i] = 1.0;
    }
    Weights deconvwts85{DataType::kFLOAT, deval, 256 * 2 * 2};
    IDeconvolutionLayer* deconv85 = network->addDeconvolutionNd(*lr84->getOutput(0), 256, DimsHW{2, 2}, deconvwts85, emptywts);
    assert(deconv85);
    deconv85->setStrideNd(DimsHW{2, 2});
    deconv85->setNbGroups(256);
    // Registered in weightMap so that `deval` is freed by the release loop at the end.
    weightMap["deconv85"] = deconvwts85;

    // Layer 86: concatenate the upsampled features with the earlier ew61 output.
    ITensor* inputTensors[] = {deconv85->getOutput(0), ew61->getOutput(0)};
    auto cat86 = network->addConcatenation(inputTensors, 2);
    auto lr87 = convBnLeaky(network, weightMap, *cat86->getOutput(0), 256, 1, 1, 0, 87);
    auto lr88 = convBnLeaky(network, weightMap, *lr87->getOutput(0), 512, 3, 1, 1, 88);
    auto lr89 = convBnLeaky(network, weightMap, *lr88->getOutput(0), 256, 1, 1, 0, 89);
    auto lr90 = convBnLeaky(network, weightMap, *lr89->getOutput(0), 512, 3, 1, 1, 90);
    auto lr91 = convBnLeaky(network, weightMap, *lr90->getOutput(0), 256, 1, 1, 0, 91);
    auto lr92 = convBnLeaky(network, weightMap, *lr91->getOutput(0), 512, 3, 1, 1, 92);
    // Second detection head.
    IConvolutionLayer* conv93 = network->addConvolutionNd(*lr92->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.93.Conv2d.weight"], weightMap["module_list.93.Conv2d.bias"]);
    assert(conv93);
    // 94 is yolo
    // Layer 95 simply continues from the output of layer 91.
    auto l95 = lr91;
    auto lr96 = convBnLeaky(network, weightMap, *l95->getOutput(0), 128, 1, 1, 0, 96);
    // Layer 97: same all-ones deconvolution as layer 85, for 128 channels. It
    // shares the `deval` buffer and only uses its first 128 * 2 * 2 entries.
    Weights deconvwts97{DataType::kFLOAT, deval, 128 * 2 * 2};
    IDeconvolutionLayer* deconv97 = network->addDeconvolutionNd(*lr96->getOutput(0), 128, DimsHW{2, 2}, deconvwts97, emptywts);
    assert(deconv97);
    deconv97->setStrideNd(DimsHW{2, 2});
    deconv97->setNbGroups(128);
    // Layer 98: concatenate with the earlier ew36 output.
    ITensor* inputTensors1[] = {deconv97->getOutput(0), ew36->getOutput(0)};
    auto cat98 = network->addConcatenation(inputTensors1, 2);
    auto lr99 = convBnLeaky(network, weightMap, *cat98->getOutput(0), 128, 1, 1, 0, 99);
    auto lr100 = convBnLeaky(network, weightMap, *lr99->getOutput(0), 256, 3, 1, 1, 100);
    auto lr101 = convBnLeaky(network, weightMap, *lr100->getOutput(0), 128, 1, 1, 0, 101);
    auto lr102 = convBnLeaky(network, weightMap, *lr101->getOutput(0), 256, 3, 1, 1, 102);
    auto lr103 = convBnLeaky(network, weightMap, *lr102->getOutput(0), 128, 1, 1, 0, 103);
    auto lr104 = convBnLeaky(network, weightMap, *lr103->getOutput(0), 256, 3, 1, 1, 104);
    // Third detection head.
    IConvolutionLayer* conv105 = network->addConvolutionNd(*lr104->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.105.Conv2d.weight"], weightMap["module_list.105.Conv2d.bias"]);
    assert(conv105);

    // Look up the registered YOLO plugin and feed it the three head outputs.
    // NOTE(review): `creator` is not checked for nullptr and `pluginObj` is not
    // released in this function - confirm whether that is intended.
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    const PluginFieldCollection* pluginData = creator->getFieldNames();
    IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData);
    ITensor* inputTensors_yolo[] = {conv81->getOutput(0), conv93->getOutput(0), conv105->getOutput(0)};
    auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj);

    // The plugin output is the single network output.
    yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    // Calibration reads images from ./coco_calib/ and uses int8calib.table as its table file.
    // NOTE(review): the calibrator is allocated with new and never deleted here.
    Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    // This covers the loaded weights, the buffers added by addBatchNorm2d and `deval`.
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the YOLOv3 engine and serializes it.
//
// maxBatchSize : maximum batch size the engine is built for
// modelStream  : out-parameter; receives the serialized engine. The caller owns
//                it and must call destroy() on it.
//
// The builder, its config and the engine are all released before returning.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down; the config was previously leaked.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference.
//
// context   : execution context of an engine with exactly two bindings
//             (INPUT_BLOB_NAME and OUTPUT_BLOB_NAME)
// input     : host buffer with batchSize * 3 * INPUT_H * INPUT_W floats
// output    : host buffer receiving batchSize * OUTPUT_SIZE floats
// batchSize : number of images in `input`
//
// Device buffers and the CUDA stream are created and released on every call.
// All CUDA calls go through CUDA_CHECK; a failed enqueue is reported on stderr.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Compute the byte sizes once, in size_t, so the product cannot overflow int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed; output is not valid" << std::endl;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Wait for the copies and the inference to finish before touching `output`.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//
//   ./yolov3 -s            build the engine and write it to "yolov3.engine"
//   ./yolov3 -d <img_dir>  load "yolov3.engine", run detection on every image
//                          in <img_dir> and write each result as "_<name>"
//
// Returns 0 on success and -1 on bad arguments or when a required file cannot
// be opened.
int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("yolov3.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 3 && std::string(argv[1]) == "-d") {
        std::ifstream file("yolov3.engine", std::ios::binary);
        if (!file.good()) {
            // Without this check a missing engine file ends in deserializing a null buffer.
            std::cerr << "could not read yolov3.engine; run './yolov3 -s' first to build it" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov3 -s  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov3 -d ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    std::vector<std::string> file_names;
    if (read_files_in_dir(argv[2], file_names) < 0) {
        std::cout << "read_files_in_dir failed." << std::endl;
        delete[] trtModelStream;
        return -1;
    }

    // prepare input data ---------------------------
    // Static so the large buffers do not live on the stack.
    static float data[3 * INPUT_H * INPUT_W];
    static float prob[OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    int fcount = 0;
    for (const auto& f : file_names) {
        fcount++;
        std::cout << fcount << "  " << f << std::endl;
        cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f);
        if (img.empty()) continue;
        cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H);
        // Interleaved 3-channel bytes -> planar floats in [0, 1], with the
        // channel order reversed (channel 2 first).
        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
            data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
            data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
            data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
        }

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        std::vector<Yolo::Detection> res;
        nms(res, prob);
        // Draw every kept detection with its class id.
        for (size_t j = 0; j < res.size(); j++) {
            cv::Rect r = get_rect(img, res[j].bbox);
            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
        cv::imwrite("_" + f, img);
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}


================================================
FILE: yolov3/yolov3_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory to walk
    return:
        a list of lists of paths; every batch holds batch_size paths except
        possibly the last one, and no empty trailing batch is added
    """
    batches = []
    current = []
    all_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )
    for path in all_paths:
        # Close the running batch once it is full, before adding the next path.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches

def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag
                 above its top-left corner, onto img in place.
    param:
        x:      box as [x1, y1, x2, y2]
        img:    an OpenCV image; only img.shape[0] and img.shape[1] are read
        color:  rectangle color such as (0, 255, 0); a random color is
                picked when omitted
        label:  str, text to draw; nothing is written when empty or None
        line_thickness: int; derived from the image size when omitted
    return:
        no return
    """
    # Line/font thickness: explicit value, else scaled with the image size.
    thickness = (
        line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    )
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    text_size = cv2.getTextSize(label, 0, fontScale=thickness / 3, thickness=font_thickness)[0]
    # Filled background sized to the text, sitting on top of the box corner.
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        thickness / 3,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov3TRT(object):
    """
    description: A YOLOv3 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the TensorRT
                     engine and allocate a host/device buffer pair for every binding.
        param:
            engine_file_path: str, path of the serialized engine file
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        trt_logger = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(trt_logger)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as engine_file:
            engine = runtime.deserialize_cuda_engine(engine_file.read())
        context = engine.create_execution_context()

        host_inputs, cuda_inputs = [], []
        host_outputs, cuda_outputs = [], []
        bindings = []

        for binding in engine:
            shape = engine.get_binding_shape(binding)
            print('bingding:', binding, shape)
            # One buffer holds the binding volume for every image of the largest batch.
            element_count = trt.volume(shape) * engine.max_batch_size
            np_dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(element_count, np_dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dimensions of the input binding are taken as height and width.
                self.input_w = shape[-1]
                self.input_h = shape[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of raw BGR images, run the engine on it,
                     postprocess the detections and draw them on the raw images.
        param:
            raw_image_generator: iterable yielding at most self.batch_size BGR images
        return:
            batch_image_raw: list of the input images with boxes and labels drawn in place
            use_time: seconds spent on the host->device copy, the inference
                      and the device->host copy
        """
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess.
            # Zero-initialised so the unused slots of a partially filled batch hold defined values.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Always remove the context from the top of the stack, even when preprocessing
            # or inference raised, so the context stack stays balanced.
            self.ctx.pop()

        output = host_outputs[0]
        # The output buffer was allocated as (binding volume * batch size), so each image
        # owns an equal slice of it (7001 floats with the default plugin settings).
        output_len = output.size // self.batch_size

        # Do postprocess, only for the images that were actually supplied: the last batch
        # of a directory may be smaller than the engine batch size.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * output_len: (i + 1) * output_len], batch_origin_h[i], batch_origin_w[i]
            )

            # Draw rectangles and labels on the original image.
            # `categories` is the module-level label list built in the __main__ section.
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        
    def get_raw_image(self, image_path_batch):
        """
        description: Lazily read the images of one batch from disk
        param:
            image_path_batch: iterable of image file paths
        return:
            a generator yielding the result of cv2.imread for each path, in order
        """
        for path in image_path_batch:
            image = cv2.imread(path)
            yield image
        
    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: a BGR image array of shape (h, w, c), e.g. as returned by cv2.imread
        return:
            image:  the processed image, float32 with shape (1, 3, input_h, input_w)
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: fill it completely and pad top/bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: fill it completely and pad left/right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128).
        # The colour must be passed as `value`: the 7th positional argument of
        # cv2.copyMakeBorder is `dst`, so a positional tuple there is not used as the colour.
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction of one image
        param:
            output:     A flat numpy array: the first element is the number of boxes,
                        followed by 7 values per box. Columns 0-3 are the box
                        [cx, cy, w, h], column 4 is a confidence, column 5 the class id
                        and column 6 a factor that is multiplied into column 4
                        (presumably the class confidence - confirm against the plugin).
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        # Get the num of boxes detected
        box_count = int(output[0])
        np.set_printoptions(suppress=True)
        # Reshape to a two dimensional ndarray and keep only the valid rows
        detections = np.reshape(output[1:], (-1, 7))[:box_count, :]
        if detections.shape[0] > 0:
            # Fold column 6 into the score (in place), then drop that column.
            detections[:, 4] *= detections[:, 6]
            detections = detections[:, :-1]

        # Do nms
        kept = self.non_max_suppression(
            detections, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD
        )
        if len(kept):
            return kept[:, :4], kept[:, 4], kept[:, 5]
        return np.array([]), np.array([]), np.array([])

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two sets of bounding boxes; widths and heights
                     are measured inclusively (x2 - x1 + 1)
        param:
            box1: A 2-D box array, rows are (x1, y1, x2, y2) or (x, y, w, h)
            box2: A 2-D box array, rows are (x1, y1, x2, y2) or (x, y, w, h)
            x1y1x2y2: True when the boxes are corner coordinates,
                      False when they are center/width/height
        return:
            iou: computed iou, one value per (broadcast) row
        """
        if x1y1x2y2:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
        else:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2

        # Extent of the intersection rectangle, clamped at zero for disjoint boxes
        overlap_w = np.clip(np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1) + 1, 0, None)
        overlap_h = np.clip(np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1) + 1, 0, None)
        inter_area = overlap_w * overlap_h

        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon guards against a zero denominator.
        return inter_area / (b1_area + b2_area - inter_area + 1e-16)

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w -1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w -1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h -1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h -1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and
                 writes the annotated images into the 'output' directory.
    """

    def __init__(self, yolov3_wrapper, image_path_batch):
        super(inferThread, self).__init__()
        self.yolov3_wrapper = yolov3_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov3_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image(self.image_path_batch))
        for index, img_path in enumerate(self.image_path_batch):
            # Keep only the file name; every result lands directly in output/
            save_name = os.path.join('output', os.path.basename(img_path))
            # Save image
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on all-zero images to warm up the engine.
    """

    def __init__(self, yolov3_wrapper):
        super(warmUpThread, self).__init__()
        self.yolov3_wrapper = yolov3_wrapper

    def run(self):
        wrapper = self.yolov3_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image_zeros())
        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))


if __name__ == "__main__":
    # Defaults for the custom plugin library and the serialized engine;
    # argv[1] overrides the engine path, argv[2] the plugin library path.
    PLUGIN_LIBRARY = "build/libyololayer.so"
    engine_file_path = "build/yolov3.engine"

    cli_args = sys.argv[1:]
    if len(cli_args) >= 1:
        engine_file_path = cli_args[0]
    if len(cli_args) >= 2:
        PLUGIN_LIBRARY = cli_args[1]

    # Loading the shared library makes the plugin available before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels (read as a module-level name by YoLov3TRT.infer)

    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
            "hair drier", "toothbrush"]

    # Start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov3TRT instance
    yolov3_wrapper = YoLov3TRT(engine_file_path)
    try:
        print('batch size is', yolov3_wrapper.batch_size)

        image_dir = "samples/"
        image_path_batches = get_img_path_batches(yolov3_wrapper.batch_size, image_dir)

        for _ in range(10):
            # create a new thread to do warm_up and wait for it
            warmup_thread = warmUpThread(yolov3_wrapper)
            warmup_thread.start()
            warmup_thread.join()
        for batch in image_path_batches:
            # create a new thread to do inference and wait for it
            infer_thread = inferThread(yolov3_wrapper, batch)
            infer_thread.start()
            infer_thread.join()
    finally:
        # destroy the instance
        yolov3_wrapper.destroy()


================================================
FILE: yolov3-spp/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.6)

project(yolov3-spp)

add_definitions(-std=c++11)

# option() takes <variable> "<help text>" [value]; without the help text the word OFF
# is parsed as the description instead of the default value.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Shared library holding the custom YOLO layer plugin
cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
target_link_libraries(yololayer nvinfer cudart)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Command line tool that serializes the engine and runs inference
add_executable(yolov3-spp ${PROJECT_SOURCE_DIR}/yolov3-spp.cpp)
target_link_libraries(yolov3-spp nvinfer)
target_link_libraries(yolov3-spp cudart)
target_link_libraries(yolov3-spp yololayer)
target_link_libraries(yolov3-spp ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: yolov3-spp/README.md
================================================
# yolov3-spp

This implementation currently supports dynamic input shapes. If you want the non-dynamic (fixed-shape) version, please check out commit [659fd2b](https://github.com/wang-xinyu/tensorrtx/commit/659fd2b23482197b19dccf746a5a3dbff1611381).

The PyTorch implementation is the [ultralytics/yolov3 archive branch](https://github.com/ultralytics/yolov3/tree/archive). It provides two sets of trained weights for yolov3-spp: `yolov3-spp.pt` and `yolov3-spp-ultralytics.pt` (originally named `ultralytics68.pt`).

## Config

- Number of classes defined in yololayer.h
- FP16/FP32 can be selected by the macro in yolov3-spp.cpp
- GPU id can be selected by the macro in yolov3-spp.cpp
- NMS thresh in yolov3-spp.cpp
- BBox confidence thresh in yolov3-spp.cpp
- MIN and MAX input size defined in yolov3-spp.cpp
- Optimization width and height for IOptimizationProfile defined in yolov3-spp.cpp

## How to Run

1. Generate `yolov3-spp_ultralytics68.wts` from the PyTorch implementation using `yolov3-spp.cfg` and `yolov3-spp-ultralytics.pt`, or download the `.wts` file from the model zoo.

```
git clone https://github.com/wang-xinyu/tensorrtx.git
git clone -b archive https://github.com/ultralytics/yolov3.git
// download its weights 'yolov3-spp-ultralytics.pt'
// copy gen_wts.py from tensorrtx/yolov3-spp/ to ultralytics/yolov3/
// go to ultralytics/yolov3/
python gen_wts.py yolov3-spp-ultralytics.pt
// a file 'yolov3-spp_ultralytics68.wts' will be generated.
// the master branch of yolov3 should work, if not, you can checkout 4ac60018f6e6c1e24b496485f126a660d9c793d8
```

2. build tensorrtx/yolov3-spp and run

```
// put yolov3-spp_ultralytics68.wts into tensorrtx/yolov3-spp/
// go to tensorrtx/yolov3-spp/
mkdir build
cd build
cmake ..
make
sudo ./yolov3-spp -s             // serialize model to plan file i.e. 'yolov3-spp.engine'
sudo ./yolov3-spp -d  ../samples // deserialize plan file and run inference, the images in samples will be processed.
```

3. Check the generated images, `_zidane.jpg` and `_bus.jpg`; they should look as follows.

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg">
</p>

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247970-60b27c00-751e-11ea-88df-41473fed4823.jpg">
</p>

## More Information

See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: yolov3-spp/Utils.h
================================================
#ifndef __TRT_UTILS_H_
#define __TRT_UTILS_H_

#include <algorithm>
#include <cstring>
#include <iostream>
#include <vector>
#include <cudnn.h>

#ifndef CUDA_CHECK

// CUDA_CHECK(callstr): evaluates `callstr` once, stores the result as a cudaError_t and,
// if it is not cudaSuccess, prints the numeric error code together with the current file
// and line to std::cerr and then hits assert(0).
// The macro is only defined when no CUDA_CHECK exists yet, so an earlier definition wins.
// NOTE: assert(0) expands to nothing when NDEBUG is defined, so in such builds a failing
// call is only reported, not fatal.
// NOTE(review): this header includes neither <cassert> nor a CUDA runtime header directly;
// `assert` and `cudaError_t` presumably come from the including file or transitively
// through <cudnn.h> - confirm.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

#endif

namespace Tn
{
    class Profiler : public nvinfer1::IProfiler
    {
    public:
        void printLayerTimes(int itrationsTimes)
        {
            float totalTime = 0;
            for (size_t i = 0; i < mProfile.size(); i++)
            {
                printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes);
                totalTime += mProfile[i].second;
            }
            printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes);
        }
    private:
        typedef std::pair<std::string, float> Record;
        std::vector<Record> mProfile;

        virtual void reportLayerTime(const char* layerName, float ms)
        {
            auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
            if (record == mProfile.end())
                mProfile.push_back(std::make_pair(layerName, ms));
            else
                record->second += ms;
        }
    };

    // Logger for TensorRT info/warning/errors.
    // Writes every message whose severity passes the reportable threshold to std::cerr,
    // prefixed with a textual severity tag.
    class Logger : public nvinfer1::ILogger
    {
    public:

        // Defaults to reporting warnings and anything more severe.
        Logger(): Logger(Severity::kWARNING) {}

        Logger(Severity severity): reportableSeverity(severity) {}

        void log(Severity severity, const char* msg) override
        {
            // suppress messages with severity enum value greater than the reportable
            if (severity > reportableSeverity) return;

            const char* tag = "UNKNOWN: ";
            switch (severity)
            {
                case Severity::kINTERNAL_ERROR: tag = "INTERNAL_ERROR: "; break;
                case Severity::kERROR: tag = "ERROR: "; break;
                case Severity::kWARNING: tag = "WARNING: "; break;
                case Severity::kINFO: tag = "INFO: "; break;
                default: break;
            }
            std::cerr << tag << msg << std::endl;
        }

        // Least severe level that is still printed; may be changed by the owner at any time.
        Severity reportableSeverity{Severity::kWARNING};
    };

    // Serializes val into the byte buffer at `buffer` and advances `buffer` past it.
    // The bytes are copied with std::memcpy rather than stored through a reinterpret_cast
    // pointer, because `buffer` is not guaranteed to be suitably aligned for T.
    // T must be trivially copyable; the caller guarantees sizeof(T) writable bytes.
    template<typename T>
    void write(char*& buffer, const T& val)
    {
        std::memcpy(buffer, &val, sizeof(T));
        buffer += sizeof(T);
    }

    // Deserializes a T from the byte buffer at `buffer` into val and advances `buffer`
    // past it. The bytes are copied with std::memcpy rather than loaded through a
    // reinterpret_cast pointer, because `buffer` is not guaranteed to be suitably aligned
    // for T. T must be trivially copyable; the caller guarantees sizeof(T) readable bytes.
    template<typename T>
    void read(const char*& buffer, T& val)
    {
        std::memcpy(&val, buffer, sizeof(T));
        buffer += sizeof(T);
    }
}

#endif

================================================
FILE: yolov3-spp/gen_wts.py
================================================
import struct
import sys
import torch
from models import *  # noqa: F403
from utils.utils import *  # noqa: F403

# Build the yolov3-spp network and load the checkpoint given as first command line argument.
model = Darknet('cfg/yolov3-spp.cfg', (416, 416))  # noqa: F405
weights = sys.argv[1]
dev = '0'
device = torch_utils.select_device(dev)  # noqa: F405
# weights_only=False makes torch.load unpickle arbitrary objects: only use trusted checkpoints.
checkpoint = torch.load(weights, map_location=device, weights_only=False)
model.load_state_dict(checkpoint['model'])


# Text format: first line is the number of tensors, then one line per tensor holding
# its name, its element count and every element as the hex of a big-endian float32.
state = model.state_dict()
with open('yolov3-spp_ultralytics68.wts', 'w') as f:
    f.write('{}\n'.format(len(state)))
    for name, tensor in state.items():
        flat = tensor.reshape(-1).cpu().numpy()
        hex_words = ''.join(' ' + struct.pack('>f', float(value)).hex() for value in flat)
        f.write('{} {} '.format(name, len(flat)) + hex_words + '\n')


================================================
FILE: yolov3-spp/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, forwards its contents to an output stream,
//!  preceded by a timestamp and a fixed prefix. Nothing is forwarded while logging is disabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the formatted messages (must outlive this object)
    //! \param prefix    text inserted between the timestamp and every message
    //! \param shouldLog whether messages are forwarded at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, the prefix and the logging flag of \p other.
    //! Buffered characters of \p other are not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer contents>" to the target stream,
    //! empties the buffer and flushes the stream. Does nothing while logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it is formatted into a local array and written to the same
            // stream as the message, so it neither lands on a different stream nor leaves
            // width/fill manipulators behind on any stream
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                char stamp[32];
                if (std::strftime(stamp, sizeof(stamp), "%m/%d/%Y-%H:%M:%S", tm_local) != 0)
                {
                    mOutput << "[" << stamp << "] ";
                }
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // destination of the formatted messages
    std::string mPrefix;   // inserted after the timestamp of every message
    bool mShouldLog;       // messages are only forwarded while this is true
};

//!
//! \class LogStreamConsumerBase
//! \brief Helper base that owns the LogStreamConsumerBuffer, so that the buffer is constructed
//!  before the std::ostream base of LogStreamConsumer is given its address.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& output, const std::string& linePrefix, bool enabled)
        : mBuffer(output, linePrefix, enabled)
    {
    }

protected:
    LogStreamConsumerBuffer mBuffer; //!< buffer handed to std::ostream by the derived class
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for log messages of one severity, usable with the C++ stream syntax.
//!  The base class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  LogStreamConsumerBase constructs the LogStreamConsumerBuffer member, and only afterwards
//!  is the address of that buffer passed to std::ostream. With the opposite order std::ostream
//!  would receive the address of a buffer that has not been constructed yet.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Messages are emitted only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a fresh consumer with the severity and logging flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! Severities from kINFO upwards (in enum value) go to std::cout, the others to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to the short tag that precedes every message.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;    //!< whether messages of mSeverity are currently emitted
    Severity mSeverity; //!< severity of every message written through this consumer
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Construct a logger that reports messages of the given severity or more severe
    //!
    //! \param severity Initial reportable severity threshold (defaults to kWARNING)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! The message is prefixed with "[TRT] " and routed through a LogStreamConsumer built from the current
    //! reportable severity and the message severity.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set once reportTestStart() has been called for this test
        std::string mName;    //!< Test name printed in result lines
        std::string mCmdline; //!< Command line printed after the name in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Check the precondition before producing any output, so that a violated
        // contract does not first emit a misleading second "RUNNING" line.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS, so the call can be used directly as a process exit code
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE, so the call can be used directly as a process exit code
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS (a waived test is not treated as a failure)
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the current reportable severity threshold
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; //!< Threshold handed to every LogStreamConsumer created for this logger
};

namespace
{

//!
//! \brief Builds a LogStreamConsumer that logs at severity kVERBOSE, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Builds a LogStreamConsumer that logs at severity kINFO, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Builds a LogStreamConsumer that logs at severity kWARNING, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Builds a LogStreamConsumer that logs at severity kERROR, filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Builds a LogStreamConsumer that logs at severity kINTERNAL_ERROR ("fatal" severity),
//!        filtered by the logger's threshold
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: yolov3-spp/yololayer.cu
================================================
#include "yololayer.h"

using namespace Yolo;

namespace nvinfer1
{
    // Build a plugin from the compile-time configuration in namespace Yolo:
    // CLASS_NUM classes and the three kernels yolo1..yolo3. Each kernel's
    // anchors are uploaded to their own device buffer; the array holding the
    // device pointers lives in pinned host memory (mAnchor).
    YoloLayerPlugin::YoloLayerPlugin()
    {
        mClassCount = CLASS_NUM;
        mYoloKernel = {yolo1, yolo2, yolo3};
        mKernelCount = mYoloKernel.size();

        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        const size_t anchorBytes = sizeof(float) * CHECK_COUNT * 2;
        for (int k = 0; k < mKernelCount; ++k)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[k], anchorBytes));
            CUDA_CHECK(cudaMemcpy(mAnchor[k], mYoloKernel[k].anchors, anchorBytes, cudaMemcpyHostToDevice));
        }
    }

    // Release the per-kernel device anchor buffers, then the pinned host array
    // that stored their pointers.
    YoloLayerPlugin::~YoloLayerPlugin()
    {
        int k = 0;
        while (k < mKernelCount)
        {
            CUDA_CHECK(cudaFree(mAnchor[k]));
            ++k;
        }
        CUDA_CHECK(cudaFreeHost(mAnchor));
    }

    // Rebuild a plugin at runtime from the byte stream produced by serialize().
    // Layout: mClassCount, mThreadCount, mKernelCount, then mKernelCount raw
    // YoloKernel structs. The total consumed size is only verified by an assert.
    YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
    {
        using namespace Tn;
        const char* const begin = reinterpret_cast<const char*>(data);
        const char* cursor = begin;
        read(cursor, mClassCount);
        read(cursor, mThreadCount);
        read(cursor, mKernelCount);

        mYoloKernel.resize(mKernelCount);
        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(mYoloKernel.data(), cursor, kernelBytes);
        cursor += kernelBytes;
        assert(cursor == begin + length);

        // Same anchor upload as the default constructor, driven by the restored kernels.
        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        const size_t anchorBytes = sizeof(float) * CHECK_COUNT * 2;
        for (int k = 0; k < mKernelCount; ++k)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[k], anchorBytes));
            CUDA_CHECK(cudaMemcpy(mAnchor[k], mYoloKernel[k].anchors, anchorBytes, cudaMemcpyHostToDevice));
        }
    }

    // Write the plugin state into `buffer`: mClassCount, mThreadCount,
    // mKernelCount, followed by the raw YoloKernel array. The number of bytes
    // written is asserted to equal getSerializationSize().
    void YoloLayerPlugin::serialize(void* buffer) const
    {
        using namespace Tn;
        char* const begin = static_cast<char*>(buffer);
        char* cursor = begin;
        write(cursor, mClassCount);
        write(cursor, mThreadCount);
        write(cursor, mKernelCount);

        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(cursor, mYoloKernel.data(), kernelBytes);
        cursor += kernelBytes;

        assert(cursor == begin + getSerializationSize());
    }

    // Size in bytes of the blob written by serialize(): three scalar fields
    // plus one YoloKernel struct per entry in mYoloKernel.
    size_t YoloLayerPlugin::getSerializationSize() const
    {
        const size_t headerBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount);
        const size_t kernelBytes = sizeof(Yolo::YoloKernel) * mYoloKernel.size();
        return headerBytes + kernelBytes;
    }

    // Nothing to set up here (device buffers are created in the constructors);
    // always returns 0.
    int YoloLayerPlugin::initialize()
    {
        const int status = 0;
        return status;
    }

    // Describe the single output as a 2-D tensor [batch, 1 + MAX_OUTPUT_BBOX_COUNT * floats-per-Detection].
    // The leading "+1" slot holds the detection counter written by the kernel.
    //
    // The batch dimension is forwarded as the input's own dimension expression
    // rather than through getConstantValue(), so it stays correct when the
    // batch dimension is not a build-time constant.
    DimsExprs YoloLayerPlugin::getOutputDimensions(int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder)
    {
        //output the result to channel
        int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
        DimsExprs de;
        de.nbDims = 2;
        de.d[0] = inputs[0].d[0];  // batchsize (same expression as the first input's dim 0)
        de.d[1] = exprBuilder.constant(totalsize + 1);  // outputsize
        return de;
    }

    // Remember the namespace for this plugin. Only the pointer is stored, not a
    // copy of the string, so the caller's buffer must outlive the plugin.
    void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace)
    {
        this->mPluginNamespace = pluginNamespace;
    }

    // Return the pointer last stored by setPluginNamespace().
    const char* YoloLayerPlugin::getPluginNamespace() const
    {
        return this->mPluginNamespace;
    }

    // The output is always reported as kFLOAT, regardless of the requested
    // index or the input types.
    DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
    {
        const DataType outputType = DataType::kFLOAT;
        return outputType;
    }

    // Intentionally empty: the grid width/height are read from the tensor
    // descriptors in enqueue() instead of being cached here.
    void YoloLayerPlugin::configurePlugin(const DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out, int nbOutputs)
    {
        // no-op
    }

    // Called when the plugin is attached to an execution context. This plugin
    // uses none of the offered context resources, so the body is empty.
    void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
    {
        // no-op
    }

    // Counterpart of attachToContext(); nothing was acquired, so nothing is released.
    void YoloLayerPlugin::detachFromContext()
    {
        // no-op
    }

    // Plugin type string; identical to the name returned by YoloPluginCreator::getPluginName().
    const char* YoloLayerPlugin::getPluginType() const
    {
        const char* const kType = "YoloLayer_TRT";
        return kType;
    }

    // Plugin version string; identical to the one returned by YoloPluginCreator::getPluginVersion().
    const char* YoloLayerPlugin::getPluginVersion() const
    {
        const char* const kVersion = "1";
        return kVersion;
    }

    // Self-delete. The object must have been allocated with `new`, and must not
    // be touched after this call returns.
    void YoloLayerPlugin::destroy()
    {
        YoloLayerPlugin* const self = this;
        delete self;
    }

    // Clone the plugin.
    //
    // The copy is produced by round-tripping through serialize() and the
    // deserializing constructor. A default-constructed clone would silently
    // reset mClassCount / mThreadCount / mYoloKernel to the compile-time
    // defaults, dropping whatever state this instance was deserialized with.
    // The deserializing constructor also allocates and uploads a fresh set of
    // device anchor buffers, so the clone owns its own GPU memory.
    IPluginV2DynamicExt* YoloLayerPlugin::clone() const
    {
        std::vector<char> blob(getSerializationSize());
        serialize(blob.data());
        YoloLayerPlugin *p = new YoloLayerPlugin(blob.data(), blob.size());
        p->setPluginNamespace(mPluginNamespace);
        return p;
    }

    // Logistic (sigmoid) function: 1 / (1 + e^-x), evaluated in single precision.
    __device__ float Logist(float data)
    {
        const float decay = expf(-data);
        return 1.0f / (1.0f + decay);
    }

    // Decode one YOLO output scale into Detection records.
    //
    // One thread handles one (batch item, grid cell) pair; `noElements` is the
    // number of such pairs. As indexed below, the input for a batch item is laid
    // out as [anchor][attribute][cell] with attributes
    // 0..3 = box x/y/w/h, 4 = objectness, 5.. = per-class scores.
    //
    // Output layout per batch item (outputElem floats): output[0] is a running
    // detection counter kept as a float and bumped with atomicAdd, followed by
    // packed Detection structs. The counter keeps increasing even after
    // MAX_OUTPUT_BBOX_COUNT is reached, so consumers must clamp it.
    //
    // `anchors` holds CHECK_COUNT (w, h) pairs for this scale.
    __global__ void CalDetection(const float *input, float *output, int noElements,
            int yoloWidth, int yoloHeight, int yoloStride, const float anchors[CHECK_COUNT * 2], int classes, int outputElem) {

        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= noElements) return;

        int total_grid = yoloWidth * yoloHeight;
        int bnIdx = idx / total_grid;          // batch item handled by this thread
        idx = idx - total_grid * bnIdx;        // cell index inside that batch item
        int info_len_i = 5 + classes;
        const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);

        // Iterate over every anchor of this scale. The bound is CHECK_COUNT (not a
        // literal) so it always agrees with the per-batch stride computed above.
        for (int k = 0; k < CHECK_COUNT; ++k) {
            // First attribute of anchor k for this cell; attribute a is at cell[a * total_grid].
            const float* cell = curInput + idx + k * info_len_i * total_grid;

            int class_id = 0;
            float max_cls_prob = 0.0;
            for (int i = 5; i < info_len_i; ++i) {
                float p = Logist(cell[i * total_grid]);
                if (p > max_cls_prob) {
                    max_cls_prob = p;
                    class_id = i - 5;
                }
            }
            float box_prob = Logist(cell[4 * total_grid]);
            if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue;

            // Reserve a slot; threads past the capacity give up entirely.
            float *res_count = output + bnIdx * outputElem;
            int count = (int)atomicAdd(res_count, 1);
            if (count >= MAX_OUTPUT_BBOX_COUNT) return;
            char* data = (char*)res_count + sizeof(float) + count * sizeof(Detection);
            Detection* det = (Detection*)(data);

            int row = idx / yoloWidth;
            int col = idx % yoloWidth;

            //Location: centre offset within the cell scaled by the stride, size scaled by the anchor
            det->bbox[0] = (col + Logist(cell[0 * total_grid])) * yoloStride;
            det->bbox[1] = (row + Logist(cell[1 * total_grid])) * yoloStride;
            det->bbox[2] = expf(cell[2 * total_grid]) * anchors[2 * k];
            det->bbox[3] = expf(cell[3 * total_grid]) * anchors[2 * k + 1];
            det->det_confidence = box_prob;
            det->class_id = class_id;
            det->class_confidence = max_cls_prob;
        }
    }

    // Run detection decoding for every YOLO scale.
    //
    // inputs    : one device pointer per entry of mYoloKernel
    // output    : device buffer of batchSize * outputElem floats
    // stream    : CUDA stream all work is issued on
    // batchSize : number of batch items in the input/output buffers
    //
    // Both the counter reset and the kernel launches are issued on `stream`.
    // Previously they went to the default stream, ignoring the stream handed to
    // enqueue().
    void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
        int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);

        // Zero the detection counter (first float) of every batch item.
        for (int idx = 0; idx < batchSize; ++idx) {
            CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
        }

        for (size_t i = 0; i < mYoloKernel.size(); ++i) {
            const auto& yolo = mYoloKernel[i];
            const int numElem = yolo.width * yolo.height * batchSize;
            if (numElem <= 0) continue;  // nothing to decode; a zero-sized grid is an invalid launch
            const int numBlocks = (numElem + mThreadCount - 1) / mThreadCount;
            CalDetection<<<numBlocks, mThreadCount, 0, stream>>>
                (inputs[i], output, numElem, yolo.width, yolo.height, yolo.stride, (float*)mAnchor[i], mClassCount, outputElem);
        }
    }

    // Execute the layer. The batch size comes from dim 0 of the first input, and
    // each kernel's grid width/height from dims 3 and 2 of the matching input
    // descriptor. These values are stored in mYoloKernel before dispatching to
    // forwardGpu(). Always returns 0.
    int YoloLayerPlugin::enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream)
    {
        const int batchSize = inputDesc[0].dims.d[0];

        size_t k = 0;
        for (auto& kernel : mYoloKernel) {
            kernel.width = inputDesc[k].dims.d[3];
            kernel.height = inputDesc[k].dims.d[2];
            ++k;
        }

        const float* const* typedInputs = reinterpret_cast<const float* const*>(inputs);
        float* typedOutput = static_cast<float*>(outputs[0]);
        forwardGpu(typedInputs, typedOutput, stream, batchSize);
        return 0;
    }

    // Out-of-class definitions of the creator's static members. mFC is the
    // field collection handed out by getFieldNames(); mPluginAttributes is its
    // backing storage (left empty by the constructor).
    PluginFieldCollection YoloPluginCreator::mFC{};
    std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

    // The plugin takes no creation-time fields: empty the attribute list and
    // point the shared field collection at it.
    YoloPluginCreator::YoloPluginCreator()
    {
        mPluginAttributes.clear();

        mFC.fields = mPluginAttributes.data();
        mFC.nbFields = mPluginAttributes.size();
    }

    // Creator name; identical to the string returned by YoloLayerPlugin::getPluginType().
    const char* YoloPluginCreator::getPluginName() const
    {
        const char* const kName = "YoloLayer_TRT";
        return kName;
    }

    // Creator version; identical to the string returned by YoloLayerPlugin::getPluginVersion().
    const char* YoloPluginCreator::getPluginVersion() const
    {
        const char* const kVersion = "1";
        return kVersion;
    }

    // Expose the shared static field collection, which the constructor leaves empty.
    const PluginFieldCollection* YoloPluginCreator::getFieldNames()
    {
        const PluginFieldCollection* const fields = &mFC;
        return fields;
    }

    // Create a plugin with the compile-time default configuration and tag it
    // with this creator's namespace. `name` and `fc` are not used.
    IPluginV2DynamicExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
    {
        YoloLayerPlugin* plugin = new YoloLayerPlugin();
        const char* ns = mNamespace.c_str();
        plugin->setPluginNamespace(ns);
        return plugin;
    }

    // Recreate a plugin from serialized engine data and tag it with this
    // creator's namespace. `name` is not used.
    IPluginV2DynamicExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
    {
        // This object will be deleted when the network is destroyed, which will
        // call YoloLayerPlugin::destroy()
        YoloLayerPlugin* plugin = new YoloLayerPlugin(serialData, serialLength);
        const char* ns = mNamespace.c_str();
        plugin->setPluginNamespace(ns);
        return plugin;
    }

}


================================================
FILE: yolov3-spp/yololayer.h
================================================
#ifndef _YOLO_LAYER_H
#define _YOLO_LAYER_H

#include <assert.h>
#include <cmath>
#include <string.h>
#include <cublas_v2.h>
#include "NvInfer.h"
#include "Utils.h"
#include <iostream>

// Compile-time configuration and plain-data layouts shared by the YOLO plugin
// and the host-side post-processing.
namespace Yolo
{
    // Number of anchors per YOLO kernel (each anchor is a width/height pair).
    static constexpr int CHECK_COUNT = 3;
    // Candidates whose best class score or objectness falls below this are dropped on the GPU.
    static constexpr float IGNORE_THRESH = 0.1f;
    // Capacity of the plugin's output buffer, in detections per batch item.
    static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
    // Number of object classes the plugin is built for.
    static constexpr int CLASS_NUM = 80;

    // Description of one detection scale. The struct is copied byte-for-byte
    // into the serialized plugin blob, so its layout must not change.
    struct YoloKernel
    {
        int width;                       // grid width  (filled in at enqueue time)
        int height;                      // grid height (filled in at enqueue time)
        int stride;                      // multiplier applied to grid coordinates when decoding boxes
        float anchors[CHECK_COUNT*2];    // CHECK_COUNT (w, h) pairs
    };

    // Stride-32 scale.
    static constexpr YoloKernel yolo1 = {
        -1,  // dynamic width and height
        -1,
        32,
        {116,90,  156,198,  373,326}
    };
    // Stride-16 scale.
    static constexpr YoloKernel yolo2 = {
        -1,
        -1,
        16,
        {30,61,  62,45,  59,119}
    };
    // Stride-8 scale.
    static constexpr YoloKernel yolo3 = {
        -1,
        -1,
        8,
        {10,13,  16,30,  33,23}
    };

    // Number of floats describing a box location.
    static constexpr int LOCATIONS = 4;
    // One decoded detection as written by the GPU kernel. Every member is a
    // float (class_id included), so the struct can be addressed as a flat
    // float array.
    struct alignas(float) Detection{
        // box centre x, centre y, width, height
        float bbox[LOCATIONS];
        float det_confidence;    // objectness after the logistic function
        float class_id;          // index of the best-scoring class, stored as float
        float class_confidence;  // score of that class after the logistic function
    };
}

namespace nvinfer1
{
    // TensorRT dynamic-shape plugin that decodes raw YOLO head outputs into a
    // flat list of Yolo::Detection records (one output tensor, float only).
    //
    // All data members carry default member initializers so that no accessor
    // (notably getPluginNamespace()) can ever observe an indeterminate value,
    // regardless of which constructor ran or how far it got.
    class YoloLayerPlugin: public IPluginV2DynamicExt
    {
        public:
            // Build a plugin from the compile-time configuration in namespace Yolo.
            explicit YoloLayerPlugin();
            // Rebuild a plugin from a blob previously produced by serialize().
            YoloLayerPlugin(const void* data, size_t length);

            ~YoloLayerPlugin();

            // The plugin has exactly one output tensor.
            int getNbOutputs() const override
            {
                return 1;
            }

            //virtual Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) final;
            virtual DimsExprs getOutputDimensions(int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) override;

            int initialize() override;

            // Nothing to release here; device buffers are freed in the destructor.
            virtual void terminate() override {};

            //virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}
            // No scratch workspace is required.
            size_t getWorkspaceSize(const PluginTensorDesc* inputs, int nbInputs, const PluginTensorDesc* outputs, int nbOutputs) const override { return 0; }

            //virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;
            int enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override;

            virtual size_t getSerializationSize() const override;

            virtual void serialize(void* buffer) const override;

            // Every input and output must be linear-format FP32.
            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const override;

            const char* getPluginVersion() const override;

            void destroy() override;

            IPluginV2DynamicExt* clone() const override;

            void setPluginNamespace(const char* pluginNamespace) override;

            const char* getPluginNamespace() const override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;

            void configurePlugin(const DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out, int nbOutputs) override;

            void detachFromContext() override;

        private:
            // Launches the decoding kernel once per YOLO scale.
            void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1);
            int mClassCount = 0;                        // number of classes per anchor
            int mKernelCount = 0;                       // number of entries in mYoloKernel / mAnchor
            std::vector<Yolo::YoloKernel> mYoloKernel;  // one descriptor per detection scale
            int mThreadCount = 256;                     // CUDA threads per block used for the kernel launch
            void** mAnchor = nullptr;                   // pinned host array of per-scale device anchor buffers
            const char* mPluginNamespace = "";          // non-owning; empty string until setPluginNamespace()
    };

    // Factory registered with TensorRT for "YoloLayer_TRT" plugins. It builds
    // fresh plugins and rebuilds them from serialized engine data.
    class YoloPluginCreator : public IPluginCreator
    {
        public:
            YoloPluginCreator();

            ~YoloPluginCreator() override = default;

            // Identification reported to the plugin registry.
            const char* getPluginName() const override;
            const char* getPluginVersion() const override;

            // Field collection describing creation-time attributes.
            const PluginFieldCollection* getFieldNames() override;

            // Construction paths: from scratch, or from a serialized blob.
            IPluginV2DynamicExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;
            IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

            // The namespace is copied into an owned std::string.
            void setPluginNamespace(const char* libNamespace) override
            {
                mNamespace = libNamespace;
            }

            // Returns the owned copy of the namespace string.
            const char* getPluginNamespace() const override
            {
                return mNamespace.c_str();
            }

        private:
            std::string mNamespace;
            static PluginFieldCollection mFC;
            static std::vector<PluginField> mPluginAttributes;
    };
    REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};

#endif 


================================================
FILE: yolov3-spp/yolov3-spp.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn/dnn.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include "yololayer.h"

// Evaluate `status` exactly once; if the result is non-zero, print it to
// std::cerr as "Cuda failure: <value>" and abort the process. Wrapped in
// do { } while (0) so the macro behaves like a single statement.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// Build/runtime configuration for this sample.
#define USE_FP16  // comment this out to use FP32 instead
#define DEVICE 0  // GPU id
#define NMS_THRESH 0.4        // default IoU threshold used by nms()
#define BBOX_CONF_THRESH 0.5  // detections with det_confidence <= this are discarded in nms()

using namespace nvinfer1;

// stuff we know about the network and the input/output blobs
static const int MAX_INPUT_SIZE = 608;  // longest side the pre-processing letterboxes to
static const int MIN_INPUT_SIZE = 128;  // NOTE(review): presumably the lower bound of the dynamic-shape profile - confirm at use site
static const int OPT_INPUT_W = 608;     // NOTE(review): presumably the optimisation-profile width - confirm at use site
static const int OPT_INPUT_H = 608;     // NOTE(review): presumably the optimisation-profile height - confirm at use site
static const int DET_LEN = sizeof(Yolo::Detection) / sizeof(float);  // floats per detection record
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DET_LEN + 1;  // we limit the yololayer to output no more than MAX_OUTPUT_BBOX_COUNT bboxes; +1 for the leading count
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
static Logger gLogger;

// Resize `img` so that its longer side is at most MAX_INPUT_SIZE (never
// upscaling), then paste it centred onto a grey (128,128,128) canvas. The
// canvas adds (MAX_INPUT_SIZE - resized_side) % 32 pixels of padding per axis.
// Returns the padded image; `img` itself is not modified.
cv::Mat letterbox(cv::Mat& img) {
    const double scale_w = MAX_INPUT_SIZE / (img.cols * 1.0);
    const double scale_h = MAX_INPUT_SIZE / (img.rows * 1.0);
    float ratio = std::min(scale_w, scale_h);
    if (1.0f < ratio) {
        ratio = 1.0f;  // never enlarge the source image
    }

    const int resized_w = ratio * img.cols;
    const int resized_h = ratio * img.rows;
    const int pad_w = (MAX_INPUT_SIZE - resized_w) % 32;
    const int pad_h = (MAX_INPUT_SIZE - resized_h) % 32;

    cv::Mat resized(resized_h, resized_w, CV_8UC3);
    cv::resize(img, resized, resized.size());

    cv::Mat canvas(resized_h + pad_h, resized_w + pad_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(pad_w / 2, pad_h / 2, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Map a detection box (centre x, centre y, width, height in letterboxed-image
// pixels) back to a cv::Rect in source-image pixels. The scale and padding are
// recomputed from `src_shape` the same way letterbox() does; `pre_shape` is
// not used.
cv::Rect get_rect(cv::Size src_shape, cv::Size pre_shape, float bbox[4]) {
    const double scale_w = MAX_INPUT_SIZE / (src_shape.width * 1.0);
    const double scale_h = MAX_INPUT_SIZE / (src_shape.height * 1.0);
    float ratio = std::min(scale_w, scale_h);
    if (1.0f < ratio) {
        ratio = 1.0f;
    }

    const int resized_w = ratio * src_shape.width;
    const int resized_h = ratio * src_shape.height;
    const int pad_w = (MAX_INPUT_SIZE - resized_w) % 32;
    const int pad_h = (MAX_INPUT_SIZE - resized_h) % 32;

    // Box edges with the letterbox padding removed (truncated to int).
    int left = bbox[0] - bbox[2]/2.f - pad_w / 2;
    int right = bbox[0] + bbox[2]/2.f - pad_w / 2;
    int top = bbox[1] - bbox[3]/2.f - pad_h / 2;
    int bottom = bbox[1] + bbox[3]/2.f - pad_h / 2;

    // Undo the resize (float division, truncated back to int).
    left = static_cast<int>(left / ratio);
    right = static_cast<int>(right / ratio);
    top = static_cast<int>(top / ratio);
    bottom = static_cast<int>(bottom / ratio);

    return cv::Rect(left, top, right - left, bottom - top);
}

// Intersection-over-union of two boxes given as (centre x, centre y, width, height).
//
// Returns 0 when the boxes do not overlap. Also returns 0 when the union area
// is not positive (e.g. two coincident zero-area boxes), where the plain
// formula would evaluate 0/0 and yield NaN.
float iou(float lbox[4], float rbox[4]) {
    float interBox[] = {
        std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
        std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
        std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
        std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
    };

    if(interBox[2] > interBox[3] || interBox[0] > interBox[1])
        return 0.0f;

    float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]);
    float unionS = lbox[2]*lbox[3] + rbox[2]*rbox[3] - interBoxS;
    if (!(unionS > 0.0f))
        return 0.0f;  // degenerate boxes: avoid division by zero / NaN
    return interBoxS/unionS;
}

// Ordering predicate for std::sort: places detections with higher
// det_confidence first.
bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
    const bool a_ranks_first = b.det_confidence < a.det_confidence;
    return a_ranks_first;
}

// Per-class non-maximum suppression over the raw plugin output.
//
// output     : output[0] is the detection count (as float), followed by
//              DET_LEN floats per detection. At most
//              Yolo::MAX_OUTPUT_BBOX_COUNT records are read, whatever the count says.
// res        : surviving detections are appended here, grouped by class.
// nms_thresh : a detection is dropped when its IoU with an already kept
//              detection of the same class exceeds this value.
//
// Detections whose det_confidence (float index 4 of the record) is
// <= BBOX_CONF_THRESH are ignored up front.
void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
    // Bucket detections by class id; operator[] creates each bucket on first use.
    std::map<float, std::vector<Yolo::Detection>> detsByClass;
    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
        const float* record = &output[1 + DET_LEN * i];
        if (record[4] <= BBOX_CONF_THRESH) continue;
        Yolo::Detection det;
        memcpy(&det, record, DET_LEN * sizeof(float));
        detsByClass[det.class_id].push_back(det);
    }

    for (auto& entry : detsByClass) {
        auto& dets = entry.second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t keep = 0; keep < dets.size(); ++keep) {
            res.push_back(dets[keep]);
            // Remove every lower-ranked box overlapping the kept one too much.
            // The index only advances when nothing was erased, so no unsigned
            // decrement is needed.
            for (size_t other = keep + 1; other < dets.size(); ) {
                if (iou(dets[keep].bbox, dets[other].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + other);
                } else {
                    ++other;
                }
            }
        }
    }
}

// TensorRT weight files have a simple space delimited format:
// [count]
// [name] [size] <data x size in hex>
//
// Loads every blob of `file` into a name -> Weights map. All blobs are tagged
// DataType::kFLOAT and stored as raw 32-bit words in malloc'd buffers that are
// NOT freed here; the caller owns them.
//
// A missing file or non-positive count trips an assert (debug builds only).
// A truncated or malformed entry stops the load at that point.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count-- > 0)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        if (!input)
        {
            // Header could not be parsed (e.g. truncated file): stop instead of
            // inserting garbage entries for the remaining count.
            assert(false && "Truncated or malformed weight file.");
            break;
        }
        wt.type = DataType::kFLOAT;

        // Load blob. Size the buffer by the element type, not by the pointer
        // type: sizeof(val) would be the pointer size and over-allocate.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Fold a BatchNorm2d layer into a per-channel TensorRT scale layer.
//
// Reads "<lname>.weight", ".bias", ".running_mean" and ".running_var" from
// weightMap and computes, per channel:
//     scale = gamma / sqrt(var + eps)
//     shift = beta - mean * gamma / sqrt(var + eps)
//     power = 1
// The three malloc'd arrays are stored back into weightMap under
// "<lname>.scale", ".shift" and ".power". Returns the created IScaleLayer.
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float *gamma = (float*)weightMap[lname + ".weight"].values;
    float *beta = (float*)weightMap[lname + ".bias"].values;
    float *mean = (float*)weightMap[lname + ".running_mean"].values;
    float *var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    const size_t bytes = sizeof(float) * len;
    float *scval = reinterpret_cast<float*>(malloc(bytes));
    float *shval = reinterpret_cast<float*>(malloc(bytes));
    float *pval = reinterpret_cast<float*>(malloc(bytes));

    // The three arrays are independent, so fill them in a single pass.
    for (int c = 0; c < len; ++c) {
        const auto denom = sqrt(var[c] + eps);
        scval[c] = gamma[c] / denom;
        shval[c] = beta[c] - mean[c] * gamma[c] / denom;
        pval[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn);
    return bn;
}

// Add the block "bias-free convolution -> batch norm -> LeakyReLU(0.1)".
//
// outch : number of output channels
// ksize : square kernel size
// s, p  : stride and padding, applied to both spatial dimensions
// linx  : index used to look up weights as "module_list.<linx>.Conv2d.weight"
//         and "module_list.<linx>.BatchNorm2d.*"
// Returns the activation layer that ends the block.
ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, int ksize, int s, int p, int linx) {
    const std::string prefix = "module_list." + std::to_string(linx);

    Weights noBias{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[prefix + ".Conv2d.weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), prefix + ".BatchNorm2d", 1e-5);

    auto leaky = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky->setAlpha(0.1);

    return leaky;
}

// Create the engine using only the API and not any parser.
//
// Builds the whole network layer by layer, loads weights from
// "../yolov3-spp_ultralytics68.wts", attaches the "YoloLayer_TRT" plugin as the
// output, registers one optimization profile for the dynamic input size and
// builds the engine.
//
// Parameters:
//   maxBatchSize - not referenced anywhere in this function body.
//   builder      - used to create the network, the optimization profile and the engine.
//   config       - receives the optimization profile, workspace size and FP16 flag.
//   dt           - data type of the network input tensor.
//
// Returns the result of buildEngineWithConfig().
// NOTE(review): that result is not checked here and "Build engine
// successfully!" is printed unconditionally; the caller (APIToModel) asserts
// on the returned pointer.
//
// All host weight buffers held in weightMap are freed before returning.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = builder->createNetworkV2(explicitBatch);

    // Batch 1, 3 channels; height and width are dynamic (-1) and are bounded
    // by the optimization profile set up further below.
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{1, 3, -1, -1});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../yolov3-spp_ultralytics68.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // The darknet layer graph is written out explicitly, one call per module.
    // The last argument of convBnLeaky is the module_list index used to look
    // up that module's weights; each ewN sums a block output with an earlier
    // tensor (residual shortcut).
    auto lr0 = convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0);
    auto lr1 = convBnLeaky(network, weightMap, *lr0->getOutput(0), 64, 3, 2, 1, 1);
    auto lr2 = convBnLeaky(network, weightMap, *lr1->getOutput(0), 32, 1, 1, 0, 2);
    auto lr3 = convBnLeaky(network, weightMap, *lr2->getOutput(0), 64, 3, 1, 1, 3);
    auto ew4 = network->addElementWise(*lr3->getOutput(0), *lr1->getOutput(0), ElementWiseOperation::kSUM);
    auto lr5 = convBnLeaky(network, weightMap, *ew4->getOutput(0), 128, 3, 2, 1, 5);
    auto lr6 = convBnLeaky(network, weightMap, *lr5->getOutput(0), 64, 1, 1, 0, 6);
    auto lr7 = convBnLeaky(network, weightMap, *lr6->getOutput(0), 128, 3, 1, 1, 7);
    auto ew8 = network->addElementWise(*lr7->getOutput(0), *lr5->getOutput(0), ElementWiseOperation::kSUM);
    auto lr9 = convBnLeaky(network, weightMap, *ew8->getOutput(0), 64, 1, 1, 0, 9);
    auto lr10 = convBnLeaky(network, weightMap, *lr9->getOutput(0), 128, 3, 1, 1, 10);
    auto ew11 = network->addElementWise(*lr10->getOutput(0), *ew8->getOutput(0), ElementWiseOperation::kSUM);
    auto lr12 = convBnLeaky(network, weightMap, *ew11->getOutput(0), 256, 3, 2, 1, 12);
    auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 128, 1, 1, 0, 13);
    auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 256, 3, 1, 1, 14);
    auto ew15 = network->addElementWise(*lr14->getOutput(0), *lr12->getOutput(0), ElementWiseOperation::kSUM);
    auto lr16 = convBnLeaky(network, weightMap, *ew15->getOutput(0), 128, 1, 1, 0, 16);
    auto lr17 = convBnLeaky(network, weightMap, *lr16->getOutput(0), 256, 3, 1, 1, 17);
    auto ew18 = network->addElementWise(*lr17->getOutput(0), *ew15->getOutput(0), ElementWiseOperation::kSUM);
    auto lr19 = convBnLeaky(network, weightMap, *ew18->getOutput(0), 128, 1, 1, 0, 19);
    auto lr20 = convBnLeaky(network, weightMap, *lr19->getOutput(0), 256, 3, 1, 1, 20);
    auto ew21 = network->addElementWise(*lr20->getOutput(0), *ew18->getOutput(0), ElementWiseOperation::kSUM);
    auto lr22 = convBnLeaky(network, weightMap, *ew21->getOutput(0), 128, 1, 1, 0, 22);
    auto lr23 = convBnLeaky(network, weightMap, *lr22->getOutput(0), 256, 3, 1, 1, 23);
    auto ew24 = network->addElementWise(*lr23->getOutput(0), *ew21->getOutput(0), ElementWiseOperation::kSUM);
    auto lr25 = convBnLeaky(network, weightMap, *ew24->getOutput(0), 128, 1, 1, 0, 25);
    auto lr26 = convBnLeaky(network, weightMap, *lr25->getOutput(0), 256, 3, 1, 1, 26);
    auto ew27 = network->addElementWise(*lr26->getOutput(0), *ew24->getOutput(0), ElementWiseOperation::kSUM);
    auto lr28 = convBnLeaky(network, weightMap, *ew27->getOutput(0), 128, 1, 1, 0, 28);
    auto lr29 = convBnLeaky(network, weightMap, *lr28->getOutput(0), 256, 3, 1, 1, 29);
    auto ew30 = network->addElementWise(*lr29->getOutput(0), *ew27->getOutput(0), ElementWiseOperation::kSUM);
    auto lr31 = convBnLeaky(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31);
    auto lr32 = convBnLeaky(network, weightMap, *lr31->getOutput(0), 256, 3, 1, 1, 32);
    auto ew33 = network->addElementWise(*lr32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM);
    auto lr34 = convBnLeaky(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34);
    auto lr35 = convBnLeaky(network, weightMap, *lr34->getOutput(0), 256, 3, 1, 1, 35);
    // ew36 is reused later as the skip input of the third detection branch (cat105).
    auto ew36 = network->addElementWise(*lr35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM);
    auto lr37 = convBnLeaky(network, weightMap, *ew36->getOutput(0), 512, 3, 2, 1, 37);
    auto lr38 = convBnLeaky(network, weightMap, *lr37->getOutput(0), 256, 1, 1, 0, 38);
    auto lr39 = convBnLeaky(network, weightMap, *lr38->getOutput(0), 512, 3, 1, 1, 39);
    auto ew40 = network->addElementWise(*lr39->getOutput(0), *lr37->getOutput(0), ElementWiseOperation::kSUM);
    auto lr41 = convBnLeaky(network, weightMap, *ew40->getOutput(0), 256, 1, 1, 0, 41);
    auto lr42 = convBnLeaky(network, weightMap, *lr41->getOutput(0), 512, 3, 1, 1, 42);
    auto ew43 = network->addElementWise(*lr42->getOutput(0), *ew40->getOutput(0), ElementWiseOperation::kSUM);
    auto lr44 = convBnLeaky(network, weightMap, *ew43->getOutput(0), 256, 1, 1, 0, 44);
    auto lr45 = convBnLeaky(network, weightMap, *lr44->getOutput(0), 512, 3, 1, 1, 45);
    auto ew46 = network->addElementWise(*lr45->getOutput(0), *ew43->getOutput(0), ElementWiseOperation::kSUM);
    auto lr47 = convBnLeaky(network, weightMap, *ew46->getOutput(0), 256, 1, 1, 0, 47);
    auto lr48 = convBnLeaky(network, weightMap, *lr47->getOutput(0), 512, 3, 1, 1, 48);
    auto ew49 = network->addElementWise(*lr48->getOutput(0), *ew46->getOutput(0), ElementWiseOperation::kSUM);
    auto lr50 = convBnLeaky(network, weightMap, *ew49->getOutput(0), 256, 1, 1, 0, 50);
    auto lr51 = convBnLeaky(network, weightMap, *lr50->getOutput(0), 512, 3, 1, 1, 51);
    auto ew52 = network->addElementWise(*lr51->getOutput(0), *ew49->getOutput(0), ElementWiseOperation::kSUM);
    auto lr53 = convBnLeaky(network, weightMap, *ew52->getOutput(0), 256, 1, 1, 0, 53);
    auto lr54 = convBnLeaky(network, weightMap, *lr53->getOutput(0), 512, 3, 1, 1, 54);
    auto ew55 = network->addElementWise(*lr54->getOutput(0), *ew52->getOutput(0), ElementWiseOperation::kSUM);
    auto lr56 = convBnLeaky(network, weightMap, *ew55->getOutput(0), 256, 1, 1, 0, 56);
    auto lr57 = convBnLeaky(network, weightMap, *lr56->getOutput(0), 512, 3, 1, 1, 57);
    auto ew58 = network->addElementWise(*lr57->getOutput(0), *ew55->getOutput(0), ElementWiseOperation::kSUM);
    auto lr59 = convBnLeaky(network, weightMap, *ew58->getOutput(0), 256, 1, 1, 0, 59);
    auto lr60 = convBnLeaky(network, weightMap, *lr59->getOutput(0), 512, 3, 1, 1, 60);
    // ew61 is reused later as the skip input of the second detection branch (cat93).
    auto ew61 = network->addElementWise(*lr60->getOutput(0), *ew58->getOutput(0), ElementWiseOperation::kSUM);
    auto lr62 = convBnLeaky(network, weightMap, *ew61->getOutput(0), 1024, 3, 2, 1, 62);
    auto lr63 = convBnLeaky(network, weightMap, *lr62->getOutput(0), 512, 1, 1, 0, 63);
    auto lr64 = convBnLeaky(network, weightMap, *lr63->getOutput(0), 1024, 3, 1, 1, 64);
    auto ew65 = network->addElementWise(*lr64->getOutput(0), *lr62->getOutput(0), ElementWiseOperation::kSUM);
    auto lr66 = convBnLeaky(network, weightMap, *ew65->getOutput(0), 512, 1, 1, 0, 66);
    auto lr67 = convBnLeaky(network, weightMap, *lr66->getOutput(0), 1024, 3, 1, 1, 67);
    auto ew68 = network->addElementWise(*lr67->getOutput(0), *ew65->getOutput(0), ElementWiseOperation::kSUM);
    auto lr69 = convBnLeaky(network, weightMap, *ew68->getOutput(0), 512, 1, 1, 0, 69);
    auto lr70 = convBnLeaky(network, weightMap, *lr69->getOutput(0), 1024, 3, 1, 1, 70);
    auto ew71 = network->addElementWise(*lr70->getOutput(0), *ew68->getOutput(0), ElementWiseOperation::kSUM);
    auto lr72 = convBnLeaky(network, weightMap, *ew71->getOutput(0), 512, 1, 1, 0, 72);
    auto lr73 = convBnLeaky(network, weightMap, *lr72->getOutput(0), 1024, 3, 1, 1, 73);
    auto ew74 = network->addElementWise(*lr73->getOutput(0), *ew71->getOutput(0), ElementWiseOperation::kSUM);
    auto lr75 = convBnLeaky(network, weightMap, *ew74->getOutput(0), 512, 1, 1, 0, 75);
    auto lr76 = convBnLeaky(network, weightMap, *lr75->getOutput(0), 1024, 3, 1, 1, 76);
    auto lr77 = convBnLeaky(network, weightMap, *lr76->getOutput(0), 512, 1, 1, 0, 77);

    // Spatial pyramid pooling: three stride-1 max pools over lr77 with kernel
    // sizes 5, 9 and 13. Padding of 2, 4 and 6 (half the kernel, rounded down)
    // is paired with stride 1 so the spatial size is unchanged.
    auto pool78 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{5,5});
    pool78->setPaddingNd(DimsHW{2, 2});
    pool78->setStrideNd(DimsHW{1, 1});
    auto pool80 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{9,9});
    pool80->setPaddingNd(DimsHW{4, 4});
    pool80->setStrideNd(DimsHW{1, 1});
    auto pool82 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{13,13});
    pool82->setPaddingNd(DimsHW{6, 6});
    pool82->setStrideNd(DimsHW{1, 1});

    // Concatenate the pooled tensors (largest kernel first) with the un-pooled lr77.
    ITensor* inputTensors83[] = {pool82->getOutput(0), pool80->getOutput(0), pool78->getOutput(0), lr77->getOutput(0)};
    auto cat83 = network->addConcatenation(inputTensors83, 4);

    // First detection branch. conv88 is a 1x1 convolution with bias producing
    // 3 * (Yolo::CLASS_NUM + 5) channels; it is the first plugin input.
    auto lr84 = convBnLeaky(network, weightMap, *cat83->getOutput(0), 512, 1, 1, 0, 84);
    auto lr85 = convBnLeaky(network, weightMap, *lr84->getOutput(0), 1024, 3, 1, 1, 85);
    auto lr86 = convBnLeaky(network, weightMap, *lr85->getOutput(0), 512, 1, 1, 0, 86);
    auto lr87 = convBnLeaky(network, weightMap, *lr86->getOutput(0), 1024, 3, 1, 1, 87);
    IConvolutionLayer* conv88 = network->addConvolutionNd(*lr87->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.88.Conv2d.weight"], weightMap["module_list.88.Conv2d.bias"]);
    assert(conv88);
    auto lr91 = convBnLeaky(network, weightMap, *lr86->getOutput(0), 256, 1, 1, 0, 91);

    // 2x upsampling implemented as a deconvolution: 2x2 kernel, stride 2, one
    // group per channel and all-ones weights, so every input value is copied
    // into a 2x2 output patch of its own channel.
    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
    for (int i = 0; i < 256 * 2 * 2; i++) {
        deval[i] = 1.0;
    }
    Weights deconvwts92{DataType::kFLOAT, deval, 256 * 2 * 2};
    IDeconvolutionLayer* deconv92 = network->addDeconvolutionNd(*lr91->getOutput(0), 256, DimsHW{2, 2}, deconvwts92, emptywts);
    assert(deconv92);
    deconv92->setStrideNd(DimsHW{2, 2});
    deconv92->setNbGroups(256);
    // Registered in the map so the buffer is released by the free loop at the end.
    weightMap["deconv92"] = deconvwts92;

    // Second detection branch: upsampled features concatenated with ew61.
    ITensor* inputTensors[] = {deconv92->getOutput(0), ew61->getOutput(0)};
    auto cat93 = network->addConcatenation(inputTensors, 2);
    auto lr94 = convBnLeaky(network, weightMap, *cat93->getOutput(0), 256, 1, 1, 0, 94);
    auto lr95 = convBnLeaky(network, weightMap, *lr94->getOutput(0), 512, 3, 1, 1, 95);
    auto lr96 = convBnLeaky(network, weightMap, *lr95->getOutput(0), 256, 1, 1, 0, 96);
    auto lr97 = convBnLeaky(network, weightMap, *lr96->getOutput(0), 512, 3, 1, 1, 97);
    auto lr98 = convBnLeaky(network, weightMap, *lr97->getOutput(0), 256, 1, 1, 0, 98);
    auto lr99 = convBnLeaky(network, weightMap, *lr98->getOutput(0), 512, 3, 1, 1, 99);
    IConvolutionLayer* conv100 = network->addConvolutionNd(*lr99->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.100.Conv2d.weight"], weightMap["module_list.100.Conv2d.bias"]);
    assert(conv100);
    auto lr103 = convBnLeaky(network, weightMap, *lr98->getOutput(0), 128, 1, 1, 0, 103);
    // Reuses the first 128 * 2 * 2 entries of the all-ones `deval` buffer. It is
    // deliberately NOT added to weightMap a second time: the buffer is already
    // owned by the "deconv92" entry and must be freed only once.
    Weights deconvwts104{DataType::kFLOAT, deval, 128 * 2 * 2};
    IDeconvolutionLayer* deconv104 = network->addDeconvolutionNd(*lr103->getOutput(0), 128, DimsHW{2, 2}, deconvwts104, emptywts);
    assert(deconv104);
    deconv104->setStrideNd(DimsHW{2, 2});
    deconv104->setNbGroups(128);
    // Third detection branch: upsampled features concatenated with ew36.
    ITensor* inputTensors1[] = {deconv104->getOutput(0), ew36->getOutput(0)};
    auto cat105 = network->addConcatenation(inputTensors1, 2);
    auto lr106 = convBnLeaky(network, weightMap, *cat105->getOutput(0), 128, 1, 1, 0, 106);
    auto lr107 = convBnLeaky(network, weightMap, *lr106->getOutput(0), 256, 3, 1, 1, 107);
    auto lr108 = convBnLeaky(network, weightMap, *lr107->getOutput(0), 128, 1, 1, 0, 108);
    auto lr109 = convBnLeaky(network, weightMap, *lr108->getOutput(0), 256, 3, 1, 1, 109);
    auto lr110 = convBnLeaky(network, weightMap, *lr109->getOutput(0), 128, 1, 1, 0, 110);
    auto lr111 = convBnLeaky(network, weightMap, *lr110->getOutput(0), 256, 3, 1, 1, 111);
    IConvolutionLayer* conv112 = network->addConvolutionNd(*lr111->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.112.Conv2d.weight"], weightMap["module_list.112.Conv2d.bias"]);
    assert(conv112);

    // Look up the "YoloLayer_TRT" (version "1") plugin creator in the registry
    // and feed it the three detection convolutions.
    // NOTE(review): neither `creator` nor `pluginObj` is checked for nullptr,
    // and `pluginObj` is not released in this function.
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    const PluginFieldCollection* pluginData = creator->getFieldNames();
    IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData);
    ITensor* inputTensors_yolo[] = {conv88->getOutput(0), conv100->getOutput(0), conv112->getOutput(0)};
    auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj);

    // Print the plugin's output dimensions, then mark it as the network output.
    auto dim = yolo->getOutput(0)->getDimensions();
    std::cout << "yololayer output shape: ";
    for (int i = 0; i < dim.nbDims; i++) {
        std::cout << dim.d[i] << " ";
    }
    std::cout << std::endl;
    yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

    // One optimization profile giving the min / opt / max shapes allowed for
    // the dynamic input declared above.
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims4(1, 3, MIN_INPUT_SIZE, MIN_INPUT_SIZE));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims4(1, 3, OPT_INPUT_H, OPT_INPUT_W));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims4(1, 3, MAX_INPUT_SIZE, MAX_INPUT_SIZE));
    config->addOptimizationProfile(profile);

    // Build engine
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory: every values pointer stored in the map is freed,
    // i.e. the loaded weights, the scale/shift/power arrays registered by
    // addBatchNorm2d, and the "deconv92" buffer.
    // NOTE(review): assumes loadWeights allocates with malloc - confirm.
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the network with the TensorRT API and serializes the resulting engine.
//
// Parameters:
//   maxBatchSize - forwarded to createEngine().
//   modelStream  - out parameter; receives the serialized engine. Ownership
//                  passes to the caller, who must call destroy() on it.
//
// The builder, the builder config and the engine are all released before
// returning; only the serialized stream survives.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The config was previously never destroyed (leak);
    // it is released before the builder that created it.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference for a single image.
//
// Parameters:
//   context     - execution context; its engine must expose exactly two bindings
//                 (INPUT_BLOB_NAME and OUTPUT_BLOB_NAME).
//   input       - host buffer holding 3 * height * width floats.
//   output      - host buffer with room for OUTPUT_SIZE floats.
//   input_shape - spatial size of the input; used to set the dynamic binding
//                 dimensions to (1, 3, height, width).
//
// Device buffers and the CUDA stream are created and released on every call.
// Any failing CUDA call, including the stream synchronization where errors from
// the asynchronous copies surface, goes through CHECK.
void doInference(IExecutionContext& context, float* input, float* output, cv::Size input_shape) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    context.setBindingDimensions(inputIndex, Dims4(1, 3, input_shape.height, input_shape.width));

    // Buffer sizes in bytes, computed once in size_t so the product cannot
    // overflow int for large inputs.
    const size_t inputBytes = static_cast<size_t>(3) * input_shape.height * input_shape.width * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(OUTPUT_SIZE) * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    context.enqueueV2(buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work above are reported here, so the
    // result must not be ignored.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Appends the name of every entry in directory `p_dir_name` to `file_names`,
// skipping the "." and ".." entries. Only the bare entry name is stored, not
// the path including the directory.
//
// Returns 0 on success, or -1 if the directory cannot be opened (in which case
// `file_names` is left untouched).
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const std::string entry_name(entry->d_name);
        if (entry_name == "." || entry_name == "..") {
            continue;
        }
        file_names.push_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}

// Entry point.
//
//   ./yolov3-spp -s         builds the engine and writes it to "yolov3-spp.engine"
//   ./yolov3-spp -d <dir>   loads "yolov3-spp.engine" and runs detection on every
//                           file in <dir>, writing annotated copies as "_<name>"
//                           into the current directory
//
// Returns 0 on success and -1 on bad arguments or when a required file or
// directory cannot be opened.
int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("yolov3-spp.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            // Do not leak the serialized engine on the error path.
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 3 && std::string(argv[1]) == "-d") {
        std::ifstream file("yolov3-spp.engine", std::ios::binary);
        if (!file.good()) {
            // Previously a missing engine file fell through and handed a null
            // buffer of size 0 to deserializeCudaEngine.
            std::cerr << "could not open plan file yolov3-spp.engine, run with -s first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov3-spp -s  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov3-spp -d ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    std::vector<std::string> file_names;
    if (read_files_in_dir(argv[2], file_names) < 0) {
        std::cout << "read_files_in_dir failed." << std::endl;
        delete[] trtModelStream;
        return -1;
    }

    // static: the output buffer may be too large for the stack.
    static float prob[OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    // The serialized blob is no longer needed once the engine exists.
    delete[] trtModelStream;
    context->setOptimizationProfile(0);

    int fcount = 0;
    for (const auto& f : file_names) {
        fcount++;
        std::cout << fcount << "  " << f << std::endl;
        cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f);
        // Entries that are not readable images are skipped.
        if (img.empty()) continue;
        cv::Mat pr_img = letterbox(img);
        std::cout << "letterbox shape: " << pr_img.cols << ", " << pr_img.rows << std::endl;
        // Shapes below the optimization profile's minimum cannot be bound.
        if (pr_img.cols < MIN_INPUT_SIZE || pr_img.rows < MIN_INPUT_SIZE) continue;
        cv::Mat blob = cv::dnn::blobFromImage(pr_img, 1.0 / 255.0, pr_img.size(), cv::Scalar(0, 0, 0), true, false);

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, blob.ptr<float>(0), prob, pr_img.size());
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        std::vector<Yolo::Detection> res;
        nms(res, prob);
        std::cout << "num of bbox: " << res.size() << std::endl;
        // Draw each detection on the original image, labelled with its class id.
        for (size_t j = 0; j < res.size(); j++) {
            cv::Rect r = get_rect(img.size(), pr_img.size(), res[j].bbox);
            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
        cv::imwrite("_" + f, img);
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}


================================================
FILE: yolov3-tiny/CMakeLists.txt
================================================
# Build script for the yolov3-tiny TensorRT sample: a shared library holding the
# YOLO plugin layer (yololayer.cu) and the yolov3-tiny executable linking it.
cmake_minimum_required(VERSION 2.6)

project(yolov3-tiny)

add_definitions(-std=c++11)

# option() takes (<name> "<help text>" [default]); without a help string the
# literal "OFF" was being used as the description instead of the default value.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# CUDA headers and libraries live in a target-specific directory on aarch64.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message("embed_platform on")
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
    message("embed_platform off")
    include_directories(/usr/local/cuda/include)
    link_directories(/usr/local/cuda/lib64)
endif()


set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

#cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu)
cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
target_link_libraries(yololayer nvinfer cudart)

# The executable links against OpenCV, so fail at configure time with a clear
# message instead of at compile/link time when it is missing.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(yolov3-tiny ${PROJECT_SOURCE_DIR}/yolov3-tiny.cpp)
target_link_libraries(yolov3-tiny nvinfer)
target_link_libraries(yolov3-tiny cudart)
target_link_libraries(yolov3-tiny yololayer)
target_link_libraries(yolov3-tiny ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: yolov3-tiny/README.md
================================================
# yolov3-tiny

The PyTorch implementation is the [ultralytics/yolov3 archive branch](https://github.com/ultralytics/yolov3/tree/archive).

## Execute

```
1. generate yolov3-tiny.wts from pytorch implementation with yolov3-tiny.cfg and yolov3-tiny.weights, or download .wts from model zoo

git clone -b archive https://github.com/ultralytics/yolov3.git
// download its weights 'yolov3-tiny.pt' or 'yolov3-tiny.weights'
// put tensorrtx/yolov3-tiny/gen_wts.py into ultralytics/yolov3 and run
python gen_wts.py yolov3-tiny.weights
// a file 'yolov3-tiny.wts' will be generated.

2. put yolov3-tiny.wts into tensorrtx/yolov3-tiny, build and run

// go to tensorrtx/yolov3-tiny
mkdir build
cd build
cmake ..
make
sudo ./yolov3-tiny -s             // serialize model to plan file i.e. 'yolov3-tiny.engine'
sudo ./yolov3-tiny -d  ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed.

3. check the generated images (_zidane.jpg and _bus.jpg), shown below
```

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg">
</p>

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247970-60b27c00-751e-11ea-88df-41473fed4823.jpg">
</p>

## Config

- Input shape defined in yololayer.h
- Number of classes defined in yololayer.h
- FP16/FP32 can be selected by the macro in yolov3-tiny.cpp
- GPU id can be selected by the macro in yolov3-tiny.cpp
- NMS thresh in yolov3-tiny.cpp
- BBox confidence thresh in yolov3-tiny.cpp

## More Information

See the README on the [home page](https://github.com/wang-xinyu/tensorrtx).


================================================
FILE: yolov3-tiny/gen_wts.py
================================================
import struct
import sys
import torch
from models import *  # noqa: F403
from utils.utils import *  # noqa: F403

# Build the network described by cfg/yolov3-tiny.cfg and load the checkpoint
# named on the command line: a '.pt' file is read as a PyTorch checkpoint (its
# 'model' entry), anything else as darknet-format weights.
model = Darknet('cfg/yolov3-tiny.cfg', (608, 608))  # noqa: F405
weights = sys.argv[1]
device = torch_utils.select_device('0')  # noqa: F405
if not weights.endswith('.pt'):  # darknet format
    load_darknet_weights(model, weights)  # noqa: F405
else:  # pytorch format
    checkpoint = torch.load(weights, map_location=device, weights_only=False)
    model.load_state_dict(checkpoint['model'])
model = model.eval()

# Text export: the first line is the number of tensors; every following line is
# "<name> <element count> " followed by one " <hex>" token per value, where
# <hex> is the big-endian IEEE-754 float32 encoding of the value.
state = model.state_dict()
with open('yolov3-tiny.wts', 'w') as f:
    f.write('{}\n'.format(len(state)))
    for name, tensor in state.items():
        flat = tensor.reshape(-1).cpu().numpy()
        hex_tokens = ''.join(' ' + struct.pack('>f', float(value)).hex() for value in flat)
        f.write('{} {} '.format(name, len(flat)) + hex_tokens + '\n')


================================================
FILE: yolov3-tiny/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synced, writes its contents to an output stream
//!  preceded by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix.
//!  Nothing is written while logging is disabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    Destination stream; held by reference and must outlive this buffer.
    //! \param prefix    Text inserted between the timestamp and each message.
    //! \param shouldLog Whether messages are forwarded to the stream at all.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the destination, prefix and logging flag of \p other.
    //! Previously only the stream reference was copied, leaving mShouldLog
    //! uninitialized and mPrefix empty. Pending buffered text is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes "[timestamp] <prefix><buffer contents>" to the destination stream,
    //! then empties the buffer and flushes the stream. Does nothing when logging
    //! is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // Format the timestamp in a scratch stream: it must go to the same
            // destination as the message (it used to be written to std::cout
            // unconditionally) and the fill character of the destination stream
            // must not be changed as a side effect.
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            std::ostringstream stamp;
            if (tm_local != nullptr)
            {
                stamp << "[" << std::setfill('0');
                stamp << std::setw(2) << 1 + tm_local->tm_mon << "/";
                stamp << std::setw(2) << tm_local->tm_mday << "/";
                stamp << std::setw(4) << 1900 + tm_local->tm_year << "-";
                stamp << std::setw(2) << tm_local->tm_hour << ":";
                stamp << std::setw(2) << tm_local->tm_min << ":";
                stamp << std::setw(2) << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables forwarding of subsequent output.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; //!< Destination stream.
    std::string mPrefix;   //!< Text written before every message.
    bool mShouldLog;       //!< When false, putOutput() is a no-op.
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!  LogStreamConsumer lists this class before std::ostream in its base list, so the
//!  buffer member is fully constructed by the time its address is handed to std::ostream.
//!
class LogStreamConsumerBase
{
protected:
    //! Buffer that the derived stream writes into.
    LogStreamConsumerBuffer mBuffer;

public:
    //! Forwards all arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for log messages of one fixed severity, usable with the normal
//!  C++ stream insertion syntax.
//!  The base classes must stay in the order LogStreamConsumerBase, std::ostream:
//!  LogStreamConsumerBase constructs the LogStreamConsumerBuffer member first, and only
//!  then is the buffer's address passed to std::ostream. Swapping the order would hand
//!  std::ostream the address of a buffer that has not been constructed yet.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a stream that logs at \p severity.
    //!  Messages are emitted only if \p severity is at least as severe as
    //!  \p reportableSeverity (i.e. severity <= reportableSeverity).
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(
              severityOstream(severity), severityPrefix(severity), isReportable(severity, reportableSeverity))
        , std::ostream(&mBuffer) // attach the stream to the already-constructed buffer
        , mShouldLog(isReportable(severity, reportableSeverity))
        , mSeverity(severity)
    {
    }

    //! \brief Builds a fresh stream with the same severity and logging flag as \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the already-constructed buffer
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this stream logs, given a new reportable severity,
    //!  and pushes the result down to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = isReportable(mSeverity, reportableSeverity);
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! True when a message of \p severity passes the \p threshold.
    static bool isReportable(Severity severity, Severity threshold)
    {
        return severity <= threshold;
    }

    //! Severities from kINFO upwards (numerically) go to std::cout, the rest to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed one-letter tag; asserts on an unknown value.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: break;
        }
        assert(0);
        return "";
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger with the given reportable severity (kWARNING when omitted)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief The states in which a test can be reported
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible accessor for the nvinfer1::ILogger associated with this Logger
    //! \return This object, viewed through its nvinfer1::ILogger base
    //!
    //! TODO Once all samples register the logger with TensorRT through this method,
    //! the inheritance of Logger from ILogger can be removed
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! The message is prefixed with "[TRT] " and routed through a LogStreamConsumer built from the
    //! current reportable severity and the message severity.
    //!
    //! Note samples should not call this function directly; it is slated to go away together with the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        LogStreamConsumer(mReportableSeverity, severity)
            << "[TRT] "
            << std::string(msg)
            << std::endl;
    }

    //!
    //! \brief Sets the verbosity threshold of the logger
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle carrying the logging state of one test
    //!
    //! Instances are obtained from Logger::defineTest() and then passed to
    //! Logger::reportTest{Start,End}(). The constructor is private; only Logger can create one.
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set once reportTestStart() has run for this atom
        std::string mName;    //!< Test name printed in result lines
        std::string mCmdline; //!< Command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Convenience overload of defineTest() that builds the command line from (argc, argv)
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        const std::string joined = genCmdlineString(argc, argv);
        return defineTest(name, joined);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // The RUNNING line is emitted before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed and returns EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed and returns EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived and returns EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        if (pass)
        {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current reportable severity
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns the prefix string used for a log message of the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    //!
    //! \brief returns the word printed in a test result line for the given result
    //!
    static const char* testResultString(TestResult result)
    {
        const char* word = "";
        switch (result)
        {
        case TestResult::kRUNNING: word = "RUNNING"; break;
        case TestResult::kPASSED: word = "PASSED"; break;
        case TestResult::kFAILED: word = "FAILED"; break;
        case TestResult::kWAIVED: word = "WAIVED"; break;
        default: assert(0); break;
        }
        return word;
    }

    //!
    //! \brief returns the output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief prints one "&&&& <RESULT> <name> # <cmdline>" line on the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief joins the (argc, argv) values into one string, separated by single spaces
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream joined;
        for (int idx = 0; idx < argc; ++idx)
        {
            if (idx != 0)
            {
                joined << " ";
            }
            joined << argv[idx];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief creates a LogStreamConsumer for messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kVERBOSE);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINFO);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kWARNING);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kERROR);
}

//!
//! \brief creates a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity reportable = logger.getReportableSeverity();
    return LogStreamConsumer(reportable, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: yolov3-tiny/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

// Version-dependent qualifiers for the plugin/logger overrides in this project.
//   TRT_NOEXCEPT      - expands to `noexcept` when NV_TENSORRT_MAJOR >= 8, otherwise to nothing.
//   TRT_CONST_ENQUEUE - expands to `const` when NV_TENSORRT_MAJOR >= 8, otherwise to nothing.
// NOTE(review): an undefined NV_TENSORRT_MAJOR evaluates as 0 inside #if, so the TensorRT
// version header has to be included before the first inclusion of this file - confirm include order.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H

================================================
FILE: yolov3-tiny/utils.h
================================================
#ifndef __TRT_UTILS_H_
#define __TRT_UTILS_H_

#include <iostream>
#include <vector>
#include <algorithm>
#include <cudnn.h>
#include "macros.h"

#ifndef CUDA_CHECK

// Evaluates `callstr` once, and if the resulting cudaError_t is not cudaSuccess prints the
// numeric error code together with __FILE__:__LINE__ to std::cerr and then hits assert(0).
// The expansion is a plain brace block (not do/while), and the message has no trailing newline.
// NOTE(review): assert(0) compiles to nothing when NDEBUG is defined, so in such builds a failing
// call is only reported, not stopped.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

#endif

namespace Tn
{
    // Accumulates per-layer timings reported through nvinfer1::IProfiler and prints them on demand.
    class Profiler : public nvinfer1::IProfiler
    {
    public:
        // Prints every recorded layer with its accumulated time divided by itrationsTimes,
        // followed by the sum over all layers divided by the same value.
        void printLayerTimes(int itrationsTimes)
        {
            float totalTime = 0;
            for (const auto& entry : mProfile)
            {
                printf("%-40.40s %4.3fms\n", entry.first.c_str(), entry.second / itrationsTimes);
                totalTime += entry.second;
            }
            printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes);
        }
    private:
        typedef std::pair<std::string, float> Record;  // (layer name, accumulated milliseconds)
        std::vector<Record> mProfile;

        // Adds `ms` to the record whose name equals layerName, or appends a new record
        // when no such layer has been seen yet.
        virtual void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT
        {
            for (Record& entry : mProfile)
            {
                if (entry.first == layerName)
                {
                    entry.second += ms;
                    return;
                }
            }
            mProfile.push_back(std::make_pair(layerName, ms));
        }
    };

    // Logger for TensorRT info/warning/errors.
    // Messages whose severity enum value is greater than reportableSeverity are dropped;
    // everything else is written to std::cerr with a textual severity prefix.
    class Logger : public nvinfer1::ILogger
    {
    public:

        // Defaults the reportable severity to kWARNING.
        Logger(): Logger(Severity::kWARNING) {}

        // Uses `severity` as the reportable severity threshold.
        Logger(Severity severity): reportableSeverity(severity) {}

        // Writes "<PREFIX>: <msg>" to std::cerr unless the message is filtered out.
        void log(Severity severity, const char* msg) TRT_NOEXCEPT override
        {
            // suppress messages with severity enum value greater than the reportable
            if (severity > reportableSeverity) return;

            switch (severity)
            {
                case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
                case Severity::kERROR: std::cerr << "ERROR: "; break;
                case Severity::kWARNING: std::cerr << "WARNING: "; break;
                case Severity::kINFO: std::cerr << "INFO: "; break;
                // kVERBOSE is a known severity; label it instead of reporting it as unknown.
                case Severity::kVERBOSE: std::cerr << "VERBOSE: "; break;
                default: std::cerr << "UNKNOWN: "; break;
            }
            std::cerr << msg << std::endl;
        }

        Severity reportableSeverity{Severity::kWARNING};
    };

    // Copies the object representation of `val` to `buffer` and advances `buffer`
    // by sizeof(T). The copy is done bytewise, so `buffer` needs no particular
    // alignment (a direct store through a T* would require it).
    template<typename T>
    void write(char*& buffer, const T& val)
    {
        const char* src = reinterpret_cast<const char*>(&val);
        // copy_n returns the position one past the last byte written.
        buffer = std::copy_n(src, sizeof(T), buffer);
    }

    // Fills `val` from the next sizeof(T) bytes at `buffer` and advances `buffer`
    // by sizeof(T). The copy is done bytewise, so `buffer` needs no particular
    // alignment (a direct load through a const T* would require it).
    template<typename T>
    void read(const char*& buffer, T& val)
    {
        std::copy_n(buffer, sizeof(T), reinterpret_cast<char*>(&val));
        buffer += sizeof(T);
    }
}

#endif

================================================
FILE: yolov3-tiny/yololayer.cu
================================================
#include <assert.h>
#include "yololayer.h"
#include "utils.h"

using namespace Yolo;

namespace nvinfer1
{
    // Builds the plugin from the compile-time configuration: CLASS_NUM classes and
    // the two detection heads yolo1 and yolo2 (in that order).
    YoloLayerPlugin::YoloLayerPlugin()
    {
        mClassCount = CLASS_NUM;
        mYoloKernel = {yolo1, yolo2};
        mKernelCount = static_cast<int>(mYoloKernel.size());
    }
    
    // Nothing to release explicitly; members clean up after themselves.
    YoloLayerPlugin::~YoloLayerPlugin() = default;

    // Rebuilds the plugin at runtime from a byte stream laid out as
    // mClassCount, mThreadCount, mKernelCount, followed by mKernelCount raw YoloKernel structs.
    // Asserts that exactly `length` bytes were consumed.
    YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
    {
        const char* const begin = reinterpret_cast<const char*>(data);
        const char* cursor = begin;

        Tn::read(cursor, mClassCount);
        Tn::read(cursor, mThreadCount);
        Tn::read(cursor, mKernelCount);

        mYoloKernel.resize(mKernelCount);
        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(mYoloKernel.data(), cursor, kernelBytes);
        cursor += kernelBytes;

        assert(cursor == begin + length);
    }

    // Writes mClassCount, mThreadCount, mKernelCount and then the raw YoloKernel array to
    // `buffer`. Asserts that the number of bytes written equals getSerializationSize().
    void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT
    {
        char* const begin = static_cast<char*>(buffer);
        char* cursor = begin;

        Tn::write(cursor, mClassCount);
        Tn::write(cursor, mThreadCount);
        Tn::write(cursor, mKernelCount);

        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(cursor, mYoloKernel.data(), kernelBytes);
        cursor += kernelBytes;

        assert(cursor == begin + getSerializationSize());
    }
    
    // Byte count of the serialized form: the three scalar members plus one YoloKernel per head.
    size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT
    {
        const size_t headerBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount);
        const size_t kernelBytes = sizeof(Yolo::YoloKernel) * mYoloKernel.size();
        return headerBytes + kernelBytes;
    }

    // No per-engine setup is performed; always returns 0.
    int YoloLayerPlugin::initialize() TRT_NOEXCEPT
    {
        return 0;
    }
    
    // The output is a (N, 1, 1) tensor where N is one float plus the number of floats needed
    // for MAX_OUTPUT_BBOX_COUNT Detection structs. All arguments are ignored.
    Dims YoloLayerPlugin::getOutputDimensions(int /*index*/, const Dims* /*inputs*/, int /*nbInputDims*/) TRT_NOEXCEPT
    {
        const int detectionFloats = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
        return Dims3(detectionFloats + 1, 1, 1);
    }

    // Stores the namespace pointer as given; the string itself is not copied.
    void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT
    {
        this->mPluginNamespace = pluginNamespace;
    }

    // Returns the pointer last stored by setPluginNamespace().
    const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT
    {
        return this->mPluginNamespace;
    }

    // The plugin output is always reported as DataType::kFLOAT, whatever the index or input types.
    DataType YoloLayerPlugin::getOutputDataType(int /*index*/, const nvinfer1::DataType* /*inputTypes*/, int /*nbInputs*/) const TRT_NOEXCEPT
    {
        return DataType::kFLOAT;
    }

    // Always false: the output is never reported as broadcast across the batch.
    bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int /*outputIndex*/, const bool* /*inputIsBroadcasted*/, int /*nbInputs*/) const TRT_NOEXCEPT
    {
        return false;
    }

    // Always false: no input is reported as usable when broadcast across the batch.
    bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int /*inputIndex*/) const TRT_NOEXCEPT
    {
        return false;
    }

    // Intentionally empty: the plugin takes no configuration from the tensor descriptions.
    void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* /*in*/, int /*nbInput*/, const PluginTensorDesc* /*out*/, int /*nbOutput*/) TRT_NOEXCEPT
    {
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT
    {
    }

    // Intentionally empty: attachToContext() acquires nothing, so there is nothing to release.
    void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT
    {
    }

    // Plugin type string; the same literal is returned by YoloPluginCreator::getPluginName().
    const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT
    {
        return "YoloLayer_TRT";
    }

    // Plugin version string; the same literal is returned by YoloPluginCreator::getPluginVersion().
    const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT
    {
        return "1";
    }

    // Self-deletes the plugin object; the object must not be used after this call.
    void YoloLayerPlugin::destroy() TRT_NOEXCEPT
    {
        delete this;
    }

    // Clone the plugin.
    // The copy carries over the state of this instance (class count, detection heads,
    // thread count and namespace) rather than the compile-time defaults, so a plugin that
    // was restored from a serialized engine clones to an equivalent plugin.
    // The caller owns the returned object and releases it through destroy().
    IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT
    {
        YoloLayerPlugin *p = new YoloLayerPlugin();
        p->mClassCount = mClassCount;
        p->mKernelCount = mKernelCount;
        p->mYoloKernel = mYoloKernel;
        p->mThreadCount = mThreadCount;
        p->setPluginNamespace(mPluginNamespace);
        return p;
    }

    // Logistic (sigmoid) function: 1 / (1 + exp(-data)).
    __device__ float Logist(float data)
    {
        const float denom = 1.0f + expf(-data);
        return 1.0f / denom;
    }

    // Decodes one detection head. Each thread handles one grid cell of one batch item and
    // checks every anchor of that cell.
    //
    //   input       - head activations; per batch item the values are addressed as
    //                 [anchor][5 + classes][cell] (see the index arithmetic below)
    //   output      - per batch item `outputElem` floats: a leading float counter followed
    //                 by packed Detection structs
    //   noElements  - total number of (batch item, cell) pairs to process
    //   yoloWidth / yoloHeight - grid size of this head
    //   anchors     - CHECK_COUNT (width, height) pairs
    //   classes     - number of class scores per anchor
    //   outputElem  - stride, in floats, between the output regions of two batch items
    __global__ void CalDetection(const float *input, float *output,int noElements,
            int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) {

        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= noElements) return;

        int total_grid = yoloWidth * yoloHeight;
        int bnIdx = idx / total_grid;          // batch item this thread belongs to
        idx = idx - total_grid*bnIdx;          // cell index inside that batch item
        int info_len_i = 5 + classes;          // x, y, w, h, objectness + class scores
        const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);

        // Iterate over all anchors; the bound is CHECK_COUNT so that it stays in step with
        // the anchor array size and the per-item input stride computed above.
        for (int k = 0; k < CHECK_COUNT; ++k) {
            // First value of anchor k for this cell; channel c lives at cell[c * total_grid].
            const float* cell = curInput + idx + k * info_len_i * total_grid;

            int class_id = 0;
            float max_cls_prob = 0.0;
            for (int i = 5; i < info_len_i; ++i) {
                float p = Logist(cell[i * total_grid]);
                if (p > max_cls_prob) {
                    max_cls_prob = p;
                    class_id = i - 5;
                }
            }
            float box_prob = Logist(cell[4 * total_grid]);
            if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue;

            // Reserve a slot: the leading float of the batch item's region is the counter.
            float *res_count = output + bnIdx*outputElem;
            int count = (int)atomicAdd(res_count, 1);
            // The counter keeps growing past the capacity; extra detections are dropped.
            if (count >= MAX_OUTPUT_BBOX_COUNT) return;
            char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection);
            Detection* det =  (Detection*)(data);

            int row = idx / yoloWidth;
            int col = idx % yoloWidth;

            //Location
            det->bbox[0] = (col + Logist(cell[0 * total_grid])) * INPUT_W / yoloWidth;
            det->bbox[1] = (row + Logist(cell[1 * total_grid])) * INPUT_H / yoloHeight;
            det->bbox[2] = expf(cell[2 * total_grid]) * anchors[2*k];
            det->bbox[3] = expf(cell[3 * total_grid]) * anchors[2*k + 1];
            det->det_confidence = box_prob;
            det->class_id = class_id;
            det->class_confidence = max_cls_prob;
        }
    }

    void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
        void* devAnchor;
        size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
        CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));

        int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);

        for(int idx = 0 ; idx < batchSize; ++idx) {
            CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float)));
        }
        int numElem = 0;
        for (unsigned int i = 0;i< mYoloKernel.size();++i)
        {
            const auto& yolo = mYoloKernel[i];
            numElem = yolo.width*yolo.height*batchSize;
            if (numElem < mThreadCount)
                mThreadCount = numElem;
            CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
            CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>
                (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount ,outputElem);
        }

        CUDA_CHECK(cudaFree(devAnchor));
    }


    // Entry point called for inference: forwards the inputs and the first output buffer
    // to forwardGpu() and returns 0. The workspace pointer is not used.
    int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* /*workspace*/, cudaStream_t stream) TRT_NOEXCEPT
    {
        const float* const* headInputs = reinterpret_cast<const float* const*>(inputs);
        float* detections = static_cast<float*>(outputs[0]);
        forwardGpu(headInputs, detections, stream, batchSize);
        return 0;
    }

    // Storage for the creator's static field collection and its (empty) attribute list.
    PluginFieldCollection YoloPluginCreator::mFC{};
    std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

    // Resets the attribute list and points the field collection at it; the plugin
    // exposes no creation fields.
    YoloPluginCreator::YoloPluginCreator()
    {
        mPluginAttributes.clear();

        mFC.fields = mPluginAttributes.data();
        mFC.nbFields = mPluginAttributes.size();
    }

    // Name under which the plugin is looked up; matches YoloLayerPlugin::getPluginType().
    const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT
    {
        return "YoloLayer_TRT";
    }

    // Version string of the plugin; matches YoloLayerPlugin::getPluginVersion().
    const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT
    {
        return "1";
    }

    // Returns the address of the shared static field collection set up by the constructor.
    const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT
    {
        return &mFC;
    }

    // Creates a default-configured plugin and tags it with the creator's namespace.
    // Neither `name` nor `fc` is consulted.
    IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* /*name*/, const PluginFieldCollection* /*fc*/) TRT_NOEXCEPT
    {
        YoloLayerPlugin* plugin = new YoloLayerPlugin();
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

    // Rebuilds a plugin from its serialized bytes and tags it with the creator's namespace.
    // The returned object is released through YoloLayerPlugin::destroy(), which deletes it.
    IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* /*name*/, const void* serialData, size_t serialLength) TRT_NOEXCEPT
    {
        YoloLayerPlugin* plugin = new YoloLayerPlugin(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

}


================================================
FILE: yolov3-tiny/yololayer.h
================================================
#ifndef _YOLO_LAYER_H
#define _YOLO_LAYER_H

#include <vector>
#include <string>
#include "NvInfer.h"
#include "macros.h"


// Compile-time configuration and plain data types shared by the YOLO plugin and the host code.
namespace Yolo
{
    // Number of anchors per grid cell; also sizes YoloKernel::anchors (two floats per anchor).
    static constexpr int CHECK_COUNT = 3;
    // Candidates whose class or objectness probability is below this value are skipped by the kernel.
    static constexpr float IGNORE_THRESH = 0.1f;
    // Capacity, in Detection structs, of the plugin output per batch item.
    static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
    // Number of class scores per anchor.
    static constexpr int CLASS_NUM = 80;
    // Network input height and width in pixels.
    static constexpr int INPUT_H = 608;
    static constexpr int INPUT_W = 608;

    // Description of one detection head: its grid size and its anchor boxes.
    // Instances are copied as raw bytes when the plugin is (de)serialized.
    struct YoloKernel
    {
        int width;
        int height;
        // (width, height) pairs, one pair per anchor.
        float anchors[CHECK_COUNT*2];
    };

    // Head with a grid of INPUT / 32 cells per side.
    static constexpr YoloKernel yolo1 = {
        INPUT_W / 32,
        INPUT_H / 32,
        {81,82, 135,169, 344,319}
    };
    // Head with a grid of INPUT / 16 cells per side.
    static constexpr YoloKernel yolo2 = {
        INPUT_W / 16,
        INPUT_H / 16,
        {23,27, 37,58, 81,82}
    };

    // Number of box coordinates stored per detection.
    static constexpr int LOCATIONS = 4;
    // One decoded detection; every member is a float, so the struct is addressed as a run of floats
    // in the plugin output buffer.
    struct alignas(float) Detection{
        //x y w h
        float bbox[LOCATIONS];
        // Objectness probability of the box.
        float det_confidence;
        // Index of the best class, stored as a float.
        float class_id;
        // Probability of the best class.
        float class_confidence;
    };
}


namespace nvinfer1
{
    // TensorRT plugin that decodes the raw YOLO head activations into Detection records.
    // It has a single FP32 linear output holding a float counter followed by packed detections.
    class YoloLayerPlugin: public IPluginV2IOExt
    {
        public:
            // Builds the plugin from the compile-time configuration in namespace Yolo.
            explicit YoloLayerPlugin();
            // Rebuilds the plugin from `length` bytes previously produced by serialize().
            YoloLayerPlugin(const void* data, size_t length);

            ~YoloLayerPlugin();

            // The plugin always has exactly one output.
            int getNbOutputs() const TRT_NOEXCEPT override
            {
                return 1;
            }

            Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

            int initialize() TRT_NOEXCEPT override;

            // Nothing to tear down.
            virtual void terminate() TRT_NOEXCEPT override {}

            // No scratch workspace is requested.
            virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0;}

            virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

            virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

            virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

            // Only linear-format FP32 tensors are accepted, for inputs and outputs alike.
            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const TRT_NOEXCEPT override;

            const char* getPluginVersion() const TRT_NOEXCEPT override;

            void destroy() TRT_NOEXCEPT override;

            IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

            void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

            const char* getPluginNamespace() const TRT_NOEXCEPT override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;

            bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;

            bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

            void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;

            void detachFromContext() TRT_NOEXCEPT override;

        private:
            // Launches the decoding kernel for every detection head.
            void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1);
            int mClassCount;                            // number of class scores per anchor
            int mKernelCount;                           // number of entries in mYoloKernel
            std::vector<Yolo::YoloKernel> mYoloKernel;  // one entry per detection head
            int mThreadCount = 256;                     // upper bound for threads per block
            // Namespace pointer handed in through setPluginNamespace(); not owned.
            // Starts as an empty string so that getPluginNamespace() and clone() never
            // observe an indeterminate pointer before the namespace has been set.
            const char* mPluginNamespace = "";
    };

    // Factory that TensorRT uses to create YoloLayerPlugin objects, either fresh
    // (createPlugin) or from serialized bytes (deserializePlugin).
    class YoloPluginCreator : public IPluginCreator
    {
        public:
            YoloPluginCreator();

            ~YoloPluginCreator() override = default;

            // Identification of the plugin this creator produces.
            const char* getPluginName() const TRT_NOEXCEPT override;
            const char* getPluginVersion() const TRT_NOEXCEPT override;

            // Field collection describing the creation parameters.
            const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

            // Construction of plugin instances.
            IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;
            IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;

            // Stores a copy of the namespace string.
            void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override
            {
                this->mNamespace = libNamespace;
            }

            // Returns the stored namespace; the pointer refers to the creator's own string.
            const char* getPluginNamespace() const TRT_NOEXCEPT override
            {
                return this->mNamespace.c_str();
            }

        private:
            std::string mNamespace;
            static PluginFieldCollection mFC;
            static std::vector<PluginField> mPluginAttributes;
    };
    REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};

#endif 


================================================
FILE: yolov3-tiny/yolov3-tiny.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include "yololayer.h"

// Evaluates `status` once; when the result is non-zero, prints "Cuda failure: <value>"
// to std::cerr and terminates the process with abort(). Wrapped in do/while(0) so that
// it behaves as a single statement and requires a trailing semicolon at the call site.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

#define USE_FP16  // comment out this if want to use FP32
#define DEVICE 0  // GPU id
#define NMS_THRESH 0.5
#define BBOX_CONF_THRESH 0.4

using namespace nvinfer1;

// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
// Number of floats in the plugin output per image: one leading counter plus room for
// Yolo::MAX_OUTPUT_BBOX_COUNT detections. Derived from the plugin's own constants so
// that the host buffer cannot drift from the layout the yololayer writes.
static const int OUTPUT_SIZE =
    static_cast<int>(Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1);
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
static Logger gLogger;

// Letterboxes `img` into an INPUT_W x INPUT_H canvas: the image is resized with its
// aspect ratio preserved (bicubic interpolation) and centred on a canvas filled with
// the value 128 in all three channels.
cv::Mat preprocess_img(cv::Mat& img) {
    const float r_w = INPUT_W / (img.cols*1.0);
    const float r_h = INPUT_H / (img.rows*1.0);

    // Start from "fills the whole canvas" and shrink the dimension that must be padded.
    int w = INPUT_W;
    int h = INPUT_H;
    int x = 0;
    int y = 0;
    if (r_h > r_w) {
        // Width is the limiting side: pad above and below.
        h = r_w * img.rows;
        y = (INPUT_H - h) / 2;
    } else {
        // Height is the limiting side: pad left and right.
        w = r_h* img.cols;
        x = (INPUT_W - w) / 2;
    }

    cv::Mat resized(h, w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_CUBIC);

    cv::Mat canvas(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
    resized.copyTo(canvas(cv::Rect(x, y, resized.cols, resized.rows)));
    return canvas;
}

// Converts a centre-format box (cx, cy, w, h) given in letterboxed network coordinates
// into a cv::Rect in the coordinate system of the original image `img`.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float r_w = INPUT_W / (img.cols * 1.0);
    const float r_h = INPUT_H / (img.rows * 1.0);
    const bool width_limited = r_h > r_w;

    // Edges in network coordinates with the letterbox padding removed
    // (each value is truncated to int before it is rescaled below).
    int left, right, top, bottom;
    if (width_limited) {
        left = bbox[0] - bbox[2]/2.f;
        right = bbox[0] + bbox[2]/2.f;
        top = bbox[1] - bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
        bottom = bbox[1] + bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
    } else {
        left = bbox[0] - bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
        right = bbox[0] + bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
        top = bbox[1] - bbox[3]/2.f;
        bottom = bbox[1] + bbox[3]/2.f;
    }

    // Undo the resize using the ratio that was applied to the image.
    const float scale = width_limited ? r_w : r_h;
    left = left / scale;
    right = right / scale;
    top = top / scale;
    bottom = bottom / scale;

    return cv::Rect(left, top, right - left, bottom - top);
}

// Intersection-over-union of two boxes given in centre format (cx, cy, w, h).
// Returns 0 when the boxes do not overlap, and also when the union area is not
// positive (e.g. two zero-sized boxes), where the plain ratio would be 0/0.
float iou(float lbox[4], float rbox[4]) {
    const float left   = std::max(lbox[0] - lbox[2]/2.f, rbox[0] - rbox[2]/2.f);
    const float right  = std::min(lbox[0] + lbox[2]/2.f, rbox[0] + rbox[2]/2.f);
    const float top    = std::max(lbox[1] - lbox[3]/2.f, rbox[1] - rbox[3]/2.f);
    const float bottom = std::min(lbox[1] + lbox[3]/2.f, rbox[1] + rbox[3]/2.f);

    if (top > bottom || left > right)
        return 0.0f;

    const float interArea = (right - left) * (bottom - top);
    const float unionArea = lbox[2]*lbox[3] + rbox[2]*rbox[3] - interArea;
    if (unionArea <= 0.0f)
        return 0.0f;  // degenerate boxes: avoid dividing by zero

    return interArea / unionArea;
}

// Ordering predicate for std::sort: places detections with higher det_confidence first.
bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
    return b.det_confidence < a.det_confidence;
}

// Per-class non-maximum suppression over the raw plugin output.
//
//   res        - receives the surviving detections (appended)
//   output     - plugin output for one image: output[0] is the detection count, followed by
//                packed Yolo::Detection records
//   nms_thresh - a detection is dropped when its IoU with an already kept detection of the
//                same class exceeds this value
//
// Detections whose det_confidence is not above BBOX_CONF_THRESH are ignored. The record
// size and the buffer capacity come from the Yolo constants rather than literal numbers.
void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
    const int det_floats = sizeof(Yolo::Detection) / sizeof(float);

    // Group the confident detections by class id.
    std::map<float, std::vector<Yolo::Detection>> by_class;
    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
        Yolo::Detection det;
        memcpy(&det, &output[1 + det_floats * i], sizeof(Yolo::Detection));
        if (det.det_confidence <= BBOX_CONF_THRESH) continue;
        by_class[det.class_id].push_back(det);
    }

    for (auto& entry : by_class) {
        auto& dets = entry.second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t kept = 0; kept < dets.size(); ++kept) {
            auto& item = dets[kept];
            res.push_back(item);
            // Drop every later detection that overlaps the kept one too much; a single
            // remove_if pass keeps the relative order of the remaining detections.
            dets.erase(std::remove_if(dets.begin() + kept + 1, dets.end(),
                                      [&](Yolo::Detection& other) {
                                          return iou(item.bbox, other.bbox) > nms_thresh;
                                      }),
                       dets.end());
        }
    }
}

// TensorRT weight files have a simple space delimited format:
// [type] [size] <data x size in hex>
//
// Reads the file and returns a map from blob name to Weights. Every blob is stored as
// DataType::kFLOAT; its values are the 32-bit words parsed from the hexadecimal text.
// The value buffers are allocated with malloc and are not freed by this function.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The buffer holds `size` 32-bit words, so it is sized by the element
        // type (sizeof of the pointer variable would allocate pointer-sized slots instead).
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a batch-norm layer expressed as a per-channel IScaleLayer.
//
// Reads "<lname>.weight", ".bias", ".running_mean" and ".running_var" from
// weightMap and folds them into
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The three malloc'ed buffers are registered in weightMap under
// "<lname>.scale", ".shift" and ".power".
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float *gamma = (float*)weightMap[lname + ".weight"].values;
    float *beta = (float*)weightMap[lname + ".bias"].values;
    float *mean = (float*)weightMap[lname + ".running_mean"].values;
    float *var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    const size_t nbytes = sizeof(float) * len;
    float *scval = reinterpret_cast<float*>(malloc(nbytes));
    float *shval = reinterpret_cast<float*>(malloc(nbytes));
    float *pval = reinterpret_cast<float*>(malloc(nbytes));

    // One pass per channel fills all three coefficient arrays.
    for (int c = 0; c < len; c++) {
        scval[c] = gamma[c] / sqrt(var[c] + eps);
        shval[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        pval[c] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Adds a bias-free square convolution, a batch-norm (eps 1e-4) and a leaky ReLU
// with alpha 0.1, and returns the activation layer.
// Weights are looked up under "module_list.<linx>.Conv2d.weight" and
// "module_list.<linx>.BatchNorm2d.*"; `s` is the stride and `p` the padding,
// applied to both spatial dimensions.
ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,  int outch, int ksize, int s, int p, int linx) {
    const std::string prefix = "module_list." + std::to_string(linx);
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[prefix + ".Conv2d.weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(DimsHW{p, p});

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), prefix + ".BatchNorm2d", 1e-4);

    auto leaky = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky->setAlpha(0.1);
    return leaky;
}

// Create the engine using only the API and not any parser.
//
// Builds the yolov3-tiny network layer by layer from the weights in
// "../yolov3-tiny.wts", appends the "YoloLayer_TRT" plugin fed by the two
// detection heads (conv15 and conv22), and builds an engine from it.
//
// maxBatchSize - passed to builder->setMaxBatchSize()
// builder      - builder used to create the network and the engine
// config       - builder config; workspace size and (with USE_FP16) the FP16 flag are set on it
// dt           - data type of the input tensor
//
// Returns the result of buildEngineWithConfig(); it is not checked here, so the
// caller is responsible for handling a null engine.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../yolov3-tiny.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // Backbone: 3x3 conv blocks alternating with 2x2 stride-2 max pooling.
    // The numeric suffix of each variable is the module_list index of the layer.
    auto lr0 = convBnLeaky(network, weightMap, *data, 16, 3, 1, 1, 0);
    auto pool1 = network->addPoolingNd(*lr0->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool1->setStrideNd(DimsHW{2, 2});
    auto lr2 = convBnLeaky(network, weightMap, *pool1->getOutput(0), 32, 3, 1, 1, 2);
    auto pool3 = network->addPoolingNd(*lr2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool3->setStrideNd(DimsHW{2, 2});
    auto lr4 = convBnLeaky(network, weightMap, *pool3->getOutput(0), 64, 3, 1, 1, 4);
    auto pool5 = network->addPoolingNd(*lr4->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool5->setStrideNd(DimsHW{2, 2});
    auto lr6 = convBnLeaky(network, weightMap, *pool5->getOutput(0), 128, 3, 1, 1, 6);
    auto pool7 = network->addPoolingNd(*lr6->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool7->setStrideNd(DimsHW{2, 2});
    auto lr8 = convBnLeaky(network, weightMap, *pool7->getOutput(0), 256, 3, 1, 1, 8);
    auto pool9 = network->addPoolingNd(*lr8->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool9->setStrideNd(DimsHW{2, 2});
    auto lr10 = convBnLeaky(network, weightMap, *pool9->getOutput(0), 512, 3, 1, 1, 10);
    // Layer 11 is a 2x2 max pool with stride 1; one row/column of post-padding is
    // added first so the pooling does not shrink the feature map.
    auto pad11 = network->addPaddingNd(*lr10->getOutput(0), DimsHW{0, 0}, DimsHW{1, 1});
    auto pool11 = network->addPoolingNd(*pad11->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool11->setStrideNd(DimsHW{1, 1});
    auto lr12 = convBnLeaky(network, weightMap, *pool11->getOutput(0), 1024, 3, 1, 1, 12);
    auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 256, 1, 1, 0, 13);
    auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 512, 3, 1, 1, 14);
    // First detection head: 1x1 conv with bias, 3 * (CLASS_NUM + 5) output channels.
    IConvolutionLayer* conv15 = network->addConvolutionNd(*lr14->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.15.Conv2d.weight"], weightMap["module_list.15.Conv2d.bias"]);
    // 16 is yolo
    // Layer 17 simply reuses the output of layer 13 (no new TensorRT layer is added).
    auto l17 = lr13;
    auto lr18 = convBnLeaky(network, weightMap, *l17->getOutput(0), 128, 1, 1, 0, 18);

    // Layer 19: a depthwise (128 groups) 2x2 deconvolution with stride 2 and
    // all-ones weights, i.e. every input value is copied into a 2x2 output block.
    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 128 * 2 * 2));
    for (int i = 0; i < 128 * 2 * 2; i++) {
        deval[i] = 1.0;
    }
    Weights deconvwts19{DataType::kFLOAT, deval, 128 * 2 * 2};
    IDeconvolutionLayer* deconv19 = network->addDeconvolutionNd(*lr18->getOutput(0), 128, DimsHW{2, 2}, deconvwts19, emptywts);
    assert(deconv19);
    deconv19->setStrideNd(DimsHW{2, 2});
    deconv19->setNbGroups(128);
    // Registered in weightMap so the buffer is freed together with the loaded weights below.
    weightMap["deconv19"] = deconvwts19;

    // Layer 20: concatenate the upsampled features with the output of layer 8.
    ITensor* inputTensors[] = {deconv19->getOutput(0), lr8->getOutput(0)};
    auto cat20 = network->addConcatenation(inputTensors, 2);
    auto lr21 = convBnLeaky(network, weightMap, *cat20->getOutput(0), 256, 3, 1, 1, 21);
    // Second detection head: 1x1 conv with bias, 3 * (CLASS_NUM + 5) output channels.
    IConvolutionLayer* conv22 = network->addConvolutionNd(*lr21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.22.Conv2d.weight"], weightMap["module_list.22.Conv2d.bias"]);
    // The yolo layer follows conv22 (layer 22 itself is the convolution above).

    // Both heads are decoded by a single YoloLayer_TRT plugin instance.
    // NOTE(review): the result of getPluginCreator() is not checked for null.
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    const PluginFieldCollection* pluginData = creator->getFieldNames();
    IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData);
    ITensor* inputTensors_yolo[] = {conv15->getOutput(0), conv22->getOutput(0)};
    auto yolo = network->addPluginV2(inputTensors_yolo, 2, *pluginObj);

    yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // NOTE(review): this message is printed even when `engine` is null.
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory (loaded blobs plus the buffers added by addBatchNorm2d and deconv19)
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the engine through the network-definition API and serializes it.
//
// maxBatchSize - maximum batch size the engine is built for
// modelStream  - out parameter; receives the serialized engine. The caller owns
//                the returned IHostMemory and must destroy() it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder and its configuration object
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The config is released as well; it was previously leaked.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Runs one synchronous inference pass.
//
// context   - execution context of an engine with exactly two bindings
//             (INPUT_BLOB_NAME and OUTPUT_BLOB_NAME)
// input     - host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats
// output    - host buffer receiving batchSize * OUTPUT_SIZE floats
// batchSize - number of images in `input`
//
// Device buffers and the CUDA stream are created and released on every call.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Guard the array accesses below against a binding name the engine does not know.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "doInference: enqueue failed, output is not valid" << std::endl;
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work are reported here, so do not ignore the result.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Appends the name of every entry of directory `p_dir_name`, except "." and "..",
// to `file_names`. Only the bare entry name is stored; it is not joined with the
// directory path.
// Returns 0 on success and -1 when the directory cannot be opened.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir = opendir(p_dir_name);
    if (!dir) {
        return -1;
    }

    for (struct dirent *entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.emplace_back(entry->d_name);
    }

    closedir(dir);
    return 0;
}

// Entry point.
//
//   ./yolov3-tiny -s          build the engine and write it to "yolov3-tiny.engine"
//   ./yolov3-tiny -d <dir>    load "yolov3-tiny.engine" and run detection on every
//                             readable image in <dir>; each result is written to
//                             "_<image name>" in the current directory
//
// Returns 0 on success and -1 on bad arguments or I/O failure.
int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("yolov3-tiny.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 3 && std::string(argv[1]) == "-d") {
        std::ifstream file("yolov3-tiny.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        } else {
            // Without this branch a null buffer of size 0 would be handed to
            // deserializeCudaEngine() further down.
            std::cerr << "could not read yolov3-tiny.engine, run './yolov3-tiny -s' first" << std::endl;
            return -1;
        }
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov3-tiny -s  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov3-tiny -d ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    std::vector<std::string> file_names;
    if (read_files_in_dir(argv[2], file_names) < 0) {
        std::cout << "read_files_in_dir failed." << std::endl;
        delete[] trtModelStream;
        return -1;
    }

    // prepare input data ---------------------------
    static float data[3 * INPUT_H * INPUT_W];
    static float prob[OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    int fcount = 0;
    for (const auto& f : file_names) {
        fcount++;
        std::cout << fcount << "  " << f << std::endl;
        cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f);
        if (img.empty()) continue;  // not an image, or unreadable
        cv::Mat pr_img = preprocess_img(img);
        // Interleaved 8-bit pixels -> planar float in [0, 1]; channel 2 of each pixel
        // goes to the first plane and channel 0 to the last.
        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
            data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
            data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
            data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
        }

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        std::vector<Yolo::Detection> res;
        nms(res, prob);
        // Debug output: first raw output values, then every kept detection.
        for (int i=0; i<20; i++) {
            std::cout << prob[i] << ",";
        }
        std::cout << res.size() << std::endl;
        for (size_t j = 0; j < res.size(); j++) {
            float *p = (float*)&res[j];
            for (size_t k = 0; k < 7; k++) {
                std::cout << p[k] << ", ";
            }
            std::cout << std::endl;
            cv::Rect r = get_rect(img, res[j].bbox);
            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
        cv::imwrite("_" + f, img);
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}


================================================
FILE: yolov4/CMakeLists.txt
================================================
# Build script for the yolov4 sample: one shared library with the CUDA plugins
# (myplugins) and one executable (yolov4) that links against it.
cmake_minimum_required(VERSION 2.6)

project(yolov4)

add_definitions(-std=c++11)

# NOTE(review): "OFF" is the only argument after the variable name here; confirm
# against the option() signature that it is taken as the value and not as the help text.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# NOTE(review): the build type is forced to Debug while CMAKE_CXX_FLAGS below adds -Ofast.
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Plugin library: the yolo layer and the mish activation CUDA sources.
cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/mish.cu)
target_link_libraries(myplugins nvinfer cudart)

# NOTE(review): OpenCV is not marked REQUIRED although the executable links ${OpenCV_LIBS}.
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Main executable.
add_executable(yolov4 ${PROJECT_SOURCE_DIR}/yolov4.cpp)
target_link_libraries(yolov4 nvinfer)
target_link_libraries(yolov4 cudart)
target_link_libraries(yolov4 myplugins)
target_link_libraries(yolov4 ${OpenCV_LIBS})

add_definitions(-O2 -pthread)


================================================
FILE: yolov4/README.md
================================================
# yolov4

The PyTorch implementation is from the [ultralytics/yolov3 archive branch](https://github.com/ultralytics/yolov3/tree/archive). It can load yolov4.cfg and yolov4.weights (from AlexeyAB/darknet).

## Config

- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
- Number of classes `CLASS_NUM` defined in yololayer.h
- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4.cpp
- GPU id can be selected by the macro `DEVICE` in yolov4.cpp
- NMS thresh `NMS_THRESH` in yolov4.cpp
- bbox confidence threshold `BBOX_CONF_THRESH` in yolov4.cpp
- `BATCH_SIZE` in yolov4.cpp

## How to run

1. Generate yolov4.wts from the PyTorch implementation with yolov4.cfg and yolov4.weights, or download the .wts file from the model zoo.

```
git clone https://github.com/wang-xinyu/tensorrtx.git
git clone -b archive https://github.com/ultralytics/yolov3.git
// download yolov4.weights from https://github.com/AlexeyAB/darknet#pre-trained-models
cp {tensorrtx}/yolov4/gen_wts.py {ultralytics/yolov3/}
cd {ultralytics/yolov3/}
python gen_wts.py yolov4.weights
// a file 'yolov4.wts' will be generated.
// the master branch of yolov3 should work, if not, you can checkout be87b41aa2fe59be8e62f4b488052b24ad0bd450
```

2. Put yolov4.wts into {tensorrtx}/yolov4, then build and run.

```
mv yolov4.wts {tensorrtx}/yolov4/
cd {tensorrtx}/yolov4
mkdir build
cd build
cmake ..
make
sudo ./yolov4 -s                          // serialize model to plan file i.e. 'yolov4.engine'
sudo ./yolov4 -d ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed.
```

3. Check the generated images, `_zidane.jpg` and `_bus.jpg`; they should look as follows.

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/80863728-cbd3a780-8cb0-11ea-8640-7983bb41c354.jpg">
</p>

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/80863730-cfffc500-8cb0-11ea-810e-94d693e71d80.jpg">
</p>

## More Information

See the README on the [home page](https://github.com/wang-xinyu/tensorrtx).


================================================
FILE: yolov4/gen_wts.py
================================================
import struct
import sys
import torch
from models import *  # noqa: F403
from utils.utils import *  # noqa: F403

# Build the network described by cfg/yolov4.cfg; (608, 608) is passed as the
# second argument (presumably the input size - confirm against Darknet).
model = Darknet('cfg/yolov4.cfg', (608, 608))  # noqa: F405
weights = sys.argv[1]
device = torch_utils.select_device('0')  # noqa: F405
if not weights.endswith('.pt'):  # darknet format
    load_darknet_weights(model, weights)  # noqa: F405
else:  # pytorch format
    checkpoint = torch.load(weights, map_location=device, weights_only=False)
    model.load_state_dict(checkpoint['model'])

# Write the text weight file: the first line is the number of tensors, then one
# line per tensor holding its name, its element count and every element as the
# hex form of a big-endian 32-bit float.
with open('yolov4.wts', 'w') as f:
    state = model.state_dict()
    f.write('{}\n'.format(len(state.keys())))
    for name, tensor in state.items():
        flat = tensor.reshape(-1).cpu().numpy()
        hex_words = ''.join(' ' + struct.pack('>f', float(x)).hex() for x in flat)
        f.write('{} {} '.format(name, len(flat)) + hex_words + '\n')


================================================
FILE: yolov4/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents, preceded by a timestamp and a prefix,
//!  to a target stream whenever it is synchronized or destroyed.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    Target stream that receives the formatted output
    //! \param prefix    Text inserted between the timestamp and the buffered message
    //! \param shouldLog When false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Copies the target stream, prefix and logging flag of \p other.
    //! All three members are carried over: leaving mShouldLog uninitialized made
    //! putOutput() read an indeterminate value, and the prefix was lost.
    //! Text still buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer contents>" to the target stream,
    //! clears the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so that the
            // two are never split between std::cout and std::cerr
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            mOutput << "[";
            mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; //!< Target stream
    std::string mPrefix;   //!< Text written before every message
    bool mShouldLog;       //!< Whether putOutput() emits anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer, used as a base class so that the buffer is
//!  constructed before the std::ostream base of LogStreamConsumer.
//!
class LogStreamConsumerBase
{
protected:
    LogStreamConsumerBuffer mBuffer;

public:
    //! Forwards all three arguments to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for log messages of one fixed severity, usable with C++ stream syntax.
//!  The base class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  LogStreamConsumerBase owns the LogStreamConsumerBuffer, and the address of that buffer is
//!  handed to std::ostream, so the buffer has to be constructed first.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Messages are emitted only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base class
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a new consumer with the same severity and logging flag as \p other.
    //!  A fresh buffer is created; nothing buffered in \p other is carried over.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base class
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! \brief std::cout for severities comparing >= kINFO, std::cerr for all others.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Bracketed one-letter tag written in front of every message of the given severity.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            return "[F] ";
        case Severity::kERROR:
            return "[E] ";
        case Severity::kWARNING:
            return "[W] ";
        case Severity::kINFO:
            return "[I] ";
        case Severity::kVERBOSE:
            return "[V] ";
        default:
            assert(0);
            return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger with the given reportable severity (kWARNING by default)
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! The message is prefixed with "[TRT] " and written through a temporary LogStreamConsumer,
    //! which decides from the current reportable severity whether anything is emitted.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        // Only Logger (a friend) can create a TestAtom; see Logger::defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set to true by Logger::reportTestStart()
        std::string mName;    // test name printed in result lines
        std::string mCmdline; // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Note: the RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the reportable severity currently configured for this logger
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints a line of the form "&&&& <RESULT> <test name> # <command line>".
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief Returns a LogStreamConsumer that logs at severity kVERBOSE, filtered by the
//!        logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kINFO, filtered by the
//!        logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kWARNING, filtered by the
//!        logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kERROR, filtered by the
//!        logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Returns a LogStreamConsumer that logs at severity kINTERNAL_ERROR ("fatal" severity),
//!        filtered by the logger's current reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: yolov4/mish.cu
================================================
#include <cmath>
#include <stdio.h>
#include <cassert>
#include <iostream>
#include "mish.h"

namespace nvinfer1
{
    // Default construction performs no explicit initialization here.
    MishPlugin::MishPlugin() = default;

    // Nothing is released on destruction.
    MishPlugin::~MishPlugin() = default;

    // Rebuild the plugin at runtime from the byte stream written by serialize():
    // the payload is a single int holding input_size_.
    MishPlugin::MishPlugin(const void* data, size_t length)
    {
        // The payload must be exactly the size of the one serialized field.
        assert(length == sizeof(input_size_));
        const int* serialized = reinterpret_cast<const int*>(data);
        input_size_ = serialized[0];
    }

    // Write the plugin state (input_size_ only) to the start of the caller-provided buffer.
    void MishPlugin::serialize(void* buffer) const
    {
        int* dst = reinterpret_cast<int*>(buffer);
        dst[0] = input_size_;
    }

    // Number of bytes serialize() writes: one input_size_ field.
    size_t MishPlugin::getSerializationSize() const
    {
        const size_t bytes = sizeof(input_size_);
        return bytes;
    }

    // No setup work is done here; always returns 0.
    int MishPlugin::initialize()
    {
        const int status = 0;
        return status;
    }

    // The output has the same three dimensions as the single input. As a side effect,
    // the product of those three dimensions is cached in input_size_ for the kernel launch.
    Dims MishPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
    {
        assert(nbInputDims == 1);
        assert(index == 0);

        const Dims& in = inputs[0];
        const int d0 = in.d[0];
        const int d1 = in.d[1];
        const int d2 = in.d[2];
        input_size_ = d0 * d1 * d2;

        // Output dimensions
        return Dims3(d0, d1, d2);
    }

    // Set plugin namespace
    void MishPlugin::setPluginNamespace(const char* pluginNamespace)
    {
        mPluginNamespace = pluginNamespace;
    }

    const char* MishPlugin::getPluginNamespace() const
    {
        return mPluginNamespace;
    }

    // Output data type for the requested index: always kFLOAT, regardless of the arguments.
    DataType MishPlugin::getOutputDataType(int /*index*/, const nvinfer1::DataType* /*inputTypes*/, int /*nbInputs*/) const
    {
        const DataType outputType = DataType::kFLOAT;
        return outputType;
    }

    // Whether the output tensor is broadcast across a batch: always false.
    bool MishPlugin::isOutputBroadcastAcrossBatch(int /*outputIndex*/, const bool* /*inputIsBroadcasted*/, int /*nbInputs*/) const
    {
        const bool broadcast = false;
        return broadcast;
    }

    // Whether the plugin can use an input that is broadcast across the batch
    // without replication: always false.
    bool MishPlugin::canBroadcastInputAcrossBatch(int /*inputIndex*/) const
    {
        const bool canBroadcast = false;
        return canBroadcast;
    }

    void MishPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
    {
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    void MishPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
    {
    }

    // Detach the plugin object from its execution context.
    void MishPlugin::detachFromContext() {}

    // Plugin type string; identical to the name returned by MishPluginCreator::getPluginName().
    const char* MishPlugin::getPluginType() const
    {
        static const char* const kType = "Mish_TRT";
        return kType;
    }

    // Plugin version string; identical to MishPluginCreator::getPluginVersion().
    const char* MishPlugin::getPluginVersion() const
    {
        static const char* const kVersion = "1";
        return kVersion;
    }

    // The plugin deletes itself when destroy() is called.
    void MishPlugin::destroy()
    {
        delete this;
    }

    // Create a heap-allocated copy carrying the same input_size_ and namespace pointer.
    IPluginV2IOExt* MishPlugin::clone() const
    {
        MishPlugin* copy = new MishPlugin();
        copy->input_size_ = this->input_size_;
        copy->setPluginNamespace(this->mPluginNamespace);
        return copy;
    }

    // tanh(x) written as 2 / (1 + exp(-2x)) - 1.
    __device__ float tanh_activate_kernel(float x)
    {
        const float denom = 1 + expf(-2 * x);
        return (2 / denom - 1);
    }

    // softplus(x) = log(exp(x) + 1), with shortcuts outside [-threshold, threshold]:
    // returns x itself above the threshold and exp(x) below -threshold.
    __device__ float softplus_kernel(float x, float threshold = 20) {
        if (x > threshold) {
            return x;                // too large
        }
        if (x < -threshold) {
            return expf(x);          // too small
        }
        return logf(expf(x) + 1);
    }

    // Element-wise Mish: output[i] = input[i] * tanh(softplus(input[i])).
    // One thread per element; threads whose index is >= num_elem do nothing.
    __global__ void mish_kernel(const float *input, float *output, int num_elem) {
        const int tid = threadIdx.x + blockDim.x * blockIdx.x;
        if (tid < num_elem) {
            const float x = input[tid];
            const float sp = softplus_kernel(x);
            output[tid] = x * tanh_activate_kernel(sp);
        }
    }

    // Launch mish_kernel over input_size_ * batchSize elements of inputs[0], writing to output.
    //
    // inputs    : array of device pointers; only inputs[0] is read
    // output    : device buffer receiving one float per input element
    // stream    : CUDA stream the kernel is enqueued on
    // batchSize : number of batch items to process
    //
    // The kernel is enqueued on `stream` (previously it was launched on the default stream,
    // ignoring the stream handed to enqueue()).
    void MishPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
        const int num_elem = input_size_ * batchSize;
        // A zero-sized grid is an invalid launch configuration; nothing to do in that case.
        if (num_elem <= 0) return;

        const int block_size = thread_count_;
        const int grid_size = (num_elem + block_size - 1) / block_size;
        mish_kernel<<<grid_size, block_size, 0, stream>>>(inputs[0], output, num_elem);
    }

    // Run the activation for the given batch: reinterprets the input/output pointers as
    // float buffers, forwards them to forwardGpu(), and returns 0. The workspace is unused.
    int MishPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
    {
        const float* const* typedInputs = (const float *const *)inputs;
        float* typedOutput = (float*)outputs[0];
        forwardGpu(typedInputs, typedOutput, stream, batchSize);
        return 0;
    }

    // Storage for the creator's static members.
    PluginFieldCollection MishPluginCreator::mFC{};
    std::vector<PluginField> MishPluginCreator::mPluginAttributes;

    // The creator exposes no plugin fields: the attribute list is emptied and the
    // field collection is pointed at it.
    MishPluginCreator::MishPluginCreator()
    {
        mPluginAttributes.clear();

        mFC.fields = mPluginAttributes.data();
        mFC.nbFields = mPluginAttributes.size();
    }

    // Name under which the plugin is created; matches MishPlugin::getPluginType().
    const char* MishPluginCreator::getPluginName() const
    {
        static const char* const kName = "Mish_TRT";
        return kName;
    }

    // Version string; matches MishPlugin::getPluginVersion().
    const char* MishPluginCreator::getPluginVersion() const
    {
        static const char* const kVersion = "1";
        return kVersion;
    }

    // Address of the shared (static) field collection set up by the constructor.
    const PluginFieldCollection* MishPluginCreator::getFieldNames()
    {
        const PluginFieldCollection* fields = &mFC;
        return fields;
    }

    // Create a default-constructed MishPlugin tagged with this creator's namespace.
    // The name and field collection arguments are not used.
    IPluginV2IOExt* MishPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
    {
        MishPlugin* plugin = new MishPlugin();
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

    // Recreate a MishPlugin from serialized bytes and tag it with this creator's namespace.
    IPluginV2IOExt* MishPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
    {
        // The returned object is heap-allocated; MishPlugin::destroy() deletes it.
        MishPlugin* plugin = new MishPlugin(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

}


================================================
FILE: yolov4/mish.h
================================================
#ifndef _MISH_PLUGIN_H
#define _MISH_PLUGIN_H

#include <string>
#include <vector>
#include "NvInfer.h"

namespace nvinfer1
{
    // TensorRT plugin applying the Mish activation element-wise to a single float input.
    //
    // Serialized state is the single int input_size_. The namespace is kept as a raw,
    // non-owning pointer, so the pointed-to string must outlive the plugin.
    class MishPlugin: public IPluginV2IOExt
    {
        public:
            explicit MishPlugin();
            // Rebuilds the plugin from the bytes written by serialize().
            MishPlugin(const void* data, size_t length);

            ~MishPlugin();

            // The plugin produces exactly one output tensor.
            int getNbOutputs() const override
            {
                return 1;
            }

            Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;

            int initialize() override;

            virtual void terminate() override {};

            // No scratch workspace is required.
            virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}

            virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;

            virtual size_t getSerializationSize() const override;

            virtual void serialize(void* buffer) const override;

            // Only linear-format float tensors are accepted at every position.
            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const override;

            const char* getPluginVersion() const override;

            void destroy() override;

            IPluginV2IOExt* clone() const override;

            void setPluginNamespace(const char* pluginNamespace) override;

            const char* getPluginNamespace() const override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

            bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;

            bool canBroadcastInputAcrossBatch(int inputIndex) const override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;

            void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;

            void detachFromContext() override;

            // Element count of one batch item; set by getOutputDimensions() or deserialization.
            // Initialized to 0 so that clone()/serialize() never read an indeterminate value.
            int input_size_ = 0;
        private:
            void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1);
            // Threads per block used for the kernel launch.
            int thread_count_ = 256;
            // Non-owning namespace pointer; defaults to an empty string so that the getter
            // and clone() never observe an uninitialized pointer.
            const char* mPluginNamespace = "";
    };

    // Factory that creates and deserializes MishPlugin instances; registered through the
    // REGISTER_TENSORRT_PLUGIN macro that follows the class.
    class MishPluginCreator : public IPluginCreator
    {
        public:
            MishPluginCreator();

            ~MishPluginCreator() override = default;

            const char* getPluginName() const override;

            const char* getPluginVersion() const override;

            const PluginFieldCollection* getFieldNames() override;

            IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;

            IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

            // Copies the given namespace string into the creator.
            void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }

            // Returns the namespace copy owned by this creator.
            const char* getPluginNamespace() const override { return mNamespace.c_str(); }

        private:
            std::string mNamespace;
            // Shared by all creator instances; populated in the constructor.
            static PluginFieldCollection mFC;
            static std::vector<PluginField> mPluginAttributes;
    };
    REGISTER_TENSORRT_PLUGIN(MishPluginCreator);
};
#endif 


================================================
FILE: yolov4/utils.h
================================================
#ifndef __TRT_UTILS_H_
#define __TRT_UTILS_H_

#include <iostream>
#include <vector>
#include <algorithm>
#include <cudnn.h>

#ifndef CUDA_CHECK

// Evaluates `callstr` once and stores the result as a cudaError_t. If it is not
// cudaSuccess, prints "CUDA error <code> at <file>:<line>" to std::cerr (no trailing
// newline) and then hits assert(0).
// NOTE(review): assert is compiled out when NDEBUG is defined, in which case a failing
// call is only reported and execution continues — confirm this is intended.
// NOTE(review): the macro uses assert but this header does not include <cassert>; it
// relies on the including file to provide it.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

#endif

// Minimal helpers for packing and unpacking values in a raw byte buffer.
namespace Tn
{
    // Copies the bytes of `val` to `buffer` and advances `buffer` by sizeof(T).
    //
    // The value is copied byte-wise instead of being stored through a T* cast, so the
    // destination does not have to be aligned for T.
    // T is assumed to be trivially copyable.
    template<typename T> 
    void write(char*& buffer, const T& val)
    {
        const char* src = reinterpret_cast<const char*>(&val);
        std::copy(src, src + sizeof(T), buffer);
        buffer += sizeof(T);
    }

    // Copies sizeof(T) bytes from `buffer` into `val` and advances `buffer` by sizeof(T).
    //
    // The value is copied byte-wise instead of being loaded through a T* cast, so the
    // source does not have to be aligned for T.
    // T is assumed to be trivially copyable.
    template<typename T> 
    void read(const char*& buffer, T& val)
    {
        char* dst = reinterpret_cast<char*>(&val);
        std::copy(buffer, buffer + sizeof(T), dst);
        buffer += sizeof(T);
    }
}

#endif


================================================
FILE: yolov4/yololayer.cu
================================================
#include <assert.h>
#include "yololayer.h"
#include "utils.h"

using namespace Yolo;

namespace nvinfer1
{
    // Default construction: uses CLASS_NUM classes and the three built-in scales
    // (yolo1, yolo2, yolo3), then uploads each scale's anchors to its own device buffer.
    YoloLayerPlugin::YoloLayerPlugin()
    {
        mClassCount = CLASS_NUM;

        mYoloKernel.clear();
        const YoloKernel builtin[] = {yolo1, yolo2, yolo3};
        for (const YoloKernel& kernel : builtin)
        {
            mYoloKernel.push_back(kernel);
        }
        mKernelCount = mYoloKernel.size();

        // mAnchor is a pinned host table holding one device pointer per scale.
        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        const size_t anchorBytes = sizeof(float)* CHECK_COUNT*2;
        for (int scale = 0; scale < mKernelCount; ++scale)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[scale], anchorBytes));
            const float* hostAnchors = mYoloKernel[scale].anchors;
            CUDA_CHECK(cudaMemcpy(mAnchor[scale], hostAnchors, anchorBytes, cudaMemcpyHostToDevice));
        }
    }
    
    // Releases the resources acquired by the constructors: one device anchor buffer per
    // scale (cudaMalloc) and the pinned host table of pointers (cudaMallocHost).
    // Previously the destructor was empty, so every created, cloned or deserialized
    // plugin leaked these allocations.
    YoloLayerPlugin::~YoloLayerPlugin()
    {
        for (int ii = 0; ii < mKernelCount; ii++)
        {
            // Return codes are deliberately ignored: a destructor should not abort, and
            // freeing can fail harmlessly during runtime teardown.
            (void)cudaFree(mAnchor[ii]);
        }
        (void)cudaFreeHost(mAnchor);
    }

    // Rebuild the plugin at runtime from the byte stream written by serialize():
    // mClassCount, mThreadCount, mKernelCount, then mKernelCount raw YoloKernel records.
    // The anchors of every scale are then uploaded to freshly allocated device buffers.
    YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
    {
        const char* const begin = reinterpret_cast<const char *>(data);
        const char* cursor = begin;

        Tn::read(cursor, mClassCount);
        Tn::read(cursor, mThreadCount);
        Tn::read(cursor, mKernelCount);

        mYoloKernel.resize(mKernelCount);
        const auto kernelBytes = mKernelCount*sizeof(YoloKernel);
        memcpy(mYoloKernel.data(), cursor, kernelBytes);
        cursor += kernelBytes;

        // mAnchor is a pinned host table holding one device pointer per scale.
        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        const size_t anchorBytes = sizeof(float)* CHECK_COUNT*2;
        for (int scale = 0; scale < mKernelCount; ++scale)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[scale], anchorBytes));
            const float* hostAnchors = mYoloKernel[scale].anchors;
            CUDA_CHECK(cudaMemcpy(mAnchor[scale], hostAnchors, anchorBytes, cudaMemcpyHostToDevice));
        }

        // Every byte of the payload must have been consumed.
        assert(cursor == begin + length);
    }

    void YoloLayerPlugin::serialize(void* buffer) const
    {
        using namespace Tn;
        char* d = static_cast<char*>(buffer), *a = d;
        write(d, mClassCount);
        write(d, mThreadCount);
        write(d, mKernelCount);
        auto kernelSize = mKernelCount*sizeof(YoloKernel);
        memcpy(d,mYoloKernel.data(),kernelSize);
        d += kernelSize;

        assert(d == a + getSerializationSize());
    }
    
    size_t YoloLayerPlugin::getSerializationSize() const
    {  
        return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount)  + sizeof(Yolo::YoloKernel) * mYoloKernel.size();
    }

    // No setup work is done here; always returns 0.
    int YoloLayerPlugin::initialize()
    {
        const int status = 0;
        return status;
    }
    
    // The output is a (N + 1, 1, 1) tensor, where N is the number of floats needed for
    // MAX_OUTPUT_BBOX_COUNT Detection records; the extra element holds the detection count.
    Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
    {
        //output the result to channel
        const int detectionFloats = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
        const int channels = detectionFloats + 1;

        return Dims3(channels, 1, 1);
    }

    // Set plugin namespace
    void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace)
    {
        mPluginNamespace = pluginNamespace;
    }

    const char* YoloLayerPlugin::getPluginNamespace() const
    {
        return mPluginNamespace;
    }

    // Output data type for the requested index: always kFLOAT, regardless of the arguments.
    DataType YoloLayerPlugin::getOutputDataType(int /*index*/, const nvinfer1::DataType* /*inputTypes*/, int /*nbInputs*/) const
    {
        const DataType outputType = DataType::kFLOAT;
        return outputType;
    }

    // Whether the output tensor is broadcast across a batch: always false.
    bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int /*outputIndex*/, const bool* /*inputIsBroadcasted*/, int /*nbInputs*/) const
    {
        const bool broadcast = false;
        return broadcast;
    }

    // Whether the plugin can use an input that is broadcast across the batch
    // without replication: always false.
    bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int /*inputIndex*/) const
    {
        const bool canBroadcast = false;
        return canBroadcast;
    }

    void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
    {
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
    {
    }

    // Detach the plugin object from its execution context.
    void YoloLayerPlugin::detachFromContext() {}

    // Plugin type string; identical to the name returned by YoloPluginCreator::getPluginName().
    const char* YoloLayerPlugin::getPluginType() const
    {
        static const char* const kType = "YoloLayer_TRT";
        return kType;
    }

    // Plugin version string; identical to YoloPluginCreator::getPluginVersion().
    const char* YoloLayerPlugin::getPluginVersion() const
    {
        static const char* const kVersion = "1";
        return kVersion;
    }

    // The plugin deletes itself when destroy() is called.
    void YoloLayerPlugin::destroy()
    {
        delete this;
    }

    // Create a new default-constructed plugin and give it this plugin's namespace pointer.
    // NOTE(review): no other member is copied from this object; the clone always gets
    // the default-constructor configuration — confirm this is intended.
    IPluginV2IOExt* YoloLayerPlugin::clone() const
    {
        YoloLayerPlugin* copy = new YoloLayerPlugin();
        copy->setPluginNamespace(this->mPluginNamespace);
        return copy;
    }

    // Logistic function: 1 / (1 + exp(-data)).
    __device__ float Logist(float data)
    {
        return 1./(1. + exp(-data));
    }

    // Decodes one YOLO scale into Detection records. One thread handles one grid cell of
    // one batch item and loops over the CHECK_COUNT anchors of that cell.
    //
    // input      : raw network output for this scale, indexed as
    //              [batch][anchor][5 + classes][cell] (see the index arithmetic below)
    // output     : per batch item, `outputElem` floats: a float counter followed by
    //              Detection records
    // noElements : total number of (batch item, cell) pairs, i.e. number of useful threads
    // yoloWidth, yoloHeight : grid size of this scale
    // anchors    : CHECK_COUNT (width, height) pairs for this scale
    // classes    : number of class scores per anchor
    // outputElem : stride, in floats, between the output regions of two batch items
    //
    // The anchor loop bound is CHECK_COUNT (it used to be a literal 3), so it stays
    // consistent with the input stride and the anchors array size, which already use it.
    __global__ void CalDetection(const float *input, float *output,int noElements, 
            int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) {
 
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= noElements) return;

        // Split the flat thread index into batch index and cell index.
        int total_grid = yoloWidth * yoloHeight;
        int bnIdx = idx / total_grid;
        idx = idx - total_grid*bnIdx;
        int info_len_i = 5 + classes;
        const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);

        for (int k = 0; k < CHECK_COUNT; ++k) {
            // Start of this cell's values for anchor k; channel c lives at c * total_grid.
            const float* anchorInput = curInput + idx + k * info_len_i * total_grid;

            // Best class score (after the logistic function) over channels 5..5+classes-1.
            int class_id = 0;
            float max_cls_prob = 0.0;
            for (int i = 5; i < info_len_i; ++i) {
                float p = Logist(anchorInput[i * total_grid]);
                if (p > max_cls_prob) {
                    max_cls_prob = p;
                    class_id = i - 5;
                }
            }
            float box_prob = Logist(anchorInput[4 * total_grid]);
            if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue;

            // Reserve an output slot by atomically bumping this batch item's counter.
            // The counter keeps growing past MAX_OUTPUT_BBOX_COUNT; only the write is skipped.
            float *res_count = output + bnIdx*outputElem;
            int count = (int)atomicAdd(res_count, 1);
            if (count >= MAX_OUTPUT_BBOX_COUNT) return;
            char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection);
            Detection* det =  (Detection*)(data);

            int row = idx / yoloWidth;
            int col = idx % yoloWidth;

            //Location
            det->bbox[0] = (col + Logist(anchorInput[0 * total_grid])) * INPUT_W / yoloWidth;
            det->bbox[1] = (row + Logist(anchorInput[1 * total_grid])) * INPUT_H / yoloHeight;
            det->bbox[2] = exp(anchorInput[2 * total_grid]) * anchors[2*k];
            det->bbox[3] = exp(anchorInput[3 * total_grid]) * anchors[2*k + 1];
            det->det_confidence = box_prob;
            det->class_id = class_id;
            det->class_confidence = max_cls_prob;
        }
    }

    void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {

        int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);

        for(int idx = 0 ; idx < batchSize; ++idx) {
            CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float)));
        }
        int numElem = 0;
        for (unsigned int i = 0;i< mYoloKernel.size();++i)
        {
            const auto& yolo = mYoloKernel[i];
            numElem = yolo.width*yolo.height*batchSize;
            if (numElem < mThreadCount)
                mThreadCount = numElem;
            CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>
                (inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem);
        }

    }


    // Run the layer for the given batch: reinterprets the input/output pointers as float
    // buffers, forwards them to forwardGpu(), and returns 0. The workspace is unused.
    int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
    {
        const float* const* typedInputs = (const float *const *)inputs;
        float* typedOutput = (float*)outputs[0];
        forwardGpu(typedInputs, typedOutput, stream, batchSize);

        return 0;
    }

    // Storage for the creator's static members.
    PluginFieldCollection YoloPluginCreator::mFC{};
    std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

    // The creator exposes no plugin fields: the attribute list is emptied and the
    // field collection is pointed at it.
    YoloPluginCreator::YoloPluginCreator()
    {
        mPluginAttributes.clear();

        mFC.fields = mPluginAttributes.data();
        mFC.nbFields = mPluginAttributes.size();
    }

    // Name under which the plugin is created; matches YoloLayerPlugin::getPluginType().
    const char* YoloPluginCreator::getPluginName() const
    {
        static const char* const kName = "YoloLayer_TRT";
        return kName;
    }

    // Version string; matches YoloLayerPlugin::getPluginVersion().
    const char* YoloPluginCreator::getPluginVersion() const
    {
        static const char* const kVersion = "1";
        return kVersion;
    }

    // Address of the shared (static) field collection set up by the constructor.
    const PluginFieldCollection* YoloPluginCreator::getFieldNames()
    {
        const PluginFieldCollection* fields = &mFC;
        return fields;
    }

    // Create a default-constructed YoloLayerPlugin tagged with this creator's namespace.
    // The name and field collection arguments are not used.
    IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
    {
        YoloLayerPlugin* plugin = new YoloLayerPlugin();
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

    // Recreate a YoloLayerPlugin from serialized bytes and tag it with this creator's namespace.
    IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
    {
        // The returned object is heap-allocated; YoloLayerPlugin::destroy() deletes it.
        YoloLayerPlugin* plugin = new YoloLayerPlugin(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

}


================================================
FILE: yolov4/yololayer.h
================================================
#ifndef _YOLO_LAYER_H
#define _YOLO_LAYER_H

#include <iostream>
#include <vector>
#include "NvInfer.h"

// Compile-time configuration and plain data types shared by the YOLO layer plugin and
// the host-side post-processing code.
namespace Yolo
{
    // Number of anchor (width, height) pairs per scale; sizes YoloKernel::anchors.
    static constexpr int CHECK_COUNT = 3;
    // Candidates whose best class score or box score is below this value are dropped
    // by the detection kernel.
    static constexpr float IGNORE_THRESH = 0.1f;
    // Maximum number of Detection records stored per batch item.
    static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
    // Class count used by the default-constructed plugin.
    static constexpr int CLASS_NUM = 80;
    // Network input height and width in pixels.
    static constexpr int INPUT_H = 608;
    static constexpr int INPUT_W = 608;

    // Description of one detection scale: grid size plus its anchors.
    // Instances are serialized with a raw memcpy, so the layout must stay as is.
    struct YoloKernel
    {
        int width;
        int height;
        // CHECK_COUNT pairs stored as w0,h0, w1,h1, ...
        float anchors[CHECK_COUNT*2];
    };

    // Scale with a grid of INPUT / 8 cells per side.
    static constexpr YoloKernel yolo1 = {
        INPUT_W / 8,
        INPUT_H / 8,
        {12,16, 19,36, 40,28}
    };
    // Scale with a grid of INPUT / 16 cells per side.
    static constexpr YoloKernel yolo2 = {
        INPUT_W / 16,
        INPUT_H / 16,
        {36,75, 76,55, 72,146}
    };
    // Scale with a grid of INPUT / 32 cells per side.
    static constexpr YoloKernel yolo3 = {
        INPUT_W / 32,
        INPUT_H / 32,
        {142,110, 192,243, 459,401}
    };

    // Number of box coordinates per detection.
    static constexpr int LOCATIONS = 4;
    // One detection as written by the kernel: all fields are floats, so a record can be
    // treated as sizeof(Detection) / sizeof(float) consecutive floats.
    struct alignas(float) Detection{
        //x y w h
        float bbox[LOCATIONS];
        // Box (objectness) score.
        float det_confidence;
        // Class index, stored as a float.
        float class_id;
        // Score of the selected class.
        float class_confidence;
    };
}


namespace nvinfer1
{
    class YoloLayerPlugin: public IPluginV2IOExt
    {
        public:
            explicit YoloLayerPlugin();
            YoloLayerPlugin(const void* data, size_t length);

            ~YoloLayerPlugin();

            int getNbOutputs() const override
            {
                return 1;
            }

            Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;

            int initialize() override;

            virtual void terminate() override {};

            virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}

            virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;

            virtual size_t getSerializationSize() const override;

            virtual void serialize(void* buffer) const override;

            bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
                return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
            }

            const char* getPluginType() const override;

            const char* getPluginVersion() const override;

            void destroy() override;

            IPluginV2IOExt* clone() const override;

            void setPluginNamespace(const char* pluginNamespace) override;

            const char* getPluginNamespace() const override;

            DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

            bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;

            bool canBroadcastInputAcrossBatch(int inputIndex) const override;

            void attachToContext(
                    cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;

            void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;

            void detachFromContext() override;

        private:
            void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1);
            int mClassCount;
            int mKernelCount;
            std::vector<Yolo::YoloKernel> mYoloKernel;
            int mThreadCount = 256;
            void** mAnchor;
            const char* mPluginNamespace;
    };

    // Factory that creates and deserializes YoloLayerPlugin instances; registered through
    // the REGISTER_TENSORRT_PLUGIN macro that follows the class.
    class YoloPluginCreator : public IPluginCreator
    {
        public:
            YoloPluginCreator();

            ~YoloPluginCreator() override = default;

            const char* getPluginName() const override;

            const char* getPluginVersion() const override;

            const PluginFieldCollection* getFieldNames() override;

            IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;

            IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

            // Copies the given namespace string into the creator.
            void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }

            // Returns the namespace copy owned by this creator.
            const char* getPluginNamespace() const override { return mNamespace.c_str(); }

        private:
            std::string mNamespace;
            // Shared by all creator instances; populated in the constructor.
            static PluginFieldCollection mFC;
            static std::vector<PluginField> mPluginAttributes;
    };
    REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};

#endif 


================================================
FILE: yolov4/yolov4.cpp
================================================
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "utils.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include "yololayer.h"
#include "mish.h"

// Build-time switches and thresholds for this sample.
#define USE_FP16  // comment out this if want to use FP32
#define DEVICE 0  // GPU id
// Default IoU threshold for nms(): boxes overlapping a kept box by more than this are dropped.
#define NMS_THRESH 0.4
// nms() skips detections whose det_confidence is <= this value.
#define BBOX_CONF_THRESH 0.5
// NOTE(review): presumably the inference batch size; its use is outside this excerpt.
#define BATCH_SIZE 1

using namespace nvinfer1;

// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
// Number of floats in one Yolo::Detection record.
static const int DETECTION_SIZE = sizeof(Yolo::Detection) / sizeof(float);
// One leading count element plus room for MAX_OUTPUT_BBOX_COUNT detection records.
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DETECTION_SIZE + 1;  // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
// Tensor names for the network input and output.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
static Logger gLogger;

// Letterbox an image to INPUT_W x INPUT_H: the image is resized with its aspect ratio
// preserved so that it fits inside the target, then centred on a canvas filled with
// the value 128 in all three channels.
cv::Mat preprocess_img(cv::Mat& img) {
    const float r_w = INPUT_W / (img.cols*1.0);
    const float r_h = INPUT_H / (img.rows*1.0);

    // Start from "fills the whole canvas" and shrink the side that would overflow.
    int w = INPUT_W;
    int h = INPUT_H;
    int x = 0;
    int y = 0;
    if (r_h > r_w) {
        // Width is the limiting side: pad top and bottom.
        h = r_w * img.rows;
        y = (INPUT_H - h) / 2;
    } else {
        // Height is the limiting side: pad left and right.
        w = r_h* img.cols;
        x = (INPUT_W - w) / 2;
    }

    cv::Mat resized(h, w, CV_8UC3);
    cv::resize(img, resized, resized.size());

    cv::Mat canvas(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(x, y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Map a box given as (center x, center y, width, height) in the letterboxed network
// input back to a rectangle in the coordinates of the original image `img`.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float r_w = INPUT_W / (img.cols * 1.0);
    const float r_h = INPUT_H / (img.rows * 1.0);

    // Each edge is truncated to an int first, then divided by the resize ratio and
    // truncated again.
    auto toImage = [](float edge, float ratio) -> int {
        int px = edge;
        px = px / ratio;
        return px;
    };

    int l, r, t, b;
    if (r_h > r_w) {
        // Padding was added above and below: remove the vertical offset.
        const float padY = (INPUT_H - r_w * img.rows) / 2;
        l = toImage(bbox[0] - bbox[2]/2.f, r_w);
        r = toImage(bbox[0] + bbox[2]/2.f, r_w);
        t = toImage(bbox[1] - bbox[3]/2.f - padY, r_w);
        b = toImage(bbox[1] + bbox[3]/2.f - padY, r_w);
    } else {
        // Padding was added left and right: remove the horizontal offset.
        const float padX = (INPUT_W - r_h * img.cols) / 2;
        l = toImage(bbox[0] - bbox[2]/2.f - padX, r_h);
        r = toImage(bbox[0] + bbox[2]/2.f - padX, r_h);
        t = toImage(bbox[1] - bbox[3]/2.f, r_h);
        b = toImage(bbox[1] + bbox[3]/2.f, r_h);
    }
    return cv::Rect(l, t, r-l, b-t);
}

// Intersection-over-union of two boxes given as (center x, center y, width, height).
// Returns 0 when the boxes do not overlap.
float iou(float lbox[4], float rbox[4]) {
    // Edges of the intersection rectangle.
    const float left   = std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f);
    const float right  = std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f);
    const float top    = std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f);
    const float bottom = std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f);

    if (top > bottom || left > right) {
        return 0.0f;
    }

    const float interArea = (right - left) * (bottom - top);
    const float unionArea = lbox[2]*lbox[3] + rbox[2]*rbox[3] - interArea;
    return interArea / unionArea;
}

// Sort predicate: orders detections by descending det_confidence.
bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
    const bool aRanksFirst = b.det_confidence < a.det_confidence;
    return aRanksFirst;
}

// Per-class non-maximum suppression over the raw layer output.
//
// output[0] holds the number of detections; the records follow, DETECTION_SIZE floats
// each. Detections with det_confidence <= BBOX_CONF_THRESH are ignored. Within each
// class, detections are visited by descending det_confidence; each visited detection is
// appended to `res` and every later one overlapping it with IoU > nms_thresh is removed.
void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
    // Bucket the sufficiently confident detections by class id.
    std::map<float, std::vector<Yolo::Detection>> byClass;
    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
        float* record = &output[1 + DETECTION_SIZE * i];
        if (record[4] <= BBOX_CONF_THRESH) continue;
        Yolo::Detection det;
        memcpy(&det, record, DETECTION_SIZE * sizeof(float));
        // operator[] creates the empty bucket on first use.
        byClass[det.class_id].push_back(det);
    }

    for (auto& bucket : byClass) {
        auto& dets = bucket.second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t keep = 0; keep < dets.size(); ++keep) {
            auto& item = dets[keep];
            res.push_back(item);
            // Drop every lower-ranked box that overlaps the kept one too much.
            size_t other = keep + 1;
            while (other < dets.size()) {
                if (iou(item.bbox, dets[other].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + other);
                } else {
                    ++other;
                }
            }
        }
    }
}

// Loads a weight file into a map from blob name to Weights.
//
// File format, as read below: a decimal blob count, then for each blob
//   <name> <size in decimal> <size values in hex>
// Every blob is stored as DataType::kFLOAT; the hex words are kept as raw 32-bit values.
//
// Parameters:
//   file - path of the weight file.
// Returns:
//   the populated map. Each Weights::values buffer is allocated with malloc and is
//   not freed by this function.
// Failure to open the file or a non-positive blob count is reported through assert.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The buffer is sized by the element type; the previous
        // sizeof(val) was the size of the pointer and over-allocated the buffer.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Unable to allocate weight blob.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer that applies the batch-norm statistics stored
// under `lname` in weightMap:
//   scale = weight / sqrt(running_var + eps)
//   shift = bias - running_mean * weight / sqrt(running_var + eps)
//   power = 1
// The three generated buffers are malloc()'d and registered in weightMap as
// lname + ".scale" / ".shift" / ".power".
// Returns the created IScaleLayer (asserted non-null).
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float *gamma = (float*)weightMap[lname + ".weight"].values;
    float *beta = (float*)weightMap[lname + ".bias"].values;
    float *mean = (float*)weightMap[lname + ".running_mean"].values;
    float *var = (float*)weightMap[lname + ".running_var"].values;
    // Channel count is taken from the running_var blob.
    int len = weightMap[lname + ".running_var"].count;

    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // Fill all three per-channel arrays in one pass.
    for (int ch = 0; ch < len; ++ch) {
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Register the buffers in the weight map alongside the loaded blobs.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

// Adds convolution (no bias) -> batch norm -> "Mish_TRT" plugin to the network.
//   outch - number of convolution output maps
//   ksize - square kernel size
//   s, p  - stride and padding applied in both spatial dimensions
//   linx  - layer index; weights are looked up under "module_list.<linx>."
// Returns the plugin layer.
ILayer* convBnMish(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
    const std::string prefix = "module_list." + std::to_string(linx);

    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[prefix + ".Conv2d.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{p, p});

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), prefix + ".BatchNorm2d", 1e-4);

    // The activation is provided by a plugin looked up in the plugin registry.
    const std::string pluginName = "mish" + std::to_string(linx);
    auto creator = getPluginRegistry()->getPluginCreator("Mish_TRT", "1");
    const PluginFieldCollection* pluginData = creator->getFieldNames();
    IPluginV2 *pluginObj = creator->createPlugin(pluginName.c_str(), pluginData);

    ITensor* bnOutput = bn1->getOutput(0);
    return network->addPluginV2(&bnOutput, 1, *pluginObj);
}

// Adds convolution (no bias) -> batch norm -> leaky ReLU (alpha 0.1) to the network.
//   outch - number of convolution output maps
//   ksize - square kernel size
//   s, p  - stride and padding applied in both spatial dimensions
//   linx  - layer index; weights are looked up under "module_list.<linx>."
// Returns the activation layer.
ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
    const std::string prefix = "module_list." + std::to_string(linx);

    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[prefix + ".Conv2d.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{p, p});

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), prefix + ".BatchNorm2d", 1e-4);

    auto leaky = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky->setAlpha(0.1);
    return leaky;
}

// Create the engine using only the API and not any parser.
//
// Builds the whole network layer by layer on a network created from `builder`,
// loading weights from "../yolov4.wts", then builds and returns the engine.
//   maxBatchSize - forwarded to builder->setMaxBatchSize()
//   builder      - used to create the network and to build the engine
//   config       - receives the workspace size (and the FP16 flag if USE_FP16 is defined)
//   dt           - data type of the network input tensor
// The network object and all host weight buffers held in weightMap are released
// before returning. The returned engine is not checked here; callers should
// verify it is non-null.
//
// Naming: lN / ewN / catN / poolN / convN / deconvN carry the layer index N that
// is also used to look up weights ("module_list.<N>..."). Plain assignments such
// as `auto l3 = l1;` add no layer; they only give an existing layer a second
// index so later layers can be wired to it.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../yolov4.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    // define each layer.
    // ---- Layers 0-104: conv + batch norm + Mish blocks, with element-wise sums
    // ---- and concatenations between them.
    auto l0 = convBnMish(network, weightMap, *data, 32, 3, 1, 1, 0);
    auto l1 = convBnMish(network, weightMap, *l0->getOutput(0), 64, 3, 2, 1, 1);
    auto l2 = convBnMish(network, weightMap, *l1->getOutput(0), 64, 1, 1, 0, 2);
    // Index 3 reuses l1's output (no new layer).
    auto l3 = l1;
    auto l4 = convBnMish(network, weightMap, *l3->getOutput(0), 64, 1, 1, 0, 4);
    auto l5 = convBnMish(network, weightMap, *l4->getOutput(0), 32, 1, 1, 0, 5);
    auto l6 = convBnMish(network, weightMap, *l5->getOutput(0), 64, 3, 1, 1, 6);
    // Element-wise sum of the two-conv branch (l6) with its starting point (l4).
    auto ew7 = network->addElementWise(*l6->getOutput(0), *l4->getOutput(0), ElementWiseOperation::kSUM);
    auto l8 = convBnMish(network, weightMap, *ew7->getOutput(0), 64, 1, 1, 0, 8);

    // Concatenate the processed branch (l8) with the side branch (l2).
    ITensor* inputTensors9[] = {l8->getOutput(0), l2->getOutput(0)};
    auto cat9 = network->addConcatenation(inputTensors9, 2);

    auto l10 = convBnMish(network, weightMap, *cat9->getOutput(0), 64, 1, 1, 0, 10);
    // Stride-2 convolution, then the same split / sum / concat pattern with two sums.
    auto l11 = convBnMish(network, weightMap, *l10->getOutput(0), 128, 3, 2, 1, 11);
    auto l12 = convBnMish(network, weightMap, *l11->getOutput(0), 64, 1, 1, 0, 12);
    auto l13 = l11;
    auto l14 = convBnMish(network, weightMap, *l13->getOutput(0), 64, 1, 1, 0, 14);
    auto l15 = convBnMish(network, weightMap, *l14->getOutput(0), 64, 1, 1, 0, 15);
    auto l16 = convBnMish(network, weightMap, *l15->getOutput(0), 64, 3, 1, 1, 16);
    auto ew17 = network->addElementWise(*l16->getOutput(0), *l14->getOutput(0), ElementWiseOperation::kSUM);
    auto l18 = convBnMish(network, weightMap, *ew17->getOutput(0), 64, 1, 1, 0, 18);
    auto l19 = convBnMish(network, weightMap, *l18->getOutput(0), 64, 3, 1, 1, 19);
    auto ew20 = network->addElementWise(*l19->getOutput(0), *ew17->getOutput(0), ElementWiseOperation::kSUM);
    auto l21 = convBnMish(network, weightMap, *ew20->getOutput(0), 64, 1, 1, 0, 21);

    ITensor* inputTensors22[] = {l21->getOutput(0), l12->getOutput(0)};
    auto cat22 = network->addConcatenation(inputTensors22, 2);

    auto l23 = convBnMish(network, weightMap, *cat22->getOutput(0), 128, 1, 1, 0, 23);
    // Stride-2 convolution, then the split / sum / concat pattern with eight sums.
    auto l24 = convBnMish(network, weightMap, *l23->getOutput(0), 256, 3, 2, 1, 24);
    auto l25 = convBnMish(network, weightMap, *l24->getOutput(0), 128, 1, 1, 0, 25);
    auto l26 = l24;
    auto l27 = convBnMish(network, weightMap, *l26->getOutput(0), 128, 1, 1, 0, 27);
    auto l28 = convBnMish(network, weightMap, *l27->getOutput(0), 128, 1, 1, 0, 28);
    auto l29 = convBnMish(network, weightMap, *l28->getOutput(0), 128, 3, 1, 1, 29);
    auto ew30 = network->addElementWise(*l29->getOutput(0), *l27->getOutput(0), ElementWiseOperation::kSUM);
    auto l31 = convBnMish(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31);
    auto l32 = convBnMish(network, weightMap, *l31->getOutput(0), 128, 3, 1, 1, 32);
    auto ew33 = network->addElementWise(*l32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM);
    auto l34 = convBnMish(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34);
    auto l35 = convBnMish(network, weightMap, *l34->getOutput(0), 128, 3, 1, 1, 35);
    auto ew36 = network->addElementWise(*l35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM);
    auto l37 = convBnMish(network, weightMap, *ew36->getOutput(0), 128, 1, 1, 0, 37);
    auto l38 = convBnMish(network, weightMap, *l37->getOutput(0), 128, 3, 1, 1, 38);
    auto ew39 = network->addElementWise(*l38->getOutput(0), *ew36->getOutput(0), ElementWiseOperation::kSUM);
    auto l40 = convBnMish(network, weightMap, *ew39->getOutput(0), 128, 1, 1, 0, 40);
    auto l41 = convBnMish(network, weightMap, *l40->getOutput(0), 128, 3, 1, 1, 41);
    auto ew42 = network->addElementWise(*l41->getOutput(0), *ew39->getOutput(0), ElementWiseOperation::kSUM);
    auto l43 = convBnMish(network, weightMap, *ew42->getOutput(0), 128, 1, 1, 0, 43);
    auto l44 = convBnMish(network, weightMap, *l43->getOutput(0), 128, 3, 1, 1, 44);
    auto ew45 = network->addElementWise(*l44->getOutput(0), *ew42->getOutput(0), ElementWiseOperation::kSUM);
    auto l46 = convBnMish(network, weightMap, *ew45->getOutput(0), 128, 1, 1, 0, 46);
    auto l47 = convBnMish(network, weightMap, *l46->getOutput(0), 128, 3, 1, 1, 47);
    auto ew48 = network->addElementWise(*l47->getOutput(0), *ew45->getOutput(0), ElementWiseOperation::kSUM);
    auto l49 = convBnMish(network, weightMap, *ew48->getOutput(0), 128, 1, 1, 0, 49);
    auto l50 = convBnMish(network, weightMap, *l49->getOutput(0), 128, 3, 1, 1, 50);
    auto ew51 = network->addElementWise(*l50->getOutput(0), *ew48->getOutput(0), ElementWiseOperation::kSUM);
    auto l52 = convBnMish(network, weightMap, *ew51->getOutput(0), 128, 1, 1, 0, 52);

    ITensor* inputTensors53[] = {l52->getOutput(0), l25->getOutput(0)};
    auto cat53 = network->addConcatenation(inputTensors53, 2);

    // l54 is reused later as the input of l130.
    auto l54 = convBnMish(network, weightMap, *cat53->getOutput(0), 256, 1, 1, 0, 54);
    // Stride-2 convolution, then the split / sum / concat pattern with eight sums.
    auto l55 = convBnMish(network, weightMap, *l54->getOutput(0), 512, 3, 2, 1, 55);
    auto l56 = convBnMish(network, weightMap, *l55->getOutput(0), 256, 1, 1, 0, 56);
    auto l57 = l55;
    auto l58 = convBnMish(network, weightMap, *l57->getOutput(0), 256, 1, 1, 0, 58);
    auto l59 = convBnMish(network, weightMap, *l58->getOutput(0), 256, 1, 1, 0, 59);
    auto l60 = convBnMish(network, weightMap, *l59->getOutput(0), 256, 3, 1, 1, 60);
    auto ew61 = network->addElementWise(*l60->getOutput(0), *l58->getOutput(0), ElementWiseOperation::kSUM);
    auto l62 = convBnMish(network, weightMap, *ew61->getOutput(0), 256, 1, 1, 0, 62);
    auto l63 = convBnMish(network, weightMap, *l62->getOutput(0), 256, 3, 1, 1, 63);
    auto ew64 = network->addElementWise(*l63->getOutput(0), *ew61->getOutput(0), ElementWiseOperation::kSUM);
    auto l65 = convBnMish(network, weightMap, *ew64->getOutput(0), 256, 1, 1, 0, 65);
    auto l66 = convBnMish(network, weightMap, *l65->getOutput(0), 256, 3, 1, 1, 66);
    auto ew67 = network->addElementWise(*l66->getOutput(0), *ew64->getOutput(0), ElementWiseOperation::kSUM);
    auto l68 = convBnMish(network, weightMap, *ew67->getOutput(0), 256, 1, 1, 0, 68);
    auto l69 = convBnMish(network, weightMap, *l68->getOutput(0), 256, 3, 1, 1, 69);
    auto ew70 = network->addElementWise(*l69->getOutput(0), *ew67->getOutput(0), ElementWiseOperation::kSUM);
    auto l71 = convBnMish(network, weightMap, *ew70->getOutput(0), 256, 1, 1, 0, 71);
    auto l72 = convBnMish(network, weightMap, *l71->getOutput(0), 256, 3, 1, 1, 72);
    auto ew73 = network->addElementWise(*l72->getOutput(0), *ew70->getOutput(0), ElementWiseOperation::kSUM);
    auto l74 = convBnMish(network, weightMap, *ew73->getOutput(0), 256, 1, 1, 0, 74);
    auto l75 = convBnMish(network, weightMap, *l74->getOutput(0), 256, 3, 1, 1, 75);
    auto ew76 = network->addElementWise(*l75->getOutput(0), *ew73->getOutput(0), ElementWiseOperation::kSUM);
    auto l77 = convBnMish(network, weightMap, *ew76->getOutput(0), 256, 1, 1, 0, 77);
    auto l78 = convBnMish(network, weightMap, *l77->getOutput(0), 256, 3, 1, 1, 78);
    auto ew79 = network->addElementWise(*l78->getOutput(0), *ew76->getOutput(0), ElementWiseOperation::kSUM);
    auto l80 = convBnMish(network, weightMap, *ew79->getOutput(0), 256, 1, 1, 0, 80);
    auto l81 = convBnMish(network, weightMap, *l80->getOutput(0), 256, 3, 1, 1, 81);
    auto ew82 = network->addElementWise(*l81->getOutput(0), *ew79->getOutput(0), ElementWiseOperation::kSUM);
    auto l83 = convBnMish(network, weightMap, *ew82->getOutput(0), 256, 1, 1, 0, 83);

    ITensor* inputTensors84[] = {l83->getOutput(0), l56->getOutput(0)};
    auto cat84 = network->addConcatenation(inputTensors84, 2);

    // l85 is reused later as the input of l120.
    auto l85 = convBnMish(network, weightMap, *cat84->getOutput(0), 512, 1, 1, 0, 85);
    // Stride-2 convolution, then the split / sum / concat pattern with four sums.
    auto l86 = convBnMish(network, weightMap, *l85->getOutput(0), 1024, 3, 2, 1, 86);
    auto l87 = convBnMish(network, weightMap, *l86->getOutput(0), 512, 1, 1, 0, 87);
    auto l88 = l86;
    auto l89 = convBnMish(network, weightMap, *l88->getOutput(0), 512, 1, 1, 0, 89);
    auto l90 = convBnMish(network, weightMap, *l89->getOutput(0), 512, 1, 1, 0, 90);
    auto l91 = convBnMish(network, weightMap, *l90->getOutput(0), 512, 3, 1, 1, 91);
    auto ew92 = network->addElementWise(*l91->getOutput(0), *l89->getOutput(0), ElementWiseOperation::kSUM);
    auto l93 = convBnMish(network, weightMap, *ew92->getOutput(0), 512, 1, 1, 0, 93);
    auto l94 = convBnMish(network, weightMap, *l93->getOutput(0), 512, 3, 1, 1, 94);
    auto ew95 = network->addElementWise(*l94->getOutput(0), *ew92->getOutput(0), ElementWiseOperation::kSUM);
    auto l96 = convBnMish(network, weightMap, *ew95->getOutput(0), 512, 1, 1, 0, 96);
    auto l97 = convBnMish(network, weightMap, *l96->getOutput(0), 512, 3, 1, 1, 97);
    auto ew98 = network->addElementWise(*l97->getOutput(0), *ew95->getOutput(0), ElementWiseOperation::kSUM);
    auto l99 = convBnMish(network, weightMap, *ew98->getOutput(0), 512, 1, 1, 0, 99);
    auto l100 = convBnMish(network, weightMap, *l99->getOutput(0), 512, 3, 1, 1, 100);
    auto ew101 = network->addElementWise(*l100->getOutput(0), *ew98->getOutput(0), ElementWiseOperation::kSUM);
    auto l102 = convBnMish(network, weightMap, *ew101->getOutput(0), 512, 1, 1, 0, 102);

    ITensor* inputTensors103[] = {l102->getOutput(0), l87->getOutput(0)};
    auto cat103 = network->addConcatenation(inputTensors103, 2);

    auto l104 = convBnMish(network, weightMap, *cat103->getOutput(0), 1024, 1, 1, 0, 104);

    // ---------
    // ---- From layer 105 on, blocks use conv + batch norm + leaky ReLU.
    auto l105 = convBnLeaky(network, weightMap, *l104->getOutput(0), 512, 1, 1, 0, 105);
    auto l106 = convBnLeaky(network, weightMap, *l105->getOutput(0), 1024, 3, 1, 1, 106);
    auto l107 = convBnLeaky(network, weightMap, *l106->getOutput(0), 512, 1, 1, 0, 107);

    // Three stride-1 max-poolings of l107 with windows 5, 9 and 13; the padding
    // (2, 4, 6) is half the window, so each pooling keeps the spatial size.
    auto pool108 = network->addPoolingNd(*l107->getOutput(0), PoolingType::kMAX, DimsHW{5, 5});
    pool108->setPaddingNd(DimsHW{2, 2});
    pool108->setStrideNd(DimsHW{1, 1});

    auto l109 = l107;

    auto pool110 = network->addPoolingNd(*l109->getOutput(0), PoolingType::kMAX, DimsHW{9, 9});
    pool110->setPaddingNd(DimsHW{4, 4});
    pool110->setStrideNd(DimsHW{1, 1});

    auto l111 = l107;

    auto pool112 = network->addPoolingNd(*l111->getOutput(0), PoolingType::kMAX, DimsHW{13, 13});
    pool112->setPaddingNd(DimsHW{6, 6});
    pool112->setStrideNd(DimsHW{1, 1});

    // Concatenate the three pooled tensors with the un-pooled l107 output.
    ITensor* inputTensors113[] = {pool112->getOutput(0), pool110->getOutput(0), pool108->getOutput(0), l107->getOutput(0)};
    auto cat113 = network->addConcatenation(inputTensors113, 4);

    auto l114 = convBnLeaky(network, weightMap, *cat113->getOutput(0), 512, 1, 1, 0, 114);
    auto l115 = convBnLeaky(network, weightMap, *l114->getOutput(0), 1024, 3, 1, 1, 115);
    // l116 is reused later in cat153.
    auto l116 = convBnLeaky(network, weightMap, *l115->getOutput(0), 512, 1, 1, 0, 116);
    auto l117 = convBnLeaky(network, weightMap, *l116->getOutput(0), 256, 1, 1, 0, 117);

    // All-ones 2x2 kernels for a stride-2 deconvolution with one group per
    // channel: each input value is written to a 2x2 output patch, doubling the
    // spatial size (presumably standing in for a nearest-neighbour upsample).
    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
    for (int i = 0; i < 256 * 2 * 2; i++) {
        deval[i] = 1.0;
    }
    Weights deconvwts118{DataType::kFLOAT, deval, 256 * 2 * 2};
    IDeconvolutionLayer* deconv118 = network->addDeconvolutionNd(*l117->getOutput(0), 256, DimsHW{2, 2}, deconvwts118, emptywts);
    assert(deconv118);
    deconv118->setStrideNd(DimsHW{2, 2});
    deconv118->setNbGroups(256);
    // Registered so that the release loop at the end of this function frees deval.
    weightMap["deconv118"] = deconvwts118;

    auto l119 = l85;
    auto l120 = convBnLeaky(network, weightMap, *l119->getOutput(0), 256, 1, 1, 0, 120);

    ITensor* inputTensors121[] = {l120->getOutput(0), deconv118->getOutput(0)};
    auto cat121 = network->addConcatenation(inputTensors121, 2);

    auto l122 = convBnLeaky(network, weightMap, *cat121->getOutput(0), 256, 1, 1, 0, 122);
    auto l123 = convBnLeaky(network, weightMap, *l122->getOutput(0), 512, 3, 1, 1, 123);
    auto l124 = convBnLeaky(network, weightMap, *l123->getOutput(0), 256, 1, 1, 0, 124);
    auto l125 = convBnLeaky(network, weightMap, *l124->getOutput(0), 512, 3, 1, 1, 125);
    // l126 is reused later in cat142.
    auto l126 = convBnLeaky(network, weightMap, *l125->getOutput(0), 256, 1, 1, 0, 126);
    auto l127 = convBnLeaky(network, weightMap, *l126->getOutput(0), 128, 1, 1, 0, 127);

    // Second 2x upsampling deconvolution. It reuses the first 128 * 2 * 2 floats
    // of deval and is deliberately NOT added to weightMap, so the shared buffer
    // is freed only once (through the "deconv118" entry).
    Weights deconvwts128{DataType::kFLOAT, deval, 128 * 2 * 2};
    IDeconvolutionLayer* deconv128 = network->addDeconvolutionNd(*l127->getOutput(0), 128, DimsHW{2, 2}, deconvwts128, emptywts);
    assert(deconv128);
    deconv128->setStrideNd(DimsHW{2, 2});
    deconv128->setNbGroups(128);

    auto l129 = l54;
    auto l130 = convBnLeaky(network, weightMap, *l129->getOutput(0), 128, 1, 1, 0, 130);

    ITensor* inputTensors131[] = {l130->getOutput(0), deconv128->getOutput(0)};
    auto cat131 = network->addConcatenation(inputTensors131, 2);

    auto l132 = convBnLeaky(network, weightMap, *cat131->getOutput(0), 128, 1, 1, 0, 132);
    auto l133 = convBnLeaky(network, weightMap, *l132->getOutput(0), 256, 3, 1, 1, 133);
    auto l134 = convBnLeaky(network, weightMap, *l133->getOutput(0), 128, 1, 1, 0, 134);
    auto l135 = convBnLeaky(network, weightMap, *l134->getOutput(0), 256, 3, 1, 1, 135);
    auto l136 = convBnLeaky(network, weightMap, *l135->getOutput(0), 128, 1, 1, 0, 136);
    auto l137 = convBnLeaky(network, weightMap, *l136->getOutput(0), 256, 3, 1, 1, 137);
    // First of three 1x1 convolutions (with bias) producing 3 * (CLASS_NUM + 5)
    // output maps; these feed the yolo plugin below.
    IConvolutionLayer* conv138 = network->addConvolutionNd(*l137->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.138.Conv2d.weight"], weightMap["module_list.138.Conv2d.bias"]);
    assert(conv138);
    // 139 is yolo layer

    auto l140 = l136;
    // Stride-2 convolution of l136, concatenated with l126.
    auto l141 = convBnLeaky(network, weightMap, *l140->getOutput(0), 256, 3, 2, 1, 141);

    ITensor* inputTensors142[] = {l141->getOutput(0), l126->getOutput(0)};
    auto cat142 = network->addConcatenation(inputTensors142, 2);

    auto l143 = convBnLeaky(network, weightMap, *cat142->getOutput(0), 256, 1, 1, 0, 143);
    auto l144 = convBnLeaky(network, weightMap, *l143->getOutput(0), 512, 3, 1, 1, 144);
    auto l145 = convBnLeaky(network, weightMap, *l144->getOutput(0), 256, 1, 1, 0, 145);
    auto l146 = convBnLeaky(network, weightMap, *l145->getOutput(0), 512, 3, 1, 1, 146);
    auto l147 = convBnLeaky(network, weightMap, *l146->getOutput(0), 256, 1, 1, 0, 147);
    auto l148 = convBnLeaky(network, weightMap, *l147->getOutput(0), 512, 3, 1, 1, 148);
    IConvolutionLayer* conv149 = network->addConvolutionNd(*l148->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.149.Conv2d.weight"], weightMap["module_list.149.Conv2d.bias"]);
    assert(conv149);
    // 150 is yolo layer

    auto l151 = l147;
    // Stride-2 convolution of l147, concatenated with l116.
    auto l152 = convBnLeaky(network, weightMap, *l151->getOutput(0), 512, 3, 2, 1, 152);

    ITensor* inputTensors153[] = {l152->getOutput(0), l116->getOutput(0)};
    auto cat153 = network->addConcatenation(inputTensors153, 2);

    auto l154 = convBnLeaky(network, weightMap, *cat153->getOutput(0), 512, 1, 1, 0, 154);
    auto l155 = convBnLeaky(network, weightMap, *l154->getOutput(0), 1024, 3, 1, 1, 155);
    auto l156 = convBnLeaky(network, weightMap, *l155->getOutput(0), 512, 1, 1, 0, 156);
    auto l157 = convBnLeaky(network, weightMap, *l156->getOutput(0), 1024, 3, 1, 1, 157);
    auto l158 = convBnLeaky(network, weightMap, *l157->getOutput(0), 512, 1, 1, 0, 158);
    auto l159 = convBnLeaky(network, weightMap, *l158->getOutput(0), 1024, 3, 1, 1, 159);
    IConvolutionLayer* conv160 = network->addConvolutionNd(*l159->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.160.Conv2d.weight"], weightMap["module_list.160.Conv2d.bias"]);
    assert(conv160);
    // 161 is yolo layer

    // A single "YoloLayer_TRT" plugin instance consumes all three convolution
    // outputs; its first output becomes the network output.
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    const PluginFieldCollection* pluginData = creator->getFieldNames();
    IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData);
    ITensor* inputTensors_yolo[] = {conv138->getOutput(0), conv149->getOutput(0), conv160->getOutput(0)};
    auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj);

    yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building tensorrt engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // NOTE(review): this message is printed unconditionally; `engine` is not
    // checked for nullptr here.
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    // Frees every buffer registered in weightMap: the blobs from loadWeights(),
    // the scale/shift/power arrays added by addBatchNorm2d(), and deval.
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Builds the engine through createEngine() and stores its serialized form in
// *modelStream. The builder, builder config and engine created here are
// destroyed before returning; the serialized buffer is handed to the caller.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Builder and its configuration object.
    auto* builder = createInferBuilder(gLogger);
    auto* config = builder->createBuilderConfig();

    // Populate the network, set the outputs and build the engine (FP32 input).
    auto* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Hand the serialized engine to the caller.
    *modelStream = engine->serialize();

    // Tear down in the same order as before: engine, builder, config.
    engine->destroy();
    builder->destroy();
    config->destroy();
}

// Runs one batch through the execution context.
//   context   - execution context whose engine must expose exactly two bindings
//               (INPUT_BLOB_NAME and OUTPUT_BLOB_NAME)
//   input     - host buffer with batchSize * 3 * INPUT_H * INPUT_W floats
//   output    - host buffer receiving batchSize * OUTPUT_SIZE floats
//   batchSize - number of samples in `input`
// Device buffers and the CUDA stream are created and released on every call.
// The function blocks until the output has been copied back to the host.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    const size_t inputBytes = batchSize * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = batchSize * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // Report the failure instead of silently returning whatever the output buffer holds.
        std::cerr << "doInference: failed to enqueue inference" << std::endl;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work above surface at synchronization, so check it too.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
}

// Appends the name of every entry of directory `p_dir_name` to `file_names`,
// skipping the "." and ".." entries. Only the bare entry names are stored
// (not prefixed with the directory path).
// Returns 0 on success, -1 if the directory cannot be opened.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir = opendir(p_dir_name);
    if (dir == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const bool is_self = strcmp(entry->d_name, ".") == 0;
        const bool is_parent = strcmp(entry->d_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.push_back(std::string(entry->d_name));
    }

    closedir(dir);
    return 0;
}

// Entry point.
//   ./yolov4 -s          build the engine and write it to "yolov4.engine"
//   ./yolov4 -d <dir>    load "yolov4.engine", run detection on every file in
//                        <dir> and write annotated copies as "_<file name>"
// Returns 0 on success and -1 on bad arguments or I/O failure.
int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // create a model using the API directly and serialize it to a stream
    char *trtModelStream{nullptr};
    size_t size{0};

    if (argc == 2 && std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(BATCH_SIZE, &modelStream);
        assert(modelStream != nullptr);
        std::ofstream p("yolov4.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    } else if (argc == 3 && std::string(argv[1]) == "-d") {
        std::ifstream file("yolov4.engine", std::ios::binary);
        if (!file.good()) {
            // Without the plan file there is nothing to deserialize; fail early
            // instead of handing a null buffer to the runtime.
            std::cerr << "could not read yolov4.engine, run './yolov4 -s' first" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov4 -s  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov4 -d ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    std::vector<std::string> file_names;
    if (read_files_in_dir(argv[2], file_names) < 0) {
        std::cout << "read_files_in_dir failed." << std::endl;
        delete[] trtModelStream;
        return -1;
    }

    // prepare input data ---------------------------
    // Static storage: these buffers are too large to place on the stack.
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Files are accumulated until a full batch is available (or the list ends);
    // fcount is the number of files in the current batch.
    int fcount = 0;
    for (int f = 0; f < (int)file_names.size(); f++) {
        fcount++;
        if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue;
        for (int b = 0; b < fcount; b++) {
            cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]);
            if (img.empty()) continue;
            cv::Mat pr_img = preprocess_img(img);
            // Convert interleaved BGR bytes to planar RGB floats in [0, 1].
            for (int i = 0; i < INPUT_H * INPUT_W; i++) {
                data[b * 3 * INPUT_H * INPUT_W + i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
                data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
                data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
            }
        }

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, BATCH_SIZE);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            nms(res, &prob[b * OUTPUT_SIZE]);
        }
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]);
            // Entries that are not readable images were skipped during
            // preprocessing; skip them here too rather than drawing on and
            // writing an empty image.
            if (img.empty()) continue;
            for (size_t j = 0; j < res.size(); j++) {
                cv::Rect r = get_rect(img, res[j].bbox);
                cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
                cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
            }
            cv::imwrite("_" + file_names[f - fcount + 1 + b], img);
        }
        fcount = 0;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    return 0;
}


================================================
FILE: yolov5/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.10)

project(yolov5)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
# option() takes <variable> "<help text>" [value]; the help text must be given
# explicitly, otherwise "OFF" is interpreted as the help string.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

# TODO(Call for PR): make cmake compatible with Windows
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
# TODO(Call for PR): make TRT path configurable from command line
include_directories(/home/nvidia/TensorRT-8.2.5.1/include/)
link_directories(/home/nvidia/TensorRT-8.2.5.1/lib/)

include_directories(${PROJECT_SOURCE_DIR}/src/)
include_directories(${PROJECT_SOURCE_DIR}/plugin/)
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)

# Custom TensorRT plugins, built as a shared library used by every executable.
add_library(myplugins SHARED ${PLUGIN_SRCS})
target_link_libraries(myplugins nvinfer cudart)

# Every executable below links against OpenCV, so fail at configure time if it
# cannot be found instead of failing later at compile/link time.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

# Detection
add_executable(yolov5_det yolov5_det.cpp ${SRCS})
target_link_libraries(yolov5_det nvinfer)
target_link_libraries(yolov5_det cudart)
target_link_libraries(yolov5_det myplugins)
target_link_libraries(yolov5_det ${OpenCV_LIBS})

# Classification
add_executable(yolov5_cls yolov5_cls.cpp ${SRCS})
target_link_libraries(yolov5_cls nvinfer)
target_link_libraries(yolov5_cls cudart)
target_link_libraries(yolov5_cls myplugins)
target_link_libraries(yolov5_cls ${OpenCV_LIBS})

# Segmentation
add_executable(yolov5_seg yolov5_seg.cpp ${SRCS})
target_link_libraries(yolov5_seg nvinfer)
target_link_libraries(yolov5_seg cudart)
target_link_libraries(yolov5_seg myplugins)
target_link_libraries(yolov5_seg ${OpenCV_LIBS})


================================================
FILE: yolov5/README.md
================================================
# YOLOv5

TensorRTx inference code base for [ultralytics/yolov5](https://github.com/ultralytics/yolov5).

## Contributors

<a href="https://github.com/wang-xinyu"><img src="https://avatars.githubusercontent.com/u/15235574?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/BaofengZan"><img src="https://avatars.githubusercontent.com/u/20653176?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/upczww"><img src="https://avatars.githubusercontent.com/u/16224249?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/cesarandreslopez"><img src="https://avatars.githubusercontent.com/u/14029177?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/makaveli10"><img src="https://avatars.githubusercontent.com/u/39617050?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/priteshgohil"><img src="https://avatars.githubusercontent.com/u/43172056?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/rymzt"><img src="https://avatars.githubusercontent.com/u/3270954?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/AsakusaRinne"><img src="https://avatars.githubusercontent.com/u/47343601?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/freedenS"><img src="https://avatars.githubusercontent.com/u/26213470?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/smarttowel"><img src="https://avatars.githubusercontent.com/u/1128528?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/wwqgtxx"><img src="https://avatars.githubusercontent.com/u/582584?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/adujardin"><img src="https://avatars.githubusercontent.com/u/12609780?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/jow905"><img src="https://avatars.githubusercontent.com/u/19189198?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/CristiFati"><img src="https://avatars.githubusercontent.com/u/29705787?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/HaiyangPeng"><img src="https://avatars.githubusercontent.com/u/46739135?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/Armassarion"><img src="https://avatars.githubusercontent.com/u/33727511?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/xupengao"><img src="https://avatars.githubusercontent.com/u/51817015?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/liuqi123123"><img src="https://avatars.githubusercontent.com/u/46275888?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/ASONG0506"><img src="https://avatars.githubusercontent.com/u/26050577?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/bobo0810"><img src="https://avatars.githubusercontent.com/u/26057879?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/Silmeria112"><img src="https://avatars.githubusercontent.com/u/16464837?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/LW-SCU"><img src="https://avatars.githubusercontent.com/u/28128257?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/AdanWang"><img src="https://avatars.githubusercontent.com/u/32757980?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/triple-Mu"><img src="https://avatars.githubusercontent.com/u/92794867?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/xiang-wuu"><img src="https://avatars.githubusercontent.com/u/107029401?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/uyolo1314"><img src="https://avatars.githubusercontent.com/u/101853326?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/Rex-LK"><img src="https://avatars.githubusercontent.com/u/74702576?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/PrinceP"><img src="https://avatars.githubusercontent.com/u/10251537?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/hky3535"><img src="https://avatars.githubusercontent.com/u/126926285?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/CharlesHuan"><img src="https://avatars.githubusercontent.com/u/47875698?s=48&v=4" width="40px;" alt=""/></a>

## Different versions of yolov5

Currently, we support yolov5 v1.0, v2.0, v3.0, v3.1, v4.0, v5.0, v6.0, v6.2, v7.0

- For yolov5 v7.0, download .pt from [yolov5 release v7.0](https://github.com/ultralytics/yolov5/releases/tag/v7.0), `git clone -b v7.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v7.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v7.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v7.0/yolov5)
- For yolov5 v6.2, download .pt from [yolov5 release v6.2](https://github.com/ultralytics/yolov5/releases/tag/v6.2), `git clone -b v6.2 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v6.2 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v6.2](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v6.2/yolov5)
- For yolov5 v6.0, download .pt from [yolov5 release v6.0](https://github.com/ultralytics/yolov5/releases/tag/v6.0), `git clone -b v6.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v6.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v6.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v6.0/yolov5).
- For yolov5 v5.0, download .pt from [yolov5 release v5.0](https://github.com/ultralytics/yolov5/releases/tag/v5.0), `git clone -b v5.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v5.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v5.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v5.0/yolov5).
- For yolov5 v4.0, download .pt from [yolov5 release v4.0](https://github.com/ultralytics/yolov5/releases/tag/v4.0), `git clone -b v4.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v4.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v4.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v4.0/yolov5).
- For yolov5 v3.1, download .pt from [yolov5 release v3.1](https://github.com/ultralytics/yolov5/releases/tag/v3.1), `git clone -b v3.1 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v3.1 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v3.1](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v3.1/yolov5).
- For yolov5 v3.0, download .pt from [yolov5 release v3.0](https://github.com/ultralytics/yolov5/releases/tag/v3.0), `git clone -b v3.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v3.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v3.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v3.0/yolov5).
- For yolov5 v2.0, download .pt from [yolov5 release v2.0](https://github.com/ultralytics/yolov5/releases/tag/v2.0), `git clone -b v2.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v2.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v2.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v2.0/yolov5).
- For yolov5 v1.0, download .pt from [yolov5 release v1.0](https://github.com/ultralytics/yolov5/releases/tag/v1.0), `git clone -b v1.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v1.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v1.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v1.0/yolov5).

## Config

- Choose the YOLOv5 sub-model n/s/m/l/x/n6/s6/m6/l6/x6 from command line arguments.
- For other configs, please check [src/config.h](src/config.h).

## Build and Run

### Detection

1. Generate the .wts file from the PyTorch .pt file, or download a .wts file from the model zoo

```
git clone -b v7.0 https://github.com/ultralytics/yolov5.git
git clone -b yolov5-v7.0 https://github.com/wang-xinyu/tensorrtx.git
cd yolov5/
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt
cp [PATH-TO-TENSORRTX]/yolov5/gen_wts.py .
python gen_wts.py -w yolov5s.pt -o yolov5s.wts
# A file 'yolov5s.wts' will be generated.
```

2. build tensorrtx/yolov5 and run

```
cd [PATH-TO-TENSORRTX]/yolov5/
# Update kNumClass in src/config.h if your model is trained on custom dataset
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov5]/yolov5s.wts . 
cmake ..
make

./yolov5_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6 or c/c6 gd gw]  // serialize model to plan file
./yolov5_det -d [.engine] [image folder]  // deserialize and run inference, the images in [image folder] will be processed.

# For example yolov5s
./yolov5_det -s yolov5s.wts yolov5s.engine s
./yolov5_det -d yolov5s.engine ../images

# For example Custom model with depth_multiple=0.17, width_multiple=0.25 in yolov5.yaml
./yolov5_det -s yolov5_custom.wts yolov5.engine c 0.17 0.25
./yolov5_det -d yolov5.engine ../images
```

3. Check the images generated, _zidane.jpg and _bus.jpg

4. Optional, load and run the tensorrt model in Python

```
// Install python-tensorrt, pycuda, etc.
// Ensure the yolov5s.engine and libmyplugins.so have been built
python yolov5_det_trt.py

// Another version of python script, which is using CUDA Python instead of pycuda.
python yolov5_det_trt_cuda_python.py
```

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg" height="360px;">
</p>

### Classification

```
# Download ImageNet labels
wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt

# Build and serialize TensorRT engine
./yolov5_cls -s yolov5s-cls.wts yolov5s-cls.engine s

# Run inference
./yolov5_cls -d yolov5s-cls.engine ../images
```

### Instance Segmentation

```
# Build and serialize TensorRT engine
./yolov5_seg -s yolov5s-seg.wts yolov5s-seg.engine s

# Download the labels file
wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt

# Run inference with labels file
./yolov5_seg -d yolov5s-seg.engine ../images coco.txt
```

<p align="center">
<img src="https://user-images.githubusercontent.com/10251537/211291625-1b912483-b6a6-4e92-80c1-434d165b6776.jpg" height="360px;">
</p>

## INT8 Quantization

1. Prepare calibration images. You can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh

2. unzip it in yolov5/build

3. set the macro `USE_INT8` in src/config.h and make

4. serialize the model and test


## More Information

See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: yolov5/gen_wts.py
================================================
import argparse
import os
import struct
import torch
from utils.torch_utils import select_device


def parse_args():
    """Parse the command line and resolve the output path.

    Returns:
        tuple: ``(weights_path, wts_path, model_type)`` where ``model_type``
        is one of ``'detect'``, ``'cls'`` or ``'seg'``.

    Raises:
        SystemExit: if the weights path is not an existing file.

    The output path is resolved as follows:
      * not given      -> the weights path with its extension replaced by ``.wts``
      * a directory    -> ``<directory>/<weights basename>.wts``
      * anything else  -> used verbatim
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', required=True,
                     help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output',
                     help='Output (.wts) file path (optional)')
    cli.add_argument('-t', '--type', type=str, default='detect',
                     choices=['detect', 'cls', 'seg'],
                     help='determines the model is detection/classification')
    opts = cli.parse_args()

    if not os.path.isfile(opts.weights):
        raise SystemExit('Invalid input file')

    out_path = opts.output
    if not out_path:
        # Default: place the .wts file next to the input weights.
        out_path = os.path.splitext(opts.weights)[0] + '.wts'
    elif os.path.isdir(out_path):
        # A directory was given: derive the file name from the weights file.
        stem = os.path.splitext(os.path.basename(opts.weights))[0]
        out_path = os.path.join(out_path, stem + '.wts')

    return opts.weights, out_path, opts.type


# Script body: load a checkpoint, optionally patch the last layer's buffers,
# and dump every state_dict entry to a text .wts file.
pt_file, wts_file, m_type = parse_args()
print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')
device = select_device('cpu')
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects;
# only run this on checkpoints from a trusted source.
model = torch.load(pt_file, map_location=device, weights_only=False)  # Load FP32 weights
# Prefer the 'ema' entry of the checkpoint when it is present and truthy,
# otherwise fall back to 'model'; convert to float32.
model = model['ema' if model.get('ema') else 'model'].float()

if m_type in ['detect', 'seg']:
    # update anchor_grid info
    # anchors multiplied by the per-level stride (stride broadcast over the
    # last two dimensions of anchors).
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
    # model.model[-1].anchor_grid = anchor_grid
    # Remove the existing attribute first so it can be re-registered as a buffer.
    delattr(model.model[-1], 'anchor_grid')  # model.model[-1] is detect layer
    # The parameters are saved in the OrderDict through the "register_buffer" method, and then saved to the weight.
    model.model[-1].register_buffer("anchor_grid", anchor_grid)
    model.model[-1].register_buffer("strides", model.model[-1].stride)

model.to(device).eval()

# .wts text format written below:
#   line 1:            number of state_dict entries
#   each further line: "<name> <element count> " followed by one
#                      " <8 hex digits>" per value, where the hex digits are
#                      the big-endian float32 encoding ('>f') of the value.
print(f'Writing into {wts_file}')
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        # Flatten the tensor to 1-D and move it to a numpy array on the CPU.
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')


================================================
FILE: yolov5/plugin/yololayer.cu
================================================
#include "yololayer.h"
#include "cuda_utils.h"

#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>

namespace Tn {
// Copies the object representation of `val` into the byte stream at `buffer`
// and advances `buffer` by sizeof(T).
//
// memcpy is used instead of a reinterpret_cast store because `buffer` is an
// arbitrary position inside a byte stream: it is not guaranteed to be suitably
// aligned for T, and accessing it through a T* would be type punning.
template<typename T>
void write(char*& buffer, const T& val) {
  std::memcpy(buffer, &val, sizeof(T));
  buffer += sizeof(T);
}

// Reads sizeof(T) bytes from the byte stream at `buffer` into `val` and
// advances `buffer` by sizeof(T). Counterpart of write(); see the note there
// on why memcpy is used.
template<typename T>
void read(const char*& buffer, T& val) {
  std::memcpy(&val, buffer, sizeof(T));
  buffer += sizeof(T);
}
}  // namespace Tn

namespace nvinfer1 {
// Builds a plugin from explicit parameters and uploads each yolo kernel's
// anchors to the device.
//   classCount      - number of classes
//   netWidth/Height - network input size
//   maxOut          - maximum number of detections written per image
//   is_segmentation - whether the inputs carry the extra mask values
//   vYoloKernel     - one entry per input feature map
YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const std::vector<YoloKernel>& vYoloKernel) {
  mClassCount = classCount;
  mYoloV5NetWidth = netWidth;
  mYoloV5NetHeight = netHeight;
  mMaxOutObject = maxOut;
  is_segmentation_ = is_segmentation;
  mYoloKernel = vYoloKernel;
  mKernelCount = vYoloKernel.size();

  // mAnchor is a host-side table with one device pointer per yolo kernel.
  CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));

  // Each kernel has kNumAnchor anchors, stored as two floats each.
  const size_t anchorBytes = sizeof(float) * kNumAnchor * 2;
  for (int k = 0; k < mKernelCount; k++) {
    CUDA_CHECK(cudaMalloc(&mAnchor[k], anchorBytes));
    CUDA_CHECK(cudaMemcpy(mAnchor[k], mYoloKernel[k].anchors, anchorBytes, cudaMemcpyHostToDevice));
  }
}

// Releases the per-kernel device anchor buffers, then the host-side pointer
// table that referenced them.
YoloLayerPlugin::~YoloLayerPlugin() {
  int k = 0;
  while (k < mKernelCount) {
    CUDA_CHECK(cudaFree(mAnchor[k]));
    ++k;
  }
  CUDA_CHECK(cudaFreeHost(mAnchor));
}

// create the plugin at runtime from a byte stream
//
// `data`/`length` describe a buffer in the layout produced by serialize():
// the scalar members are read back in exactly the order serialize() writes
// them, followed by mKernelCount raw YoloKernel structs. The anchors of each
// kernel are then uploaded to the device, as in the other constructor.
// The plugin namespace is not part of the stream and is not set here.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
  using namespace Tn;
  // d is the read cursor; a remembers the start for the final length check.
  const char *d = reinterpret_cast<const char *>(data), *a = d;
  read(d, mClassCount);
  read(d, mThreadCount);
  read(d, mKernelCount);
  read(d, mYoloV5NetWidth);
  read(d, mYoloV5NetHeight);
  read(d, mMaxOutObject);
  read(d, is_segmentation_);
  // The kernel descriptions follow the scalars as a contiguous array.
  mYoloKernel.resize(mKernelCount);
  auto kernelSize = mKernelCount * sizeof(YoloKernel);
  memcpy(mYoloKernel.data(), d, kernelSize);
  d += kernelSize;
  // Host-side table of device pointers, one per yolo kernel.
  CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
  size_t AnchorLen = sizeof(float)* kNumAnchor * 2;
  for (int ii = 0; ii < mKernelCount; ii++) {
    CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
    const auto& yolo = mYoloKernel[ii];
    CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
  }
  // The whole buffer must have been consumed (checked only when asserts are enabled).
  assert(d == a + length);
}

// Writes the plugin state into `buffer`: the scalar members in a fixed order,
// followed by the raw YoloKernel array. The deserialising constructor reads
// the fields back in the same order, and getSerializationSize() must report
// exactly the number of bytes written here (asserted at the end).
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
  using namespace Tn;
  // d is the write cursor; a remembers the start for the final size check.
  char* d = static_cast<char*>(buffer), *a = d;
  write(d, mClassCount);
  write(d, mThreadCount);
  write(d, mKernelCount);
  write(d, mYoloV5NetWidth);
  write(d, mYoloV5NetHeight);
  write(d, mMaxOutObject);
  write(d, is_segmentation_);
  // Kernel descriptions are copied as one contiguous block.
  auto kernelSize = mKernelCount * sizeof(YoloKernel);
  memcpy(d, mYoloKernel.data(), kernelSize);
  d += kernelSize;

  assert(d == a + getSerializationSize());
}

// Returns the number of bytes serialize() writes: the seven scalar members
// plus one YoloKernel struct per entry of mYoloKernel.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
  const size_t scalarBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) +
                             sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight) +
                             sizeof(mMaxOutObject) + sizeof(is_segmentation_);
  const size_t kernelBytes = sizeof(YoloKernel) * mYoloKernel.size();
  return scalarBytes + kernelBytes;
}

// No work is done at initialization time; always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
  return 0;
}

// Output shape is independent of the inputs: one leading float (used by the
// kernel as the detection counter) followed by room for mMaxOutObject
// Detection structs, expressed as a count of floats.
Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT {
  const int detectionFloats = mMaxOutObject * sizeof(Detection) / sizeof(float);
  return Dims3(detectionFloats + 1, 1, 1);
}

// Set plugin namespace
// Only the pointer is stored, not a copy of the string, so the caller's
// string must remain valid for as long as getPluginNamespace() may be used.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
  mPluginNamespace = pluginNamespace;
}

// Returns the pointer previously stored by setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
  return mPluginNamespace;
}

// Return the DataType of the plugin output at the requested index
// The output is always float, regardless of index or input types.
DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
  return DataType::kFLOAT;
}

// Return true if output tensor is broadcast across a batch.
// This plugin never broadcasts its output, so this always returns false.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {
  return false;
}

// Return true if plugin can use input that is broadcast across batch without replication.
// Always returns false for every input index.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
  return false;
}

// Intentionally empty: the plugin does not use the tensor descriptions passed here.
void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT {}

// Attach the plugin object to an execution context and grant the plugin the access to some context resource.
// Intentionally empty: none of the offered context resources are used by this plugin.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}

// Detach the plugin object from its execution context.
// Intentionally empty: attachToContext() acquires nothing that needs releasing.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Plugin type name; the same string is returned by YoloPluginCreator::getPluginName().
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
  return "YoloLayer_TRT";
}

// Plugin version string; the same value is returned by YoloPluginCreator::getPluginVersion().
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
  return "1";
}

// Destroys this plugin object. The object deletes itself, so it must have
// been allocated with `new` and must not be used after this call.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
  delete this;
}

// Creates a new heap-allocated plugin from this one's construction parameters
// and gives it the same namespace pointer. The copy allocates its own anchor
// buffers through the constructor; mThreadCount is not carried over and takes
// its default in the new object.
IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
  auto* copy = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight,
                                   mMaxOutObject, is_segmentation_, mYoloKernel);
  copy->setPluginNamespace(mPluginNamespace);
  return copy;
}

// Logistic (sigmoid) function: 1 / (1 + e^(-data)), in single precision.
__device__ float Logist(float data) {
  return 1.0f / (1.0f + expf(-data));
}

// Decodes one yolo feature map into Detection records.
//
// Each thread handles one (batch item, grid cell) pair and loops over the
// kNumAnchor anchors of that cell.
//   input        - feature map; as indexed below, each value is addressed as
//                  [batch][anchor][channel][cell] with `total_grid` cells per
//                  channel plane and `info_len_i` channels per anchor
//   output       - per batch item: one float used as a detection counter,
//                  followed by Detection structs; consecutive batch items are
//                  `outputElem` floats apart
//   noElements   - total number of (batch item, cell) pairs; extra threads exit
//   netwidth/netheight   - network input size, used to scale box centres
//   maxoutobject - maximum number of Detection records stored per batch item
//   yoloWidth/yoloHeight - grid size of this feature map
//   anchors      - kNumAnchor (w, h) pairs for this feature map
//   classes      - number of class scores per anchor
//   is_segmentation - when true, 32 extra channels per anchor are copied
//                     into Detection::mask
__global__ void CalDetection(const float *input, float *output, int noElements,
    const int netwidth, const int netheight, int maxoutobject, int yoloWidth,
    int yoloHeight, const float anchors[kNumAnchor * 2], int classes, int outputElem, bool is_segmentation) {

  int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= noElements) return;

  // Split the flat index into a batch index and a cell index within the grid.
  int total_grid = yoloWidth * yoloHeight;
  int bnIdx = idx / total_grid;
  idx = idx - total_grid * bnIdx;
  // Channels per anchor: 4 box values + 1 objectness + class scores (+ 32 mask values).
  int info_len_i = 5 + classes;
  if (is_segmentation) info_len_i += 32;
  const float* curInput = input + bnIdx * (info_len_i * total_grid * kNumAnchor);

  for (int k = 0; k < kNumAnchor; ++k) {
    // Channel 4 holds the objectness logit; skip low-confidence anchors early.
    float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
    if (box_prob < kIgnoreThresh) continue;
    // Pick the class with the highest sigmoid score.
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 5; i < 5 + classes; ++i) {
      float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
      if (p > max_cls_prob) {
        max_cls_prob = p;
        class_id = i - 5;
      }
    }
    // Reserve an output slot by atomically incrementing this batch item's
    // counter. The counter keeps growing even when the slot is out of range,
    // so it can end up larger than maxoutobject.
    float *res_count = output + bnIdx * outputElem;
    int count = (int)atomicAdd(res_count, 1);
    // No room left: stop processing this cell (remaining anchors are skipped too).
    if (count >= maxoutobject) return;
    char *data = (char*)res_count + sizeof(float) + count * sizeof(Detection);
    Detection *det = (Detection*)(data);

    int row = idx / yoloWidth;
    int col = idx % yoloWidth;

    // Box centre: cell position plus an offset in (-0.5, 1.5), scaled from
    // grid units to network input pixels.
    det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth;
    det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight;

    // Box size: (2 * sigmoid)^2 times the anchor width / height.
    det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]);
    det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
    det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]);
    det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
    det->conf = box_prob * max_cls_prob;
    det->class_id = class_id;

    // Segmentation only: copy the 32 channels after the class scores unchanged.
    for (int i = 0; is_segmentation && i < 32; i++) {
      det->mask[i] = curInput[idx + k * info_len_i * total_grid + (i + 5 + classes) * total_grid];
    }
  }
}

// Runs the detection kernel once per yolo feature map.
//   inputs    - one device pointer per entry of mYoloKernel
//   output    - device buffer holding, per batch item, a float counter
//               followed by room for mMaxOutObject Detection structs
//   stream    - CUDA stream on which all work is enqueued
//   batchSize - number of batch items in `inputs` / `output`
//
// The block size is chosen per launch in a local variable. Writing the reduced
// value back into mThreadCount would permanently shrink the block size for
// every later launch and call, and a launch with zero elements would set it to
// zero and cause a division by zero when computing the grid size.
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize) {
  int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
  // Reset the per-batch-item detection counter (the first float of each slice).
  for (int idx = 0; idx < batchSize; ++idx) {
    CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
  }
  for (unsigned int i = 0; i < mYoloKernel.size(); ++i) {
    const auto& yolo = mYoloKernel[i];
    const int numElem = yolo.width * yolo.height * batchSize;
    // Nothing to process (and a zero-sized launch would be invalid).
    if (numElem <= 0) continue;

    // Never launch more threads per block than there are elements.
    const int threadCount = numElem < mThreadCount ? numElem : mThreadCount;
    if (threadCount <= 0) continue;
    const int blockCount = (numElem + threadCount - 1) / threadCount;

    CalDetection<<<blockCount, threadCount, 0, stream>>>
      (inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, yolo.width, yolo.height, (float*)mAnchor[i], mClassCount, outputElem, is_segmentation_);
  }
}


// Enqueues the plugin's work on `stream`: forwards all inputs and the first
// (only) output to forwardGpu(), treating them as float buffers. The
// workspace pointer is unused. Always returns 0.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
  forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize);
  return 0;
}

// Definitions of the creator's static members: the field collection returned
// by getFieldNames() and the vector backing it.
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// The attribute list is cleared, so the advertised field collection is empty
// (nbFields == 0) and mFC.fields points at the empty vector's storage.
YoloPluginCreator::YoloPluginCreator() {
  mPluginAttributes.clear();
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

// Name of the plugin this creator produces; equals YoloLayerPlugin::getPluginType().
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
  return "YoloLayer_TRT";
}

// Version of the plugin this creator produces; equals YoloLayerPlugin::getPluginVersion().
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
  return "1";
}

// Returns the static field collection, which the constructor sets up as empty.
// Note that createPlugin() nevertheless expects two fields ("netinfo", "kernels").
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
  return &mFC;
}

// Creates a YoloLayerPlugin from a field collection that must contain exactly
// two fields, in this order:
//   "netinfo" - int array: class count, input width, input height,
//               max output object count, is-segmentation flag
//   "kernels" - array of YoloKernel structs; the field's `length` is the
//               number of kernels
// The field layout is checked with assert only. The returned plugin is
// heap-allocated and carries this creator's namespace.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
  assert(fc->nbFields == 2);
  const PluginField& netinfoField = fc->fields[0];
  const PluginField& kernelField = fc->fields[1];
  assert(strcmp(netinfoField.name, "netinfo") == 0);
  assert(strcmp(kernelField.name, "kernels") == 0);

  const int* netinfo = static_cast<const int*>(netinfoField.data);
  const int class_count = netinfo[0];
  const int input_w = netinfo[1];
  const int input_h = netinfo[2];
  const int max_output_object_count = netinfo[3];
  const bool is_segmentation = netinfo[4] != 0;

  std::vector<YoloKernel> kernels(kernelField.length);
  memcpy(kernels.data(), kernelField.data, kernels.size() * sizeof(YoloKernel));

  YoloLayerPlugin* plugin = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation, kernels);
  plugin->setPluginNamespace(mNamespace.c_str());
  return plugin;
}

// Rebuilds a plugin from the bytes produced by YoloLayerPlugin::serialize()
// and assigns it this creator's namespace. `name` is unused.
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
  // This object will be deleted when the network is destroyed, which will
  // call YoloLayerPlugin::destroy()
  YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
  // The namespace pointer handed over refers to mNamespace's internal buffer.
  obj->setPluginNamespace(mNamespace.c_str());
  return obj;
}
}


================================================
FILE: yolov5/plugin/yololayer.h
================================================
#pragma once

#include "types.h"
#include "macros.h"

#include <vector>
#include <string>

namespace nvinfer1 {
// TensorRT plugin that decodes the raw yolo feature maps into a flat list of
// Detection records (one counter float followed by up to maxOut detections
// per batch item).
//
// All data members carry default initialisers so that no member is ever read
// with an indeterminate value (for example getPluginNamespace() before
// setPluginNamespace() has been called).
class API YoloLayerPlugin : public IPluginV2IOExt {
public:
  // Builds the plugin from explicit parameters; see the definition for details.
  YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const std::vector<YoloKernel>& vYoloKernel);
  // Rebuilds the plugin from a buffer produced by serialize().
  YoloLayerPlugin(const void* data, size_t length);
  ~YoloLayerPlugin();

  // The plugin has a single output tensor.
  int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

  Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

  int initialize() TRT_NOEXCEPT override;

  // Nothing to release at termination time.
  virtual void terminate() TRT_NOEXCEPT override {}

  // No scratch workspace is required.
  virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

  virtual int enqueue(int batchSize, const void* const* inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

  virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

  virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

  // Only linear-format float tensors are accepted, for inputs and outputs alike.
  bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
    return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
  }

  const char* getPluginType() const TRT_NOEXCEPT override;

  const char* getPluginVersion() const TRT_NOEXCEPT override;

  void destroy() TRT_NOEXCEPT override;

  IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

  void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

  const char* getPluginNamespace() const TRT_NOEXCEPT override;

  DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;

  bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;

  bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

  void attachToContext(
      cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

  void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;

  void detachFromContext() TRT_NOEXCEPT override;

 private:
  // Launches the detection kernel for every yolo feature map.
  void forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize = 1);
  int mThreadCount = 256;               // upper bound on CUDA threads per block
  const char* mPluginNamespace = "";    // non-owning; set via setPluginNamespace()
  int mKernelCount = 0;                 // number of entries in mYoloKernel / mAnchor
  int mClassCount = 0;
  int mYoloV5NetWidth = 0;
  int mYoloV5NetHeight = 0;
  int mMaxOutObject = 0;                // max detections stored per batch item
  bool is_segmentation_ = false;
  std::vector<YoloKernel> mYoloKernel;
  void** mAnchor = nullptr;             // host table of per-kernel device anchor buffers
};

// Creator (factory) for YoloLayerPlugin, registered with TensorRT through
// REGISTER_TENSORRT_PLUGIN below the class.
class API YoloPluginCreator : public IPluginCreator {
 public:
  YoloPluginCreator();

  ~YoloPluginCreator() override = default;

  const char* getPluginName() const TRT_NOEXCEPT override;

  const char* getPluginVersion() const TRT_NOEXCEPT override;

  const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

  // Builds a plugin from a field collection ("netinfo" + "kernels").
  IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;

  // Rebuilds a plugin from its serialized form.
  IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;

  // Stores a copy of the namespace. A null pointer is treated as the empty
  // namespace, because assigning a null const char* to std::string is
  // undefined behaviour.
  void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override {
    mNamespace = libNamespace ? libNamespace : "";
  }

  const char* getPluginNamespace() const TRT_NOEXCEPT override {
    return mNamespace.c_str();
  }

 private:
  std::string mNamespace;
  static PluginFieldCollection mFC;
  static std::vector<PluginField> mPluginAttributes;
};
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};


================================================
FILE: yolov5/src/calibrator.cpp
================================================
#include "calibrator.h"
#include "cuda_utils.h"
#include "utils.h"

#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn/dnn.hpp>

// Resizes `img` to fit inside an input_w x input_h canvas while keeping its
// aspect ratio, and centres it on a canvas filled with the value 128 in all
// three channels. Returns the new canvas (8-bit, 3 channels).
cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
  const float r_w = input_w / (img.cols * 1.0);
  const float r_h = input_h / (img.rows * 1.0);

  // The smaller ratio decides which side fills the canvas completely; the
  // other side is scaled by the same ratio and padded equally on both ends.
  const bool width_limited = r_h > r_w;
  const int w = width_limited ? input_w : static_cast<int>(r_h * img.cols);
  const int h = width_limited ? static_cast<int>(r_w * img.rows) : input_h;
  const int x = width_limited ? 0 : (input_w - w) / 2;
  const int y = width_limited ? (input_h - h) / 2 : 0;

  cv::Mat resized(h, w, CV_8UC3);
  cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

  cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
  resized.copyTo(canvas(cv::Rect(x, y, resized.cols, resized.rows)));
  return canvas;
}

// Sets up a calibrator that feeds images from `img_dir` in batches.
//   batchsize        - number of images per calibration batch
//   input_w/input_h  - network input size the images are preprocessed to
//   img_dir          - directory containing the calibration images
//   calib_table_name - file used to read/write the calibration cache
//   input_blob_name  - expected name of the network input binding (the
//                      pointer is stored, not copied)
//   read_cache       - whether readCalibrationCache() may load an existing cache
// Allocates one device buffer large enough for a full float batch
// (3 channels x width x height x batchsize) and lists the files in img_dir.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
  // Element count (not bytes) of one batch.
  input_count_ = 3 * input_w * input_h * batchsize;
  CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
  // NOTE(review): the return value of read_files_in_dir is not checked here.
  read_files_in_dir(img_dir, img_files_);
}

// Frees the device batch buffer allocated in the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
  CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
  return batchsize_;
}

// Loads the next `batchsize_` calibration images, preprocesses them, uploads
// them to the device buffer and publishes that buffer as binding 0.
//
// Returns false when fewer than `batchsize_` images remain or when an image
// cannot be read; true otherwise.
//
// The image path is built from the directory and the file name. A path
// separator is inserted when the directory does not already end with one, so
// both "calib/" and "calib" work as the image directory.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
  if (img_idx_ + batchsize_ > (int)img_files_.size()) {
    return false;
  }

  // Directory prefix with a guaranteed trailing separator (unless empty).
  std::string dir_prefix = img_dir_;
  if (!dir_prefix.empty() && dir_prefix.back() != '/' && dir_prefix.back() != '\\') {
    dir_prefix += '/';
  }

  std::vector<cv::Mat> input_imgs_;
  input_imgs_.reserve(batchsize_);
  for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
    std::cout << img_files_[i] << "  " << i << std::endl;
    cv::Mat temp = cv::imread(dir_prefix + img_files_[i]);
    if (temp.empty()) {
      std::cerr << "Fatal error: image cannot open!" << std::endl;
      return false;
    }
    cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
    input_imgs_.push_back(pr_img);
  }
  img_idx_ += batchsize_;
  // Scale to [0, 1], swap the R and B channels, no cropping.
  cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);

  CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
  // The first binding is expected to be the network input.
  assert(!strcmp(names[0], input_blob_name_));
  bindings[0] = device_input_;
  return true;
}

// Loads the calibration cache file into calib_cache_ when reading is enabled
// and the file can be opened. `length` receives the number of bytes loaded.
// Returns a pointer to the cached bytes, or nullptr when nothing was loaded.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
  std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
  calib_cache_.clear();

  std::ifstream cache_file(calib_table_name_, std::ios::binary);
  // Keep whitespace bytes: the cache is read character by character.
  cache_file >> std::noskipws;
  if (read_cache_ && cache_file.good()) {
    std::istream_iterator<char> first(cache_file);
    std::istream_iterator<char> last;
    for (; first != last; ++first) {
      calib_cache_.push_back(*first);
    }
  }

  length = calib_cache_.size();
  if (length == 0) {
    return nullptr;
  }
  return calib_cache_.data();
}

// Writes `length` bytes starting at `cache` to the calibration cache file,
// replacing any previous content.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
  std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
  const char* bytes = static_cast<const char*>(cache);
  std::ofstream cache_file(calib_table_name_, std::ios::binary);
  cache_file.write(bytes, length);
}


================================================
FILE: yolov5/src/calibrator.h
================================================
#pragma once

#include "macros.h"
#include <string>
#include <vector>
#include <opencv2/opencv.hpp>

cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h);

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
//!  Feeds preprocessed images from a directory to TensorRT in fixed-size
//!  batches and reads/writes the calibration cache file.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
 public:
  //! \param batchsize        number of images per calibration batch
  //! \param input_w, input_h network input size the images are resized to
  //! \param img_dir          directory holding the calibration images
  //! \param calib_table_name path of the calibration cache file
  //! \param input_blob_name  expected name of the network input binding
  //!                         (the pointer is stored, not copied)
  //! \param read_cache       whether an existing cache file may be loaded
  Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);

  virtual ~Int8EntropyCalibrator2();
  int getBatchSize() const TRT_NOEXCEPT override;
  bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
  const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
  void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

 private:
  int batchsize_;
  int input_w_;
  int input_h_;
  int img_idx_;                         // index of the next image to load
  std::string img_dir_;
  std::vector<std::string> img_files_;  // file names found in img_dir_
  size_t input_count_;                  // number of floats in one batch
  std::string calib_table_name_;
  const char* input_blob_name_;         // non-owning
  bool read_cache_;
  void* device_input_;                  // device buffer holding one batch
  std::vector<char> calib_cache_;       // bytes loaded by readCalibrationCache()
};


================================================
FILE: yolov5/src/config.h
================================================
#pragma once

/* --------------------------------------------------------
 * These configs are related to the tensorrt model. If any of them is changed,
 * please re-compile and re-serialize the tensorrt model.
 * --------------------------------------------------------*/

// For INT8, you need to prepare the calibration dataset, please refer to
// https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5#int8-quantization
#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32

// These are used to define input/output tensor names,
// you can set them to whatever you want.
const static char* kInputTensorName = "data";
const static char* kOutputTensorName = "prob";

// Number of classes of the detection and segmentation models
constexpr static int kNumClass = 80;

// Number of classes of the classification model
constexpr static int kClsNumClass = 1000;

constexpr static int kBatchSize = 1;

// Yolo's input width and height must be divisible by 32
constexpr static int kInputH = 640;
constexpr static int kInputW = 640;

// Classification model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

// Maximum number of output bounding boxes from the yololayer plugin,
// i.e. the maximum number of bounding boxes before NMS.
constexpr static int kMaxNumOutputBbox = 1000;

// Number of anchors per yolo kernel (each anchor is a width/height pair).
constexpr static int kNumAnchor = 3;

// The bboxes whose confidence is lower than kIgnoreThresh will be ignored in yololayer plugin.
constexpr static float kIgnoreThresh = 0.1f;

/* --------------------------------------------------------
 * These configs are NOT related to the tensorrt model. If any of them is
 * changed, please re-compile, but there is no need to re-serialize the
 * tensorrt model.
 * --------------------------------------------------------*/

// NMS overlapping thresh and final detection confidence thresh
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;

const static int kGpuId = 0;

// If your image size is larger than 4096 * 3112, please increase this value
const static int kMaxInputImageSize = 4096 * 3112;


================================================
FILE: yolov5/src/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

#include <cassert>
#include <cstdlib>
#include <iostream>

#ifndef CUDA_CHECK
// CUDA_CHECK(call): evaluates a CUDA runtime call exactly once and, if it does
// not return cudaSuccess, prints the numeric error code, its description, and
// the file/line of the call site to std::cerr, then terminates the program.
//
// assert(0) is kept so debug builds still stop with the usual assertion
// diagnostics; std::abort() guarantees termination when NDEBUG removes the
// assert, so a failed CUDA call is never silently ignored in release builds.
//
// The macro expands to a braced block, so it can be used as a statement with
// or without a trailing semicolon, exactly as before.
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = (callstr);\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) << ") at "\
                      << __FILE__ << ":" << __LINE__ << std::endl;\
            assert(0);\
            std::abort();\
        }\
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: yolov5/src/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, forwards its accumulated text to an output stream,
//!  preceded by a "[MM/DD/YYYY-HH:MM:SS] " local-time stamp and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    destination stream that receives the formatted output
    //! \param prefix    text inserted between the timestamp and the buffered message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the destination stream, the prefix and the logging flag.
    //! Every member is initialized so the new buffer never reads an indeterminate mShouldLog.
    //! Text still buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the two differ there is unflushed text left, so emit it before the buffer goes away
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    //! Synchronizes the stream buffer: inserts the buffer contents into the destination stream,
    //! resets the buffer and flushes the stream.
    //! \return always 0 (success)
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Emits "[timestamp] <prefix><buffered text>" to the destination stream, then clears the buffer
    //! and flushes the stream. Does nothing (and keeps the buffered text) when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // Prepend the timestamp to the same stream the message goes to, so the two cannot end up
            // on different streams. std::put_time yields zero-padded fields without
            // leaving a sticky fill character on the stream.
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                mOutput << "[" << std::put_time(tm_local, "%m/%d/%Y-%H:%M:%S") << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables emission of subsequent output.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; //!< destination stream
    std::string mPrefix;   //!< text emitted before every message
    bool mShouldLog;       //!< whether putOutput() emits anything
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!  Being a base class, it is constructed before the std::ostream base of LogStreamConsumer,
//!  so the buffer already exists when its address is handed to std::ostream.
//!
class LogStreamConsumerBase
{
protected:
    LogStreamConsumerBuffer mBuffer; //!< buffer that performs the actual output

public:
    //! Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }
};

//!
//! \class LogStreamConsumer
//! \brief Output stream that lets callers log with the usual C++ `<<` syntax.
//!  The base-class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  LogStreamConsumerBase owns the LogStreamConsumerBuffer, which therefore is fully constructed
//!  before its address is passed to std::ostream. Swapping the bases would hand std::ostream the
//!  address of an uninitialized buffer, so please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level \p severity.
    //!  A message is emitted only when \p severity <= \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a fresh consumer with the same severity and logging flag as \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the buffer owned by the base
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = (mSeverity <= reportableSeverity);
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! \brief std::cout for kINFO and above (numerically), std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Short bracketed tag for the given severity; asserts on an unknown value.
    static std::string severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    bool mShouldLog;    //!< cached result of mSeverity <= reportable severity
    Severity mSeverity; //!< severity of the messages written through this consumer
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details Common console-logging front end for TensorRT tools and samples. Two kinds of
//! messages are supported:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! Routing all sample output through this class keeps the verbosity and formatting logic in a
//! single place instead of scattering direct writes to stdout/stderr.
//!
//! In the future, this class could be extended to support dumping test results to a file in some
//! standard format (for example, JUnit XML), and providing additional metadata (e.g. timing the
//! duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the
//! nvinfer1::ILogger interface, which is problematic since there isn't a clean separation between
//! messages coming from the TensorRT library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger)
//! we can refactor the class to eliminate the inheritance and instead make the nvinfer1::ILogger
//! implementation a member of the Logger object.

class Logger : public nvinfer1::ILogger
{
public:
    //! \brief Creates a logger; \p severity is the initial reportable severity (default kWARNING).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger (currently the object itself)
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] <msg>" followed by a newline through a temporary LogStreamConsumer.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we
    //! eliminate the inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! Instances can only be created by Logger (the constructor is private and Logger is a friend).
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< set once reportTestStart() has run for this atom
        std::string mName;    //!< test name
        std::string mCmdline; //!< command line used to reproduce the test
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line
    //!        arguments as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // The RUNNING line is printed first; the precondition is asserted afterwards.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed. \return EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed. \return EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived. \return EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        if (pass)
        {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current reportable severity.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        const char* tag = "";
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break;
        }
        return tag;
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        const char* label = "";
        switch (result)
        {
        case TestResult::kRUNNING: label = "RUNNING"; break;
        case TestResult::kPASSED: label = "PASSED"; break;
        case TestResult::kFAILED: label = "FAILED"; break;
        case TestResult::kWAIVED: label = "WAIVED"; break;
        default: assert(0); break;
        }
        return label;
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces, without a leading or trailing space.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream joined;
        int idx = 0;
        while (idx < argc)
        {
            if (idx != 0)
            {
                joined << " ";
            }
            joined << argv[idx];
            ++idx;
        }
        return joined.str();
    }

    Severity mReportableSeverity; //!< threshold handed to every LogStreamConsumer created by log()
};

namespace
{

//!
//! \brief Creates a LogStreamConsumer that logs at severity kVERBOSE, using the logger's
//!        current reportable severity as the threshold.
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kVERBOSE};
}

//!
//! \brief Creates a LogStreamConsumer that logs at severity kINFO, using the logger's
//!        current reportable severity as the threshold.
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kINFO};
}

//!
//! \brief Creates a LogStreamConsumer that logs at severity kWARNING, using the logger's
//!        current reportable severity as the threshold.
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kWARNING};
}

//!
//! \brief Creates a LogStreamConsumer that logs at severity kERROR, using the logger's
//!        current reportable severity as the threshold.
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kERROR};
}

//!
//! \brief Creates a LogStreamConsumer that logs at severity kINTERNAL_ERROR ("fatal" severity),
//!        using the logger's current reportable severity as the threshold.
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return {threshold, Severity::kINTERNAL_ERROR};
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: yolov5/src/macros.h
================================================
// Shared compatibility macros: symbol export decoration (API) and
// TensorRT-version-dependent qualifiers (TRT_NOEXCEPT, TRT_CONST_ENQUEUE).
#ifndef __MACROS_H
#define __MACROS_H

#include <NvInfer.h>

// API: decoration for symbols exposed across a shared-library boundary.
//   API_EXPORTS defined     -> export the symbol
//                              (__declspec(dllexport) on MSVC, default visibility elsewhere)
//   API_EXPORTS not defined -> __declspec(dllimport) on MSVC, empty elsewhere
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// For TensorRT major version 8 and later these expand to `noexcept` and
// `const`; for older versions they expand to nothing, so the same override
// declarations can be written once for both.
// NOTE(review): NV_TENSORRT_MAJOR is expected to be defined by the NvInfer.h
// include above; if it were undefined the #if would evaluate it as 0.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: yolov5/src/model.cpp
================================================
#include "model.h"
#include "calibrator.h"
#include "config.h"
#include "yololayer.h"

#include <iostream>
#include <fstream>
#include <map>
#include <cassert>
#include <cmath>
#include <cstring>

using namespace nvinfer1;

// Loads a TensorRT weight (.wts) file into a name -> Weights map.
//
// The file is plain text and space delimited:
//   <number of blobs>
//   <name> <size (decimal)> <size values in hex>     (one such record per blob)
//
// Every blob is stored as DataType::kFLOAT; the hex words are kept as raw
// 32-bit values. The value buffers are allocated with malloc() and are owned
// by the returned map's entries: this function never frees them, so releasing
// them is the caller's responsibility.
//
// file: path of the .wts file.
// Returns the populated map. In builds where asserts are disabled, an
// unreadable file yields an empty map instead of reading uninitialized counters.
static std::map<std::string, Weights> loadWeights(const std::string file) {
  std::cout << "Loading weights: " << file << std::endl;
  std::map<std::string, Weights> weightMap;

  // Open weights file
  std::ifstream input(file);
  assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

  // Read number of weight blobs. Zero-initialized because a stream that is
  // already in a failed state leaves the target of operator>> untouched.
  int32_t count = 0;
  input >> count;
  assert(count > 0 && "Invalid weight map file.");

  while (count-- > 0) {
    Weights wt{ DataType::kFLOAT, nullptr, 0 };
    uint32_t size = 0;

    // Read name and element count of the blob
    std::string name;
    input >> name >> std::dec >> size;

    // Load blob: allocate one 32-bit word per element (sizeof(*val), not the
    // size of the pointer itself).
    uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
    for (uint32_t x = 0, y = size; x < y; ++x) {
      input >> std::hex >> val[x];
    }
    wt.values = val;

    wt.count = size;
    weightMap[name] = wt;
  }

  return weightMap;
}

// Scales a channel count x by the width multiplier gw and rounds the result
// up to the next multiple of divisor.
static int get_width(int x, float gw, int divisor = 8) {
  const float scaled = x * gw;
  const int blocks = int(ceil(scaled / divisor));
  return blocks * divisor;
}

// Scales a repeat count x by the depth multiplier gd.
// x == 1 is returned unchanged. Otherwise x * gd is rounded to the nearest
// integer, with exact .5 ties resolved towards the even neighbour (round()
// rounds ties away from zero, so the result is stepped back by one when the
// truncated value is even). The result is never smaller than 1.
static int get_depth(int x, float gd) {
  if (x == 1) return 1;

  const float scaled = x * gd;
  const int whole = int(scaled);
  int r = round(scaled);

  const bool tie_on_even = (scaled - whole == 0.5) && (whole % 2 == 0);
  if (tie_on_even) {
    --r;
  }
  return std::max<int>(r, 1);
}

// Adds a per-channel scale layer computed from batch-norm parameters.
//
// Reads "<lname>.weight", ".bias", ".running_mean" and ".running_var" from
// weightMap and derives, per channel i:
//   scale[i] = gamma / sqrt(var + eps)
//   shift[i] = beta - mean * gamma / sqrt(var + eps)
//   power[i] = 1
// The three malloc'ed arrays are stored back into weightMap under
// "<lname>.scale", ".shift" and ".power", and passed to addScale in
// ScaleMode::kCHANNEL. Returns the created layer (asserted non-null).
static IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
  const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
  const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
  const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
  const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
  int len = weightMap[lname + ".running_var"].count;

  const size_t bytes = sizeof(float) * len;
  float* scval = reinterpret_cast<float*>(malloc(bytes));
  float* shval = reinterpret_cast<float*>(malloc(bytes));
  float* pval = reinterpret_cast<float*>(malloc(bytes));

  // One pass fills all three arrays; each entry depends only on channel i.
  for (int i = 0; i < len; ++i) {
    scval[i] = gamma[i] / sqrt(var[i] + eps);
    shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
    pval[i] = 1.0;
  }

  Weights scale{ DataType::kFLOAT, scval, len };
  Weights shift{ DataType::kFLOAT, shval, len };
  Weights power{ DataType::kFLOAT, pval, len };

  // Keep the derived buffers reachable through the weight map.
  weightMap[lname + ".scale"] = scale;
  weightMap[lname + ".shift"] = shift;
  weightMap[lname + ".power"] = power;

  IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
  assert(bn);
  return bn;
}

// Convolution + batch norm + SiLU.
//
// Adds a bias-free ksize x ksize convolution using "<lname>.conv.weight",
// with stride s, padding ksize / 3 and g groups, names it "<lname>.conv",
// follows it with addBatchNorm2d on "<lname>.bn" (eps 1e-3), and finally
// multiplies the batch-norm output by its own sigmoid.
// Returns the element-wise product layer.
static ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
  Weights no_bias{ DataType::kFLOAT, nullptr, 0 };
  const int pad = ksize / 3;

  IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], no_bias);
  assert(conv);
  conv->setStrideNd(DimsHW{ s, s });
  conv->setPaddingNd(DimsHW{ pad, pad });
  conv->setNbGroups(g);
  conv->setName((lname + ".conv").c_str());

  IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
  ITensor& normalized = *bn->getOutput(0);

  // silu(x) = x * sigmoid(x)
  auto sigmoid = network->addActivation(normalized, ActivationType::kSIGMOID);
  assert(sigmoid);
  auto silu = network->addElementWise(normalized, *sigmoid->getOutput(0), ElementWiseOperation::kPROD);
  assert(silu);
  return silu;
}

// Focus block: takes four stride-2 slices of the input (row/column offsets
// (0,0), (1,0), (0,1), (1,1)), each of size inch x kInputH/2 x kInputW/2,
// concatenates them and applies a convBlock named "<lname>.conv" with
// stride 1 and a single group. Returns the convBlock's output layer.
static ILayer* focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) {
  // {row offset, column offset} of each slice, in concatenation order.
  const int offsets[4][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } };

  ITensor* inputTensors[4];
  for (int i = 0; i < 4; ++i) {
    ISliceLayer* slice = network->addSlice(input, Dims3{ 0, offsets[i][0], offsets[i][1] }, Dims3{ inch, kInputH / 2, kInputW / 2 }, Dims3{ 1, 2, 2 });
    inputTensors[i] = slice->getOutput(0);
  }

  auto cat = network->addConcatenation(inputTensors, 4);
  return convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv");
}

// Bottleneck: a 1x1 convBlock ("<lname>.cv1", int(c2 * e) channels) followed
// by a 3x3 convBlock ("<lname>.cv2", c2 channels, g groups). When shortcut is
// set and c1 == c2, the block input is added to the result and the sum layer
// is returned; otherwise the second convBlock is returned.
static ILayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
  const int hidden = (int)((float)c2 * e);
  auto first = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
  auto second = convBlock(network, weightMap, *first->getOutput(0), c2, 3, 1, g, lname + ".cv2");

  const bool residual = shortcut && c1 == c2;
  if (!residual) {
    return second;
  }
  return network->addElementWise(input, *second->getOutput(0), ElementWiseOperation::kSUM);
}

// BottleneckCSP block with hidden width c_ = int(c2 * e).
//
// Main branch:  convBlock "<lname>.cv1" -> n bottlenecks "<lname>.m.<i>"
//               -> bias-free 1x1 convolution "<lname>.cv3.weight"
// Side branch:  bias-free 1x1 convolution "<lname>.cv2.weight" on the input
// The branches are concatenated (main first), passed through batch norm
// "<lname>.bn" (eps 1e-4) and leaky ReLU (alpha 0.1), then through convBlock
// "<lname>.cv4" with c2 output channels, which is returned.
static ILayer* bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
  Weights no_bias{ DataType::kFLOAT, nullptr, 0 };
  const int hidden = (int)((float)c2 * e);

  auto cv1 = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
  auto cv2 = network->addConvolutionNd(input, hidden, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], no_bias);

  // Chain the n bottlenecks on the main branch.
  ITensor* main_branch = cv1->getOutput(0);
  for (int idx = 0; idx < n; ++idx) {
    const std::string block_name = lname + ".m." + std::to_string(idx);
    main_branch = bottleneck(network, weightMap, *main_branch, hidden, hidden, shortcut, g, 1.0, block_name)->getOutput(0);
  }
  auto cv3 = network->addConvolutionNd(*main_branch, hidden, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], no_bias);

  ITensor* branches[] = { cv3->getOutput(0), cv2->getOutput(0) };
  auto cat = network->addConcatenation(branches, 2);

  IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4);
  auto leaky = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
  leaky->setAlpha(0.1);

  return convBlock(network, weightMap, *leaky->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
}

// C3 block with hidden width c_ = int(c2 * e).
//
// Two 1x1 convBlocks ("<lname>.cv1", "<lname>.cv2") are applied to the input.
// The cv1 branch runs through n bottlenecks "<lname>.m.<i>", is concatenated
// with the cv2 branch (bottleneck branch first) and fed to convBlock
// "<lname>.cv3" with c2 output channels, which is returned.
static ILayer* C3(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
  const int hidden = (int)((float)c2 * e);
  auto cv1 = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
  auto cv2 = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv2");

  // Chain the n bottlenecks on the cv1 branch.
  ITensor* main_branch = cv1->getOutput(0);
  for (int idx = 0; idx < n; ++idx) {
    const std::string block_name = lname + ".m." + std::to_string(idx);
    main_branch = bottleneck(network, weightMap, *main_branch, hidden, hidden, shortcut, g, 1.0, block_name)->getOutput(0);
  }

  ITensor* branches[] = { main_branch, cv2->getOutput(0) };
  auto cat = network->addConcatenation(branches, 2);

  return convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
}

// SPP block: convBlock "<lname>.cv1" (c1 / 2 channels), then three max
// poolings of sizes k1, k2, k3 applied in parallel to the cv1 output (each
// with stride 1 and padding k / 2). The cv1 output and the three pooled
// tensors are concatenated in that order and fed to convBlock "<lname>.cv2"
// with c2 output channels, which is returned.
static ILayer* SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) {
  const int hidden = c1 / 2;
  auto cv1 = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");

  // Adds one k x k max pooling on src with stride 1 and padding k / 2.
  auto add_max_pool = [&](ITensor& src, int k) -> IPoolingLayer* {
    IPoolingLayer* pool = network->addPoolingNd(src, PoolingType::kMAX, DimsHW{ k, k });
    pool->setPaddingNd(DimsHW{ k / 2, k / 2 });
    pool->setStrideNd(DimsHW{ 1, 1 });
    return pool;
  };

  IPoolingLayer* pool1 = add_max_pool(*cv1->getOutput(0), k1);
  IPoolingLayer* pool2 = add_max_pool(*cv1->getOutput(0), k2);
  IPoolingLayer* pool3 = add_max_pool(*cv1->getOutput(0), k3);

  ITensor* pyramid[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
  auto cat = network->addConcatenation(pyramid, 4);

  return convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
}

// SPPF block: convBlock "<lname>.cv1" (c1 / 2 channels), then three k x k max
// poolings chained one after another (stride 1, padding k / 2), each taking
// the previous pooling's output. The cv1 output and the three pooled tensors
// are concatenated in that order and fed to convBlock "<lname>.cv2" with c2
// output channels, which is returned.
static ILayer* SPPF(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k, std::string lname) {
  const int hidden = c1 / 2;
  auto cv1 = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");

  // Adds one k x k max pooling on src with stride 1 and padding k / 2.
  auto add_max_pool = [&](ITensor& src) -> IPoolingLayer* {
    IPoolingLayer* pool = network->addPoolingNd(src, PoolingType::kMAX, DimsHW{ k, k });
    pool->setPaddingNd(DimsHW{ k / 2, k / 2 });
    pool->setStrideNd(DimsHW{ 1, 1 });
    return pool;
  };

  IPoolingLayer* pool1 = add_max_pool(*cv1->getOutput(0));
  IPoolingLayer* pool2 = add_max_pool(*pool1->getOutput(0));
  IPoolingLayer* pool3 = add_max_pool(*pool2->getOutput(0));

  ITensor* pyramid[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
  auto cat = network->addConcatenation(pyramid, 4);
  return convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
}

// Proto block: 3x3 convBlock "<lname>.cv1" (c_ channels), nearest-neighbour
// resize with scales {1, 2, 2}, 3x3 convBlock "<lname>.cv2" (c_ channels) and
// 1x1 convBlock "<lname>.cv3" (c2 channels). Returns the last convBlock.
static ILayer* Proto(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c_, int c2, std::string lname) {
  auto head = convBlock(network, weightMap, input, c_, 3, 1, 1, lname + ".cv1");

  // Resize by the per-dimension factors {1, 2, 2} using nearest-neighbour mode.
  auto resize = network->addResize(*head->getOutput(0));
  assert(resize);
  resize->setResizeMode(ResizeMode::kNEAREST);
  const float factors[] = {1, 2, 2};
  resize->setScales(factors, 3);

  auto mid = convBlock(network, weightMap, *resize->getOutput(0), c_, 3, 1, 1, lname + ".cv2");
  auto tail = convBlock(network, weightMap, *mid->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
  assert(tail);
  return tail;
}

static std::vector<std::vector<float>> getAnchors(std::map<std::string, Weights>& weightMap, std::string lname) {
  std::vector<std::vector<float>> anchors;
  Weights wts = weightMap[lname + ".anchor_grid"];
  int anchor_len = kNumAnchor * 2;
  for (int i = 0; i < wts.count / anchor_len; i++) {
    auto *p = (const float*)wts.values + i * anchor_len;
    std::vector<float> anchor(p, p + anchor_len);
    anchors.push_back(anchor);
  }
  return anchors;
}

// Creates the "YoloLayer_TRT" plugin and attaches it to the outputs of the given
// detection convolutions.
//
// - weightMap / lname: anchors are read from lname + ".anchor_grid" (via getAnchors) and
//   strides from lname + ".strides".
// - dets: detection convolution layers; output 0 of each becomes a plugin input.
// - is_segmentation: forwarded to the plugin as the fifth "netinfo" value.
//
// Returns the plugin layer added to the network. Preconditions are checked with assert,
// matching the rest of this file.
static IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, std::string lname, std::vector<IConvolutionLayer*> dets, bool is_segmentation = false) {
  auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
  // getPluginCreator yields nullptr when the plugin has not been registered.
  assert(creator && "Plugin creator `YoloLayer_TRT` (version 1) is not registered");

  auto anchors = getAnchors(weightMap, lname);
  assert(!anchors.empty() && "Not found `anchor_grid`, please check gen_wts.py!!!");
  assert(!dets.empty() && "At least one detection layer is required");

  PluginField plugin_fields[2];
  // NOTE(review): netinfo holds ints but the field is tagged kFLOAT32; presumably the
  // plugin creator reads the raw pointer and ignores the type tag - confirm before changing.
  int netinfo[5] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox, (int)is_segmentation};
  plugin_fields[0].data = netinfo;
  plugin_fields[0].length = 5;
  plugin_fields[0].name = "netinfo";
  plugin_fields[0].type = PluginFieldType::kFLOAT32;

  //load strides from Detect layer
  assert(weightMap.find(lname + ".strides") != weightMap.end() && "Not found `strides`, please check gen_wts.py!!!");
  Weights strides = weightMap[lname + ".strides"];
  auto *p = (const float*)(strides.values);
  std::vector<int> scales(p, p + strides.count);
  // Every anchor group needs its own stride, otherwise scales[i] below reads out of bounds.
  assert(scales.size() >= anchors.size() && "Fewer strides than anchor groups");

  std::vector<YoloKernel> kernels;
  kernels.reserve(anchors.size());
  for (size_t i = 0; i < anchors.size(); i++) {
    assert(scales[i] > 0 && "Stride must be positive");
    YoloKernel kernel;
    kernel.width = kInputW / scales[i];
    kernel.height = kInputH / scales[i];
    memcpy(kernel.anchors, anchors[i].data(), anchors[i].size() * sizeof(float));
    kernels.push_back(kernel);
  }
  plugin_fields[1].data = kernels.data();
  plugin_fields[1].length = kernels.size();
  plugin_fields[1].name = "kernels";
  plugin_fields[1].type = PluginFieldType::kFLOAT32;
  PluginFieldCollection plugin_data;
  plugin_data.nbFields = 2;
  plugin_data.fields = plugin_fields;
  IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data);
  assert(plugin_obj && "Failed to create the yololayer plugin");

  std::vector<ITensor*> input_tensors;
  input_tensors.reserve(dets.size());
  for (auto det: dets) {
    input_tensors.push_back(det->getOutput(0));
  }
  auto yolo = network->addPluginV2(input_tensors.data(), input_tensors.size(), *plugin_obj);
  assert(yolo);
  return yolo;
}

// Builds the detection engine with three detection heads (weights "model.0" .. "model.24").
//
// - maxBatchSize: forwarded to builder->setMaxBatchSize().
// - builder / config: TensorRT builder and its configuration; config receives the workspace
//   size and, depending on USE_FP16 / USE_INT8, the precision flag (and an INT8 calibrator).
// - dt: data type of the network input tensor.
// - gd / gw: passed to get_depth() / get_width() to size the C3 blocks and channel counts.
// - wts_name: handed to loadWeights() to populate the weight map.
//
// Returns the result of builder->buildEngineWithConfig(); it is not checked here, so the
// "successfully" message is printed regardless. The network is destroyed and every buffer
// in the weight map is free()d before returning.
ICudaEngine* build_det_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
  INetworkDefinition* network = builder->createNetworkV2(0U);

  // Create input tensor of shape {3, kInputH, kInputW}
  ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
  assert(data);
  std::map<std::string, Weights> weightMap = loadWeights(wts_name);

  // Backbone
  auto conv0 = convBlock(network, weightMap, *data,  get_width(64, gw), 6, 2, 1,  "model.0");
  assert(conv0);
  auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
  auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
  auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
  auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
  auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
  auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
  auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
  auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
  auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.9");

  // Head
  auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");

  // Resize conv10 to the dimensions of bottleneck_csp6 so the two can be concatenated.
  auto upsample11 = network->addResize(*conv10->getOutput(0));
  assert(upsample11);
  upsample11->setResizeMode(ResizeMode::kNEAREST);
  upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());

  ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
  auto cat12 = network->addConcatenation(inputTensors12, 2);
  auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
  auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");

  // Resize conv14 to the dimensions of bottleneck_csp4 so the two can be concatenated.
  auto upsample15 = network->addResize(*conv14->getOutput(0));
  assert(upsample15);
  upsample15->setResizeMode(ResizeMode::kNEAREST);
  upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());

  ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
  auto cat16 = network->addConcatenation(inputTensors16, 2);

  auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");

  // Detect
  // Each det* is a 1x1 convolution with kNumAnchor * (kNumClass + 5) output maps.
  IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
  auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
  ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
  auto cat19 = network->addConcatenation(inputTensors19, 2);
  auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
  IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
  auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
  ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
  auto cat22 = network->addConcatenation(inputTensors22, 2);
  auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
  IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);

  // The plugin layer consumes all three detection outputs and is the only network output.
  auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2});
  yolo->getOutput(0)->setName(kOutputTensorName);
  network->markOutput(*yolo->getOutput(0));

  // Engine config
  builder->setMaxBatchSize(maxBatchSize);
  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
  config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
  assert(builder->platformHasFastInt8());
  config->setFlag(BuilderFlag::kINT8);
  // NOTE(review): the calibrator is allocated with new and never deleted in this function.
  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
  config->setInt8Calibrator(calibrator);
#endif

  std::cout << "Building engine, please wait for a while..." << std::endl;
  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Don't need the network any more
  network->destroy();

  // Release host memory
  for (auto& mem : weightMap) {
    free((void*)(mem.second.values));
  }

  return engine;
}

// Builds the P6 detection engine with four detection heads (weights "model.0" .. "model.33").
//
// - maxBatchSize: forwarded to builder->setMaxBatchSize().
// - builder / config: TensorRT builder and its configuration; config receives the workspace
//   size and, depending on USE_FP16 / USE_INT8, the precision flag (and an INT8 calibrator).
// - dt: data type of the network input tensor.
// - gd / gw: passed to get_depth() / get_width() to size the C3 blocks and channel counts.
// - wts_name: handed to loadWeights() to populate the weight map.
//
// Returns the result of builder->buildEngineWithConfig(); it is not checked here, so the
// "successfully" message is printed regardless. The network is destroyed and every buffer
// in the weight map is free()d before returning.
ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
  INetworkDefinition* network = builder->createNetworkV2(0U);

  // Create input tensor of shape {3, kInputH, kInputW}
  ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
  assert(data);

  std::map<std::string, Weights> weightMap = loadWeights(wts_name);

  // Backbone
  auto conv0 = convBlock(network, weightMap, *data,  get_width(64, gw), 6, 2, 1,  "model.0");
  auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
  auto c3_2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
  auto conv3 = convBlock(network, weightMap, *c3_2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
  auto c3_4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
  auto conv5 = convBlock(network, weightMap, *c3_4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
  auto c3_6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
  auto conv7 = convBlock(network, weightMap, *c3_6->getOutput(0), get_width(768, gw), 3, 2, 1, "model.7");
  auto c3_8 = C3(network, weightMap, *conv7->getOutput(0), get_width(768, gw), get_width(768, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
  auto conv9 = convBlock(network, weightMap, *c3_8->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.9");
  auto c3_10 = C3(network, weightMap, *conv9->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.10");
  auto sppf11 = SPPF(network, weightMap, *c3_10->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.11");

  // Head
  // Each upsample below is resized to the dimensions of the backbone tensor it is concatenated with.
  auto conv12 = convBlock(network, weightMap, *sppf11->getOutput(0), get_width(768, gw), 1, 1, 1, "model.12");
  auto upsample13 = network->addResize(*conv12->getOutput(0));
  assert(upsample13);
  upsample13->setResizeMode(ResizeMode::kNEAREST);
  upsample13->setOutputDimensions(c3_8->getOutput(0)->getDimensions());
  ITensor* inputTensors14[] = { upsample13->getOutput(0), c3_8->getOutput(0) };
  auto cat14 = network->addConcatenation(inputTensors14, 2);
  auto c3_15 = C3(network, weightMap, *cat14->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.15");

  auto conv16 = convBlock(network, weightMap, *c3_15->getOutput(0), get_width(512, gw), 1, 1, 1, "model.16");
  auto upsample17 = network->addResize(*conv16->getOutput(0));
  assert(upsample17);
  upsample17->setResizeMode(ResizeMode::kNEAREST);
  upsample17->setOutputDimensions(c3_6->getOutput(0)->getDimensions());
  ITensor* inputTensors18[] = { upsample17->getOutput(0), c3_6->getOutput(0) };
  auto cat18 = network->addConcatenation(inputTensors18, 2);
  auto c3_19 = C3(network, weightMap, *cat18->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.19");

  auto conv20 = convBlock(network, weightMap, *c3_19->getOutput(0), get_width(256, gw), 1, 1, 1, "model.20");
  auto upsample21 = network->addResize(*conv20->getOutput(0));
  assert(upsample21);
  upsample21->setResizeMode(ResizeMode::kNEAREST);
  upsample21->setOutputDimensions(c3_4->getOutput(0)->getDimensions());
  // Note: this array is named inputTensors21 although it feeds cat22.
  ITensor* inputTensors21[] = { upsample21->getOutput(0), c3_4->getOutput(0) };
  auto cat22 = network->addConcatenation(inputTensors21, 2);
  auto c3_23 = C3(network, weightMap, *cat22->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.23");

  auto conv24 = convBlock(network, weightMap, *c3_23->getOutput(0), get_width(256, gw), 3, 2, 1, "model.24");
  ITensor* inputTensors25[] = { conv24->getOutput(0), conv20->getOutput(0) };
  auto cat25 = network->addConcatenation(inputTensors25, 2);
  auto c3_26 = C3(network, weightMap, *cat25->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.26");

  auto conv27 = convBlock(network, weightMap, *c3_26->getOutput(0), get_width(512, gw), 3, 2, 1, "model.27");
  ITensor* inputTensors28[] = { conv27->getOutput(0), conv16->getOutput(0) };
  auto cat28 = network->addConcatenation(inputTensors28, 2);
  auto c3_29 = C3(network, weightMap, *cat28->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.29");

  auto conv30 = convBlock(network, weightMap, *c3_29->getOutput(0), get_width(768, gw), 3, 2, 1, "model.30");
  ITensor* inputTensors31[] = { conv30->getOutput(0), conv12->getOutput(0) };
  auto cat31 = network->addConcatenation(inputTensors31, 2);
  auto c3_32 = C3(network, weightMap, *cat31->getOutput(0), get_width(2048, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.32");

  // Detect
  // Each det* is a 1x1 convolution with kNumAnchor * (kNumClass + 5) output maps.
  IConvolutionLayer* det0 = network->addConvolutionNd(*c3_23->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.0.weight"], weightMap["model.33.m.0.bias"]);
  IConvolutionLayer* det1 = network->addConvolutionNd(*c3_26->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.1.weight"], weightMap["model.33.m.1.bias"]);
  IConvolutionLayer* det2 = network->addConvolutionNd(*c3_29->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.2.weight"], weightMap["model.33.m.2.bias"]);
  IConvolutionLayer* det3 = network->addConvolutionNd(*c3_32->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.3.weight"], weightMap["model.33.m.3.bias"]);

  // The plugin layer consumes all four detection outputs and is the only network output.
  auto yolo = addYoLoLayer(network, weightMap, "model.33", std::vector<IConvolutionLayer*>{det0, det1, det2, det3});
  yolo->getOutput(0)->setName(kOutputTensorName);
  network->markOutput(*yolo->getOutput(0));

  // Engine config
  builder->setMaxBatchSize(maxBatchSize);
  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
  config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
  assert(builder->platformHasFastInt8());
  config->setFlag(BuilderFlag::kINT8);
  // NOTE(review): the calibrator is allocated with new and never deleted in this function.
  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
  config->setInt8Calibrator(calibrator);
#endif

  std::cout << "Building engine, please wait for a while..." << std::endl;
  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Don't need the network any more
  network->destroy();

  // Release host memory
  for (auto& mem : weightMap) {
    free((void*)(mem.second.values));
  }

  return engine;
}

// Builds the classification engine: backbone "model.0" .. "model.8", then a conv block,
// an average pool over the whole feature map and a fully connected layer with
// kClsNumClass outputs ("model.9.*").
//
// - maxBatchSize: forwarded to builder->setMaxBatchSize().
// - builder / config: TensorRT builder and its configuration; config receives the workspace
//   size and, depending on USE_FP16 / USE_INT8, the precision flag (and an INT8 calibrator).
// - dt: data type of the network input tensor.
// - gd / gw: passed to get_depth() / get_width() to size the C3 blocks and channel counts.
// - wts_name: handed to loadWeights() to populate the weight map.
//
// Returns the result of builder->buildEngineWithConfig(). The network is destroyed and
// every buffer in the weight map is free()d before returning.
ICudaEngine* build_cls_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
  INetworkDefinition* network = builder->createNetworkV2(0U);

  // Create input tensor
  ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kClsInputH, kClsInputW });
  assert(data);
  std::map<std::string, Weights> weightMap = loadWeights(wts_name);

  // Backbone
  auto conv0 = convBlock(network, weightMap, *data,  get_width(64, gw), 6, 2, 1,  "model.0");
  assert(conv0);
  auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
  auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
  auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
  auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
  auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
  auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
  auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
  auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");

  // Head
  auto conv_class = convBlock(network, weightMap, *bottleneck_csp8->getOutput(0), 1280, 1, 1, 1, "model.9.conv");
  // The pooling window must span the whole feature map, which is 1/32 of the input in each
  // dimension. Deriving both sides from the height alone was only right for square inputs.
  const int pool_h = kClsInputH / 32;
  const int pool_w = kClsInputW / 32;
  IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), PoolingType::kAVERAGE, DimsHW{ pool_h, pool_w });
  assert(pool2);
  IFullyConnectedLayer* yolo = network->addFullyConnected(*pool2->getOutput(0), kClsNumClass, weightMap["model.9.linear.weight"], weightMap["model.9.linear.bias"]);
  assert(yolo);

  yolo->getOutput(0)->setName(kOutputTensorName);
  network->markOutput(*yolo->getOutput(0));

  // Engine config
  builder->setMaxBatchSize(maxBatchSize);
  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB

#if defined(USE_FP16)
  config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
  assert(builder->platformHasFastInt8());
  config->setFlag(BuilderFlag::kINT8);
  // Width first, then height, matching the calibrator calls in the other builders
  // (the original passed kClsInputW twice).
  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
  config->setInt8Calibrator(calibrator);
#endif

  std::cout << "Building engine, please wait for a while..." << std::endl;
  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Don't need the network any more
  network->destroy();

  // Release host memory
  for (auto& mem : weightMap) {
    free((void*)(mem.second.values));
  }

  return engine;
}

// Builds the segmentation engine: the same backbone/head layout as build_det_engine, but
// each detection convolution carries 32 extra values per anchor and a second network
// output named "proto" is produced by Proto().
//
// - maxBatchSize: forwarded to builder->setMaxBatchSize().
// - builder / config: TensorRT builder and its configuration; config receives the workspace
//   size and, depending on USE_FP16 / USE_INT8, the precision flag (and an INT8 calibrator).
// - dt: data type of the network input tensor.
// - gd / gw: passed to get_depth() / get_width() to size the C3 blocks and channel counts.
// - wts_name: handed to loadWeights() to populate the weight map.
//
// Returns the result of builder->buildEngineWithConfig(); it is not checked here, so the
// "successfully" message is printed regardless. The network is destroyed and every buffer
// in the weight map is free()d before returning.
ICudaEngine* build_seg_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
  INetworkDefinition* network = builder->createNetworkV2(0U);
  ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
  assert(data);
  std::map<std::string, Weights> weightMap = loadWeights(wts_name);

  // Backbone
  auto conv0 = convBlock(network, weightMap, *data,  get_width(64, gw), 6, 2, 1,  "model.0");
  assert(conv0);
  auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
  auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
  auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
  auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
  auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
  auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
  auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
  auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
  auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.9");

  // Head
  auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");

  // Resize conv10 to the dimensions of bottleneck_csp6 so the two can be concatenated.
  auto upsample11 = network->addResize(*conv10->getOutput(0));
  assert(upsample11);
  upsample11->setResizeMode(ResizeMode::kNEAREST);
  upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());

  ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
  auto cat12 = network->addConcatenation(inputTensors12, 2);
  auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
  auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");

  // Resize conv14 to the dimensions of bottleneck_csp4 so the two can be concatenated.
  auto upsample15 = network->addResize(*conv14->getOutput(0));
  assert(upsample15);
  upsample15->setResizeMode(ResizeMode::kNEAREST);
  upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());

  ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
  auto cat16 = network->addConcatenation(inputTensors16, 2);

  auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");

  // Segmentation
  // Each det* is a 1x1 convolution with kNumAnchor * (32 + kNumClass + 5) output maps.
  IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), kNumAnchor * (32 + kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
  auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
  ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
  auto cat19 = network->addConcatenation(inputTensors19, 2);
  auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
  IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), kNumAnchor * (32 + kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
  auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
  ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
  auto cat22 = network->addConcatenation(inputTensors22, 2);
  auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
  IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), kNumAnchor * (32 + kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);

  // First network output: the plugin layer, created with is_segmentation = true.
  auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2}, true);
  yolo->getOutput(0)->setName(kOutputTensorName);
  network->markOutput(*yolo->getOutput(0));

  // Second network output: the 32-channel prototype tensor built from bottleneck_csp17.
  auto proto = Proto(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 32, "model.24.proto");
  proto->getOutput(0)->setName("proto");
  network->markOutput(*proto->getOutput(0));

  // Engine config
  builder->setMaxBatchSize(maxBatchSize);
  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
  config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
  assert(builder->platformHasFastInt8());
  config->setFlag(BuilderFlag::kINT8);
  // NOTE(review): the calibrator is allocated with new and never deleted in this function.
  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
  config->setInt8Calibrator(calibrator);
#endif

  std::cout << "Building engine, please wait for a while..." << std::endl;
  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Don't need the network any more
  network->destroy();

  // Release host memory
  for (auto& mem : weightMap) {
    free((void*)(mem.second.values));
  }

  return engine;
}


================================================
FILE: yolov5/src/model.h
================================================
#pragma once

#include <NvInfer.h>
#include <string>

// All four builders share one signature:
//   maxBatchSize - forwarded to the builder as the maximum batch size
//   builder/config - TensorRT builder and builder configuration to use
//   dt - data type of the network input tensor
//   gd, gw - depth / width multipliers used to size the network
//   wts_name - weights file to load
// Each returns the built engine.

// Detection network with three detection heads.
nvinfer1::ICudaEngine* build_det_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                        nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                        float& gd, float& gw, std::string& wts_name);

// Detection network with four detection heads (P6 variant).
nvinfer1::ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                           nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                           float& gd, float& gw, std::string& wts_name);

// Classification network.
nvinfer1::ICudaEngine* build_cls_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name);

// Segmentation network; produces an additional "proto" output.
nvinfer1::ICudaEngine* build_seg_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name);


================================================
FILE: yolov5/src/postprocess.cpp
================================================
#include "postprocess.h"
#include "utils.h"

// Maps a box given as {center_x, center_y, width, height} in network-input coordinates
// (kInputW x kInputH) back to pixel coordinates of img: the padding on the
// axis that was not the limiting one is removed, then the result is divided by the
// smaller of the two resize ratios. The result is rounded and not clipped to the image.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
  const float r_w = kInputW / (img.cols * 1.0);
  const float r_h = kInputH / (img.rows * 1.0);

  // Pick the ratio that was applied and the padding it left on the other axis.
  float ratio;
  float pad_x;
  float pad_y;
  if (r_h > r_w) {
    ratio = r_w;
    pad_x = 0.f;
    pad_y = (kInputH - r_w * img.rows) / 2;
  } else {
    ratio = r_h;
    pad_x = (kInputW - r_h * img.cols) / 2;
    pad_y = 0.f;
  }

  const float l = (bbox[0] - bbox[2] / 2.f - pad_x) / ratio;
  const float r = (bbox[0] + bbox[2] / 2.f - pad_x) / ratio;
  const float t = (bbox[1] - bbox[3] / 2.f - pad_y) / ratio;
  const float b = (bbox[1] + bbox[3] / 2.f - pad_y) / ratio;
  return cv::Rect(round(l), round(t), round(r - l), round(b - t));
}

// Intersection-over-union of two boxes given as {center_x, center_y, width, height}.
// Returns 0 when the boxes do not overlap, and also when the union area is not positive
// (e.g. two zero-sized boxes), where the plain quotient would be 0/0 = NaN.
static float iou(float lbox[4], float rbox[4]) {
  const float left   = (std::max)(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
  const float right  = (std::min)(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
  const float top    = (std::max)(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
  const float bottom = (std::min)(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);

  if (top > bottom || left > right)
    return 0.0f;

  const float interBoxS = (right - left) * (bottom - top);
  const float unionS = lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS;
  // Degenerate boxes: avoid dividing by a zero (or negative) union area.
  if (unionS <= 0.0f)
    return 0.0f;
  return interBoxS / unionS;
}

// Ordering predicate for sorting detections by descending confidence.
static bool cmp(const Detection& a, const Detection& b) {
  return b.conf < a.conf;
}

void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
  int det_size = sizeof(Detection) / sizeof(float);
  std::map<float, std::vector<Detection>> m;
  for (int i = 0; i < output[0] && i < kMaxNumOutputBbox; i++) {
    if (output[1 + det_size * i + 4] <= conf_thresh) continue;
    Detection det;
    memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
    if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Detection>());
    m[det.class_id].push_back(det);
  }
  for (auto it = m.begin(); it != m.end(); it++) {
    auto& dets = it->second;
    std::sort(dets.begin(), dets.end(), cmp);
    for (size_t m = 0; m < dets.size(); ++m) {
      auto& item = dets[m];
      res.push_back(item);
      for (size_t n = m + 1; n < dets.size(); ++n) {
        if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
          dets.erase(dets.begin() + n);
          --n;
        }
      }
    }
  }
}

// Runs nms() once per batch item. Item i reads its records starting at
// output + i * output_size and appends the kept detections to res_batch[i].
// res_batch is resized to batch_size first.
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh) {
  res_batch.resize(batch_size);
  int idx = 0;
  while (idx < batch_size) {
    float* item_output = output + idx * output_size;
    nms(res_batch[idx], item_output, conf_thresh, nms_thresh);
    ++idx;
  }
}

// Draws every detection of res_batch[i] onto img_batch[i]: a rectangle in image
// coordinates (via get_rect) plus the integer class id as text just above its top-left corner.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
  const cv::Scalar box_color(0x27, 0xC1, 0x36);
  const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

  for (size_t i = 0; i < img_batch.size(); i++) {
    auto& detections = res_batch[i];
    // cv::Mat copy shares pixel data, so drawing on it modifies img_batch[i].
    cv::Mat canvas = img_batch[i];
    for (auto& det : detections) {
      cv::Rect box = get_rect(canvas, det.bbox);
      cv::rectangle(canvas, box, box_color, 2);
      const cv::Point text_origin(box.x, box.y - 1);
      cv::putText(canvas, std::to_string((int)det.class_id), text_origin, cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
    }
  }
}

// Converts a {center_x, center_y, width, height} box to a corner-based rectangle
// and divides all coordinates by scale. The result is rounded and not clipped.
static cv::Rect get_downscale_rect(float bbox[4], float scale) {
  const float half_w = bbox[2] / 2;
  const float half_h = bbox[3] / 2;

  const float left = (bbox[0] - half_w) / scale;
  const float top = (bbox[1] - half_h) / scale;
  const float right = (bbox[0] + half_w) / scale;
  const float bottom = (bbox[1] + half_h) / scale;

  return cv::Rect(round(left), round(top), round(right - left), round(bottom - top));
}

std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets) {
  std::vector<cv::Mat> masks;
  for (size_t i = 0; i < dets.size(); i++) {
    cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1);
    auto r = get_downscale_rect(dets[i].bbox, 4);
    for (int x = r.x; x < r.x + r.width; x++) {
      for (int y = r.y; y < r.y + r.height; y++) {
        float e = 0.0f;
        for (int j = 0; j < 32; j++) {
          e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
        }
        e = 1.0f / (1.0f + expf(-e));
        mask_mat.at<float>(y, x) = e;
      }
    }
    cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH));
    masks.push_back(mask_mat);
  }
  return masks;
}

// Crops the padded border out of a kInputW x kInputH mask and resizes the remaining
// region to the size of img. The crop is centered on the axis that was padded.
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
  const float r_w = kInputW / (img.cols * 1.0);
  const float r_h = kInputH / (img.rows * 1.0);

  cv::Rect content;
  if (r_h > r_w) {
    // Full width is used; padding sits above and below.
    content.width = kInputW;
    content.height = r_w * img.rows;
    content.x = 0;
    content.y = (kInputH - content.height) / 2;
  } else {
    // Full height is used; padding sits left and right.
    content.width = r_h * img.cols;
    content.height = kInputH;
    content.x = (kInputW - content.width) / 2;
    content.y = 0;
  }

  cv::Mat scaled;
  cv::resize(mask(content), scaled, img.size());
  return scaled;
}

// Draws masks, boxes and labels for all detections onto img.
//
// - img: image to draw on, accessed as 3-channel 8-bit (cv::Vec3b).
// - dets: detections; dets[i].bbox is in network-input coordinates.
// - masks: one float mask per detection, as produced by process_mask().
// - labels_map: class id -> label text (looked up with operator[]).
//
// For each detection, pixels inside its box whose mask value exceeds 0.5 are blended
// 50/50 with the class color; then the box outline, a filled label background and the
// text "<label> <conf>" are drawn.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks, std::unordered_map<int, std::string>& labels_map) {
  static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A,
                                         0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF,
                                         0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF,
                                         0xFF95C8, 0xFF37C7};
  for (size_t i = 0; i < dets.size(); i++) {
    cv::Mat img_mask = scale_mask(masks[i], img);
    auto color = colors[(int)dets[i].class_id % colors.size()];
    // Colors are stored as 0xRRGGBB; OpenCV expects B, G, R order.
    auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

    cv::Rect r = get_rect(img, dets[i].bbox);
    // get_rect() does not clip; restrict per-pixel access to the part of the box that
    // lies inside the image (img_mask has the same size as img).
    const cv::Rect visible = r & cv::Rect(0, 0, img.cols, img.rows);
    for (int y = visible.y; y < visible.y + visible.height; y++) {
      for (int x = visible.x; x < visible.x + visible.width; x++) {
        float val = img_mask.at<float>(y, x);
        if (val <= 0.5) continue;
        cv::Vec3b& px = img.at<cv::Vec3b>(y, x);
        px[0] = px[0] / 2 + bgr[0] / 2;
        px[1] = px[1] / 2 + bgr[1] / 2;
        px[2] = px[2] / 2 + bgr[2] / 2;
      }
    }

    cv::rectangle(img, r, bgr, 2);

    const std::string label = labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf);

    // Get the size of the text
    cv::Size textSize = cv::getTextSize(label, cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);
    // Set the top left corner of the rectangle
    cv::Point topLeft(r.x, r.y - textSize.height);

    // Set the bottom right corner of the rectangle
    cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);

    // Draw the filled label background on the image
    cv::rectangle(img, topLeft, bottomRight, bgr, -1);

    cv::putText(img, label, cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);

  }
}


================================================
FILE: yolov5/src/postprocess.h
================================================
#pragma once

#include "types.h"
#include <opencv2/opencv.hpp>

// Maps a {center_x, center_y, width, height} box from network-input coordinates to
// pixel coordinates of img.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Per-class non-maximum suppression over one image's raw output; kept detections are
// appended to res. Records with confidence <= conf_thresh are ignored.
void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5);

// Runs nms() for each of batch_size images; image i starts at output + i * output_size.
void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);

// Draws boxes and class ids of res_batch[i] onto img_batch[i].
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

// Builds one kInputW x kInputH float mask per detection from the prototype buffer.
std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets);

// Blends masks into img and draws boxes plus "<label> <conf>" text for every detection.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks, std::unordered_map<int, std::string>& labels_map);


================================================
FILE: yolov5/src/preprocess.cu
================================================
#include "preprocess.h"
#include "cuda_utils.h"

// Staging buffers shared by every cuda_preprocess() call. They are allocated by
// cuda_preprocess_init() and released by cuda_preprocess_destroy().
// Pinned host buffer (cudaMallocHost): source of the async host-to-device copy.
static uint8_t* img_buffer_host = nullptr;
// Device buffer (cudaMalloc): holds the raw source image read by the warp kernel.
static uint8_t* img_buffer_device = nullptr;

// Row-major 2x3 affine transform, passed to the kernel by value:
//   x' = value[0] * x + value[1] * y + value[2]
//   y' = value[3] * x + value[4] * y + value[5]
struct AffineMatrix {
  float value[6];
};

// Warps a packed 3-channel 8-bit image into a planar, normalized float image.
//
// Each thread handles one destination pixel (`position` is the flat pixel index):
// it maps the pixel back into the source with `d2s`, bilinearly interpolates the
// four neighbouring source pixels (neighbours outside the source use
// `const_value_st`), swaps channels 0 and 2, divides by 255 and writes the three
// channels into three consecutive planes of `dst`.
//
//   src            packed source pixels, 3 bytes per pixel
//   src_line_size  bytes per source row
//   src_width/src_height  source size in pixels
//   dst            output, 3 planes of dst_width * dst_height floats
//   const_value_st fill value for samples outside the source
//   d2s            destination-to-source affine transform
//   edge           number of destination pixels; threads beyond it exit
__global__ void warpaffine_kernel(
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width,
    int dst_height, uint8_t const_value_st,
    AffineMatrix d2s, int edge) {
  // Flat destination pixel index for this thread.
  int position = blockDim.x * blockIdx.x + threadIdx.x;
  if (position >= edge) return;

  float m_x1 = d2s.value[0];
  float m_y1 = d2s.value[1];
  float m_z1 = d2s.value[2];
  float m_x2 = d2s.value[3];
  float m_y2 = d2s.value[4];
  float m_z2 = d2s.value[5];

  // Destination pixel coordinates and the corresponding source position.
  int dx = position % dst_width;
  int dy = position / dst_width;
  // NOTE(review): the +0.5f offset looks like a pixel-center adjustment — confirm.
  float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
  float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
  float c0, c1, c2;

  if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
    // out of range
    c0 = const_value_st;
    c1 = const_value_st;
    c2 = const_value_st;
  } else {
    // Integer corners of the 2x2 neighbourhood around the sample position.
    int y_low = floorf(src_y);
    int x_low = floorf(src_x);
    int y_high = y_low + 1;
    int x_high = x_low + 1;

    uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
    // Fractional offsets and the four bilinear weights.
    float ly = src_y - y_low;
    float lx = src_x - x_low;
    float hy = 1 - ly;
    float hx = 1 - lx;
    float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
    // Every neighbour defaults to the fill pixel and is replaced below only when
    // it lies inside the source image.
    uint8_t* v1 = const_value;
    uint8_t* v2 = const_value;
    uint8_t* v3 = const_value;
    uint8_t* v4 = const_value;

    if (y_low >= 0) {
      if (x_low >= 0)
        v1 = src + y_low * src_line_size + x_low * 3;

      if (x_high < src_width)
        v2 = src + y_low * src_line_size + x_high * 3;
    }

    if (y_high < src_height) {
      if (x_low >= 0)
        v3 = src + y_high * src_line_size + x_low * 3;

      if (x_high < src_width)
        v4 = src + y_high * src_line_size + x_high * 3;
    }

    // Weighted sum per channel.
    c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
    c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
    c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
  }

  // bgr to rgb: swap the first and third channel
  float t = c2;
  c2 = c0;
  c0 = t;

  // normalization to [0, 1]
  c0 = c0 / 255.0f;
  c1 = c1 / 255.0f;
  c2 = c2 / 255.0f;

  // rgbrgbrgb to rrrgggbbb: one plane of `area` floats per channel
  int area = dst_width * dst_height;
  float* pdst_c0 = dst + dy * dst_width + dx;
  float* pdst_c1 = pdst_c0 + area;
  float* pdst_c2 = pdst_c1 + area;
  *pdst_c0 = c0;
  *pdst_c1 = c1;
  *pdst_c2 = c2;
}

void cuda_preprocess(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    cudaStream_t stream) {

  int img_size = src_width * src_height * 3;
  // copy data to pinned memory
  memcpy(img_buffer_host, src, img_size);
  // copy data to device memory
  CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

  AffineMatrix s2d, d2s;
  float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

  s2d.value[0] = scale;
  s2d.value[1] = 0;
  s2d.value[2] = -scale * src_width  * 0.5  + dst_width * 0.5;
  s2d.value[3] = 0;
  s2d.value[4] = scale;
  s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

  cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
  cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
  cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

  memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));

  int jobs = dst_height * dst_width;
  int threads = 256;
  int blocks = ceil(jobs / (float)threads);

  warpaffine_kernel<<<blocks, threads, 0, stream>>>(
      img_buffer_device, src_width * 3, src_width,
      src_height, dst, dst_width,
      dst_height, 128, d2s, jobs);
}

// Preprocesses every image of `img_batch` into consecutive slots of `dst`
// (each slot is 3 * dst_width * dst_height floats). The stream is synchronized
// after each image because cuda_preprocess() reuses one shared staging buffer.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
                           float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
  int floats_per_image = dst_width * dst_height * 3;
  size_t slot = 0;
  for (cv::Mat& image : img_batch) {
    float* slot_dst = &dst[floats_per_image * slot];
    cuda_preprocess(image.ptr(), image.cols, image.rows, slot_dst, dst_width, dst_height, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ++slot;
  }
}

// Allocates the staging buffers used by cuda_preprocess().
//
// max_image_size is the largest source image, in pixels (width * height), that
// will be passed to cuda_preprocess(); each buffer holds 3 bytes per pixel.
void cuda_preprocess_init(int max_image_size) {
  // Compute the byte count in size_t so that large pixel counts cannot overflow
  // the int multiplication before being handed to the allocators.
  const size_t buffer_bytes = static_cast<size_t>(max_image_size) * 3;
  // prepare input data in pinned memory
  CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, buffer_bytes));
  // prepare input data in device memory
  CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, buffer_bytes));
}

// Releases the staging buffers allocated by cuda_preprocess_init().
// The pointers are reset afterwards so that a repeated call (or a call without a
// prior init) is a harmless no-op instead of a double free.
void cuda_preprocess_destroy() {
  if (img_buffer_device != nullptr) {
    CUDA_CHECK(cudaFree(img_buffer_device));
    img_buffer_device = nullptr;
  }
  if (img_buffer_host != nullptr) {
    CUDA_CHECK(cudaFreeHost(img_buffer_host));
    img_buffer_host = nullptr;
  }
}


================================================
FILE: yolov5/src/preprocess.h
================================================
#pragma once

#include <cuda_runtime.h>
#include <cstdint>
#include <opencv2/opencv.hpp>

// Allocates the pinned-host and device staging buffers; each holds
// max_image_size * 3 bytes, i.e. max_image_size is a pixel count.
void cuda_preprocess_init(int max_image_size);
// Frees the buffers allocated by cuda_preprocess_init().
void cuda_preprocess_destroy();
// Queues on `stream` the conversion of one packed 3-channel 8-bit image
// (src_width x src_height) into a planar float image (dst_width x dst_height)
// written to device memory `dst`. Does not synchronize the stream.
void cuda_preprocess(uint8_t* src, int src_width, int src_height,
                     float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);
// Runs cuda_preprocess() for every image in `img_batch`, writing image i at
// offset i * 3 * dst_width * dst_height floats in `dst`, and synchronizes
// `stream` after each image.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
                           float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);


================================================
FILE: yolov5/src/types.h
================================================
#pragma once

#include "config.h"

// Description of one YOLO output level.
// NOTE(review): width/height are presumably the feature-map size of that level and
// `anchors` kNumAnchor (w, h) pairs — confirm against the code that fills it.
struct YoloKernel {
  int width;
  int height;
  float anchors[kNumAnchor * 2];
};

// One detection record. The struct consists only of floats and is float-aligned;
// yolov5_det.cpp sizes its output buffer with sizeof(Detection) / sizeof(float),
// i.e. the record is treated as a flat run of floats, so field order and count
// must not change without updating the producers/consumers of that buffer.
struct alignas(float) Detection {
  float bbox[4];  // center_x center_y w h
  float conf;  // bbox_conf * cls_conf
  float class_id;  // stored as float; consumers cast it to int
  float mask[32];  // NOTE(review): presumably per-detection mask coefficients for segmentation — confirm
};


================================================
FILE: yolov5/src/utils.h
================================================
#pragma once

#include <dirent.h>
#include <fstream>
#include <unordered_map>
#include <string>
#include <sstream>
#include <vector>
#include <cstring>

// Appends the name of every entry in directory `p_dir_name` (except "." and "..")
// to `file_names`. Only the bare entry names are stored, not full paths.
// Returns 0 on success, -1 if the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir = opendir(p_dir_name);
    if (!dir) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* name = entry->d_name;
        const bool is_dot_entry = (strcmp(name, ".") == 0) || (strcmp(name, "..") == 0);
        if (is_dot_entry) {
            continue;
        }
        file_names.push_back(std::string(name));
    }

    closedir(dir);
    return 0;
}

// Returns `str` with leading and trailing whitespace removed.
// Whitespace means space, tab, newline, carriage return, form feed and vertical
// tab, so lines read from CRLF files lose their trailing '\r' as well.
// A string that is empty or consists only of whitespace yields an empty string.
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char* const kWhitespace = " \t\n\r\f\v";
    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}

// Formats `a_value` in fixed-point notation with `n` digits after the decimal
// point (default 2).
// Src: https://stackoverflow.com/questions/16605967
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatter;
    formatter.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatter.precision(n);
    formatter << a_value;
    return formatter.str();
}

// Reads one label per line from `labels_filename` into `labels_map`, keyed by the
// zero-based line number. Each line is passed through trim_leading_whitespace()
// before being stored.
// Returns 0 on success and -1 if the file cannot be opened (in which case
// `labels_map` is left untouched).
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {

    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        // Report the failure instead of silently returning an empty map as success.
        return -1;
    }

    // Read each line of the file
    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        // Add the trimmed line to the labels_map, using the line index as the key
        labels_map[index] = trim_leading_whitespace(line);
        index++;
    }
    // Close the file
    file.close();

    return 0;
}


================================================
FILE: yolov5/yolov5_cls.cpp
================================================
#include "cuda_utils.h"
#include "logging.h"
#include "utils.h"
#include "model.h"
#include "config.h"
#include "calibrator.h"

#include <iostream>
#include <chrono>
#include <cmath>
#include <numeric>
#include <opencv2/opencv.hpp>

using namespace nvinfer1;

// Logger instance handed to the TensorRT builder and runtime factories below.
static Logger gLogger;
// Number of floats the classifier outputs per image: one score per class.
const static int kOutputSize = kClsNumClass;

// Converts a batch of 8-bit 3-channel images into the classifier's float input.
//
// Every image goes through preprocess_img() to kClsInputW x kClsInputH, then each
// pixel is scaled to [0, 1] and standardized per channel with mean
// (0.485, 0.456, 0.406) and std (0.229, 0.224, 0.225). Source channel 2 is written
// to the first plane, channel 1 to the second and channel 0 to the third, so the
// output of image b is three consecutive planes starting at b * 3 * rows * cols.
void batch_preprocess(std::vector<cv::Mat>& imgs, float* output) {
  for (size_t b = 0; b < imgs.size(); b++) {
    // cv::resize(imgs[b], img, cv::Size(kClsInputW, kClsInputH));
    cv::Mat img = preprocess_img(imgs[b], kClsInputW, kClsInputH);

    const int plane = img.rows * img.cols;
    float* image_out = output + b * 3 * img.rows * img.cols;

    int pixel = 0;
    for (int row = 0; row < img.rows; ++row) {
      const uchar* px = img.data + row * img.step;
      for (int col = 0; col < img.cols; ++col, px += 3, ++pixel) {
        image_out[pixel] = ((float)px[2] / 255.0 - 0.485) / 0.229;  // R - 0.485
        image_out[pixel + plane] = ((float)px[1] / 255.0 - 0.456) / 0.224;
        image_out[pixel + 2 * plane] = ((float)px[0] / 255.0 - 0.406) / 0.225;
      }
    }
  }
}

// Returns the softmax of the `n` logits in `prob` as a vector of probabilities.
//
// The maximum logit is subtracted before exponentiation. This leaves the result
// mathematically unchanged but keeps expf() from overflowing to infinity for
// large logits, which would otherwise produce NaN after the division.
// Returns an empty vector when n <= 0.
std::vector<float> softmax(float *prob, int n) {
  std::vector<float> res;
  if (n <= 0) {
    return res;
  }
  res.reserve(n);

  float max_logit = prob[0];
  for (int i = 1; i < n; i++) {
    if (prob[i] > max_logit) {
      max_logit = prob[i];
    }
  }

  float sum = 0.0f;
  for (int i = 0; i < n; i++) {
    const float t = expf(prob[i] - max_logit);
    res.push_back(t);
    sum += t;
  }
  // sum >= 1 because the maximum element contributes expf(0) == 1.
  for (int i = 0; i < n; i++) {
    res[i] /= sum;
  }
  return res;
}

// Returns the indices of the `k` largest values of `vec`, ordered from the
// largest value downwards. When `vec` has fewer than `k` elements, all indices
// are returned; a non-positive `k` yields an empty result.
std::vector<int> topk(const std::vector<float>& vec, int k) {
  // Sort the index list by descending value.
  std::vector<size_t> order(vec.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), [&vec](size_t lhs, size_t rhs) { return vec[lhs] > vec[rhs]; });

  const int keep = std::min<int>(vec.size(), k);

  std::vector<int> topk_index;
  for (auto it = order.begin(); it != order.end() && static_cast<int>(topk_index.size()) < keep; ++it) {
    topk_index.push_back(*it);
  }
  return topk_index;
}

// Loads class names from `file_name`, one name per line, in file order.
// If the file cannot be opened, an error is printed and assert(0) fires.
std::vector<std::string> read_classes(std::string file_name) {
  std::ifstream ifs(file_name, std::ios::in);
  if (!ifs.is_open()) {
    std::cerr << file_name << " is not found, pls refer to README and download it." << std::endl;
    assert(0);
  }

  std::vector<std::string> classes;
  for (std::string line; std::getline(ifs, line);) {
    classes.push_back(line);
  }
  ifs.close();
  return classes;
}

// Parses the command line of the classification demo.
//
//   -s <wts> <engine> <n|s|m|l|x>      serialize; gd/gw are set from the preset
//   -s <wts> <engine> c <gd> <gw>      serialize with custom depth/width multiples
//   -d <engine> <img_dir>              deserialize and run inference
//
// Only the first character of the model argument is inspected. Returns true when
// the arguments form one of the layouts above; `wts` and `engine` may already be
// assigned when a "-s" line is rejected because of the model argument.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw, std::string& img_dir) {
  if (argc < 4) return false;

  const std::string mode(argv[1]);

  if (mode == "-s" && (argc == 5 || argc == 7)) {
    wts = std::string(argv[2]);
    engine = std::string(argv[3]);
    const std::string net(argv[4]);
    switch (net[0]) {
      case 'n':
        gd = 0.33;
        gw = 0.25;
        break;
      case 's':
        gd = 0.33;
        gw = 0.50;
        break;
      case 'm':
        gd = 0.67;
        gw = 0.75;
        break;
      case 'l':
        gd = 1.0;
        gw = 1.0;
        break;
      case 'x':
        gd = 1.33;
        gw = 1.25;
        break;
      case 'c':
        // Custom multiples need the two extra arguments.
        if (argc != 7) return false;
        gd = atof(argv[5]);
        gw = atof(argv[6]);
        break;
      default:
        return false;
    }
    return true;
  }

  if (mode == "-d" && argc == 4) {
    engine = std::string(argv[2]);
    img_dir = std::string(argv[3]);
    return true;
  }

  return false;
}

// Allocates the device and host buffers for one batch of classification.
//
// Device: kBatchSize * 3 * kClsInputH * kClsInputW input floats and
// kBatchSize * kOutputSize output floats (cudaMalloc). Host: arrays of the same
// element counts (new[]). The caller owns all four buffers. The engine is expected
// to have exactly two bindings, input at index 0 and output at index 1.
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer, float** cpu_output_buffer) {
  assert(engine->getNbBindings() == 2);
  // Look the tensors up by name to verify the binding order this program relies on.
  // Note that indices are guaranteed to be less than IEngine::getNbBindings()
  const int inputIndex = engine->getBindingIndex(kInputTensorName);
  const int outputIndex = engine->getBindingIndex(kOutputTensorName);
  assert(inputIndex == 0);
  assert(outputIndex == 1);

  const auto input_count = kBatchSize * 3 * kClsInputH * kClsInputW;
  const auto output_count = kBatchSize * kOutputSize;

  // Device side
  CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, input_count * sizeof(float)));
  CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, output_count * sizeof(float)));

  // Host side
  *cpu_input_buffer = new float[input_count];
  *cpu_output_buffer = new float[output_count];
}

// Runs one batch through the engine and blocks until the results are on the host.
//
//   buffers[0] / buffers[1]  device input / output buffers
//   input                    host input, batchSize * 3 * kClsInputH * kClsInputW floats
//   output                   host output, batchSize * kOutputSize floats
//
// All work is queued on `stream`, which is synchronized before returning.
void infer(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
  CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), cudaMemcpyHostToDevice, stream));
  // enqueue() reports failure through its return value; surface it instead of
  // silently reading back whatever the output buffer happens to contain.
  if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
    std::cerr << "TensorRT enqueue failed, output is not valid" << std::endl;
  }
  CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
  // Checked like the other CUDA calls: asynchronous errors show up here.
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name) {
  // Create builder
  IBuilder* builder = createInferBuilder(gLogger);
  IBuilderConfig* config = builder->createBuilderConfig();

  // Create model to populate the network, then set the outputs and create an engine
  ICudaEngine *engine = nullptr;

  engine = build_cls_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);

  assert(engine != nullptr);

  // Serialize the engine
  IHostMemory* serialized_engine = engine->serialize();
  assert(serialized_engine != nullptr);

  // Save engine to file
  std::ofstream p(engine_name, std::ios::binary);
  if (!p) {
    std::cerr << "Could not open plan output file" << std::endl;
    assert(false);
  }
  p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

  // Close everything down
  engine->destroy();
  config->destroy();
  serialized_engine->destroy();
  builder->destroy();
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
  std::ifstream file(engine_name, std::ios::binary);
  if (!file.good()) {
    std::cerr << "read " << engine_name << " error!" << std::endl;
    assert(false);
  }
  size_t size = 0;
  file.seekg(0, file.end);
  size = file.tellg();
  file.seekg(0, file.beg);
  char* serialized_engine = new char[size];
  assert(serialized_engine);
  file.read(serialized_engine, size);
  file.close();

  *runtime = createInferRuntime(gLogger);
  assert(*runtime);
  *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
  assert(*engine);
  *context = (*engine)->createExecutionContext();
  assert(*context);
  delete[] serialized_engine;
}

int main(int argc, char** argv) {
  cudaSetDevice(kGpuId);

  std::string wts_name = "";
  std::string engine_name = "";
  float gd = 0.0f, gw = 0.0f;
  std::string img_dir;

  if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) {
    std::cerr << "arguments not right!" << std::endl;
    std::cerr << "./yolov5_cls -s [.wts] [.engine] [n/s/m/l/x or c gd gw]  // serialize model to plan file" << std::endl;
    std::cerr << "./yolov5_cls -d [.engine] ../images  // deserialize plan file and run inference" << std::endl;
    return -1;
  }

  // Create a model using the API directly and serialize it to a file
  if (!wts_name.empty()) {
    serialize_engine(kBatchSize, gd, gw, wts_name, engine_name);
    return 0;
  }

  // Deserialize the engine from file
  IRuntime* runtime = nullptr;
  ICudaEngine* engine = nullptr;
  IExecutionContext* context = nullptr;
  deserialize_engine(engine_name, &runtime, &engine, &context);
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Prepare cpu and gpu buffers
  float* gpu_buffers[2];
  float* cpu_input_buffer = nullptr;
  float* cpu_output_buffer = nullptr;
  prepare_buffers(engine, &gpu_buffers[0], &gpu_buffers[1], &cpu_input_buffer, &cpu_output_buffer);

  // Read images from directory
  std::vector<std::string> file_names;
  if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
    std::cerr << "read_files_in_dir failed." << std::endl;
    return -1;
  }

  // Read imagenet labels
  auto classes = read_classes("imagenet_classes.txt");

  // batch predict
  for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
    // Get a batch of images
    std::vector<cv::Mat> img_batch;
    std::vector<std::string> img_name_batch;
    for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
      cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
      img_batch.push_back(img);
      img_name_batch.push_back(file_names[j]);
    }

    // Preprocess
    batch_preprocess(img_batch, cpu_input_buffer);

    // Run inference
    auto start = std::chrono::system_clock::now();
    infer(*context, stream, (void**)gpu_buffers, cpu_input_buffer, cpu_output_buffer, kBatchSize);
    auto end = std::chrono::system_clock::now();
    std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    // Postprocess and get top-k result
    for (size_t b = 0; b < img_name_batch.size(); b++) {
      float* p = &cpu_output_buffer[b * kOutputSize];
      auto res = softmax(p, kOutputSize);
      auto topk_idx = topk(res, 3);
      std::cout << img_name_batch[b] << std::endl;
      for (auto idx: topk_idx) {
        std::cout << "  " << classes[idx] << " " << res[idx] << std::endl;
      }
    }
  }

  // Release stream and buffers
  cudaStreamDestroy(stream);
  CUDA_CHECK(cudaFree(gpu_buffers[0]));
  CUDA_CHECK(cudaFree(gpu_buffers[1]));
  delete[] cpu_input_buffer;
  delete[] cpu_output_buffer;
  // Destroy the engine
  context->destroy();
  engine->destroy();
  runtime->destroy();

  return 0;
}


================================================
FILE: yolov5/yolov5_cls_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import os
import shutil
import sys
import threading
import time
import cv2
import numpy as np
import torch
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group the file paths into batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory to walk
    return:
        list of lists of paths; every batch holds batch_size paths except
        possibly the last one
    """
    all_paths = (
        os.path.join(root, name)
        for root, _dirs, files in os.walk(img_dir)
        for name in files
    )
    batches = []
    current = []
    for path in all_paths:
        # Flush the running batch once it is full, before adding the next path.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches


# Class names, one per line of imagenet_classes.txt (read from the current
# working directory at import time); the list index is the class id.
with open("imagenet_classes.txt") as f:
    classes = [line.strip() for line in f]


class YoLov5TRT(object):
    """
    description: A YOLOv5 classification class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Deserialize the engine and allocate one host/device buffer
                     pair per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []
        # Per-channel normalization constants applied in preprocess_cls_image.
        self.mean = (0.485, 0.456, 0.406)
        self.std = (0.229, 0.224, 0.225)

        for binding in engine:
            print('binding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(
                binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dims of the input binding are height and width.
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Classify one batch of images and write the top-1 label onto
                     each of them.
        param:
            raw_image_generator: iterable of BGR images (at most batch_size)
        return:
            batch_image_raw: list of the input images, annotated in place
            elapsed:         float, seconds spent on copy + inference + copy back
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess. Slots that receive no image (last, partial
            # batch) stay zero instead of holding uninitialized memory.
            batch_image_raw = []
            batch_input_image = np.zeros(
                shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                batch_image_raw.append(image_raw)
                input_image = self.preprocess_cls_image(image_raw)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size,
                                  bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating
            # it — also when preprocessing or inference raised.
            self.ctx.pop()
        output = host_outputs[0]
        # Do postprocess once for the whole batch, then annotate only the images
        # that were actually supplied (the last batch may be shorter than
        # batch_size). One-element slices keep the printed / drawn text in the
        # same list form as before for each image.
        classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls(
            output)
        for i, image_raw in enumerate(batch_image_raw):
            cv2.putText(image_raw, str(
                classes_ls[i:i + 1]), (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
            print(classes_ls[i:i + 1], predicted_conf_ls[i:i + 1])
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_cls_image(self, input_img):
        """
        description: Convert a BGR image to the normalized network input.
        param:
            input_img: BGR image as read by cv2.imread
        return:
            float array of shape (1, 3, input_h, input_w)
        """
        im = cv2.cvtColor(input_img, cv2.COLOR_BGR2RGB)
        # cv2.resize expects dsize as (width, height).
        im = cv2.resize(im, (self.input_w, self.input_h))
        im = np.float32(im)
        im /= 255.0
        im -= self.mean
        im /= self.std
        im = im.transpose(2, 0, 1)
        # prepare batch
        batch_data = np.expand_dims(im, axis=0)
        return batch_data

    def postprocess_cls(self, output_data):
        """
        description: Turn the raw network output into top-1 predictions.
        param:
            output_data: flat array holding batch_size rows of class scores
        return:
            classes_ls:        list of top-1 class names, one per batch row
            predicted_conf_ls: list of top-1 softmax probabilities
            category_id_ls:    list of top-1 class ids
        """
        classes_ls = []
        predicted_conf_ls = []
        category_id_ls = []
        output_data = output_data.reshape(self.batch_size, -1)
        output_data = torch.Tensor(output_data)
        p = torch.nn.functional.softmax(output_data, dim=1)
        score, index = torch.topk(p, 3)
        for ind in range(index.shape[0]):
            # Only the best of the three candidates is reported.
            input_category_id = index[ind][0].item()  # 716
            category_id_ls.append(input_category_id)
            predicted_confidence = score[ind][0].item()
            predicted_conf_ls.append(predicted_confidence)
            classes_ls.append(classes[input_category_id])
        return classes_ls, predicted_conf_ls, category_id_ls


class inferThread(threading.Thread):
    """
    description: Thread that runs one batch of image paths through the wrapper
                 and saves the annotated images into output/.
    """

    def __init__(self, yolov5_wrapper, image_path_batch):
        super().__init__()
        self.yolov5_wrapper = yolov5_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov5_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(raw_images)
        for idx, img_path in enumerate(self.image_path_batch):
            # Keep only the file name; every result lands directly in output/.
            save_name = os.path.join('output', os.path.basename(img_path))
            # Save image
            cv2.imwrite(save_name, batch_image_raw[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(
            self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on all-zero images to warm up
                 the engine.
    """

    def __init__(self, yolov5_wrapper):
        super().__init__()
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        wrapper = self.yolov5_wrapper
        dummy_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(dummy_images)
        first_shape = batch_image_raw[0].shape
        print(
            'warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Engine path: first command-line argument if given, else the default build output.
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "build/yolov5s-cls.engine"

    # Always start from an empty output/ directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('batch size is', yolov5_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(
            yolov5_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each in its own thread and joined before the next.
        for _ in range(10):
            warm_up = warmUpThread(yolov5_wrapper)
            warm_up.start()
            warm_up.join()

        # One thread per batch, run strictly one after another.
        for batch in image_path_batches:
            worker = inferThread(yolov5_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov5_wrapper.destroy()


================================================
FILE: yolov5/yolov5_det.cpp
================================================
#include "cuda_utils.h"
#include "logging.h"
#include "utils.h"
#include "preprocess.h"
#include "postprocess.h"
#include "model.h"

#include <iostream>
#include <chrono>
#include <cmath>

using namespace nvinfer1;

// Logger instance handed to the TensorRT builder and runtime factories below.
static Logger gLogger;
// Floats per image in the detection output buffer: kMaxNumOutputBbox Detection
// records viewed as floats, plus one extra float.
// NOTE(review): the extra float presumably carries the detection count — confirm
// against the code that writes the output.
const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

// Parses the command line of the detection demo.
//
//   -s <wts> <engine> <n|s|m|l|x>[6]     serialize; gd/gw are set from the preset
//   -s <wts> <engine> c[6] <gd> <gw>     serialize with custom depth/width multiples
//   -d <engine> <img_dir>                deserialize and run inference
//
// The first character of the model argument selects the preset; `is_p6` is set to
// true when the argument is exactly two characters long and ends in '6' (it is
// never reset to false here). Returns true when the arguments form one of the
// layouts above; `wts` and `engine` may already be assigned when a "-s" line is
// rejected because of the model argument.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bool& is_p6, float& gd, float& gw, std::string& img_dir) {
  if (argc < 4) return false;

  const std::string mode(argv[1]);

  if (mode == "-s" && (argc == 5 || argc == 7)) {
    wts = std::string(argv[2]);
    engine = std::string(argv[3]);
    const std::string net(argv[4]);
    switch (net[0]) {
      case 'n':
        gd = 0.33;
        gw = 0.25;
        break;
      case 's':
        gd = 0.33;
        gw = 0.50;
        break;
      case 'm':
        gd = 0.67;
        gw = 0.75;
        break;
      case 'l':
        gd = 1.0;
        gw = 1.0;
        break;
      case 'x':
        gd = 1.33;
        gw = 1.25;
        break;
      case 'c':
        // Custom multiples need the two extra arguments.
        if (argc != 7) return false;
        gd = atof(argv[5]);
        gw = atof(argv[6]);
        break;
      default:
        return false;
    }
    const bool has_p6_suffix = net.size() == 2 && net[1] == '6';
    if (has_p6_suffix) {
      is_p6 = true;
    }
    return true;
  }

  if (mode == "-d" && argc == 4) {
    engine = std::string(argv[2]);
    img_dir = std::string(argv[3]);
    return true;
  }

  return false;
}

// Allocates the buffers for one batch of detection.
//
// Device: kBatchSize * 3 * kInputH * kInputW input floats and
// kBatchSize * kOutputSize output floats (cudaMalloc). Host: an output array of
// kBatchSize * kOutputSize floats (new[]). The caller owns all three buffers.
// The engine is expected to have exactly two bindings, input at index 0 and
// output at index 1.
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_output_buffer) {
  assert(engine->getNbBindings() == 2);
  // Look the tensors up by name to verify the binding order this program relies on.
  // Note that indices are guaranteed to be less than IEngine::getNbBindings()
  const int inputIndex = engine->getBindingIndex(kInputTensorName);
  const int outputIndex = engine->getBindingIndex(kOutputTensorName);
  assert(inputIndex == 0);
  assert(outputIndex == 1);

  const auto input_count = kBatchSize * 3 * kInputH * kInputW;
  const auto output_count = kBatchSize * kOutputSize;

  // Device side
  CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, input_count * sizeof(float)));
  CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, output_count * sizeof(float)));

  // Host side
  *cpu_output_buffer = new float[output_count];
}

// Runs one batch through the engine and blocks until the results are on the host.
//
//   gpu_buffers[0] / gpu_buffers[1]  device input (already filled) / output buffers
//   output                           host output, batchsize * kOutputSize floats
//
// All work is queued on `stream`, which is synchronized before returning.
void infer(IExecutionContext& context, cudaStream_t& stream, void** gpu_buffers, float* output, int batchsize) {
  // enqueue() reports failure through its return value; surface it instead of
  // silently reading back whatever the output buffer happens to contain.
  if (!context.enqueue(batchsize, gpu_buffers, stream, nullptr)) {
    std::cerr << "TensorRT enqueue failed, output is not valid" << std::endl;
  }
  CUDA_CHECK(cudaMemcpyAsync(output, gpu_buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
  // Checked like the other CUDA calls: asynchronous errors show up here.
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void serialize_engine(unsigned int max_batchsize, bool& is_p6, float& gd, float& gw, std::string& wts_name, std::string& engine_name) {
  // Create builder
  IBuilder* builder = createInferBuilder(gLogger);
  IBuilderConfig* config = builder->createBuilderConfig();

  // Create model to populate the network, then set the outputs and create an engine
  ICudaEngine *engine = nullptr;
  if (is_p6) {
    engine = build_det_p6_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
  } else {
    engine = build_det_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
  }
  assert(engine != nullptr);

  // Serialize the engine
  IHostMemory* serialized_engine = engine->serialize();
  assert(serialized_engine != nullptr);

  // Save engine to file
  std::ofstream p(engine_name, std::ios::binary);
  if (!p) {
    std::cerr << "Could not open plan output file" << std::endl;
    assert(false);
  }
  p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

  // Close everything down
  engine->destroy();
  config->destroy();
  serialized_engine->destroy();
  builder->destroy();
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
  std::ifstream file(engine_name, std::ios::binary);
  if (!file.good()) {
    std::cerr << "read " << engine_name << " error!" << std::endl;
    assert(false);
  }
  size_t size = 0;
  file.seekg(0, file.end);
  size = file.tellg();
  file.seekg(0, file.beg);
  char* serialized_engine = new char[size];
  assert(serialized_engine);
  file.read(serialized_engine, size);
  file.close();

  *runtime = createInferRuntime(gLogger);
  assert(*runtime);
  *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
  assert(*engine);
  *context = (*engine)->createExecutionContext();
  assert(*context);
  delete[] serialized_engine;
}

int main(int argc, char** argv) {
  cudaSetDevice(kGpuId);

  std::string wts_name = "";
  std::string engine_name = "";
  bool is_p6 = false;
  float gd = 0.0f, gw = 0.0f;
  std::string img_dir;

  if (!parse_args(argc, argv, wts_name, engine_name, is_p6, gd, gw, img_dir)) {
    std::cerr << "arguments not right!" << std::endl;
    std::cerr << "./yolov5_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6 or c/c6 gd gw]  // serialize model to plan file" << std::endl;
    std::cerr << "./yolov5_det -d [.engine] ../images  // deserialize plan file and run inference" << std::endl;
    return -1;
  }

  // Create a model using the API directly and serialize it to a file
  if (!wts_name.empty()) {
    serialize_engine(kBatchSize, is_p6, gd, gw, wts_name, engine_name);
    return 0;
  }

  // Deserialize the engine from file
  IRuntime* runtime = nullptr;
  ICudaEngine* engine = nullptr;
  IExecutionContext* context = nullptr;
  deserialize_engine(engine_name, &runtime, &engine, &context);
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Init CUDA preprocessing
  cuda_preprocess_init(kMaxInputImageSize);

  // Prepare cpu and gpu buffers
  float* gpu_buffers[2];
  float* cpu_output_buffer = nullptr;
  prepare_buffers(engine, &gpu_buffers[0], &gpu_buffers[1], &cpu_output_buffer);

  // Read images from directory
  std::vector<std::string> file_names;
  if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
    std::cerr << "read_files_in_dir failed." << std::endl;
    return -1;
  }

  // batch predict
  for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
    // Get a batch of images
    std::vector<cv::Mat> img_batch;
    std::vector<std::string> img_name_batch;
    for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
      cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
      img_batch.push_back(img);
      img_name_batch.push_back(file_names[j]);
    }

    // Preprocess
    cuda_batch_preprocess(img_batch, gpu_buffers[0], kInputW, kInputH, stream);

    // Run inference
    auto start = std::chrono::system_clock::now();
    infer(*context, stream, (void**)gpu_buffers, cpu_output_buffer, kBatchSize);
    auto end = std::chrono::system_clock::now();
    std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    // NMS
    std::vector<std::vector<Detection>> res_batch;
    batch_nms(res_batch, cpu_output_buffer, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);

    // Draw bounding boxes
    draw_bbox(img_batch, res_batch);

    // Save images
    for (size_t j = 0; j < img_batch.size(); j++) {
      cv::imwrite("_" + img_name_batch[j], img_batch[j]);
    }
  }

  // Release stream and buffers
  cudaStreamDestroy(stream);
  CUDA_CHECK(cudaFree(gpu_buffers[0]));
  CUDA_CHECK(cudaFree(gpu_buffers[1]));
  delete[] cpu_output_buffer;
  cuda_preprocess_destroy();
  // Destroy the engine
  context->destroy();
  engine->destroy();
  runtime->destroy();

  // Print histogram of the output distribution
  // std::cout << "\nOutput:\n\n";
  // for (unsigned int i = 0; i < kOutputSize; i++) {
  //   std::cout << prob[i] << ", ";
  //   if (i % 10 == 0) std::cout << std::endl;
  // }
  // std::cout << std::endl;

  return 0;
}


================================================
FILE: yolov5/yolov5_det_cuda_python.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
from cuda import cudart
import tensorrt as trt

CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into
                 consecutive lists of at most batch_size entries.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory that is walked with os.walk
    return:
        a list of batches, each batch being a list of file paths; the last
        batch may be shorter, and the result is empty when no file is found
    """
    all_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )
    batches = []
    current = []
    for path in all_paths:
        # Close the running batch before it would exceed batch_size
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if len(current) > 0:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag
                 above its top-left corner, directly onto img.
    param:
        x:      a box like [x1,y1,x2,y2]; coordinates are truncated with int()
        img:    an opencv image object, modified in place
        color:  color to draw rectangle, such as (0,255,0); a random color
                is picked when it is not given
        label:  str, text of the tag; nothing is written when it is empty
        line_thickness: int; derived from the image size when not given
    return:
        no return

    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]
    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)
    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The tag sits on top of the box: as wide as the text, slightly taller
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    text_origin = (top_left[0], top_left[1] - 2)
    cv2.putText(
        img,
        label,
        text_origin,
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov5TRT(object):
    """
    description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()
        # Create a Stream on this device,
        _, stream = cudart.cudaStreamCreate()
        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = np.empty(size, dtype=dtype)
            _, cuda_mem = cudart.cudaMallocAsync(host_mem.nbytes, stream)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        threading.Thread.__init__(self)
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data  to the GPU.
        cudart.cudaMemcpyAsync(cuda_inputs[0], host_inputs[0].ctypes.data, host_inputs[0].nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream)
        # Transfer predictions back from the GPU.
        cudart.cudaMemcpyAsync(host_outputs[0].ctypes.data, cuda_outputs[0], host_outputs[0].nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
        # Synchronize the stream
        cudart.cudaStreamSynchronize(stream)
        end = time.time()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * 6001: (i + 1) * 6001], batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any stream and cuda mem
        cudart.cudaStreamDestroy(self.stream)
        cudart.cudaFree(self.cuda_inputs[0])
        cudart.cudaFree(self.cuda_outputs[0])

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            input_image_path: str, image path
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate widht and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a numpy, each element is the score correspoing to box
            result_classid: finally classid, a numpy, each element is the classid correspoing to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimentional ndarray
        pred = np.reshape(output[1:], (-1, 6))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    description: Thread that runs one batch of image files through the wrapper
                 and writes the annotated images into the 'output' directory.
    """

    def __init__(self, yolov5_wrapper, image_path_batch):
        super().__init__()
        self.yolov5_wrapper = yolov5_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov5_wrapper
        paths = self.image_path_batch
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image(paths))
        for index, img_path in enumerate(paths):
            # Keep only the file name; every result lands directly in output/
            save_name = os.path.join('output', os.path.basename(img_path))
            # Save image
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(paths, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on the wrapper's all-zero
                 warmup batch and prints the elapsed time.
    """

    def __init__(self, yolov5_wrapper):
        super().__init__()
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        wrapper = self.yolov5_wrapper
        warmup_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(warmup_images)
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Usage: python yolov5_det_cuda_python.py [engine_file] [plugin_library]
    # Both arguments are optional and fall back to the build/ defaults below.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "build/yolov5s.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "build/libmyplugins.so"

    # load custom plugin; the shared library has to be loaded before the engine is deserialized
    ctypes.CDLL(PLUGIN_LIBRARY)
    cudart.cudaDeviceSynchronize()

    # load coco labels (module-level name read by YoLov5TRT.infer when drawing)
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
        "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
        "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
        "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
        "teddy bear",
        "hair drier", "toothbrush",
    ]

    # Start from an empty output/ directory (an existing one is removed)
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('batch size is', yolov5_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov5_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each in its own thread that is joined immediately
        for _ in range(10):
            worker = warmUpThread(yolov5_wrapper)
            worker.start()
            worker.join()
        # One thread per batch, again run strictly one after another
        for batch in image_path_batches:
            worker = inferThread(yolov5_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov5_wrapper.destroy()


================================================
FILE: yolov5/yolov5_det_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
LEN_ALL_RESULT = 38001
LEN_ONE_RESULT = 38

def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into
                 consecutive lists of at most batch_size entries.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory that is walked with os.walk
    return:
        a list of batches, each batch being a list of file paths; the last
        batch may be shorter, and the result is empty when no file is found
    """
    batches = []
    pending = []
    for folder, _subdirs, filenames in os.walk(img_dir):
        for path in (os.path.join(folder, filename) for filename in filenames):
            # Close the running batch before it would exceed batch_size
            if len(pending) == batch_size:
                batches.append(pending)
                pending = []
            pending.append(path)
    if len(pending) > 0:
        batches.append(pending)
    return batches

def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag
                 above its top-left corner, directly onto img.
    param:
        x:      a box like [x1,y1,x2,y2]; coordinates are truncated with int()
        img:    an opencv image object, modified in place
        color:  color to draw rectangle, such as (0,255,0); a random color
                is picked when it is not given
        label:  str, text of the tag; nothing is written when it is empty
        line_thickness: int; derived from the image size when not given
    return:
        no return

    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]
    box_start = (int(x[0]), int(x[1]))
    box_end = (int(x[2]), int(x[3]))
    cv2.rectangle(img, box_start, box_end, color, thickness=thickness, lineType=cv2.LINE_AA)
    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The tag sits on top of the box: as wide as the text, slightly taller
    tag_end = box_start[0] + text_size[0], box_start[1] - text_size[1] - 3
    cv2.rectangle(img, box_start, tag_end, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (box_start[0], box_start[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov5TRT(object):
    """
    description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data  to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * LEN_ALL_RESULT: (i + 1) * LEN_ALL_RESULT], batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        
    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)
        
    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            input_image_path: str, image path
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate widht and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Turn center-format boxes expressed in network-input coordinates into
                        corner-format boxes expressed in original-image coordinates
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        ratio_w = self.input_w / origin_w
        ratio_h = self.input_h / origin_h
        half_w = x[:, 2] / 2
        half_h = x[:, 3] / 2
        corners = np.zeros_like(x)
        if ratio_h > ratio_w:
            # The width ratio was used for resizing, so only the vertical axis carries padding
            pad_y = (self.input_h - ratio_w * origin_h) / 2
            corners[:, 0] = x[:, 0] - half_w
            corners[:, 2] = x[:, 0] + half_w
            corners[:, 1] = x[:, 1] - half_h - pad_y
            corners[:, 3] = x[:, 1] + half_h - pad_y
            corners /= ratio_w
        else:
            # The height ratio was used for resizing, so only the horizontal axis carries padding
            pad_x = (self.input_w - ratio_h * origin_w) / 2
            corners[:, 0] = x[:, 0] - half_w - pad_x
            corners[:, 2] = x[:, 0] + half_w - pad_x
            corners[:, 1] = x[:, 1] - half_h
            corners[:, 3] = x[:, 1] + half_h
            corners /= ratio_h

        return corners

    def post_process(self, output, origin_h, origin_w):
        """
        description: Decode the flat network output of one image and run NMS on it
        param:
            output:     A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to a box
            result_classid: final class ids, a numpy, each element is the class id corresponding to a box
        """
        # The leading element holds how many rows are valid
        box_count = int(output[0])
        # Arrange the rest as one row per detection and drop the unused rows
        rows = np.reshape(output[1:], (-1, LEN_ONE_RESULT))[:box_count, :]
        # Only the first six columns (box, confidence, class id) are used here
        rows = rows[:, :6]
        kept = self.non_max_suppression(rows, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        if len(kept):
            return kept[:, :4], kept[:, 4], kept[:, 5]
        # Nothing survived: hand back three independent empty arrays
        return np.array([]), np.array([]), np.array([])

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: Intersection-over-union between two sets of boxes (numpy broadcasting applies)
        param:
            box1: boxes as rows of (x1, y1, x2, y2) or (x, y, w, h)
            box2: boxes as rows of (x1, y1, x2, y2) or (x, y, w, h)
            x1y1x2y2: True when the rows hold corners, False when they hold center and size
        return:
            iou: computed iou
        """
        if x1y1x2y2:
            # Rows already contain the corner coordinates
            left1, top1, right1, bottom1 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            left2, top2, right2, bottom2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
        else:
            # Expand center/size rows into corner coordinates
            left1, right1 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            top1, bottom1 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            left2, right2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            top2, bottom2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2

        # Extent of the overlap on each axis (pixel-inclusive, hence the +1); negative means disjoint
        overlap_w = np.clip(np.minimum(right1, right2) - np.maximum(left1, left2) + 1, 0, None)
        overlap_h = np.clip(np.minimum(bottom1, bottom2) - np.maximum(top1, top2) + 1, 0, None)
        intersection = overlap_w * overlap_h

        area1 = (right1 - left1 + 1) * (bottom1 - top1 + 1)
        area2 = (right2 - left2 + 1) * (bottom2 - top2 + 1)

        # The tiny epsilon keeps the division defined for degenerate boxes
        return intersection / (area1 + area2 - intersection + 1e-16)

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, each row is (center_x, center_y, w, h, conf, cls_id, ...) in
                        network-input coordinates; columns after cls_id are carried along untouched
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, each row is (x1, y1, x2, y2, conf, cls_id, ...) in original-image
                   coordinates, or an empty array when nothing survives
        """
        # Keep the boxes whose score reaches conf_thres. Boolean-mask indexing returns a copy,
        # so the caller's array is not modified by the in-place updates below.
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # Clip the coordinates to the original image
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs, highest first
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            # Read the class id from its fixed column rather than from the last column, so rows
            # that carry extra trailing values are still grouped by class.
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and writes the
                 resulting images into the 'output' directory.
    """

    def __init__(self, yolov5_wrapper, image_path_batch):
        super(inferThread, self).__init__()
        self.image_path_batch = image_path_batch
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        wrapper = self.yolov5_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(raw_images)
        for index, img_path in enumerate(self.image_path_batch):
            # Keep only the file name so every result lands directly in 'output'
            save_name = os.path.join('output', os.path.basename(img_path))
            # Save image
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference pass on the wrapper's all-zero images and
                 prints the shape of the first image together with the elapsed time.
    """

    def __init__(self, yolov5_wrapper):
        super(warmUpThread, self).__init__()
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        wrapper = self.yolov5_wrapper
        blank_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(blank_images)
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Default locations of the custom plugin library and the serialized engine;
    # the first CLI argument overrides the engine, the second the plugin library
    PLUGIN_LIBRARY = "build/libmyplugins.so"
    engine_file_path = "build/yolov5s.engine"

    cli_args = sys.argv[1:]
    if len(cli_args) >= 1:
        engine_file_path = cli_args[0]
    if len(cli_args) >= 2:
        PLUGIN_LIBRARY = cli_args[1]

    # Load the plugin library before the engine is deserialized
    ctypes.CDLL(PLUGIN_LIBRARY)

    # COCO class names, indexed by class id (read by the wrapper when labelling boxes)

    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
            "hair drier", "toothbrush"]

    # Start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('batch size is', yolov5_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov5_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread and awaited before the next starts
        for _ in range(10):
            warm_thread = warmUpThread(yolov5_wrapper)
            warm_thread.start()
            warm_thread.join()

        # One inference thread per batch, again run one after another
        for batch in image_path_batches:
            infer_thread = inferThread(yolov5_wrapper, batch)
            infer_thread.start()
            infer_thread.join()
    finally:
        # destroy the instance
        yolov5_wrapper.destroy()


================================================
FILE: yolov5/yolov5_seg.cpp
================================================
#include "config.h"
#include "cuda_utils.h"
#include "logging.h"
#include "utils.h"
#include "preprocess.h"
#include "postprocess.h"
#include "model.h"

#include <iostream>
#include <chrono>
#include <cmath>

using namespace nvinfer1;

// Logger instance passed to the TensorRT builder and runtime created in this file.
static Logger gLogger;
// Number of floats in one image's detection output: room for kMaxNumOutputBbox Detection
// records expressed in floats, plus one extra leading float (presumably the detection
// count -- confirm against the decode plugin).
const static int kOutputSize1 = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Number of floats in one image's "proto" output buffer: 32 planes at a quarter of the
// input resolution in each dimension.
const static int kOutputSize2 = 32 * (kInputH / 4) * (kInputW / 4);

// Parse the command line.
//   -s <wts> <engine> <n|s|m|l|x>       build an engine with a preset depth/width multiple
//   -s <wts> <engine> c <gd> <gw>       build an engine with custom depth/width multiples
//   -d <engine> <image dir> <labels>    run inference with an existing engine
// Returns true when the arguments match one of these forms and the corresponding output
// parameters have been filled in; returns false otherwise.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw, std::string& img_dir, std::string& labels_filename) {
    // Every accepted form has a mode flag followed by at least three operands
    if (argc < 4) return false;

    const std::string mode(argv[1]);

    // Inference mode
    if (mode == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        labels_filename = std::string(argv[4]);
        return true;
    }

    // Anything that is not a well-formed build request is rejected
    if (mode != "-s" || (argc != 5 && argc != 7)) return false;

    wts = std::string(argv[2]);
    engine = std::string(argv[3]);
    const std::string net(argv[4]);

    // Only the first character of the model argument selects the variant
    switch (net[0]) {
        case 'n':
            gd = 0.33;
            gw = 0.25;
            break;
        case 's':
            gd = 0.33;
            gw = 0.50;
            break;
        case 'm':
            gd = 0.67;
            gw = 0.75;
            break;
        case 'l':
            gd = 1.0;
            gw = 1.0;
            break;
        case 'x':
            gd = 1.33;
            gw = 1.25;
            break;
        case 'c':
            // Custom multiples are only accepted together with their two values
            if (argc != 7) return false;
            gd = atof(argv[5]);
            gw = atof(argv[6]);
            break;
        default:
            return false;
    }
    return true;
}

// Allocate the device buffers for the input and the two outputs, plus host buffers for the
// two outputs. The device buffers are released with cudaFree and the host buffers with
// delete[] by the caller.
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer1, float** gpu_output_buffer2, float** cpu_output_buffer1, float** cpu_output_buffer2) {
  // The engine is expected to expose exactly one input and two outputs
  assert(engine->getNbBindings() == 3);

  // Look each binding up by name and verify that it sits at the slot the caller's
  // buffer array uses for it: 0 = input, 1 = detection output, 2 = "proto" output.
  const int input_slot = engine->getBindingIndex(kInputTensorName);
  const int det_slot = engine->getBindingIndex(kOutputTensorName);
  const int proto_slot = engine->getBindingIndex("proto");
  assert(input_slot == 0);
  assert(det_slot == 1);
  assert(proto_slot == 2);

  // Device side: one buffer per binding, sized for a full batch
  CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
  CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer1, kBatchSize * kOutputSize1 * sizeof(float)));
  CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer2, kBatchSize * kOutputSize2 * sizeof(float)));

  // Host side: only the outputs are copied back, so only they need host storage
  *cpu_output_buffer1 = new float[kBatchSize * kOutputSize1];
  *cpu_output_buffer2 = new float[kBatchSize * kOutputSize2];
}

// Run the engine on the batch already present in buffers[0] and copy both outputs back.
//   context   execution context of the deserialized engine
//   stream    CUDA stream used for the launch and both copies
//   buffers   device buffers: [0] input, [1] detection output, [2] "proto" output
//   output1   host destination for batchSize * kOutputSize1 floats
//   output2   host destination for batchSize * kOutputSize2 floats
// The call blocks until the stream has finished. If the launch itself is rejected, an
// error is printed and the host buffers are left untouched.
void infer(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* output1, float* output2, int batchSize) {
  // enqueue() reports failure through its return value; copying back after a failed
  // launch would only hand stale device memory to the caller.
  if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
    std::cerr << "TensorRT enqueue failed, skipping output copy" << std::endl;
    return;
  }
  CUDA_CHECK(cudaMemcpyAsync(output1, buffers[1], batchSize * kOutputSize1 * sizeof(float), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(output2, buffers[2], batchSize * kOutputSize2 * sizeof(float), cudaMemcpyDeviceToHost, stream));
  // Errors from the asynchronous work surface at synchronization, so check it as well
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name) {
  // Create builder
  IBuilder* builder = createInferBuilder(gLogger);
  IBuilderConfig* config = builder->createBuilderConfig();

  // Create model to populate the network, then set the outputs and create an engine
  ICudaEngine *engine = nullptr;

  engine = build_seg_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);

  assert(engine != nullptr);

  // Serialize the engine
  IHostMemory* serialized_engine = engine->serialize();
  assert(serialized_engine != nullptr);

  // Save engine to file
  std::ofstream p(engine_name, std::ios::binary);
  if (!p) {
    std::cerr << "Could not open plan output file" << std::endl;
    assert(false);
  }
  p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

  // Close everything down
  engine->destroy();
  config->destroy();
  serialized_engine->destroy();
  builder->destroy();
}

// Load a serialized plan file and create the runtime, engine and execution context from it.
// The three objects are returned through the pointer arguments; the caller destroys them.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
  std::ifstream plan_file(engine_name, std::ios::binary);
  if (!plan_file.good()) {
    std::cerr << "read " << engine_name << " error!" << std::endl;
    assert(false);
  }

  // Measure the file by seeking to its end, then rewind for the actual read
  plan_file.seekg(0, plan_file.end);
  const size_t plan_size = plan_file.tellg();
  plan_file.seekg(0, plan_file.beg);

  // Pull the whole plan into a temporary host buffer
  char* plan_bytes = new char[plan_size];
  assert(plan_bytes);
  plan_file.read(plan_bytes, plan_size);
  plan_file.close();

  // Runtime -> engine -> execution context
  *runtime = createInferRuntime(gLogger);
  assert(*runtime);
  *engine = (*runtime)->deserializeCudaEngine(plan_bytes, plan_size);
  assert(*engine);
  *context = (*engine)->createExecutionContext();
  assert(*context);

  // The temporary buffer is released once the engine has been created from it
  delete[] plan_bytes;
}

int main(int argc, char** argv) {
  cudaSetDevice(kGpuId);

  std::string wts_name = "";
  std::string engine_name = "";
  std::string labels_filename = "";
  float gd = 0.0f, gw = 0.0f;

  std::string img_dir;
  if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir, labels_filename)) {
    std::cerr << "arguments not right!" << std::endl;
    std::cerr << "./yolov5_seg -s [.wts] [.engine] [n/s/m/l/x or c gd gw]  // serialize model to plan file" << std::endl;
    std::cerr << "./yolov5_seg -d [.engine] ../images coco.txt  // deserialize plan file, read the labels file and run inference" << std::endl;
    return -1;
  }

  // Create a model using the API directly and serialize it to a file
  if (!wts_name.empty()) {
    serialize_engine(kBatchSize, gd, gw, wts_name, engine_name);
    return 0;
  }

  // Deserialize the engine from file
  IRuntime* runtime = nullptr;
  ICudaEngine* engine = nullptr;
  IExecutionContext* context = nullptr;
  deserialize_engine(engine_name, &runtime, &engine, &context);
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Init CUDA preprocessing
  cuda_preprocess_init(kMaxInputImageSize);

  // Prepare cpu and gpu buffers
  float* gpu_buffers[3];
  float* cpu_output_buffer1 = nullptr;
  float* cpu_output_buffer2 = nullptr;
  prepare_buffers(engine, &gpu_buffers[0], &gpu_buffers[1], &gpu_buffers[2], &cpu_output_buffer1, &cpu_output_buffer2);

  // Read images from directory
  std::vector<std::string> file_names;
  if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
    std::cerr << "read_files_in_dir failed." << std::endl;
    return -1;
  }

  // Read the txt file for classnames
  std::ifstream labels_file(labels_filename, std::ios::binary);
  if (!labels_file.good()) {
    std::cerr << "read " << labels_filename << " error!" << std::endl;
    return -1;
  }
  std::unordered_map<int, std::string> labels_map;
  read_labels(labels_filename, labels_map);
  assert(kNumClass == labels_map.size());

  // batch predict
  for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
    // Get a batch of images
    std::vector<cv::Mat> img_batch;
    std::vector<std::string> img_name_batch;
    for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
      cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
      img_batch.push_back(img);
      img_name_batch.push_back(file_names[j]);
    }

    // Preprocess
    cuda_batch_preprocess(img_batch, gpu_buffers[0], kInputW, kInputH, stream);

    // Run inference
    auto start = std::chrono::system_clock::now();
    infer(*context, stream, (void**)gpu_buffers, cpu_output_buffer1, cpu_output_buffer2, kBatchSize);
    auto end = std::chrono::system_clock::now();
    std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    // NMS
    std::vector<std::vector<Detection>> res_batch;
    batch_nms(res_batch, cpu_output_buffer1, img_batch.size(), kOutputSize1, kConfThresh, kNmsThresh);

    // Draw result and save image
    for (size_t b = 0; b < img_name_batch.size(); b++) {
      auto& res = res_batch[b];
      cv::Mat img = img_batch[b];

      auto masks = process_mask(&cpu_output_buffer2[b * kOutputSize2], kOutputSize2, res);
      draw_mask_bbox(img, res, masks, labels_map);
      cv::imwrite("_" + img_name_batch[b], img);
    }
  }

  // Release stream and buffers
  cudaStreamDestroy(stream);
  CUDA_CHECK(cudaFree(gpu_buffers[0]));
  CUDA_CHECK(cudaFree(gpu_buffers[1]));
  CUDA_CHECK(cudaFree(gpu_buffers[2]));
  delete[] cpu_output_buffer1;
  delete[] cpu_output_buffer2;
  cuda_preprocess_destroy();
  // Destroy the engine
  context->destroy();
  engine->destroy();
  runtime->destroy();

  return 0;
}


================================================
FILE: yolov5/yolov5_seg_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

# Minimum confidence a detection needs to be kept (passed as conf_thres to non_max_suppression)
CONF_THRESH = 0.5
# IoU above which a lower-scored box of the same class is suppressed (passed as nms_thres)
IOU_THRESHOLD = 0.4


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group the paths of all files found into
                 consecutive batches.
    param:
        batch_size: int, number of paths per batch (the last batch may hold fewer)
        img_dir:    str, directory to walk
    return:
        a list of batches, each batch being a list of file paths
    """
    all_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(img_dir)
        for filename in filenames
    )
    # The batch being filled is always the last element
    batches = [[]]
    for path in all_paths:
        if len(batches[-1]) == batch_size:
            batches.append([])
        batches[-1].append(path)
    # Without any file the initial placeholder is still empty and must not be returned
    return batches if batches[-1] else batches[:-1]


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag above its
                 top-left corner, onto img in place (adapted from the YoLov5 project).
    param:
        x:      a box likes [x1,y1,x2,y2]
        img:    a opencv image object
        color:  color to draw rectangle, such as (0,255,0); a random color when omitted
        label:  str, text to put into the tag; no tag is drawn when omitted
        line_thickness: int, derived from the image size when omitted
    return:
        no return

    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=thickness / 3, thickness=font_thickness)[0]
    # The tag spans the text size and sits directly above the box corner
    tag_corner = top_left[0] + text_w, top_left[1] - text_h - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        thickness / 3,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov5TRT(object):
    """
    description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Deserialize a TensorRT engine, allocate a host and a device buffer for each
                     of its bindings and derive the output layout used by the postprocess steps.
        param:
            engine_file_path: str, path of the serialized engine file
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            # Each buffer holds a full batch of the binding's per-image volume
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dimensions of the input binding are its height and width
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

        # Data length
        # NOTE(review): both lengths are taken from buffers sized for the whole batch, so the
        # values below describe a single image only when batch_size is 1 -- confirm before
        # running with larger batches.
        self.det_output_length = host_outputs[0].shape[0]
        self.mask_output_length = host_outputs[1].shape[0]
        self.seg_w = int(self.input_w / 4)
        self.seg_h = int(self.input_h / 4)
        # Channels = total length / (width * height); using the width twice would give a
        # wrong channel count for non-square inputs.
        self.seg_c = int(self.mask_output_length / (self.seg_w * self.seg_h))
        # One detection row: 6 box values followed by one coefficient per mask channel
        self.det_row_output_length = self.seg_c + 6

        # Draw mask
        self.colors_obj = Colors()

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine on it and draw the resulting
                     masks, boxes and labels onto the original images.
        param:
            raw_image_generator: iterable yielding BGR images, at most batch_size of them
        return:
            batch_image_raw: list of the original images with the results drawn on them
            use_time:        seconds spent on the host-to-device copy, the inference and the
                             device-to-host copies
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in a finally block so a failure above cannot leave the context pushed.
            self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output_bbox = host_outputs[0]
        output_proto_mask = host_outputs[1]
        # Do postprocess, only for the images that were actually supplied: the last batch of
        # a directory may hold fewer than batch_size images.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid, result_proto_coef = self.post_process(
                output_bbox[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i], batch_origin_w[i]
            )
            # Nothing detected in this image
            if result_proto_coef.shape[0] == 0:
                continue
            result_masks = self.process_mask(output_proto_mask, result_proto_coef, result_boxes, batch_origin_h[i], batch_origin_w[i])

            # Draw masks on  the original image
            self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid],im_src=batch_image_raw[i])

            # Draw rectangles and labels on the original image
            # (class names come from the module-level `categories` list)
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context held in self.ctx from the context stack.
        """
        cuda_context = self.ctx
        cuda_context.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Lazily read the images of a batch, yielding one cv2.imread result per path
        """
        for path in image_path_batch:
            frame = cv2.imread(path)
            yield frame

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Yield batch_size all-zero uint8 images of the network input size,
                     used as warmup data; image_path_batch is not used
        """
        for _ in range(self.batch_size):
            blank = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
            yield blank

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert a BGR image to RGB, resize it with its aspect ratio preserved,
                     pad it to the network input size, normalize to [0,1] and
                     transform to NCHW format.
        param:
            raw_bgr_image: a three-channel BGR image as an opencv/numpy array
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, _ = image_raw.shape
        rgb = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)

        # Work out the resized size and the padding on each side
        scale_w = self.input_w / w
        scale_h = self.input_h / h
        if scale_h > scale_w:
            # Width fills the input; the remaining rows are split between top and bottom
            new_w, new_h = self.input_w, int(scale_w * h)
            pad_left = pad_right = 0
            pad_top = int((self.input_h - new_h) / 2)
            pad_bottom = self.input_h - new_h - pad_top
        else:
            # Height fills the input; the remaining columns are split between left and right
            new_w, new_h = int(scale_h * w), self.input_h
            pad_left = int((self.input_w - new_w) / 2)
            pad_right = self.input_w - new_w - pad_left
            pad_top = pad_bottom = 0

        # Resize the image with long side while maintaining ratio
        resized = cv2.resize(rgb, (new_w, new_h))
        # Pad the short side with (128,128,128)
        padded = cv2.copyMakeBorder(
            resized, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )

        # Normalize to [0,1] as float32
        tensor = padded.astype(np.float32)
        tensor /= 255.0
        # HWC -> CHW -> NCHW, stored in row-major ("C") order
        tensor = np.expand_dims(np.transpose(tensor, [2, 0, 1]), axis=0)
        tensor = np.ascontiguousarray(tensor)
        return tensor, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Turn center-format boxes expressed in network-input coordinates into
                        corner-format boxes expressed in original-image coordinates
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        scale_w = self.input_w / origin_w
        scale_h = self.input_h / origin_h
        half_width = x[:, 2] / 2
        half_height = x[:, 3] / 2
        result = np.zeros_like(x)
        if scale_h > scale_w:
            # Resized by the width ratio: remove the vertical padding, then undo the scale
            offset_y = (self.input_h - scale_w * origin_h) / 2
            result[:, 0] = x[:, 0] - half_width
            result[:, 2] = x[:, 0] + half_width
            result[:, 1] = x[:, 1] - half_height - offset_y
            result[:, 3] = x[:, 1] + half_height - offset_y
            result /= scale_w
        else:
            # Resized by the height ratio: remove the horizontal padding, then undo the scale
            offset_x = (self.input_w - scale_h * origin_w) / 2
            result[:, 0] = x[:, 0] - half_width - offset_x
            result[:, 2] = x[:, 0] + half_width - offset_x
            result[:, 1] = x[:, 1] - half_height
            result[:, 3] = x[:, 1] + half_height
            result /= scale_h

        return result

    def post_process(self, output_boxes, origin_h, origin_w):
        """
        description: Decode the flat detection output of one image and run NMS on it
        param:
            output_boxes: A numpy likes [num_boxes, cx, cy, w, h, conf, cls_id, mask[32], cx, cy, w, h, conf, cls_id, mask[32] ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to a box
            result_classid: final class ids, a numpy, each element is the class id corresponding to a box
            result_proto_coef: a numpy, each row holds the mask coefficients corresponding to a box
        """
        # The leading element holds how many rows are valid
        box_count = int(output_boxes[0])
        # Arrange the rest as one row per detection and drop the unused rows
        rows = np.reshape(output_boxes[1:], (-1, self.det_row_output_length))[:box_count, :]
        kept = self.non_max_suppression(rows, origin_h, origin_w, conf_thres=CONF_THRESH,
                                        nms_thres=IOU_THRESHOLD)
        if len(kept):
            return kept[:, :4], kept[:, 4], kept[:, 5], kept[:, 6:]
        # Nothing survived: hand back four independent empty arrays
        return np.array([]), np.array([]), np.array([]), np.array([])

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: Intersection-over-union between two sets of boxes (numpy broadcasting applies)
        param:
            box1: boxes as rows of (x1, y1, x2, y2) or (x, y, w, h)
            box2: boxes as rows of (x1, y1, x2, y2) or (x, y, w, h)
            x1y1x2y2: True when the rows hold corners, False when they hold center and size
        return:
            iou: computed iou
        """
        if x1y1x2y2:
            # Corner coordinates can be read off directly
            a_x1, a_y1, a_x2, a_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b_x1, b_y1, b_x2, b_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
        else:
            # Derive the corners from center and size
            a_x1, a_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            a_y1, a_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b_x1, b_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b_y1, b_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2

        # Pixel-inclusive extent of the shared region on each axis, zero when disjoint
        shared_w = np.clip(np.minimum(a_x2, b_x2) - np.maximum(a_x1, b_x1) + 1, 0, None)
        shared_h = np.clip(np.minimum(a_y2, b_y2) - np.maximum(a_y1, b_y1) + 1, 0, None)
        shared_area = shared_w * shared_h

        a_area = (a_x2 - a_x1 + 1) * (a_y2 - a_y1 + 1)
        b_area = (b_x2 - b_x1 + 1) * (b_y2 - b_y1 + 1)

        # The tiny epsilon keeps the division defined for degenerate boxes
        return shared_area / (a_area + b_area - shared_area + 1e-16)

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Drop detections scoring below 'conf_thres', map the rest onto the original
        image and apply per-class Non-Maximum Suppression.
        param:
            prediction: detections, each row is (center_x, center_y, w, h, conf, cls_id, mask coefficients[32])
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, each row is (x1, y1, x2, y2, conf, cls_id, mask coefficients),
                   or an empty array when nothing survives
        """
        # Confidence filter
        candidates = prediction[prediction[:, 4] >= conf_thres]
        # Center format in network-input space -> corner format in original-image space
        candidates[:, :4] = self.xywh2xyxy(origin_h, origin_w, candidates[:, :4])
        # Keep every corner inside the original image
        candidates[:, [0, 2]] = np.clip(candidates[:, [0, 2]], 0, origin_w - 1)
        candidates[:, [1, 3]] = np.clip(candidates[:, [1, 3]], 0, origin_h - 1)
        # Highest confidence first
        order = np.argsort(-candidates[:, 4])
        candidates = candidates[order]

        survivors = []
        while len(candidates):
            best = candidates[0]
            overlaps = self.bbox_iou(np.expand_dims(best[:4], 0), candidates[:, :4])
            # A candidate is dropped when it overlaps the best box too much and shares its class;
            # the best box itself matches this rule and so leaves the candidate list as well
            suppressed = (overlaps > nms_thres) & (candidates[:, 5] == best[5])
            survivors.append(best)
            candidates = candidates[~suppressed]

        if survivors:
            return np.stack(survivors, 0)
        return np.array([])

    def sigmoid(self, x):
        """
        description: Element-wise logistic function 1 / (1 + e^-x).
        param:
            x: a scalar or numpy array
        return:
            value(s) of the logistic function for x
        """
        denominator = 1 + np.exp(-x)
        return 1 / denominator

    def scale_mask(self, mask, ih, iw):
        """
        description: Resize a mask to the network input size, cut out the centred region that
        corresponds to an (ih, iw) image scaled with preserved aspect ratio, and resize that
        region to (iw, ih).
        param:
            mask: 2-D mask to rescale
            ih: rows of original image
            iw: cols of original image
        return:
            crop: the mask resized to ih rows by iw cols
        """
        net_w, net_h = self.input_w, self.input_h
        net_sized = cv2.resize(mask, (net_w, net_h))
        ratio_w = net_w / (iw * 1.0)
        ratio_h = net_h / (ih * 1.0)
        if ratio_h > ratio_w:
            # Width fills the network input; the unused band is split above and below
            region_w = net_w
            region_h = int(ratio_w * ih)
            left = 0
            top = int((net_h - region_h) / 2)
        else:
            # Height fills the network input; the unused band is split left and right
            region_w = int(ratio_h * iw)
            region_h = net_h
            left = int((net_w - region_w) / 2)
            top = 0
        region = net_sized[top:top + region_h, left:left + region_w]
        return cv2.resize(region, (iw, ih))


    def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
        """
        description: Build one binary, image-sized mask per detection from the prototype masks
        and the per-detection coefficients.
        param:
            output_proto_mask: prototype masks, reshaped here to (seg_c, seg_h, seg_w)
            result_proto_coef: prototype mask coefficients (n, seg_c), n represents n results
            result_boxes     : per-detection boxes; the first four values are used as
                               integer x1, y1, x2, y2 pixel indices
            ih: rows of original image
            iw: cols of original image
        return:
            mask_result: array of n masks of shape (ih, iw), non-zero only inside each box
        """
        protos = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
        channels, proto_h, proto_w = protos.shape
        # Linear combination of prototypes per detection, squashed to (0, 1)
        flat_protos = protos.astype(np.float32).reshape(channels, -1)
        masks = self.sigmoid((result_proto_coef @ flat_protos)).reshape(-1, proto_h, proto_w)
        per_detection = []
        for mask, box in zip(masks, result_boxes):
            image_sized = self.scale_mask(mask, ih, iw)
            x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
            # Threshold only the pixels inside the box; everything else stays zero
            inside = np.where(image_sized[y1:y2, x1:x2] >= 0.5, 1, 0).astype(np.uint8)
            canvas = np.zeros((ih, iw))
            canvas[y1:y2, x1:x2] = inside
            per_detection.append(canvas)
        return np.array(per_detection)

    def draw_mask(self, masks, colors_, im_src, alpha=0.5):
        """
        description: Blend coloured instance masks into an image, in place.
        param:
            masks  : n masks of shape (n, h, w); nothing is drawn when it is empty
            colors_: one colour per mask, shape (n, 3)
            im_src : image of shape (h, w, 3); overwritten with the blended result
            alpha  : weight of the mask colour where a mask is set
        return:
            no return
        """
        if len(masks) == 0:
            return
        # (n, h, w) -> (h, w, n) so each pixel holds its per-instance flags
        layers = np.asarray(masks, dtype=np.uint8).transpose(1, 2, 0)
        layers = np.asarray(np.ascontiguousarray(layers), dtype=np.float32)
        palette = np.asarray(colors_, dtype=np.float32)
        # 1 where at least one mask covers the pixel, 0 elsewhere
        covered = layers.sum(2, keepdims=True).clip(0, 1)
        # Per-pixel sum of the colours of all covering masks, limited to 255
        tint = (layers @ palette).clip(0, 255)
        im_src[:] = tint * alpha + im_src * (1 - covered * alpha)

class inferThread(threading.Thread):
    """Thread that runs inference on one batch of image paths and writes the results to output/."""

    def __init__(self, yolov5_wrapper, image_path_batch):
        super().__init__()
        self.yolov5_wrapper = yolov5_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov5_wrapper
        paths = self.image_path_batch
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image(paths))
        for index, img_path in enumerate(paths):
            # Keep only the file name; every result goes into the 'output' directory
            save_name = os.path.join('output', os.path.basename(img_path))
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(paths, use_time * 1000))


class warmUpThread(threading.Thread):
    """Thread that runs one inference on the wrapper's zero-image batch and reports its timing."""

    def __init__(self, yolov5_wrapper):
        super().__init__()
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        wrapper = self.yolov5_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image_zeros())
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


class Colors:
    """Fixed palette of 20 colours, indexed cyclically."""

    _HEX_CODES = (
        'FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A',
        '92CC17', '3DDB86', '1A9334', '00D4BB', '2C99A8', '00C2FF',
        '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF',
        'FF95C8', 'FF37C7',
    )

    def __init__(self):
        palette = []
        for code in self._HEX_CODES:
            palette.append(self.hex2rgb(f'#{code}'))
        self.palette = palette
        self.n = len(palette)

    def __call__(self, i, bgr=False):
        """Return palette entry i (wrapping around), as (r, g, b) or reversed when bgr is set."""
        red, green, blue = self.palette[int(i) % self.n]
        if bgr:
            return (blue, green, red)
        return (red, green, blue)

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        """Convert a '#RRGGBB' string into a tuple of three ints."""
        digits = h[1:]
        return tuple(int(digits[start:start + 2], 16) for start in (0, 2, 4))

if __name__ == "__main__":
    # Default plugin library and engine; argv[1] overrides the engine path and
    # argv[2] overrides the plugin library path.
    PLUGIN_LIBRARY = "build/libmyplugins.so"
    engine_file_path = "build/yolov5s-seg.engine"

    cli_args = sys.argv[1:]
    if len(cli_args) >= 1:
        engine_file_path = cli_args[0]
    if len(cli_args) >= 2:
        PLUGIN_LIBRARY = cli_args[1]

    # Load the custom plugin library before the engine is created
    ctypes.CDLL(PLUGIN_LIBRARY)

    # COCO class names, indexed by class id
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]

    # Start every run with a fresh, empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('batch size is', yolov5_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov5_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each in its own thread and joined before the next one
        for _ in range(10):
            warm_up_worker = warmUpThread(yolov5_wrapper)
            warm_up_worker.start()
            warm_up_worker.join()

        # One inference thread per batch, run one after another
        for batch in image_path_batches:
            infer_worker = inferThread(yolov5_wrapper, batch)
            infer_worker.start()
            infer_worker.join()
    finally:
        # destroy the instance
        yolov5_wrapper.destroy()


================================================
FILE: yolov5-lite/CMakeLists.txt
================================================
# Build script for the yolov5-lite sample: a shared plugin library (myplugins)
# built from yololayer.cu and the v5lite executable that links against it.
cmake_minimum_required(VERSION 3.10)

project(yolov5-lite)

# Compiler options applied to every target in this directory.
add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
# NOTE(review): option() is documented as option(<variable> "<help_text>" [value]);
# here OFF occupies the help-text position and no explicit value is given - confirm
# this is the intended way to turn the static CUDA runtime off.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is fixed to Debug here; CMAKE_CXX_FLAGS below additionally adds -Ofast.
set(CMAKE_BUILD_TYPE Debug)

# Provides cuda_add_library(), used for the plugin library below.
find_package(CUDA REQUIRED)

if(WIN32)
enable_language(CUDA)
endif(WIN32)

include_directories(${PROJECT_SOURCE_DIR}/include)
# Include and link directories of CUDA and TensorRT are hard-coded below;
# adapt them if your installation paths are different.
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
# include_directories(/usr/include/x86_64-linux-gnu/)
# link_directories(/usr/lib/x86_64-linux-gnu/)
include_directories(/opt/TensorRT-8.6.1.6/include)
link_directories(/opt/TensorRT-8.6.1.6/lib)

# Extra C++ flags: warnings on, stop at the first error, aggressive optimisation.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Plugin library built from the single CUDA source yololayer.cu.
cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# NOTE(review): OpenCV is not marked REQUIRED, so a missing OpenCV is not reported here.
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

#add_executable(yolov5 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/main.cpp)
# Main executable: the INT8 calibrator plus the v5lite entry point.
add_executable(v5lite ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/v5lite.cpp)
target_link_libraries(v5lite nvinfer)
target_link_libraries(v5lite cudart)
target_link_libraries(v5lite myplugins)
target_link_libraries(v5lite ${OpenCV_LIBS})

# Additional options on UNIX platforms.
if(UNIX)
add_definitions(-O2 -pthread)
endif(UNIX)


================================================
FILE: yolov5-lite/README.md
================================================
# YOLOv5-Lite TensorRT Deployment


Detection training code [link](https://github.com/ppogg/YOLOv5-Lite.git)

## Environment
- TensorRT: 8.6.1.6
- CUDA: 12.6
- cuDNN: 8.9.0
- OpenCV: 4.10.0


## Configuration parameters

Before starting, modify the parameters in `include/yololayer.h` to match your training configuration, for example:

```cpp
static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
static constexpr int CLASS_NUM = 80;  // number of classes
static constexpr int INPUT_H = 640;   // input height for yolov5-lite (must be divisible by 32)
static constexpr int INPUT_W = 640;   // input width for yolov5-lite (must be divisible by 32)
static constexpr int DEVICE = 0;
static constexpr float NMS_THRESH = 0.4;
static constexpr float CONF_THRESH = 0.45;
static constexpr int BATCH_SIZE = 1;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
```

## 1. Generate .wts from .pt

This step must be performed inside the `yolov5-lite` folder:

```bash
cd yolov5-lite
git clone https://gitcode.com/open-source-toolkit/ac70a.git
unzip <your-zip-file>.zip  # extract the downloaded archive

python gen_wts.py -w v5lite-s.pt -o v5lite-s.wts
python gen_wts.py -w v5lite-e.pt -o v5lite-e.wts
python gen_wts.py -w v5lite-g.pt -o v5lite-g.wts
```

## 2. Build the engine and run inference

### Build steps

a. First, set `CLASS_NUM` in `include/yololayer.h` to match your dataset class count — this is important, otherwise you will get errors.

b. Run the following commands:

```bash
mkdir build
cd build
cmake ..
make
```

### Generate engine files

```bash
./v5lite -s ../v5lite-s.wts v5lite-s.engine s
./v5lite -s ../v5lite-g.wts v5lite-g.engine g
./v5lite -s ../v5lite-e.wts v5lite-e.engine e
./v5lite -s ../v5lite-c.wts v5lite-c.engine c
```

### Using the engine for inference

(`samples` is the folder containing your images):

```bash
./v5lite -d v5lite-s.engine ../samples
```

You can also use `yolov5-lite-trt.py` (in the repository root) for inference.

## 3. INT8 Quantization

### Preparation

1. Collect calibration images (recommended ~1000 images)
2. Put the images in a calibration folder (for example: `tensorrtx-int8calib-data/coco_calib`)
3. Modify the precision macro in [v5lite.cpp](v5lite.cpp):

   Change:
   ```cpp
   // #define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
   // #define USE_INT8  // set USE_INT8 or USE_FP16 or USE_FP32
   ```

   To:
   ```cpp
   // #define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
   #define USE_INT8  // set USE_INT8 or USE_FP16 or USE_FP32
   ```

4. Update the data path in the code to point to your calibration images

5. Rebuild and generate the engine, then run inference (repeat step 2)

## Notes

- In practice, calling the engine from Python may produce better inference behavior in some cases.


================================================
FILE: yolov5-lite/calibrator.cpp
================================================
#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/dnn/dnn.hpp>
#include "calibrator.h"
#include "cuda_utils.h"
#include "utils.h"

// Stores the calibration settings, allocates one GPU buffer large enough for a
// whole batch of 3-channel float images, and asks read_files_in_dir() to fill
// img_files_ from img_dir.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache)
{
    // Number of float values in one batch: channels * width * height * batch size.
    input_count_ = 3 * input_w * input_h * batchsize;

    const size_t buffer_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMalloc(&device_input_, buffer_bytes));

    read_files_in_dir(img_dir, img_files_);
}

// Frees the device_input_ buffer through cudaFree; the call is wrapped in CUDA_CHECK.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size that was passed to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT
{
    return batchsize_;
}

// Loads the next batchsize_ images, preprocesses them, copies the resulting
// blob into the device buffer and publishes that buffer as bindings[0].
// Returns false when fewer than batchsize_ images are left or when an image
// cannot be read; returns true otherwise.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT
{
    const int batch_end = img_idx_ + batchsize_;
    if (batch_end > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> batch_imgs;
    for (int idx = img_idx_; idx < batch_end; ++idx) {
        std::cout << img_files_[idx] << "  " << idx << std::endl;
        cv::Mat raw = cv::imread(img_dir_ + img_files_[idx]);
        if (raw.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        batch_imgs.push_back(preprocess_img(raw, input_w_, input_h_));
    }
    img_idx_ = batch_end;

    // Scale pixel values by 1/255, no mean subtraction, swap R and B, no cropping.
    const cv::Size net_size(input_w_, input_h_);
    cv::Mat blob = cv::dnn::blobFromImages(batch_imgs, 1.0 / 255.0, net_size, cv::Scalar(0, 0, 0), true, false);

    const size_t blob_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), blob_bytes, cudaMemcpyHostToDevice));

    // The only binding served here is the network input blob.
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Reads the calibration table file into calib_cache_ when read_cache_ is set
// and the file could be opened. Sets `length` to the number of bytes read and
// returns a pointer to them, or nullptr when nothing was read.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream table_file(calib_table_name_, std::ios::binary);
    // Whitespace bytes are part of the cache and must not be skipped.
    table_file >> std::noskipws;

    const bool load_from_file = read_cache_ && table_file.good();
    if (load_from_file)
    {
        std::istream_iterator<char> first(table_file);
        std::istream_iterator<char> last;
        std::copy(first, last, std::back_inserter(calib_cache_));
    }

    length = calib_cache_.size();
    if (length == 0)
    {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes `length` bytes starting at `cache` to the calibration table file,
// replacing any previous content.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;

    const char* bytes = static_cast<const char*>(cache);
    std::ofstream table_file(calib_table_name_, std::ios::binary);
    table_file.write(bytes, length);
}


================================================
FILE: yolov5-lite/common.hpp
================================================
#ifndef YOLOV5_COMMON_H_
#define YOLOV5_COMMON_H_

#include <fstream>
#include <map>
#include <sstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "yololayer.h"

using namespace nvinfer1;

// Maps a box given as (center x, center y, width, height) in network-input
// coordinates back to a cv::Rect in the coordinates of `img`. The image is
// assumed to have been scaled by the smaller of the two ratios and centred
// along the other axis, so that offset is removed before dividing by the scale.
// Edges are truncated to int once before and once after the division.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float ratio_w = Yolo::INPUT_W / (img.cols * 1.0);
    const float ratio_h = Yolo::INPUT_H / (img.rows * 1.0);

    int left, right, top, bottom;
    float scale;
    if (ratio_h > ratio_w)
    {
        // Width is the limiting side: the offset is vertical.
        scale = ratio_w;
        left = bbox[0] - bbox[2] / 2.f;
        right = bbox[0] + bbox[2] / 2.f;
        top = bbox[1] - bbox[3] / 2.f - (Yolo::INPUT_H - ratio_w * img.rows) / 2;
        bottom = bbox[1] + bbox[3] / 2.f - (Yolo::INPUT_H - ratio_w * img.rows) / 2;
    }
    else
    {
        // Height is the limiting side: the offset is horizontal.
        scale = ratio_h;
        left = bbox[0] - bbox[2] / 2.f - (Yolo::INPUT_W - ratio_h * img.cols) / 2;
        right = bbox[0] + bbox[2] / 2.f - (Yolo::INPUT_W - ratio_h * img.cols) / 2;
        top = bbox[1] - bbox[3] / 2.f;
        bottom = bbox[1] + bbox[3] / 2.f;
    }

    left = left / scale;
    right = right / scale;
    top = top / scale;
    bottom = bottom / scale;

    return cv::Rect(left, top, right - left, bottom - top);
}

// Intersection-over-union of two boxes given as (center x, center y, width, height).
// Returns 0 when the boxes do not overlap, and also when their union has no
// area (e.g. two zero-sized boxes), so the result is never 0/0.
// Non-overlapping pairs are the normal case during NMS, so nothing is logged for them.
float iou(float lbox[4], float rbox[4]) 
{
    const float left   = (std::max)(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
    const float right  = (std::min)(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
    const float top    = (std::max)(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
    const float bottom = (std::min)(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);

    // Empty intersection: the boxes are disjoint.
    if (top > bottom || left > right)
    {
        return 0.0f;
    }

    const float interBoxS = (right - left) * (bottom - top);
    const float unionS = lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS;
    // Degenerate boxes: avoid dividing by zero.
    if (unionS <= 0.0f)
    {
        return 0.0f;
    }
    return interBoxS / unionS;
}

// Ordering predicate for std::sort: places detections with higher confidence first.
bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) 
{
    return b.conf < a.conf;
}

// Per-class non-maximum suppression.
// `output[0]` holds the number of detections, followed by the detections
// themselves packed as consecutive Yolo::Detection records. Detections whose
// confidence (fifth float of the record) is not above conf_thresh are dropped.
// The rest are grouped by class id; within each class they are visited in
// descending confidence order, each visited detection is appended to `res`, and
// every later detection whose IoU with it exceeds nms_thresh is discarded.
void nms(std::vector<Yolo::Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) 
{
    const int floats_per_det = sizeof(Yolo::Detection) / sizeof(float);
    std::map<float, std::vector<Yolo::Detection>> by_class;

    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
        float* record = &output[1 + floats_per_det * i];
        if (record[4] <= conf_thresh) continue;
        Yolo::Detection det;
        memcpy(&det, record, floats_per_det * sizeof(float));
        // operator[] creates the per-class vector on first use.
        by_class[det.class_id].push_back(det);
    }

    for (auto& entry : by_class) {
        std::vector<Yolo::Detection>& dets = entry.second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t keep = 0; keep < dets.size(); ++keep) {
            Yolo::Detection& kept = dets[keep];
            res.push_back(kept);
            // Remove every remaining detection that overlaps the kept one too much.
            size_t probe = keep + 1;
            while (probe < dets.size()) {
                if (iou(kept.bbox, dets[probe].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + probe);
                } else {
                    ++probe;
                }
            }
        }
    }
}

// TensorRT weight files have a simple space delimited format:
// [type] [size] <data x size in hex>
// Parses a .wts weight file into a name -> Weights map.
// File layout: the first token is the number of entries; every entry is
// "<name> <count> <count hexadecimal 32-bit words>". Each entry is stored as
// DataType::kFLOAT with its raw 32-bit words in a malloc'ed buffer that is
// referenced by the returned Weights (the buffers are not freed here).
std::map<std::string, Weights> loadWeights(const std::string file) 
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        // Weights is {type, values pointer, element count}.
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size; 
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // One 32-bit word per weight. The element size must be used here:
        // sizeof(val) would be the size of the pointer and over-allocate.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer that applies an inference-time batch norm:
//   scale = gamma / sqrt(running_var + eps)
//   shift = beta - running_mean * gamma / sqrt(running_var + eps)
//   power = 1
// The parameters are read from weightMap under lname + ".weight", ".bias",
// ".running_mean" and ".running_var". The three computed arrays are malloc'ed
// and stored back into weightMap under lname + ".scale", ".shift" and ".power".
nvinfer1::IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps)
{
    const float *gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float *beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float *mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float *var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int len = weightMap[lname + ".running_var"].count;

    const size_t bytes = sizeof(float) * len;
    float *scval = reinterpret_cast<float*>(malloc(bytes));
    float *shval = reinterpret_cast<float*>(malloc(bytes));
    float *pval = reinterpret_cast<float*>(malloc(bytes));

    for (int ch = 0; ch < len; ch++)
    {
        // gamma / sqrt(running_var + eps)
        scval[ch] = gamma[ch] / sqrt(var[ch] + eps);
        shval[ch] = beta[ch] - mean[ch] * gamma[ch] / sqrt(var[ch] + eps);
        pval[ch] = 1.0;
    }

    Weights scale{ DataType::kFLOAT, scval, len };
    Weights shift{ DataType::kFLOAT, shval, len };
    Weights power{ DataType::kFLOAT, pval, len };

    // Keep the buffers reachable through the weight map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn_layer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn_layer);
    return bn_layer;
}

// Stem block: 3x3 stride-2 convolution (no bias) -> batch norm (eps 1e-3) ->
// ReLU -> 3x3 stride-2 max pooling, both with padding 1.
// Weights are read from lname + "conv.0.weight" and the batch-norm entries
// under lname + "conv.1". Every layer pointer is checked before it is used.
nvinfer1::IPoolingLayer *conv_bn_relu_maxpool(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights> & weightMap, nvinfer1::ITensor &input, int outch, std::string lname){
  nvinfer1::Weights emptywts{nvinfer1::DataType::kFLOAT, nullptr, 0};
  nvinfer1::IConvolutionLayer *conv0 = network->addConvolutionNd(input, outch, nvinfer1::DimsHW{3, 3}, weightMap[lname + "conv.0.weight"], emptywts);
  assert(conv0);
  conv0->setStrideNd(nvinfer1::DimsHW{2, 2});
  conv0->setPaddingNd(nvinfer1::DimsHW{1, 1});

  nvinfer1::IScaleLayer * bn1 = addBatchNorm2d(network, weightMap, *conv0->getOutput(0), lname + "conv.1", 1e-3);

  auto Relu = network->addActivation(*bn1->getOutput(0), nvinfer1::ActivationType::kRELU);
  assert(Relu);
  IPoolingLayer *pool = network->addPoolingNd(*Relu->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{3, 3});
  // Check before configuring: asserting after the setters would come too late.
  assert(pool);
  pool->setStrideNd(nvinfer1::DimsHW{2, 2});
  pool->setPaddingNd(nvinfer1::DimsHW{1, 1});
  return pool;
}


// Hard-swish activation: multiplies the input element-wise by a hard-sigmoid
// gate configured with alpha = 1/6 and beta = 0.5.
nvinfer1::IElementWiseLayer *HardSwish(nvinfer1::INetworkDefinition *network, nvinfer1::ITensor &input){
    nvinfer1::IActivationLayer *gate = network->addActivation(input, ActivationType::kHARD_SIGMOID);
    gate->setAlpha(1.0 / 6.0);
    gate->setBeta(0.5);

    nvinfer1::ITensor &gate_out = *gate->getOutput(0);
    return network->addElementWise(input, gate_out, ElementWiseOperation::kPROD);
}


// Conv -> BatchNorm -> HardSwish block.
//   num_filters: number of output channels
//   filter_size: square kernel size; padding is (filter_size - 1) / 2
//   stride:      convolution stride in both directions
//   lname:       weight prefix; reads lname + ".conv.weight" and the batch-norm
//                entries under lname + ".bn" (eps 1e-3)
//   num_groups:  number of convolution groups (default 1)
// Returns the element-wise layer that produces the activation output.
nvinfer1::IElementWiseLayer *CBH(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights> &weightMap, nvinfer1::ITensor &input, 
        int num_filters, int filter_size, int stride, std::string lname, int num_groups=1){

    int pad = (filter_size - 1) / 2;
    nvinfer1::Weights emptywts {nvinfer1::DataType::kFLOAT, nullptr, 0};

    nvinfer1::IConvolutionLayer *conv = network->addConvolutionNd(input, num_filters, nvinfer1::DimsHW{filter_size, filter_size}, 
                 weightMap[lname + ".conv.weight"], emptywts);
    // Same null check as convBlock performs before configuring the layer.
    assert(conv);
    conv->setStrideNd(nvinfer1::DimsHW{stride, stride});
    conv->setPaddingNd(nvinfer1::DimsHW{pad, pad});
    conv->setNbGroups(num_groups);

    nvinfer1::IScaleLayer *bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::IElementWiseLayer *hash = HardSwish(network, *bn->getOutput(0));
    assert(hash);

    return hash;
}


// SiLU activation: x * sigmoid(x), built from a sigmoid activation layer and an
// element-wise product with the original input.
nvinfer1::IElementWiseLayer *SiLU(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input)
{
    nvinfer1::IActivationLayer *gate = network->addActivation(input, ActivationType::kSIGMOID);
    nvinfer1::ITensor &gate_out = *gate->getOutput(0);

    return network->addElementWise(input, gate_out, nvinfer1::ElementWiseOperation::kPROD);
}


// Squeeze-and-excitation style gate:
//   average over axes 1 and 2 (kept as size-1 dims) -> 1x1 conv down to
//   in_channels / reduction -> ReLU -> 1x1 conv back to in_channels -> SiLU,
// then multiplied element-wise with an identity copy of the input.
// Weights and biases are read from lname + ".conv1.*" and lname + ".conv2.*".
nvinfer1::IElementWiseLayer *LC_SEModule(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights> &weightMap, nvinfer1::ITensor &input,
       int in_channels, std::string lname, int reduction=4){

    const nvinfer1::DimsHW pointwise{1, 1};
    const int squeezed_channels = in_channels / reduction;

    nvinfer1::IIdentityLayer *skip = network->addIdentity(input);

    // Bit mask selecting axes 1 and 2 for the reduction.
    nvinfer1::IReduceLayer *pooled = network->addReduce(input, nvinfer1::ReduceOperation::kAVG, (1 << 1) | (1 << 2), true);

    nvinfer1::IConvolutionLayer *squeeze = network->addConvolutionNd(*pooled->getOutput(0), squeezed_channels, pointwise,
             weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]);
    nvinfer1::IActivationLayer *squeeze_act = network->addActivation(*squeeze->getOutput(0), nvinfer1::ActivationType::kRELU);

    nvinfer1::IConvolutionLayer *excite = network->addConvolutionNd(*squeeze_act->getOutput(0), in_channels, pointwise,
             weightMap[lname + ".conv2.weight"], weightMap[lname + ".conv2.bias"]);
    nvinfer1::IElementWiseLayer *gate = SiLU(network, *excite->getOutput(0));

    return network->addElementWise(*gate->getOutput(0), *skip->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
}

// Depthwise-separable block: a CBH with groups == num_channels (lname + ".dw_conv"),
// optionally followed by LC_SEModule (lname + ".se"), then a 1x1 CBH
// (lname + ".pw_conv").
//   num_channels: input channels (also the depthwise output channels)
//   num_filters:  output channels of the 1x1 convolution
//   stride:       stride of the depthwise convolution
//   dw_size:      kernel size of the depthwise convolution
//   use_se:       insert the LC_SEModule between the two convolutions
nvinfer1::IElementWiseLayer *LC_Block(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights> &weightMap, nvinfer1::ITensor &input,
     int num_channels, int num_filters, int stride, int dw_size, std::string lname, bool use_se=false){
    nvinfer1::IElementWiseLayer *depthwise = CBH(network, weightMap, input, num_channels, dw_size, stride, lname + ".dw_conv", num_channels);

    nvinfer1::ITensor *features = depthwise->getOutput(0);
    if (use_se) {
        nvinfer1::IElementWiseLayer *se = LC_SEModule(network, weightMap, *features, num_channels, lname + ".se");
        features = se->getOutput(0);
    }

    return CBH(network, weightMap, *features, num_filters, 1, 1, lname + ".pw_conv");
}


// Bias-free convolution followed by HardSwish.
//   num_filters: number of output channels
//   filter_size: square kernel size (stride and padding are left at the layer defaults)
//   lname:       weight prefix; reads lname + ".dense_conv.weight"
// Returns the element-wise layer that produces the activation output.
nvinfer1::IElementWiseLayer *Dense(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights> &weightMap, nvinfer1::ITensor &input, 
      int num_filters, int filter_size, std::string lname){
    nvinfer1::Weights emptywts{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer *dense_conv = network->addConvolutionNd(input, num_filters, nvinfer1::DimsHW{filter_size, filter_size}
     , weightMap[lname + ".dense_conv.weight"], emptywts);
    // Same null check as convBlock performs on its convolution.
    assert(dense_conv);

    nvinfer1::IElementWiseLayer *hash = HardSwish(network, *dense_conv->getOutput(0));
    assert(hash);
    return hash;
}


// Conv -> BatchNorm (eps 1e-3) -> SiLU block.
//   outch: output channels, ksize: square kernel size, s: stride, g: groups
//   lname: weight prefix; reads lname + ".conv.weight" and the batch-norm
//          entries under lname + ".bn"
// Padding is ksize / 3 (integer division), i.e. 0 for a 1x1 and 1 for a 3x3 kernel.
nvinfer1::IElementWiseLayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
  Weights no_bias{ DataType::kFLOAT, nullptr, 0 };
  const int pad = ksize / 3;

  IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], no_bias);
  assert(conv);
  conv->setStrideNd(DimsHW{ s, s });
  conv->setPaddingNd(DimsHW{ pad, pad });
  conv->setNbGroups(g);

  IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
  ITensor& normalized = *bn->getOutput(0);

  // silu = x * sigmoid(x)
  auto gate = network->addActivation(normalized, ActivationType::kSIGMOID);
  assert(gate);
  auto product = network->addElementWise(normalized, *gate->getOutput(0), ElementWiseOperation::kPROD);
  assert(product);

  return product;
}

// Two-branch block followed by a channel shuffle.
//   s > 1 : branch 1 is a 3x3 depthwise conv (stride s) -> BN -> 1x1 conv -> BN -> ReLU
//           on the full input; branch 2 also consumes the full input.
//   else  : the input is split in half along dim 0; the first half is passed
//           through unchanged and the second half feeds branch 2.
// Branch 2 is 1x1 conv -> BN -> ReLU -> 3x3 depthwise conv (stride s) -> BN ->
// 1x1 conv -> BN -> ReLU. Both results are concatenated, reshaped to
// (2, C/2, H, W), the first two axes are swapped, and the result is reshaped
// back to (C, H, W). The dimensions of the concatenation and of both shuffle
// outputs are printed to stdout.
// All batch norms use eps 1e-5; weight names are lname + "branch1.*" / "branch2.*".
nvinfer1::IShuffleLayer* shuffle_block(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int inch, int outch, int s) {
    Weights no_bias{DataType::kFLOAT, nullptr, 0};
    const int half_out = outch / 2;

    // Prints "<name> dims: d0, d1, ..., " followed by a newline.
    auto print_dims = [](const char* tensor_name, const Dims& shape) {
        std::cout << tensor_name << " dims: ";
        for (int i = 0; i < shape.nbDims; i++) {
            std::cout << shape.d[i] << ", ";
        }
        std::cout << std::endl;
    };

    ITensor* passthrough = nullptr;
    ITensor* branch2_in = nullptr;
    if (s > 1) {
        IConvolutionLayer* dw1 = network->addConvolutionNd(input, inch, DimsHW{3, 3}, weightMap[lname + "branch1.0.weight"], no_bias);
        assert(dw1);
        dw1->setStrideNd(DimsHW{s, s});
        dw1->setPaddingNd(DimsHW{1, 1});
        dw1->setNbGroups(inch);
        IScaleLayer* dw1_bn = addBatchNorm2d(network, weightMap, *dw1->getOutput(0), lname + "branch1.1", 1e-5);
        IConvolutionLayer* pw1 = network->addConvolutionNd(*dw1_bn->getOutput(0), half_out, DimsHW{1, 1}, weightMap[lname + "branch1.2.weight"], no_bias);
        assert(pw1);
        IScaleLayer* pw1_bn = addBatchNorm2d(network, weightMap, *pw1->getOutput(0), lname + "branch1.3", 1e-5);
        IActivationLayer* act1 = network->addActivation(*pw1_bn->getOutput(0), ActivationType::kRELU);
        assert(act1);
        passthrough = act1->getOutput(0);
        branch2_in = &input;
    } else {
        Dims d = input.getDimensions();
        const auto half_ch = d.d[0] / 2;
        ISliceLayer* first_half = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ half_ch, d.d[1], d.d[2] }, Dims3{ 1, 1, 1 });
        ISliceLayer* second_half = network->addSlice(input, Dims3{ half_ch, 0, 0 }, Dims3{ half_ch, d.d[1], d.d[2] }, Dims3{ 1, 1, 1 });
        passthrough = first_half->getOutput(0);
        branch2_in = second_half->getOutput(0);
    }

    // Branch 2: pointwise -> depthwise -> pointwise.
    IConvolutionLayer* pw2a = network->addConvolutionNd(*branch2_in, half_out, DimsHW{1, 1}, weightMap[lname + "branch2.0.weight"], no_bias);
    assert(pw2a);
    IScaleLayer* pw2a_bn = addBatchNorm2d(network, weightMap, *pw2a->getOutput(0), lname + "branch2.1", 1e-5);
    IActivationLayer* act2a = network->addActivation(*pw2a_bn->getOutput(0), ActivationType::kRELU);
    assert(act2a);
    IConvolutionLayer* dw2 = network->addConvolutionNd(*act2a->getOutput(0), half_out, DimsHW{3, 3}, weightMap[lname + "branch2.3.weight"], no_bias);
    assert(dw2);
    dw2->setStrideNd(DimsHW{s, s});
    dw2->setPaddingNd(DimsHW{1, 1});
    dw2->setNbGroups(half_out);
    IScaleLayer* dw2_bn = addBatchNorm2d(network, weightMap, *dw2->getOutput(0), lname + "branch2.4", 1e-5);
    IConvolutionLayer* pw2b = network->addConvolutionNd(*dw2_bn->getOutput(0), half_out, DimsHW{1, 1}, weightMap[lname + "branch2.5.weight"], no_bias);
    assert(pw2b);
    IScaleLayer* pw2b_bn = addBatchNorm2d(network, weightMap, *pw2b->getOutput(0), lname + "branch2.6", 1e-5);
    IActivationLayer* act2b = network->addActivation(*pw2b_bn->getOutput(0), ActivationType::kRELU);
    assert(act2b);

    // Join both branches along the channel axis.
    ITensor* branches[] = {passthrough, act2b->getOutput(0)};
    IConcatenationLayer* joined = network->addConcatenation(branches, 2);
    assert(joined);

    Dims dims = joined->getOutput(0)->getDimensions();
    print_dims(joined->getOutput(0)->getName(), dims);

    // Channel shuffle, step 1: (C, H, W) -> (2, C/2, H, W), then swap the first two axes.
    IShuffleLayer* regroup = network->addShuffle(*joined->getOutput(0));
    assert(regroup);
    regroup->setReshapeDimensions(Dims4(2, dims.d[0] / 2, dims.d[1], dims.d[2]));
    regroup->setSecondTranspose(Permutation{1, 0, 2, 3});

    Dims regroup_dims = regroup->getOutput(0)->getDimensions();
    print_dims(regroup->getOutput(0)->getName(), regroup_dims);

    // Channel shuffle, step 2: flatten back to (C, H, W).
    IShuffleLayer* flatten = network->addShuffle(*regroup->getOutput(0));
    assert(flatten);
    flatten->setReshapeDimensions(Dims3(dims.d[0], dims.d[1], dims.d[2]));

    Dims flatten_dims = flatten->getOutput(0)->getDimensions();
    print_dims(flatten->getOutput(0)->getName(), flatten_dims);

    return flatten;
}

// Builds the SPP block: a 1x1 convBlock to c1/2 channels, three stride-1 max
// poolings (kernels k1, k2, k3, padded by k/2) applied in parallel to that
// output, a concatenation of the four tensors, and a final 1x1 convBlock to
// c2 channels.
//   network   - network definition the layers are added to
//   weightMap - named weights, looked up under lname + ".cv1" / ".cv2"
//   input     - tensor fed to the first convBlock
//   returns   - the layer produced by the second convBlock
nvinfer1::IElementWiseLayer* SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, 
  int c1, int c2, int k1, int k2, int k3, std::string lname) {
  const int hiddenCh = c1 / 2;
  auto reduce = convBlock(network, weightMap, input, hiddenCh, 1, 1, 1, lname + ".cv1");
  ITensor* reduced = reduce->getOutput(0);

  // Adds one stride-1 max pooling over the reduced tensor; padding k/2 is set
  // on both spatial axes.
  auto addMaxPool = [&](int k) -> ITensor* {
    auto pool = network->addPoolingNd(*reduced, PoolingType::kMAX, DimsHW{ k, k });
    pool->setPaddingNd(DimsHW{ k / 2, k / 2 });
    pool->setStrideNd(DimsHW{ 1, 1 });
    return pool->getOutput(0);
  };

  // Created one after another so the layers enter the network in k1, k2, k3 order.
  ITensor* pooledK1 = addMaxPool(k1);
  ITensor* pooledK2 = addMaxPool(k2);
  ITensor* pooledK3 = addMaxPool(k3);

  ITensor* branches[] = { reduced, pooledK1, pooledK2, pooledK3 };
  auto merged = network->addConcatenation(branches, 4);

  return convBlock(network, weightMap, *merged->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
}

// Bottleneck: a 1x1 convBlock to int(c2 * e) channels followed by a 3x3
// convBlock (groups = g) to c2 channels. When `shortcut` is set and c1 == c2
// the block input is added element-wise to the second convBlock's output and
// that sum layer is returned; otherwise the second convBlock layer is returned.
nvinfer1::IElementWiseLayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
  const int hiddenCh = (int)((float)c2 * e);
  auto squeeze = convBlock(network, weightMap, input, hiddenCh, 1, 1, 1, lname + ".cv1");
  auto expand = convBlock(network, weightMap, *squeeze->getOutput(0), c2, 3, 1, g, lname + ".cv2");

  const bool addResidual = shortcut && c1 == c2;
  if (!addResidual) {
    return expand;
  }
  return network->addElementWise(input, *expand->getOutput(0), ElementWiseOperation::kSUM);
}

// BottleneckCSP block.
//   main path : convBlock(.cv1) -> n x bottleneck(.m.<i>) -> bias-free 1x1 conv (.cv3)
//   side path : bias-free 1x1 conv (.cv2) applied directly to the input
// Both paths have int(c2 * e) channels. They are concatenated (main first),
// passed through batch norm (.bn, eps 1e-4) and LeakyReLU (alpha 0.1), then a
// final 1x1 convBlock (.cv4) to c2 channels, whose layer is returned.
nvinfer1::IElementWiseLayer* bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
  const int hiddenCh = (int)((float)c2 * e);
  Weights noBias{ DataType::kFLOAT, nullptr, 0 };

  auto mainStem = convBlock(network, weightMap, input, hiddenCh, 1, 1, 1, lname + ".cv1");
  auto sideConv = network->addConvolutionNd(input, hiddenCh, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], noBias);

  // Chain the n bottlenecks, each consuming the previous one's output.
  ITensor* chain = mainStem->getOutput(0);
  int idx = 0;
  while (idx < n) {
    auto stage = bottleneck(network, weightMap, *chain, hiddenCh, hiddenCh, shortcut, g, 1.0, lname + ".m." + std::to_string(idx));
    chain = stage->getOutput(0);
    ++idx;
  }
  auto mainConv = network->addConvolutionNd(*chain, hiddenCh, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], noBias);

  ITensor* paths[] = { mainConv->getOutput(0), sideConv->getOutput(0) };
  auto merged = network->addConcatenation(paths, 2);

  IScaleLayer* norm = addBatchNorm2d(network, weightMap, *merged->getOutput(0), lname + ".bn", 1e-4);
  auto leaky = network->addActivation(*norm->getOutput(0), ActivationType::kLEAKY_RELU);
  leaky->setAlpha(0.1);

  return convBlock(network, weightMap, *leaky->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
}

// C3 block: two parallel 1x1 convBlocks (.cv1, .cv2) reduce the input to
// int(c2 * e) channels. The .cv1 branch runs through n chained bottlenecks
// (.m.<i>); its result is concatenated with the .cv2 branch and fused by a
// 1x1 convBlock (.cv3) to c2 channels, whose layer is returned.
nvinfer1::IElementWiseLayer* C3(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input,
           int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
  const int hiddenCh = (int)((float)c2 * e);
  auto deepBranch = convBlock(network, weightMap, input, hiddenCh, 1, 1, 1, lname + ".cv1");
  auto skipBranch = convBlock(network, weightMap, input, hiddenCh, 1, 1, 1, lname + ".cv2");

  ITensor *chain = deepBranch->getOutput(0);
  int idx = 0;
  while (idx < n) {
    auto stage = bottleneck(network, weightMap, *chain, hiddenCh, hiddenCh, shortcut, g, 1.0, lname + ".m." + std::to_string(idx));
    chain = stage->getOutput(0);
    ++idx;
  }

  ITensor* paths[] = { chain, skipBranch->getOutput(0) };
  auto merged = network->addConcatenation(paths, 2);

  return convBlock(network, weightMap, *merged->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
}


// Adds a bias-free square convolution (weights at lname + ".conv.weight")
// followed by batch normalisation (lname + ".bn", eps 1e-5) and returns the
// batch-norm scale layer.
nvinfer1::IScaleLayer *conv_bn(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
           std::string lname, int out_channels, int kernel_size, int stride, int padding, int groups=1){
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    const nvinfer1::DimsHW kernel{kernel_size, kernel_size};

    nvinfer1::IConvolutionLayer *convLayer =
        network->addConvolutionNd(input, out_channels, kernel, weightMap[lname + ".conv.weight"], noBias);
    convLayer->setStrideNd(nvinfer1::DimsHW{stride, stride});
    convLayer->setPaddingNd(nvinfer1::DimsHW{padding, padding});
    convLayer->setNbGroups(groups);

    return addBatchNorm2d(network, weightMap, *convLayer->getOutput(0), lname + ".bn", 1e-5);
}

// RepVGG-style block: a kernel_size x kernel_size conv_bn branch
// (".rbr_dense") and a 1x1 conv_bn branch (".rbr_1x1") are applied to the same
// input, summed element-wise, and passed through ReLU. The 1x1 branch uses
// padding - kernel_size / 2 as its padding. Returns the ReLU layer.
nvinfer1::IActivationLayer *RepVGGBlock(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
        std::string lname, int out_channels, int kernel_size = 3, int stride = 1, int padding = 1, int groups=1){
    const int pointwisePadding = padding - kernel_size / 2;

    nvinfer1::IScaleLayer *denseBranch =
        conv_bn(network, weightMap, input, lname + ".rbr_dense", out_channels, kernel_size, stride, padding, groups);
    nvinfer1::IScaleLayer *pointwiseBranch =
        conv_bn(network, weightMap, input, lname + ".rbr_1x1", out_channels, 1, stride, pointwisePadding, groups);

    nvinfer1::IElementWiseLayer *branchSum = network->addElementWise(
        *denseBranch->getOutput(0), *pointwiseBranch->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);

    return network->addActivation(*branchSum->getOutput(0), nvinfer1::ActivationType::kRELU);
}

// Depthwise-separable convolution block:
//   1. kernel_size x kernel_size convolution with groups == in_channels
//      (weights lname + ".conv1.weight", no bias), given stride, padding
//      kernel_size / 2
//   2. batch norm (lname + ".bn1", eps 1e-5) + ReLU
//   3. 1x1 convolution to out_channels (lname + ".conv2.weight", no bias)
//   4. batch norm (lname + ".bn2", eps 1e-5) + ReLU
// Returns the final ReLU layer.
//
// The previous version also printed the bare value of kernel_size / 2 to
// stdout on every call; that leftover debug output has been removed.
nvinfer1::IActivationLayer *DWConvblock(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
     std::string lname, int in_channels, int out_channels, int kernel_size, int stride){
    nvinfer1::Weights emptywts {nvinfer1::DataType::kFLOAT, nullptr, 0};

    // Depthwise stage: one group per input channel.
    nvinfer1::IConvolutionLayer *conv1 = network->addConvolutionNd(input, in_channels, nvinfer1::DimsHW{kernel_size, kernel_size},
      weightMap[lname + ".conv1.weight"], emptywts);
    conv1->setStrideNd(nvinfer1::DimsHW{stride, stride});
    conv1->setPaddingNd(nvinfer1::DimsHW{kernel_size / 2, kernel_size / 2});
    conv1->setNbGroups(in_channels);
    nvinfer1::IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5);
    nvinfer1::IActivationLayer *relu1 = network->addActivation(*bn1->getOutput(0), nvinfer1::ActivationType::kRELU);

    // Pointwise stage: 1x1 convolution mixing channels.
    nvinfer1::IConvolutionLayer *conv2 = network->addConvolutionNd(*relu1->getOutput(0), out_channels, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".conv2.weight"], emptywts);
    conv2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IScaleLayer *bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5);
    nvinfer1::IActivationLayer *relu2 = network->addActivation(*bn2->getOutput(0), nvinfer1::ActivationType::kRELU);

    return relu2;
}

std::vector<std::vector<float>> getAnchors(std::map<std::string, Weights>& weightMap, std::string lname) 
{
    std::vector<std::vector<float>> anchors;
    Weights wts = weightMap[lname + ".anchor_grid"];
    int anchor_len = Yolo::CHECK_COUNT * 2; // 6
    for (int i = 0; i < wts.count / anchor_len; i++) 
    {
        auto *p = (const float*)wts.values + i * anchor_len;
        std::vector<float> anchor(p, p + anchor_len);
        anchors.push_back(anchor);
    }
    return anchors;
}

// Focus layer: takes four stride-2 spatial slices of the input (start offsets
// (0,0), (1,0), (0,1), (1,1) on the last two axes), each of size
// inch x INPUT_H/2 x INPUT_W/2, concatenates them in that order and applies a
// convBlock (lname + ".conv") with `outch` outputs and kernel `ksize`.
nvinfer1::IElementWiseLayer* focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch,
           int outch, int ksize, std::string lname) {
  const Dims3 sliceSize{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 };
  const Dims3 sliceStep{ 1, 2, 2 };

  ISliceLayer* evenRowEvenCol = network->addSlice(input, Dims3{ 0, 0, 0 }, sliceSize, sliceStep);
  ISliceLayer* oddRowEvenCol = network->addSlice(input, Dims3{ 0, 1, 0 }, sliceSize, sliceStep);
  ISliceLayer* evenRowOddCol = network->addSlice(input, Dims3{ 0, 0, 1 }, sliceSize, sliceStep);
  ISliceLayer* oddRowOddCol = network->addSlice(input, Dims3{ 0, 1, 1 }, sliceSize, sliceStep);

  ITensor* pieces[] = {
    evenRowEvenCol->getOutput(0),
    oddRowEvenCol->getOutput(0),
    evenRowOddCol->getOutput(0),
    oddRowOddCol->getOutput(0)
  };
  auto stacked = network->addConcatenation(pieces, 4);

  return convBlock(network, weightMap, *stacked->getOutput(0), outch, ksize, 1, 1, lname + ".conv");
}

// Adds layers computing x1 + alpha * x2 and returns the element-wise sum layer.
//
// x2 is first multiplied by `alpha` through a uniform scale layer (empty shift
// and power weights), then summed with x1.
//
// TensorRT keeps only a pointer to the weight data until the engine has been
// built, so the scale value must outlive this call. The previous version
// passed the address of the by-value `alpha` parameter, which dangles as soon
// as the function returns. The value is now copied into storage that lives
// for the remainder of the program.
nvinfer1::IElementWiseLayer *ADD(nvinfer1::INetworkDefinition *network,nvinfer1::ITensor& x1,nvinfer1::ITensor& x2, float alpha) {
    // One single-element vector per call. Growing the outer vector moves the
    // inner vectors, which keeps each inner heap buffer (and therefore the
    // pointer handed to TensorRT) at the same address.
    static std::vector<std::vector<float>> scaleStorage;
    scaleStorage.emplace_back(1, alpha);
    const float* scaleValue = scaleStorage.back().data();

    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scaleValue, 1};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};

    nvinfer1::IScaleLayer* scaleLayer = network->addScale(x2, nvinfer1::ScaleMode::kUNIFORM, shift, scale, power);

    nvinfer1::IElementWiseLayer* addLayer = network->addElementWise(x1, *scaleLayer->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);

    return addLayer;
}

// Creates the "YoloLayer_TRT" (version "1") plugin and attaches it to the
// outputs of the detection convolutions in `dets`.
//
// The plugin receives two fields:
//   "netinfo" - {CLASS_NUM, INPUT_W, INPUT_H, MAX_OUTPUT_BBOX_COUNT}
//   "kernels" - one Yolo::YoloKernel per anchor row read from
//               lname + ".anchor_grid"; the i-th kernel covers a grid of
//               INPUT_W / s by INPUT_H / s with s = 8, 16, 32, ...
// Returns the plugin layer added to the network.
IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, std::string lname, std::vector<IConvolutionLayer*> dets) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    auto anchors = getAnchors(weightMap, lname);

    // One kernel description per anchor row; the downscale factor doubles each time.
    std::vector<Yolo::YoloKernel> kernels;
    int downscale = 8;
    for (const auto& anchorRow : anchors) {
        Yolo::YoloKernel kernel;
        kernel.width = Yolo::INPUT_W / downscale;
        kernel.height = Yolo::INPUT_H / downscale;
        memcpy(kernel.anchors, &anchorRow[0], anchorRow.size() * sizeof(float));
        kernels.push_back(kernel);
        downscale *= 2;
    }

    int netinfo[4] = {Yolo::CLASS_NUM, Yolo::INPUT_W, Yolo::INPUT_H, Yolo::MAX_OUTPUT_BBOX_COUNT};

    PluginField plugin_fields[2];
    plugin_fields[0].name = "netinfo";
    plugin_fields[0].data = netinfo;
    plugin_fields[0].length = 4;
    plugin_fields[0].type = PluginFieldType::kFLOAT32;

    plugin_fields[1].name = "kernels";
    plugin_fields[1].data = &kernels[0];
    plugin_fields[1].length = kernels.size();
    plugin_fields[1].type = PluginFieldType::kFLOAT32;

    PluginFieldCollection plugin_data;
    plugin_data.nbFields = 2;
    plugin_data.fields = plugin_fields;
    IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data);

    // Gather the first output of every detection convolution as plugin inputs.
    std::vector<ITensor*> input_tensors;
    for (size_t i = 0; i < dets.size(); ++i) {
        input_tensors.push_back(dets[i]->getOutput(0));
    }

    return network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj);
}


#endif


================================================
FILE: yolov5-lite/gen_wts.py
================================================
import argparse
import os
import struct
import torch
from utils.torch_utils import select_device


def parse_args():
    """Parse the command line and resolve the input/output file paths.

    Options:
        -w / --weights  path to an existing .pt file (required)
        -o / --output   output .wts path, or an existing directory (optional)

    Returns:
        (weights_path, output_path). When no output is given, the weights path
        with its extension replaced by '.wts' is used. When the output is an
        existing directory, '<weights basename>.wts' is placed inside it.

    Raises:
        SystemExit: if the weights path is not an existing file.
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)')
    parser.add_argument('-o', '--output', help='Output (.wts) file path (optional)')
    opts = parser.parse_args()

    weights_path = opts.weights
    if not os.path.isfile(weights_path):
        raise SystemExit('Invalid input file')

    output_path = opts.output
    if not output_path:
        # Default: sit next to the weights file, same stem.
        stem, _ext = os.path.splitext(weights_path)
        output_path = stem + '.wts'
    elif os.path.isdir(output_path):
        # Directory given: derive the file name from the weights file.
        stem, _ext = os.path.splitext(os.path.basename(weights_path))
        output_path = os.path.join(output_path, stem + '.wts')

    opts.output = output_path
    return weights_path, output_path


pt_file, wts_file = parse_args()

# Load the checkpoint on the CPU and take its 'model' entry as an FP32 module.
device = select_device('cpu')
checkpoint = torch.load(pt_file, map_location=device, weights_only=False)
model = checkpoint['model'].float()
model.to(device).eval()

# Text format written below:
#   line 1          : number of entries in the state dict
#   following lines : "<key> <element count> " then, for every element,
#                     a space and the big-endian float32 bytes in hex
state = model.state_dict()
with open(wts_file, 'w') as out:
    out.write('{}\n'.format(len(state.keys())))
    for name, tensor in state.items():
        # Flatten to 1D so every parameter is written as one row of values.
        flat = tensor.reshape(-1).cpu().numpy()
        out.write('{} {} '.format(name, len(flat)))
        out.write(''.join(' ' + struct.pack('>f', float(value)).hex() for value in flat))
        out.write('\n')


================================================
FILE: yolov5-lite/v5lite.cpp
================================================
#include <iostream>
#include <chrono>
#include <cmath>
#include <cstdio>
#include<cassert>


#include "cuda_utils.h"
#include "logging.h"
#include "common.hpp"
#include "utils.h"
#include "calibrator.h"

// Build-time precision switch used by the engine builders in this file:
// USE_FP16 sets the FP16 builder flag, USE_INT8 sets the INT8 flag and
// installs a calibrator. With neither macro defined no precision flag is set.
// #define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
#define USE_INT8  // set USE_INT8 or USE_FP16 or USE_FP32


// Number of floats in one output buffer: room for MAX_OUTPUT_BBOX_COUNT
// Yolo::Detection records plus one extra float.
// NOTE(review): the extra float presumably holds the detection count — confirm
// against the yololayer plugin.
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1;  // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes with conf >= 0.1
// File-wide logger instance.
static Logger gLogger;

// Scales a repeat count `x` by the depth multiple `gd`.
// A count of exactly 1 is returned unchanged. Otherwise x * gd is rounded to
// the nearest integer, with exact .5 ties going to the even neighbour, and the
// result is clamped to be at least 1.
static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }

    int depth = round(x * gd);

    // round() sends ties away from zero; step back when the truncated value is
    // already even so that the tie lands on the even neighbour instead.
    const bool exactHalf = (x * gd - int(x * gd) == 0.5);
    const bool truncatedIsEven = (int(x * gd) % 2) == 0;
    if (exactHalf && truncatedIsEven) {
        depth -= 1;
    }

    return std::max<int>(depth, 1);
}

// Returns x * gw rounded up to the next multiple of `divisor`
// (e.g. 116 -> 120 with the default divisor of 8).
inline int Get_channel(int x, int gw = 1, float divisor = 8.0){
  const int scaled = x * gw;
  const int blocks = int(ceil(scaled / divisor));
  // The product is computed in float and truncated on return.
  const float rounded_up = blocks * divisor;
  return rounded_up;
}

// Builds the TensorRT engine for the "c" variant of the detector.
//
//   maxBatchSize - passed to builder->setMaxBatchSize
//   builder      - TensorRT builder used to create the network and the engine
//   config       - builder configuration; workspace size, precision flag and
//                  (for INT8) a calibrator are set on it here
//   dt           - data type of the network input tensor
//   wts_name     - path of the .wts file handed to loadWeights
//
// Returns the result of buildEngineWithConfig; it is not checked for null
// here. The network definition is destroyed and the host weight buffers are
// freed before returning.
//
// The input tensor is declared as {3, INPUT_H, INPUT_W}, matching
// build_det_v5_lite_g / build_det_v5_lite_s. The earlier {3, INPUT_W, INPUT_H}
// order only worked when height and width were equal.
nvinfer1::ICudaEngine *build_det_v5_lite_c(unsigned int maxBatchSize, nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, 
       nvinfer1::DataType dt, std::string wts_name)
{

  nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U);
  nvinfer1::ITensor *data = network->addInput(Yolo::INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, Yolo::INPUT_H, Yolo::INPUT_W});
  assert(data);
  std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_name);


  // backbone
  nvinfer1::IElementWiseLayer *conv0 = CBH(network, weightMap, *data, Get_channel(32), 3, 2, "model.0");
  nvinfer1::IElementWiseLayer *conv1 = LC_Block(network, weightMap, *conv0->getOutput(0), Get_channel(32), Get_channel(64), 2, 3, "model.1", false);
  nvinfer1::IElementWiseLayer *conv2 = LC_Block(network, weightMap, *conv1->getOutput(0), Get_channel(64), Get_channel(64), 1, 3, "model.2", false);
  nvinfer1::IElementWiseLayer *conv3 = LC_Block(network, weightMap, *conv2->getOutput(0), Get_channel(64), Get_channel(128), 2, 3, "model.3", false);
  nvinfer1::IElementWiseLayer *conv4 = LC_Block(network, weightMap, *conv3->getOutput(0), Get_channel(128), Get_channel(128), 1, 3, "model.4", false);
  nvinfer1::IElementWiseLayer *conv5 = LC_Block(network, weightMap, *conv4->getOutput(0), Get_channel(128), Get_channel(128), 1, 3, "model.5", false);
  nvinfer1::IElementWiseLayer *conv6 = LC_Block(network, weightMap, *conv5->getOutput(0), Get_channel(128), Get_channel(128), 1, 3, "model.6", false);
  nvinfer1::IElementWiseLayer *conv7 = LC_Block(network, weightMap, *conv6->getOutput(0), Get_channel(128), Get_channel(256), 2, 3, "model.7", false);
  nvinfer1::IElementWiseLayer *conv8 = LC_Block(network, weightMap, *conv7->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.8", false);
  nvinfer1::IElementWiseLayer *conv9 = LC_Block(network, weightMap, *conv8->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.9", false);
  nvinfer1::IElementWiseLayer *conv10 = LC_Block(network, weightMap, *conv9->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.10", false);
  nvinfer1::IElementWiseLayer *conv11 = LC_Block(network, weightMap, *conv10->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.11", false);
  nvinfer1::IElementWiseLayer *conv12 = LC_Block(network, weightMap, *conv11->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.12", false);
  nvinfer1::IElementWiseLayer *conv13 = LC_Block(network, weightMap, *conv12->getOutput(0), Get_channel(256), Get_channel(512), 2, 5, "model.13", true);
  nvinfer1::IElementWiseLayer *conv14 = LC_Block(network, weightMap, *conv13->getOutput(0), Get_channel(512), Get_channel(512), 1, 5, "model.14", true);
  nvinfer1::IElementWiseLayer *conv15 = LC_Block(network, weightMap, *conv14->getOutput(0), Get_channel(512), Get_channel(512), 1, 5, "model.15", true);
  nvinfer1::IElementWiseLayer *conv16 = LC_Block(network, weightMap, *conv15->getOutput(0), Get_channel(512), Get_channel(512), 1, 5, "model.16", true);
  nvinfer1::IElementWiseLayer *conv17 = Dense(network, weightMap, *conv16->getOutput(0), Get_channel(512), 1, "model.17");

  // neck
  // Resize scales per axis (C, H, W): channels unchanged, spatial size doubled.
  float scale[] = {1.0, 2.0, 2.0};
  nvinfer1::IElementWiseLayer *conv18 = convBlock(network, weightMap, *conv17->getOutput(0), Get_channel(256), 1, 1, 1, "model.18");
  nvinfer1::IResizeLayer *upsample19 = network->addResize(*conv18->getOutput(0));
  upsample19->setScales(scale, 3);
  nvinfer1::ITensor *inputTensors20[] = {upsample19->getOutput(0), conv12->getOutput(0)}; // 256 + 256 = 512
  nvinfer1::IConcatenationLayer *cat20 = network->addConcatenation(inputTensors20, 2);
  nvinfer1::IElementWiseLayer *conv21 = C3(network, weightMap, *cat20->getOutput(0), 512, Get_channel(256), get_depth(1, 1), false, 1, 0.5, "model.21");

  nvinfer1::IElementWiseLayer *conv22 = convBlock(network, weightMap, *conv21->getOutput(0), Get_channel(128), 1, 1, 1, "model.22");
  nvinfer1::IResizeLayer *upsample23 = network->addResize(*conv22->getOutput(0));
  upsample23->setScales(scale, 3);
  nvinfer1::ITensor *inputTensors24[] = {upsample23->getOutput(0), conv6->getOutput(0)}; // 128 + 128 = 256
  nvinfer1::IConcatenationLayer *cat24 = network->addConcatenation(inputTensors24, 2);
  nvinfer1::IElementWiseLayer *conv25 = C3(network, weightMap, *cat24->getOutput(0), 256, Get_channel(128), get_depth(1, 1), false, 1, 0.5, "model.25");

  nvinfer1::IElementWiseLayer *conv26 = LC_Block(network, weightMap, *conv25->getOutput(0), Get_channel(128), Get_channel(128), 2, 5, "model.26", true);
  nvinfer1::ITensor *inputTensor27[] = {conv26->getOutput(0), conv22->getOutput(0)}; // 128 + 128 = 256
  nvinfer1::IConcatenationLayer *cat27 = network->addConcatenation(inputTensor27, 2);
  nvinfer1::IElementWiseLayer *conv28 = C3(network, weightMap, *cat27->getOutput(0), 256, Get_channel(256), get_depth(1, 1), false, 1, 0.5, "model.28");

  nvinfer1::IElementWiseLayer *conv29 = LC_Block(network, weightMap, *conv28->getOutput(0), Get_channel(256), Get_channel(256), 2, 5, "model.29", true);
  nvinfer1::ITensor *inputTensor30[] = {conv29->getOutput(0), conv18->getOutput(0)}; // 256 + 256 = 512
  nvinfer1::IConcatenationLayer *cat30 = network->addConcatenation(inputTensor30, 2);
  nvinfer1::IElementWiseLayer *conv31 = C3(network, weightMap, *cat30->getOutput(0), 512, Get_channel(512), get_depth(1, 1), false, 1, 0.5, "model.31");

  // detect: one 1x1 convolution per scale, 3 * (CLASS_NUM + 5) outputs each
  nvinfer1::IConvolutionLayer *det0 = network->addConvolutionNd(*conv25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), 
      nvinfer1::DimsHW{1, 1}, weightMap["model.32.m.0.weight"], weightMap["model.32.m.0.bias"]);
    
  nvinfer1::IConvolutionLayer *det1 = network->addConvolutionNd(*conv28->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), 
      nvinfer1::DimsHW{1, 1}, weightMap["model.32.m.1.weight"], weightMap["model.32.m.1.bias"]);
    
  nvinfer1::IConvolutionLayer *det2 = network->addConvolutionNd(*conv31->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), 
        nvinfer1::DimsHW{1, 1}, weightMap["model.32.m.2.weight"], weightMap["model.32.m.2.bias"]);
    
  auto yolo = addYoLoLayer(network, weightMap, "model.32", std::vector<nvinfer1::IConvolutionLayer*>{det0, det1, det2});
  yolo->getOutput(0)->setName(Yolo::OUTPUT_BLOB_NAME);
  network->markOutput(*yolo->getOutput(0));

  // Engine config
  builder->setMaxBatchSize(maxBatchSize);
  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
  config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
  assert(builder->platformHasFastInt8());
  config->setFlag(BuilderFlag::kINT8);
  std::string data_path = "tensorrtx-int8calib-data/coco_calib/";
  //Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, Yolo::INPUT_W, Yolo::INPUT_H, data_path.c_str(), "int8calib.table", Yolo::INPUT_BLOB_NAME);
  config->setInt8Calibrator(calibrator);
#endif

  std::cout << "Building engine, please wait for a while..." << std::endl;
  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Don't need the network any more
  network->destroy();

  // Release host memory
  for (auto& mem : weightMap) {
    free((void*)(mem.second.values));
  }

  return engine;
}


// Builds the TensorRT engine for the "e" variant of the detector.
//
//   maxBatchSize - passed to builder->setMaxBatchSize
//   builder      - TensorRT builder used to create the network and the engine
//   config       - builder configuration; workspace size, precision flag and
//                  (for INT8) a calibrator are set on it here
//   dt           - data type of the network input tensor
//   wts_name     - path of the .wts file handed to loadWeights
//
// Returns the result of buildEngineWithConfig; it is not checked for null
// here. The network definition is destroyed and the host weight buffers are
// freed before returning.
//
// The input tensor is declared as {3, INPUT_H, INPUT_W}, matching
// build_det_v5_lite_g / build_det_v5_lite_s. The earlier {3, INPUT_W, INPUT_H}
// order only worked when height and width were equal.
nvinfer1::ICudaEngine *build_det_v5_lite_e(unsigned int maxBatchSize, nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config,
    nvinfer1::DataType dt, std::string wts_name){
  nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U);
  nvinfer1::ITensor *data = network->addInput(Yolo::INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, Yolo::INPUT_H, Yolo::INPUT_W});
  assert(data);
  std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_name);

  // backbone
  nvinfer1::IPoolingLayer *conv0 = conv_bn_relu_maxpool(network, weightMap, *data, 32, "model.0."); //32
  // std::cout << "Get_channel: " << Get_channel(116) << std::endl;
  nvinfer1::IShuffleLayer *conv1 = shuffle_block(network, weightMap, *conv0->getOutput(0), "model.1.", 32, Get_channel(116), 2); //120
  nvinfer1::IShuffleLayer *conv2_0 = shuffle_block(network, weightMap, *conv1->getOutput(0), "model.2.0.", Get_channel(116), Get_channel(116), 1); //120
  nvinfer1::IShuffleLayer *conv2_1 = shuffle_block(network, weightMap, *conv2_0->getOutput(0), "model.2.1.", Get_channel(116), Get_channel(116), 1); // 120
  nvinfer1::IShuffleLayer *conv2_2 = shuffle_block(network, weightMap, *conv2_1->getOutput(0), "model.2.2.", Get_channel(116), Get_channel(116), 1); // 120
  nvinfer1::IShuffleLayer *conv3 = shuffle_block(network, weightMap, *conv2_2->getOutput(0), "model.3.", Get_channel(116), Get_channel(232), 2); // 232
  nvinfer1::IShuffleLayer *conv4_0 = shuffle_block(network, weightMap, *conv3->getOutput(0), "model.4.0.", Get_channel(232), Get_channel(232), 1); // 232 
  nvinfer1::IShuffleLayer *conv4_1 = shuffle_block(network, weightMap, *conv4_0->getOutput(0), "model.4.1.", Get_channel(232), Get_channel(232), 1); // 232
  nvinfer1::IShuffleLayer *conv4_2 = shuffle_block(network, weightMap, *conv4_1->getOutput(0), "model.4.2.", Get_channel(232), Get_channel(232), 1); // 232
  nvinfer1::IShuffleLayer *conv4_3 = shuffle_block(network, weightMap, *conv4_2->getOutput(0), "model.4.3.", Get_channel(232), Get_channel(232), 1); // 232
  nvinfer1::IShuffleLayer *conv4_4 = shuffle_block(network, weightMap, *conv4_3->getOutput(0), "model.4.4.", Get_channel(232), Get_channel(232), 1); //232
  nvinfer1::IShuffleLayer *conv4_5 = shuffle_block(network, weightMap, *conv4_4->getOutput(0), "model.4.5.", Get_channel(232), Get_channel(232), 1);
  nvinfer1::IShuffleLayer *conv4_6 = shuffle_block(network, weightMap, *conv4_5->getOutput(0), "model.4.6.", Get_channel(232), Get_channel(232), 1); // 232
  nvinfer1::IShuffleLayer *conv5 = shuffle_block(network, weightMap, *conv4_6->getOutput(0), "model.5.", Get_channel(232), Get_channel(464), 2); //464 
  nvinfer1::IShuffleLayer *conv6 = shuffle_block(network, weightMap, *conv5->getOutput(0), "model.6.", Get_channel(464), Get_channel(464), 1); // 464

  // neck
  // Resize scales per axis (C, H, W): channels unchanged, spatial size doubled.
  float scale[] = {1.0, 2.0, 2.0};
  nvinfer1::IElementWiseLayer *conv7 = convBlock(network, weightMap, *conv6->getOutput(0), Get_channel(96), 1, 1, 1, "model.7"); // 96
  nvinfer1::IResizeLayer *upsample8 = network->addResize(*conv7->getOutput(0));
  upsample8->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
  upsample8->setScales(scale, 3);
  nvinfer1::ITensor *inputTensors9[] = {upsample8->getOutput(0), conv4_6->getOutput(0)};
  nvinfer1::IConcatenationLayer *cat9 = network->addConcatenation(inputTensors9, 2); //  96 + 232 = 328
  nvinfer1::IActivationLayer *conv10 = DWConvblock(network, weightMap, *cat9->getOutput(0), "model.10", 328, Get_channel(96), 3, 1);

  nvinfer1::IElementWiseLayer *conv11 = convBlock(network, weightMap, *conv10->getOutput(0), Get_channel(96), 1, 1, 1, "model.11"); // 96
  nvinfer1::IResizeLayer *upsample12 = network->addResize(*conv11->getOutput(0));
  upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
  upsample12->setScales(scale, 3);
  nvinfer1::ITensor *inputTensors13[] = {upsample12->getOutput(0), conv2_2->getOutput(0)}; // 96 + 120 
  nvinfer1::IConcatenationLayer *cat13 = network->addConcatenation(inputTensors13, 2);
  nvinfer1::IActivationLayer *conv14 = DWConvblock(network, weightMap, *cat13->getOutput(0), "model.14", 216, Get_channel(96), 3, 1);

  nvinfer1::IActivationLayer *conv15 = DWConvblock(network, weightMap, *conv14->getOutput(0), "model.15", Get_channel(96), Get_channel(96), 3, 2);
  nvinfer1::IElementWiseLayer *add16 = ADD(network, *conv15->getOutput(0), *conv11->getOutput(0), 1.0);
  nvinfer1::IActivationLayer *conv17 = DWConvblock(network, weightMap, *add16->getOutput(0), "model.17", Get_channel(96), Get_channel(96), 3, 1);

  nvinfer1::IActivationLayer *conv18 = DWConvblock(network, weightMap, *conv17->getOutput(0), "model.18", Get_channel(96), Get_channel(96), 3, 2);
  nvinfer1::IElementWiseLayer *add19 = ADD(network, *conv18->getOutput(0), *conv7->getOutput(0), 1.0);
  nvinfer1::IActivationLayer *conv20 = DWConvblock(network, weightMap, *add19->getOutput(0), "model.20", Get_channel(96), Get_channel(96), 3, 1);


  // detect: one 1x1 convolution per scale, 3 * (CLASS_NUM + 5) outputs each
  nvinfer1::IConvolutionLayer *det0 = network->addConvolutionNd(*conv14->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), 
      nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.0.weight"], weightMap["model.21.m.0.bias"]);
    
  nvinfer1::IConvolutionLayer *det1 = network->addConvolutionNd(*conv17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), 
      nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.1.weight"], weightMap["model.21.m.1.bias"]);
    
  nvinfer1::IConvolutionLayer *det2 = network->addConvolutionNd(*conv20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), 
        nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.2.weight"], weightMap["model.21.m.2.bias"]);
    
  auto yolo = addYoLoLayer(network, weightMap, "model.21", std::vector<nvinfer1::IConvolutionLayer*>{det0, det1, det2});
  yolo->getOutput(0)->setName(Yolo::OUTPUT_BLOB_NAME);
  network->markOutput(*yolo->getOutput(0));

  // Engine config
  builder->setMaxBatchSize(maxBatchSize);
  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
  config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
  assert(builder->platformHasFastInt8());
  config->setFlag(BuilderFlag::kINT8);
  std::string data_path = "tensorrtx-int8calib-data/coco_calib/";
  //Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, Yolo::INPUT_W, Yolo::INPUT_H, data_path.c_str(), "int8calib.table", Yolo::INPUT_BLOB_NAME);
  config->setInt8Calibrator(calibrator);
#endif

  std::cout << "Building engine, please wait for a while..." << std::endl;
  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Don't need the network any more
  network->destroy();

  // Release host memory
  for (auto& mem : weightMap) {
    free((void*)(mem.second.values));
  }

  return engine;
}


// Builds the TensorRT engine for the "g" variant of the detector.
//
//   maxBatchSize - passed to builder->setMaxBatchSize
//   builder      - TensorRT builder used to create the network and the engine
//   config       - builder configuration; workspace size, precision flag and
//                  (for INT8) a calibrator are set on it here
//   dt           - data type of the network input tensor
//   wts_name     - path of the .wts file handed to loadWeights
//
// Returns the result of buildEngineWithConfig; it is not checked for null
// here. The network definition is destroyed and the host weight buffers are
// freed before returning.
nvinfer1::ICudaEngine *build_det_v5_lite_g(unsigned int maxBatchSize, nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, 
                  nvinfer1::DataType dt,  std::string wts_name){
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);

    // backbone
    // Input tensor is declared as {3, INPUT_H, INPUT_W}.
    nvinfer1::ITensor *data = network->addInput(Yolo::INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, Yolo::INPUT_H, Yolo::INPUT_W});
    assert(data);
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_name);
    // Alternating RepVGGBlock (stride 2) and C3 stages, ending with SPP + C3.
    nvinfer1::IElementWiseLayer *conv0 = focus(network, weightMap, *data, 3, Get_channel(32), 3, "model.0"); // 32
    nvinfer1::IActivationLayer *conv1 = RepVGGBlock(network, weightMap, *conv0->getOutput(0), "model.1", Get_channel(64), 3, 2, 1); //64
    nvinfer1::IElementWiseLayer *conv2 = C3(network, weightMap, *conv1->getOutput(0), Get_channel(64), Get_channel(64), get_depth(1, 1), true, 1, 0.5, "model.2"); // 64
    nvinfer1::IActivationLayer *conv3 = RepVGGBlock(network, weightMap, *conv2->getOutput(0), "model.3", Get_channel(128), 3, 2, 1); // 128
    nvinfer1::IElementWiseLayer *conv4 = C3(network, weightMap, *conv3->getOutput(0), Get_channel(128), Get_channel(128), get_depth(3, 1), true, 1, 0.5, "model.4"); // 128
    nvinfer1::IActivationLayer *conv5 = RepVGGBlock(network, weightMap, *conv4->getOutput(0), "model.5", Get_channel(256), 3, 2, 1); // 256
    nvinfer1::IElementWiseLayer *conv6 = C3(network, weightMap, *conv5->getOutput(0), Get_channel(256), Get_channel(256), get_depth(3, 1), true, 1, 0.5, "model.6"); // 256
    nvinfer1::IActivationLayer *conv7 = RepVGGBlock(network, weightMap, *conv6->getOutput(0), "model.7", Get_channel(512), 3, 2, 1); // 512
    nvinfer1::IElementWiseLayer *conv8 = SPP(network, weightMap, *conv7->getOutput(0), Get_channel(512), Get_channel(512), 5, 9, 13, "model.8"); // 512
    nvinfer1::IElementWiseLayer *conv9 = C3(network, weightMap, *conv8->getOutput(0), Get_channel(512), Get_channel(512), get_depth(1, 1), false, 1, 0.5, "model.9"); // 512
    

    // neck
    // Resize scales per axis: first axis unchanged, the two spatial axes doubled.
    float scale[] = {1.0, 2.0, 2.0};
    // Top-down path: 1x1 convBlock, nearest-neighbour upsample, concat with conv6, C3.
    nvinfer1::IElementWiseLayer *conv10 = convBlock(network, weightMap, *conv9->getOutput(0), Get_channel(128), 1, 1, 1, "model.10"); // 128
    nvinfer1::IResizeLayer *upsample11 = network->addResize(*conv10->getOutput(0));
    upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample11->setScales(scale, 3);
    nvinfer1::ITensor *inputTensors12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer *cat12 = network->addConcatenation(inputTensors12, 2); // 384
    nvinfer1::IElementWiseLayer *conv13 = C3(network, weightMap, *cat12->getOutput(0), 384, Get_channel(128), get_depth(3, 1), false, 1, 0.5, "model.13");

    // Second top-down step: upsample again and concat with conv4.
    nvinfer1::IElementWiseLayer *conv14 = convBlock(network, weightMap, *conv13->getOutput(0), Get_channel(128), 1, 1, 1, "model.14"); // 128
    nvinfer1::IResizeLayer *upsample15 = network->addResize(*conv14->getOutput(0));
    upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample15->setScales(scale, 3);
    nvinfer1::ITensor *inputTensors16[] = {upsample15->getOutput(0), conv4->getOutput(0)}; //  128+128
    nvinfer1::IConcatenationLayer *cat16 = network->addConcatenation(inputTensors16, 2);
    nvinfer1::IElementWiseLayer *conv17 = C3(network, weightMap, *cat16->getOutput(0), 256, Get_channel(128), get_depth(3, 1), false, 1, 0.5, "model.17");

    // Bottom-up path: stride-2 3x3 convBlock, concat with conv14, C3.
    nvinfer1::IElementWiseLayer *conv18 = convBlock(network, weightMap, *conv17->getOutput(0), Get_channel(128), 3, 2, 1, "model.18"); // 128
    nvinfer1::ITensor *inputTensors19[] = {conv18->getOutput(0), conv14->getOutput(0)};
    nvinfer1::IConcatenationLayer *cat19 = network->addConcatenation(inputTensors19, 2); // 128 + 128
    nvinfer1::IElementWiseLayer *conv20 = C3(network, weightMap, *cat19->getOutput(0), 256, Get_channel(128), get_depth(3, 1), false, 1, 0.5, "model.20");

    // Second bottom-up step: stride-2 3x3 convBlock, concat with conv10, C3.
    nvinfer1::IElementWiseLayer *conv21 = convBlock(network, weightMap, *conv20->getOutput(0), Get_channel(128), 3, 2, 1, "model.21"); // 128
    nvinfer1::ITensor *inputTensors22[] = {conv21->getOutput(0), conv10->getOutput(0)}; 
    nvinfer1::IConcatenationLayer *cat22 = network->addConcatenation(inputTensors22, 2); // 128 + 128
    nvinfer1::IElementWiseLayer *conv23 = C3(network, weightMap, *cat22->getOutput(0), 256, Get_channel(128), get_depth(3, 1), false, 1, 0.5, "model.23");

      // detect
      // One 1x1 convolution per scale (conv17, conv20, conv23), each with
      // 3 * (CLASS_NUM + 5) outputs; all three feed the yolo plugin layer.
    nvinfer1::IConvolutionLayer *det0 = network->addConvolutionNd(*conv17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), 
      nvinfer1::DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
    
    nvinfer1::IConvolutionLayer *det1 = network->addConvolutionNd(*conv20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), 
      nvinfer1::DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
    
    nvinfer1::IConvolutionLayer *det2 = network->addConvolutionNd(*conv23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), 
        nvinfer1::DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
    
    auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<nvinfer1::IConvolutionLayer*>{det0, det1, det2});
    yolo->getOutput(0)->setName(Yolo::OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

      // Engine config
  builder->setMaxBatchSize(maxBatchSize);
  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
  // Precision is chosen at compile time: USE_FP16 sets the FP16 flag, USE_INT8
  // sets the INT8 flag and installs an entropy calibrator; otherwise no flag is set.
#if defined(USE_FP16)
  config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
  assert(builder->platformHasFastInt8());
  config->setFlag(BuilderFlag::kINT8);
  // Calibration images are read from this directory, relative to the working directory.
  std::string data_path = "tensorrtx-int8calib-data/coco_calib/";
  //Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
  // NOTE(review): the calibrator is allocated with new and never deleted in this function.
  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, Yolo::INPUT_W, Yolo::INPUT_H, data_path.c_str(), "int8calib.table", Yolo::INPUT_BLOB_NAME);
  config->setInt8Calibrator(calibrator);
#endif

  std::cout << "Building engine, please wait for a while..." << std::endl;
  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  // NOTE(review): this message is printed even when engine is null.
  std::cout << "Build engine successfully!" << std::endl;

  // Don't need the network any more
  network->destroy();

  // Release host memory
  for (auto& mem : weightMap) {
    free((void*)(mem.second.values));
  }

  return engine;
}


// Builds the TensorRT engine for the "s" variant of the v5lite detector.
//
// The network is assembled layer by layer from the weights stored in
// `wts_name` (keys "model.0." ... "model.21"): a shuffle-block backbone, a
// head made of convBlock / nearest-neighbour upsample / concat / C3 stages, and
// three 1x1 detection convolutions that feed the YOLO plugin layer.
//
// Parameters:
//   maxBatchSize - forwarded to builder->setMaxBatchSize().
//   builder      - TensorRT builder used to create the network and the engine.
//   config       - builder configuration; workspace size and the FP16/INT8
//                  flags (selected by USE_FP16 / USE_INT8) are set on it here.
//   dt           - data type of the network input tensor.
//   wts_name     - path handed to loadWeights().
//
// Returns the engine produced by buildEngineWithConfig(); this is nullptr when
// the build fails, so callers must check the result. The network definition
// and the host copies of the weights are released before returning.
nvinfer1::ICudaEngine *build_det_v5_lite_s(unsigned int maxBatchSize, nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt,std::string & wts_name){
  // backbone
  nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
  assert(network);
  nvinfer1::ITensor *data = network->addInput(Yolo::INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, Yolo::INPUT_H, Yolo::INPUT_W});
  assert(data);
  std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_name);
  nvinfer1::IPoolingLayer *conv0 = conv_bn_relu_maxpool(network, weightMap, *data, 32, "model.0.");
  std::cout << "Get_channel: " << Get_channel(116) << std::endl;
  nvinfer1::IShuffleLayer *conv1 = shuffle_block(network, weightMap, *conv0->getOutput(0), "model.1.", 32, Get_channel(116), 2);
  nvinfer1::IShuffleLayer *conv2_0 = shuffle_block(network, weightMap, *conv1->getOutput(0), "model.2.0.", Get_channel(116), Get_channel(116), 1);
  nvinfer1::IShuffleLayer *conv2_1 = shuffle_block(network, weightMap, *conv2_0->getOutput(0), "model.2.1.", Get_channel(116), Get_channel(116), 1);
  nvinfer1::IShuffleLayer *conv2_2 = shuffle_block(network, weightMap, *conv2_1->getOutput(0), "model.2.2.", Get_channel(116), Get_channel(116), 1);
  nvinfer1::IShuffleLayer *conv3 = shuffle_block(network, weightMap, *conv2_2->getOutput(0), "model.3.", Get_channel(116), Get_channel(232), 2);
  nvinfer1::IShuffleLayer *conv4_0 = shuffle_block(network, weightMap, *conv3->getOutput(0), "model.4.0.", Get_channel(232), Get_channel(232), 1);
  nvinfer1::IShuffleLayer *conv4_1 = shuffle_block(network, weightMap, *conv4_0->getOutput(0), "model.4.1.", Get_channel(232), Get_channel(232), 1);
  nvinfer1::IShuffleLayer *conv4_2 = shuffle_block(network, weightMap, *conv4_1->getOutput(0), "model.4.2.", Get_channel(232), Get_channel(232), 1);
  nvinfer1::IShuffleLayer *conv4_3 = shuffle_block(network, weightMap, *conv4_2->getOutput(0), "model.4.3.", Get_channel(232), Get_channel(232), 1);
  nvinfer1::IShuffleLayer *conv4_4 = shuffle_block(network, weightMap, *conv4_3->getOutput(0), "model.4.4.", Get_channel(232), Get_channel(232), 1);
  nvinfer1::IShuffleLayer *conv4_5 = shuffle_block(network, weightMap, *conv4_4->getOutput(0), "model.4.5.", Get_channel(232), Get_channel(232), 1);
  nvinfer1::IShuffleLayer *conv4_6 = shuffle_block(network, weightMap, *conv4_5->getOutput(0), "model.4.6.", Get_channel(232), Get_channel(232), 1);
  nvinfer1::IShuffleLayer *conv5 = shuffle_block(network, weightMap, *conv4_6->getOutput(0), "model.5.", Get_channel(232), Get_channel(464), 2);
  nvinfer1::IShuffleLayer *conv6_0 = shuffle_block(network, weightMap, *conv5->getOutput(0), "model.6.0.", Get_channel(464), Get_channel(464), 1);
  nvinfer1::IShuffleLayer *conv6_1 = shuffle_block(network, weightMap, *conv6_0->getOutput(0), "model.6.1.", Get_channel(464), Get_channel(464), 1);
  nvinfer1::IShuffleLayer *conv6_2 = shuffle_block(network, weightMap, *conv6_1->getOutput(0), "model.6.2.", Get_channel(464), Get_channel(464), 1);

  // head
  // Resize scales per dimension: keep the first (channel) dimension, double the two spatial ones.
  float scale[] = {1.0, 2.0, 2.0};
  nvinfer1::IElementWiseLayer *conv7 = convBlock(network, weightMap, *conv6_2->getOutput(0), Get_channel(128), 1, 1, 1, "model.7");
  nvinfer1::IResizeLayer *upsample8 = network->addResize(*conv7->getOutput(0));
  assert(upsample8);  // validate before the first dereference
  upsample8->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
  upsample8->setScales(scale, 3);
  nvinfer1::ITensor *inputTensors9[] = {upsample8->getOutput(0), conv4_6->getOutput(0)}; // channels = 128 + 232 = 360
  nvinfer1::IConcatenationLayer *cat9 = network->addConcatenation(inputTensors9, 2);
  // std::cout << "The c3 's n is " << get_depth(3, 1) << std::endl;
  nvinfer1::IElementWiseLayer *conv10 = C3(network, weightMap, *cat9->getOutput(0), 360, Get_channel(128), get_depth(1, 1), false, 1, 0.5, "model.10");

  nvinfer1::IElementWiseLayer *conv11 = convBlock(network, weightMap, *conv10->getOutput(0), Get_channel(64), 1, 1, 1, "model.11");
  nvinfer1::IResizeLayer *upsample12 = network->addResize(*conv11->getOutput(0));
  assert(upsample12);  // validate before the first dereference
  upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
  upsample12->setScales(scale, 3);
  nvinfer1::ITensor *inputTensors13[] = {upsample12->getOutput(0), conv2_2->getOutput(0)}; // 64 + 120 = 184
  nvinfer1::IConcatenationLayer *cat13 = network->addConcatenation(inputTensors13, 2);
  nvinfer1::IElementWiseLayer *conv14 = C3(network, weightMap, *cat13->getOutput(0), 184, Get_channel(64), get_depth(1, 1), false, 1, 0.5, "model.14");

  nvinfer1::IElementWiseLayer *conv15 = convBlock(network, weightMap, *conv14->getOutput(0), Get_channel(64), 3, 2, 1, "model.15");
  nvinfer1::ITensor *inputTensors16[] = {conv15->getOutput(0), conv11->getOutput(0)}; // 64 + 64 = 128
  nvinfer1::IConcatenationLayer *cat16 = network->addConcatenation(inputTensors16, 2);
  nvinfer1::IElementWiseLayer *conv17 = C3(network, weightMap, *cat16->getOutput(0), 128, Get_channel(128), get_depth(1, 1), false, 1, 0.5, "model.17");

  nvinfer1::IElementWiseLayer *conv18 = convBlock(network, weightMap, *conv17->getOutput(0), Get_channel(128), 3, 2, 1, "model.18");
  nvinfer1::ITensor *inputTensors19[] = {conv18->getOutput(0), conv7->getOutput(0)}; // 128 + 128 = 256
  nvinfer1::IConcatenationLayer *cat19 = network->addConcatenation(inputTensors19, 2);
  nvinfer1::IElementWiseLayer *conv20 = C3(network, weightMap, *cat19->getOutput(0), 256, Get_channel(256), get_depth(1, 1), false, 1, 0.5, "model.20");

  // detect: one 1x1 convolution per scale, each emitting 3 * (CLASS_NUM + 5) channels
  nvinfer1::IConvolutionLayer *det0 = network->addConvolutionNd(*conv14->getOutput(0), 3 * (Yolo::CLASS_NUM + 5),
     nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.0.weight"], weightMap["model.21.m.0.bias"]);

  nvinfer1::IConvolutionLayer *det1 = network->addConvolutionNd(*conv17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5),
    nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.1.weight"], weightMap["model.21.m.1.bias"]);

  nvinfer1::IConvolutionLayer *det2 = network->addConvolutionNd(*conv20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5),
      nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.2.weight"], weightMap["model.21.m.2.bias"]);

  auto yolo = addYoLoLayer(network, weightMap, "model.21", std::vector<nvinfer1::IConvolutionLayer*>{det0, det1, det2});
  yolo->getOutput(0)->setName(Yolo::OUTPUT_BLOB_NAME);
  network->markOutput(*yolo->getOutput(0));

    // Engine config
  builder->setMaxBatchSize(maxBatchSize);
  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
  config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
  assert(builder->platformHasFastInt8());
  config->setFlag(BuilderFlag::kINT8);
  std::string data_path = "tensorrtx-int8calib-data/coco_calib/";
  //Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
  // NOTE(review): the calibrator is never deleted in this function - confirm whether that is intentional.
  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, Yolo::INPUT_W, Yolo::INPUT_H, data_path.c_str(), "int8calib.table", Yolo::INPUT_BLOB_NAME);
  config->setInt8Calibrator(calibrator);
#endif

  std::cout << "Building engine, please wait for a while..." << std::endl;
  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  // Only claim success when an engine was actually produced.
  if (engine != nullptr) {
    std::cout << "Build engine successfully!" << std::endl;
  } else {
    std::cerr << "Build engine failed!" << std::endl;
  }

  // Don't need the network any more
  network->destroy();

  // Release host memory
  for (auto& mem : weightMap) {
    free((void*)(mem.second.values));
  }

  return engine;
}


// Builds the engine for the requested model variant and writes the serialized
// plan to `engine_name`.
//
// Parameters:
//   max_batchsize - maximum batch size forwarded to the build_det_v5_lite_* builder.
//   wts_name      - path of the weights file consumed by the builder.
//   engine_name   - path of the plan file to create.
//   used_model    - "g", "s" or "c" select the matching builder; any other
//                   value falls back to the "e" builder.
//
// Failures (engine could not be built, plan file could not be opened or
// written) are reported on stderr; all TensorRT objects created here are
// destroyed before returning in every case.
void serialize_engine(unsigned int max_batchsize, std::string& wts_name, std::string& engine_name, std::string & used_model){

  IBuilder* builder = createInferBuilder(gLogger);
  assert(builder != nullptr);
  IBuilderConfig* config = builder->createBuilderConfig();
  assert(config != nullptr);

  ICudaEngine *engine = nullptr;
  if(used_model == "g"){
    engine = build_det_v5_lite_g(max_batchsize, builder, config, nvinfer1::DataType::kFLOAT, wts_name);
  }else if(used_model == "s"){
    engine = build_det_v5_lite_s(max_batchsize, builder, config, nvinfer1::DataType::kFLOAT, wts_name);
  }else if(used_model == "c"){
    engine = build_det_v5_lite_c(max_batchsize, builder, config, nvinfer1::DataType::kFLOAT, wts_name);
  }
  else{
    engine = build_det_v5_lite_e(max_batchsize, builder, config, nvinfer1::DataType::kFLOAT, wts_name);
  }

  // A failed build yields nullptr; bail out instead of dereferencing it.
  if (engine == nullptr) {
    std::cerr << "Failed to build engine, nothing was written to " << engine_name << std::endl;
    config->destroy();
    builder->destroy();
    return;
  }

  // Serialize the engine
  IHostMemory* serialized_engine = engine->serialize();
  assert(serialized_engine != nullptr);

  // Save engine to file
  std::ofstream p(engine_name, std::ios::binary);
  if (!p) {
    std::cerr << "Could not open plan output file" << std::endl;
  } else {
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    if (!p) {
      std::cerr << "Could not write plan output file" << std::endl;
    }
  }

  // Close everything down
  engine->destroy();
  config->destroy();
  serialized_engine->destroy();
  builder->destroy();
}


// Runs one synchronous inference pass on `stream`.
//
// Copies `batchSize` input images (3 * INPUT_H * INPUT_W floats each) from
// `input` to buffers[0], enqueues the execution context, copies
// batchSize * OUTPUT_SIZE floats from buffers[1] into `output`, and blocks
// until the stream has drained. buffers[0] / buffers[1] must be the device
// input / output bindings.
//
// A failed enqueue is reported on stderr; in that case the contents of
// `output` must not be trusted.
void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * Yolo::INPUT_H * Yolo::INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed, inference output is not valid" << std::endl;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // Asynchronous failures from the copies/kernels above surface here, so check the result.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Parses the command line into the output parameters.
//
// Two invocations are accepted:
//   <prog> -s <wts> <engine> <model>   -> fills wts_name, engine_name, used_model
//   <prog> -d <engine> <image dir>     -> fills engine_name, img_dir
//
// Returns true when one of the two forms matched; on any other argument
// count or flag it returns false and leaves every output untouched.
bool parse_args(int argc, char **argv, std::string & wts_name, std::string & engine_name,
                                   std::string & used_model, std::string & img_dir){
  // argv[1] is only inspected once the argument count is in the accepted range.
  if (argc < 4 || argc > 6) {
    return false;
  }

  const std::string mode(argv[1]);

  if (mode == "-s" && argc == 5) {
    wts_name = argv[2];
    engine_name = argv[3];
    used_model = argv[4];
    return true;
  }

  if (mode == "-d" && argc == 4) {
    engine_name = std::string(argv[2]);
    img_dir = std::string(argv[3]);
    return true;
  }

  return false;
}

// Program entry point.
//
//   ./v5lite -s <wts> <engine> <model>  builds an engine from the weights and
//                                        writes the serialized plan, then exits.
//   ./v5lite -d <engine> <image dir>    loads the plan, runs detection on every
//                                        file in the directory in batches of at
//                                        most Yolo::BATCH_SIZE, and writes each
//                                        annotated image to the current working
//                                        directory under its original file name.
//
// Returns 0 on success and -1 on bad arguments or when the engine file / image
// directory cannot be read.
int main(int argc, char** argv) {
    cudaSetDevice(Yolo::DEVICE);

    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir, used_model;


    if(!parse_args(argc, argv, wts_name, engine_name, used_model, img_dir)){
      std::cerr << "arguments not right!" << std::endl;
      std::cerr << "./v5lite -s [.wts] [.engine] [s/e/g/c] // serialize modeo to the plan" << std::endl;
      std::cerr << "./v5lite -d [.engine] ../images  // deserialize plan file and run inference" << std::endl;
      return -1;
    }

    // A weights path is only set in "-s" mode.
    if (!wts_name.empty()) {
        serialize_engine(Yolo::BATCH_SIZE,  wts_name, engine_name, used_model);
        return 0;
    }

    // deserialize the .engine and run inference
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        return -1;
    }
    char *trtModelStream = nullptr;
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        delete[] trtModelStream;  // do not leak the plan buffer on the error path
        return -1;
    }

    // prepare input data ---------------------------
    // static: these buffers are too large to place on the stack.
    static float data[Yolo::BATCH_SIZE * 3 * Yolo::INPUT_H * Yolo::INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;
    static float prob[Yolo::BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    assert(engine->getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(Yolo::INPUT_BLOB_NAME);
    const int outputIndex = engine->getBindingIndex(Yolo::OUTPUT_BLOB_NAME);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], Yolo::BATCH_SIZE * 3 * Yolo::INPUT_H * Yolo::INPUT_W * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], Yolo::BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    int fcount = 0;
    for (int f = 0; f < (int)file_names.size(); f++) {
        fcount++;
        // Keep collecting until the batch is full or the last file has been reached.
        if (fcount < Yolo::BATCH_SIZE && f + 1 != (int)file_names.size()) continue;
        for (int b = 0; b < fcount; b++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
            // Unreadable file: its slot in `data` keeps whatever the previous batch left there.
            if (img.empty()) continue;
            cv::Mat pr_img = preprocess_img(img, Yolo::INPUT_W, Yolo::INPUT_H); // letterbox BGR to RGB
            // Repack interleaved 8-bit BGR pixels into planar R, G, B floats scaled to [0, 1].
            int i = 0;
            for (int row = 0; row < Yolo::INPUT_H; ++row) {
                uchar* uc_pixel = pr_img.data + row * pr_img.step;
                for (int col = 0; col < Yolo::INPUT_W; ++col) {
                    data[b * 3 * Yolo::INPUT_H * Yolo::INPUT_W + i] = (float)uc_pixel[2] / 255.0;
                    data[b * 3 * Yolo::INPUT_H * Yolo::INPUT_W + i + Yolo::INPUT_H * Yolo::INPUT_W] = (float)uc_pixel[1] / 255.0;
                    data[b * 3 * Yolo::INPUT_H * Yolo::INPUT_W + i + 2 * Yolo::INPUT_H * Yolo::INPUT_W] = (float)uc_pixel[0] / 255.0;
                    uc_pixel += 3;
                    ++i;
                }
            }
        }

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, stream, buffers, data, prob, Yolo::BATCH_SIZE);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            nms(res, &prob[b * OUTPUT_SIZE], Yolo::CONF_THRESH, Yolo::NMS_THRESH);
        }
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            //std::cout << res.size() << std::endl;
            cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
            // Skip files that could not be decoded: the detections in this slot come from
            // stale input and there is no image to draw on or write out.
            if (img.empty()) continue;
            for (size_t j = 0; j < res.size(); j++) {
                cv::Rect r = get_rect(img, res[j].bbox);
                cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
                cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
            }
            cv::imwrite(file_names[f - fcount + 1 + b], img);
        }
        fcount = 0;
    }

    // Release stream and buffers
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    // std::cout << "\nOutput:\n\n";
    // for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    // {
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    // }
    // std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov5-lite/yololayer.cu
================================================
#include <assert.h>
#include <string.h>
#include <iostream>
#include <vector>
#include "yololayer.h"
#include "cuda_utils.h"

// Minimal helpers for packing/unpacking plain values into a raw byte buffer.
// Both helpers advance the cursor past the value they handled, so consecutive
// calls lay fields out back to back with no padding.
namespace Tn
{
    // Copies the bytes of `val` to `buffer` and advances `buffer` by sizeof(T).
    // memcpy is used instead of a typed store so the destination does not have
    // to be aligned for T.
    template<typename T>
    void write(char*& buffer, const T& val)
    {
        memcpy(buffer, &val, sizeof(T));
        buffer += sizeof(T);
    }

    // Copies sizeof(T) bytes from `buffer` into `val` and advances `buffer`.
    // memcpy is used instead of a typed load so the source does not have to be
    // aligned for T.
    template<typename T>
    void read(const char*& buffer, T& val)
    {
        memcpy(&val, buffer, sizeof(T));
        buffer += sizeof(T);
    }
}

using namespace Yolo;

namespace nvinfer1
{
    // Builds a plugin from explicit parameters and uploads every kernel's
    // anchor table to the GPU.
    //   classCount          - number of object classes.
    //   netWidth, netHeight - network input resolution.
    //   maxOut              - maximum number of detections kept per image.
    //   vYoloKernel         - one entry per detection scale.
    YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel)
    {
        mClassCount = classCount;
        mYoloV5NetWidth = netWidth;
        mYoloV5NetHeight = netHeight;
        mMaxOutObject = maxOut;
        mYoloKernel = vYoloKernel;
        mKernelCount = vYoloKernel.size();

        // mAnchor is a host-side table with one device pointer per kernel.
        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));

        // Each anchor table holds CHECK_COUNT (w, h) pairs.
        const size_t anchorBytes = sizeof(float)* CHECK_COUNT * 2;
        for (int k = 0; k < mKernelCount; k++)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[k], anchorBytes));
            CUDA_CHECK(cudaMemcpy(mAnchor[k], mYoloKernel[k].anchors, anchorBytes, cudaMemcpyHostToDevice));
        }
    }
    // Releases the per-kernel device anchor buffers, then the host table that
    // stores their pointers.
    YoloLayerPlugin::~YoloLayerPlugin()
    {
        for (int k = 0; k < mKernelCount; ++k)
        {
            void* deviceAnchors = mAnchor[k];
            CUDA_CHECK(cudaFree(deviceAnchors));
        }
        CUDA_CHECK(cudaFreeHost(mAnchor));
    }

    // Re-creates the plugin at runtime from the byte stream produced by
    // serialize(). Fields are read in the same order serialize() wrote them;
    // the anchor tables are then uploaded to the GPU again.
    //   data   - start of the serialized plugin state.
    //   length - size of that state in bytes; asserted to match what was consumed.
    YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
    {
        const char* const begin = reinterpret_cast<const char *>(data);
        const char* cursor = begin;

        Tn::read(cursor, mClassCount);
        Tn::read(cursor, mThreadCount);
        Tn::read(cursor, mKernelCount);
        Tn::read(cursor, mYoloV5NetWidth);
        Tn::read(cursor, mYoloV5NetHeight);
        Tn::read(cursor, mMaxOutObject);

        // The YoloKernel array follows the scalar fields as one raw block.
        mYoloKernel.resize(mKernelCount);
        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(mYoloKernel.data(), cursor, kernelBytes);
        cursor += kernelBytes;

        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        const size_t anchorBytes = sizeof(float)* CHECK_COUNT * 2;
        for (int k = 0; k < mKernelCount; k++)
        {
            CUDA_CHECK(cudaMalloc(&mAnchor[k], anchorBytes));
            CUDA_CHECK(cudaMemcpy(mAnchor[k], mYoloKernel[k].anchors, anchorBytes, cudaMemcpyHostToDevice));
        }

        assert(cursor == begin + length);
    }

    // Writes the plugin state into `buffer`: six scalar fields followed by the
    // raw YoloKernel array. The number of bytes written is asserted to equal
    // getSerializationSize().
    void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT
    {
        char* const begin = static_cast<char*>(buffer);
        char* cursor = begin;

        Tn::write(cursor, mClassCount);
        Tn::write(cursor, mThreadCount);
        Tn::write(cursor, mKernelCount);
        Tn::write(cursor, mYoloV5NetWidth);
        Tn::write(cursor, mYoloV5NetHeight);
        Tn::write(cursor, mMaxOutObject);

        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(cursor, mYoloKernel.data(), kernelBytes);
        cursor += kernelBytes;

        assert(cursor == begin + getSerializationSize());
    }

    // Number of bytes serialize() writes: the six scalar fields plus one
    // YoloKernel per entry of mYoloKernel.
    size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT
    {
        const size_t scalarBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount)
                                 + sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight) + sizeof(mMaxOutObject);
        const size_t kernelBytes = sizeof(Yolo::YoloKernel) * mYoloKernel.size();
        return scalarBytes + kernelBytes;
    }

    // Nothing to set up here: the anchor buffers are allocated in the
    // constructors. Always returns 0.
    int YoloLayerPlugin::initialize() TRT_NOEXCEPT
    {
        return 0;
    }

    // Reports the shape of the single output: one leading float used as the
    // detection counter followed by room for mMaxOutObject Detection records,
    // expressed as a (N, 1, 1) tensor of floats. The arguments are not used.
    Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT
    {
        // Size of all Detection records, measured in floats.
        const int detectionFloats = mMaxOutObject * sizeof(Detection) / sizeof(float);
        const int counterFloats = 1;

        return Dims3(detectionFloats + counterFloats, 1, 1);
    }

    // Sets the plugin namespace.
    // NOTE(review): the argument is assigned straight to mPluginNamespace; it
    // looks like only the pointer is kept (getPluginNamespace() returns the
    // member as const char*), so the string would have to outlive the plugin -
    // confirm against the member's declaration in yololayer.h.
    void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT
    {
        mPluginNamespace = pluginNamespace;
    }

    // Returns the namespace previously stored by setPluginNamespace().
    const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT
    {
        return mPluginNamespace;
    }

    // Return the DataType of the plugin output at the requested index.
    // The output is always kFLOAT, whatever the index or the input types.
    DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT
    {
        return DataType::kFLOAT;
    }

    // Return true if output tensor is broadcast across a batch.
    // Always false here; the arguments are ignored.
    bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT
    {
        return false;
    }

    // Return true if plugin can use input that is broadcast across batch without replication.
    // Always false here; the argument is ignored.
    bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT
    {
        return false;
    }

    // Intentionally empty: the tensor descriptions passed in are not used, all
    // configuration comes from the constructor arguments.
    void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT
    {
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    // Intentionally empty: none of the offered context resources are stored or used.
    void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT
    {
    }

    // Detach the plugin object from its execution context.
    // Intentionally empty: attachToContext() keeps nothing, so there is nothing to release.
    void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

    // Plugin type identifier; the same string is returned by
    // YoloPluginCreator::getPluginName().
    const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT
    {
        return "YoloLayer_TRT";
    }

    // Plugin version string; the same value is returned by
    // YoloPluginCreator::getPluginVersion().
    const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT
    {
        return "1";
    }

    // Deletes this plugin object (running the destructor, which frees the
    // anchor buffers). The object must not be used after this call.
    void YoloLayerPlugin::destroy() TRT_NOEXCEPT
    {
        delete this;
    }

    // Creates an independent copy of this plugin: a new object is constructed
    // from the same class count, network size, detection limit and kernel list
    // (so it gets its own anchor buffers), and the namespace is carried over.
    IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT
    {
        auto* copy = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, mYoloKernel);
        copy->setPluginNamespace(mPluginNamespace);
        return copy;
    }

    // Logistic sigmoid: 1 / (1 + e^-x).
    __device__ float Logist(float data)
    {
        return 1.0f / (1.0f + expf(-data));
    }

    // Decodes one detection scale. Each thread handles one grid cell of one
    // image in the batch and examines that cell's CHECK_COUNT anchors.
    //
    //   input        - raw head output; for a given image, anchor k and
    //                  attribute a, the value for a cell sits at
    //                  (k * (5 + classes) + a) * (yoloWidth * yoloHeight) + cell.
    //   output       - per image: one float detection counter followed by
    //                  Detection records; images are `outputElem` floats apart.
    //   noElements   - total number of (image, cell) pairs, i.e. threads with work.
    //   netwidth/netheight   - network input size used to scale box centres.
    //   maxoutobject - maximum number of Detection records stored per image.
    //   yoloWidth/yoloHeight - grid size of this scale.
    //   anchors      - CHECK_COUNT (w, h) anchor pairs for this scale.
    //   classes      - number of class scores per anchor.
    __global__ void CalDetection(const float *input, float *output, int noElements,
        const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes, int outputElem)
    {
        const int tid = threadIdx.x + blockDim.x * blockIdx.x;
        if (tid >= noElements) return;

        // Split the flat thread index into (image in batch, cell in grid).
        const int gridCells = yoloWidth * yoloHeight;
        const int batchIdx = tid / gridCells;
        const int cell = tid - gridCells * batchIdx;
        const int attrCount = 5 + classes;
        const float* batchInput = input + batchIdx * (attrCount * gridCells * CHECK_COUNT);

        const int row = cell / yoloWidth;
        const int col = cell % yoloWidth;

        for (int k = 0; k < CHECK_COUNT; ++k) {
            // First attribute of anchor k for this cell; attribute a is gridCells * a further on.
            const float* anchorInput = batchInput + cell + k * attrCount * gridCells;

            const float box_prob = Logist(anchorInput[4 * gridCells]);
            if (box_prob < IGNORE_THRESH) continue;

            // Pick the highest-scoring class.
            int bestClass = 0;
            float bestClassProb = 0.0;
            for (int c = 0; c < classes; ++c) {
                const float p = Logist(anchorInput[(5 + c) * gridCells]);
                if (p > bestClassProb) {
                    bestClassProb = p;
                    bestClass = c;
                }
            }

            // Reserve a slot by bumping the image's counter; the counter keeps
            // growing past maxoutobject, but no record is written beyond the limit.
            float *res_count = output + batchIdx * outputElem;
            const int slot = (int)atomicAdd(res_count, 1);
            if (slot >= maxoutobject) return;
            Detection *det = (Detection*)((char*)res_count + sizeof(float) + slot * sizeof(Detection));

            //Location
            // pytorch:
            //  y = x[i].sigmoid()
            //  y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
            //  y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            det->bbox[0] = (col - 0.5f + 2.0f * Logist(anchorInput[0 * gridCells])) * netwidth / yoloWidth;
            det->bbox[1] = (row - 0.5f + 2.0f * Logist(anchorInput[1 * gridCells])) * netheight / yoloHeight;

            // Width/height: (2 * sigmoid(t))^2 * anchor
            // v5: https://github.com/ultralytics/yolov5/issues/471
            const float wFactor = 2.0f * Logist(anchorInput[2 * gridCells]);
            det->bbox[2] = wFactor * wFactor * anchors[2 * k];
            const float hFactor = 2.0f * Logist(anchorInput[3 * gridCells]);
            det->bbox[3] = hFactor * hFactor * anchors[2 * k + 1];

            det->conf = box_prob * bestClassProb;
            det->class_id = bestClass;
        }
    }

    void YoloLayerPlugin::forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize)
    {
        int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
        for (int idx = 0; idx < batchSize; ++idx) {
            CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
        }
        int numElem = 0;
        for (unsigned int i = 0; i < mYoloKernel.size(); ++i) {
            const auto& yolo = mYoloKernel[i];
            numElem = yolo.width * yolo.height * batchSize;
            if (numElem < mThreadCount) mThreadCount = numElem;

            //printf("Net: %d  %d \n", mYoloV5NetWidth, mYoloV5NetHeight);
            CalDetection << < (numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> >
                (inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, yolo.width, yolo.height, (float*)mAnchor[i], mClassCount, outputElem);
        }
    }


    // Plugin execution entry point: treats the inputs and the first output as
    // float buffers and hands them to forwardGpu(). The workspace is unused.
    // Always returns 0.
    int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT
    {
        const float* const* floatInputs = (const float* const*)inputs;
        float* detections = (float*)outputs[0];
        forwardGpu(floatInputs, detections, stream, batchSize);
        return 0;
    }

    // Definitions of the creator's static members: the field collection handed
    // out by getFieldNames() and the attribute list that backs it.
    PluginFieldCollection YoloPluginCreator::mFC{};
    std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

    // Resets the shared attribute list and points the static field collection
    // at it, so getFieldNames() reports an empty collection.
    YoloPluginCreator::YoloPluginCreator()
    {
        mPluginAttributes.clear();

        mFC.fields = mPluginAttributes.data();
        mFC.nbFields = mPluginAttributes.size();
    }

    // Name of the plugin this creator produces; matches
    // YoloLayerPlugin::getPluginType().
    const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT
    {
        return "YoloLayer_TRT";
    }

    // Version of the plugin this creator produces; matches
    // YoloLayerPlugin::getPluginVersion().
    const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT
    {
        return "1";
    }

    // Returns the static field collection; the constructor leaves it with no
    // fields.
    const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT
    {
        return &mFC;
    }

    // Creates a YoloLayerPlugin from a field collection.
    //
    // `fc` must hold exactly two fields, in this order:
    //   "netinfo" - int array read as {class count, input width, input height,
    //               max output object count};
    //   "kernels" - array of Yolo::YoloKernel; the field's `length` is used as
    //               the number of kernels.
    // `name` is not used. The returned plugin carries this creator's namespace.
    IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT
    {
        assert(fc != nullptr);
        assert(fc->nbFields == 2);
        assert(strcmp(fc->fields[0].name, "netinfo") == 0);
        assert(strcmp(fc->fields[1].name, "kernels") == 0);

        // The field data is only read, so keep it const.
        const int* p_netinfo = static_cast<const int*>(fc->fields[0].data);
        int class_count = p_netinfo[0];
        int input_w = p_netinfo[1];
        int input_h = p_netinfo[2];
        int max_output_object_count = p_netinfo[3];

        std::vector<Yolo::YoloKernel> kernels(fc->fields[1].length);
        // Indexing element 0 of an empty vector is undefined; copy only when there is data.
        if (!kernels.empty()) {
            memcpy(kernels.data(), fc->fields[1].data, kernels.size() * sizeof(Yolo::YoloKernel));
        }

        YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, kernels);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    // Rebuilds a plugin from the bytes written by YoloLayerPlugin::serialize()
    // and tags it with this creator's namespace. `name` is not used.
    IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT
    {
        // This object will be deleted when the network is destroyed, which will
        // call YoloLayerPlugin::destroy()
        auto* plugin = new YoloLayerPlugin(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }
}


================================================
FILE: yolov5-lite/yolov5-lite-trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

# Detection thresholds used by this script.
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4

# Class names; the position in the list is the class id.
# categories = ['faster']
categories = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
    "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush",
]
    

def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into consecutive batches.
    param:
        batch_size: int, number of paths per batch (the last batch may be shorter)
        img_dir:    str, root directory to walk
    return:
        list of lists of file paths, in the order os.walk yields them
    """
    all_paths = (
        os.path.join(root, name)
        for root, _dirs, files in os.walk(img_dir)
        for name in files
    )

    batches = []
    pending = []
    for path in all_paths:
        # Flush the current batch before adding the path that would overflow it.
        if len(pending) == batch_size:
            batches.append(pending)
            pending = []
        pending.append(path)

    if pending:
        batches.append(pending)
    return batches

def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag
                 above its top-left corner, onto img in place.
                 This function comes from the YoLov5 project.
    param:
        x:      a box like [x1,y1,x2,y2]
        img:    an opencv image object
        color:  color to draw the rectangle, such as (0,255,0);
                a random color is picked when omitted
        label:  str, text for the tag; no tag is drawn when omitted or empty
        line_thickness: int; derived from the image size when omitted
    return:
        no return

    """
    # Line/font thickness: explicit value, else scaled from the image dimensions.
    if line_thickness:
        thickness = line_thickness
    else:
        thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1

    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The tag background spans the text width and sits just above the box corner.
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov5TRT(object):
    """
    description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and allocate
                     one host/device buffer pair per engine binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('bingding:', binding, shape)
                size = trt.volume(shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except Exception:
            # Do not leave the freshly created context on the stack when setup fails.
            self.ctx.pop()
            raise

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Run one batch through the engine and draw the detections
                     on the original images.
        param:
            raw_image_generator: iterable of BGR images, at most batch_size of them
        return:
            batch_image_raw: list of the original images with boxes drawn on them
            use_time:        seconds spent on host->device copy, inference and device->host copy
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-initialised so that the unused slots of a partial batch hold defined data.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it,
            # even when preprocessing or inference raised.
            self.ctx.pop()
        # The first output binding holds the detections of every image of the batch
        output = host_outputs[0]
        # Do postprocess, only for the images that were actually supplied
        # (the last batch of a folder may be smaller than batch_size).
        per_image_outputs = self._slice_outputs(output, len(batch_image_raw))
        for i, image_output in enumerate(per_image_outputs):
            result_boxes, result_scores, result_classid = self.post_process(
                image_output, batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                # print("class:", categories[int(result_classid[j])])
                # print("probability:", result_scores[j])
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def _slice_outputs(self, output, num_images):
        """
        description: Split the flat output buffer into one slice per image.
                     The buffer holds batch_size equally sized chunks, so the chunk
                     length is derived from the buffer instead of being hard-coded.
        param:
            output:     flat numpy array holding the output of the whole batch
            num_images: int, number of images that were really fed to the engine
        return:
            a list with num_images slices of output
        """
        per_image = len(output) // self.batch_size
        return [output[i * per_image: (i + 1) * per_image] for i in range(num_images)]

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: the original image in BGR channel order (H, W, C)
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right,
                        removing the letterbox padding and scaling back to the original image size
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # The image was padded at top/bottom: shift y, then undo the width ratio.
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # The image was padded at left/right: shift x, then undo the height ratio.
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, 6))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        print("The lengh of result_boxes is ", len(result_boxes))
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon avoids a division by zero
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (center_x, center_y, w, h, conf, cls_id) in network input coordinates
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score >= conf_thres
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            # (this always includes boxes[0] itself, so the loop makes progress)
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes

class inferThread(threading.Thread):
    """
    description: Thread that runs one batch of image files through the wrapper
                 and writes the annotated images into the 'output' directory.
    """

    def __init__(self, yolov5_wrapper, image_path_batch):
        super(inferThread, self).__init__()
        self.yolov5_wrapper = yolov5_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov5_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image(self.image_path_batch))
        for idx, img_path in enumerate(self.image_path_batch):
            # Results keep the source file name, prefixed with "e_".
            save_name = os.path.join('output',  "e_" + os.path.basename(img_path))
            # Save image
            cv2.imwrite(save_name, batch_image_raw[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that pushes one batch of all-zero images through the
                 wrapper and reports how long the inference took.
    """

    def __init__(self, yolov5_wrapper):
        super(warmUpThread, self).__init__()
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        wrapper = self.yolov5_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image_zeros())
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Usage: python <script> [engine_file] [plugin_library]
    # Both arguments are optional and fall back to the paths below.
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "build/v5lite-g-int8.engine"
    PLUGIN_LIBRARY = sys.argv[2] if len(sys.argv) > 2 else "build/libmyplugins.so"

    # Load the custom plugin library before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # Class names indexed by class id (COCO, 80 classes).
    # categories = ['faster']
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]

    # Start from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('batch size is', yolov5_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov5_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each in its own thread, executed one after another.
        for _ in range(10):
            warm_up_thread = warmUpThread(yolov5_wrapper)
            warm_up_thread.start()
            warm_up_thread.join()

        # One thread per batch of images, again executed one after another.
        for batch in image_path_batches:
            infer_thread = inferThread(yolov5_wrapper, batch)
            infer_thread.start()
            infer_thread.join()
    finally:
        # destroy the instance
        yolov5_wrapper.destroy()


================================================
FILE: yolov7/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.10)

project(yolov7)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

# nvcc location, adapt it if CUDA is installed elsewhere
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message("embed_platform on")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
  message("embed_platform off")
  # cuda
  include_directories(/usr/local/cuda/include)
  link_directories(/usr/local/cuda/lib64)

  # tensorrt
  include_directories(/home/nvidia/TensorRT-8.2.5.1/include)
  link_directories(/home/nvidia/TensorRT-8.2.5.1/lib)
endif()

# The yolo layer plugin is built as a separate shared library (libmyplugins.so)
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# OpenCV is linked into the executable below, so fail at configure time
# with a clear message when it cannot be found.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
add_executable(yolov7 main.cpp ${SRCS})

target_link_libraries(yolov7 nvinfer)
target_link_libraries(yolov7 cudart)
target_link_libraries(yolov7 myplugins)
target_link_libraries(yolov7 ${OpenCV_LIBS})


================================================
FILE: yolov7/README.md
================================================
# YOLOv7

The Pytorch implementation is [WongKinYiu/yolov7](https://github.com/WongKinYiu/yolov7).

The tensorrt code is derived from [QIANXUNZDL123/tensorrtx-yolov7](https://github.com/QIANXUNZDL123/tensorrtx-yolov7)

## Contributors

<a href="https://github.com/QIANXUNZDL123"><img src="https://avatars.githubusercontent.com/u/46549527?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/lindsayshuo"><img src="https://avatars.githubusercontent.com/u/45239466?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/wang-xinyu"><img src="https://avatars.githubusercontent.com/u/15235574?s=48&v=4" width="40px;" alt=""/></a> 
<a href="https://github.com/AMIYAMAITY"><img src="https://avatars.githubusercontent.com/u/25117739?s=48&v=4" width="40px;" alt=""/></a> 

## Requirements

- TensorRT 8.0+
- OpenCV 3.4.0+

## Different versions of yolov7

Currently, we support yolov7 v0.1

- For yolov7 v0.1, download .pt from [yolov7 release v0.1](https://github.com/WongKinYiu/yolov7/releases/tag/v0.1), then follow how-to-run in current page.

## Config

- Choose the model tiny/v7/x/d6/w6/e6/e6e from command line arguments.
- Check more configs in [include/config.h](./include/config.h)

## How to Run, yolov7-tiny as example

1. generate .wts from pytorch with .pt, or download .wts from model zoo

```
// download https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-tiny.pt
cp {tensorrtx}/yolov7/gen_wts.py {WongKinYiu}/yolov7
cd {WongKinYiu}/yolov7
python gen_wts.py -w yolov7-tiny.pt -o yolov7.wts
// a file 'yolov7.wts' will be generated.
```

2. build tensorrtx/yolov7 and run

```
cd {tensorrtx}/yolov7/
// update kNumClass in config.h if your model is trained on custom dataset
mkdir build
cd build
cp {WongKinYiu}/yolov7/yolov7.wts {tensorrtx}/yolov7/build
cmake ..
make
sudo ./yolov7 -s [.wts] [.engine] [t/v7/x/w6/e6/d6/e6e]  // serialize model to plan file
sudo ./yolov7 -d [.engine] [image folder]  // deserialize and run inference, the images in [image folder] will be processed.
// For example yolov7
sudo ./yolov7 -s yolov7.wts yolov7.engine v7
sudo ./yolov7 -d yolov7.engine ../images
```

3. Check the generated images, e.g. _zidane.jpg and _bus.jpg.

4. optional, load and run the tensorrt model in python

```
// install python-tensorrt, pycuda, etc.
// ensure the yolov7.engine and libmyplugins.so have been built
python yolov7_trt.py
```

## INT8 Quantization

1. Prepare calibration images: you can randomly select about 1000 images from your training set. For coco, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh

2. unzip it in yolov7/build

3. set the macro `USE_INT8` in config.h and make

4. serialize the model and test

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg" height="360px;">
</p>

## More Information

See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: yolov7/gen_wts.py
================================================
import sys  # noqa: F401
import argparse
import os
import struct
import torch
from utils.torch_utils import select_device


def parse_args():
    """
    Parse the command line and resolve the input/output paths.

    Returns a tuple (weights_path, output_path). When no output is given the
    .wts file is placed next to the weights file; when the output is an
    existing directory the .wts file is placed inside it, named after the
    weights file. Exits with 'Invalid input file' if the weights file is missing.
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output', help='Output (.wts) file path (optional)')
    opts = cli.parse_args()

    weights_path = opts.weights
    if not os.path.isfile(weights_path):
        raise SystemExit('Invalid input file')

    output_path = opts.output
    if not output_path:
        # Same location and stem as the weights file.
        output_path = os.path.splitext(weights_path)[0] + '.wts'
    elif os.path.isdir(output_path):
        # Output is a directory: reuse the stem of the weights file name.
        stem = os.path.splitext(os.path.basename(weights_path))[0]
        output_path = os.path.join(output_path, stem + '.wts')
    return weights_path, output_path


pt_file, wts_file = parse_args()

# Initialize: the conversion runs on the CPU
device = select_device('cpu')
# Load model
# NOTE: weights_only=False makes torch.load unpickle arbitrary objects,
# so only convert checkpoints that come from a trusted source.
checkpoint = torch.load(pt_file, map_location=device, weights_only=False)  # Load FP32 weights
# Use the 'ema' entry when the checkpoint has one, otherwise the 'model' entry.
model = checkpoint['ema' if checkpoint.get('ema') else 'model'].float()

# update anchor_grid info
detect_layer = model.model[-1]  # model.model[-1] is detect layer
anchor_grid = detect_layer.anchors * detect_layer.stride[..., None, None]
# Replace the plain attribute by a registered buffer: buffers are part of the
# state dict, so the anchor grid ends up in the exported weights.
delattr(detect_layer, 'anchor_grid')
detect_layer.register_buffer("anchor_grid", anchor_grid)

model.to(device).eval()

# Text format: first line is the number of tensors, then one line per tensor:
# "<name> <count> " followed by " <big-endian float32 as hex>" for every value.
state = model.state_dict()
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(state.keys())))
    for k, v in state.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        f.write(''.join(' ' + struct.pack('>f', float(vv)).hex() for vv in vr))
        f.write('\n')


================================================
FILE: yolov7/include/block.h
================================================
#pragma once

#include "NvInfer.h"
#include <string>
#include <vector>
#include <map>

// Declarations of helpers that add layers to a TensorRT network definition.
// Only the prototypes are visible in this header; the notes below describe
// what the signatures show and hedge everything that is inferred from names.

// Returns a map from weight name to nvinfer1::Weights for the given file.
// NOTE(review): presumably reads a .wts file such as the one written by gen_wts.py - confirm in the definition.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

// Adds layers to `network` on top of `input`, using entries of `weightMap`; returns an element-wise layer.
// NOTE(review): by the name and parameters, c2/k/s/p look like output channels, kernel size,
// stride and padding of a conv + batch-norm + SiLU block, and lname the weight-name prefix - confirm.
nvinfer1::IElementWiseLayer* convBnSilu(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2, int k, int s, int p, std::string lname);

// NOTE(review): presumably the YOLOv7 "ReOrg" block, with inch the input channel count - confirm.
nvinfer1::ILayer* ReOrg(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int inch);

// NOTE(review): presumably the YOLOv7 "DownC" block; c1/c2 look like input/output channels - confirm.
nvinfer1::ILayer* DownC(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1, int c2, const std::string& lname);

// NOTE(review): presumably the YOLOv7 "SPPCSPC" block with c2 output channels - confirm.
nvinfer1::IElementWiseLayer* SPPCSPC(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2, const std::string& lname);

// NOTE(review): presumably the YOLOv7 "RepConv" block; c2/k/s look like output channels, kernel size and stride - confirm.
nvinfer1::IElementWiseLayer* RepConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2, int k, int s, const std::string& lname);

// Returns an activation layer.
// NOTE(review): by the name, a convolution block followed by LeakyReLU; outch/ksize/s/p look like
// output channels, kernel size, stride and padding - confirm.
nvinfer1::IActivationLayer* convBlockLeakRelu(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int outch, int ksize, int s, int p, std::string lname);

// Adds a plugin layer to `network`; `dets` is a list of convolution layers.
// NOTE(review): presumably `dets` are the detection-head convolutions feeding the yolo layer plugin - confirm.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights>& weightMap, std::string lname, std::vector<nvinfer1::IConvolutionLayer*> dets);


================================================
FILE: yolov7/include/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
//!  Only the declaration is visible in this header; the member notes below are
//!  inferred from names and types and should be confirmed against the implementation.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
    // NOTE(review): presumably batchsize/input_w/input_h describe the calibration input,
    // img_dir is the folder of calibration images, calib_table_name the cache file and
    // read_cache whether an existing cache may be reused - confirm in the definition.
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);

    virtual ~Int8EntropyCalibrator2();
    // Overrides of the nvinfer1::IInt8EntropyCalibrator2 interface.
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;                         // NOTE(review): presumably index of the next image to load - confirm
    std::string img_dir_;
    std::vector<std::string> img_files_;
    size_t input_count_;                  // NOTE(review): presumably element count of one input batch - confirm
    std::string calib_table_name_;
    const char* input_blob_name_;         // raw pointer: the string is not copied by this member
    bool read_cache_;
    void* device_input_;                  // NOTE(review): presumably a device buffer for the input batch - confirm
    std::vector<char> calib_cache_;
};

#endif // ENTROPY_CALIBRATOR_H


================================================
FILE: yolov7/include/config.h
================================================
#pragma once

/* --------------------------------------------------------
 * These configs are related to tensorrt model, if these are changed,
 * please re-compile and re-serialize the tensorrt model.
 * --------------------------------------------------------*/

// For INT8, you need to prepare the calibration dataset, please refer to
// https://github.com/wang-xinyu/tensorrtx/tree/master/yolov7#int8-quantization
#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32

// These are used to define input/output tensor names,
// you can set them to whatever you want.
const static char* kInputTensorName = "data";
const static char* kOutputTensorName = "prob";

// Number of classes; update it if your model is trained on a custom dataset.
const static int kNumClass = 80;
// NOTE(review): presumably the batch size the engine is built for - confirm against the builder code.
const static int kBatchSize = 1;

// Yolo's input width and height must be divisible by 32
const static int kInputH = 640;
const static int kInputW = 640;

// Maximum number of output bounding boxes from yololayer plugin.
// That is maximum number of output bounding boxes before NMS.
const static int kMaxNumOutputBbox = 1000;

// NOTE(review): presumably the number of anchors per detection scale - confirm against the yololayer plugin.
const static int kNumAnchor = 3;

// The bboxes whose confidence is lower than kIgnoreThresh will be ignored in yololayer plugin.
const static float kIgnoreThresh = 0.1f;

/* --------------------------------------------------------
 * These configs are not related to tensorrt model, if these are changed,
 * please re-compile, but no need to re-serialize the tensorrt model.
 * --------------------------------------------------------*/

// NMS overlapping thresh and final detection confidence thresh
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;

// NOTE(review): presumably the index of the CUDA device to run on - confirm at the usage site.
const static int kGpuId = 0;

// If your image size is larger than 4096 * 3112, please increase this value
const static int kMaxInputImageSize = 4096 * 3112;


================================================
FILE: yolov7/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

#include <cassert>
#include <cstdlib>
#include <iostream>

#ifndef CUDA_CHECK
// Evaluates `callstr` exactly once and, when it does not return cudaSuccess,
// prints the numeric error code, the readable error string and the call site
// to std::cerr, then terminates the program.
// assert(0) is kept for debug builds; std::abort() guarantees that a CUDA
// failure is also fatal when assertions are compiled out with NDEBUG, instead
// of being silently ignored.
// The macro expands to a plain brace block, so it can be used with or without
// a trailing semicolon, as before.
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) << ") at "\
                      << __FILE__ << ":" << __LINE__ << std::endl;\
            assert(0);\
            std::abort();\
        }\
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: yolov7/include/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes its contents to a target stream,
//!  preceded by a timestamp and a fixed prefix. Nothing is written when logging is disabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the formatted messages
    //! \param prefix    text inserted between the timestamp and every message
    //! \param shouldLog when false, putOutput() writes nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! The new buffer starts empty but takes over the target stream, the prefix and the
    //! logging flag of \p other, so that no member is left uninitialized.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer contents>" to the target stream,
    //! then clears the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so that both stay together
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            // std::localtime may return a null pointer on failure; skip the timestamp in that case
            if (tm_local != nullptr)
            {
                mOutput << "[";
                mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
//! The class only owns the buffer; it exists so that a derived class can list it as its first base and
//! thereby have mBuffer constructed before any later base that needs the buffer's address.
//!
class LogStreamConsumerBase
{
public:
    //! \param stream    Output stream the buffer forwards its contents to
    //! \param prefix    Text the buffer places in front of every emitted message
    //! \param shouldLog Whether the buffer emits anything at all
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    // Stream buffer shared with the derived class.
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Stream-style front end for logging: text inserted with operator<< is collected by the
//!  LogStreamConsumerBuffer owned by the LogStreamConsumerBase base.
//!  The base class order (LogStreamConsumerBase first, std::ostream second) is deliberate: bases are
//!  constructed in declaration order, so the buffer already exists when its address is handed to
//!  std::ostream. Swapping the bases would pass the address of an uninitialized buffer.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Messages are emitted only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // attach the already-constructed buffer to the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Builds a fresh consumer with the same severity and enabled state as \p other.
    //!  Only those two settings are taken from \p other; a new buffer is created for this object.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the already-constructed buffer to the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer emits output for the given reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

private:
    //! \brief Selects std::cout for kINFO and above (numerically), std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! \brief Maps a severity to the bracketed tag placed in front of each message.
    static std::string severityPrefix(Severity severity)
    {
        std::string tag;
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: tag = "[F] "; break;
        case Severity::kERROR: tag = "[E] "; break;
        case Severity::kWARNING: tag = "[W] "; break;
        case Severity::kINFO: tag = "[I] "; break;
        case Severity::kVERBOSE: tag = "[V] "; break;
        default: assert(0); break; // unknown severity: empty tag when assertions are disabled
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger that emits messages whose severity is at least as severe as \p severity
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! \param severity Severity of the message; compared against the reportable severity to decide on emission
    //! \param msg      Message text; a null pointer is logged as an empty message
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        // Constructing a std::string from a null pointer is undefined behavior, so map null to "".
        const char* text = (msg != nullptr) ? msg : "";
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(text) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< Set once reportTestStart() has been called for this atom
        std::string mName;    //!< Test name printed in result lines
        std::string mCmdline; //!< Command line printed after the " # " separator in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Validate the precondition before producing any output, so a double start is caught
        // before a second RUNNING line is printed.
        assert(!testAtom.mStarted);
        reportTestResult(testAtom, TestResult::kRUNNING);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed. \return EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed. \return EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived. \return EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass. \return EXIT_SUCCESS or EXIT_FAILURE accordingly
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the severity threshold currently used to filter messages
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints "&&&& <RESULT> <name> # <cmdline>" on the stream selected for kINFO severity.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief Builds a LogStreamConsumer for kVERBOSE messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Builds a LogStreamConsumer for kINFO messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Builds a LogStreamConsumer for kWARNING messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Builds a LogStreamConsumer for kERROR messages, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Builds a LogStreamConsumer for kINTERNAL_ERROR ("fatal" severity) messages, filtered by the
//!        logger's reportable severity
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: yolov7/include/macros.h
================================================
// Portability macros: symbol visibility (API) and TensorRT-version-dependent qualifiers.
#ifndef __MACROS_H
#define __MACROS_H

// Provides NV_TENSORRT_MAJOR, which is tested below.
#include "NvInfer.h"

// API marks symbols for export when API_EXPORTS is defined, and for import otherwise:
//   - MSVC: __declspec(dllexport) / __declspec(dllimport)
//   - other compilers: default visibility when exporting, nothing when importing
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// For TensorRT major version >= 8 these expand to `noexcept` and `const`; for older versions they
// expand to nothing.
// NOTE(review): presumably this tracks the qualifiers TensorRT 8 added to its virtual interfaces - confirm.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: yolov7/include/model.h
================================================
#pragma once

#include "NvInfer.h"
#include <string>

// Engine builders, one per YOLOv7 variant (e6e, d6, e6, w6, x, base, tiny).
// All share the same parameter list: maximum batch size, the TensorRT builder and builder config to use,
// the data type passed as `dt`, and the path of the weights file.
// Each returns an nvinfer1::IHostMemory*; the caller in main.cpp treats it as the serialized engine,
// writes its data()/size() to a file and then deletes it.
// NOTE(review): build_engine_yolov7_tiny takes a non-const `std::string&` named wts_name, unlike its
// siblings (`const std::string& wts_path`). The definition is not visible here, so the signature must
// stay in sync with it - confirm before harmonizing.
nvinfer1::IHostMemory* build_engine_yolov7e6e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
nvinfer1::IHostMemory* build_engine_yolov7d6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
nvinfer1::IHostMemory* build_engine_yolov7e6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
nvinfer1::IHostMemory* build_engine_yolov7w6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
nvinfer1::IHostMemory* build_engine_yolov7x(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
nvinfer1::IHostMemory* build_engine_yolov7(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
nvinfer1::IHostMemory* build_engine_yolov7_tiny(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name);


================================================
FILE: yolov7/include/postprocess.h
================================================
#pragma once

#include "types.h"
#include <opencv2/opencv.hpp>

// Returns a cv::Rect for the given image and 4-element box.
// NOTE(review): presumably maps a network-space box (center_x, center_y, w, h - see Detection::bbox)
// back to pixel coordinates of `img`; the definition is not visible here - confirm.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Fills `res` from one image's raw output buffer using the given confidence and NMS thresholds
// (nms_thresh defaults to 0.5).
// NOTE(review): assumes `output` has the layout produced by the YoloLayer plugin (a count followed by
// Detection records) - confirm against the definition.
void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5);

// Batched variant: `output` holds `batch_size` consecutive per-image buffers of `output_size` floats each;
// main.cpp passes the number of images in the batch and kOutputSize. One result vector per image is
// stored in `batch_res`.
void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);

// Draws the detections in `res_batch` onto the corresponding images of `img_batch`; main.cpp writes the
// images to disk right after this call.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);


================================================
FILE: yolov7/include/preprocess.h
================================================
#pragma once

#include <cuda_runtime.h>
#include <cstdint>
#include <opencv2/opencv.hpp>
#include <iostream>

// Sets up the preprocessing module; main.cpp calls it once with kMaxInputImageSize before the first batch.
// NOTE(review): presumably allocates staging buffers sized for the largest input image - confirm.
void cuda_preprocess_init(int max_image_size);

// Tears down whatever cuda_preprocess_init() set up; main.cpp calls it once after the last batch.
void cuda_preprocess_destroy();

// Preprocesses one image of src_width x src_height into `dst` with dimensions dst_width x dst_height,
// using the given CUDA stream.
// NOTE(review): `src` is presumably interleaved 8-bit pixel data and `dst` a device float buffer - confirm.
void cuda_preprocess(uint8_t* src, int src_width, int src_height,
                     float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);

// Batch variant: preprocesses every image of `img_batch` into `dst`. main.cpp passes the network input
// device buffer together with kInputW / kInputH.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
                           float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);


================================================
FILE: yolov7/include/types.h
================================================
#pragma once

#include "config.h"

// Description of one YOLO output scale, consumed by the YoloLayer plugin.
struct YoloKernel {
  int width;   // feature-map width of this scale
  int height;  // feature-map height of this scale
  // kNumAnchor anchors, two floats each.
  // NOTE(review): presumably stored as (w, h) pairs in input-image pixels - confirm.
  float anchors[kNumAnchor * 2];
};

// One detected box, laid out as six consecutive floats. The plugin and the host code convert between
// Detection counts and float counts with sizeof(Detection) / sizeof(float), so the struct must stay a
// plain sequence of floats.
struct alignas(float) Detection {
  //center_x center_y w h
  float bbox[4];
  float conf;  // bbox_conf * cls_conf
  float class_id;  // class index, stored as a float so the whole record is homogeneous
};


================================================
FILE: yolov7/include/utils.h
================================================
#ifndef TRTX_YOLOV7_UTILS_H_
#define TRTX_YOLOV7_UTILS_H_

#include <dirent.h>
#include <opencv2/opencv.hpp>

// Resizes `img` to fit inside an input_w x input_h canvas while keeping its aspect ratio, and centers
// the result on a canvas filled with the value 128 in all three channels.
// Returns the new input_h x input_w CV_8UC3 image; `img` itself is not modified.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols*1.0);
    const float scale_h = input_h / (img.rows*1.0);

    // Start from "fills the whole canvas" and shrink along the axis that is not the limiting one.
    int new_w = input_w;
    int new_h = input_h;
    int off_x = 0;
    int off_y = 0;
    if (scale_h > scale_w) {
        // width is the limiting dimension: pad above and below
        new_h = scale_w * img.rows;
        off_y = (input_h - new_h) / 2;
    } else {
        // height is the limiting dimension: pad left and right
        new_w = scale_h * img.cols;
        off_x = (input_w - new_w) / 2;
    }

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(off_x, off_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Appends the name of every entry in directory `p_dir_name` to `file_names`, skipping "." and "..".
// Only the bare entry names are stored, not paths prefixed with the directory.
// Returns 0 on success, or -1 when the directory cannot be opened (file_names is left untouched).
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *dir_handle = opendir(p_dir_name);
    if (dir_handle == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const std::string entry_name(entry->d_name);
        if (entry_name == "." || entry_name == "..") {
            continue;
        }
        file_names.push_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}

#endif  // TRTX_YOLOV7_UTILS_H_


================================================
FILE: yolov7/main.cpp
================================================
#include "config.h"
#include "model.h"
#include "cuda_utils.h"
#include "logging.h"
#include "utils.h"
#include "preprocess.h"
#include "postprocess.h"

#include <cassert>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

using namespace nvinfer1;

// Number of floats in one image's output buffer: kMaxNumOutputBbox Detection records flattened to floats,
// plus one leading float (the YoloLayer plugin uses the first float as the detection counter).
const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Logger handed to the TensorRT builder and runtime created in this file.
static Logger gLogger;

// Builds the engine for the requested YOLOv7 variant and writes the serialized plan to `engine_name`.
//
// maxBatchSize: maximum batch size passed to the model builder
// wts_name:     path of the weights file
// sub_type:     model variant, one of t / v7 / x / w6 / e6 / d6 / e6e
// engine_name:  path of the plan file to create
//
// Any failure (builder creation, unknown variant, engine build, file open or write) is reported on
// std::cerr and terminates the process with std::abort(). Unlike a bare assert(), this also holds in
// builds compiled with NDEBUG, where the function would otherwise go on to dereference a null pointer.
void serialize_engine(unsigned int maxBatchSize, std::string& wts_name, std::string& sub_type, std::string& engine_name) {
  // Create builder
  IBuilder* builder = createInferBuilder(gLogger);
  if (builder == nullptr) {
    std::cerr << "could not create TensorRT builder" << std::endl;
    std::abort();
  }
  IBuilderConfig* config = builder->createBuilderConfig();
  if (config == nullptr) {
    std::cerr << "could not create TensorRT builder config" << std::endl;
    std::abort();
  }

  // Create model to populate the network, then set the outputs and create an engine
  IHostMemory* serialized_engine = nullptr;
  if (sub_type == "t") {
    serialized_engine = build_engine_yolov7_tiny(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
  } else if (sub_type == "v7") {
    serialized_engine = build_engine_yolov7(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
  } else if (sub_type == "x") {
    serialized_engine = build_engine_yolov7x(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
  } else if (sub_type == "w6") {
    serialized_engine = build_engine_yolov7w6(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
  } else if (sub_type == "e6") {
    serialized_engine = build_engine_yolov7e6(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
  } else if (sub_type == "d6") {
    serialized_engine = build_engine_yolov7d6(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
  } else if (sub_type == "e6e") {
    serialized_engine = build_engine_yolov7e6e(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
  } else {
    std::cerr << "unknown model sub-type '" << sub_type << "', expected one of t/v7/x/w6/e6/d6/e6e" << std::endl;
    std::abort();
  }
  if (serialized_engine == nullptr) {
    std::cerr << "failed to build engine for sub-type '" << sub_type << "'" << std::endl;
    std::abort();
  }

  std::ofstream p(engine_name, std::ios::binary);
  if (!p) {
    std::cerr << "could not open plan output file" << std::endl;
    std::abort();
  }
  p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
  if (!p) {
    std::cerr << "could not write plan output file" << std::endl;
    std::abort();
  }

  delete config;
  delete serialized_engine;
  delete builder;
}

// Loads a serialized plan from `engine_name` and creates the runtime, engine and execution context.
//
// engine_name: path of the plan file to read
// runtime, engine, context: output parameters; on return each points to a newly created object that the
//                           caller owns (main() deletes them)
//
// Any failure (unreadable or empty file, short read, or a TensorRT call returning null) is reported on
// std::cerr and terminates the process with std::abort(), so the checks stay effective in NDEBUG builds.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
  // Prints the reason together with the file name, then stops the process.
  auto fail = [&engine_name](const char* what) {
    std::cerr << what << " (" << engine_name << ")" << std::endl;
    std::abort();
  };

  std::ifstream file(engine_name, std::ios::binary);
  if (!file.good()) {
    std::cerr << "read " << engine_name << " error!" << std::endl;
    std::abort();
  }

  // Determine the file size; tellg() yields -1 on failure, which must not be turned into a buffer size.
  file.seekg(0, file.end);
  const std::streamoff end_pos = file.tellg();
  if (end_pos <= 0) {
    fail("engine file is empty or its size could not be determined");
  }
  file.seekg(0, file.beg);

  const size_t size = static_cast<size_t>(end_pos);
  // std::vector releases the buffer automatically on every exit path.
  std::vector<char> serialized_engine(size);
  file.read(serialized_engine.data(), static_cast<std::streamsize>(size));
  if (!file) {
    fail("could not read the complete engine file");
  }
  file.close();

  *runtime = createInferRuntime(gLogger);
  if (*runtime == nullptr) {
    fail("createInferRuntime failed");
  }
  *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
  if (*engine == nullptr) {
    fail("deserializeCudaEngine failed");
  }
  *context = (*engine)->createExecutionContext();
  if (*context == nullptr) {
    fail("createExecutionContext failed");
  }
}

// Allocates the device input/output buffers and the host output buffer for one batch.
//
// engine:               engine whose bindings are sanity-checked (exactly two, input at 0 and output at 1)
// input_buffer_device:  receives a device buffer of kBatchSize * 3 * kInputH * kInputW floats
// output_buffer_device: receives a device buffer of kBatchSize * kOutputSize floats
// output_buffer_host:   receives a new[]-allocated host array of kBatchSize * kOutputSize floats
//
// The caller owns all three buffers.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host) {
  assert(engine->getNbBindings() == 2);
  // The binding indices are looked up by tensor name; both are guaranteed to be
  // smaller than the number of bindings.
  const int input_binding = engine->getBindingIndex(kInputTensorName);
  const int output_binding = engine->getBindingIndex(kOutputTensorName);
  assert(input_binding == 0);
  assert(output_binding == 1);

  // Device-side buffers
  const size_t input_bytes = kBatchSize * 3 * kInputH * kInputW * sizeof(float);
  const size_t output_bytes = kBatchSize * kOutputSize * sizeof(float);
  CUDA_CHECK(cudaMalloc((void**)input_buffer_device, input_bytes));
  CUDA_CHECK(cudaMalloc((void**)output_buffer_device, output_bytes));

  // Host-side copy target for the network output
  *output_buffer_host = new float[kBatchSize * kOutputSize];
}

// Runs one batch through the execution context and copies the result to the host.
//
// context:   execution context to enqueue on
// stream:    CUDA stream used for the inference, the copy and the final synchronization
// buffers:   binding array; buffers[1] is read back as the output
// output:    host destination for batchSize * kOutputSize floats
// batchSize: batch size passed to enqueue and used to size the copy
//
// The function returns only after the stream has been synchronized, so `output` is complete on return.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchSize) {
  const size_t output_bytes = batchSize * kOutputSize * sizeof(float);

  // Queue the inference, then the device-to-host copy, on the same stream
  context.enqueue(batchSize, buffers, stream, nullptr);
  CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], output_bytes, cudaMemcpyDeviceToHost, stream));

  // Wait for both to finish
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Parses the command line into the given output strings.
//
// Accepted forms:
//   <prog> -s <wts> <engine> <sub_type>   fills wts, engine and sub_type
//   <prog> -d <engine> <img_dir>          fills engine and img_dir
//
// Returns true when the arguments match one of the two forms exactly; otherwise returns false and
// leaves all output strings untouched.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& sub_type) {
  if (argc < 4) {
    return false;
  }

  const std::string mode(argv[1]);

  if (mode == "-s" && argc == 5) {
    wts = argv[2];
    engine = argv[3];
    sub_type = argv[4];
    return true;
  }

  if (mode == "-d" && argc == 4) {
    engine = argv[2];
    img_dir = argv[3];
    return true;
  }

  return false;
}

int main(int argc, char** argv) {
  cudaSetDevice(kGpuId);

  std::string wts_name = "";
  std::string engine_name = "";
  std::string img_dir;
  std::string sub_type = "";

  if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type)) {
    std::cerr << "Arguments not right!" << std::endl;
    std::cerr << "./yolov7 -s [.wts] [.engine] [t/v7/x/w6/e6/d6/e6e]  // serialize model to plan file" << std::endl;
    std::cerr << "./yolov7 -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
    return -1;
  }

  // Create a model using the API directly and serialize it to a file
  if (!wts_name.empty()) {
    serialize_engine(kBatchSize, wts_name, sub_type, engine_name);
    return 0;
  }

  // Deserialize the engine from file
  IRuntime* runtime = nullptr;
  ICudaEngine* engine = nullptr;
  IExecutionContext* context = nullptr;
  deserialize_engine(engine_name, &runtime, &engine, &context);
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  cuda_preprocess_init(kMaxInputImageSize);

  // Prepare cpu and gpu buffers
  float* device_buffers[2];
  float* output_buffer_host = nullptr;
  prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host);

  // Read images from directory
  std::vector<std::string> file_names;
  if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
    std::cerr << "read_files_in_dir failed." << std::endl;
    return -1;
  }

  // batch predict
  for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
    // Get a batch of images
    std::vector<cv::Mat> img_batch;
    std::vector<std::string> img_name_batch;
    for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
      cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
      img_batch.push_back(img);
      img_name_batch.push_back(file_names[j]);
    }

    // Preprocess
    cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);

    // Run inference
    auto start = std::chrono::system_clock::now();
    infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize);
    auto end = std::chrono::system_clock::now();
    std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    // NMS
    std::vector<std::vector<Detection>> res_batch;
    batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);

    // Draw bounding boxes
    draw_bbox(img_batch, res_batch);

    // Save images
    for (size_t j = 0; j < img_batch.size(); j++) {
      cv::imwrite("_" + img_name_batch[j], img_batch[j]);
    }
  }

  // Release stream and buffers
  cudaStreamDestroy(stream);
  CUDA_CHECK(cudaFree(device_buffers[0]));
  CUDA_CHECK(cudaFree(device_buffers[1]));
  delete[] output_buffer_host;
  cuda_preprocess_destroy();
  // Destroy the engine
  delete context;
  delete engine;
  delete runtime;

  // Print histogram of the output distribution
  //std::cout << "\nOutput:\n\n";
  //for (unsigned int i = 0; i < kOutputSize; i++)
  //{
  //    std::cout << prob[i] << ", ";
  //    if (i % 10 == 0) std::cout << std::endl;
  //}
  //std::cout << std::endl;

  return 0;
}


================================================
FILE: yolov7/plugin/yololayer.cu
================================================
#include "yololayer.h"
#include "cuda_utils.h"
#include <assert.h>
#include <vector>
#include <iostream>

namespace Tn {
// Copies the object representation of `val` into the byte buffer at `buffer` and advances `buffer`
// past the bytes written.
// memcpy is used instead of a reinterpret_cast store because the cursor may sit at any byte offset;
// a typed store through a misaligned or unrelated pointer is undefined behavior.
// T is expected to be trivially copyable; the caller guarantees sizeof(T) writable bytes.
template<typename T>
void write(char*& buffer, const T& val) {
  memcpy(buffer, &val, sizeof(T));
  buffer += sizeof(T);
}

// Copies sizeof(T) bytes from the byte buffer at `buffer` into `val` and advances `buffer` past the
// bytes read. Counterpart of write(); the same alignment reasoning applies.
template<typename T>
void read(const char*& buffer, T& val) {
  memcpy(&val, buffer, sizeof(T));
  buffer += sizeof(T);
}
}  // namespace Tn

namespace nvinfer1 {
// Build-time constructor.
//
// classCount:  number of object classes
// netWidth:    network input width
// netHeight:   network input height
// maxOut:      maximum number of detections kept per image
// vYoloKernel: one entry per output scale (grid size plus anchors)
//
// Besides storing the parameters, the anchors of every scale are uploaded to their own device buffer;
// the device pointers are kept in mAnchor, a table allocated with cudaMallocHost.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<YoloKernel>& vYoloKernel) {
  mClassCount = classCount;
  mYoloV7NetWidth = netWidth;
  mYoloV7NetHeight = netHeight;
  mMaxOutObject = maxOut;
  mYoloKernel = vYoloKernel;
  mKernelCount = vYoloKernel.size();

  // Table with one device pointer per scale
  CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));

  // Every scale carries kNumAnchor anchors of two floats each
  const size_t anchorBytes = sizeof(float) * kNumAnchor * 2;
  for (int k = 0; k < mKernelCount; ++k) {
    CUDA_CHECK(cudaMalloc(&mAnchor[k], anchorBytes));
    CUDA_CHECK(cudaMemcpy(mAnchor[k], mYoloKernel[k].anchors, anchorBytes, cudaMemcpyHostToDevice));
  }
}
// Frees the per-scale device anchor buffers first, then the table that holds their pointers.
YoloLayerPlugin::~YoloLayerPlugin() {
  for (int k = 0; k < mKernelCount; ++k) {
    void* deviceAnchors = mAnchor[k];
    CUDA_CHECK(cudaFree(deviceAnchors));
  }
  CUDA_CHECK(cudaFreeHost(mAnchor));
}

// Runtime constructor: restores a plugin from the byte stream produced by serialize().
//
// data:   start of the serialized plugin state
// length: number of bytes expected to be consumed; verified with an assert once everything is read
//
// The fields are read in exactly the order serialize() writes them, followed by the raw YoloKernel
// array. The anchors are then uploaded to the device as in the build-time constructor.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
  using namespace Tn;
  const char* const begin = reinterpret_cast<const char*>(data);
  const char* cursor = begin;

  // Scalar fields
  read(cursor, mClassCount);
  read(cursor, mThreadCount);
  read(cursor, mKernelCount);
  read(cursor, mYoloV7NetWidth);
  read(cursor, mYoloV7NetHeight);
  read(cursor, mMaxOutObject);

  // Kernel descriptions, stored back to back
  mYoloKernel.resize(mKernelCount);
  const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
  memcpy(mYoloKernel.data(), cursor, kernelBytes);
  cursor += kernelBytes;

  // Table of device pointers plus one device anchor buffer per scale
  CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
  const size_t anchorBytes = sizeof(float) * kNumAnchor * 2;
  for (int k = 0; k < mKernelCount; ++k) {
    CUDA_CHECK(cudaMalloc(&mAnchor[k], anchorBytes));
    CUDA_CHECK(cudaMemcpy(mAnchor[k], mYoloKernel[k].anchors, anchorBytes, cudaMemcpyHostToDevice));
  }

  assert(cursor == begin + length);
}

// Writes the plugin state into `buffer`: the six scalar fields followed by the raw YoloKernel array.
// The number of bytes written is asserted to equal getSerializationSize(); the layout must stay in
// sync with the deserializing constructor.
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
  using namespace Tn;
  char* const begin = static_cast<char*>(buffer);
  char* cursor = begin;

  // Scalar fields
  write(cursor, mClassCount);
  write(cursor, mThreadCount);
  write(cursor, mKernelCount);
  write(cursor, mYoloV7NetWidth);
  write(cursor, mYoloV7NetHeight);
  write(cursor, mMaxOutObject);

  // Kernel descriptions, stored back to back
  const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
  memcpy(cursor, mYoloKernel.data(), kernelBytes);
  cursor += kernelBytes;

  assert(cursor == begin + getSerializationSize());
}

// Returns the number of bytes serialize() writes: the six scalar fields plus one YoloKernel per entry
// of mYoloKernel.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
  const size_t scalarBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) +
                             sizeof(mYoloV7NetWidth) + sizeof(mYoloV7NetHeight) + sizeof(mMaxOutObject);
  const size_t kernelBytes = sizeof(YoloKernel) * mYoloKernel.size();
  return scalarBytes + kernelBytes;
}

// No work is done here (the constructors already allocate and upload the anchors); always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
  return 0;
}

// Returns the output shape: (mMaxOutObject Detection records expressed in floats + 1, 1, 1).
// The extra leading element is the slot the detection kernel uses as its counter.
// The arguments are not used; the shape does not depend on the inputs.
Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT {
  const int detectionFloats = mMaxOutObject * sizeof(Detection) / sizeof(float);
  return Dims3(detectionFloats + 1, 1, 1);
}

// Set plugin namespace
// NOTE(review): the argument is assigned straight to mPluginNamespace, which getPluginNamespace() returns
// as a const char*. That suggests the raw pointer is stored, not a copy, so the caller's string would
// have to outlive the plugin - confirm against the member declaration.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
  mPluginNamespace = pluginNamespace;
}

// Returns the namespace previously stored by setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
  return mPluginNamespace;
}

// Return the DataType of the plugin output at the requested index
// The output is always kFLOAT; the index and the input types are ignored.
DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
  return DataType::kFLOAT;
}

// Return true if output tensor is broadcast across a batch.
// This plugin always answers false, regardless of the arguments.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {
  return false;
}

// Return true if plugin can use input that is broadcast across batch without replication.
// This plugin always answers false, regardless of the input index.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
  return false;
}

// Intentionally empty: the plugin does not use the tensor descriptions passed here.
void YoloLayerPlugin::configurePlugin(PluginTensorDesc const* in, int nbInput, PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}

// Attach the plugin object to an execution context and grant the plugin the access to some context resource.
// Intentionally empty: none of the offered context resources are used by this plugin.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}

// Detach the plugin object from its execution context.
// Intentionally empty: attachToContext() acquires nothing, so there is nothing to release.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Returns the plugin type name, "YoloLayer_TRT".
// NOTE(review): presumably has to match the name reported by the plugin creator - confirm.
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
  return "YoloLayer_TRT";
}

// Returns the plugin version string, "1".
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
  return "1";
}

// Destroys the plugin by deleting the object itself; the object must have been allocated with `new`
// (as clone() does) and must not be used after this call.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
  delete this;
}

// Clone the plugin.
// A new heap-allocated plugin is built from this object's class count, network
// size, detection limit and kernel list, and is given the same namespace
// pointer as this object.
IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
  auto* copy = new YoloLayerPlugin(
      mClassCount, mYoloV7NetWidth, mYoloV7NetHeight, mMaxOutObject, mYoloKernel);
  copy->setPluginNamespace(mPluginNamespace);
  return copy;
}

// Logistic (sigmoid) function evaluated on the device: 1 / (1 + e^(-data)).
__device__ float Logist(float data) {
  const float denominator = 1.0f + expf(-data);
  return 1.0f / denominator;
}

// Decodes the raw output of one YOLO scale into Detection records.
//
// Each thread handles one grid cell of one batch image (noElements is the
// total number of such cells). For every anchor of that cell it applies the
// sigmoid to the objectness score, skips the anchor when the score is below
// kIgnoreThresh, picks the best class, and appends a Detection to the output.
//
// Input indexing used below, per batch image:
//   value(anchor k, field f, cell idx) = curInput[idx + (k * info_len_i + f) * total_grid]
// where fields 0..3 are x, y, w, h, field 4 is objectness and fields 5.. are
// the class scores.
//
// Output indexing, per batch image (outputElem floats each): one float used as
// an atomically incremented detection counter, followed by Detection records.
// The counter keeps increasing even after maxoutobject is reached, so it can
// end up larger than the number of records actually written.
//
// anchors holds kNumAnchor (width, height) pairs for this scale.
__global__ void CalDetection(const float *input, float *output, int noElements,
    const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[kNumAnchor * 2], int classes, int outputElem) {
  int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= noElements) return;

  int total_grid = yoloWidth * yoloHeight;  // 80*80 40*40 20*20
  int bnIdx = idx / total_grid;             // batch image this thread belongs to
  idx = idx - total_grid * bnIdx;           // cell index inside that image
  int info_len_i = 5 + classes;
  const float* curInput = input + bnIdx * (info_len_i * total_grid * kNumAnchor);

  // Iterate over every anchor of the cell. The bound is kNumAnchor (not a
  // literal) so it always agrees with the batch stride above and with the
  // size of the anchors array.
  for (int k = 0; k < kNumAnchor; k++) {
    // Start of this anchor's values for the current cell; field f is found
    // at offset f * total_grid from here.
    const float* cell = curInput + idx + k * info_len_i * total_grid;

    float box_prob = Logist(cell[4 * total_grid]);
    if (box_prob < kIgnoreThresh) continue;
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 5; i < info_len_i; ++i) {
      float p = Logist(cell[i * total_grid]);
      if (p > max_cls_prob) {
        max_cls_prob = p;
        class_id = i - 5;
      }
    }
    // Reserve a slot in this image's output; the counter is stored as a float.
    float *res_count = output + bnIdx * outputElem;
    int count = (int)atomicAdd(res_count, 1);
    if (count >= maxoutobject) return;
    char *data = (char*)res_count + sizeof(float) + count * sizeof(Detection);
    Detection *det = (Detection*)(data);

    int row = idx / yoloWidth;
    int col = idx % yoloWidth;

    // Location
    // pytorch:
    //  y = x[i].sigmoid()
    //  y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
    //  y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
    //  X: (2 * sigmoid(tx) - 0.5 + cx) / FeaturemapW * netwidth
    det->bbox[0] = (col - 0.5f + 2.0f * Logist(cell[0 * total_grid])) * netwidth / yoloWidth;
    det->bbox[1] = (row - 0.5f + 2.0f * Logist(cell[1 * total_grid])) * netheight / yoloHeight;

    // W: (2 * sigmoid(tw))^2 * anchor_w, H likewise.
    // Same box decoding as YOLOv5: https://github.com/ultralytics/yolov5/issues/471
    det->bbox[2] = 2.0f * Logist(cell[2 * total_grid]);
    det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
    det->bbox[3] = 2.0f * Logist(cell[3 * total_grid]);
    det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
    det->conf = box_prob * max_cls_prob;
    det->class_id = class_id;
  }
}

void YoloLayerPlugin::forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize) {
  int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
  for (int idx = 0; idx < batchSize; ++idx) {
    CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
  }
  int numElem = 0;

  for (unsigned int i = 0; i < mYoloKernel.size(); ++i) {
    const auto& yolo = mYoloKernel[i];
    numElem = yolo.width * yolo.height * batchSize;
    if (numElem < mThreadCount) mThreadCount = numElem;

    CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>
        (inputs[i], output, numElem, mYoloV7NetWidth, mYoloV7NetHeight, mMaxOutObject, yolo.width, yolo.height, (float*)mAnchor[i], mClassCount, outputElem);
  }
}

// Execution entry point of the plugin: forwards the input pointers and the
// first output pointer to forwardGpu() on the given stream, then returns 0.
// The workspace argument is not used.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
  const float* const* layerInputs = reinterpret_cast<const float* const*>(inputs);
  float* detections = static_cast<float*>(outputs[0]);
  forwardGpu(layerInputs, detections, stream, batchSize);
  return 0;
}

// Definitions of the creator's static members: the field collection returned by
// getFieldNames() and the attribute list it points at.
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// Resets the shared attribute list to empty and points the static field
// collection at it, so getFieldNames() advertises zero fields.
YoloPluginCreator::YoloPluginCreator() {
  mPluginAttributes.clear();
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

// Returns the plugin name handled by this creator. It is the same literal that
// YoloLayerPlugin::getPluginType() returns.
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
  return "YoloLayer_TRT";
}

// Returns the plugin version handled by this creator. It is the same literal
// that YoloLayerPlugin::getPluginVersion() returns.
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
  return "1";
}

// Returns the address of the static field collection that the constructor
// set up.
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
  return &mFC;
}

// Builds a YoloLayerPlugin from a field collection.
//
// Expected fields, in this order:
//   fields[0] "netinfo": at least four ints, read as
//                        {class count, input width, input height, max output objects}
//   fields[1] "kernels": `length` consecutive YoloKernel structs
//
// name is not used. Returns a heap-allocated plugin carrying this creator's
// namespace, or nullptr when the field collection does not have the expected
// layout. The layout is checked at run time as well as by assert, so release
// builds (NDEBUG) do not read through invalid pointers.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
  const bool layoutOk =
      fc != nullptr && fc->fields != nullptr && fc->nbFields == 2 &&
      fc->fields[0].name != nullptr && strcmp(fc->fields[0].name, "netinfo") == 0 &&
      fc->fields[1].name != nullptr && strcmp(fc->fields[1].name, "kernels") == 0 &&
      fc->fields[0].data != nullptr && fc->fields[1].length >= 0 &&
      (fc->fields[1].length == 0 || fc->fields[1].data != nullptr);
  assert(layoutOk && "YoloLayer_TRT expects the fields 'netinfo' and 'kernels'");
  if (!layoutOk) {
    return nullptr;
  }

  const int* p_netinfo = static_cast<const int*>(fc->fields[0].data);
  int class_count = p_netinfo[0];
  int input_w = p_netinfo[1];
  int input_h = p_netinfo[2];
  int max_output_object_count = p_netinfo[3];

  std::vector<YoloKernel> kernels(fc->fields[1].length);
  if (!kernels.empty()) {
    // data() instead of &kernels[0]: indexing an empty vector is undefined.
    memcpy(kernels.data(), fc->fields[1].data, kernels.size() * sizeof(YoloKernel));
  }

  YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, kernels);
  obj->setPluginNamespace(mNamespace.c_str());
  return obj;
}

// Re-creates a plugin from serialized bytes by handing them to the
// YoloLayerPlugin(data, length) constructor, then assigns this creator's
// namespace to it. name is not used.
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
  // This object will be deleted when the network is destroyed, which will
  // call YoloLayerPlugin::destroy()
  YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
  obj->setPluginNamespace(mNamespace.c_str());
  return obj;
}
}  // namespace nvinfer1


================================================
FILE: yolov7/plugin/yololayer.h
================================================
#pragma once

#include "macros.h"
#include "types.h"
#include <vector>
#include <string>

namespace nvinfer1 {
// TensorRT plugin layer that decodes raw YOLOv7 head outputs into Detection
// records on the GPU. It has a single float output and accepts only
// linear-format float tensors.
class API YoloLayerPlugin : public IPluginV2IOExt {
 public:
  // Builds a plugin from explicit parameters: number of classes, network input
  // size, maximum number of detections, and one YoloKernel per output scale.
  YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<YoloKernel>& vYoloKernel);
  // Builds a plugin from a serialized byte buffer.
  YoloLayerPlugin(const void* data, size_t length);
  ~YoloLayerPlugin();

  // The plugin always produces exactly one output tensor.
  int getNbOutputs() const TRT_NOEXCEPT override {
    return 1;
  }

  Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

  int initialize() TRT_NOEXCEPT override;

  // Nothing to release here.
  virtual void terminate() TRT_NOEXCEPT override {}

  // No scratch workspace is requested.
  virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

  virtual int enqueue(int batchSize, const void* const* inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

  virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

  virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

  // Only linear-layout float tensors are accepted, for inputs and outputs alike.
  bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
    return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
  }

  const char* getPluginType() const TRT_NOEXCEPT override;

  const char* getPluginVersion() const TRT_NOEXCEPT override;

  void destroy() TRT_NOEXCEPT override;

  IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

  void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

  const char* getPluginNamespace() const TRT_NOEXCEPT override;

  DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;

  bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;

  bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

  void attachToContext(
      cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

  void configurePlugin(PluginTensorDesc const* in, int nbInput, PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT override;

  void detachFromContext() TRT_NOEXCEPT override;

 private:
  // Launches the decoding work for every YOLO scale on the given stream.
  void forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize = 1);
  // Threads per block used when launching the decoding kernel.
  int mThreadCount = 256;
  // Namespace pointer supplied through setPluginNamespace(); not owned.
  // Defaults to the empty string so getPluginNamespace() and clone() never
  // read an indeterminate pointer when no namespace has been set yet.
  const char* mPluginNamespace = "";
  int mKernelCount;
  int mClassCount;
  int mYoloV7NetWidth;
  int mYoloV7NetHeight;
  int mMaxOutObject;
  // One entry per output scale.
  std::vector<YoloKernel> mYoloKernel;
  // Per-scale anchor buffers, indexed like mYoloKernel. Starts out null so the
  // pointer is never indeterminate before a constructor assigns it.
  void** mAnchor = nullptr;
};

// Factory for YoloLayerPlugin objects: builds them either from a
// PluginFieldCollection (createPlugin) or from serialized bytes
// (deserializePlugin).
class API YoloPluginCreator : public IPluginCreator {
 public:
  YoloPluginCreator();

  ~YoloPluginCreator() override = default;

  const char* getPluginName() const TRT_NOEXCEPT override;

  const char* getPluginVersion() const TRT_NOEXCEPT override;

  const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

  IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;

  IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;

  // Stores a copy of the namespace string.
  void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override {
    mNamespace = libNamespace;
  }

  // Returns a pointer into the stored namespace string; it is only valid until
  // the namespace is changed or the creator is destroyed.
  const char* getPluginNamespace() const TRT_NOEXCEPT override {
    return mNamespace.c_str();
  }

 private:
  // Namespace handed to every plugin this creator produces.
  std::string mNamespace;
  // Field collection returned by getFieldNames(); shared by all creators.
  static PluginFieldCollection mFC;
  // Backing storage for mFC.fields.
  static std::vector<PluginField> mPluginAttributes;
};
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
}  // namespace nvinfer1


================================================
FILE: yolov7/src/block.cpp
================================================
﻿#include "block.h"
#include "yololayer.h"
#include "NvInfer.h"
#include <iostream>
#include <fstream>
#include <assert.h>
#include <cmath>
#include <cstring>

using namespace nvinfer1;

// Loads a TensorRT .wts weight file into a name -> Weights map.
//
// The file is whitespace delimited, as read below:
//   <number of blobs>
//   <name> <size in decimal> <size values in hex>      (once per blob)
//
// Every blob is stored as kFLOAT with its 32-bit words copied verbatim into a
// malloc'ed buffer. The buffers are owned by the returned map's Weights
// entries and are not freed here.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // Counting upwards terminates immediately on a zero or negative count,
    // which matters when the assert above is compiled out.
    for (int32_t blob = 0; blob < count; ++blob)
    {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. sizeof(*val) is the element size; the previous
        // sizeof(val) was the size of the pointer and over-allocated.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds a per-channel scale layer that applies batch normalization using the
// statistics stored under "<lname>.weight", ".bias", ".running_mean" and
// ".running_var":
//   scale = gamma / sqrt(var + eps)
//   shift = beta - mean * gamma / sqrt(var + eps)
//   power = 1
// The three computed arrays are malloc'ed and stored back in weightMap under
// "<lname>.scale", "<lname>.shift" and "<lname>.power".
static IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float* gamma = (float*)weightMap[lname + ".weight"].values;
    float* beta = (float*)weightMap[lname + ".bias"].values;
    float* mean = (float*)weightMap[lname + ".running_mean"].values;
    float* var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    const size_t bytes = sizeof(float) * len;
    float* scaleValues = reinterpret_cast<float*>(malloc(bytes));
    float* shiftValues = reinterpret_cast<float*>(malloc(bytes));
    float* powerValues = reinterpret_cast<float*>(malloc(bytes));

    // The three arrays are independent, so they are filled in a single pass.
    for (int c = 0; c < len; ++c) {
        scaleValues[c] = gamma[c] / sqrt(var[c] + eps);
        shiftValues[c] = beta[c] - mean[c] * gamma[c] / sqrt(var[c] + eps);
        powerValues[c] = 1.0;
    }

    Weights scale{ DataType::kFLOAT, scaleValues, len };
    Weights shift{ DataType::kFLOAT, shiftValues, len };
    Weights power{ DataType::kFLOAT, powerValues, len };

    // Keep the buffers reachable through the weight map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    return bnLayer;
}

// Builds a bias-free k x k convolution (c2 output maps, stride s, padding p),
// followed by batch normalization (eps 1e-3) and a SiLU activation.
// Weights are looked up under "<lname>.conv.weight" and "<lname>.bn.*".
// Returns the element-wise product layer that implements SiLU.
IElementWiseLayer* convBnSilu(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c2, int k, int s, int p, std::string lname) {
    const std::string convName = lname + ".conv";
    Weights noBias{ DataType::kFLOAT, nullptr, 0 };

    IConvolutionLayer* conv = network->addConvolutionNd(
        input, c2, DimsHW{ k, k }, weightMap[lname + ".conv.weight"], noBias);
    assert(conv);
    conv->setName(convName.c_str());
    conv->setStrideNd(DimsHW{ s, s });
    conv->setPaddingNd(DimsHW{ p, p });

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
    ITensor& normalized = *bn->getOutput(0);

    // silu(x) = x * sigmoid(x)
    IActivationLayer* gate = network->addActivation(normalized, ActivationType::kSIGMOID);
    assert(gate);
    IElementWiseLayer* silu = network->addElementWise(normalized, *gate->getOutput(0), ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Space-to-depth rearrangement: takes four stride-2 slices of the input
// (offsets (0,0), (1,0), (0,1), (1,1) in the last two dimensions), each of
// size inch x kInputH/2 x kInputW/2, and concatenates them in that order.
// weightMap is not used.
ILayer* ReOrg(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch) {
    // Start offsets in the last two dimensions, in concatenation order.
    const int offsets[4][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } };

    ITensor* pieces[4];
    for (int i = 0; i < 4; ++i) {
        ISliceLayer* slice = network->addSlice(
            input,
            Dims3{ 0, offsets[i][0], offsets[i][1] },
            Dims3{ inch, kInputH / 2, kInputW / 2 },
            Dims3{ 1, 2, 2 });
        pieces[i] = slice->getOutput(0);
    }
    return network->addConcatenation(pieces, 4);
}

// Downsampling block with two parallel branches whose outputs are concatenated:
//   branch A: 1x1 conv (c1 maps) -> 3x3 stride-2 conv (c2/2 maps)
//   branch B: 2x2 stride-2 max pool -> 1x1 conv (c2/2 maps)
// Weights are read from "<lname>.cv1", "<lname>.cv2" and "<lname>.cv3".
ILayer* DownC(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, const std::string& lname) {
    const int halfOut = static_cast<int>(c2 * 0.5);

    // Branch A: convolutional downsampling.
    IElementWiseLayer* reduce = convBnSilu(network, weightMap, input, c1, 1, 1, 0, lname + ".cv1");
    IElementWiseLayer* strided = convBnSilu(network, weightMap, *reduce->getOutput(0), halfOut, 3, 2, 1, lname + ".cv2");

    // Branch B: pooled downsampling.
    IPoolingLayer* pool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{ 2, 2 });
    pool->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* pooled = convBnSilu(network, weightMap, *pool->getOutput(0), halfOut, 1, 1, 0, lname + ".cv3");

    ITensor* branches[] = { strided->getOutput(0), pooled->getOutput(0) };
    return network->addConcatenation(branches, 2);
}

// SPPCSPC block. The main path runs cv1 -> cv3 -> cv4, then concatenates cv4's
// output with 5x5, 9x9 and 13x13 stride-1 max pools of it (padded so the size
// is kept) and continues with cv5 -> cv6. That result is concatenated with the
// cv2 shortcut taken from the block input and fused by cv7 into c2 maps.
// Weights are read from "<lname>.cv1" .. "<lname>.cv7".
IElementWiseLayer* SPPCSPC(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c2, const std::string& lname) {
    const int hidden = static_cast<int>(2 * c2 * 0.5);

    IElementWiseLayer* cv1 = convBnSilu(network, weightMap, input, hidden, 1, 1, 0, lname + ".cv1");
    IElementWiseLayer* cv2 = convBnSilu(network, weightMap, input, hidden, 1, 1, 0, lname + ".cv2");

    IElementWiseLayer* cv3 = convBnSilu(network, weightMap, *cv1->getOutput(0), hidden, 3, 1, 1, lname + ".cv3");
    IElementWiseLayer* cv4 = convBnSilu(network, weightMap, *cv3->getOutput(0), hidden, 1, 1, 0, lname + ".cv4");

    // Spatial pyramid: the un-pooled tensor followed by three pooled copies.
    const int windows[3] = { 5, 9, 13 };
    ITensor* pyramid[4];
    pyramid[0] = cv4->getOutput(0);
    for (int i = 0; i < 3; ++i) {
        const int window = windows[i];
        IPoolingLayer* pool = network->addPoolingNd(*cv4->getOutput(0), PoolingType::kMAX, DimsHW{ window, window });
        pool->setStrideNd(DimsHW{ 1, 1 });
        pool->setPaddingNd(DimsHW{ window / 2, window / 2 });
        pyramid[i + 1] = pool->getOutput(0);
    }
    IConcatenationLayer* pyramidCat = network->addConcatenation(pyramid, 4);
    // 0U
    pyramidCat->setAxis(0);

    IElementWiseLayer* cv5 = convBnSilu(network, weightMap, *pyramidCat->getOutput(0), hidden, 1, 1, 0, lname + ".cv5");
    IElementWiseLayer* cv6 = convBnSilu(network, weightMap, *cv5->getOutput(0), hidden, 3, 1, 1, lname + ".cv6");

    // Merge the main path with the cv2 shortcut.
    ITensor* merged[] = { cv6->getOutput(0), cv2->getOutput(0) };
    IConcatenationLayer* mergeCat = network->addConcatenation(merged, 2);
    // 0U
    mergeCat->setAxis(0);

    return convBnSilu(network, weightMap, *mergeCat->getOutput(0), c2, 1, 1, 0, lname + ".cv7");
}

// RepConv block: the sum of a k x k conv + BN branch ("<lname>.rbr_dense") and
// a 1x1 conv + BN branch ("<lname>.rbr_1x1"), both with stride s and c2 output
// maps, followed by SiLU. Returns the element-wise product layer of the SiLU.
IElementWiseLayer* RepConv(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c2, int k, int s, const std::string& lname) {
    Weights noBias{ DataType::kFLOAT, nullptr, 0 };
    const std::string denseName = lname + ".rbr_dense.0";
    const std::string pointName = lname + ".rbr_1x1.0";

    // Dense branch, e.g. weights of shape 256 * 128 * 3 * 3.
    IConvolutionLayer* denseConv = network->addConvolutionNd(input, c2, DimsHW{ k, k }, weightMap[lname + ".rbr_dense.0.weight"], noBias);
    assert(denseConv);
    denseConv->setPaddingNd(DimsHW{ k / 2, k / 2 });
    denseConv->setStrideNd(DimsHW{ s, s });
    denseConv->setName(denseName.c_str());
    IScaleLayer* denseBn = addBatchNorm2d(network, weightMap, *denseConv->getOutput(0), lname + ".rbr_dense.1", 1e-3);

    // 1x1 branch.
    IConvolutionLayer* pointConv = network->addConvolutionNd(input, c2, DimsHW{ 1, 1 }, weightMap[lname + ".rbr_1x1.0.weight"], noBias);
    assert(pointConv);
    pointConv->setStrideNd(DimsHW{ s, s });
    pointConv->setName(pointName.c_str());
    IScaleLayer* pointBn = addBatchNorm2d(network, weightMap, *pointConv->getOutput(0), lname + ".rbr_1x1.1", 1e-3);

    IElementWiseLayer* branchSum = network->addElementWise(*denseBn->getOutput(0), *pointBn->getOutput(0), ElementWiseOperation::kSUM);
    assert(branchSum);

    // silu(x) = x * sigmoid(x)
    ITensor& summed = *branchSum->getOutput(0);
    IActivationLayer* gate = network->addActivation(summed, ActivationType::kSIGMOID);
    return network->addElementWise(summed, *gate->getOutput(0), ElementWiseOperation::kPROD);
}

// Builds a bias-free ksize x ksize convolution (outch maps, stride s, padding
// p), batch normalization (eps 1e-5) and a leaky ReLU with alpha 0.1.
// Weights are looked up under "<lname>.conv.weight" and "<lname>.bn.*".
IActivationLayer* convBlockLeakRelu(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, std::string lname) {
    const std::string convName = lname + ".conv";
    Weights noBias{ DataType::kFLOAT, nullptr, 0 };

    IConvolutionLayer* conv = network->addConvolutionNd(
        input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], noBias);
    assert(conv);
    conv->setName(convName.c_str());
    conv->setStrideNd(DimsHW{ s, s });
    conv->setPaddingNd(DimsHW{ p, p });
    // Grouped convolution is not configured here.

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-5);

    auto leaky = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
    leaky->setAlpha(0.1);
    return leaky;
}

static std::vector<std::vector<float>> getAnchors(std::map<std::string, Weights>& weightMap, std::string lname) {
    std::vector<std::vector<float>> anchors;
    Weights wts = weightMap[lname + ".anchor_grid"];
    int anchor_len = kNumAnchor * 2;
    for (int i = 0; i < wts.count / anchor_len; i++) {
        auto *p = (const float*)wts.values + i * anchor_len;
        std::vector<float> anchor(p, p + anchor_len);
        anchors.push_back(anchor);
    }
    return anchors;
}

// Adds the "YoloLayer_TRT" plugin layer to the network.
//
// network   - network being built.
// weightMap - loaded weights; "<lname>.anchor_grid" supplies the anchors.
// lname     - name prefix of the detection head in the weight map.
// dets      - detection convolution layers; output 0 of each becomes one plugin
//             input, in the given order.
//
// One YoloKernel is created per anchor group. The first group is paired with a
// feature map of kInputW/8 x kInputH/8 and the downscale factor doubles for
// each following group. Returns the plugin layer that was added.
IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, std::string lname, std::vector<IConvolutionLayer*> dets) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    // The registry lookup result is dereferenced below, so check it first.
    assert(creator && "YoloLayer_TRT plugin creator is not registered");
    auto anchors = getAnchors(weightMap, lname);

    PluginField plugin_fields[2];
    int netinfo[4] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox};
    plugin_fields[0].data = netinfo;
    plugin_fields[0].length = 4;
    plugin_fields[0].name = "netinfo";
    // The payload is an array of ints, so it is tagged as such.
    plugin_fields[0].type = PluginFieldType::kINT32;
    int scale = 8;

    std::vector<YoloKernel> kernels;
    for (size_t i = 0; i < anchors.size(); i++) {
        YoloKernel kernel;
        kernel.width = kInputW / scale;
        kernel.height = kInputH / scale;
        memcpy(kernel.anchors, anchors[i].data(), anchors[i].size() * sizeof(float));
        kernels.push_back(kernel);
        scale *= 2;
    }
    // data() is well defined even when the vector is empty, unlike &kernels[0].
    plugin_fields[1].data = kernels.data();
    plugin_fields[1].length = kernels.size();
    plugin_fields[1].name = "kernels";
    // NOTE(review): the payload is raw YoloKernel structs; the kFLOAT32 tag is
    // kept from the original code and is only nominal.
    plugin_fields[1].type = PluginFieldType::kFLOAT32;
    PluginFieldCollection plugin_data;
    plugin_data.nbFields = 2;
    plugin_data.fields = plugin_fields;
    IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data);
    assert(plugin_obj && "failed to create the YoloLayer_TRT plugin");

    std::vector<ITensor*> input_tensors;
    for (auto det: dets) {
        input_tensors.push_back(det->getOutput(0));
    }
    auto yolo = network->addPluginV2(input_tensors.data(), input_tensors.size(), *plugin_obj);
    assert(yolo);
    return yolo;
}


================================================
FILE: yolov7/src/calibrator.cpp
================================================
#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/dnn/dnn.hpp>
#include "calibrator.h"
#include "cuda_utils.h"
#include "utils.h"

// Sets up an INT8 calibrator: remembers the batch geometry, image directory,
// calibration table name and input blob name, allocates a device buffer large
// enough for one batch, and collects the file names found in img_dir.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache)
{
    // Number of floats in one batch (3 values per pixel, presumably the colour channels).
    input_count_ = 3 * input_w * input_h * batchsize;

    const size_t batch_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMalloc(&device_input_, batch_bytes));

    read_files_in_dir(img_dir, img_files_);
}

// Releases the device buffer that the constructor allocated with cudaMalloc.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT
{
    return batchsize_;
}

// Supplies the next calibration batch.
//
// Loads the next batchsize_ images from img_dir_, preprocesses each one to the
// network input size, packs them into a single blob scaled by 1/255, copies
// the blob into the device buffer and publishes that buffer in bindings[0].
// Returns false when fewer than batchsize_ images remain or when an image
// cannot be read; returns true otherwise.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT
{
    const int batch_end = img_idx_ + batchsize_;
    if (batch_end > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> batch_imgs;
    for (int i = img_idx_; i < batch_end; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        cv::Mat raw = cv::imread(img_dir_ + img_files_[i]);
        if (raw.empty()){
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        batch_imgs.push_back(preprocess_img(raw, input_w_, input_h_));
    }
    // Advance the cursor only after the whole batch has been loaded.
    img_idx_ = batch_end;

    const cv::Size net_size(input_w_, input_h_);
    cv::Mat blob = cv::dnn::blobFromImages(batch_imgs, 1.0 / 255.0, net_size, cv::Scalar(0, 0, 0), true, false);

    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Loads the calibration table from calib_table_name_ into calib_cache_.
//
// The file is read byte for byte (whitespace included), and only when
// read_cache_ is set and the file could be opened. length receives the number
// of bytes now held in the cache. Returns a pointer to the cached bytes, or
// nullptr when the cache is empty.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream cache_file(calib_table_name_, std::ios::binary);
    // Keep whitespace characters: they are part of the cached data.
    cache_file >> std::noskipws;
    if (read_cache_ && cache_file.good())
    {
        char byte;
        while (cache_file >> byte)
        {
            calib_cache_.push_back(byte);
        }
    }

    length = calib_cache_.size();
    if (length == 0)
    {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes `length` bytes from `cache` to the file named calib_table_name_,
// opened in binary mode, after logging the file name and size.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    std::ofstream output(calib_table_name_, std::ios::binary);
    output.write(reinterpret_cast<const char*>(cache), length);
}


================================================
FILE: yolov7/src/model.cpp
================================================
#include "model.h"
#include "block.h"
// #include "yololayer.h"
#include "config.h"
#include "calibrator.h"
#include <iostream>
#include <cassert>

using namespace nvinfer1;

// Builds the YOLOv7-E6E network layer by layer through the TensorRT network
// definition API and returns the serialized engine.
//
// maxBatchSize - forwarded to builder->setMaxBatchSize().
// builder      - creates the network and builds the serialized engine.
// config       - builder configuration; the workspace size and, depending on
//                USE_FP16 / USE_INT8, the precision flag (and INT8 calibrator)
//                are set on it here.
// dt           - data type of the network input tensor.
// wts_path     - path handed to loadWeights().
//
// Returns the result of buildSerializedNetwork(); this is nullptr when the
// build fails, in which case an error is printed instead of the success
// message. The network definition, the host weight buffers and (INT8 builds)
// the calibrator are released before returning.
//
// NOTE(review): the trailing "model.N" string given to convBnSilu / DownC /
// SPPCSPC is presumably the weight-name prefix of that layer in the .wts
// file; the local variable suffix (convN, concatN, reN) follows the same N.
IHostMemory* build_engine_yolov7e6e(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) {
    std::map<std::string, Weights> weightMap = loadWeights(wts_path);

    INetworkDefinition* network = builder->createNetworkV2(0U);
    // Input tensor is declared as 3 x kInputH x kInputW.
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
    assert(data);

    /*----------------------------------yolov7e6e backbone-----------------------------------------*/
    auto* conv0 = ReOrg(network, weightMap, *data, 3);


    IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 80, 3, 1, 1, "model.1");
    auto conv2 = DownC(network, weightMap, *conv1->getOutput(0), 80, 160, "model.2");

    // Stage fed by conv2: two parallel branches (model.3-12 and model.13-22),
    // each ending in a 5-way concat + 1x1 conv, summed element-wise in conv23.
    IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.3");
    IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.4");

    IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv4->getOutput(0), 64, 3, 1, 1, "model.5");
    IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6");
    IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7");
    IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8");
    IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9");
    IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *conv9->getOutput(0), 64, 3, 1, 1, "model.10");

    // Every second conv of the chain plus both 1x1 entry convs are concatenated.
    ITensor* input_tensor_11[] = { conv10->getOutput(0), conv8->getOutput(0),conv6->getOutput(0), conv4->getOutput(0),
        conv3->getOutput(0) };
    IConcatenationLayer* concat11 = network->addConcatenation(input_tensor_11, 5);

    IElementWiseLayer* conv12 = convBnSilu(network, weightMap, *concat11->getOutput(0), 160, 1, 1, 0, "model.12");


    IElementWiseLayer* conv13 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.13");
    IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.14");

    IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *conv14->getOutput(0), 64, 3, 1, 1, "model.15");
    IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv15->getOutput(0), 64, 3, 1, 1, "model.16");
    IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv16->getOutput(0), 64, 3, 1, 1, "model.17");
    IElementWiseLayer* conv18 = convBnSilu(network, weightMap, *conv17->getOutput(0), 64, 3, 1, 1, "model.18");
    IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *conv18->getOutput(0), 64, 3, 1, 1, "model.19");
    IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 64, 3, 1, 1, "model.20");
    ITensor* input_tensor_21[] = { conv20->getOutput(0), conv18->getOutput(0),conv16->getOutput(0), conv14->getOutput(0),
        conv13->getOutput(0) };
    IConcatenationLayer* concat21 = network->addConcatenation(input_tensor_21, 5);

    IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *concat21->getOutput(0), 160, 1, 1, 0, "model.22");
    auto conv23 = network->addElementWise(*conv22->getOutput(0), *conv12->getOutput(0), ElementWiseOperation::kSUM);

    // Stage fed by conv24 (model.25-44), same two-branch layout, summed in conv45.
    auto conv24 = DownC(network, weightMap, *conv23->getOutput(0), 160, 320, "model.24");
    IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, "model.25");
    IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, "model.26");

    IElementWiseLayer* conv27 = convBnSilu(network, weightMap, *conv26->getOutput(0), 128, 3, 1, 1, "model.27");
    IElementWiseLayer* conv28 = convBnSilu(network, weightMap, *conv27->getOutput(0), 128, 3, 1, 1, "model.28");
    IElementWiseLayer* conv29 = convBnSilu(network, weightMap, *conv28->getOutput(0), 128, 3, 1, 1, "model.29");
    IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *conv29->getOutput(0), 128, 3, 1, 1, "model.30");
    IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv30->getOutput(0), 128, 3, 1, 1, "model.31");
    IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 128, 3, 1, 1, "model.32");

    ITensor* input_tensor_33[] = { conv32->getOutput(0), conv30->getOutput(0),conv28->getOutput(0), conv26->getOutput(0),
        conv25->getOutput(0)};
    IConcatenationLayer* concat33 = network->addConcatenation(input_tensor_33, 5);

    IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *concat33->getOutput(0), 320, 1, 1, 0, "model.34");

    IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, "model.35");
    IElementWiseLayer* conv36 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, "model.36");

    IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *conv36->getOutput(0), 128, 3, 1, 1, "model.37");
    IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 128, 3, 1, 1, "model.38");
    IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *conv38->getOutput(0), 128, 3, 1, 1, "model.39");
    IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv39->getOutput(0), 128, 3, 1, 1, "model.40");
    IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 128, 3, 1, 1, "model.41");
    IElementWiseLayer* conv42 = convBnSilu(network, weightMap, *conv41->getOutput(0), 128, 3, 1, 1, "model.42");

    ITensor* input_tensor_43[] = { conv42->getOutput(0), conv40->getOutput(0),conv38->getOutput(0), conv36->getOutput(0),
        conv35->getOutput(0)};
    IConcatenationLayer* concat43 = network->addConcatenation(input_tensor_43, 5);
    IElementWiseLayer* conv44 = convBnSilu(network, weightMap, *concat43->getOutput(0), 320, 1, 1, 0, "model.44");

    // conv45 is reused later by the head (input of conv165).
    auto conv45 = network->addElementWise(*conv44->getOutput(0), *conv34->getOutput(0), ElementWiseOperation::kSUM);

    // Stage fed by conv46 (model.47-66), summed in conv67.
    auto conv46 = DownC(network, weightMap, *conv45->getOutput(0), 320, 640, "model.46");//=====


    IElementWiseLayer* conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 1, 1, 0, "model.47");
    IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 1, 1, 0, "model.48");

    IElementWiseLayer* conv49 = convBnSilu(network, weightMap, *conv48->getOutput(0), 256, 3, 1, 1, "model.49");
    IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *conv49->getOutput(0), 256, 3, 1, 1, "model.50");
    IElementWiseLayer* conv51 = convBnSilu(network, weightMap, *conv50->getOutput(0), 256, 3, 1, 1, "model.51");
    IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *conv51->getOutput(0), 256, 3, 1, 1, "model.52");
    IElementWiseLayer* conv53 = convBnSilu(network, weightMap, *conv52->getOutput(0), 256, 3, 1, 1, "model.53");
    IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv53->getOutput(0), 256, 3, 1, 1, "model.54");

    ITensor* input_tensor_55[] = { conv54->getOutput(0), conv52->getOutput(0),conv50->getOutput(0), conv48->getOutput(0),
        conv47->getOutput(0) };
    IConcatenationLayer* concat55 = network->addConcatenation(input_tensor_55, 5);
    IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *concat55->getOutput(0), 640, 1, 1, 0, "model.56");

    IElementWiseLayer* conv57 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 1, 1, 0, "model.57");
    IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 1, 1, 0, "model.58");

    IElementWiseLayer* conv59 = convBnSilu(network, weightMap, *conv58->getOutput(0), 256, 3, 1, 1, "model.59");
    IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 256, 3, 1, 1, "model.60");
    IElementWiseLayer* conv61 = convBnSilu(network, weightMap, *conv60->getOutput(0), 256, 3, 1, 1, "model.61");
    IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *conv61->getOutput(0), 256, 3, 1, 1, "model.62");
    IElementWiseLayer* conv63 = convBnSilu(network, weightMap, *conv62->getOutput(0), 256, 3, 1, 1, "model.63");
    IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *conv63->getOutput(0), 256, 3, 1, 1, "model.64");
    ITensor* input_tensor_65[] = { conv64->getOutput(0), conv62->getOutput(0),conv60->getOutput(0), conv58->getOutput(0),
        conv57->getOutput(0) };
    IConcatenationLayer* concat65 = network->addConcatenation(input_tensor_65, 5);
    IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *concat65->getOutput(0), 640, 1, 1, 0, "model.66");
    // conv67 is reused later by the head (input of conv140).
    auto conv67 = network->addElementWise(*conv66->getOutput(0), *conv56->getOutput(0), ElementWiseOperation::kSUM);

    // Stage fed by conv68 (model.69-88), summed in conv89.
    auto conv68 = DownC(network, weightMap, *conv67->getOutput(0), 640, 960, "model.68");//=====

    IElementWiseLayer* conv69 = convBnSilu(network, weightMap, *conv68->getOutput(0), 384, 1, 1, 0, "model.69");
    IElementWiseLayer* conv70 = convBnSilu(network, weightMap, *conv68->getOutput(0), 384, 1, 1, 0, "model.70");

    IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *conv70->getOutput(0), 384, 3, 1, 1, "model.71");
    IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *conv71->getOutput(0), 384, 3, 1, 1, "model.72");
    IElementWiseLayer* conv73 = convBnSilu(network, weightMap, *conv72->getOutput(0), 384, 3, 1, 1, "model.73");
    IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv73->getOutput(0), 384, 3, 1, 1, "model.74");
    IElementWiseLayer* conv75 = convBnSilu(network, weightMap, *conv74->getOutput(0), 384, 3, 1, 1, "model.75");
    IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *conv75->getOutput(0), 384, 3, 1, 1, "model.76");
    ITensor* input_tensor_77[] = { conv76->getOutput(0), conv74->getOutput(0),conv72->getOutput(0), conv70->getOutput(0),
        conv69->getOutput(0) };
    IConcatenationLayer* concat77 = network->addConcatenation(input_tensor_77, 5);
    IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *concat77->getOutput(0), 960, 1, 1, 0, "model.78");

    IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv68->getOutput(0), 384, 1, 1, 0, "model.79");
    IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv68->getOutput(0), 384, 1, 1, 0, "model.80");

    IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 384, 3, 1, 1, "model.81");
    IElementWiseLayer* conv82 = convBnSilu(network, weightMap, *conv81->getOutput(0), 384, 3, 1, 1, "model.82");
    IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *conv82->getOutput(0), 384, 3, 1, 1, "model.83");
    IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 384, 3, 1, 1, "model.84");
    IElementWiseLayer* conv85 = convBnSilu(network, weightMap, *conv84->getOutput(0), 384, 3, 1, 1, "model.85");
    IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *conv85->getOutput(0), 384, 3, 1, 1, "model.86");
    ITensor* input_tensor_87[] = { conv86->getOutput(0), conv84->getOutput(0),conv82->getOutput(0), conv80->getOutput(0),
        conv79->getOutput(0) };
    IConcatenationLayer* concat87 = network->addConcatenation(input_tensor_87, 5);
    IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *concat87->getOutput(0), 960, 1, 1, 0, "model.88");
    // conv89 is reused later by the head (input of conv115).
    auto conv89 = network->addElementWise(*conv88->getOutput(0), *conv78->getOutput(0), ElementWiseOperation::kSUM);


    // Stage fed by conv90 (model.91-110), summed in conv111.
    auto conv90 = DownC(network, weightMap, *conv89->getOutput(0), 960, 1280, "model.90");

    IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv90->getOutput(0), 512, 1, 1, 0, "model.91");
    IElementWiseLayer* conv92 = convBnSilu(network, weightMap, *conv90->getOutput(0), 512, 1, 1, 0, "model.92");

    IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *conv92->getOutput(0), 512, 3, 1, 1, "model.93");
    IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *conv93->getOutput(0), 512, 3, 1, 1, "model.94");
    IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *conv94->getOutput(0), 512, 3, 1, 1, "model.95");
    IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 512, 3, 1, 1, "model.96");
    IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 512, 3, 1, 1, "model.97");
    IElementWiseLayer* conv98 = convBnSilu(network, weightMap, *conv97->getOutput(0), 512, 3, 1, 1, "model.98");
    ITensor* input_tensor_99[] = { conv98->getOutput(0), conv96->getOutput(0),conv94->getOutput(0), conv92->getOutput(0),
      conv91->getOutput(0) };
    IConcatenationLayer* concat99 = network->addConcatenation(input_tensor_99, 5);
    IElementWiseLayer* conv100 = convBnSilu(network, weightMap, *concat99->getOutput(0), 1280, 1, 1, 0, "model.100");

    IElementWiseLayer* conv101 = convBnSilu(network, weightMap, *conv90->getOutput(0), 512, 1, 1, 0, "model.101");
    IElementWiseLayer* conv102 = convBnSilu(network, weightMap, *conv90->getOutput(0), 512, 1, 1, 0, "model.102");

    IElementWiseLayer* conv103 = convBnSilu(network, weightMap, *conv102->getOutput(0), 512, 3, 1, 1, "model.103");
    IElementWiseLayer* conv104 = convBnSilu(network, weightMap, *conv103->getOutput(0), 512, 3, 1, 1, "model.104");
    IElementWiseLayer* conv105 = convBnSilu(network, weightMap, *conv104->getOutput(0), 512, 3, 1, 1, "model.105");
    IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *conv105->getOutput(0), 512, 3, 1, 1, "model.106");
    IElementWiseLayer* conv107 = convBnSilu(network, weightMap, *conv106->getOutput(0), 512, 3, 1, 1, "model.107");
    IElementWiseLayer* conv108 = convBnSilu(network, weightMap, *conv107->getOutput(0), 512, 3, 1, 1, "model.108");
    ITensor* input_tensor_109[] = { conv108->getOutput(0), conv106->getOutput(0),conv104->getOutput(0), conv102->getOutput(0),
      conv101->getOutput(0) };
    IConcatenationLayer* concat109 = network->addConcatenation(input_tensor_109, 5);
    IElementWiseLayer* conv110 = convBnSilu(network, weightMap, *concat109->getOutput(0), 1280, 1, 1, 0, "model.110");
    auto conv111 = network->addElementWise(*conv110->getOutput(0), *conv100->getOutput(0), ElementWiseOperation::kSUM);
    //---------------------------yolov7e6e head---------------------------------
    auto conv112 = SPPCSPC(network, weightMap, *conv111->getOutput(0), 640, "model.112");
    IElementWiseLayer* conv113 = convBnSilu(network, weightMap, *conv112->getOutput(0), 480, 1, 1, 0, "model.113");


    // Resize factors shared by all three resize layers below: 1x on the first
    // dimension and 2x on the remaining two, nearest-neighbour mode.
    float scale[] = { 1.0, 2.0, 2.0 };
    IResizeLayer* re114 = network->addResize(*conv113->getOutput(0));
    re114->setResizeMode(ResizeMode::kNEAREST);
    re114->setScales(scale, 3);

    // Merge the resized tensor with a 1x1 projection of backbone output conv89.
    IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *conv89->getOutput(0), 480, 1, 1, 0, "model.115");
    ITensor* input_tensor_116[] = { conv115->getOutput(0), re114->getOutput(0) };
    IConcatenationLayer* concat116 = network->addConcatenation(input_tensor_116, 2);


    // Head blocks concatenate all eight convs of a branch (not every second one
    // as in the backbone) before the closing 1x1 conv.
    IElementWiseLayer* conv117 = convBnSilu(network, weightMap, *concat116->getOutput(0), 384, 1, 1, 0, "model.117");
    IElementWiseLayer* conv118 = convBnSilu(network, weightMap, *concat116->getOutput(0), 384, 1, 1, 0, "model.118");

    IElementWiseLayer* conv119 = convBnSilu(network, weightMap, *conv118->getOutput(0), 192, 3, 1, 1, "model.119");
    IElementWiseLayer* conv120 = convBnSilu(network, weightMap, *conv119->getOutput(0), 192, 3, 1, 1, "model.120");
    IElementWiseLayer* conv121 = convBnSilu(network, weightMap, *conv120->getOutput(0), 192, 3, 1, 1, "model.121");
    IElementWiseLayer* conv122 = convBnSilu(network, weightMap, *conv121->getOutput(0), 192, 3, 1, 1, "model.122");
    IElementWiseLayer* conv123 = convBnSilu(network, weightMap, *conv122->getOutput(0), 192, 3, 1, 1, "model.123");
    IElementWiseLayer* conv124 = convBnSilu(network, weightMap, *conv123->getOutput(0), 192, 3, 1, 1, "model.124");
    ITensor* input_tensor_125[] = { conv124->getOutput(0), conv123->getOutput(0),conv122->getOutput(0), conv121->getOutput(0),
        conv120->getOutput(0), conv119->getOutput(0), conv118->getOutput(0), conv117->getOutput(0) };
    IConcatenationLayer* concat125 = network->addConcatenation(input_tensor_125, 8);
    IElementWiseLayer* conv126 = convBnSilu(network, weightMap, *concat125->getOutput(0), 480, 1, 1, 0, "model.126");

    IElementWiseLayer* conv127 = convBnSilu(network, weightMap, *concat116->getOutput(0), 384, 1, 1, 0, "model.127");
    IElementWiseLayer* conv128 = convBnSilu(network, weightMap, *concat116->getOutput(0), 384, 1, 1, 0, "model.128");

    IElementWiseLayer* conv129 = convBnSilu(network, weightMap, *conv128->getOutput(0), 192, 3, 1, 1, "model.129");
    IElementWiseLayer* conv130 = convBnSilu(network, weightMap, *conv129->getOutput(0), 192, 3, 1, 1, "model.130");
    IElementWiseLayer* conv131 = convBnSilu(network, weightMap, *conv130->getOutput(0), 192, 3, 1, 1, "model.131");
    IElementWiseLayer* conv132 = convBnSilu(network, weightMap, *conv131->getOutput(0), 192, 3, 1, 1, "model.132");
    IElementWiseLayer* conv133 = convBnSilu(network, weightMap, *conv132->getOutput(0), 192, 3, 1, 1, "model.133");
    IElementWiseLayer* conv134 = convBnSilu(network, weightMap, *conv133->getOutput(0), 192, 3, 1, 1, "model.134");
    ITensor* input_tensor_135[] = { conv134->getOutput(0), conv133->getOutput(0),conv132->getOutput(0), conv131->getOutput(0),
        conv130->getOutput(0), conv129->getOutput(0), conv128->getOutput(0), conv127->getOutput(0) };
    IConcatenationLayer* concat135 = network->addConcatenation(input_tensor_135, 8);
    IElementWiseLayer* conv136 = convBnSilu(network, weightMap, *concat135->getOutput(0), 480, 1, 1, 0, "model.136");
    // conv137 is reused further down (concat212).
    auto conv137 = network->addElementWise(*conv136->getOutput(0), *conv126->getOutput(0), ElementWiseOperation::kSUM);

    IElementWiseLayer* conv138 = convBnSilu(network, weightMap, *conv137->getOutput(0), 320, 1, 1, 0, "model.138");
    IResizeLayer* re139 = network->addResize(*conv138->getOutput(0));
    re139->setResizeMode(ResizeMode::kNEAREST);
    re139->setScales(scale, 3);
    // Merge with a 1x1 projection of backbone output conv67.
    IElementWiseLayer* conv140 = convBnSilu(network, weightMap, *conv67->getOutput(0), 320, 1, 1, 0, "model.140");
    ITensor* input_tensor_141[] = { conv140->getOutput(0), re139->getOutput(0) };
    IConcatenationLayer* concat141 = network->addConcatenation(input_tensor_141, 2);

    IElementWiseLayer* conv142 = convBnSilu(network, weightMap, *concat141->getOutput(0), 256, 1, 1, 0, "model.142");
    IElementWiseLayer* conv143 = convBnSilu(network, weightMap, *concat141->getOutput(0), 256, 1, 1, 0, "model.143");

    IElementWiseLayer* conv144 = convBnSilu(network, weightMap, *conv143->getOutput(0), 128, 3, 1, 1, "model.144");
    IElementWiseLayer* conv145 = convBnSilu(network, weightMap, *conv144->getOutput(0), 128, 3, 1, 1, "model.145");
    IElementWiseLayer* conv146 = convBnSilu(network, weightMap, *conv145->getOutput(0), 128, 3, 1, 1, "model.146");
    IElementWiseLayer* conv147 = convBnSilu(network, weightMap, *conv146->getOutput(0), 128, 3, 1, 1, "model.147");
    IElementWiseLayer* conv148 = convBnSilu(network, weightMap, *conv147->getOutput(0), 128, 3, 1, 1, "model.148");
    IElementWiseLayer* conv149 = convBnSilu(network, weightMap, *conv148->getOutput(0), 128, 3, 1, 1, "model.149");

    ITensor* input_tensor_150[] = { conv149->getOutput(0), conv148->getOutput(0),conv147->getOutput(0), conv146->getOutput(0),
        conv145->getOutput(0), conv144->getOutput(0), conv143->getOutput(0), conv142->getOutput(0) };
    IConcatenationLayer* concat150 = network->addConcatenation(input_tensor_150, 8);

    IElementWiseLayer* conv151 = convBnSilu(network, weightMap, *concat150->getOutput(0), 320, 1, 1, 0, "model.151");

    IElementWiseLayer* conv152 = convBnSilu(network, weightMap, *concat141->getOutput(0), 256, 1, 1, 0, "model.152");
    IElementWiseLayer* conv153 = convBnSilu(network, weightMap, *concat141->getOutput(0), 256, 1, 1, 0, "model.153");

    IElementWiseLayer* conv154 = convBnSilu(network, weightMap, *conv153->getOutput(0), 128, 3, 1, 1, "model.154");
    IElementWiseLayer* conv155 = convBnSilu(network, weightMap, *conv154->getOutput(0), 128, 3, 1, 1, "model.155");
    IElementWiseLayer* conv156 = convBnSilu(network, weightMap, *conv155->getOutput(0), 128, 3, 1, 1, "model.156");
    IElementWiseLayer* conv157 = convBnSilu(network, weightMap, *conv156->getOutput(0), 128, 3, 1, 1, "model.157");
    IElementWiseLayer* conv158 = convBnSilu(network, weightMap, *conv157->getOutput(0), 128, 3, 1, 1, "model.158");
    IElementWiseLayer* conv159 = convBnSilu(network, weightMap, *conv158->getOutput(0), 128, 3, 1, 1, "model.159");
    ITensor* input_tensor_160[] = { conv159->getOutput(0), conv158->getOutput(0),conv157->getOutput(0), conv156->getOutput(0),
        conv155->getOutput(0), conv154->getOutput(0), conv153->getOutput(0), conv152->getOutput(0) };
    IConcatenationLayer* concat160 = network->addConcatenation(input_tensor_160, 8);
    IElementWiseLayer* conv161 = convBnSilu(network, weightMap, *concat160->getOutput(0), 320, 1, 1, 0, "model.161");
    // conv162 is reused further down (concat189).
    auto conv162 = network->addElementWise(*conv161->getOutput(0), *conv151->getOutput(0), ElementWiseOperation::kSUM);

    IElementWiseLayer* conv163 = convBnSilu(network, weightMap, *conv162->getOutput(0), 160, 1, 1, 0, "model.163");

    IResizeLayer* re164 = network->addResize(*conv163->getOutput(0));
    re164->setResizeMode(ResizeMode::kNEAREST);
    re164->setScales(scale, 3);

    // Merge with a 1x1 projection of backbone output conv45.
    IElementWiseLayer* conv165 = convBnSilu(network, weightMap, *conv45->getOutput(0), 160, 1, 1, 0, "model.165");
    ITensor* input_tensor_166[] = { conv165->getOutput(0), re164->getOutput(0) };
    IConcatenationLayer* concat166 = network->addConcatenation(input_tensor_166, 2);

    IElementWiseLayer* conv167 = convBnSilu(network, weightMap, *concat166->getOutput(0), 128, 1, 1, 0, "model.167");
    IElementWiseLayer* conv168 = convBnSilu(network, weightMap, *concat166->getOutput(0), 128, 1, 1, 0, "model.168");
    IElementWiseLayer* conv169 = convBnSilu(network, weightMap, *conv168->getOutput(0), 64, 3, 1, 1, "model.169");
    IElementWiseLayer* conv170 = convBnSilu(network, weightMap, *conv169->getOutput(0), 64, 3, 1, 1, "model.170");
    IElementWiseLayer* conv171 = convBnSilu(network, weightMap, *conv170->getOutput(0), 64, 3, 1, 1, "model.171");
    IElementWiseLayer* conv172 = convBnSilu(network, weightMap, *conv171->getOutput(0), 64, 3, 1, 1, "model.172");
    IElementWiseLayer* conv173 = convBnSilu(network, weightMap, *conv172->getOutput(0), 64, 3, 1, 1, "model.173");
    IElementWiseLayer* conv174 = convBnSilu(network, weightMap, *conv173->getOutput(0), 64, 3, 1, 1, "model.174");


    ITensor* input_tensor_175[] = { conv174->getOutput(0), conv173->getOutput(0),conv172->getOutput(0), conv171->getOutput(0),
       conv170->getOutput(0), conv169->getOutput(0), conv168->getOutput(0), conv167->getOutput(0) };
    IConcatenationLayer* concat175 = network->addConcatenation(input_tensor_175, 8);
    IElementWiseLayer* conv176 = convBnSilu(network, weightMap, *concat175->getOutput(0), 160, 1, 1, 0, "model.176");
    IElementWiseLayer* conv177 = convBnSilu(network, weightMap, *concat166->getOutput(0), 128, 1, 1, 0, "model.177");
    IElementWiseLayer* conv178 = convBnSilu(network, weightMap, *concat166->getOutput(0), 128, 1, 1, 0, "model.178");

    IElementWiseLayer* conv179 = convBnSilu(network, weightMap, *conv178->getOutput(0), 64, 3, 1, 1, "model.179");
    IElementWiseLayer* conv180 = convBnSilu(network, weightMap, *conv179->getOutput(0), 64, 3, 1, 1, "model.180");
    IElementWiseLayer* conv181 = convBnSilu(network, weightMap, *conv180->getOutput(0), 64, 3, 1, 1, "model.181");
    IElementWiseLayer* conv182 = convBnSilu(network, weightMap, *conv181->getOutput(0), 64, 3, 1, 1, "model.182");
    IElementWiseLayer* conv183 = convBnSilu(network, weightMap, *conv182->getOutput(0), 64, 3, 1, 1, "model.183");
    IElementWiseLayer* conv184 = convBnSilu(network, weightMap, *conv183->getOutput(0), 64, 3, 1, 1, "model.184");
    ITensor* input_tensor_185[] = { conv184->getOutput(0), conv183->getOutput(0),conv182->getOutput(0), conv181->getOutput(0),
       conv180->getOutput(0), conv179->getOutput(0), conv178->getOutput(0), conv177->getOutput(0) };
    IConcatenationLayer* concat185 = network->addConcatenation(input_tensor_185, 8);
    IElementWiseLayer* conv186 = convBnSilu(network, weightMap, *concat185->getOutput(0), 160, 1, 1, 0, "model.186");
    // conv187 feeds the first detection branch (conv257) and the DownC below.
    auto conv187 = network->addElementWise(*conv186->getOutput(0), *conv176->getOutput(0), ElementWiseOperation::kSUM);

    auto conv188 = DownC(network, weightMap, *conv187->getOutput(0), 160, 320, "model.188");


    ITensor* input_tensor_189[] = { conv188->getOutput(0), conv162->getOutput(0) };
    IConcatenationLayer* concat189 = network->addConcatenation(input_tensor_189, 2);

    IElementWiseLayer* conv190 = convBnSilu(network, weightMap, *concat189->getOutput(0), 256, 1, 1, 0, "model.190");
    IElementWiseLayer* conv191 = convBnSilu(network, weightMap, *concat189->getOutput(0), 256, 1, 1, 0, "model.191");

    IElementWiseLayer* conv192 = convBnSilu(network, weightMap, *conv191->getOutput(0), 128, 3, 1, 1, "model.192");
    IElementWiseLayer* conv193 = convBnSilu(network, weightMap, *conv192->getOutput(0), 128, 3, 1, 1, "model.193");
    IElementWiseLayer* conv194 = convBnSilu(network, weightMap, *conv193->getOutput(0), 128, 3, 1, 1, "model.194");
    IElementWiseLayer* conv195 = convBnSilu(network, weightMap, *conv194->getOutput(0), 128, 3, 1, 1, "model.195");
    IElementWiseLayer* conv196 = convBnSilu(network, weightMap, *conv195->getOutput(0), 128, 3, 1, 1, "model.196");
    IElementWiseLayer* conv197 = convBnSilu(network, weightMap, *conv196->getOutput(0), 128, 3, 1, 1, "model.197");


    ITensor* input_tensor_198[] = { conv197->getOutput(0), conv196->getOutput(0),conv195->getOutput(0), conv194->getOutput(0),
       conv193->getOutput(0), conv192->getOutput(0), conv191->getOutput(0), conv190->getOutput(0) };
    IConcatenationLayer* concat198 = network->addConcatenation(input_tensor_198, 8);
    IElementWiseLayer* conv199 = convBnSilu(network, weightMap, *concat198->getOutput(0), 320, 1, 1, 0, "model.199");

    IElementWiseLayer* conv200 = convBnSilu(network, weightMap, *concat189->getOutput(0), 256, 1, 1, 0, "model.200");
    IElementWiseLayer* conv201 = convBnSilu(network, weightMap, *concat189->getOutput(0), 256, 1, 1, 0, "model.201");

    IElementWiseLayer* conv202 = convBnSilu(network, weightMap, *conv201->getOutput(0), 128, 3, 1, 1, "model.202");
    IElementWiseLayer* conv203 = convBnSilu(network, weightMap, *conv202->getOutput(0), 128, 3, 1, 1, "model.203");
    IElementWiseLayer* conv204 = convBnSilu(network, weightMap, *conv203->getOutput(0), 128, 3, 1, 1, "model.204");
    IElementWiseLayer* conv205 = convBnSilu(network, weightMap, *conv204->getOutput(0), 128, 3, 1, 1, "model.205");
    IElementWiseLayer* conv206 = convBnSilu(network, weightMap, *conv205->getOutput(0), 128, 3, 1, 1, "model.206");
    IElementWiseLayer* conv207 = convBnSilu(network, weightMap, *conv206->getOutput(0), 128, 3, 1, 1, "model.207");
    ITensor* input_tensor_208[] = { conv207->getOutput(0), conv206->getOutput(0),conv205->getOutput(0), conv204->getOutput(0),
      conv203->getOutput(0), conv202->getOutput(0), conv201->getOutput(0), conv200->getOutput(0) };
    IConcatenationLayer* concat208 = network->addConcatenation(input_tensor_208, 8);
    IElementWiseLayer* conv209 = convBnSilu(network, weightMap, *concat208->getOutput(0), 320, 1, 1, 0, "model.209");
    // conv210 feeds the second detection branch (conv258) and the DownC below.
    auto conv210 = network->addElementWise(*conv209->getOutput(0), *conv199->getOutput(0), ElementWiseOperation::kSUM);


    auto conv211 = DownC(network, weightMap, *conv210->getOutput(0), 320, 480, "model.211");
    ITensor* input_tensor_212[] = { conv211->getOutput(0), conv137->getOutput(0) };
    IConcatenationLayer* concat212 = network->addConcatenation(input_tensor_212, 2);

    IElementWiseLayer* conv213 = convBnSilu(network, weightMap, *concat212->getOutput(0), 384, 1, 1, 0, "model.213");
    IElementWiseLayer* conv214 = convBnSilu(network, weightMap, *concat212->getOutput(0), 384, 1, 1, 0, "model.214");

    IElementWiseLayer* conv215 = convBnSilu(network, weightMap, *conv214->getOutput(0), 192, 3, 1, 1, "model.215");
    IElementWiseLayer* conv216 = convBnSilu(network, weightMap, *conv215->getOutput(0), 192, 3, 1, 1, "model.216");
    IElementWiseLayer* conv217 = convBnSilu(network, weightMap, *conv216->getOutput(0), 192, 3, 1, 1, "model.217");
    IElementWiseLayer* conv218 = convBnSilu(network, weightMap, *conv217->getOutput(0), 192, 3, 1, 1, "model.218");
    IElementWiseLayer* conv219 = convBnSilu(network, weightMap, *conv218->getOutput(0), 192, 3, 1, 1, "model.219");
    IElementWiseLayer* conv220 = convBnSilu(network, weightMap, *conv219->getOutput(0), 192, 3, 1, 1, "model.220");

    ITensor* input_tensor_221[] = { conv220->getOutput(0), conv219->getOutput(0),conv218->getOutput(0), conv217->getOutput(0),
      conv216->getOutput(0), conv215->getOutput(0), conv214->getOutput(0), conv213->getOutput(0) };
    IConcatenationLayer* concat221 = network->addConcatenation(input_tensor_221, 8);
    IElementWiseLayer* conv222 = convBnSilu(network, weightMap, *concat221->getOutput(0), 480, 1, 1, 0, "model.222");

    IElementWiseLayer* conv223 = convBnSilu(network, weightMap, *concat212->getOutput(0), 384, 1, 1, 0, "model.223");
    IElementWiseLayer* conv224 = convBnSilu(network, weightMap, *concat212->getOutput(0), 384, 1, 1, 0, "model.224");

    IElementWiseLayer* conv225 = convBnSilu(network, weightMap, *conv224->getOutput(0), 192, 3, 1, 1, "model.225");
    IElementWiseLayer* conv226 = convBnSilu(network, weightMap, *conv225->getOutput(0), 192, 3, 1, 1, "model.226");
    IElementWiseLayer* conv227 = convBnSilu(network, weightMap, *conv226->getOutput(0), 192, 3, 1, 1, "model.227");
    IElementWiseLayer* conv228 = convBnSilu(network, weightMap, *conv227->getOutput(0), 192, 3, 1, 1, "model.228");
    IElementWiseLayer* conv229 = convBnSilu(network, weightMap, *conv228->getOutput(0), 192, 3, 1, 1, "model.229");
    IElementWiseLayer* conv230 = convBnSilu(network, weightMap, *conv229->getOutput(0), 192, 3, 1, 1, "model.230");
    ITensor* input_tensor_231[] = { conv230->getOutput(0), conv229->getOutput(0),conv228->getOutput(0), conv227->getOutput(0),
     conv226->getOutput(0), conv225->getOutput(0), conv224->getOutput(0), conv223->getOutput(0) };
    IConcatenationLayer* concat231 = network->addConcatenation(input_tensor_231, 8);
    IElementWiseLayer* conv232 = convBnSilu(network, weightMap, *concat231->getOutput(0), 480, 1, 1, 0, "model.232");

    // conv233 feeds the third detection branch (conv259) and the DownC below.
    auto conv233 = network->addElementWise(*conv232->getOutput(0), *conv222->getOutput(0), ElementWiseOperation::kSUM);


    auto conv234 = DownC(network, weightMap, *conv233->getOutput(0), 480, 640, "model.234");
    ITensor* input_tensor_235[] = { conv234->getOutput(0), conv112->getOutput(0) };
    IConcatenationLayer* concat235 = network->addConcatenation(input_tensor_235, 2);


    IElementWiseLayer* conv236 = convBnSilu(network, weightMap, *concat235->getOutput(0), 512, 1, 1, 0, "model.236");
    IElementWiseLayer* conv237 = convBnSilu(network, weightMap, *concat235->getOutput(0), 512, 1, 1, 0, "model.237");

    IElementWiseLayer* conv238 = convBnSilu(network, weightMap, *conv237->getOutput(0), 256, 3, 1, 1, "model.238");
    IElementWiseLayer* conv239 = convBnSilu(network, weightMap, *conv238->getOutput(0), 256, 3, 1, 1, "model.239");
    IElementWiseLayer* conv240 = convBnSilu(network, weightMap, *conv239->getOutput(0), 256, 3, 1, 1, "model.240");
    IElementWiseLayer* conv241 = convBnSilu(network, weightMap, *conv240->getOutput(0), 256, 3, 1, 1, "model.241");
    IElementWiseLayer* conv242 = convBnSilu(network, weightMap, *conv241->getOutput(0), 256, 3, 1, 1, "model.242");
    IElementWiseLayer* conv243 = convBnSilu(network, weightMap, *conv242->getOutput(0), 256, 3, 1, 1, "model.243");

    ITensor* input_tensor_244[] = { conv243->getOutput(0), conv242->getOutput(0),conv241->getOutput(0), conv240->getOutput(0),
     conv239->getOutput(0), conv238->getOutput(0), conv237->getOutput(0), conv236->getOutput(0) };
    IConcatenationLayer* concat244 = network->addConcatenation(input_tensor_244, 8);
    IElementWiseLayer* conv245 = convBnSilu(network, weightMap, *concat244->getOutput(0), 640, 1, 1, 0, "model.245");

    IElementWiseLayer* conv246 = convBnSilu(network, weightMap, *concat235->getOutput(0), 512, 1, 1, 0, "model.246");
    IElementWiseLayer* conv247 = convBnSilu(network, weightMap, *concat235->getOutput(0), 512, 1, 1, 0, "model.247");

    IElementWiseLayer* conv248 = convBnSilu(network, weightMap, *conv247->getOutput(0), 256, 3, 1, 1, "model.248");
    IElementWiseLayer* conv249 = convBnSilu(network, weightMap, *conv248->getOutput(0), 256, 3, 1, 1, "model.249");
    IElementWiseLayer* conv250 = convBnSilu(network, weightMap, *conv249->getOutput(0), 256, 3, 1, 1, "model.250");
    IElementWiseLayer* conv251 = convBnSilu(network, weightMap, *conv250->getOutput(0), 256, 3, 1, 1, "model.251");
    IElementWiseLayer* conv252 = convBnSilu(network, weightMap, *conv251->getOutput(0), 256, 3, 1, 1, "model.252");
    IElementWiseLayer* conv253 = convBnSilu(network, weightMap, *conv252->getOutput(0), 256, 3, 1, 1, "model.253");

    ITensor* input_tensor_254[] = { conv253->getOutput(0), conv252->getOutput(0),conv251->getOutput(0), conv250->getOutput(0),
    conv249->getOutput(0), conv248->getOutput(0), conv247->getOutput(0), conv246->getOutput(0) };
    IConcatenationLayer* concat254 = network->addConcatenation(input_tensor_254, 8);

    IElementWiseLayer* conv255= convBnSilu(network, weightMap, *concat254->getOutput(0), 640, 1, 1, 0, "model.255");
    // conv256 feeds the fourth detection branch (conv260).
    auto conv256 = network->addElementWise(*conv255->getOutput(0), *conv245->getOutput(0), ElementWiseOperation::kSUM);

    // One 3x3 conv per detection branch, applied to conv187 / conv210 / conv233 / conv256.
    IElementWiseLayer* conv257 = convBnSilu(network, weightMap, *conv187->getOutput(0), 320, 3, 1, 1, "model.257");
    IElementWiseLayer* conv258 = convBnSilu(network, weightMap, *conv210->getOutput(0), 640, 3, 1, 1, "model.258");
    IElementWiseLayer* conv259 = convBnSilu(network, weightMap, *conv233->getOutput(0), 960, 3, 1, 1, "model.259");
    IElementWiseLayer* conv260 = convBnSilu(network, weightMap, *conv256->getOutput(0), 1280, 3, 1, 1, "model.260");


    // out
    // Four 1x1 convolutions, each with kNumAnchor * (kNumClass + 5) output
    // channels, using the "model.261.m.<i>" weights and biases.
    IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv257->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.261.m.0.weight"], weightMap["model.261.m.0.bias"]);
    assert(cv105_0);
    cv105_0->setName("cv105.0");
    IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv258->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.261.m.1.weight"], weightMap["model.261.m.1.bias"]);
    assert(cv105_1);
    cv105_1->setName("cv105.1");
    IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv259->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.261.m.2.weight"], weightMap["model.261.m.2.bias"]);
    assert(cv105_2);
    cv105_2->setName("cv105.2");
    IConvolutionLayer* cv105_3 = network->addConvolutionNd(*conv260->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.261.m.3.weight"], weightMap["model.261.m.3.bias"]);
    assert(cv105_3);
    cv105_3->setName("cv105.3");

    /*------------detect-----------*/
    auto yolo = addYoLoLayer(network, weightMap, "model.261", std::vector<IConvolutionLayer*>{cv105_0, cv105_1, cv105_2, cv105_3});
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));
    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // buildSerializedNetwork() yields nullptr on failure; only claim success
    // when a serialized engine actually came back.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

#if !defined(USE_FP16) && defined(USE_INT8)
    // The calibrator is only needed while the engine is being built. Detach it
    // from the config before freeing it so no dangling pointer is left behind.
    config->setInt8Calibrator(nullptr);
    delete calibrator;
#endif

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}

// Builds the YOLOv7-D6 network definition layer by layer from a .wts weight file and
// returns the serialized TensorRT engine.
//
// Parameters:
//   maxBatchSize - forwarded to builder->setMaxBatchSize().
//   builder      - TensorRT builder used to create the network and build the engine.
//   config       - builder configuration; workspace size and precision flags are set here.
//   dt           - data type of the network input tensor.
//   wts_path     - path handed to loadWeights() to populate the weight map.
//
// Returns the IHostMemory produced by buildSerializedNetwork(), or nullptr when the
// build fails (a failure message is written to std::cerr in that case). The network
// definition and the host weight buffers are released before returning either way.
//
// Reading aid: the trailing integer arguments of convBnSilu() are presumably
// (out channels, kernel size, stride, padding) — confirm against its definition.
// The "model.N" strings are the weight-name prefixes looked up in the weight map.
IHostMemory* build_engine_yolov7d6(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) {
    std::map<std::string, Weights> weightMap = loadWeights(wts_path);

    // Flags == 0U: network without the explicit-batch flag; the input is declared as 3 x H x W.
    INetworkDefinition* network = builder->createNetworkV2(0U);
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
    assert(data);

    /*----------------------------------yolov7d6 backbone-----------------------------------------*/
    auto* conv0 = ReOrg(network, weightMap, *data, 3);

    IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 96, 3, 1, 1, "model.1");
    auto conv2 = DownC(network, weightMap, *conv1->getOutput(0), 96, 192, "model.2");

    // Backbone block pattern (repeated five times below): two parallel convs on the same
    // input, a chain of eight convs on the second branch, then a concat of every other
    // chain output plus both branch heads, fused by one more conv.
    IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.3");
    IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.4");

    IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv4->getOutput(0), 64, 3, 1, 1, "model.5");
    IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6");
    IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7");
    IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8");
    IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9");
    IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *conv9->getOutput(0), 64, 3, 1, 1, "model.10");
    IElementWiseLayer* conv11 = convBnSilu(network, weightMap, *conv10->getOutput(0), 64, 3, 1, 1, "model.11");
    IElementWiseLayer* conv12 = convBnSilu(network, weightMap, *conv11->getOutput(0), 64, 3, 1, 1, "model.12");

    ITensor* input_tensor_13[] = { conv12->getOutput(0), conv10->getOutput(0), conv8->getOutput(0), conv6->getOutput(0),
        conv4->getOutput(0), conv3->getOutput(0) };
    IConcatenationLayer* concat13 = network->addConcatenation(input_tensor_13, 6);

    IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *concat13->getOutput(0), 192, 1, 1, 0, "model.14");

    // Second backbone block.
    auto conv15 = DownC(network, weightMap, *conv14->getOutput(0), 192, 384, "model.15");
    IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv15->getOutput(0), 128, 1, 1, 0, "model.16");
    IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv15->getOutput(0), 128, 1, 1, 0, "model.17");

    IElementWiseLayer* conv18 = convBnSilu(network, weightMap, *conv17->getOutput(0), 128, 3, 1, 1, "model.18");
    IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *conv18->getOutput(0), 128, 3, 1, 1, "model.19");
    IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 128, 3, 1, 1, "model.20");
    IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 128, 3, 1, 1, "model.21");
    IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *conv21->getOutput(0), 128, 3, 1, 1, "model.22");
    IElementWiseLayer* conv23 = convBnSilu(network, weightMap, *conv22->getOutput(0), 128, 3, 1, 1, "model.23");
    IElementWiseLayer* conv24 = convBnSilu(network, weightMap, *conv23->getOutput(0), 128, 3, 1, 1, "model.24");
    IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 3, 1, 1, "model.25");
    ITensor* input_tensor_26[] = { conv25->getOutput(0), conv23->getOutput(0), conv21->getOutput(0), conv19->getOutput(0),
        conv17->getOutput(0), conv16->getOutput(0) };
    IConcatenationLayer* concat26 = network->addConcatenation(input_tensor_26, 6);

    // conv27 is reused later by the head (model.102).
    IElementWiseLayer* conv27 = convBnSilu(network, weightMap, *concat26->getOutput(0), 384, 1, 1, 0, "model.27");

    // Third backbone block.
    auto conv28 = DownC(network, weightMap, *conv27->getOutput(0), 384, 768, "model.28");
    IElementWiseLayer* conv29 = convBnSilu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.29");
    IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.30");

    IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv30->getOutput(0), 256, 3, 1, 1, "model.31");
    IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 256, 3, 1, 1, "model.32");
    IElementWiseLayer* conv33 = convBnSilu(network, weightMap, *conv32->getOutput(0), 256, 3, 1, 1, "model.33");
    IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *conv33->getOutput(0), 256, 3, 1, 1, "model.34");
    IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *conv34->getOutput(0), 256, 3, 1, 1, "model.35");
    IElementWiseLayer* conv36 = convBnSilu(network, weightMap, *conv35->getOutput(0), 256, 3, 1, 1, "model.36");
    IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *conv36->getOutput(0), 256, 3, 1, 1, "model.37");
    IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 256, 3, 1, 1, "model.38");
    ITensor* input_tensor_39[] = { conv38->getOutput(0), conv36->getOutput(0), conv34->getOutput(0), conv32->getOutput(0),
        conv30->getOutput(0), conv29->getOutput(0) };
    IConcatenationLayer* concat39 = network->addConcatenation(input_tensor_39, 6);

    // conv40 is reused later by the head (model.86).
    IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *concat39->getOutput(0), 768, 1, 1, 0, "model.40");

    // Fourth backbone block.
    auto conv41 = DownC(network, weightMap, *conv40->getOutput(0), 768, 1152, "model.41");
    IElementWiseLayer* conv42 = convBnSilu(network, weightMap, *conv41->getOutput(0), 384, 1, 1, 0, "model.42");
    IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *conv41->getOutput(0), 384, 1, 1, 0, "model.43");

    IElementWiseLayer* conv44 = convBnSilu(network, weightMap, *conv43->getOutput(0), 384, 3, 1, 1, "model.44");
    IElementWiseLayer* conv45 = convBnSilu(network, weightMap, *conv44->getOutput(0), 384, 3, 1, 1, "model.45");
    IElementWiseLayer* conv46 = convBnSilu(network, weightMap, *conv45->getOutput(0), 384, 3, 1, 1, "model.46");
    IElementWiseLayer* conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 384, 3, 1, 1, "model.47");
    IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv47->getOutput(0), 384, 3, 1, 1, "model.48");
    IElementWiseLayer* conv49 = convBnSilu(network, weightMap, *conv48->getOutput(0), 384, 3, 1, 1, "model.49");
    IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *conv49->getOutput(0), 384, 3, 1, 1, "model.50");
    IElementWiseLayer* conv51 = convBnSilu(network, weightMap, *conv50->getOutput(0), 384, 3, 1, 1, "model.51");

    ITensor* input_tensor_52[] = { conv51->getOutput(0), conv49->getOutput(0), conv47->getOutput(0), conv45->getOutput(0),
        conv43->getOutput(0), conv42->getOutput(0) };
    IConcatenationLayer* concat52 = network->addConcatenation(input_tensor_52, 6);
    // conv53 is reused later by the head (model.70).
    IElementWiseLayer* conv53 = convBnSilu(network, weightMap, *concat52->getOutput(0), 1152, 1, 1, 0, "model.53");

    // Fifth (last) backbone block.
    auto conv54 = DownC(network, weightMap, *conv53->getOutput(0), 1152, 1536, "model.54");
    IElementWiseLayer* conv55 = convBnSilu(network, weightMap, *conv54->getOutput(0), 512, 1, 1, 0, "model.55");
    IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *conv54->getOutput(0), 512, 1, 1, 0, "model.56");

    IElementWiseLayer* conv57 = convBnSilu(network, weightMap, *conv56->getOutput(0), 512, 3, 1, 1, "model.57");
    IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *conv57->getOutput(0), 512, 3, 1, 1, "model.58");
    IElementWiseLayer* conv59 = convBnSilu(network, weightMap, *conv58->getOutput(0), 512, 3, 1, 1, "model.59");
    IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 512, 3, 1, 1, "model.60");
    IElementWiseLayer* conv61 = convBnSilu(network, weightMap, *conv60->getOutput(0), 512, 3, 1, 1, "model.61");
    IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *conv61->getOutput(0), 512, 3, 1, 1, "model.62");
    IElementWiseLayer* conv63 = convBnSilu(network, weightMap, *conv62->getOutput(0), 512, 3, 1, 1, "model.63");
    IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *conv63->getOutput(0), 512, 3, 1, 1, "model.64");
    ITensor* input_tensor_65[] = { conv64->getOutput(0), conv62->getOutput(0), conv60->getOutput(0), conv58->getOutput(0),
        conv56->getOutput(0), conv55->getOutput(0) };
    IConcatenationLayer* concat65 = network->addConcatenation(input_tensor_65, 6);
    IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *concat65->getOutput(0), 1536, 1, 1, 0, "model.66");

    //------------------------yolov7d6 head-------------------------------
    auto conv67 = SPPCSPC(network, weightMap, *conv66->getOutput(0), 768, "model.67");
    IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *conv67->getOutput(0), 576, 1, 1, 0, "model.68");

    // Per-dimension resize factors shared by all three resize layers: the first
    // dimension is left unchanged and the other two are doubled (nearest neighbour).
    float scale[] = { 1.0, 2.0, 2.0 };
    IResizeLayer* re69 = network->addResize(*conv68->getOutput(0));
    re69->setResizeMode(ResizeMode::kNEAREST);
    re69->setScales(scale, 3);

    // Top-down path, step 1: merge the resized tensor with backbone output conv53.
    IElementWiseLayer* conv70 = convBnSilu(network, weightMap, *conv53->getOutput(0), 576, 1, 1, 0, "model.70");
    ITensor* input_tensor_71[] = { conv70->getOutput(0), re69->getOutput(0) };
    IConcatenationLayer* concat71 = network->addConcatenation(input_tensor_71, 2);

    // Head block pattern (repeated six times below): two parallel convs, a chain of eight
    // convs on the second branch, then a concat of all ten tensors fused by one conv.
    IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *concat71->getOutput(0), 384, 1, 1, 0, "model.72");
    IElementWiseLayer* conv73 = convBnSilu(network, weightMap, *concat71->getOutput(0), 384, 1, 1, 0, "model.73");

    IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv73->getOutput(0), 192, 3, 1, 1, "model.74");
    IElementWiseLayer* conv75 = convBnSilu(network, weightMap, *conv74->getOutput(0), 192, 3, 1, 1, "model.75");
    IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *conv75->getOutput(0), 192, 3, 1, 1, "model.76");
    IElementWiseLayer* conv77 = convBnSilu(network, weightMap, *conv76->getOutput(0), 192, 3, 1, 1, "model.77");
    IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *conv77->getOutput(0), 192, 3, 1, 1, "model.78");
    IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv78->getOutput(0), 192, 3, 1, 1, "model.79");
    IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv79->getOutput(0), 192, 3, 1, 1, "model.80");
    IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 192, 3, 1, 1, "model.81");

    ITensor* input_tensor_82[] = { conv81->getOutput(0), conv80->getOutput(0), conv79->getOutput(0), conv78->getOutput(0),
        conv77->getOutput(0), conv76->getOutput(0), conv75->getOutput(0), conv74->getOutput(0), conv73->getOutput(0),
        conv72->getOutput(0) };
    IConcatenationLayer* concat82 = network->addConcatenation(input_tensor_82, 10);
    // conv83 is reused by the bottom-up path (concat131).
    IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *concat82->getOutput(0), 576, 1, 1, 0, "model.83");

    // Top-down path, step 2: merge the resized tensor with backbone output conv40.
    IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 384, 1, 1, 0, "model.84");
    IResizeLayer* re85 = network->addResize(*conv84->getOutput(0));
    re85->setResizeMode(ResizeMode::kNEAREST);
    re85->setScales(scale, 3);
    IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *conv40->getOutput(0), 384, 1, 1, 0, "model.86");
    ITensor* input_tensor_87[] = { conv86->getOutput(0), re85->getOutput(0) };
    IConcatenationLayer* concat87 = network->addConcatenation(input_tensor_87, 2);

    IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *concat87->getOutput(0), 256, 1, 1, 0, "model.88");
    IElementWiseLayer* conv89 = convBnSilu(network, weightMap, *concat87->getOutput(0), 256, 1, 1, 0, "model.89");

    IElementWiseLayer* conv90 = convBnSilu(network, weightMap, *conv89->getOutput(0), 128, 3, 1, 1, "model.90");
    IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv90->getOutput(0), 128, 3, 1, 1, "model.91");
    IElementWiseLayer* conv92 = convBnSilu(network, weightMap, *conv91->getOutput(0), 128, 3, 1, 1, "model.92");
    IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *conv92->getOutput(0), 128, 3, 1, 1, "model.93");
    IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *conv93->getOutput(0), 128, 3, 1, 1, "model.94");
    IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *conv94->getOutput(0), 128, 3, 1, 1, "model.95");
    IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 128, 3, 1, 1, "model.96");
    IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 128, 3, 1, 1, "model.97");

    ITensor* input_tensor_98[] = { conv97->getOutput(0), conv96->getOutput(0), conv95->getOutput(0), conv94->getOutput(0),
        conv93->getOutput(0), conv92->getOutput(0), conv91->getOutput(0), conv90->getOutput(0), conv89->getOutput(0),
        conv88->getOutput(0) };
    IConcatenationLayer* concat98 = network->addConcatenation(input_tensor_98, 10);

    // conv99 is reused by the bottom-up path (concat117).
    IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *concat98->getOutput(0), 384, 1, 1, 0, "model.99");

    // Top-down path, step 3: merge the resized tensor with backbone output conv27.
    IElementWiseLayer* conv100 = convBnSilu(network, weightMap, *conv99->getOutput(0), 192, 1, 1, 0, "model.100");
    IResizeLayer* re101 = network->addResize(*conv100->getOutput(0));
    re101->setResizeMode(ResizeMode::kNEAREST);
    re101->setScales(scale, 3);
    IElementWiseLayer* conv102 = convBnSilu(network, weightMap, *conv27->getOutput(0), 192, 1, 1, 0, "model.102");
    ITensor* input_tensor_103[] = { conv102->getOutput(0), re101->getOutput(0) };
    IConcatenationLayer* concat103 = network->addConcatenation(input_tensor_103, 2);

    IElementWiseLayer* conv104 = convBnSilu(network, weightMap, *concat103->getOutput(0), 128, 1, 1, 0, "model.104");
    IElementWiseLayer* conv105 = convBnSilu(network, weightMap, *concat103->getOutput(0), 128, 1, 1, 0, "model.105");
    IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *conv105->getOutput(0), 64, 3, 1, 1, "model.106");
    IElementWiseLayer* conv107 = convBnSilu(network, weightMap, *conv106->getOutput(0), 64, 3, 1, 1, "model.107");
    IElementWiseLayer* conv108 = convBnSilu(network, weightMap, *conv107->getOutput(0), 64, 3, 1, 1, "model.108");
    IElementWiseLayer* conv109 = convBnSilu(network, weightMap, *conv108->getOutput(0), 64, 3, 1, 1, "model.109");
    IElementWiseLayer* conv110 = convBnSilu(network, weightMap, *conv109->getOutput(0), 64, 3, 1, 1, "model.110");
    IElementWiseLayer* conv111 = convBnSilu(network, weightMap, *conv110->getOutput(0), 64, 3, 1, 1, "model.111");
    IElementWiseLayer* conv112 = convBnSilu(network, weightMap, *conv111->getOutput(0), 64, 3, 1, 1, "model.112");
    IElementWiseLayer* conv113 = convBnSilu(network, weightMap, *conv112->getOutput(0), 64, 3, 1, 1, "model.113");

    ITensor* input_tensor_114[] = { conv113->getOutput(0), conv112->getOutput(0), conv111->getOutput(0), conv110->getOutput(0),
        conv109->getOutput(0), conv108->getOutput(0), conv107->getOutput(0), conv106->getOutput(0), conv105->getOutput(0),
        conv104->getOutput(0) };
    IConcatenationLayer* concat114 = network->addConcatenation(input_tensor_114, 10);

    // conv115 feeds the first detection branch (model.158).
    IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *concat114->getOutput(0), 192, 1, 1, 0, "model.115");

    // Bottom-up path, step 1: DownC output merged with conv99.
    auto conv116 = DownC(network, weightMap, *conv115->getOutput(0), 192, 384, "model.116");
    ITensor* input_tensor_117[] = { conv116->getOutput(0), conv99->getOutput(0) };
    IConcatenationLayer* concat117 = network->addConcatenation(input_tensor_117, 2);

    IElementWiseLayer* conv118 = convBnSilu(network, weightMap, *concat117->getOutput(0), 256, 1, 1, 0, "model.118");
    IElementWiseLayer* conv119 = convBnSilu(network, weightMap, *concat117->getOutput(0), 256, 1, 1, 0, "model.119");

    IElementWiseLayer* conv120 = convBnSilu(network, weightMap, *conv119->getOutput(0), 128, 3, 1, 1, "model.120");
    IElementWiseLayer* conv121 = convBnSilu(network, weightMap, *conv120->getOutput(0), 128, 3, 1, 1, "model.121");
    IElementWiseLayer* conv122 = convBnSilu(network, weightMap, *conv121->getOutput(0), 128, 3, 1, 1, "model.122");
    IElementWiseLayer* conv123 = convBnSilu(network, weightMap, *conv122->getOutput(0), 128, 3, 1, 1, "model.123");
    IElementWiseLayer* conv124 = convBnSilu(network, weightMap, *conv123->getOutput(0), 128, 3, 1, 1, "model.124");
    IElementWiseLayer* conv125 = convBnSilu(network, weightMap, *conv124->getOutput(0), 128, 3, 1, 1, "model.125");
    IElementWiseLayer* conv126 = convBnSilu(network, weightMap, *conv125->getOutput(0), 128, 3, 1, 1, "model.126");
    IElementWiseLayer* conv127 = convBnSilu(network, weightMap, *conv126->getOutput(0), 128, 3, 1, 1, "model.127");

    ITensor* input_tensor_128[] = { conv127->getOutput(0), conv126->getOutput(0), conv125->getOutput(0), conv124->getOutput(0),
        conv123->getOutput(0), conv122->getOutput(0), conv121->getOutput(0), conv120->getOutput(0), conv119->getOutput(0),
        conv118->getOutput(0) };
    IConcatenationLayer* concat128 = network->addConcatenation(input_tensor_128, 10);
    // conv129 feeds the second detection branch (model.159).
    IElementWiseLayer* conv129 = convBnSilu(network, weightMap, *concat128->getOutput(0), 384, 1, 1, 0, "model.129");

    // Bottom-up path, step 2: DownC output merged with conv83.
    auto conv130 = DownC(network, weightMap, *conv129->getOutput(0), 384, 576, "model.130");
    ITensor* input_tensor_131[] = { conv130->getOutput(0), conv83->getOutput(0) };
    IConcatenationLayer* concat131 = network->addConcatenation(input_tensor_131, 2);

    IElementWiseLayer* conv132 = convBnSilu(network, weightMap, *concat131->getOutput(0), 384, 1, 1, 0, "model.132");
    IElementWiseLayer* conv133 = convBnSilu(network, weightMap, *concat131->getOutput(0), 384, 1, 1, 0, "model.133");

    IElementWiseLayer* conv134 = convBnSilu(network, weightMap, *conv133->getOutput(0), 192, 3, 1, 1, "model.134");
    IElementWiseLayer* conv135 = convBnSilu(network, weightMap, *conv134->getOutput(0), 192, 3, 1, 1, "model.135");
    IElementWiseLayer* conv136 = convBnSilu(network, weightMap, *conv135->getOutput(0), 192, 3, 1, 1, "model.136");
    IElementWiseLayer* conv137 = convBnSilu(network, weightMap, *conv136->getOutput(0), 192, 3, 1, 1, "model.137");
    IElementWiseLayer* conv138 = convBnSilu(network, weightMap, *conv137->getOutput(0), 192, 3, 1, 1, "model.138");
    IElementWiseLayer* conv139 = convBnSilu(network, weightMap, *conv138->getOutput(0), 192, 3, 1, 1, "model.139");
    IElementWiseLayer* conv140 = convBnSilu(network, weightMap, *conv139->getOutput(0), 192, 3, 1, 1, "model.140");
    IElementWiseLayer* conv141 = convBnSilu(network, weightMap, *conv140->getOutput(0), 192, 3, 1, 1, "model.141");
    ITensor* input_tensor_142[] = { conv141->getOutput(0), conv140->getOutput(0), conv139->getOutput(0), conv138->getOutput(0),
        conv137->getOutput(0), conv136->getOutput(0), conv135->getOutput(0), conv134->getOutput(0), conv133->getOutput(0),
        conv132->getOutput(0) };
    IConcatenationLayer* concat142 = network->addConcatenation(input_tensor_142, 10);
    // conv143 feeds the third detection branch (model.160).
    IElementWiseLayer* conv143 = convBnSilu(network, weightMap, *concat142->getOutput(0), 576, 1, 1, 0, "model.143");

    // Bottom-up path, step 3: DownC output merged with the SPPCSPC output conv67.
    auto conv144 = DownC(network, weightMap, *conv143->getOutput(0), 576, 768, "model.144");
    ITensor* input_tensor_145[] = { conv144->getOutput(0), conv67->getOutput(0) };
    IConcatenationLayer* concat145 = network->addConcatenation(input_tensor_145, 2);

    IElementWiseLayer* conv146 = convBnSilu(network, weightMap, *concat145->getOutput(0), 512, 1, 1, 0, "model.146");
    IElementWiseLayer* conv147 = convBnSilu(network, weightMap, *concat145->getOutput(0), 512, 1, 1, 0, "model.147");

    IElementWiseLayer* conv148 = convBnSilu(network, weightMap, *conv147->getOutput(0), 256, 3, 1, 1, "model.148");
    IElementWiseLayer* conv149 = convBnSilu(network, weightMap, *conv148->getOutput(0), 256, 3, 1, 1, "model.149");
    IElementWiseLayer* conv150 = convBnSilu(network, weightMap, *conv149->getOutput(0), 256, 3, 1, 1, "model.150");
    IElementWiseLayer* conv151 = convBnSilu(network, weightMap, *conv150->getOutput(0), 256, 3, 1, 1, "model.151");
    IElementWiseLayer* conv152 = convBnSilu(network, weightMap, *conv151->getOutput(0), 256, 3, 1, 1, "model.152");
    IElementWiseLayer* conv153 = convBnSilu(network, weightMap, *conv152->getOutput(0), 256, 3, 1, 1, "model.153");
    IElementWiseLayer* conv154 = convBnSilu(network, weightMap, *conv153->getOutput(0), 256, 3, 1, 1, "model.154");
    IElementWiseLayer* conv155 = convBnSilu(network, weightMap, *conv154->getOutput(0), 256, 3, 1, 1, "model.155");
    ITensor* input_tensor_156[] = { conv155->getOutput(0), conv154->getOutput(0), conv153->getOutput(0), conv152->getOutput(0),
        conv151->getOutput(0), conv150->getOutput(0), conv149->getOutput(0), conv148->getOutput(0), conv147->getOutput(0),
        conv146->getOutput(0) };
    IConcatenationLayer* concat156 = network->addConcatenation(input_tensor_156, 10);
    // conv157 feeds the fourth detection branch (model.161).
    IElementWiseLayer* conv157 = convBnSilu(network, weightMap, *concat156->getOutput(0), 768, 1, 1, 0, "model.157");

    // One extra conv per detection branch before the output convolutions.
    IElementWiseLayer* conv158 = convBnSilu(network, weightMap, *conv115->getOutput(0), 384, 3, 1, 1, "model.158");
    IElementWiseLayer* conv159 = convBnSilu(network, weightMap, *conv129->getOutput(0), 768, 3, 1, 1, "model.159");
    IElementWiseLayer* conv160 = convBnSilu(network, weightMap, *conv143->getOutput(0), 1152, 3, 1, 1, "model.160");
    IElementWiseLayer* conv161 = convBnSilu(network, weightMap, *conv157->getOutput(0), 1536, 3, 1, 1, "model.161");

    // out
    // Four 1x1 output convolutions, each producing kNumAnchor * (kNumClass + 5) channels.
    // The "cv105.N" layer names are runtime strings and are kept unchanged.
    IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv158->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.162.m.0.weight"], weightMap["model.162.m.0.bias"]);
    assert(cv105_0);
    cv105_0->setName("cv105.0");
    IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv159->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.162.m.1.weight"], weightMap["model.162.m.1.bias"]);
    assert(cv105_1);
    cv105_1->setName("cv105.1");
    IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv160->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.162.m.2.weight"], weightMap["model.162.m.2.bias"]);
    assert(cv105_2);
    cv105_2->setName("cv105.2");
    IConvolutionLayer* cv105_3 = network->addConvolutionNd(*conv161->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.162.m.3.weight"], weightMap["model.162.m.3.bias"]);
    assert(cv105_3);
    cv105_3->setName("cv105.3");

    /*------------detect-----------*/
    auto yolo = addYoLoLayer(network, weightMap, "model.162", std::vector<IConvolutionLayer*>{cv105_0, cv105_1, cv105_2, cv105_3});
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    // NOTE(review): the calibrator is heap-allocated and not deleted anywhere in this
    // function — confirm who owns it once the build has finished.
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when the builder actually produced a serialized engine;
    // a null result is still returned to the caller after cleanup.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}

// Builds the YOLOv7-E6 network layer by layer from a .wts weight file and
// returns the serialized TensorRT engine.
//
// Parameters:
//   maxBatchSize - forwarded to builder->setMaxBatchSize().
//   builder      - TensorRT builder used to create the network and build the engine.
//   config       - builder configuration; workspace size and (depending on the
//                  USE_FP16 / USE_INT8 macros) precision flags are set on it here.
//   dt           - data type of the network input tensor.
//   wts_path     - path handed to loadWeights() to obtain the name -> Weights map.
//
// Returns the IHostMemory produced by buildSerializedNetwork(). The pointer is
// returned as-is, so it is nullptr when the build fails; callers must check it.
// NOTE(review): the caller presumably owns the returned memory - confirm at call sites.
//
// Naming convention used below: the numeric suffix of every local layer variable
// (conv12, concat11, re59, ...) matches the index in the "model.N" weight-name
// prefix. Indices used by concat/resize layers have no weights of their own.
// NOTE(review): the trailing convBnSilu arguments appear to be
// (output channels, kernel size, stride, padding, weight-name prefix) - confirm
// against the convBnSilu definition.
IHostMemory* build_engine_yolov7e6(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) {
    std::map<std::string, Weights> weightMap = loadWeights(wts_path);

    // Network is created with flags == 0U; the input is declared as a 3-D tensor { 3, kInputH, kInputW }.
    INetworkDefinition* network = builder->createNetworkV2(0U);
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
    assert(data);

    /*----------------------------------yolov7e6 backbone-----------------------------------------*/
    // model.0: ReOrg helper applied to the raw input.
    auto* conv0 = ReOrg(network, weightMap, *data, 3);


    IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 80, 3, 1, 1, "model.1");
    auto conv2 = DownC(network, weightMap, *conv1->getOutput(0), 80, 160, "model.2");

    // Stage model.3 - model.12: two parallel 1x1 branches from conv2, a chain of six 3x3 convs
    // on the second branch, then a 5-way concat of every other chain output plus both branches.
    IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.3");
    IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.4");

    IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv4->getOutput(0), 64, 3, 1, 1, "model.5");
    IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6");
    IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7");
    IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8");
    IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9");
    IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *conv9->getOutput(0), 64, 3, 1, 1, "model.10");

    ITensor* input_tensor_11[] = { conv10->getOutput(0), conv8->getOutput(0),conv6->getOutput(0), conv4->getOutput(0),conv3->getOutput(0) };
    IConcatenationLayer* concat11 = network->addConcatenation(input_tensor_11, 5);

    IElementWiseLayer* conv12 = convBnSilu(network, weightMap, *concat11->getOutput(0), 160, 1, 1, 0, "model.12");


    // Stage model.13 - model.23: same pattern as above with 128-channel branches.
    auto conv13 = DownC(network, weightMap, *conv12->getOutput(0), 160, 320, "model.13");
    IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *conv13->getOutput(0), 128, 1, 1, 0, "model.14");
    IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *conv13->getOutput(0), 128, 1, 1, 0, "model.15");

    IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv15->getOutput(0), 128, 3, 1, 1, "model.16");
    IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv16->getOutput(0), 128, 3, 1, 1, "model.17");
    IElementWiseLayer* conv18 = convBnSilu(network, weightMap, *conv17->getOutput(0), 128, 3, 1, 1, "model.18");
    IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *conv18->getOutput(0), 128, 3, 1, 1, "model.19");
    IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 128, 3, 1, 1, "model.20");
    IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 128, 3, 1, 1, "model.21");
    ITensor* input_tensor_22[] = { conv21->getOutput(0), conv19->getOutput(0),conv17->getOutput(0), conv15->getOutput(0),conv14->getOutput(0) };
    IConcatenationLayer* concat22 = network->addConcatenation(input_tensor_22, 5);

    // conv23 is reused later by the head (input of conv88).
    IElementWiseLayer* conv23 = convBnSilu(network, weightMap, *concat22->getOutput(0), 320, 1, 1, 0, "model.23");


    // Stage model.24 - model.34: 256-channel branches.
    auto conv24 = DownC(network, weightMap, *conv23->getOutput(0), 320, 640, "model.24");
    IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 256, 1, 1, 0, "model.25");
    IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *conv24->getOutput(0), 256, 1, 1, 0, "model.26");

    IElementWiseLayer* conv27 = convBnSilu(network, weightMap, *conv26->getOutput(0), 256, 3, 1, 1, "model.27");
    IElementWiseLayer* conv28 = convBnSilu(network, weightMap, *conv27->getOutput(0), 256, 3, 1, 1, "model.28");
    IElementWiseLayer* conv29 = convBnSilu(network, weightMap, *conv28->getOutput(0), 256, 3, 1, 1, "model.29");
    IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *conv29->getOutput(0), 256, 3, 1, 1, "model.30");
    IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv30->getOutput(0), 256, 3, 1, 1, "model.31");
    IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 256, 3, 1, 1, "model.32");
    ITensor* input_tensor_33[] = { conv32->getOutput(0), conv30->getOutput(0),conv28->getOutput(0), conv26->getOutput(0),conv25->getOutput(0) };
    IConcatenationLayer* concat33 = network->addConcatenation(input_tensor_33, 5);

    // conv34 is reused later by the head (input of conv74).
    IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *concat33->getOutput(0), 640, 1, 1, 0, "model.34");

    // Stage model.35 - model.45: 384-channel branches.
    auto conv35 = DownC(network, weightMap, *conv34->getOutput(0), 640, 960, "model.35");
    IElementWiseLayer* conv36 = convBnSilu(network, weightMap, *conv35->getOutput(0), 384, 1, 1, 0, "model.36");
    IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *conv35->getOutput(0), 384, 1, 1, 0, "model.37");

    IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 384, 3, 1, 1, "model.38");
    IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *conv38->getOutput(0), 384, 3, 1, 1, "model.39");
    IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv39->getOutput(0), 384, 3, 1, 1, "model.40");
    IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 384, 3, 1, 1, "model.41");
    IElementWiseLayer* conv42 = convBnSilu(network, weightMap, *conv41->getOutput(0), 384, 3, 1, 1, "model.42");
    IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *conv42->getOutput(0), 384, 3, 1, 1, "model.43");

    ITensor* input_tensor_44[] = { conv43->getOutput(0), conv41->getOutput(0),conv39->getOutput(0), conv37->getOutput(0),conv36->getOutput(0) };
    IConcatenationLayer* concat44 = network->addConcatenation(input_tensor_44, 5);
    // conv45 is reused later by the head (input of conv60).
    IElementWiseLayer* conv45 = convBnSilu(network, weightMap, *concat44->getOutput(0), 960, 1, 1, 0, "model.45");

    // Stage model.46 - model.56: 512-channel branches.
    auto conv46 = DownC(network, weightMap, *conv45->getOutput(0), 960, 1280, "model.46");
    IElementWiseLayer* conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 512, 1, 1, 0, "model.47");
    IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv46->getOutput(0), 512, 1, 1, 0, "model.48");

    IElementWiseLayer* conv49 = convBnSilu(network, weightMap, *conv48->getOutput(0), 512, 3, 1, 1, "model.49");
    IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *conv49->getOutput(0), 512, 3, 1, 1, "model.50");
    IElementWiseLayer* conv51 = convBnSilu(network, weightMap, *conv50->getOutput(0), 512, 3, 1, 1, "model.51");
    IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *conv51->getOutput(0), 512, 3, 1, 1, "model.52");
    IElementWiseLayer* conv53 = convBnSilu(network, weightMap, *conv52->getOutput(0), 512, 3, 1, 1, "model.53");
    IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv53->getOutput(0), 512, 3, 1, 1, "model.54");
    ITensor* input_tensor_55[] = { conv54->getOutput(0), conv52->getOutput(0),conv50->getOutput(0), conv48->getOutput(0),conv47->getOutput(0) };
    IConcatenationLayer* concat55 = network->addConcatenation(input_tensor_55, 5);
    IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *concat55->getOutput(0), 1280, 1, 1, 0, "model.56");

    //------------------------yolov7e6 head-------------------------------
    // conv57 (SPPCSPC) is reused at the end of the head (concat125).
    auto conv57 = SPPCSPC(network, weightMap, *conv56->getOutput(0), 640, "model.57");
    IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *conv57->getOutput(0), 480, 1, 1, 0, "model.58");


    // Scale factors shared by all three nearest-neighbour resize layers:
    // 1x on the first dimension, 2x on the remaining two.
    float scale[] = { 1.0, 2.0, 2.0 };
    IResizeLayer* re59 = network->addResize(*conv58->getOutput(0));
    re59->setResizeMode(ResizeMode::kNEAREST);
    re59->setScales(scale, 3);

    // First upsampling path: resized conv58 joined with backbone feature conv45.
    IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv45->getOutput(0), 480, 1, 1, 0, "model.60");
    ITensor* input_tensor_61[] = { conv60->getOutput(0), re59->getOutput(0) };
    IConcatenationLayer* concat61 = network->addConcatenation(input_tensor_61, 2);
    IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *concat61->getOutput(0), 384, 1, 1, 0, "model.62");
    IElementWiseLayer* conv63 = convBnSilu(network, weightMap, *concat61->getOutput(0), 384, 1, 1, 0, "model.63");

    IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *conv63->getOutput(0), 192, 3, 1, 1, "model.64");
    IElementWiseLayer* conv65 = convBnSilu(network, weightMap, *conv64->getOutput(0), 192, 3, 1, 1, "model.65");
    IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *conv65->getOutput(0), 192, 3, 1, 1, "model.66");
    IElementWiseLayer* conv67 = convBnSilu(network, weightMap, *conv66->getOutput(0), 192, 3, 1, 1, "model.67");
    IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *conv67->getOutput(0), 192, 3, 1, 1, "model.68");
    IElementWiseLayer* conv69 = convBnSilu(network, weightMap, *conv68->getOutput(0), 192, 3, 1, 1, "model.69");

    // Unlike the backbone stages, the head concatenates every layer of the chain (8 inputs).
    ITensor* input_tensor_70[] = { conv69->getOutput(0), conv68->getOutput(0),conv67->getOutput(0), conv66->getOutput(0),
        conv65->getOutput(0), conv64->getOutput(0), conv63->getOutput(0), conv62->getOutput(0) };
    IConcatenationLayer* concat70 = network->addConcatenation(input_tensor_70, 8);
    // conv71 is reused on the downsampling path (concat113).
    IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *concat70->getOutput(0), 480, 1, 1, 0, "model.71");

    // Second upsampling path: resized conv72 joined with backbone feature conv34.
    IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *conv71->getOutput(0), 320, 1, 1, 0, "model.72");
    IResizeLayer* re73 = network->addResize(*conv72->getOutput(0));
    re73->setResizeMode(ResizeMode::kNEAREST);
    re73->setScales(scale, 3);
    IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv34->getOutput(0), 320, 1, 1, 0, "model.74");
    ITensor* input_tensor_75[] = { conv74->getOutput(0), re73->getOutput(0) };
    IConcatenationLayer* concat75 = network->addConcatenation(input_tensor_75, 2);

    IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *concat75->getOutput(0), 256, 1, 1, 0, "model.76");
    IElementWiseLayer* conv77 = convBnSilu(network, weightMap, *concat75->getOutput(0), 256, 1, 1, 0, "model.77");

    IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *conv77->getOutput(0), 128, 3, 1, 1, "model.78");
    IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv78->getOutput(0), 128, 3, 1, 1, "model.79");
    IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv79->getOutput(0), 128, 3, 1, 1, "model.80");
    IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 128, 3, 1, 1, "model.81");
    IElementWiseLayer* conv82 = convBnSilu(network, weightMap, *conv81->getOutput(0), 128, 3, 1, 1, "model.82");
    IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *conv82->getOutput(0), 128, 3, 1, 1, "model.83");

    ITensor* input_tensor_84[] = { conv83->getOutput(0), conv82->getOutput(0),conv81->getOutput(0), conv80->getOutput(0),
        conv79->getOutput(0), conv78->getOutput(0), conv77->getOutput(0), conv76->getOutput(0) };
    IConcatenationLayer* concat84 = network->addConcatenation(input_tensor_84, 8);

    // conv85 is reused on the downsampling path (concat101).
    IElementWiseLayer* conv85 = convBnSilu(network, weightMap, *concat84->getOutput(0), 320, 1, 1, 0, "model.85");

    // Third upsampling path: resized conv86 joined with backbone feature conv23.
    IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *conv85->getOutput(0), 160, 1, 1, 0, "model.86");
    IResizeLayer* re87 = network->addResize(*conv86->getOutput(0));
    re87->setResizeMode(ResizeMode::kNEAREST);
    re87->setScales(scale, 3);
    IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *conv23->getOutput(0), 160, 1, 1, 0, "model.88");
    ITensor* input_tensor_89[] = { conv88->getOutput(0), re87->getOutput(0) };
    IConcatenationLayer* concat89 = network->addConcatenation(input_tensor_89, 2);

    IElementWiseLayer* conv90 = convBnSilu(network, weightMap, *concat89->getOutput(0), 128, 1, 1, 0, "model.90");
    IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *concat89->getOutput(0), 128, 1, 1, 0, "model.91");
    IElementWiseLayer* conv92 = convBnSilu(network, weightMap, *conv91->getOutput(0), 64, 3, 1, 1, "model.92");
    IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *conv92->getOutput(0), 64, 3, 1, 1, "model.93");
    IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *conv93->getOutput(0), 64, 3, 1, 1, "model.94");
    IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *conv94->getOutput(0), 64, 3, 1, 1, "model.95");
    IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 64, 3, 1, 1, "model.96");
    IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 64, 3, 1, 1, "model.97");

    ITensor* input_tensor_98[] = { conv97->getOutput(0), conv96->getOutput(0),conv95->getOutput(0), conv94->getOutput(0),
       conv93->getOutput(0), conv92->getOutput(0), conv91->getOutput(0), conv90->getOutput(0) };
    IConcatenationLayer* concat98 = network->addConcatenation(input_tensor_98, 8);

    // conv99 feeds the first detection branch (conv136) and the first DownC of the downsampling path.
    IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *concat98->getOutput(0), 160, 1, 1, 0, "model.99");

    // First downsampling path: DownC(conv99) joined with conv85.
    auto conv100 = DownC(network, weightMap, *conv99->getOutput(0), 160, 320, "model.100");
    ITensor* input_tensor_101[] = { conv100->getOutput(0), conv85->getOutput(0) };
    IConcatenationLayer* concat101 = network->addConcatenation(input_tensor_101, 2);

    IElementWiseLayer* conv102 = convBnSilu(network, weightMap, *concat101->getOutput(0), 256, 1, 1, 0, "model.102");
    IElementWiseLayer* conv103 = convBnSilu(network, weightMap, *concat101->getOutput(0), 256, 1, 1, 0, "model.103");

    IElementWiseLayer* conv104 = convBnSilu(network, weightMap, *conv103->getOutput(0), 128, 3, 1, 1, "model.104");
    IElementWiseLayer* conv105 = convBnSilu(network, weightMap, *conv104->getOutput(0), 128, 3, 1, 1, "model.105");
    IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *conv105->getOutput(0), 128, 3, 1, 1, "model.106");
    IElementWiseLayer* conv107 = convBnSilu(network, weightMap, *conv106->getOutput(0), 128, 3, 1, 1, "model.107");
    IElementWiseLayer* conv108 = convBnSilu(network, weightMap, *conv107->getOutput(0), 128, 3, 1, 1, "model.108");
    IElementWiseLayer* conv109 = convBnSilu(network, weightMap, *conv108->getOutput(0), 128, 3, 1, 1, "model.109");

    ITensor* input_tensor_110[] = { conv109->getOutput(0), conv108->getOutput(0),conv107->getOutput(0), conv106->getOutput(0),
       conv105->getOutput(0), conv104->getOutput(0), conv103->getOutput(0), conv102->getOutput(0) };
    IConcatenationLayer* concat110 = network->addConcatenation(input_tensor_110, 8);
    // conv111 feeds the second detection branch (conv137).
    IElementWiseLayer* conv111 = convBnSilu(network, weightMap, *concat110->getOutput(0), 320, 1, 1, 0, "model.111");

    // Second downsampling path: DownC(conv111) joined with conv71.
    auto conv112 = DownC(network, weightMap, *conv111->getOutput(0), 320, 480, "model.112");
    ITensor* input_tensor_113[] = { conv112->getOutput(0), conv71->getOutput(0) };
    IConcatenationLayer* concat113 = network->addConcatenation(input_tensor_113, 2);

    IElementWiseLayer* conv114 = convBnSilu(network, weightMap, *concat113->getOutput(0), 384, 1, 1, 0, "model.114");
    IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *concat113->getOutput(0), 384, 1, 1, 0, "model.115");

    IElementWiseLayer* conv116 = convBnSilu(network, weightMap, *conv115->getOutput(0), 192, 3, 1, 1, "model.116");
    IElementWiseLayer* conv117 = convBnSilu(network, weightMap, *conv116->getOutput(0), 192, 3, 1, 1, "model.117");
    IElementWiseLayer* conv118 = convBnSilu(network, weightMap, *conv117->getOutput(0), 192, 3, 1, 1, "model.118");
    IElementWiseLayer* conv119 = convBnSilu(network, weightMap, *conv118->getOutput(0), 192, 3, 1, 1, "model.119");
    IElementWiseLayer* conv120 = convBnSilu(network, weightMap, *conv119->getOutput(0), 192, 3, 1, 1, "model.120");
    IElementWiseLayer* conv121 = convBnSilu(network, weightMap, *conv120->getOutput(0), 192, 3, 1, 1, "model.121");
    ITensor* input_tensor_122[] = { conv121->getOutput(0), conv120->getOutput(0),conv119->getOutput(0), conv118->getOutput(0),
      conv117->getOutput(0), conv116->getOutput(0), conv115->getOutput(0), conv114->getOutput(0) };
    IConcatenationLayer* concat122 = network->addConcatenation(input_tensor_122, 8);
    // conv123 feeds the third detection branch (conv138).
    IElementWiseLayer* conv123 = convBnSilu(network, weightMap, *concat122->getOutput(0), 480, 1, 1, 0, "model.123");

    // Third downsampling path: DownC(conv123) joined with the SPPCSPC output conv57.
    auto conv124 = DownC(network, weightMap, *conv123->getOutput(0), 480, 640, "model.124");
    ITensor* input_tensor_125[] = { conv124->getOutput(0), conv57->getOutput(0) };
    IConcatenationLayer* concat125 = network->addConcatenation(input_tensor_125, 2);

    IElementWiseLayer* conv126 = convBnSilu(network, weightMap, *concat125->getOutput(0), 512, 1, 1, 0, "model.126");
    IElementWiseLayer* conv127 = convBnSilu(network, weightMap, *concat125->getOutput(0), 512, 1, 1, 0, "model.127");

    IElementWiseLayer* conv128 = convBnSilu(network, weightMap, *conv127->getOutput(0), 256, 3, 1, 1, "model.128");
    IElementWiseLayer* conv129 = convBnSilu(network, weightMap, *conv128->getOutput(0), 256, 3, 1, 1, "model.129");
    IElementWiseLayer* conv130 = convBnSilu(network, weightMap, *conv129->getOutput(0), 256, 3, 1, 1, "model.130");
    IElementWiseLayer* conv131 = convBnSilu(network, weightMap, *conv130->getOutput(0), 256, 3, 1, 1, "model.131");
    IElementWiseLayer* conv132 = convBnSilu(network, weightMap, *conv131->getOutput(0), 256, 3, 1, 1, "model.132");
    IElementWiseLayer* conv133 = convBnSilu(network, weightMap, *conv132->getOutput(0), 256, 3, 1, 1, "model.133");
    ITensor* input_tensor_134[] = { conv133->getOutput(0), conv132->getOutput(0),conv131->getOutput(0), conv130->getOutput(0),
     conv129->getOutput(0), conv128->getOutput(0), conv127->getOutput(0), conv126->getOutput(0) };
    IConcatenationLayer* concat134 = network->addConcatenation(input_tensor_134, 8);
    // conv135 feeds the fourth detection branch (conv139).
    IElementWiseLayer* conv135 = convBnSilu(network, weightMap, *concat134->getOutput(0), 640, 1, 1, 0, "model.135");

    // One 3x3 conv per detection scale, applied to conv99 / conv111 / conv123 / conv135.
    IElementWiseLayer* conv136 = convBnSilu(network, weightMap, *conv99->getOutput(0), 320, 3, 1, 1, "model.136");
    IElementWiseLayer* conv137 = convBnSilu(network, weightMap, *conv111->getOutput(0), 640, 3, 1, 1, "model.137");
    IElementWiseLayer* conv138 = convBnSilu(network, weightMap, *conv123->getOutput(0), 960, 3, 1, 1, "model.138");
    IElementWiseLayer* conv139 = convBnSilu(network, weightMap, *conv135->getOutput(0), 1280, 3, 1, 1, "model.139");


    // out
    // Four 1x1 prediction convolutions, each producing kNumAnchor * (kNumClass + 5) output maps,
    // with weights and biases taken from model.140.m.{0..3}.
    IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv136->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.140.m.0.weight"], weightMap["model.140.m.0.bias"]);
    assert(cv105_0);
    cv105_0->setName("cv105.0");
    IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv137->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.140.m.1.weight"], weightMap["model.140.m.1.bias"]);
    assert(cv105_1);
    cv105_1->setName("cv105.1");
    IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv138->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.140.m.2.weight"], weightMap["model.140.m.2.bias"]);
    assert(cv105_2);
    cv105_2->setName("cv105.2");
    IConvolutionLayer* cv105_3 = network->addConvolutionNd(*conv139->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.140.m.3.weight"], weightMap["model.140.m.3.bias"]);
    assert(cv105_3);
    cv105_3->setName("cv105.3");

    /*------------detect-----------*/
    // The four prediction convolutions are handed to addYoLoLayer; its first output is the network output.
    auto yolo = addYoLoLayer(network, weightMap, "model.140", std::vector<IConvolutionLayer*>{cv105_0, cv105_1, cv105_2, cv105_3});
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));
    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    // NOTE(review): the calibrator is never deleted in this function, and config keeps a pointer to it.
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Report the real outcome: a null result means the build failed, so do not claim success.
    // Cleanup below still runs and the (possibly null) pointer is returned unchanged.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;

    // Release host memory
    // (the weight buffers referenced by weightMap; presumably malloc'ed by loadWeights - confirm)
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}

IHostMemory* build_engine_yolov7w6(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) {
    std::map<std::string, Weights> weightMap = loadWeights(wts_path);

    INetworkDefinition* network = builder->createNetworkV2(0U);
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
    assert(data);

    /*----------------------------------yolov7w6 backbone-----------------------------------------*/
    auto* conv0 = ReOrg(network, weightMap, *data, 3);

    IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 64, 3, 1, 1, "model.1");

    IElementWiseLayer* conv2 = convBnSilu(network, weightMap, *conv1->getOutput(0), 128, 3, 2, 1, "model.2");

    IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.3");
    IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.4");

    IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv4->getOutput(0), 64, 3, 1, 1, "model.5");
    IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6");
    IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7");
    IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8");

    ITensor* input_tensor_9[] = { conv8->getOutput(0), conv6->getOutput(0), conv4->getOutput(0), conv3->getOutput(0) };
    IConcatenationLayer* concat9 = network->addConcatenation(input_tensor_9, 4);
    concat9->setAxis(0);
    IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *concat9->getOutput(0), 128, 1, 1, 0, "model.10");

    IElementWiseLayer* conv11 = convBnSilu(network, weightMap, *conv10->getOutput(0), 256, 3, 2, 1, "model.11");

    IElementWiseLayer* conv12 = convBnSilu(network, weightMap, *conv11->getOutput(0), 128, 1, 1, 0, "model.12");
    IElementWiseLayer* conv13 = convBnSilu(network, weightMap, *conv11->getOutput(0), 128, 1, 1, 0, "model.13");
    IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *conv13->getOutput(0), 128, 3, 1, 1, "model.14");
    IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *conv14->getOutput(0), 128, 3, 1, 1, "model.15");
    IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv15->getOutput(0), 128, 3, 1, 1, "model.16");
    IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv16->getOutput(0), 128, 3, 1, 1, "model.17");
    ITensor* input_tensor_18[] = { conv17->getOutput(0), conv15->getOutput(0), conv13->getOutput(0), conv12->getOutput(0) };
    IConcatenationLayer* concat18 = network->addConcatenation(input_tensor_18, 4);
    concat18->setAxis(0);
    IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *concat18->getOutput(0), 256, 1, 1, 0, "model.19");

    IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 512, 3, 2, 1, "model.20");

    IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 256, 1, 1, 0, "model.21");
    IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *conv20->getOutput(0), 256, 1, 1, 0, "model.22");
    IElementWiseLayer* conv23 = convBnSilu(network, weightMap, *conv22->getOutput(0), 256, 3, 1, 1, "model.23");
    IElementWiseLayer* conv24 = convBnSilu(network, weightMap, *conv23->getOutput(0), 256, 3, 1, 1, "model.24");
    IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 256, 3, 1, 1, "model.25");
    IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *conv25->getOutput(0), 256, 3, 1, 1, "model.26");
    ITensor* input_tensor_27[] = { conv26->getOutput(0), conv24->getOutput(0), conv22->getOutput(0), conv21->getOutput(0) };
    IConcatenationLayer* concat27 = network->addConcatenation(input_tensor_27, 4);
    concat27->setAxis(0);

    IElementWiseLayer* conv28 = convBnSilu(network, weightMap, *concat27->getOutput(0), 512, 1, 1, 0, "model.28");

    IElementWiseLayer* conv29 = convBnSilu(network, weightMap, *conv28->getOutput(0), 768, 3, 2, 1, "model.29");

    IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *conv29->getOutput(0), 384, 1, 1, 0, "model.30");
    IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv29->getOutput(0), 384, 1, 1, 0, "model.31");
    IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 384, 3, 1, 1, "model.32");
    IElementWiseLayer* conv33 = convBnSilu(network, weightMap, *conv32->getOutput(0), 384, 3, 1, 1, "model.33");
    IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *conv33->getOutput(0), 384, 3, 1, 1, "model.34");
    IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *conv34->getOutput(0), 384, 3, 1, 1, "model.35");
    ITensor* input_tensor_36[] = { conv35->getOutput(0), conv33->getOutput(0), conv31->getOutput(0), conv30->getOutput(0) };
    IConcatenationLayer* concat36 = network->addConcatenation(input_tensor_36, 4);
    IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *concat36->getOutput(0), 768, 1, 1, 0, "model.37");

    IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 1024, 3, 2, 1, "model.38");

    IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *conv38->getOutput(0), 512, 1, 1, 0, "model.39");
    IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv38->getOutput(0), 512, 1, 1, 0, "model.40");
    IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 512, 3, 1, 1, "model.41");
    IElementWiseLayer* conv42 = convBnSilu(network, weightMap, *conv41->getOutput(0), 512, 3, 1, 1, "model.42");
    IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *conv42->getOutput(0), 512, 3, 1, 1, "model.43");
    IElementWiseLayer* conv44 = convBnSilu(network, weightMap, *conv43->getOutput(0), 512, 3, 1, 1, "model.44");
    ITensor* input_tensor_45[] = { conv44->getOutput(0), conv42->getOutput(0), conv40->getOutput(0), conv39->getOutput(0) };
    IConcatenationLayer* concat45 = network->addConcatenation(input_tensor_45, 4);
    IElementWiseLayer* conv46 = convBnSilu(network, weightMap, *concat45->getOutput(0), 1024, 1, 1, 0, "model.46");

    auto conv47 = SPPCSPC(network, weightMap, *conv46->getOutput(0), 512, "model.47");
    IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv47->getOutput(0), 384, 1, 1, 0, "model.48");

    float scale[] = { 1.0, 2.0, 2.0 };
    IResizeLayer* re49 = network->addResize(*conv48->getOutput(0));
    re49->setResizeMode(ResizeMode::kNEAREST);
    re49->setScales(scale, 3);

    IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *conv37->getOutput(0), 384, 1, 1, 0, "model.50");
    ITensor* input_tensor_51[] = { conv50->getOutput(0), re49->getOutput(0) };
    IConcatenationLayer* concat51 = network->addConcatenation(input_tensor_51, 2);

    IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *concat51->getOutput(0), 384, 1, 1, 0, "model.52");
    IElementWiseLayer* conv53 = convBnSilu(network, weightMap, *concat51->getOutput(0), 384, 1, 1, 0, "model.53");
    IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv53->getOutput(0), 192, 3, 1, 1, "model.54");
    IElementWiseLayer* conv55 = convBnSilu(network, weightMap, *conv54->getOutput(0), 192, 3, 1, 1, "model.55");
    IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *conv55->getOutput(0), 192, 3, 1, 1, "model.56");
    IElementWiseLayer* conv57 = convBnSilu(network, weightMap, *conv56->getOutput(0), 192, 3, 1, 1, "model.57");

    ITensor* input_tensor_58[] = { conv57->getOutput(0), conv56->getOutput(0), conv55->getOutput(0), conv54->getOutput(0), conv53->getOutput(0), conv52->getOutput(0) };
    IConcatenationLayer* concat58 = network->addConcatenation(input_tensor_58, 6);

    IElementWiseLayer* conv59 = convBnSilu(network, weightMap, *concat58->getOutput(0), 384, 1, 1, 0, "model.59");

    IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 256, 1, 1, 0, "model.60");
    IResizeLayer* re61 = network->addResize(*conv60->getOutput(0));
    re61->setResizeMode(ResizeMode::kNEAREST);
    re61->setScales(scale, 3);
    IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.62");
    ITensor* input_tensor_63[] = { conv62->getOutput(0), re61->getOutput(0) };
    IConcatenationLayer* concat63 = network->addConcatenation(input_tensor_63, 2);

    IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *concat63->getOutput(0), 256, 1, 1, 0, "model.64");
    IElementWiseLayer* conv65 = convBnSilu(network, weightMap, *concat63->getOutput(0), 256, 1, 1, 0, "model.65");
    IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *conv65->getOutput(0), 128, 3, 1, 1, "model.66");
    IElementWiseLayer* conv67 = convBnSilu(network, weightMap, *conv66->getOutput(0), 128, 3, 1, 1, "model.67");
    IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *conv67->getOutput(0), 128, 3, 1, 1, "model.68");
    IElementWiseLayer* conv69 = convBnSilu(network, weightMap, *conv68->getOutput(0), 128, 3, 1, 1, "model.69");

    ITensor* input_tensor_70[] = { conv69->getOutput(0), conv68->getOutput(0), conv67->getOutput(0), conv66->getOutput(0), conv65->getOutput(0), conv64->getOutput(0) };
    IConcatenationLayer* concat70 = network->addConcatenation(input_tensor_70, 6);

    IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *concat70->getOutput(0), 256, 1, 1, 0, "model.71");
    IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *conv71->getOutput(0), 128, 1, 1, 0, "model.72");
    IResizeLayer* re73 = network->addResize(*conv72->getOutput(0));
    re73->setResizeMode(ResizeMode::kNEAREST);
    re73->setScales(scale, 3);

    IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv19->getOutput(0), 128, 1, 1, 0, "model.74");
    ITensor* input_tensor_75[] = { conv74->getOutput(0), re73->getOutput(0) };
    IConcatenationLayer* concat75 = network->addConcatenation(input_tensor_75, 2);
    IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *concat75->getOutput(0), 128, 1, 1, 0, "model.76");
    IElementWiseLayer* conv77 = convBnSilu(network, weightMap, *concat75->getOutput(0), 128, 1, 1, 0, "model.77");

    IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *conv77->getOutput(0), 64, 3, 1, 1, "model.78");
    IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv78->getOutput(0), 64, 3, 1, 1, "model.79");
    IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv79->getOutput(0), 64, 3, 1, 1, "model.80");
    IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 64, 3, 1, 1, "model.81");
    ITensor* input_tensor_82[] = { conv81->getOutput(0), conv80->getOutput(0), conv79->getOutput(0), conv78->getOutput(0), conv77->getOutput(0), conv76->getOutput(0) };
    IConcatenationLayer* concat82 = network->addConcatenation(input_tensor_82, 6);

    IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *concat82->getOutput(0), 128, 1, 1, 0, "model.83");

    IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 256, 3, 2, 1, "model.84");
    ITensor* input_tensor_85[] = { conv84->getOutput(0), conv71->getOutput(0) };
    IConcatenationLayer* concat85 = network->addConcatenation(input_tensor_85, 2);

    IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *concat85->getOutput(0), 256, 1, 1, 0, "model.86");
    IElementWiseLayer* conv87 = convBnSilu(network, weightMap, *concat85->getOutput(0), 256, 1, 1, 0, "model.87");
    IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *conv87->getOutput(0), 128, 3, 1, 1, "model.88");
    IElementWiseLayer* conv89 = convBnSilu(network, weightMap, *conv88->getOutput(0), 128, 3, 1, 1, "model.89");
    IElementWiseLayer* conv90 = convBnSilu(network, weightMap, *conv89->getOutput(0), 128, 3, 1, 1, "model.90");
    IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv90->getOutput(0), 128, 3, 1, 1, "model.91");

    ITensor* input_tensor_92[] = { conv91->getOutput(0), conv90->getOutput(0), conv89->getOutput(0), conv88->getOutput(0), conv87->getOutput(0), conv86->getOutput(0) };
    IConcatenationLayer* concat92 = network->addConcatenation(input_tensor_92, 6);

    IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *concat92->getOutput(0), 256, 1, 1, 0, "model.93");

    IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *conv93->getOutput(0), 384, 3, 2, 1, "model.94");
    ITensor* input_tensor_95[] = { conv94->getOutput(0), conv59->getOutput(0) };
    IConcatenationLayer* concat95 = network->addConcatenation(input_tensor_95, 2);

    IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *concat95->getOutput(0), 384, 1, 1, 0, "model.96");
    IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *concat95->getOutput(0), 384, 1, 1, 0, "model.97");

    IElementWiseLayer* conv98 = convBnSilu(network, weightMap, *conv97->getOutput(0), 192, 3, 1, 1, "model.98");
    IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *conv98->getOutput(0), 192, 3, 1, 1, "model.99");
    IElementWiseLayer* conv100 = convBnSilu(network, weightMap, *conv99->getOutput(0), 192, 3, 1, 1, "model.100");
    IElementWiseLayer* conv101 = convBnSilu(network, weightMap, *conv100->getOutput(0), 192, 3, 1, 1, "model.101");
    ITensor* input_tensor_102[] = { conv101->getOutput(0), conv100->getOutput(0), conv99->getOutput(0), conv98->getOutput(0), conv97->getOutput(0), conv96->getOutput(0) };
    IConcatenationLayer* concat102 = network->addConcatenation(input_tensor_102, 6);
    IElementWiseLayer* conv103 = convBnSilu(network, weightMap, *concat102->getOutput(0), 384, 1, 1, 0, "model.103");

    IElementWiseLayer* conv104 = convBnSilu(network, weightMap, *conv103->getOutput(0), 512, 3, 2, 1, "model.104");

    ITensor* input_tensor_105[] = { conv104->getOutput(0), conv47->getOutput(0) };
    IConcatenationLayer* concat105 = network->addConcatenation(input_tensor_105, 2);

    IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *concat105->getOutput(0), 512, 1, 1, 0, "model.106");
    IElementWiseLayer* conv107 = convBnSilu(network, weightMap, *concat105->getOutput(0), 512, 1, 1, 0, "model.107");

    IElementWiseLayer* conv108 = convBnSilu(network, weightMap, *conv107->getOutput(0), 256, 3, 1, 1, "model.108");
    IElementWiseLayer* conv109 = convBnSilu(network, weightMap, *conv108->getOutput(0), 256, 3, 1, 1, "model.109");
    IElementWiseLayer* conv110 = convBnSilu(network, weightMap, *conv109->getOutput(0), 256, 3, 1, 1, "model.110");
    IElementWiseLayer* conv111 = convBnSilu(network, weightMap, *conv110->getOutput(0), 256, 3, 1, 1, "model.111");
    ITensor* input_tensor_112[] = { conv111->getOutput(0), conv110->getOutput(0), conv109->getOutput(0), conv108->getOutput(0), conv107->getOutput(0), conv106->getOutput(0) };
    IConcatenationLayer* concat112 = network->addConcatenation(input_tensor_112, 6);

    IElementWiseLayer* conv113 = convBnSilu(network, weightMap, *concat112->getOutput(0), 512, 1, 1, 0, "model.113");
    IElementWiseLayer* conv114 = convBnSilu(network, weightMap, *conv83->getOutput(0), 256, 3, 1, 1, "model.114");
    IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *conv93->getOutput(0), 512, 3, 1, 1, "model.115");
    IElementWiseLayer* conv116 = convBnSilu(network, weightMap, *conv103->getOutput(0), 768, 3, 1, 1, "model.116");
    IElementWiseLayer* conv117 = convBnSilu(network, weightMap, *conv113->getOutput(0), 1024, 3, 1, 1, "model.117");

    // out
    IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv114->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.118.m.0.weight"], weightMap["model.118.m.0.bias"]);
    assert(cv105_0);
    cv105_0->setName("cv105.0");
    IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv115->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.118.m.1.weight"], weightMap["model.118.m.1.bias"]);
    assert(cv105_1);
    cv105_1->setName("cv105.1");
    IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv116->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.118.m.2.weight"], weightMap["model.118.m.2.bias"]);
    assert(cv105_2);
    cv105_2->setName("cv105.2");
    IConvolutionLayer* cv105_3 = network->addConvolutionNd(*conv117->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.118.m.3.weight"], weightMap["model.118.m.3.bias"]);
    assert(cv105_3);
    cv105_3->setName("cv105.3");

    /*------------detect-----------*/
    auto yolo = addYoLoLayer(network, weightMap, "model.118", std::vector<IConvolutionLayer*>{cv105_0, cv105_1, cv105_2, cv105_3});
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));
    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}

/**
 * Builds the YOLOv7-X network layer by layer and returns it as a serialized engine.
 *
 * The function loads the weights from `wts_path`, wires up the backbone, the
 * head and the three detection convolutions, attaches the YOLO decode layer
 * and finally asks the builder to serialize the network.
 *
 * @param maxBatchSize  Value forwarded to IBuilder::setMaxBatchSize.
 * @param builder       Builder used to create the network and the engine.
 * @param config        Builder configuration; workspace size and the optional
 *                      FP16 / INT8 flags are set on it here.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path of the weight file handed to loadWeights().
 * @return The serialized engine, or nullptr when the build failed. The caller
 *         is responsible for releasing the returned memory.
 *
 * NOTE(review): the numeric arguments of convBnSilu() appear to be
 * (output channels, kernel size, stride, padding) - confirm against its definition.
 */
IHostMemory* build_engine_yolov7x(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) {
    std::map<std::string, Weights> weightMap = loadWeights(wts_path);

    INetworkDefinition* network = builder->createNetworkV2(0U);
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
    assert(data);

    /*----------------------------------yolov7x backbone-----------------------------------------*/
    IElementWiseLayer* conv0 = convBnSilu(network, weightMap, *data, 40, 3, 1, 1, "model.0");

    IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 80, 3, 2, 1, "model.1");
    IElementWiseLayer* conv2 = convBnSilu(network, weightMap, *conv1->getOutput(0), 80, 3, 1, 1, "model.2");
    IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 160, 3, 2, 1, "model.3");

    // Aggregation block: two 1x1 branches from conv3, a chain of 3x3 convs, then a 5-way concat.
    IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv3->getOutput(0), 64, 1, 1, 0, "model.4");

    IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv3->getOutput(0), 64, 1, 1, 0, "model.5");
    IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6");
    IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7");
    IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8");
    IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9");
    IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *conv9->getOutput(0), 64, 3, 1, 1, "model.10");
    IElementWiseLayer* conv11 = convBnSilu(network, weightMap, *conv10->getOutput(0), 64, 3, 1, 1, "model.11");

    ITensor* input_tensor_12[] = { conv11->getOutput(0), conv9->getOutput(0), conv7->getOutput(0), conv5->getOutput(0), conv4->getOutput(0) };
    IConcatenationLayer* concat12 = network->addConcatenation(input_tensor_12, 5);
    IElementWiseLayer* conv13 = convBnSilu(network, weightMap, *concat12->getOutput(0), 320, 1, 1, 0, "model.13");

    // Downsample: max-pool branch and stride-2 conv branch, concatenated (model.15 - model.17).
    IPoolingLayer* mp1 = network->addPoolingNd(*conv13->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp1->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *mp1->getOutput(0), 160, 1, 1, 0, "model.15");

    IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv13->getOutput(0), 160, 1, 1, 0, "model.16");
    IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv16->getOutput(0), 160, 3, 2, 1, "model.17");
    ITensor* input_tensor_18[] = { conv17->getOutput(0), conv15->getOutput(0) };
    IConcatenationLayer* concat18 = network->addConcatenation(input_tensor_18, 2);

    IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *concat18->getOutput(0), 128, 1, 1, 0, "model.19");

    IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *concat18->getOutput(0), 128, 1, 1, 0, "model.20");
    IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 128, 3, 1, 1, "model.21");
    IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *conv21->getOutput(0), 128, 3, 1, 1, "model.22");
    IElementWiseLayer* conv23 = convBnSilu(network, weightMap, *conv22->getOutput(0), 128, 3, 1, 1, "model.23");
    IElementWiseLayer* conv24 = convBnSilu(network, weightMap, *conv23->getOutput(0), 128, 3, 1, 1, "model.24");
    IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 3, 1, 1, "model.25");
    IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *conv25->getOutput(0), 128, 3, 1, 1, "model.26");

    ITensor* input_tensor_27[] = { conv26->getOutput(0), conv24->getOutput(0), conv22->getOutput(0), conv20->getOutput(0), conv19->getOutput(0) };
    IConcatenationLayer* concat27 = network->addConcatenation(input_tensor_27, 5);
    IElementWiseLayer* conv28 = convBnSilu(network, weightMap, *concat27->getOutput(0), 640, 1, 1, 0, "model.28");

    // Downsample (model.30 - model.32).
    IPoolingLayer* mp2 = network->addPoolingNd(*conv28->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp2->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *mp2->getOutput(0), 320, 1, 1, 0, "model.30");

    IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv28->getOutput(0), 320, 1, 1, 0, "model.31");
    IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 320, 3, 2, 1, "model.32");

    ITensor* input_tensor_33[] = { conv32->getOutput(0), conv30->getOutput(0) };
    IConcatenationLayer* concat33 = network->addConcatenation(input_tensor_33, 2);

    IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *concat33->getOutput(0), 256, 1, 1, 0, "model.34");

    IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *concat33->getOutput(0), 256, 1, 1, 0, "model.35");
    IElementWiseLayer* conv36 = convBnSilu(network, weightMap, *conv35->getOutput(0), 256, 3, 1, 1, "model.36");
    IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *conv36->getOutput(0), 256, 3, 1, 1, "model.37");
    IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 256, 3, 1, 1, "model.38");
    IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *conv38->getOutput(0), 256, 3, 1, 1, "model.39");
    IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv39->getOutput(0), 256, 3, 1, 1, "model.40");
    IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 256, 3, 1, 1, "model.41");

    ITensor* input_tensor_42[] = { conv41->getOutput(0), conv39->getOutput(0), conv37->getOutput(0), conv35->getOutput(0), conv34->getOutput(0) };
    IConcatenationLayer* concat42 = network->addConcatenation(input_tensor_42, 5);
    IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *concat42->getOutput(0), 1280, 1, 1, 0, "model.43");

    // Downsample (model.45 - model.47).
    IPoolingLayer* mp3 = network->addPoolingNd(*conv43->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp3->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv45 = convBnSilu(network, weightMap, *mp3->getOutput(0), 640, 1, 1, 0, "model.45");

    IElementWiseLayer* conv46 = convBnSilu(network, weightMap, *conv43->getOutput(0), 640, 1, 1, 0, "model.46");
    IElementWiseLayer* conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 640, 3, 2, 1, "model.47");
    ITensor* input_tensor_48[] = { conv47->getOutput(0), conv45->getOutput(0) };
    IConcatenationLayer* concat48 = network->addConcatenation(input_tensor_48, 2);

    IElementWiseLayer* conv49 = convBnSilu(network, weightMap, *concat48->getOutput(0), 256, 1, 1, 0, "model.49");

    IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *concat48->getOutput(0), 256, 1, 1, 0, "model.50");
    IElementWiseLayer* conv51 = convBnSilu(network, weightMap, *conv50->getOutput(0), 256, 3, 1, 1, "model.51");
    IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *conv51->getOutput(0), 256, 3, 1, 1, "model.52");
    IElementWiseLayer* conv53 = convBnSilu(network, weightMap, *conv52->getOutput(0), 256, 3, 1, 1, "model.53");
    IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv53->getOutput(0), 256, 3, 1, 1, "model.54");
    IElementWiseLayer* conv55 = convBnSilu(network, weightMap, *conv54->getOutput(0), 256, 3, 1, 1, "model.55");
    IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *conv55->getOutput(0), 256, 3, 1, 1, "model.56");

    ITensor* input_tensor_57[] = { conv56->getOutput(0), conv54->getOutput(0), conv52->getOutput(0), conv50->getOutput(0), conv49->getOutput(0) };
    IConcatenationLayer* concat57 = network->addConcatenation(input_tensor_57, 5);
    IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *concat57->getOutput(0), 1280, 1, 1, 0, "model.58");

    /*----------------------------------yolov7x head-----------------------------------------*/
    //-----SPPCSPC-----------
    IElementWiseLayer* conv59 = SPPCSPC(network, weightMap, *conv58->getOutput(0), 640, "model.59");

    IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 320, 1, 1, 0, "model.60");

    // Nearest-neighbour resize with per-dimension scales {1, 2, 2}; shared by both upsample layers.
    float scale[] = { 1.0, 2.0, 2.0 };
    IResizeLayer* re61 = network->addResize(*conv60->getOutput(0));
    re61->setResizeMode(ResizeMode::kNEAREST);
    re61->setScales(scale, 3);

    // Route from backbone layer model.43.
    IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *conv43->getOutput(0), 320, 1, 1, 0, "model.62");

    ITensor* input_tensor_63[] = { conv62->getOutput(0), re61->getOutput(0) };
    IConcatenationLayer* concat63 = network->addConcatenation(input_tensor_63, 2);

    IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *concat63->getOutput(0), 256, 1, 1, 0, "model.64");

    IElementWiseLayer* conv65 = convBnSilu(network, weightMap, *concat63->getOutput(0), 256, 1, 1, 0, "model.65");
    IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *conv65->getOutput(0), 256, 3, 1, 1, "model.66");
    IElementWiseLayer* conv67 = convBnSilu(network, weightMap, *conv66->getOutput(0), 256, 3, 1, 1, "model.67");
    IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *conv67->getOutput(0), 256, 3, 1, 1, "model.68");
    IElementWiseLayer* conv69 = convBnSilu(network, weightMap, *conv68->getOutput(0), 256, 3, 1, 1, "model.69");
    IElementWiseLayer* conv70 = convBnSilu(network, weightMap, *conv69->getOutput(0), 256, 3, 1, 1, "model.70");
    IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *conv70->getOutput(0), 256, 3, 1, 1, "model.71");

    ITensor* input_tensor_72[] = { conv71->getOutput(0), conv69->getOutput(0), conv67->getOutput(0), conv65->getOutput(0), conv64->getOutput(0) };
    IConcatenationLayer* concat72 = network->addConcatenation(input_tensor_72, 5);
    IElementWiseLayer* conv73 = convBnSilu(network, weightMap, *concat72->getOutput(0), 320, 1, 1, 0, "model.73");

    IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv73->getOutput(0), 160, 1, 1, 0, "model.74");

    IResizeLayer* re75 = network->addResize(*conv74->getOutput(0));
    re75->setResizeMode(ResizeMode::kNEAREST);
    re75->setScales(scale, 3);

    // Route from backbone layer model.28.
    IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *conv28->getOutput(0), 160, 1, 1, 0, "model.76");

    ITensor* input_tensor_77[] = { conv76->getOutput(0), re75->getOutput(0) };
    IConcatenationLayer* concat77 = network->addConcatenation(input_tensor_77, 2);

    IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *concat77->getOutput(0), 128, 1, 1, 0, "model.78");

    IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *concat77->getOutput(0), 128, 1, 1, 0, "model.79");
    IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv79->getOutput(0), 128, 3, 1, 1, "model.80");
    IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 128, 3, 1, 1, "model.81");
    IElementWiseLayer* conv82 = convBnSilu(network, weightMap, *conv81->getOutput(0), 128, 3, 1, 1, "model.82");
    IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *conv82->getOutput(0), 128, 3, 1, 1, "model.83");
    IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 128, 3, 1, 1, "model.84");
    IElementWiseLayer* conv85 = convBnSilu(network, weightMap, *conv84->getOutput(0), 128, 3, 1, 1, "model.85");

    ITensor* input_tensor_86[] = { conv85->getOutput(0), conv83->getOutput(0), conv81->getOutput(0), conv79->getOutput(0), conv78->getOutput(0) };
    IConcatenationLayer* concat86 = network->addConcatenation(input_tensor_86, 5);
    IElementWiseLayer* conv87 = convBnSilu(network, weightMap, *concat86->getOutput(0), 160, 1, 1, 0, "model.87");

    // Downsample and merge with conv73.
    IPoolingLayer* mp88 = network->addPoolingNd(*conv87->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp88->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv89 = convBnSilu(network, weightMap, *mp88->getOutput(0), 160, 1, 1, 0, "model.89");

    IElementWiseLayer* conv90 = convBnSilu(network, weightMap, *conv87->getOutput(0), 160, 1, 1, 0, "model.90");
    IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv90->getOutput(0), 160, 3, 2, 1, "model.91");

    ITensor* input_tensor_92[] = { conv91->getOutput(0), conv89->getOutput(0), conv73->getOutput(0) };
    IConcatenationLayer* concat92 = network->addConcatenation(input_tensor_92, 3);

    IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *concat92->getOutput(0), 256, 1, 1, 0, "model.93");

    IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *concat92->getOutput(0), 256, 1, 1, 0, "model.94");
    IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *conv94->getOutput(0), 256, 3, 1, 1, "model.95");
    IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 256, 3, 1, 1, "model.96");
    IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 256, 3, 1, 1, "model.97");
    IElementWiseLayer* conv98 = convBnSilu(network, weightMap, *conv97->getOutput(0), 256, 3, 1, 1, "model.98");
    IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *conv98->getOutput(0), 256, 3, 1, 1, "model.99");
    IElementWiseLayer* conv100 = convBnSilu(network, weightMap, *conv99->getOutput(0), 256, 3, 1, 1, "model.100");

    ITensor* input_tensor_101[] = { conv100->getOutput(0), conv98->getOutput(0), conv96->getOutput(0), conv94->getOutput(0), conv93->getOutput(0) };
    IConcatenationLayer* concat101 = network->addConcatenation(input_tensor_101, 5);
    IElementWiseLayer* conv102 = convBnSilu(network, weightMap, *concat101->getOutput(0), 320, 1, 1, 0, "model.102");

    // Downsample and merge with the SPPCSPC output (conv59).
    IPoolingLayer* mp103 = network->addPoolingNd(*conv102->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp103->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv104 = convBnSilu(network, weightMap, *mp103->getOutput(0), 320, 1, 1, 0, "model.104");

    IElementWiseLayer* conv105 = convBnSilu(network, weightMap, *conv102->getOutput(0), 320, 1, 1, 0, "model.105");
    IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *conv105->getOutput(0), 320, 3, 2, 1, "model.106");

    ITensor* input_tensor_107[] = { conv106->getOutput(0), conv104->getOutput(0), conv59->getOutput(0) };
    IConcatenationLayer* concat107 = network->addConcatenation(input_tensor_107, 3);

    IElementWiseLayer* conv108 = convBnSilu(network, weightMap, *concat107->getOutput(0), 512, 1, 1, 0, "model.108");

    IElementWiseLayer* conv109 = convBnSilu(network, weightMap, *concat107->getOutput(0), 512, 1, 1, 0, "model.109");
    IElementWiseLayer* conv110 = convBnSilu(network, weightMap, *conv109->getOutput(0), 512, 3, 1, 1, "model.110");
    IElementWiseLayer* conv111 = convBnSilu(network, weightMap, *conv110->getOutput(0), 512, 3, 1, 1, "model.111");
    IElementWiseLayer* conv112 = convBnSilu(network, weightMap, *conv111->getOutput(0), 512, 3, 1, 1, "model.112");
    IElementWiseLayer* conv113 = convBnSilu(network, weightMap, *conv112->getOutput(0), 512, 3, 1, 1, "model.113");
    IElementWiseLayer* conv114 = convBnSilu(network, weightMap, *conv113->getOutput(0), 512, 3, 1, 1, "model.114");
    IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *conv114->getOutput(0), 512, 3, 1, 1, "model.115");

    ITensor* input_tensor_116[] = { conv115->getOutput(0), conv113->getOutput(0), conv111->getOutput(0), conv109->getOutput(0), conv108->getOutput(0) };
    IConcatenationLayer* concat116 = network->addConcatenation(input_tensor_116, 5);
    IElementWiseLayer* conv117 = convBnSilu(network, weightMap, *concat116->getOutput(0), 640, 1, 1, 0, "model.117");

    // One 3x3 conv per detection scale, fed from conv87, conv102 and conv117.
    IElementWiseLayer* con_0 = convBnSilu(network, weightMap, *conv87->getOutput(0), 320, 3, 1, 1, "model.118");
    IElementWiseLayer* con_1 = convBnSilu(network, weightMap, *conv102->getOutput(0), 640, 3, 1, 1, "model.119");
    IElementWiseLayer* con_2 = convBnSilu(network, weightMap, *conv117->getOutput(0), 1280, 3, 1, 1, "model.120");

    /*----------------------------------yolov7x out-----------------------------------------*/
    // 1x1 detection convolutions producing kNumAnchor * (kNumClass + 5) channels per scale.
    IConvolutionLayer* det0 = network->addConvolutionNd(*con_0->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.121.m.0.weight"], weightMap["model.121.m.0.bias"]);
    assert(det0);
    det0->setName("det0");
    IConvolutionLayer* det1 = network->addConvolutionNd(*con_1->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.121.m.1.weight"], weightMap["model.121.m.1.bias"]);
    assert(det1);
    det1->setName("det1");
    IConvolutionLayer* det2 = network->addConvolutionNd(*con_2->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.121.m.2.weight"], weightMap["model.121.m.2.bias"]);
    assert(det2);
    det2->setName("det2");

    auto yolo = addYoLoLayer(network, weightMap, "model.121", std::vector<IConvolutionLayer*>{det0, det1, det2});
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only claim success when the builder actually produced an engine.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

#if !defined(USE_FP16) && defined(USE_INT8)
    // The calibrator is only consulted during the build. Detach it from the
    // config before deleting it so the config is not left with a dangling pointer.
    config->setInt8Calibrator(nullptr);
    delete calibrator;
#endif

    // Don't need the network any more
    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Builds the YOLOv7 network definition layer by layer from a .wts weight file and
// serializes it into a TensorRT engine blob.
//
// maxBatchSize : forwarded to builder->setMaxBatchSize().
// builder      : TensorRT builder used to create the (implicit-batch) network and build it.
// config       : builder config; workspace size and FP16/INT8 flags are set on it here.
// dt           : data type of the network input tensor.
// wts_path     : path handed to loadWeights().
//
// Returns the result of buildSerializedNetwork(); this is nullptr when the build fails,
// so callers must check it. The network and the host weight buffers are released before
// returning in either case.
//
// NOTE(review): the numeric arguments of convBnSilu are presumably
// (out channels, kernel size, stride, padding) -- confirm against its definition.
IHostMemory* build_engine_yolov7(unsigned int maxBatchSize,IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) {
    std::map<std::string, Weights> weightMap = loadWeights(wts_path);

    // Flag value 0U: no explicit-batch flag, so tensors below are 3-D and axis 0 is the
    // first (channel) dimension of the Dims3 input.
    INetworkDefinition* network = builder->createNetworkV2(0U);
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
    assert(data);
    /*----------------------------------yolov7 backbone-----------------------------------------*/
    IElementWiseLayer* conv0 = convBnSilu(network, weightMap, *data, 32, 3, 1, 1, "model.0");

    IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 64, 3, 2, 1, "model.1");
    IElementWiseLayer* conv2 = convBnSilu(network, weightMap, *conv1->getOutput(0), 64, 3, 1, 1, "model.2");

    // model.3 - model.11: two parallel branches from conv3 (conv4, conv5), a chain of
    // convs after conv5, then a 4-way concat fused by model.11.
    IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 128, 3, 2, 1, "model.3");
    IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv3->getOutput(0), 64, 1, 1, 0, "model.4");
    IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv3->getOutput(0), 64, 1, 1, 0, "model.5");
    IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6");
    IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7");
    IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8");
    IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9");
    ITensor* input_tensor_10[] = { conv9->getOutput(0), conv7->getOutput(0), conv5->getOutput(0), conv4->getOutput(0) };
    IConcatenationLayer* concat10 = network->addConcatenation(input_tensor_10, 4);
    concat10->setAxis(0);
    IElementWiseLayer* conv11 = convBnSilu(network, weightMap, *concat10->getOutput(0), 256, 1, 1, 0, "model.11");

    // model.12 - model.16: two branches from conv11 (max-pool + conv13, and conv14 + conv15),
    // concatenated.
    IPoolingLayer* mp12 = network->addPoolingNd(*conv11->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp12->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv13 = convBnSilu(network, weightMap, *mp12->getOutput(0), 128, 1, 1, 0, "model.13");
    IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *conv11->getOutput(0), 128, 1, 1, 0, "model.14");
    IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *conv14->getOutput(0), 128, 3, 2, 1, "model.15");
    ITensor* input_tensor_16[] = { conv15->getOutput(0), conv13->getOutput(0) };
    IConcatenationLayer* concat16 = network->addConcatenation(input_tensor_16, 2);
    // Made explicit for consistency with every other concatenation in this function.
    concat16->setAxis(0);
    IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *concat16->getOutput(0), 128, 1, 1, 0, "model.17");
    IElementWiseLayer* conv18 = convBnSilu(network, weightMap, *concat16->getOutput(0), 128, 1, 1, 0, "model.18");
    IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *conv18->getOutput(0), 128, 3, 1, 1, "model.19");
    IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 128, 3, 1, 1, "model.20");
    IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 128, 3, 1, 1, "model.21");
    IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *conv21->getOutput(0), 128, 3, 1, 1, "model.22");
    ITensor* input_tensor_23[] = { conv22->getOutput(0), conv20->getOutput(0), conv18->getOutput(0), conv17->getOutput(0) };
    IConcatenationLayer* concat23 = network->addConcatenation(input_tensor_23, 4);
    concat23->setAxis(0);
    // conv24 is reused later by the head (model.66).
    IElementWiseLayer* conv24 = convBnSilu(network, weightMap, *concat23->getOutput(0), 512, 1, 1, 0, "model.24");

    IPoolingLayer* mp25 = network->addPoolingNd(*conv24->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp25->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *mp25->getOutput(0), 256, 1, 1, 0, "model.26");
    IElementWiseLayer* conv27 = convBnSilu(network, weightMap, *conv24->getOutput(0), 256, 1, 1, 0, "model.27");
    IElementWiseLayer* conv28 = convBnSilu(network, weightMap, *conv27->getOutput(0), 256, 3, 2, 1, "model.28");
    ITensor* input_tensor_29[] = { conv28->getOutput(0), conv26->getOutput(0) };
    IConcatenationLayer* concat29 = network->addConcatenation(input_tensor_29, 2);
    // Made explicit for consistency with every other concatenation in this function.
    concat29->setAxis(0);
    IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *concat29->getOutput(0), 256, 1, 1, 0, "model.30");
    IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *concat29->getOutput(0), 256, 1, 1, 0, "model.31");
    IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 256, 3, 1, 1, "model.32");
    IElementWiseLayer* conv33 = convBnSilu(network, weightMap, *conv32->getOutput(0), 256, 3, 1, 1, "model.33");
    IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *conv33->getOutput(0), 256, 3, 1, 1, "model.34");
    IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *conv34->getOutput(0), 256, 3, 1, 1, "model.35");
    ITensor* input_tensor_36[] = { conv35->getOutput(0), conv33->getOutput(0), conv31->getOutput(0), conv30->getOutput(0) };
    IConcatenationLayer* concat36 = network->addConcatenation(input_tensor_36, 4);
    concat36->setAxis(0);
    // conv37 is reused later by the head (model.54).
    IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *concat36->getOutput(0), 1024, 1, 1, 0, "model.37");

    IPoolingLayer* mp38 = network->addPoolingNd(*conv37->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp38->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *mp38->getOutput(0), 512, 1, 1, 0, "model.39");
    IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv37->getOutput(0), 512, 1, 1, 0, "model.40");
    IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 512, 3, 2, 1, "model.41");
    ITensor* input_tensor_42[] = { conv41->getOutput(0), conv39->getOutput(0) };
    IConcatenationLayer* concat42 = network->addConcatenation(input_tensor_42, 2);
    concat42->setAxis(0);
    IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *concat42->getOutput(0), 256, 1, 1, 0, "model.43");
    IElementWiseLayer* conv44 = convBnSilu(network, weightMap, *concat42->getOutput(0), 256, 1, 1, 0, "model.44");
    IElementWiseLayer* conv45 = convBnSilu(network, weightMap, *conv44->getOutput(0), 256, 3, 1, 1, "model.45");
    IElementWiseLayer* conv46 = convBnSilu(network, weightMap, *conv45->getOutput(0), 256, 3, 1, 1, "model.46");
    IElementWiseLayer* conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 3, 1, 1, "model.47");
    IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv47->getOutput(0), 256, 3, 1, 1, "model.48");
    ITensor* input_tensor_49[] = { conv48->getOutput(0), conv46->getOutput(0), conv44->getOutput(0), conv43->getOutput(0) };
    IConcatenationLayer* concat49 = network->addConcatenation(input_tensor_49, 4);
    concat49->setAxis(0);
    IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *concat49->getOutput(0), 1024, 1, 1, 0, "model.50");

    /*----------------------------------yolov7 head-----------------------------------------*/
    IElementWiseLayer* conv51 = SPPCSPC(network, weightMap, *conv50->getOutput(0), 512, "model.51");

    // Top-down path: nearest-neighbour resize with scales {1, 2, 2} (first dimension kept,
    // the other two doubled), concatenated with a 1x1 conv of backbone feature conv37.
    IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *conv51->getOutput(0), 256, 1, 1, 0, "model.52");
    float scale[] = { 1.0, 2.0, 2.0 };
    IResizeLayer* re53 = network->addResize(*conv52->getOutput(0));
    re53->setResizeMode(ResizeMode::kNEAREST);
    re53->setScales(scale, 3);
    IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv37->getOutput(0), 256, 1, 1, 0, "model.54");
    ITensor* input_tensor_55[] = { conv54->getOutput(0), re53->getOutput(0) };
    IConcatenationLayer* concat55 = network->addConcatenation(input_tensor_55, 2);
    concat55->setAxis(0);

    // Head aggregation blocks concatenate every intermediate conv output (6 inputs).
    IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *concat55->getOutput(0), 256, 1, 1, 0, "model.56");
    IElementWiseLayer* conv57 = convBnSilu(network, weightMap, *concat55->getOutput(0), 256, 1, 1, 0, "model.57");
    IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *conv57->getOutput(0), 128, 3, 1, 1, "model.58");
    IElementWiseLayer* conv59 = convBnSilu(network, weightMap, *conv58->getOutput(0), 128, 3, 1, 1, "model.59");
    IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 128, 3, 1, 1, "model.60");
    IElementWiseLayer* conv61 = convBnSilu(network, weightMap, *conv60->getOutput(0), 128, 3, 1, 1, "model.61");
    ITensor* input_tensor_62[] = { conv61->getOutput(0), conv60->getOutput(0), conv59->getOutput(0), conv58->getOutput(0), conv57->getOutput(0), conv56->getOutput(0) };
    IConcatenationLayer* concat62 = network->addConcatenation(input_tensor_62, 6);
    concat62->setAxis(0);
    IElementWiseLayer* conv63 = convBnSilu(network, weightMap, *concat62->getOutput(0), 256, 1, 1, 0, "model.63");

    // Second resize step, concatenated with a 1x1 conv of backbone feature conv24.
    IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *conv63->getOutput(0), 128, 1, 1, 0, "model.64");
    IResizeLayer* re65 = network->addResize(*conv64->getOutput(0));
    re65->setResizeMode(ResizeMode::kNEAREST);
    re65->setScales(scale, 3);
    IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, "model.66");
    ITensor* input_tensor_67[] = { conv66->getOutput(0), re65->getOutput(0) };
    IConcatenationLayer* concat67 = network->addConcatenation(input_tensor_67, 2);
    concat67->setAxis(0);

    IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *concat67->getOutput(0), 128, 1, 1, 0, "model.68");
    IElementWiseLayer* conv69 = convBnSilu(network, weightMap, *concat67->getOutput(0), 128, 1, 1, 0, "model.69");
    IElementWiseLayer* conv70 = convBnSilu(network, weightMap, *conv69->getOutput(0), 64, 3, 1, 1, "model.70");
    IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *conv70->getOutput(0), 64, 3, 1, 1, "model.71");
    IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *conv71->getOutput(0), 64, 3, 1, 1, "model.72");
    IElementWiseLayer* conv73 = convBnSilu(network, weightMap, *conv72->getOutput(0), 64, 3, 1, 1, "model.73");
    ITensor* input_tensor_74[] = { conv73->getOutput(0), conv72->getOutput(0), conv71->getOutput(0), conv70->getOutput(0), conv69->getOutput(0), conv68->getOutput(0) };
    IConcatenationLayer* concat74 = network->addConcatenation(input_tensor_74, 6);
    concat74->setAxis(0);
    // conv75 feeds the first detection branch (model.102).
    IElementWiseLayer* conv75 = convBnSilu(network, weightMap, *concat74->getOutput(0), 128, 1, 1, 0, "model.75");

    // Bottom-up path: pooled and conv branches from conv75, merged with conv63.
    IPoolingLayer* mp76 = network->addPoolingNd(*conv75->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp76->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv77 = convBnSilu(network, weightMap, *mp76->getOutput(0), 128, 1, 1, 0, "model.77");
    IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *conv75->getOutput(0), 128, 1, 1, 0, "model.78");
    IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv78->getOutput(0), 128, 3, 2, 1, "model.79");
    ITensor* input_tensor_80[] = { conv79->getOutput(0), conv77->getOutput(0), conv63->getOutput(0) };
    IConcatenationLayer* concat80 = network->addConcatenation(input_tensor_80, 3);
    concat80->setAxis(0);

    IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *concat80->getOutput(0), 256, 1, 1, 0, "model.81");
    IElementWiseLayer* conv82 = convBnSilu(network, weightMap, *concat80->getOutput(0), 256, 1, 1, 0, "model.82");
    IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *conv82->getOutput(0), 128, 3, 1, 1, "model.83");
    IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 128, 3, 1, 1, "model.84");
    IElementWiseLayer* conv85 = convBnSilu(network, weightMap, *conv84->getOutput(0), 128, 3, 1, 1, "model.85");
    IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *conv85->getOutput(0), 128, 3, 1, 1, "model.86");
    ITensor* input_tensor_87[] = { conv86->getOutput(0), conv85->getOutput(0), conv84->getOutput(0), conv83->getOutput(0), conv82->getOutput(0), conv81->getOutput(0) };
    IConcatenationLayer* concat87 = network->addConcatenation(input_tensor_87, 6);
    concat87->setAxis(0);
    // conv88 feeds the second detection branch (model.103).
    IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *concat87->getOutput(0), 256, 1, 1, 0, "model.88");

    // Pooled and conv branches from conv88, merged with the SPPCSPC output conv51.
    IPoolingLayer* mp89 = network->addPoolingNd(*conv88->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    mp89->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* conv90 = convBnSilu(network, weightMap, *mp89->getOutput(0), 256, 1, 1, 0, "model.90");
    IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv88->getOutput(0), 256, 1, 1, 0, "model.91");
    IElementWiseLayer* conv92 = convBnSilu(network, weightMap, *conv91->getOutput(0), 256, 3, 2, 1, "model.92");
    ITensor* input_tensor_93[] = { conv92->getOutput(0), conv90->getOutput(0), conv51->getOutput(0) };
    IConcatenationLayer* concat93 = network->addConcatenation(input_tensor_93, 3);
    concat93->setAxis(0);

    IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *concat93->getOutput(0), 512, 1, 1, 0, "model.94");
    IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *concat93->getOutput(0), 512, 1, 1, 0, "model.95");
    IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 256, 3, 1, 1, "model.96");
    IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 256, 3, 1, 1, "model.97");
    IElementWiseLayer* conv98 = convBnSilu(network, weightMap, *conv97->getOutput(0), 256, 3, 1, 1, "model.98");
    IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *conv98->getOutput(0), 256, 3, 1, 1, "model.99");
    ITensor* input_tensor_100[] = { conv99->getOutput(0), conv98->getOutput(0), conv97->getOutput(0), conv96->getOutput(0), conv95->getOutput(0), conv94->getOutput(0) };
    IConcatenationLayer* concat100 = network->addConcatenation(input_tensor_100, 6);
    concat100->setAxis(0);
    // conv101 feeds the third detection branch (model.104).
    IElementWiseLayer* conv101 = convBnSilu(network, weightMap, *concat100->getOutput(0), 512, 1, 1, 0, "model.101");

    // One RepConv per detection branch, applied to conv75, conv88 and conv101.
    IElementWiseLayer* conv102 = RepConv(network, weightMap, *conv75->getOutput(0), 256, 3, 1, "model.102");
    IElementWiseLayer* conv103 = RepConv(network, weightMap, *conv88->getOutput(0), 512, 3, 1, "model.103");
    IElementWiseLayer* conv104 = RepConv(network, weightMap, *conv101->getOutput(0), 1024, 3, 1, "model.104");

    /*----------------------------------yolov7 out-----------------------------------------*/
    // 1x1 convolutions producing kNumAnchor * (kNumClass + 5) channels per branch.
    IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv102->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.105.m.0.weight"], weightMap["model.105.m.0.bias"]);
    assert(cv105_0);
    cv105_0->setName("cv105.0");
    IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv103->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.105.m.1.weight"], weightMap["model.105.m.1.bias"]);
    assert(cv105_1);
    cv105_1->setName("cv105.1");
    IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv104->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.105.m.2.weight"], weightMap["model.105.m.2.bias"]);
    assert(cv105_2);
    cv105_2->setName("cv105.2");

    auto yolo = addYoLoLayer(network, weightMap, "model.105", std::vector<IConvolutionLayer*>{cv105_0, cv105_1, cv105_2});
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16 MiB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    // NOTE(review): the calibrator is never deleted in this function; confirm who owns it.
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when a serialized engine was actually produced.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Builds the YOLOv7-tiny network definition layer by layer from a .wts weight file and
// serializes it into a TensorRT engine blob.
//
// maxBatchSize : forwarded to builder->setMaxBatchSize().
// builder      : TensorRT builder used to create the (implicit-batch) network and build it.
// config       : builder config; workspace size and FP16/INT8 flags are set on it here.
// dt           : data type of the network input tensor.
// wts_name     : path handed to loadWeights().
//
// Returns the result of buildSerializedNetwork(); this is nullptr when the build fails,
// so callers must check it. The network and the host weight buffers are released before
// returning in either case.
//
// The bracketed comments before each layer quote the matching row of the model's yaml
// description; the leading number is the source layer index (negative = relative).
IHostMemory* build_engine_yolov7_tiny(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    /* ------ yolov7-tiny backbone------ */
    // [32, 3, 2, None, 1, nn.LeakyReLU(0.1)]] ---> outch, ksize, stride, padding, groups
    auto conv0 = convBlockLeakRelu(network, weightMap, *data, 32, 3, 2, 1, "model.0");
    assert(conv0);

    // [-1, 1, Conv, [64, 3, 2, None, 1, nn.LeakyReLU(0.1)]],  # 1-P2/4
    auto conv1 = convBlockLeakRelu(network, weightMap, *conv0->getOutput(0), 64, 3, 2, 1, "model.1");
    assert(conv1);

    // [-1, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv2 = convBlockLeakRelu(network, weightMap, *conv1->getOutput(0), 32, 1, 1, 0, "model.2");
    assert(conv2);

    // [-2, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv3 = convBlockLeakRelu(network, weightMap, *conv1->getOutput(0), 32, 1, 1, 0, "model.3");
    assert(conv3);

    // [-1, 1, Conv, [32, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv4 = convBlockLeakRelu(network, weightMap, *conv3->getOutput(0), 32, 3, 1, 1, "model.4");
    assert(conv4);

    // [-1, 1, Conv, [32, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv5 = convBlockLeakRelu(network, weightMap, *conv4->getOutput(0), 32, 3, 1, 1, "model.5");
    assert(conv5);

    // Concatenations in this function keep the layer's default axis (the setAxis calls
    // are intentionally left commented out, as in the original).
    ITensor* input_tensor_6[] = { conv5->getOutput(0), conv4->getOutput(0), conv3->getOutput(0), conv2->getOutput(0) };
    auto cat6 = network->addConcatenation(input_tensor_6, 4);
    //cat6->setAxis(0);

    // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]],  # 7
    auto conv7 = convBlockLeakRelu(network, weightMap, *cat6->getOutput(0), 64, 1, 1, 0, "model.7");
    assert(conv7);

    auto* pool8 = network->addPoolingNd(*conv7->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    assert(pool8);
    pool8->setStrideNd(DimsHW{ 2, 2 });

    // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv9 = convBlockLeakRelu(network, weightMap, *pool8->getOutput(0), 64, 1, 1, 0, "model.9");
    assert(conv9);

    // [-2, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv10 = convBlockLeakRelu(network, weightMap, *pool8->getOutput(0), 64, 1, 1, 0, "model.10");
    assert(conv10);
    // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv11 = convBlockLeakRelu(network, weightMap, *conv10->getOutput(0), 64, 3, 1, 1, "model.11");
    assert(conv11);
    // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv12 = convBlockLeakRelu(network, weightMap, *conv11->getOutput(0), 64, 3, 1, 1, "model.12");
    assert(conv12);

    ITensor* input_tensor_13[] = { conv12->getOutput(0), conv11->getOutput(0), conv10->getOutput(0), conv9->getOutput(0) };
    auto cat13 = network->addConcatenation(input_tensor_13, 4);
    //cat13->setAxis(0);

    // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]],  # 14
    auto conv14 = convBlockLeakRelu(network, weightMap, *cat13->getOutput(0), 128, 1, 1, 0, "model.14");
    assert(conv14);

    auto* pool15 = network->addPoolingNd(*conv14->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    assert(pool15);
    pool15->setStrideNd(DimsHW{ 2, 2 });

    // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv16 = convBlockLeakRelu(network, weightMap, *pool15->getOutput(0), 128, 1, 1, 0, "model.16");
    assert(conv16);
    // [-2, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv17 = convBlockLeakRelu(network, weightMap, *pool15->getOutput(0), 128, 1, 1, 0, "model.17");
    assert(conv17);
    // [-1, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv18 = convBlockLeakRelu(network, weightMap, *conv17->getOutput(0), 128, 3, 1, 1, "model.18");
    assert(conv18);
    // [-1, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv19 = convBlockLeakRelu(network, weightMap, *conv18->getOutput(0), 128, 3, 1, 1, "model.19");
    assert(conv19);

    ITensor* input_tensor_20[] = { conv19->getOutput(0), conv18->getOutput(0), conv17->getOutput(0), conv16->getOutput(0) };
    auto cat20 = network->addConcatenation(input_tensor_20, 4);
    //cat20->setAxis(0);
    // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]],  # 21
    auto conv21 = convBlockLeakRelu(network, weightMap, *cat20->getOutput(0), 256, 1, 1, 0, "model.21");
    assert(conv21);

    auto* pool22 = network->addPoolingNd(*conv21->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 });
    assert(pool22);
    pool22->setStrideNd(DimsHW{ 2, 2 });

    // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv23 = convBlockLeakRelu(network, weightMap, *pool22->getOutput(0), 256, 1, 1, 0, "model.23");
    assert(conv23);

    // [-2, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv24 = convBlockLeakRelu(network, weightMap, *pool22->getOutput(0), 256, 1, 1, 0, "model.24");
    assert(conv24);

    // [-1, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv25 = convBlockLeakRelu(network, weightMap, *conv24->getOutput(0), 256, 3, 1, 1, "model.25");
    assert(conv25);

    // [-1, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv26 = convBlockLeakRelu(network, weightMap, *conv25->getOutput(0), 256, 3, 1, 1, "model.26");
    assert(conv26);


    ITensor* input_tensor_27[] = { conv26->getOutput(0), conv25->getOutput(0), conv24->getOutput(0), conv23->getOutput(0) };
    auto cat27 = network->addConcatenation(input_tensor_27, 4);
    //cat27->setAxis(0);

    // [-1, 1, Conv, [512, 1, 1, None, 1, nn.LeakyReLU(0.1)]],  # 28
    auto conv28 = convBlockLeakRelu(network, weightMap, *cat27->getOutput(0), 512, 1, 1, 0, "model.28");
    assert(conv28);

    /*===============================yolov7-tiny head======================================*/

    // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]]
    auto conv29 = convBlockLeakRelu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.29");
    assert(conv29);

    // [-2, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv30 = convBlockLeakRelu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.30");
    assert(conv30);

    // Three stride-1 max-pools over conv30 with kernels 5/9/13 and paddings 2/4/6.
    // [-1, 1, SP, [5]],
    auto* pool31 = network->addPoolingNd(*conv30->getOutput(0), PoolingType::kMAX, DimsHW{ 5, 5 });
    assert(pool31);
    pool31->setStrideNd(DimsHW{ 1, 1 });
    pool31->setPaddingNd(DimsHW{ 2, 2 });
    // [-2, 1, SP, [9]],
    auto* pool32 = network->addPoolingNd(*conv30->getOutput(0), PoolingType::kMAX, DimsHW{ 9, 9 });
    assert(pool32);
    pool32->setStrideNd(DimsHW{ 1, 1 });
    pool32->setPaddingNd(DimsHW{ 4, 4 });

    // [-3, 1, SP, [13]],
    auto* pool33 = network->addPoolingNd(*conv30->getOutput(0), PoolingType::kMAX, DimsHW{ 13, 13 });
    assert(pool33);
    pool33->setStrideNd(DimsHW{ 1, 1 });
    pool33->setPaddingNd(DimsHW{ 6, 6 });

    ITensor* input_tensor_34[] = { pool33->getOutput(0), pool32->getOutput(0), pool31->getOutput(0), conv30->getOutput(0) };
    auto cat34 = network->addConcatenation(input_tensor_34, 4);
    //cat34->setAxis(0);

    // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv35 = convBlockLeakRelu(network, weightMap, *cat34->getOutput(0), 256, 1, 1, 0, "model.35");
    assert(conv35);

    ITensor* input_tensor_36[] = { conv35->getOutput(0), conv29->getOutput(0) };
    auto cat36 = network->addConcatenation(input_tensor_36, 2);
    //cat36->setAxis(0);

    // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]],  # 37
    auto conv37 = convBlockLeakRelu(network, weightMap, *cat36->getOutput(0), 256, 1, 1, 0, "model.37");
    assert(conv37);

    // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv38 = convBlockLeakRelu(network, weightMap, *conv37->getOutput(0), 128, 1, 1, 0, "model.38");
    assert(conv38);

    // Nearest-neighbour resize: first dimension kept, the other two doubled.
    float scale[] = { 1.0, 2.0, 2.0 };
    IResizeLayer* resize39 = network->addResize(*conv38->getOutput(0));
    resize39->setResizeMode(ResizeMode::kNEAREST);
    resize39->setScales(scale, 3);

    // [21, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P4 (layer 21 -> conv21)
    auto conv40 = convBlockLeakRelu(network, weightMap, *conv21->getOutput(0), 128, 1, 1, 0, "model.40");
    assert(conv40);

    ITensor* input_tensor_41[] = { conv40->getOutput(0), resize39->getOutput(0) };
    auto cat41 = network->addConcatenation(input_tensor_41, 2);
    //cat41->setAxis(0);

    // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv42 = convBlockLeakRelu(network, weightMap, *cat41->getOutput(0), 64, 1, 1, 0, "model.42");
    assert(conv42);

    // [-2, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv43 = convBlockLeakRelu(network, weightMap, *cat41->getOutput(0), 64, 1, 1, 0, "model.43");
    assert(conv43);

    // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv44 = convBlockLeakRelu(network, weightMap, *conv43->getOutput(0), 64, 3, 1, 1, "model.44");
    assert(conv44);

    // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv45 = convBlockLeakRelu(network, weightMap, *conv44->getOutput(0), 64, 3, 1, 1, "model.45");
    assert(conv45);

    ITensor* input_tensor_46[] = { conv45->getOutput(0), conv44->getOutput(0), conv43->getOutput(0), conv42->getOutput(0) };
    auto cat46 = network->addConcatenation(input_tensor_46, 4);
    //cat46->setAxis(0);

    // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]],  # 47
    auto conv47 = convBlockLeakRelu(network, weightMap, *cat46->getOutput(0), 128, 1, 1, 0, "model.47");
    assert(conv47);

    // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv48 = convBlockLeakRelu(network, weightMap, *conv47->getOutput(0), 64, 1, 1, 0, "model.48");
    assert(conv48);

    IResizeLayer* resize49 = network->addResize(*conv48->getOutput(0));
    resize49->setResizeMode(ResizeMode::kNEAREST);
    resize49->setScales(scale, 3);

    // [14, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P3 (layer 14 -> conv14)
    auto conv50 = convBlockLeakRelu(network, weightMap, *conv14->getOutput(0), 64, 1, 1, 0, "model.50");
    assert(conv50);

    ITensor* input_tensor_51[] = { conv50->getOutput(0), resize49->getOutput(0) };
    IConcatenationLayer* cat51 = network->addConcatenation(input_tensor_51, 2);
    //cat51->setAxis(0);

    // [-1, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv52 = convBlockLeakRelu(network, weightMap, *cat51->getOutput(0), 32, 1, 1, 0, "model.52");
    assert(conv52);
    // [-2, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv53 = convBlockLeakRelu(network, weightMap, *cat51->getOutput(0), 32, 1, 1, 0, "model.53");
    assert(conv53);

    // [-1, 1, Conv, [32, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv54 = convBlockLeakRelu(network, weightMap, *conv53->getOutput(0), 32, 3, 1, 1, "model.54");
    assert(conv54);
    // [-1, 1, Conv, [32, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv55 = convBlockLeakRelu(network, weightMap, *conv54->getOutput(0), 32, 3, 1, 1, "model.55");
    assert(conv55);

    ITensor* input_tensor_56[] = { conv55->getOutput(0), conv54->getOutput(0), conv53->getOutput(0), conv52->getOutput(0) };
    IConcatenationLayer* cat56 = network->addConcatenation(input_tensor_56, 4);
    //cat56->setAxis(0);

    // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]],  # 57
    auto conv57 = convBlockLeakRelu(network, weightMap, *cat56->getOutput(0), 64, 1, 1, 0, "model.57");
    assert(conv57);

    // [-1, 1, Conv, [128, 3, 2, None, 1, nn.LeakyReLU(0.1)]],
    auto conv58 = convBlockLeakRelu(network, weightMap, *conv57->getOutput(0), 128, 3, 2, 1, "model.58");
    assert(conv58);

    // [[-1, 47], 1, Concat, [1]],
    ITensor* input_tensor_59[] = { conv58->getOutput(0), conv47->getOutput(0) };
    IConcatenationLayer* cat59 = network->addConcatenation(input_tensor_59, 2);
    //cat59->setAxis(0);

    // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv60 = convBlockLeakRelu(network, weightMap, *cat59->getOutput(0), 64, 1, 1, 0, "model.60");
    assert(conv60);
    // [-2, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv61 = convBlockLeakRelu(network, weightMap, *cat59->getOutput(0), 64, 1, 1, 0, "model.61");
    assert(conv61);

    // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv62 = convBlockLeakRelu(network, weightMap, *conv61->getOutput(0), 64, 3, 1, 1, "model.62");
    assert(conv62);
    // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv63 = convBlockLeakRelu(network, weightMap, *conv62->getOutput(0), 64, 3, 1, 1, "model.63");
    assert(conv63);

    ITensor* input_tensor_64[] = { conv63->getOutput(0), conv62->getOutput(0), conv61->getOutput(0), conv60->getOutput(0) };
    IConcatenationLayer* cat64 = network->addConcatenation(input_tensor_64, 4);
    //cat64->setAxis(0);

    // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]],  # 65
    auto conv65 = convBlockLeakRelu(network, weightMap, *cat64->getOutput(0), 128, 1, 1, 0, "model.65");
    assert(conv65);

    // [-1, 1, Conv, [256, 3, 2, None, 1, nn.LeakyReLU(0.1)]],
    auto conv66 = convBlockLeakRelu(network, weightMap, *conv65->getOutput(0), 256, 3, 2, 1, "model.66");
    assert(conv66);

    ITensor* input_tensor_67[] = { conv66->getOutput(0), conv37->getOutput(0) };
    IConcatenationLayer* cat67 = network->addConcatenation(input_tensor_67, 2);
    //cat67->setAxis(0);

    // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv68 = convBlockLeakRelu(network, weightMap, *cat67->getOutput(0), 128, 1, 1, 0, "model.68");
    assert(conv68);
    // [-2, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv69 = convBlockLeakRelu(network, weightMap, *cat67->getOutput(0), 128, 1, 1, 0, "model.69");
    assert(conv69);

    // [-1, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv70 = convBlockLeakRelu(network, weightMap, *conv69->getOutput(0), 128, 3, 1, 1, "model.70");
    assert(conv70);

    // [-1, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv71 = convBlockLeakRelu(network, weightMap, *conv70->getOutput(0), 128, 3, 1, 1, "model.71");
    assert(conv71);

    ITensor* input_tensor_72[] = { conv71->getOutput(0), conv70->getOutput(0), conv69->getOutput(0), conv68->getOutput(0) };
    IConcatenationLayer* cat72 = network->addConcatenation(input_tensor_72, 4);
    //cat72->setAxis(0);

    // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]],  # 73
    auto conv73 = convBlockLeakRelu(network, weightMap, *cat72->getOutput(0), 256, 1, 1, 0, "model.73");
    assert(conv73);


    // [57, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv74 = convBlockLeakRelu(network, weightMap, *conv57->getOutput(0), 128, 3, 1, 1, "model.74");
    assert(conv74);
    // [65, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv75 = convBlockLeakRelu(network, weightMap, *conv65->getOutput(0), 256, 3, 1, 1, "model.75");
    assert(conv75);
    // [73, 1, Conv, [512, 3, 1, None, 1, nn.LeakyReLU(0.1)]],
    auto conv76 = convBlockLeakRelu(network, weightMap, *conv73->getOutput(0), 512, 3, 1, 1, "model.76");
    assert(conv76);

    /* ------ detect ------ */
    // 1x1 convolutions producing kNumAnchor * (kNumClass + 5) channels per branch.
    // The layers are asserted before being handed to addYoLoLayer, as in the sibling builders.
    IConvolutionLayer* det0 = network->addConvolutionNd(*conv74->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.77.m.0.weight"], weightMap["model.77.m.0.bias"]);
    assert(det0);

    IConvolutionLayer* det1 = network->addConvolutionNd(*conv75->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.77.m.1.weight"], weightMap["model.77.m.1.bias"]);
    assert(det1);

    IConvolutionLayer* det2 = network->addConvolutionNd(*conv76->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.77.m.2.weight"], weightMap["model.77.m.2.bias"]);
    assert(det2);

    auto yolo = addYoLoLayer(network, weightMap, "model.77", std::vector<IConvolutionLayer*>{det0, det1, det2});
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));
    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    // NOTE(review): the calibrator is never deleted in this function; confirm who owns it.
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when a serialized engine was actually produced.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}


================================================
FILE: yolov7/src/postprocess.cpp
================================================
#include "postprocess.h"

// Maps a detection box from network-input coordinates back onto `img`.
//
// `bbox` is read as {center_x, center_y, width, height} in the kInputW x kInputH
// input space. The smaller of the two input/image ratios is used as the scale,
// and the centering offset along the other axis is removed before rescaling.
// Returns the box as a cv::Rect (x, y, width, height) in pixels of `img`.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
  const float ratio_w = kInputW / (img.cols * 1.0);
  const float ratio_h = kInputH / (img.rows * 1.0);

  // When the height ratio is the larger one, the width ratio is the scale that
  // was applied and the offset lies along y; otherwise it lies along x.
  const bool offset_on_y = ratio_h > ratio_w;
  const float scale = offset_on_y ? ratio_w : ratio_h;
  const float pad_x = offset_on_y ? 0.f : (kInputW - ratio_h * img.cols) / 2;
  const float pad_y = offset_on_y ? (kInputH - ratio_w * img.rows) / 2 : 0.f;

  const float left = (bbox[0] - bbox[2] / 2.f - pad_x) / scale;
  const float right = (bbox[0] + bbox[2] / 2.f - pad_x) / scale;
  const float top = (bbox[1] - bbox[3] / 2.f - pad_y) / scale;
  const float bottom = (bbox[1] + bbox[3] / 2.f - pad_y) / scale;

  return cv::Rect(round(left), round(top), round(right - left), round(bottom - top));
}

// Intersection-over-union of two boxes given as {center_x, center_y, width, height}.
// Returns 0 when the boxes do not overlap.
static float iou(float lbox[4], float rbox[4]) {
  // Edges of the intersection rectangle.
  const float left = (std::max)(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
  const float right = (std::min)(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
  const float top = (std::max)(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
  const float bottom = (std::min)(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);

  if (top > bottom || left > right) {
    return 0.0f;
  }

  const float inter_area = (right - left) * (bottom - top);
  const float union_area = lbox[2] * lbox[3] + rbox[2] * rbox[3] - inter_area;
  return inter_area / union_area;
}

// Ordering predicate for std::sort: true when `a` has strictly higher
// confidence than `b`, so sorted detections run from most to least confident.
static bool cmp(const Detection& a, const Detection& b) {
  return b.conf < a.conf;
}

void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh) {
  int det_size = sizeof(Detection) / sizeof(float);
  std::map<float, std::vector<Detection>> m;
  for (int i = 0; i < output[0] && i < kMaxNumOutputBbox; i++) {
    if (output[1 + det_size * i + 4] <= conf_thresh) continue;
    Detection det;
    memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
    if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Detection>());
    m[det.class_id].push_back(det);
  }
  for (auto it = m.begin(); it != m.end(); it++) {
    auto& dets = it->second;
    std::sort(dets.begin(), dets.end(), cmp);
    for (size_t m = 0; m < dets.size(); ++m) {
      auto& item = dets[m];
      res.push_back(item);
      for (size_t n = m + 1; n < dets.size(); ++n) {
        if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
          dets.erase(dets.begin() + n);
          --n;
        }
      }
    }
  }
}

// Runs nms() on every image of a batch.
//
// `output` holds `batch_size` consecutive per-image outputs of `output_size`
// floats each. `res_batch` is resized to `batch_size` and entry b receives the
// detections kept for image b.
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh) {
  res_batch.resize(batch_size);
  for (int b = 0; b < batch_size; ++b) {
    float* image_output = output + b * output_size;
    nms(res_batch[b], image_output, conf_thresh, nms_thresh);
  }
}

// Draws every detection of res_batch[i] onto img_batch[i]: a rectangle for the
// box and the integer class id as a text label just above its top-left corner.
// `res_batch` must have at least as many entries as `img_batch`.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
  const cv::Scalar box_color(0x27, 0xC1, 0x36);
  const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

  for (size_t idx = 0; idx < img_batch.size(); ++idx) {
    // Header copy only: pixel data is shared, so drawing modifies the batch image.
    cv::Mat img = img_batch[idx];
    for (auto& det : res_batch[idx]) {
      const cv::Rect box = get_rect(img, det.bbox);
      cv::rectangle(img, box, box_color, 2);
      const std::string label = std::to_string((int)det.class_id);
      cv::putText(img, label, cv::Point(box.x, box.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
    }
  }
}


================================================
FILE: yolov7/src/preprocess.cu
================================================
#include "preprocess.h"
#include "cuda_utils.h"

// Staging buffers shared by every preprocessing call in this file. They are
// allocated by cuda_preprocess_init() and released by cuda_preprocess_destroy().
// Pinned host memory (cudaMallocHost) the source image is copied into first.
static uint8_t* img_buffer_host = nullptr;
// Device memory (cudaMalloc) that receives the image from the host buffer.
static uint8_t* img_buffer_device = nullptr;

// Six coefficients of a 2x3 affine transform in row-major order
// (cuda_preprocess wraps `value` in a 2x3 CV_32F cv::Mat).
struct AffineMatrix{
  float value[6];
};

// Warps a packed 3-channel 8-bit image into a planar float image.
//
// Each thread handles one destination pixel (threads with index >= `edge`
// exit immediately). The destination coordinate is mapped into the source
// image with `d2s`, sampled bilinearly (taps outside the source use
// `const_value_st`), the first and third channels are swapped, values are
// divided by 255 and written to three consecutive planes of `dst`.
//
//   src            packed source pixels, 3 bytes per pixel
//   src_line_size  bytes per source row
//   dst            output, three planes of dst_width * dst_height floats
//   d2s            destination-to-source affine transform
__global__ void warpaffine_kernel(
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width,
    int dst_height, uint8_t const_value_st,
    AffineMatrix d2s, int edge) {
  const int pixel = blockDim.x * blockIdx.x + threadIdx.x;
  if (pixel >= edge) return;

  const int dx = pixel % dst_width;
  const int dy = pixel / dst_width;

  // Destination -> source coordinates.
  const float src_x = d2s.value[0] * dx + d2s.value[1] * dy + d2s.value[2] + 0.5f;
  const float src_y = d2s.value[3] * dx + d2s.value[4] * dy + d2s.value[5] + 0.5f;

  // Interpolated source channels 0, 1, 2.
  float ch0, ch1, ch2;

  const bool outside = src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height;
  if (outside) {
    ch0 = const_value_st;
    ch1 = const_value_st;
    ch2 = const_value_st;
  } else {
    const int y_low = floorf(src_y);
    const int x_low = floorf(src_x);
    const int y_high = y_low + 1;
    const int x_high = x_low + 1;

    // Stand-in pixel for taps that fall outside the source image.
    uint8_t fill[] = {const_value_st, const_value_st, const_value_st};

    const float ly = src_y - y_low;
    const float lx = src_x - x_low;
    const float hy = 1 - ly;
    const float hx = 1 - lx;
    const float w_tl = hy * hx;
    const float w_tr = hy * lx;
    const float w_bl = ly * hx;
    const float w_br = ly * lx;

    const bool top_ok = y_low >= 0;
    const bool bottom_ok = y_high < src_height;
    const bool left_ok = x_low >= 0;
    const bool right_ok = x_high < src_width;

    const uint8_t* tl = (top_ok && left_ok) ? src + y_low * src_line_size + x_low * 3 : fill;
    const uint8_t* tr = (top_ok && right_ok) ? src + y_low * src_line_size + x_high * 3 : fill;
    const uint8_t* bl = (bottom_ok && left_ok) ? src + y_high * src_line_size + x_low * 3 : fill;
    const uint8_t* br = (bottom_ok && right_ok) ? src + y_high * src_line_size + x_high * 3 : fill;

    ch0 = w_tl * tl[0] + w_tr * tr[0] + w_bl * bl[0] + w_br * br[0];
    ch1 = w_tl * tl[1] + w_tr * tr[1] + w_bl * bl[1] + w_br * br[1];
    ch2 = w_tl * tl[2] + w_tr * tr[2] + w_bl * bl[2] + w_br * br[2];
  }

  // Planar output with the channel order reversed (bgr -> rgb) and values
  // normalized by 255: plane 0 takes source channel 2, plane 2 takes channel 0.
  const int area = dst_width * dst_height;
  float* plane0 = dst + dy * dst_width + dx;
  float* plane1 = plane0 + area;
  float* plane2 = plane1 + area;
  *plane0 = ch2 / 255.0f;
  *plane1 = ch1 / 255.0f;
  *plane2 = ch0 / 255.0f;
}

// Uploads one packed 3-channel 8-bit image and launches warpaffine_kernel to
// produce the scaled, centered, normalized planar float image in `dst`.
//
//   src                     host pointer to src_width * src_height * 3 bytes
//   dst                     device pointer receiving 3 * dst_width * dst_height floats
//   stream                  CUDA stream used for the upload and the kernel
//
// The image is staged through the file-level pinned and device buffers, so
// cuda_preprocess_init() must have been called with a size that covers this
// image. The work is asynchronous on `stream`; because the staging buffers are
// shared, the caller must synchronize before the next call reuses them.
void cuda_preprocess(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    cudaStream_t stream) {
  int img_size = src_width * src_height * 3;
  // copy data to pinned memory
  memcpy(img_buffer_host, src, img_size);
  // copy data to device memory
  CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

  // Source-to-destination transform: uniform scale that fits the image inside
  // the destination, plus a translation that centers it.
  AffineMatrix s2d, d2s;
  float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

  s2d.value[0] = scale;
  s2d.value[1] = 0;
  s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
  s2d.value[3] = 0;
  s2d.value[4] = scale;
  s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
  cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
  cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
  cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

  // m2x3_d2s wraps d2s.value, so the inverse is normally already in place.
  // Copy only if OpenCV reallocated the output; an unconditional memcpy would
  // copy the buffer onto itself, which is undefined behaviour.
  if (m2x3_d2s.ptr<float>(0) != d2s.value) {
    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
  }

  // One thread per destination pixel, rounded up to whole blocks.
  int jobs = dst_height * dst_width;
  int threads = 256;
  int blocks = (jobs + threads - 1) / threads;
  warpaffine_kernel<<<blocks, threads, 0, stream>>>(
      img_buffer_device, src_width * 3, src_width,
      src_height, dst, dst_width,
      dst_height, 128, d2s, jobs);
}


// Preprocesses a batch of images one after another.
//
// Image i is written to `dst` at float offset i * (dst_width * dst_height * 3).
// The stream is synchronized after each image because cuda_preprocess() reuses
// the same staging buffers for every call.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
                           float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
  const int image_elems = dst_width * dst_height * 3;
  size_t index = 0;
  for (cv::Mat& img : img_batch) {
    float* slot = &dst[image_elems * index];
    cuda_preprocess(img.ptr(), img.cols, img.rows, slot, dst_width, dst_height, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ++index;
  }
}

// Allocates the staging buffers used by cuda_preprocess().
// `max_image_size` is a pixel count; each buffer holds max_image_size * 3 bytes.
void cuda_preprocess_init(int max_image_size) {
  const int buffer_bytes = max_image_size * 3;
  // pinned host memory for the raw input image
  CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, buffer_bytes));
  // device memory for the raw input image
  CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, buffer_bytes));
}

// Releases the staging buffers allocated by cuda_preprocess_init().
// Each pointer is reset to nullptr after it is freed, so calling this twice,
// or without a prior init, does not hand a stale pointer to the CUDA runtime.
void cuda_preprocess_destroy() {
  if (img_buffer_device != nullptr) {
    CUDA_CHECK(cudaFree(img_buffer_device));
    img_buffer_device = nullptr;
  }
  if (img_buffer_host != nullptr) {
    CUDA_CHECK(cudaFreeHost(img_buffer_host));
    img_buffer_host = nullptr;
  }
}


================================================
FILE: yolov7/yolov7_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory to walk
    return:
        a list of batches, each a list of file paths; the last batch may
        hold fewer than batch_size paths, and the list is empty when no
        file is found
    """
    batches = []
    current = []
    all_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )
    for path in all_paths:
        # Close the running batch once it is full, before adding the next path.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if len(current) > 0:
        batches.append(current)
    return batches

def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag
                 above its top-left corner, onto img in place.
    param:
        x:      a box like [x1,y1,x2,y2]
        img:    an opencv image object
        color:  color to draw the rectangle, such as (0,255,0);
                a random color is picked when omitted
        label:  str, text for the tag; no tag is drawn when empty or None
        line_thickness: int; derived from the image size when omitted
    return:
        no return
    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    box_color = color or [random.randint(0, 255) for _ in range(3)]
    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, box_color, thickness=thickness, lineType=cv2.LINE_AA)
    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # Opposite corner of the filled tag that sits on top of the box.
    tag_corner = (top_left[0] + text_size[0], top_left[1] - text_size[1] - 3)
    cv2.rectangle(img, top_left, tag_corner, box_color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov7TRT(object):
    """
    description: A YOLOv7 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the engine
                     and allocate one host/device buffer pair per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dims of the input binding are (height, width).
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine, then decode
                     the detections and draw them on the original images.
        param:
            raw_image_generator: iterable of BGR images; it may yield fewer
                                 than batch_size images (e.g. the last batch)
        return:
            batch_image_raw: list of the original images with boxes drawn
            use_time:        seconds spent on host->device copy, inference and
                             device->host copy
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that slots left unused by a partial batch hold
            # defined values instead of uninitialized memory.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in `finally` so a failed inference does not leave it pushed.
            self.ctx.pop()
        output = host_outputs[0]
        # Length of one image's slice of the output: 1 box count followed by
        # 6 floats per box (matches the (-1, 6) reshape in post_process).
        # NOTE(review): 6001 presumably means at most 1000 boxes - confirm
        # against the plugin configuration.
        det_len = 6001
        # Do postprocess, only for the images actually supplied: a partial
        # batch has fewer entries than batch_size.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * det_len: (i + 1) * det_len], batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context that __init__ created.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read images one by one from a batch of image paths
        param:
            image_path_batch: iterable of str, image paths
        return:
            generator yielding the result of cv2.imread for each path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        param:
            image_path_batch: unused
        return:
            generator yielding batch_size all-zero uint8 images of shape
            (input_h, input_w, 3)
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: a BGR image array of shape (h, w, c)
        return:
            image:  the processed image, float32 with shape (1, 3, input_h, input_w)
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right,
                        and map them from network-input coordinates back to the original image
                        (removes the padding offset, then divides by the resize ratio)
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Padding was added above and below the image.
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Padding was added left and right of the image.
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A numpy like [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, 6))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two sets of bounding boxes
        param:
            box1: A 2-D boxes numpy, rows are (x1, y1, x2, y2) or (x, y, w, h)
            box2: A 2-D boxes numpy, rows are (x1, y1, x2, y2) or (x, y, w, h)
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou, one value per (broadcast) row
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (widths/heights are counted inclusively, hence +1)
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon keeps the division defined for a zero denominator.
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, each row is (center_x, center_y, w, h, conf, cls_id)
                        in network-input coordinates
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, each row is (x1, y1, x2, y2, conf, cls_id) in
                   original-image coordinates; an empty array when nothing is kept
        """
        # Get the boxes that score >= conf_thres (boolean indexing returns a copy)
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w -1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w -1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h -1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h -1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            # (this includes boxes[0] itself, so the loop always makes progress)
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and
                 writes each annotated image into the 'output' directory
                 under its original file name.
    """

    def __init__(self, yolov7_wrapper, image_path_batch):
        super().__init__()
        self.yolov7_wrapper = yolov7_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov7_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(raw_images)
        for index, img_path in enumerate(self.image_path_batch):
            filename = os.path.basename(img_path)
            save_name = os.path.join('output', filename)
            # Save image
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference pass on the wrapper's
                 all-zero warm-up images and prints the elapsed time.
    """

    def __init__(self, yolov7_wrapper):
        super().__init__()
        self.yolov7_wrapper = yolov7_wrapper

    def run(self):
        wrapper = self.yolov7_wrapper
        dummy_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(dummy_images)
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Usage: python yolov7_trt.py [engine_file_path] [plugin_library]
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "yolov7.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "build/libmyplugins.so"

    # The custom plugin library must be loaded before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # coco labels, indexed by class id (read as a module global by YoLov7TRT.infer)
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
        "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
        "toothbrush",
    ]

    # Start from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov7TRT instance
    yolov7_wrapper = YoLov7TRT(engine_file_path)
    try:
        print('batch size is', yolov7_wrapper.batch_size)

        image_dir = "samples/"
        image_path_batches = get_img_path_batches(yolov7_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread, run one at a time.
        for _ in range(10):
            warm_up = warmUpThread(yolov7_wrapper)
            warm_up.start()
            warm_up.join()

        # One inference thread per batch, also run one at a time.
        for path_batch in image_path_batches:
            worker = inferThread(yolov7_wrapper, path_batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov7_wrapper.destroy()


================================================
FILE: yolov8/CMakeLists.txt
================================================
# Build script for the yolov8 TensorRT samples: one plugin library (myplugins)
# and one executable per task (det / seg / pose / cls / 5u_det / obb).
cmake_minimum_required(VERSION 3.10)

project(yolov8)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
# Default to a Debug build, but honour -DCMAKE_BUILD_TYPE=... from the command line.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

# Default nvcc location; override with -DCMAKE_CUDA_COMPILER=... if CUDA lives elsewhere.
if(NOT CMAKE_CUDA_COMPILER)
  set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
endif()
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message("embed_platform on")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
  message("embed_platform off")
  # cuda
  include_directories(/usr/local/cuda/include)
  link_directories(/usr/local/cuda/lib64)

  # tensorrt
  include_directories(/home/lindsay/TensorRT-8.6.1.6/include)
  link_directories(/home/lindsay/TensorRT-8.6.1.6/lib)
  #  include_directories(/home/lindsay/TensorRT-7.2.3.4/include)
  #  link_directories(/home/lindsay/TensorRT-7.2.3.4/lib)


endif()

# Plugin library holding the custom YOLO layer.
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# Every executable below links OpenCV, so fail at configure time if it is missing
# instead of failing later at compile or link time.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})


# Sources shared by all executables.
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
add_executable(yolov8_det ${PROJECT_SOURCE_DIR}/yolov8_det.cpp ${SRCS})

target_link_libraries(yolov8_det nvinfer)
target_link_libraries(yolov8_det cudart)
target_link_libraries(yolov8_det myplugins)
target_link_libraries(yolov8_det ${OpenCV_LIBS})

add_executable(yolov8_seg ${PROJECT_SOURCE_DIR}/yolov8_seg.cpp ${SRCS})
target_link_libraries(yolov8_seg nvinfer cudart myplugins ${OpenCV_LIBS})


add_executable(yolov8_pose ${PROJECT_SOURCE_DIR}/yolov8_pose.cpp ${SRCS})
target_link_libraries(yolov8_pose nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(yolov8_cls ${PROJECT_SOURCE_DIR}/yolov8_cls.cpp ${SRCS})
target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(yolov8_5u_det ${PROJECT_SOURCE_DIR}/yolov8_5u_det.cpp ${SRCS})
target_link_libraries(yolov8_5u_det nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(yolov8_obb ${PROJECT_SOURCE_DIR}/yolov8_obb.cpp ${SRCS})
target_link_libraries(yolov8_obb nvinfer cudart myplugins ${OpenCV_LIBS})


================================================
FILE: yolov8/README.md
================================================
# YOLOv8

The PyTorch implementation is [ultralytics/yolov8](https://github.com/ultralytics/ultralytics/tree/main/ultralytics).

The TensorRT code is derived from [xiaocao-tian/yolov8_tensorrt](https://github.com/xiaocao-tian/yolov8_tensorrt).

## Contributors

<a href="https://github.com/xiaocao-tian"><img src="https://avatars.githubusercontent.com/u/65889782?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/lindsayshuo"><img src="https://avatars.githubusercontent.com/u/45239466?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/xinsuinizhuan"><img src="https://avatars.githubusercontent.com/u/40679769?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/Rex-LK"><img src="https://avatars.githubusercontent.com/u/74702576?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/emptysoal"><img src="https://avatars.githubusercontent.com/u/57931586?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/ChangjunDAI"><img src="https://avatars.githubusercontent.com/u/65420228?s=48&v=4" width="40px;" alt=""/></a>

## Requirements

- TensorRT 8.0+
- OpenCV 3.4.0+
- ultralytics<=8.2.103

## Different versions of yolov8

Currently, we support yolov8.

- For yolov8, download the .pt file from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow the how-to-run steps on this page.

## Config

- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments.
- Check more configs in [include/config.h](./include/config.h)

## How to Run, yolov8n as example

1. generate .wts from pytorch with .pt, or download .wts from model zoo

```
// download https://github.com/ultralytics/assets/releases/yolov8n.pt
// download https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model)
cp {tensorrtx}/yolov8/gen_wts.py {ultralytics}/ultralytics
cd {ultralytics}/ultralytics
python gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect
// a file 'yolov8n.wts' will be generated.


// For p2 model
// download https://github.com/lindsayshuo/yolov8_p2_tensorrtx/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model)
cd {ultralytics}/ultralytics
python gen_wts.py -w VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt -o VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts -t detect  // only for 10 cls p2 model
// a file 'VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts' will be generated.

// For yolov8_5u_det model
// download https://github.com/ultralytics/assets/releases/yolov5nu.pt
cd {ultralytics}/ultralytics
python gen_wts.py -w yolov5nu.pt -o yolov5nu.wts -t detect
// a file 'yolov5nu.wts' will be generated.

```

2. build tensorrtx/yolov8 and run

### Detection
```
cd {tensorrtx}/yolov8/
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8n.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to plan file
sudo ./yolov8_det -d [.engine] [image folder]  [c/g] // deserialize and run inference, the images in [image folder] will be processed.

// For example yolov8n
sudo ./yolov8_det -s yolov8n.wts yolov8n.engine n
sudo ./yolov8_det -d yolov8n.engine ../images c //cpu postprocess
sudo ./yolov8_det -d yolov8n.engine ../images g //gpu postprocess


// For p2 model:
// change the  "const static int kNumClass" in config.h to 10;
sudo ./yolov8_det -s VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine x2
wget https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/0000008_01999_d_0000040.jpg
cp -r 0000008_01999_d_0000040.jpg ../images
sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images c //cpu postprocess
sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images g //gpu postprocess

// For yolov8_5u_det(YOLOv5u with the anchor-free, objectness-free split head structure based on YOLOv8 features) model:
sudo ./yolov8_5u_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6]
sudo ./yolov8_5u_det -d yolov5xu.engine ../images c //cpu postprocess
sudo ./yolov8_5u_det -d yolov5xu.engine ../images g //gpu postprocess
```

### Instance Segmentation
```
# Build and serialize TensorRT engine
./yolov8_seg -s yolov8s-seg.wts yolov8s-seg.engine s

# Download the labels file
wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt

# Run inference with labels file
./yolov8_seg -d yolov8s-seg.engine ../images c coco.txt
```

### Classification
```
cd {tensorrtx}/yolov8/
// Download inference images
wget  https://github.com/lindsayshuo/infer_pic/releases/download/pics/1709970363.6990473rescls.jpg
mkdir samples
cp -r  1709970363.6990473rescls.jpg samples
// Download ImageNet labels
wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt

// update kClsNumClass in config.h if your model is trained on custom dataset
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8n-cls.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file
sudo ./yolov8_cls -d [.engine] [image folder]  // deserialize and run inference, the images in [image folder] will be processed.

// For example yolov8n
sudo ./yolov8_cls -s yolov8n-cls.wts yolov8n-cls.engine n
sudo ./yolov8_cls -d yolov8n-cls.engine ../samples
```


### Pose Estimation
```
cd {tensorrtx}/yolov8/
// update "kPoseNumClass = 1" in config.h
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8-pose.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to plan file
sudo ./yolov8_pose -d [.engine] [image folder]  [c/g] // deserialize and run inference, the images in [image folder] will be processed.

// For example yolov8-pose
sudo ./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n
sudo ./yolov8_pose -d yolov8n-pose.engine ../images c //cpu postprocess
sudo ./yolov8_pose -d yolov8n-pose.engine ../images g //gpu postprocess
```


### Oriented Bounding Boxes (OBB) Estimation
```
cd {tensorrtx}/yolov8/
// update "kObbNumClass = 15" "kInputH = 1024" "kInputW = 1024" in config.h
wget https://github.com/lindsayshuo/infer_pic/releases/download/pics/obb.png
mkdir images
mv obb.png ./images
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8-obb.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_obb -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to plan file
sudo ./yolov8_obb -d [.engine] [image folder]  [c/g] // deserialize and run inference, the images in [image folder] will be processed.

// For example yolov8-obb
sudo ./yolov8_obb -s yolov8n-obb.wts yolov8n-obb.engine n
sudo ./yolov8_obb -d yolov8n-obb.engine ../images c //cpu postprocess
sudo ./yolov8_obb -d yolov8n-obb.engine ../images g //gpu postprocess
```


3. optional, load and run the tensorrt model in python

```
// install python-tensorrt, pycuda, etc.
// ensure the yolov8n.engine and libmyplugins.so have been built
python yolov8_det_trt.py  # Detection
python yolov8_seg_trt.py  # Segmentation
python yolov8_cls_trt.py  # Classification
python yolov8_pose_trt.py  # Pose Estimation
python yolov8_5u_det_trt.py  # yolov8_5u_det(YOLOv5u with the anchor-free, objectness-free split head structure based on YOLOv8 features) model
python yolov8_obb_trt.py  # Oriented Bounding Boxes (OBB) Estimation
```

# INT8 Quantization

1. Prepare calibration images: randomly select about 1000 images from your training set. For coco, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh

2. unzip it in yolov8/build

3. set the macro `USE_INT8` in config.h, change `kInputQuantizationFolder` into your image folder path and make

4. serialize the model and test

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg" height="360px;">
</p>

## More Information

See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: yolov8/gen_wts.py
================================================
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse the command line and resolve the input and output paths.

    Arguments are read from ``sys.argv`` through ``argparse``.

    Returns:
        tuple: ``(weights, output, type)`` where ``weights`` is the path given
        with ``-w``, ``output`` is the resolved ``.wts`` path and ``type`` is
        one of ``detect``, ``cls``, ``seg``, ``pose`` or ``obb``.

        ``output`` is resolved as follows:
          * not given      -> the weights path with its extension replaced by ``.wts``
          * a directory    -> ``<directory>/<weights basename>.wts``
          * anything else  -> used unchanged

    Raises:
        SystemExit: if the weights path is not an existing file (argparse also
        exits this way on invalid or missing arguments).
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'],
        help='model task type: detect, cls, seg, pose or obb (default: detect)')
    args = parser.parse_args()

    if not os.path.isfile(args.weights):
        # Name the offending path so the user can see what was actually looked up.
        raise SystemExit(f'Invalid input file: {args.weights}')

    if not args.output:
        # Keep the weights' directory, only swap the extension.
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        # A directory was given: place "<basename>.wts" inside it.
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output, args.type


pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')
print(f'Loading {pt_file}')

# All tensors are mapped to and kept on the CPU for the export.
device = 'cpu'

# The loaded object is indexed like a dict; the 'ema' entry is preferred when
# it is present and truthy, otherwise 'model' is used. The chosen entry is
# converted to FP32.
model = torch.load(pt_file, map_location=device, weights_only=False)
model = model['ema' if model.get('ema') else 'model'].float()

if m_type in ['detect', 'seg', 'pose', 'obb']:
    head = model.model[-1]
    # anchor_grid is computed but not referenced again in this script.
    anchor_grid = head.anchors * head.stride[..., None, None]

    # Drop the 'anchors' attribute from the last module before exporting.
    delattr(head, 'anchors')

model.to(device).eval()

# Output format, one entry per line after a leading count line:
#   <number of state_dict entries>
#   <key> <element count> <space-separated big-endian float32 values as hex>
with open(wts_file, 'w') as f:
    state = model.state_dict()
    f.write('{}\n'.format(len(state)))
    for name, tensor in state.items():
        flat = tensor.reshape(-1).cpu().numpy()
        # Every value is preceded by a single space, so the line carries two
        # spaces between the element count and the first value.
        hex_words = ''.join(' ' + struct.pack('>f', float(value)).hex() for value in flat)
        f.write('{} {} {}\n'.format(name, len(flat), hex_words))


================================================
FILE: yolov8/include/block.h
================================================
#pragma once
#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"

// Declarations of the network building helpers; the definitions are not part
// of this header.

// NOTE(review): presumably derives a padding value from the kernel size
// `ksize` — confirm against the definition.
int calculateP(int ksize);

// Returns a name -> nvinfer1::Weights map.
// NOTE(review): `file` is presumably the path of a .wts file such as the one
// written by gen_wts.py — confirm against the definition. Note that `file` is
// taken by value.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

// The builders below all take the network being defined, a weight map, an
// input tensor and a layer name `lname`, and return a pointer to a TensorRT
// layer. Except for C2, `weightMap` is passed BY VALUE, i.e. the whole map is
// copied on every call.
// NOTE(review): `ch`/`c1`/`c2` look like channel counts, `k`/`s`/`p` like
// kernel size/stride/padding, `n` a repeat count and `e` an expansion ratio —
// confirm against the definitions.

// NOTE(review): by its name, a convolution + batch-norm + SiLU unit.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, int k, int s, int p, std::string lname);

nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                 int c2, int n, bool shortcut, float e, std::string lname);

// Unlike its siblings, C2 takes `weightMap` by non-const reference.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname);

nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname);

nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname);

// Returns a shuffle layer.
// NOTE(review): the meaning of `grid` is not visible here — confirm against
// the definition.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);

// Returns a plugin (IPluginV2) layer built from the concatenation layers in
// `dets`. `px_arry` points to `px_arry_num` ints; the three flags select the
// segmentation / pose / obb variants.
// NOTE(review): the contents of `px_arry` (strides? sizes?) are not visible
// here — confirm against the definition.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, int num_class, bool is_segmentation, bool is_pose, bool is_obb);


================================================
FILE: yolov8/include/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
//! Only the declaration lives in this header; the member functions are defined
//! elsewhere. The four public methods override the virtual interface of
//! nvinfer1::IInt8EntropyCalibrator2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
   public:
    //! \param batchsize        calibration batch size
    //! \param input_w          input width
    //! \param input_h          input height
    //! \param img_dir          directory holding the calibration images
    //! \param calib_table_name name of the calibration table (cache)
    //! \param input_blob_name  name of the network input
    //! \param read_cache       defaults to true; NOTE(review): presumably enables reuse of an
    //!                         existing calibration table — confirm against the definition
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
                           const char* input_blob_name, bool read_cache = true);
    virtual ~Int8EntropyCalibrator2();
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

   private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;  // NOTE(review): presumably the index of the next image to feed — confirm
    std::string img_dir_;
    std::vector<std::string> img_files_;
    size_t input_count_;  // NOTE(review): presumably the element count of one input batch — confirm
    std::string calib_table_name_;
    // Stored as a raw pointer, not a copy of the string.
    // NOTE(review): if the constructor keeps the caller's pointer, the caller must keep the
    // string alive for the calibrator's lifetime — confirm against the definition.
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;  // NOTE(review): presumably a device (GPU) buffer owned by this object — confirm
    std::vector<char> calib_cache_;
};

#endif  // ENTROPY_CALIBRATOR_H


================================================
FILE: yolov8/include/config.h
================================================
// Build-time configuration shared by the yolov8 programs.
//
// This header defines variables, so it is protected by an include guard:
// without one, a second inclusion in the same translation unit would be a
// redefinition error.
#ifndef TRTX_YOLOV8_CONFIG_H_
#define TRTX_YOLOV8_CONFIG_H_

// Precision selection: keep exactly one of these macros enabled.
#define USE_FP16
//#define USE_FP32
//#define USE_INT8

// Names of the network input and output tensors
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";
// Number of classes of the detection model
const static int kNumClass = 80;
const static int kBatchSize = 1;
const static int kGpuId = 0;
// Network input height and width
const static int kInputH = 640;
const static int kInputW = 640;
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 1000;
// Quantization input image folder path
const static char* kInputQuantizationFolder = "./coco_calib";

// Classification model's number of classes
constexpr static int kClsNumClass = 1000;
// Classification model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

// pose model's number of classes
constexpr static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;  // number of keypoints total

// obb model's number of classes
constexpr static int kObbNumClass = 15;

#endif  // TRTX_YOLOV8_CONFIG_H_


================================================
FILE: yolov8/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cstdlib>
#include <cuda_runtime_api.h>
#include <iostream>

#ifndef CUDA_CHECK
// CUDA_CHECK(call): evaluates `call` exactly once and, if the result is not
// cudaSuccess, prints the numeric error code, its cudaGetErrorString() text
// and the source location to std::cerr, then terminates with std::abort().
//
// std::abort() is used instead of assert(0) because assert() compiles to
// nothing when NDEBUG is defined, which would let release builds continue
// silently after a failed CUDA call.
//
// The expansion stays a plain brace block (not do { } while (0)) so that
// existing call sites written without a trailing semicolon keep compiling.
// Do not use it as the unbraced body of an `if` that has an `else`.
#define CUDA_CHECK(callstr)                                                                      \
    {                                                                                            \
        cudaError_t error_code = (callstr);                                                      \
        if (error_code != cudaSuccess) {                                                         \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code)   \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                    \
            std::abort();                                                                        \
        }                                                                                        \
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: yolov8/include/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "NvInferRuntimeCommon.h"
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//! String buffer that forwards its contents, prefixed with a timestamp and a
//! severity prefix, to an output stream whenever it is synchronised (for
//! example by std::endl or std::flush) and once more on destruction if
//! unflushed text remains.
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    destination stream; held by reference, so it must outlive this buffer
    //! \param prefix    text inserted between the timestamp and every flushed message
    //! \param shouldLog when false, flushes produce no output
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Carries over the destination, prefix and enable flag of \p other.
    //! Text still pending in \p other is not transferred. All three members
    //! must be initialised here: leaving mShouldLog unset would make
    //! putOutput() read an indeterminate value.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // pbase() is the start of the buffered output, pptr() the current write
        // position; they differ exactly when unflushed text is pending.
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    //! Flushes the buffered text to the destination stream; always returns 0 (success).
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the destination
    //! stream, empties the buffer and flushes the stream. Does nothing (and
    //! keeps the buffered text) when logging is disabled.
    void putOutput() {
        if (!mShouldLog) {
            return;
        }
        // Build the timestamp in a scratch stream so the fill character of the
        // destination stream (and of std::cout) is left untouched.
        std::ostringstream stamp;
        std::time_t timestamp = std::time(nullptr);
        // NOTE: std::localtime uses shared static storage and is not thread-safe.
        const std::tm* tm_local = std::localtime(&timestamp);
        if (tm_local != nullptr) {
            stamp << "[" << std::setfill('0');
            stamp << std::setw(2) << 1 + tm_local->tm_mon << "/";
            stamp << std::setw(2) << tm_local->tm_mday << "/";
            stamp << std::setw(4) << 1900 + tm_local->tm_year << "-";
            stamp << std::setw(2) << tm_local->tm_hour << ":";
            stamp << std::setw(2) << tm_local->tm_min << ":";
            stamp << std::setw(2) << tm_local->tm_sec << "] ";
        }
        // The timestamp goes to the same stream as the message so the two are
        // never split between stdout and stderr.
        mOutput << stamp.str() << mPrefix << str();
        // set the buffer to empty
        str("");
        // flush the stream
        mOutput.flush();
    }

    //! Enables or disables output for subsequent flushes.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer used by LogStreamConsumer.
//!  Deriving from this class before std::ostream guarantees that the buffer is
//!  constructed before its address is handed to the std::ostream base.
//!
class LogStreamConsumerBase {
   public:
    //! Forwards all three arguments unchanged to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog} {}

   protected:
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief std::ostream front end that lets messages be logged with C++ stream syntax.
//!  The base classes must stay in the order LogStreamConsumerBase, std::ostream:
//!  the first base owns the LogStreamConsumerBuffer, whose address is passed to the
//!  std::ostream base, so the buffer has to be constructed first.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a consumer for messages of level \p severity.
    //!  Output is enabled only when severity <= reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer),  // attach the stream to the buffer owned by the first base
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Builds a fresh buffer configured from the severity and enable flag of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer),  // attach the stream to the buffer owned by the first base
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer emits output for the new threshold.
    void setReportableSeverity(Severity reportableSeverity) {
        const bool enabled = mSeverity <= reportableSeverity;
        mShouldLog = enabled;
        mBuffer.setShouldLog(enabled);
    }

   private:
    //! Severities comparing >= kINFO go to std::cout, all others to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //! Maps a severity to its bracketed one-letter tag.
    static std::string severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                // Unknown severity: trap in debug builds, empty prefix otherwise.
                assert(0);
                break;
        }
        return tag;
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Console logger for TensorRT tools and samples
//!
//! \details Two kinds of messages are supported:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! Routing all sample output through this class keeps the verbosity control and the formatting
//! in a single place instead of scattering direct writes to stdout/stderr.
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the
//! nvinfer1::ILogger interface, so messages coming from the TensorRT library and messages coming
//! from the sample are not cleanly separated. Once all samples use Logger::getTRTLogger() to access
//! the ILogger, the inheritance can be replaced by an nvinfer1::ILogger member.
//!
class Logger : public nvinfer1::ILogger {
   public:
    //! \param severity Initial reporting threshold (defaults to kWARNING)
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief State of a test as reported by reportTestStart()/reportTestEnd()
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible accessor for the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger (currently the object itself)
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] <msg>" followed by a newline through a LogStreamConsumer built from the
    //! current threshold and the message severity. Samples should not call this directly.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer consumer(mReportableSeverity, severity);
        consumer << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Sets the verbosity threshold
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! Obtain one from Logger::defineTest() and pass it to Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() that joins an argument array into the command line
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED and returns EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED and returns EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED and returns EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Dispatches to reportPass() or reportFail() depending on \p pass
    static int reportTest(const TestAtom& testAtom, bool pass) {
        if (pass) {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current verbosity threshold
    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns the bracketed one-letter tag used to prefix a message of the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        const char* tag = "";
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                tag = "[F] ";
                break;
            case Severity::kERROR:
                tag = "[E] ";
                break;
            case Severity::kWARNING:
                tag = "[W] ";
                break;
            case Severity::kINFO:
                tag = "[I] ";
                break;
            case Severity::kVERBOSE:
                tag = "[V] ";
                break;
            default:
                assert(0);
                break;
        }
        return tag;
    }

    //!
    //! \brief returns the word printed for the given test result
    //!
    static const char* testResultString(TestResult result) {
        const char* word = "";
        switch (result) {
            case TestResult::kRUNNING:
                word = "RUNNING";
                break;
            case TestResult::kPASSED:
                word = "PASSED";
                break;
            case TestResult::kFAILED:
                word = "FAILED";
                break;
            case TestResult::kWAIVED:
                word = "WAIVED";
                break;
            default:
                assert(0);
                break;
        }
        return word;
    }

    //!
    //! \brief returns the output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        if (severity >= Severity::kINFO) {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief prints one "&&&& <RESULT> <name> # <cmdline>" line to the kINFO stream
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief joins the given (argc, argv) values into one space-separated command line string
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream joined;
        if (argc > 0) {
            joined << argv[0];
        }
        for (int i = 1; i < argc; ++i) {
            joined << " " << argv[i];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace {

// Each helper below returns a LogStreamConsumer for one fixed severity, using
// the logger's current reportable severity as the threshold. Stream a message
// into the returned object, e.g.
//
//     LOG_INFO(logger) << "hello world" << std::endl;

//!
//! \brief produces a LogStreamConsumer for messages of severity kVERBOSE
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer for messages of severity kINFO
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer for messages of severity kWARNING
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer for messages of severity kERROR
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H


================================================
FILE: yolov8/include/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

// API: symbol visibility / DLL linkage decoration.
//   building the library (API_EXPORTS defined): dllexport on MSVC, default visibility elsewhere
//   using the library (API_EXPORTS undefined):  dllimport on MSVC, nothing elsewhere
#if defined(API_EXPORTS) && defined(_MSC_VER)
#define API __declspec(dllexport)
#elif defined(API_EXPORTS)
#define API __attribute__((visibility("default")))
#elif defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif  // API_EXPORTS / _MSC_VER

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` when
// NV_TENSORRT_MAJOR is 8 or newer and to nothing for older versions.
#if NV_TENSORRT_MAJOR < 8
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#else
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#endif

#endif  // __MACROS_H


================================================
FILE: yolov8/include/model.h
================================================
#pragma once
#include <assert.h>
#include <string>
#include "NvInfer.h"

// Engine builders for the YOLOv8 model variants. Only the declarations live here; the
// definitions are in another translation unit.
//
// All builders share the same shape:
//   builder, config : TensorRT builder and builder configuration supplied by the caller.
//   dt              : TensorRT data type passed through to the implementation.
//   wts_path        : path to the weight file to load.
//   gd, gw          : passed by non-const reference. NOTE(review): presumably the depth and
//                     width multipliers of the model scale — confirm against the definitions.
//   max_channels    : passed by non-const reference (absent for the classification builder).
// Each returns an nvinfer1::IHostMemory*. NOTE(review): presumably the serialized engine, owned
// by the caller — confirm against the definitions.

// Detection model.
nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels);

// Detection model, P6 variant.
nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels);

// Detection model, P2 variant.
nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels);

// Classification model (no max_channels parameter).
nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw);

// Segmentation model.
nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels);

// Pose model.
nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels);

// Pose model, P6 variant.
nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                               int& max_channels);

// "5u" detection model.
nvinfer1::IHostMemory* buildEngineYolov8_5uDet(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                               int& max_channels);

// "5u" detection model, P6 variant.
nvinfer1::IHostMemory* buildEngineYolov8_5uDetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                                 nvinfer1::DataType dt, const std::string& wts_path, float& gd,
                                                 float& gw, int& max_channels);

// Oriented-bounding-box (OBB) model.
nvinfer1::IHostMemory* buildEngineYolov8Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels);


================================================
FILE: yolov8/include/postprocess.h
================================================
#pragma once

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// NOTE(review): this header names std::vector, std::unordered_map, std::string and cudaStream_t
// but only includes opencv, NvInfer.h and types.h directly, so it relies on those headers to
// provide them — confirm, or include the standard headers explicitly.

// Preprocessing functions
// Declared only. NOTE(review): presumably maps a box given in network-input coordinates onto
// `img` — confirm against the definition.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Processing functions
// These consume a host-side float buffer (`decode_ptr_host`) and fill Detection vectors.
// `bbox_element` is the number of floats per decoded box; the `_obb` variants are the
// oriented-bounding-box counterparts. Definitions are not in this header.
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch);
void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch);
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count);
void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count);

// NMS functions
// Host-side non-maximum suppression over a raw network output buffer. `nms_thresh` defaults to
// 0.5 in every overload; `conf_thresh` has no default. The batch variants additionally take the
// batch size and the per-image output size.
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);
void batch_nms(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh = 0.5);
void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);
void batch_nms_obb(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh = 0.5);

// CUDA-related functions
// Decode and NMS entry points that take a cudaStream_t. NOTE(review): `predict` and `parray`
// are presumably device pointers and the work is presumably enqueued on `stream` — confirm
// against the definitions.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream);
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

// Drawing functions
// Take the image batch (or a single image) by non-const reference together with the
// detections to render.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map);


================================================
FILE: yolov8/include/preprocess.h
================================================
#pragma once

#include <map>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Set-up for the CUDA preprocessing path; takes the maximum image size it must handle.
// NOTE(review): presumably allocates staging buffers that cuda_preprocess() reuses — confirm
// against the definition.
void cuda_preprocess_init(int max_image_size);

// Counterpart of cuda_preprocess_init().
void cuda_preprocess_destroy();

// Preprocesses one image: `src` (src_width x src_height, 8-bit data) into the float buffer
// `dst` (dst_width x dst_height), using `stream`.
// NOTE(review): channel layout, normalization and whether `src`/`dst` are host or device
// pointers are not visible from this header — confirm against the definition.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);

// Batch form taking the images as cv::Mat and one destination buffer for the whole batch.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);


================================================
FILE: yolov8/include/types.h
================================================
#pragma once
#include "config.h"

// One detection record. Every member is a float and the struct is aligned as float; the
// YoloLayer plugin kernel writes these records directly into its float output buffer, so the
// member order and sizes are part of the plugin's output format.
struct alignas(float) Detection {
    // Four box values, originally documented as: center_x center_y w h
    // NOTE(review): the YoloLayer kernel (CalDetection) stores left/top/right/bottom for
    // non-OBB models and center_x/center_y/w/h for OBB models — confirm what consumers expect.
    float bbox[4];
    float conf;  // bbox_conf * cls_conf
    float class_id;  // class index, stored as a float
    float mask[32];  // 32 per-detection mask values (written only for segmentation models)
    float keypoints[kNumberOfPoints * 3];  // 3 floats per keypoint; size is fixed at compile time by kNumberOfPoints
    float angle;                           // obb angle
};

// Six floats of an affine transform.
// NOTE(review): presumably a row-major 2x3 matrix — confirm against the code that fills it.
struct AffineMatrix {
    float value[6];
};

// Number of floats per decoded box. The value is derived as (floats in AffineMatrix) + 1,
// i.e. 6 + 1 = 7, which matches the seven fields listed below.
// NOTE(review): the dependency on sizeof(AffineMatrix) looks incidental rather than semantic —
// confirm before changing either definition.
const int bbox_element =
        sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag


================================================
FILE: yolov8/include/utils.h
================================================
#pragma once
#include <dirent.h>
#include <fstream>
#include <opencv2/opencv.hpp>

// Letterbox `img` into an input_w x input_h 8-bit 3-channel canvas.
//
// The image is resized with bilinear interpolation by the smaller of the two axis scale
// factors (so its aspect ratio is kept), centred along the other axis, and the uncovered area
// is filled with the value 128 in every channel.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);

    // When the width scale is the smaller one the image fills the full width and is centred
    // vertically; otherwise it fills the full height and is centred horizontally.
    const bool width_limited = scale_h > scale_w;
    const int scaled_w = width_limited ? input_w : static_cast<int>(scale_h * img.cols);
    const int scaled_h = width_limited ? static_cast<int>(scale_w * img.rows) : input_h;
    const int offset_x = width_limited ? 0 : (input_w - scaled_w) / 2;
    const int offset_y = width_limited ? (input_h - scaled_h) / 2 : 0;

    cv::Mat resized(scaled_h, scaled_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(offset_x, offset_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}

// Append the names of all entries in directory `p_dir_name` to `file_names`.
//
// Only the bare entry names are stored (no directory prefix); the "." and ".." entries are
// skipped. Returns 0 on success and -1 when the directory cannot be opened, in which case
// `file_names` is left untouched.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir = opendir(p_dir_name);
    if (dir == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* entry_name = entry->d_name;
        const bool is_dot_entry = strcmp(entry_name, ".") == 0 || strcmp(entry_name, "..") == 0;
        if (is_dot_entry) {
            continue;
        }
        file_names.push_back(std::string(entry_name));
    }

    closedir(dir);
    return 0;
}

// Trim leading and trailing whitespace from a string.
//
// Whitespace means space, tab, carriage return, line feed, form feed and vertical tab, so a
// trailing '\r' left by std::getline on CRLF files is removed as well. A string that is empty
// or consists only of whitespace yields an empty string. Interior whitespace is preserved.
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char* const kWhitespace = " \t\r\n\f\v";

    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        // Nothing but whitespace: the trimmed result is empty.
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}

// Src: https://stackoverflow.com/questions/16605967
// Format `a_value` in fixed-point notation with `n` digits after the decimal point
// (2 by default).
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatter;
    formatter.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatter.precision(n);
    formatter << a_value;
    return formatter.str();
}

// Read class labels from a text file, one label per line.
//
// Line i (0-based) is trimmed with trim_leading_whitespace() and stored as labels_map[i];
// existing entries with the same key are overwritten.
//
// Returns 0 on success and -1 when the file cannot be opened, in which case `labels_map` is
// left untouched.
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        return -1;
    }

    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        // The key is the zero-based line number.
        labels_map[index] = trim_leading_whitespace(line);
        index++;
    }

    // The stream is closed by its destructor.
    return 0;
}


================================================
FILE: yolov8/plugin/yololayer.cu
================================================
#include <assert.h>
#include <math.h>
#include <iostream>
#include <vector>
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"

// Helpers for (de)serializing trivially copyable values to/from a raw byte buffer.
namespace Tn {
// Copy `val` into `buffer` and advance `buffer` past it.
// memcpy is used instead of a reinterpret_cast store so that the write is well defined even
// when `buffer` is not suitably aligned for T (e.g. after single-byte bool fields).
template <typename T>
void write(char*& buffer, const T& val) {
    memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

// Copy a T out of `buffer` into `val` and advance `buffer` past it.
// Uses memcpy for the same alignment/aliasing reason as write().
template <typename T>
void read(const char*& buffer, T& val) {
    memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}
}  // namespace Tn

// Device-side logistic function: 1 / (1 + e^-x).
__device__ float sigmoid(float x) {
    const auto decay = exp(-x);
    return 1.0f / (1.0f + decay);
}

namespace nvinfer1 {
// Build a plugin from explicit parameters.
//
// The `strides` array (stridesLength ints) is copied into storage owned by the plugin, so the
// caller keeps ownership of its own array. The thread count keeps its in-class default and the
// plugin namespace is not set here.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth,
                                 int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb,
                                 const int* strides, int stridesLength)
    : mClassCount(classCount),
      mNumberofpoints(numberofpoints),
      mConfthreshkeypoints(confthreshkeypoints),
      mYoloV8NetWidth(netWidth),
      mYoloV8netHeight(netHeight),
      mMaxOutObject(maxOut),
      is_segmentation_(is_segmentation),
      is_pose_(is_pose),
      is_obb_(is_obb),
      mStrides(new int[stridesLength]),
      mStridesLength(stridesLength) {
    memcpy(mStrides, strides, stridesLength * sizeof(int));
}

// Release the stride array owned by the plugin.
YoloLayerPlugin::~YoloLayerPlugin() {
    // delete[] on a null pointer is a no-op, so no explicit null check is needed.
    delete[] mStrides;
    mStrides = nullptr;
}

// Rebuild a plugin from a buffer produced by serialize().
//
// Fields are read in exactly the order serialize() writes them. In builds with assertions
// enabled, the number of bytes consumed must equal `length`.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    const char* const begin = reinterpret_cast<const char*>(data);
    const char* cursor = begin;

    Tn::read(cursor, mClassCount);
    Tn::read(cursor, mNumberofpoints);
    Tn::read(cursor, mConfthreshkeypoints);
    Tn::read(cursor, mThreadCount);
    Tn::read(cursor, mYoloV8NetWidth);
    Tn::read(cursor, mYoloV8netHeight);
    Tn::read(cursor, mMaxOutObject);

    // The stride count precedes the stride values so the array can be sized first.
    Tn::read(cursor, mStridesLength);
    mStrides = new int[mStridesLength];
    for (int strideIdx = 0; strideIdx < mStridesLength; ++strideIdx) {
        Tn::read(cursor, mStrides[strideIdx]);
    }

    Tn::read(cursor, is_segmentation_);
    Tn::read(cursor, is_pose_);
    Tn::read(cursor, is_obb_);

    assert(cursor == begin + length);
}

// Write the plugin state into `buffer`.
//
// The field order here defines the on-disk layout and must match the deserializing
// constructor. In builds with assertions enabled, the number of bytes written must equal
// getSerializationSize().
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    char* const begin = static_cast<char*>(buffer);
    char* cursor = begin;

    Tn::write(cursor, mClassCount);
    Tn::write(cursor, mNumberofpoints);
    Tn::write(cursor, mConfthreshkeypoints);
    Tn::write(cursor, mThreadCount);
    Tn::write(cursor, mYoloV8NetWidth);
    Tn::write(cursor, mYoloV8netHeight);
    Tn::write(cursor, mMaxOutObject);

    // Stride count first, then the stride values.
    Tn::write(cursor, mStridesLength);
    for (int strideIdx = 0; strideIdx < mStridesLength; ++strideIdx) {
        Tn::write(cursor, mStrides[strideIdx]);
    }

    Tn::write(cursor, is_segmentation_);
    Tn::write(cursor, is_pose_);
    Tn::write(cursor, is_obb_);

    assert(cursor == begin + getSerializationSize());
}

// Number of bytes serialize() writes: the fixed scalar fields, one int per stride, and the
// three model-type flags.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    const size_t scalarBytes = sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) +
                               sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) +
                               sizeof(mMaxOutObject) + sizeof(mStridesLength);
    const size_t strideBytes = sizeof(int) * mStridesLength;
    const size_t flagBytes = sizeof(is_segmentation_) + sizeof(is_pose_) + sizeof(is_obb_);
    return scalarBytes + strideBytes + flagBytes;
}

// No resources are set up here; always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Shape of the single output: (1 + mMaxOutObject * floats-per-Detection, 1, 1).
// The extra leading float is the per-image detection counter written by the kernel.
// The arguments are not consulted.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    const int detectionFloats = mMaxOutObject * sizeof(Detection) / sizeof(float);
    return nvinfer1::Dims3(detectionFloats + 1, 1, 1);
}

// Remember the plugin namespace. Only the pointer is stored (the string is not copied), so the
// caller's string must stay alive for as long as this plugin may hand the pointer out.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Return the pointer last passed to setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The output is always kFLOAT, regardless of the output index or the input types.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                                      int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// The output is never broadcast across the batch; the arguments are ignored.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {

    return false;
}

// No input may be broadcast across the batch; the argument is ignored.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {

    return false;
}

// Intentionally empty: the plugin does nothing with the tensor descriptions.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {
    // Nothing to configure.
}

// Intentionally empty: the plugin uses none of the resources offered here.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
    // Nothing to attach.
}

// Intentionally empty: attachToContext() acquires nothing, so there is nothing to release.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Plugin type string; the same literal is returned by YoloPluginCreator::getPluginName().
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {

    return "YoloLayer_TRT";
}

// Plugin version string; the same literal is returned by YoloPluginCreator::getPluginVersion().
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Self-destruct: the object deletes itself, so it must have been allocated with `new` and must
// not be used after this call.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Create a heap-allocated copy of this plugin.
//
// The copy is built through the parameter constructor (so it gets its own stride array and the
// default thread count) and then receives this plugin's namespace pointer.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    auto* copy = new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth,
                                     mYoloV8netHeight, mMaxOutObject, is_segmentation_, is_pose_, is_obb_, mStrides,
                                     mStridesLength);
    copy->setPluginNamespace(mPluginNamespace);
    return copy;
}

// Run the layer for `batchSize` images on `stream`.
//
// `inputs` holds one device pointer per stride level and `outputs[0]` receives the counter plus
// Detection records. Always returns 0.
//
// The qualifiers mirror the declaration in yololayer.h: `inputs` is always
// `const void* const*`, and TRT_CONST_ENQUEUE applies to `outputs` (TensorRT >= 8 takes
// `void* const*`, earlier versions take `void**`). Placing the macro on `inputs` instead makes
// this definition mismatch the declaration when TRT_CONST_ENQUEUE expands to nothing.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {

    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
    return 0;
}

// Device-side logistic function, single precision: 1 / (1 + e^-data).
__device__ float Logist(float data) {
    const float denominator = 1.0f + expf(-data);
    return 1.0f / denominator;
}

// Decode one stride level of the network output into Detection records.
//
// One thread handles one (image, grid cell) pair; `numElements` is grid_h * grid_w * batch.
// For each image the input is read channel-major: channel c of cell e is at
// curInput[e + c * grid_h * grid_w]. Channels 0..3 are the box values, followed by `classes`
// class logits, followed by the optional segmentation / pose / OBB channels.
//
// Output layout per image (`outputElem` floats): output[0] is a counter, followed by up to
// `maxoutobject` Detection structs.
//
//   nk            : number of keypoints (pose models)
//   confkeypoints : keypoints with a lower confidence are written as -1
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
                             int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem,
                             bool is_segmentation, bool is_pose, bool is_obb) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= numElements)
        return;

    const int N_kpts = nk;
    int total_grid = grid_h * grid_w;
    // Number of channels per cell for this model type.
    int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0);
    // Split the flat thread index into image index and cell index within that image.
    int batchIdx = idx / total_grid;
    int elemIdx = idx % total_grid;
    const float* curInput = input + batchIdx * total_grid * info_len;
    int outputIdx = batchIdx * outputElem;

    // Pick the class with the highest sigmoid score.
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 4; i < 4 + classes; i++) {
        float p = Logist(curInput[elemIdx + i * total_grid]);
        if (p > max_cls_prob) {
            max_cls_prob = p;
            class_id = i - 4;
        }
    }

    // Hard-coded pre-filter: cells scoring below 0.1 produce no record.
    if (max_cls_prob < 0.1)
        return;

    // Reserve a slot by atomically bumping the per-image counter (kept as a float in output[0]).
    // The counter is incremented even when the slot index is beyond maxoutobject, so readers
    // must clamp it to maxoutobject.
    int count = (int)atomicAdd(output + outputIdx, 1);
    if (count >= maxoutobject)
        return;
    // Records start one float after the counter.
    char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
    Detection* det = (Detection*)(data);

    int row = elemIdx / grid_w;
    int col = elemIdx % grid_w;

    det->conf = max_cls_prob;
    det->class_id = class_id;
    // Box as left/top/right/bottom in input pixels: the cell centre (col + 0.5, row + 0.5)
    // minus/plus the four box channels, scaled by the stride.
    det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
    det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
    det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
    det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;

    if (is_segmentation) {
        // Copy the 32 mask channels verbatim.
        // NOTE(review): this offset assumes pose/OBB channels come before the mask channels,
        // while the pose branch below assumes mask channels come before the keypoints; the two
        // are only consistent when at most one of these flags is set — confirm.
        for (int k = 0; k < 32; ++k) {
            det->mask[k] =
                    curInput[elemIdx + (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid];
        }
    }

    if (is_pose) {
        for (int kpt = 0; kpt < N_kpts; kpt++) {
            // Each keypoint occupies three consecutive channels: x, y, confidence logit.
            int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid;
            int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid;
            int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 2) * total_grid;

            float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);

            // The 2.0 literal is a double, so this arithmetic is evaluated in double precision
            // before being narrowed to float.
            float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
            float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;

            bool is_within_bbox =
                    kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];

            // Keypoints that are low-confidence or fall outside the box are marked with -1.
            if (kpt_confidence < confkeypoints || !is_within_bbox) {
                det->keypoints[kpt * 3] = -1;
                det->keypoints[kpt * 3 + 1] = -1;
                det->keypoints[kpt * 3 + 2] = -1;
            } else {
                det->keypoints[kpt * 3] = kpt_x;
                det->keypoints[kpt * 3 + 1] = kpt_y;
                det->keypoints[kpt * 3 + 2] = kpt_confidence;
            }
        }
    }

    if (is_obb) {
        // pi is a double, so the angle and the trig calls below are evaluated in double.
        double pi = M_PI;
        auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) +
                                             0) * total_grid];
        // Map the angle logit to (sigmoid - 0.25) * pi, i.e. the range (-pi/4, 3*pi/4).
        auto angle = (sigmoid(angle_inx) - 0.25f) * pi;

        auto cos1 = cos(angle);
        auto sin1 = sin(angle);
        // Offset of the box centre from the cell centre in the unrotated frame ...
        auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2;
        auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2;

        // ... rotated by the predicted angle.
        auto x = xf * cos1 - yf * sin1;
        auto y = xf * sin1 + yf * cos1;

        float cx = (col + 0.5f + x) * stride;
        float cy = (row + 0.5f + y) * stride;

        float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride;
        float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride;
        // For OBB models the box is overwritten as centre-x, centre-y, width, height.
        det->bbox[0] = cx;
        det->bbox[1] = cy;
        det->bbox[2] = w1;
        det->bbox[3] = h1;
        det->angle = angle;
    }
}

void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                                 int mYoloV8NetWidth, int batchSize) {

    int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
    cudaMemsetAsync(output, 0, sizeof(float), stream);
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }
    int numElem = 0;
    int maxGrids = mStridesLength;
    int flatGridsLen = 2 * maxGrids;
    int* flatGrids = new int[flatGridsLen];

    for (int i = 0; i < maxGrids; ++i) {
        flatGrids[2 * i] = mYoloV8netHeight / mStrides[i];
        flatGrids[2 * i + 1] = mYoloV8NetWidth / mStrides[i];
    }

    for (unsigned int i = 0; i < maxGrids; i++) {
        // Access the elements of the original 2D array from the flattened 1D array
        int grid_h = flatGrids[2 * i];      // Corresponds to the access of grids[i][0]
        int grid_w = flatGrids[2 * i + 1];  // Corresponds to the access of grids[i][1]
        int stride = mStrides[i];
        numElem = grid_h * grid_w * batchSize;  // Calculate the total number of elements
        if (numElem < mThreadCount)             // Adjust the thread count if needed
            mThreadCount = numElem;

        // The CUDA kernel call remains unchanged
        CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>(
                inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints,
                mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_, is_obb_);
    }

    delete[] flatGrids;
}

// Storage for the creator's (empty) field description, shared by all creator instances.
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// Publish an empty attribute list through mFC.
// NOTE(review): REGISTER_TENSORRT_PLUGIN in yololayer.h presumably constructs a creator during
// static initialization; whether that happens before mPluginAttributes above is constructed
// depends on definition order in the translation unit — confirm.
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.fields = mPluginAttributes.data();
    mFC.nbFields = mPluginAttributes.size();
}

// Name of the plugin this creator produces; the same literal as YoloLayerPlugin::getPluginType().
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Version of the plugin this creator produces; the same literal as
// YoloLayerPlugin::getPluginVersion().
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Pointer to the static field collection, which the constructor leaves empty.
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Create a plugin from a single field named "combinedInfo".
//
// The field data is read as an int array: nine header values (class count, keypoint count,
// keypoint confidence threshold, input width, input height, max output objects, and the
// segmentation / pose / OBB flags) followed by the stride values. The field's `length` is the
// total number of ints, so the stride count is length - 9.
//
// NOTE(review): the keypoint confidence threshold is read from an int slot, so any fractional
// part is already lost by the time it arrives here — confirm how the caller packs it.
// The field count and name are only checked by assert.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    assert(fc->nbFields == 1);
    const PluginField& field = fc->fields[0];
    assert(strcmp(field.name, "combinedInfo") == 0);

    const int kHeaderCount = 9;
    const int* info = static_cast<const int*>(field.data);

    const int class_count = info[0];
    const int numberofpoints = info[1];
    const float confthreshkeypoints = info[2];
    const int input_w = info[3];
    const int input_h = info[4];
    const int max_output_object_count = info[5];
    const bool is_segmentation = info[6];
    const bool is_pose = info[7];
    const bool is_obb = info[8];

    // Everything after the header is the stride list.
    const int* strides = info + kHeaderCount;
    const int strides_length = field.length - kHeaderCount;

    auto* plugin = new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h,
                                       max_output_object_count, is_segmentation, is_pose, is_obb, strides,
                                       strides_length);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}

// Recreate a plugin from `serialLength` bytes previously written by YoloLayerPlugin::serialize()
// and give it this creator's namespace.
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) TRT_NOEXCEPT {
    // The returned object is released through YoloLayerPlugin::destroy() when the
    // network is destroyed.
    auto* plugin = new YoloLayerPlugin(serialData, serialLength);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}

}  // namespace nvinfer1


================================================
FILE: yolov8/plugin/yololayer.h
================================================
#pragma once
#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"
namespace nvinfer1 {
// TensorRT plugin that decodes the raw YOLOv8 head outputs (one input tensor per stride level)
// into a single float output: a detection counter followed by Detection records.
//
// The plugin owns a heap-allocated copy of the stride list (mStrides), released in the
// destructor.
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    // Build from explicit parameters; `strides` (stridesLength ints) is copied.
    YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight,
                    int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength);

    // Build from a buffer produced by serialize().
    YoloLayerPlugin(const void* data, size_t length);
    ~YoloLayerPlugin();

    // The plugin has exactly one output.
    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

    int initialize() TRT_NOEXCEPT override;

    virtual void terminate() TRT_NOEXCEPT override {}

    // No scratch workspace is needed.
    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

    // TRT_CONST_ENQUEUE makes `outputs` a `void* const*` for TensorRT >= 8 and `void**` before.
    virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;

    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

    // Only linear-format float tensors are supported, for inputs and outputs alike.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }

    const char* getPluginType() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    void destroy() TRT_NOEXCEPT override;

    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

    // Stores the pointer only; the string is not copied.
    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

    const char* getPluginNamespace() const TRT_NOEXCEPT override;

    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;

    void detachFromContext() TRT_NOEXCEPT override;

   private:
    // Launches the decode kernel for every stride level.
    void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                    int mYoloV8NetWidth, int batchSize);
    int mThreadCount = 256;  // threads per block used for the kernel launches
    // Non-owning; initialised to the empty string so that clone()/getPluginNamespace() never
    // read an indeterminate pointer when setPluginNamespace() has not been called yet.
    const char* mPluginNamespace = "";
    int mClassCount;
    int mNumberofpoints;
    float mConfthreshkeypoints;
    int mYoloV8NetWidth;
    int mYoloV8netHeight;
    int mMaxOutObject;
    bool is_segmentation_;
    bool is_pose_;
    bool is_obb_;
    int* mStrides = nullptr;  // owned array of mStridesLength stride values
    int mStridesLength = 0;
};

// Creator for YoloLayerPlugin: builds plugins from a PluginFieldCollection or from serialized
// data, and carries the namespace assigned to the plugins it creates.
class API YoloPluginCreator : public IPluginCreator {
   public:
    YoloPluginCreator();
    ~YoloPluginCreator() override = default;

    const char* getPluginName() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

    // Expects a single field named "combinedInfo" (see the definition for its layout).
    nvinfer1::IPluginV2IOExt* createPlugin(const char* name,
                                           const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;

    nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                                size_t serialLength) TRT_NOEXCEPT override;

    // The namespace is copied into a std::string owned by the creator.
    void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; }

    const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

   private:
    std::string mNamespace;
    // Shared by all creator instances; defined in yololayer.cu.
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};
// NOTE(review): this registration macro sits in a header, so every translation unit that
// includes the header expands it — confirm that repeated registration is intended/harmless.
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
}  // namespace nvinfer1


================================================
FILE: yolov8/src/block.cpp
================================================
#include "block.h"
#include <assert.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include "config.h"
#include "yololayer.h"

// Padding for a convolution with kernel size `ksize`, computed as ksize / 3 with integer
// division (e.g. 1 -> 0, 3 -> 1, 6 -> 2).
int calculateP(int ksize) {
    const int padding = ksize / 3;
    return padding;
}

// Load a .wts weight file into a name -> Weights map.
//
// File format: the first token is the number of entries; each entry is a name, a decimal
// element count, and that many hexadecimal 32-bit words (the raw bit patterns of the values).
//
// Every entry's `values` buffer is allocated with malloc and is never freed here; ownership
// passes to the caller. Failure to open the file and a non-positive entry count are reported
// through assert only.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> WeightMap;

    std::ifstream input(file);
    assert(input.is_open() &&
           "Unable to load weight file. please check if the "
           ".wts file path is right!!!!!!");

    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    // `> 0` keeps a negative count from looping (almost) forever when asserts are compiled out.
    while (count-- > 0) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        std::string name;
        input >> name >> std::dec >> size;
        wt.type = nvinfer1::DataType::kFLOAT;

        // Allocate by element size. sizeof(val) would be the size of the pointer and
        // over-allocate every buffer (twice the needed bytes on 64-bit targets).
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        for (uint32_t x = 0, y = size; x < y; x++) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        WeightMap[name] = wt;
    }
    return WeightMap;
}

// Add an inference-time BatchNorm2d as a per-channel scale layer.
//
// Reads `<lname>.weight`, `.bias`, `.running_mean` and `.running_var` from `weightMap` and
// folds them into
//     scale = gamma / sqrt(var + eps)
//     shift = beta - mean * gamma / sqrt(var + eps)
//     power = 1
// The three derived arrays are malloc'ed, must outlive the returned layer's use by the
// builder, and are recorded in `weightMap` under `<lname>.scale`, `.shift` and `.power`.
//
// `weightMap` is taken by reference: taking it by value copied the whole map on every call and
// made the three stores below land in a temporary that was discarded on return.
// NOTE(review): callers that themselves hold the map by value still discard these entries when
// they return, so the buffers are only reachable for cleanup once the callers pass the map by
// reference too.
static nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                             std::map<std::string, nvinfer1::Weights>& weightMap,
                                             nvinfer1::ITensor& input, std::string lname, float eps) {
    float* gamma = (float*)weightMap[lname + ".weight"].values;
    float* beta = (float*)weightMap[lname + ".bias"].values;
    float* mean = (float*)weightMap[lname + ".running_mean"].values;
    float* var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        // The standard deviation is shared by the scale and the shift of channel i.
        const auto stddev = sqrt(var[i] + eps);
        scval[i] = gamma[i] / stddev;
        shval[i] = beta[i] - mean[i] * gamma[i] / stddev;
        pval[i] = 1.0;
    }
    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(output);
    return output;
}

// Conv -> BatchNorm -> SiLU block.
//
// Adds a bias-free k x k convolution with `ch` output maps, stride `s` and padding `p` (weights
// from `<lname>.conv.weight`), a batch norm from `<lname>.bn` with eps 1e-3, and SiLU expressed
// as x * sigmoid(x). Returns the element-wise product layer.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, int k, int s, int p, std::string lname) {
    // Convolution without bias: the batch norm that follows supplies the shift.
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv =
            network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], noBias);
    assert(conv);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{p, p});

    nvinfer1::IScaleLayer* norm = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor& normalized = *norm->getOutput(0);

    // SiLU(x) = x * sigmoid(x)
    nvinfer1::IActivationLayer* gate = network->addActivation(normalized, nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* silu =
            network->addElementWise(normalized, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Bottleneck block: two 3x3 Conv-BN-SiLU layers (`<lname>.cv1`, `<lname>.cv2`), each with c2
// output channels, stride 1 and padding 1. When `shortcut` is set and c1 == c2 the block input
// is added to the result. The parameter `e` is not used.
nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, std::string lname) {
    nvinfer1::IElementWiseLayer* first = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".cv1");
    nvinfer1::IElementWiseLayer* second =
            convBnSiLU(network, weightMap, *first->getOutput(0), c2, 3, 1, 1, lname + ".cv2");

    const bool addResidual = shortcut && c1 == c2;
    if (!addResidual) {
        return second;
    }
    // Residual connection: input + conv output.
    nvinfer1::IElementWiseLayer* sum =
            network->addElementWise(input, *second->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    return sum;
}

// Bottleneck variant used by C3: a 1x1 convBnSiLU with int(c2 * e) channels followed by a 3x3
// convBnSiLU with c2 channels; paddings come from calculateP. When `shortcut` is set and
// c1 == c2 the block input is added to the result.
static nvinfer1::ILayer* bottleneck_c3(nvinfer1::INetworkDefinition* network,
                                       std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
                                       int c1, int c2, bool shortcut, float e, std::string lname) {
    const int squeezed = (int)((float)c2 * e);
    auto* reduce = convBnSiLU(network, weightMap, input, squeezed, 1, 1, calculateP(1), lname + ".cv1");
    auto* expand = convBnSiLU(network, weightMap, *reduce->getOutput(0), c2, 3, 1, calculateP(3), lname + ".cv2");

    const bool addResidual = shortcut && c1 == c2;
    if (!addResidual) {
        return expand;
    }
    return network->addElementWise(input, *expand->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}

// C2f block: a 1x1 convBnSiLU ("<lname>.cv1") producing 2 * int(c2 * e) channels is sliced into
// two halves along dimension 0. Starting from the second half, `n` bottlenecks
// ("<lname>.m.<i>") are chained, and every bottleneck output is appended to a running
// concatenation that initially holds both halves. A final 1x1 convBnSiLU ("<lname>.cv2")
// maps the concatenation to c2 channels. `c1` is not used by this function.
nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                 int c2, int n, bool shortcut, float e, std::string lname) {
    const int hidden = static_cast<int>(static_cast<float>(c2) * e);

    auto* stem = convBnSiLU(network, weightMap, input, 2 * hidden, 1, 1, 0, lname + ".cv1");
    nvinfer1::ITensor* stemOut = stem->getOutput(0);
    const nvinfer1::Dims shape = stemOut->getDimensions();
    const auto half = shape.d[0] / 2;

    // Slice the stem output into its lower and upper channel halves.
    const nvinfer1::Dims3 halfSize{half, shape.d[1], shape.d[2]};
    const nvinfer1::Dims3 unitStride{1, 1, 1};
    auto* lower = network->addSlice(*stemOut, nvinfer1::Dims3{0, 0, 0}, halfSize, unitStride);
    auto* upper = network->addSlice(*stemOut, nvinfer1::Dims3{half, 0, 0}, halfSize, unitStride);

    nvinfer1::ITensor* seed[] = {lower->getOutput(0), upper->getOutput(0)};
    nvinfer1::IConcatenationLayer* joined = network->addConcatenation(seed, 2);

    // Chain the bottlenecks off the upper half, growing the concatenation one output at a time.
    nvinfer1::ITensor* current = upper->getOutput(0);
    int idx = 0;
    while (idx < n) {
        auto* block =
                bottleneck(network, weightMap, *current, hidden, hidden, shortcut, 1.0, lname + ".m." + std::to_string(idx));
        current = block->getOutput(0);

        nvinfer1::ITensor* grown[] = {joined->getOutput(0), current};
        joined = network->addConcatenation(grown, 2);
        ++idx;
    }

    return convBnSiLU(network, weightMap, *joined->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
}

// C2 block: a 1x1 convBnSiLU ("<lname>.cv1") with 2 * int(c2 * e) channels is sliced into two
// halves along dimension 0. The first half runs through `n` chained bottlenecks
// ("<lname>.m.<i>"); the result is concatenated with the untouched second half and passed to
// a 1x1 convBnSiLU ("<lname>.cv2") with c2 channels. `c1` is not used by this function.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname) {
    assert(network != nullptr);
    const int hidden = static_cast<int>(c2 * e);

    // Stem convolution producing both halves at once.
    auto* stem = convBnSiLU(network, weightMap, input, 2 * hidden, 1, 1, 0, lname + ".cv1");
    nvinfer1::ITensor* stemOut = stem->getOutput(0);

    // Slice the stem output into two equally sized halves.
    const nvinfer1::Dims shape = stemOut->getDimensions();
    const auto half = shape.d[0] / 2;
    const nvinfer1::Dims3 halfSize{half, shape.d[1], shape.d[2]};
    const nvinfer1::Dims3 unitStride{1, 1, 1};
    auto* firstHalf = network->addSlice(*stemOut, nvinfer1::Dims3{0, 0, 0}, halfSize, unitStride);
    auto* secondHalf = network->addSlice(*stemOut, nvinfer1::Dims3{half, 0, 0}, halfSize, unitStride);

    // Feed the first half through the bottleneck chain; each output becomes the next input.
    nvinfer1::ITensor* chained = firstHalf->getOutput(0);
    int idx = 0;
    while (idx < n) {
        auto* block = bottleneck(network, weightMap, *chained, hidden, hidden, shortcut, 1.0,
                                 lname + ".m." + std::to_string(idx));
        chained = block->getOutput(0);
        ++idx;
    }

    // Join the processed half with the pass-through half and project to c2 channels.
    nvinfer1::ITensor* merged[2] = {chained, secondHalf->getOutput(0)};
    auto* joined = network->addConcatenation(merged, 2);
    return convBnSiLU(network, weightMap, *joined->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
}

// C3 block: two parallel 1x1 convBnSiLU branches ("<lname>.cv1", "<lname>.cv2") with
// int(c2 * e) channels each read the same input. The cv1 branch runs through `n` chained
// bottleneck_c3 blocks ("<lname>.m.<i>"); its result is concatenated with the cv2 branch and
// projected to c2 channels by a 1x1 convBnSiLU ("<lname>.cv3"). `c1` is not used here.
nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname) {
    const int hidden = static_cast<int>(static_cast<float>(c2) * e);

    auto* mainBranch = convBnSiLU(network, weightMap, input, hidden, 1, 1, calculateP(1), lname + ".cv1");
    auto* sideBranch = convBnSiLU(network, weightMap, input, hidden, 1, 1, calculateP(1), lname + ".cv2");

    nvinfer1::ITensor* chained = mainBranch->getOutput(0);
    int idx = 0;
    while (idx < n) {
        auto block =
                bottleneck_c3(network, weightMap, *chained, hidden, hidden, shortcut, 1.0, lname + ".m." + std::to_string(idx));
        chained = block->getOutput(0);
        ++idx;
    }

    nvinfer1::ITensor* merged[] = {chained, sideBranch->getOutput(0)};
    auto* joined = network->addConcatenation(merged, 2);
    return convBnSiLU(network, weightMap, *joined->getOutput(0), c2, 1, 1, calculateP(1), lname + ".cv3");
}

// SPPF block: a 1x1 convBnSiLU ("<lname>.cv1") with c1 / 2 channels feeds three chained k x k
// max-pooling layers (stride 1, padding k / 2). The conv output and the three pooling outputs
// are concatenated and projected to c2 channels by a 1x1 convBnSiLU ("<lname>.cv2").
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname) {
    const int hidden = c1 / 2;
    auto* reduce = convBnSiLU(network, weightMap, input, hidden, 1, 1, 0, lname + ".cv1");

    // pyramid[0] is the conv output; pyramid[i] is the i-th pooling applied on top of pyramid[i - 1].
    const int kLevels = 4;
    nvinfer1::ITensor* pyramid[kLevels];
    pyramid[0] = reduce->getOutput(0);
    for (int level = 1; level < kLevels; ++level) {
        nvinfer1::IPoolingLayer* pool =
                network->addPoolingNd(*pyramid[level - 1], nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
        pool->setStrideNd(nvinfer1::DimsHW{1, 1});
        pool->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
        pyramid[level] = pool->getOutput(0);
    }

    auto* joined = network->addConcatenation(pyramid, kLevels);
    return convBnSiLU(network, weightMap, *joined->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
}

// DFL head: reshapes `input` to (4, 16, grid), permutes it with {1, 0, 2}, applies a softmax
// layer (default axis settings), reduces the result with a bias-free 1x1 convolution to a single
// output map using the weights stored under `lname`, and reshapes to (4, grid).
//   s / p - stride and padding applied to the 1x1 convolution
//   ch, k - accepted for signature compatibility; not used by this function
// Returns the final reshape (shuffle) layer.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
    auto* toBins = network->addShuffle(input);
    toBins->setReshapeDimensions(nvinfer1::Dims3{4, 16, grid});
    toBins->setSecondTranspose(nvinfer1::Permutation{1, 0, 2});

    auto* probabilities = network->addSoftMax(*toBins->getOutput(0));

    const nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* reduce = network->addConvolutionNd(*probabilities->getOutput(0), 1, nvinfer1::DimsHW{1, 1},
                                             weightMap[lname], noBias);
    reduce->setStrideNd(nvinfer1::DimsHW{s, s});
    reduce->setPaddingNd(nvinfer1::DimsHW{p, p});

    auto* flatten = network->addShuffle(*reduce->getOutput(0));
    flatten->setReshapeDimensions(nvinfer1::Dims2{4, grid});
    return flatten;
}

// Appends the "YoloLayer_TRT" (version "1") plugin to `network`.
//
// The plugin receives a single INT32 field named "combinedInfo" laid out as:
//   [0] num_class            [1] kNumberOfPoints     [2] kConfThreshKeypoints
//   [3] kInputW              [4] kInputH             [5] kMaxNumOutputBbox
//   [6] is_segmentation      [7] is_pose             [8] is_obb
//   [9 ...] the px_arry_num values of px_arry
//
//   dets        - concatenation layers whose first outputs become the plugin inputs
//   px_arry     - array of px_arry_num integers appended after the 9 header entries
// Returns the plugin layer added to the network.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, int num_class, bool is_segmentation, bool is_pose,
                                       bool is_obb) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    // A missing registration yields a null creator; fail here instead of dereferencing it below.
    assert(creator && "YoloLayer_TRT plugin creator is not registered");
    assert(px_arry_num >= 0);
    assert(px_arry != nullptr || px_arry_num == 0);

    const int netinfo_count = 9;                          // Number of fixed header entries (see layout above).
    const int total_count = netinfo_count + px_arry_num;  // Header entries plus the px_arry payload.

    std::vector<int> combinedInfo(total_count);
    combinedInfo[0] = num_class;
    combinedInfo[1] = kNumberOfPoints;
    // NOTE(review): this slot is an int; if kConfThreshKeypoints is a fractional threshold the
    // assignment truncates it - confirm against config.h and the plugin's reader.
    combinedInfo[2] = kConfThreshKeypoints;
    combinedInfo[3] = kInputW;
    combinedInfo[4] = kInputH;
    combinedInfo[5] = kMaxNumOutputBbox;
    combinedInfo[6] = is_segmentation;
    combinedInfo[7] = is_pose;
    combinedInfo[8] = is_obb;

    // Append px_arry right after the fixed header.
    if (px_arry_num > 0) {
        std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);
    }

    // Wrap the combined array in a single plugin field.
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = static_cast<int32_t>(combinedInfo.size());

    nvinfer1::PluginFieldCollection pluginFieldCollection;
    pluginFieldCollection.nbFields = 1;
    pluginFieldCollection.fields = &pluginField;

    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    assert(pluginObject && "failed to create YoloLayer_TRT plugin");

    // Collect the first output of every detection branch as plugin input.
    std::vector<nvinfer1::ITensor*> inputTensors;
    inputTensors.reserve(dets.size());
    for (auto det : dets) {
        inputTensors.push_back(det->getOutput(0));
    }

    nvinfer1::IPluginV2Layer* yoloLayer =
            network->addPluginV2(inputTensors.data(), static_cast<int32_t>(inputTensors.size()), *pluginObject);
    assert(yoloLayer);

    return yoloLayer;
}


================================================
FILE: yolov8/src/calibrator.cpp
================================================
#include "calibrator.h"
#include <fstream>
#include <iostream>
#include <iterator>
#include <opencv2/dnn/dnn.hpp>
#include "cuda_utils.h"
#include "utils.h"

// Sets up an INT8 calibrator that serves images found in `img_dir` in batches.
//   batchsize        - number of images per calibration batch
//   input_w, input_h - width / height the images are preprocessed to
//   img_dir          - directory whose file names are collected into img_files_
//   calib_table_name - path of the calibration cache read / written by this object
//   input_blob_name  - name the first binding is expected to have (checked in getBatch)
//   read_cache       - whether readCalibrationCache may load an existing cache file
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
    // Number of float elements in one batch: 3 channels x width x height x batch size.
    input_count_ = 3 * input_w * input_h * batchsize;
    // Device buffer that getBatch fills and hands to TensorRT.
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    // Collect the file names that will be consumed batch by batch.
    read_files_in_dir(img_dir, img_files_);
}

// Releases the device buffer allocated in the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size this calibrator was constructed with.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next `batchsize_` images, preprocesses them, converts them into one blob
// (scaled by 1/255, red/blue swapped, no cropping) and copies it to the device buffer, which
// is then published through bindings[0].
// Returns false when fewer than `batchsize_` images remain or when an image cannot be read.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    const auto batch_end = img_idx_ + batchsize_;
    if (batch_end > (int)img_files_.size()) {
        // Not enough images left for a full batch.
        return false;
    }

    std::vector<cv::Mat> batch_imgs;
    for (int idx = img_idx_; idx < batch_end; ++idx) {
        std::cout << img_files_[idx] << "  " << idx << std::endl;
        cv::Mat raw = cv::imread(img_dir_ + img_files_[idx]);
        if (raw.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        batch_imgs.push_back(preprocess_img(raw, input_w_, input_h_));
    }
    img_idx_ = batch_end;

    const cv::Size target(input_w_, input_h_);
    cv::Mat blob = cv::dnn::blobFromImages(batch_imgs, 1.0 / 255.0, target, cv::Scalar(0, 0, 0), true, false);
    const size_t byte_count = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), byte_count, cudaMemcpyHostToDevice));

    // The first binding must be the network input this calibrator was created for.
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Loads the calibration cache file into calib_cache_ when cache reading is enabled and the
// file could be opened. Stores the cache size in `length` and returns a pointer to the cached
// bytes, or nullptr when nothing was loaded.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream cache_file(calib_table_name_, std::ios::binary);
    // Keep whitespace bytes: the cache is copied verbatim.
    cache_file >> std::noskipws;

    const bool usable = read_cache_ && cache_file.good();
    if (usable) {
        std::istream_iterator<char> first(cache_file);
        std::istream_iterator<char> last;
        std::copy(first, last, std::back_inserter(calib_cache_));
    }

    length = calib_cache_.size();
    if (length == 0) {
        return nullptr;
    }
    return calib_cache_.data();
}

// Writes `length` bytes of `cache` to the calibration cache file in binary mode.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    const char* bytes = static_cast<const char*>(cache);
    std::ofstream cache_file(calib_table_name_, std::ios::binary);
    cache_file.write(bytes, length);
}


================================================
FILE: yolov8/src/model.cpp
================================================
#include <math.h>
#include <iostream>

#include "block.h"
#include "calibrator.h"
#include "config.h"
#include "model.h"

// Scales channel count `x` by width multiplier `gw` and rounds the result up to the next
// multiple of `divisor`.
static int get_width_5u(int x, float gw, int divisor = 8) {
    const auto groups = ceil((x * gw) / divisor);
    return int(groups) * divisor;
}

// Scales channel count `x` by width multiplier `gw`, rounds up to the next multiple of
// `divisor`, and caps the result at `max_channels`.
static int get_width(int x, float gw, int max_channels, int divisor = 8) {
    const auto groups = ceil((x * gw) / divisor);
    const int scaled = int(groups) * divisor;
    if (scaled >= max_channels) {
        return max_channels;
    }
    return scaled;
}

// Scales repeat count `x` by depth multiplier `gd`. A base count of 1 is returned unchanged;
// otherwise the product is rounded, with an exact .5 fraction above an even integer part
// rounded down instead of up, and the result is never smaller than 1.
static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }

    int rounded = round(x * gd);
    const bool exactHalf = (x * gd - int(x * gd) == 0.5);
    const bool evenFloor = (int(x * gd) % 2) == 0;
    if (exactHalf && evenFloor) {
        rounded -= 1;
    }
    return std::max<int>(rounded, 1);
}

// For each of the `size` layers, writes reference_size divided by dimension 1 of the layer's
// first output into the matching slot of `strides`.
void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) {
    int idx = 0;
    while (idx < size) {
        nvinfer1::ILayer* current = conv_layers[idx];
        const nvinfer1::Dims outDims = current->getOutput(0)->getDimensions();
        const int featureSize = outDims.d[1];
        strides[idx] = reference_size / featureSize;
        ++idx;
    }
}

// Mask prototype branch: a 3x3 convBnSiLU, a 2x2 stride-2 deconvolution with bias, another
// 3x3 convBnSiLU and a final 1x1 convBnSiLU with 32 output channels. The intermediate width
// is get_width(256, gw, max_channels).
// All weight names are hard-coded under "model.22.proto"; `lname` is not used.
static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network,
                                          std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
                                          std::string lname, float gw, int max_channels) {
    const int hidden = get_width(256, gw, max_channels);
    auto stem = convBnSiLU(network, weightMap, input, hidden, 3, 1, 1, "model.22.proto.cv1");

    // Rebuild the deconvolution bias as an explicit float Weights record.
    float* upsampleBiasValues = (float*)weightMap["model.22.proto.upsample.bias"].values;
    int upsampleBiasCount = weightMap["model.22.proto.upsample.bias"].count;
    nvinfer1::Weights upsampleBias{nvinfer1::DataType::kFLOAT, upsampleBiasValues, upsampleBiasCount};

    auto upsample = network->addDeconvolutionNd(*stem->getOutput(0), hidden, nvinfer1::DimsHW{2, 2},
                                                weightMap["model.22.proto.upsample.weight"], upsampleBias);
    assert(upsample);
    upsample->setStrideNd(nvinfer1::DimsHW{2, 2});

    auto refine = convBnSiLU(network, weightMap, *upsample->getOutput(0), hidden, 3, 1, 1, "model.22.proto.cv2");
    auto project = convBnSiLU(network, weightMap, *refine->getOutput(0), 32, 1, 1, 0, "model.22.proto.cv3");
    assert(project);
    return project;
}

// Builds an auxiliary head branch: two 3x3 convBnSiLU blocks ("<lname>.0", "<lname>.1"), a
// biased 1x1 convolution ("<lname>.2") and a reshape to (output_channels, grid_shape).
// Channel counts depend on `algo_type`:
//   "seg"  - hidden width chosen from `gw` (32 / 48 / 64 / 80), 32 outputs
//   "pose" - hidden width read from the "<lname>.0.bn.weight" entry, kNumberOfPoints * 3 outputs
//   "obb"  - hidden width read from the "<lname>.0.bn.weight" entry, 1 output
// Any other algo_type (or an unlisted gw for "seg") leaves the corresponding count at 0.
static nvinfer1::IShuffleLayer* cv4_conv_combined(nvinfer1::INetworkDefinition* network,
                                                  std::map<std::string, nvinfer1::Weights>& weightMap,
                                                  nvinfer1::ITensor& input, std::string lname, int grid_shape, float gw,
                                                  std::string algo_type) {
    int hidden = 0;
    int outputs = 0;

    const bool isSeg = (algo_type == "seg");
    const bool isPose = (algo_type == "pose");
    const bool isObb = (algo_type == "obb");

    if (isSeg) {
        if (gw == 0.25 || gw == 0.5) {
            hidden = 32;
        } else if (gw == 0.75) {
            hidden = 48;
        } else if (gw == 1.00) {
            hidden = 64;
        } else if (gw == 1.25) {
            hidden = 80;
        }
        outputs = 32;
    } else if (isPose || isObb) {
        // The hidden width equals the number of batch-norm weights of the first block.
        const std::string bnKey = lname + ".0.bn.weight";
        hidden = weightMap[bnKey].count;
        if (isPose) {
            outputs = kNumberOfPoints * 3;
        } else {
            outputs = 1;
        }
    }

    auto first = convBnSiLU(network, weightMap, input, hidden, 3, 1, 1, lname + ".0");
    auto second = convBnSiLU(network, weightMap, *first->getOutput(0), hidden, 3, 1, 1, lname + ".1");

    // Final 1x1 convolution with an explicit float bias record.
    const std::string headName = lname + ".2";
    float* headBiasValues = (float*)weightMap[headName + ".bias"].values;
    int headBiasCount = weightMap[headName + ".bias"].count;
    nvinfer1::Weights headBias{nvinfer1::DataType::kFLOAT, headBiasValues, headBiasCount};
    auto head = network->addConvolutionNd(*second->getOutput(0), outputs, nvinfer1::DimsHW{1, 1},
                                          weightMap[headName + ".weight"], headBias);
    head->setStrideNd(nvinfer1::DimsHW{1, 1});

    nvinfer1::IShuffleLayer* flattened = network->addShuffle(*head->getOutput(0));
    flattened->setReshapeDimensions(nvinfer1::Dims2{outputs, grid_shape});
    return flattened;
}

// Builds the raw output of detect-head branch `idx` on top of `feature`:
//   box tower   "model.22.cv2.<idx>": two 3x3 convBnSiLU blocks + biased 1x1 conv with 64 outputs
//   class tower "model.22.cv3.<idx>": two 3x3 convBnSiLU blocks + biased 1x1 conv with kNumClass outputs
// and concatenates the two tower outputs (box first).
static nvinfer1::IConcatenationLayer* yolov8DetScaleHead(nvinfer1::INetworkDefinition* network,
                                                         std::map<std::string, nvinfer1::Weights>& weightMap,
                                                         nvinfer1::ITensor& feature, int idx, int base_in_channel,
                                                         int base_out_channel) {
    const std::string box = "model.22.cv2." + std::to_string(idx);
    const std::string cls = "model.22.cv3." + std::to_string(idx);

    auto* box0 = convBnSiLU(network, weightMap, feature, base_in_channel, 3, 1, 1, box + ".0");
    auto* box1 = convBnSiLU(network, weightMap, *box0->getOutput(0), base_in_channel, 3, 1, 1, box + ".1");
    nvinfer1::IConvolutionLayer* box2 = network->addConvolutionNd(
            *box1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap[box + ".2.weight"], weightMap[box + ".2.bias"]);
    assert(box2);
    box2->setStrideNd(nvinfer1::DimsHW{1, 1});
    box2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    auto* cls0 = convBnSiLU(network, weightMap, feature, base_out_channel, 3, 1, 1, cls + ".0");
    auto* cls1 = convBnSiLU(network, weightMap, *cls0->getOutput(0), base_out_channel, 3, 1, 1, cls + ".1");
    nvinfer1::IConvolutionLayer* cls2 =
            network->addConvolutionNd(*cls1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap[cls + ".2.weight"], weightMap[cls + ".2.bias"]);
    assert(cls2);
    cls2->setStrideNd(nvinfer1::DimsHW{1, 1});
    cls2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    nvinfer1::ITensor* towers[] = {box2->getOutput(0), cls2->getOutput(0)};
    return network->addConcatenation(towers, 2);
}

// Decodes one raw detect-head output: reshapes it to (64 + kNumClass, grid), slices off the
// first 64 rows and the remaining kNumClass rows, runs the 64-row part through DFL (weights
// "model.22.dfl.conv.weight") and concatenates the DFL result with the class rows.
// `stride` is the down-sampling factor of this branch; grid = (kInputH/stride) * (kInputW/stride).
static nvinfer1::IConcatenationLayer* yolov8DetDecodeScale(nvinfer1::INetworkDefinition* network,
                                                           std::map<std::string, nvinfer1::Weights>& weightMap,
                                                           nvinfer1::ITensor& raw, int stride) {
    const int grid = (kInputH / stride) * (kInputW / stride);

    nvinfer1::IShuffleLayer* flat = network->addShuffle(raw);
    flat->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, grid});

    nvinfer1::ISliceLayer* boxRows = network->addSlice(*flat->getOutput(0), nvinfer1::Dims2{0, 0},
                                                       nvinfer1::Dims2{64, grid}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* clsRows = network->addSlice(*flat->getOutput(0), nvinfer1::Dims2{64, 0},
                                                       nvinfer1::Dims2{kNumClass, grid}, nvinfer1::Dims2{1, 1});

    nvinfer1::IShuffleLayer* dfl =
            DFL(network, weightMap, *boxRows->getOutput(0), 4, grid, 1, 1, 0, "model.22.dfl.conv.weight");

    nvinfer1::ITensor* parts[] = {dfl->getOutput(0), clsRows->getOutput(0)};
    return network->addConcatenation(parts, 2);
}

// Builds the YOLOv8 detection network from the weights in `wts_path` and serializes it.
//   builder / config - TensorRT builder objects owned by the caller
//   dt               - data type of the network input tensor
//   gd / gw          - depth / width multipliers of the model variant (read only)
//   max_channels     - upper bound applied to every scaled channel count (read only)
// Returns the serialized engine, or nullptr when TensorRT fails to build it. The weight
// buffers loaded from `wts_path` and the network definition are released before returning.
nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    // Channel width / repeat count of a stage after applying the model multipliers.
    auto width = [&](int channels) { return get_width(channels, gw, max_channels); };
    auto depth = [&](int repeats) { return get_depth(repeats, gd); };

    /*******************************************************************************************************
    ******************************************  YOLOV8 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLOV8 BACKBONE  ********************************************
    *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, width(64), 3, 2, 1, "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), width(128), 3, 2, 1, "model.1");
    // Original note "11233": presumably the repeat counts of the n/s/m/l/x variants - confirm.
    nvinfer1::IElementWiseLayer* conv2 =
            C2F(network, weightMap, *conv1->getOutput(0), width(128), width(128), depth(3), true, 0.5, "model.2");
    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), width(256), 3, 2, 1, "model.3");
    // Original note "22466".
    nvinfer1::IElementWiseLayer* conv4 =
            C2F(network, weightMap, *conv3->getOutput(0), width(256), width(256), depth(6), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), width(512), 3, 2, 1, "model.5");
    // Original note "22466".
    nvinfer1::IElementWiseLayer* conv6 =
            C2F(network, weightMap, *conv5->getOutput(0), width(512), width(512), depth(6), true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 =
            convBnSiLU(network, weightMap, *conv6->getOutput(0), width(1024), 3, 2, 1, "model.7");
    // Original note "11233".
    nvinfer1::IElementWiseLayer* conv8 =
            C2F(network, weightMap, *conv7->getOutput(0), width(1024), width(1024), depth(3), true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 =
            SPPF(network, weightMap, *conv8->getOutput(0), width(1024), width(1024), 5, "model.9");

    /*******************************************************************************************************
    *******************************************  YOLOV8 HEAD  **********************************************
    *******************************************************************************************************/
    // Nearest-neighbour 2x up-sampling of the two spatial dimensions; channels are untouched.
    float scale[] = {1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0));
    assert(upsample10);
    upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample10->setScales(scale, 3);

    nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2);

    nvinfer1::IElementWiseLayer* conv12 =
            C2F(network, weightMap, *cat11->getOutput(0), width(512), width(512), depth(3), false, 0.5, "model.12");

    nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0));
    assert(upsample13);
    upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample13->setScales(scale, 3);

    nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2);

    nvinfer1::IElementWiseLayer* conv15 =
            C2F(network, weightMap, *cat14->getOutput(0), width(256), width(256), depth(3), false, 0.5, "model.15");
    nvinfer1::IElementWiseLayer* conv16 =
            convBnSiLU(network, weightMap, *conv15->getOutput(0), width(256), 3, 2, 1, "model.16");
    nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2);
    nvinfer1::IElementWiseLayer* conv18 =
            C2F(network, weightMap, *cat17->getOutput(0), width(512), width(512), depth(3), false, 0.5, "model.18");
    nvinfer1::IElementWiseLayer* conv19 =
            convBnSiLU(network, weightMap, *conv18->getOutput(0), width(512), 3, 2, 1, "model.19");
    nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2);
    nvinfer1::IElementWiseLayer* conv21 =
            C2F(network, weightMap, *cat20->getOutput(0), width(1024), width(1024), depth(3), false, 0.5, "model.21");

    /*******************************************************************************************************
    *******************************************  YOLOV8 OUTPUT  ********************************************
    *******************************************************************************************************/
    int base_in_channel = (gw == 1.25) ? 80 : 64;
    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : width(256);

    // One raw head output per feature level (model.15, model.18, model.21), built in that order.
    const int kNumScales = 3;
    nvinfer1::ITensor* headInputs[kNumScales] = {conv15->getOutput(0), conv18->getOutput(0), conv21->getOutput(0)};
    nvinfer1::IConcatenationLayer* rawHeads[kNumScales];
    for (int i = 0; i < kNumScales; ++i) {
        rawHeads[i] = yolov8DetScaleHead(network, weightMap, *headInputs[i], i, base_in_channel, base_out_channel);
    }

    /*******************************************************************************************************
    *******************************************  YOLOV8 DETECT  ********************************************
    *******************************************************************************************************/
    // Strides are derived from the outputs of the three stride-2 convolutions model.3/5/7.
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[kNumScales];
    calculateStrides(conv_layers, kNumScales, kInputH, strides);

    std::vector<nvinfer1::IConcatenationLayer*> dets;
    dets.reserve(kNumScales);
    for (int i = 0; i < kNumScales; ++i) {
        dets.push_back(yolov8DetDecodeScale(network, weightMap, *rawHeads[i]->getOutput(0), strides[i]));
    }

    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, dets, strides, kNumScales, kNumClass, false, false, false);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        // Do not claim success when TensorRT could not produce an engine.
        std::cerr << "Build engine failed!" << std::endl;
    }

#if !defined(USE_FP16) && defined(USE_INT8)
    // The calibrator is only needed during the build; detach it from the config before freeing
    // it so the caller's config does not keep a dangling pointer.
    config->setInt8Calibrator(nullptr);
    delete calibrator;
#endif

    delete network;

    // Release the host buffers allocated by loadWeights.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Builds the YOLOv8 "P6" detection network and returns the result of serializing it.
//
// The function loads weights from `wts_path`, assembles backbone + head + four detection
// outputs (taken from conv20, conv23, conv26 and conv29), appends the YOLO plugin layer,
// configures the builder (FP16 / INT8 via USE_FP16 / USE_INT8) and serializes the engine.
//
// Parameters:
//   builder      - TensorRT builder used to create the network and build the engine.
//   config       - builder configuration; workspace size and precision flags are set here.
//   dt           - data type of the network input tensor.
//   wts_path     - path handed to loadWeights().
//   gd           - depth multiple, forwarded to get_depth().
//   gw           - width multiple, forwarded to get_width(); also selects head channel widths.
//   max_channels - channel cap, forwarded to get_width().
//
// Returns the pointer produced by IBuilder::buildSerializedNetwork(); this is nullptr when
// the build fails, so callers must check it.
nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);

    /*******************************************************************************************************
     *  YOLOV8 INPUT
     *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
     *  YOLOV8 BACKBONE
     *  Alternating convBnSiLU / C2F stages (model.0 .. model.10) followed by SPPF (model.11).
     *  conv3, conv5, conv7 and conv9 are later used to derive the detection strides.
     *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1");
    // 11233 (original note; presumably the resulting repeat count per model scale - TODO confirm)
    nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                                             get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3");
    // 22466 (original note; presumably the resulting repeat count per model scale - TODO confirm)
    nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                                             get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5");
    // 22466 (original note; presumably the resulting repeat count per model scale - TODO confirm)
    nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                             get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");

    nvinfer1::IElementWiseLayer* conv7 =
            convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7");
    nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels),
                                             get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8");

    nvinfer1::IElementWiseLayer* conv9 =
            convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9");
    nvinfer1::IElementWiseLayer* conv10 =
            C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10");

    nvinfer1::IElementWiseLayer* conv11 =
            SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.11");

    /*******************************************************************************************************
     *  YOLOV8 HEAD
     *  Top-down path: upsample x2 and concatenate with the matching backbone output (model.12 .. model.20).
     *  Bottom-up path: stride-2 conv and concatenate with the matching top-down output (model.21 .. model.29).
     *******************************************************************************************************/
    // Per-dimension resize factors: the first dimension is kept, the other two are doubled.
    float scale[] = {1.0, 2.0, 2.0};

    // P5: upsample the SPPF output and fuse it with conv8.
    nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0));
    upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample12->setScales(scale, 3);
    nvinfer1::ITensor* concat13_inputs[] = {upsample12->getOutput(0), conv8->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat13 = network->addConcatenation(concat13_inputs, 2);
    nvinfer1::IElementWiseLayer* conv14 =
            C2(network, weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels),
               get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14");

    // P4: upsample conv14 and fuse it with conv6.
    nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0));
    upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample15->setScales(scale, 3);
    nvinfer1::ITensor* concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat16 = network->addConcatenation(concat16_inputs, 2);
    nvinfer1::IElementWiseLayer* conv17 =
            C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels),
               get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17");

    // P3: upsample conv17 and fuse it with conv4. conv20 feeds detection output 0.
    nvinfer1::IResizeLayer* upsample18 = network->addResize(*conv17->getOutput(0));
    upsample18->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample18->setScales(scale, 3);
    nvinfer1::ITensor* concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat19 = network->addConcatenation(concat19_inputs, 2);
    nvinfer1::IElementWiseLayer* conv20 =
            C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels),
               get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20");

    // Bottom-up layers producing the P4, P5 and P6 detection inputs.
    // P4/16-medium: conv23 feeds detection output 1.
    nvinfer1::IElementWiseLayer* conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0),
                                                     get_width(256, gw, max_channels), 3, 2, 1, "model.21");
    nvinfer1::ITensor* concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat22 = network->addConcatenation(concat22_inputs, 2);
    nvinfer1::IElementWiseLayer* conv23 =
            C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels),
               get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23");

    // P5/32-large: conv26 feeds detection output 2.
    nvinfer1::IElementWiseLayer* conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0),
                                                     get_width(512, gw, max_channels), 3, 2, 1, "model.24");
    nvinfer1::ITensor* concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat25 = network->addConcatenation(concat25_inputs, 2);
    nvinfer1::IElementWiseLayer* conv26 =
            C2(network, weightMap, *concat25->getOutput(0), get_width(768, gw, max_channels),
               get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26");

    // P6/64-xlarge: conv29 feeds detection output 3.
    nvinfer1::IElementWiseLayer* conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0),
                                                     get_width(768, gw, max_channels), 3, 2, 1, "model.27");
    nvinfer1::ITensor* concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat28 = network->addConcatenation(concat28_inputs, 2);
    nvinfer1::IElementWiseLayer* conv29 =
            C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels),
               get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29");

    /*******************************************************************************************************
     *  YOLOV8 OUTPUT
     *  For each of the four scales: a cv2 branch ending in a 64-channel 1x1 conv and a cv3 branch ending
     *  in a kNumClass-channel 1x1 conv, concatenated into one tensor per scale.
     *******************************************************************************************************/
    // Channel widths of the intermediate convs in the cv2 / cv3 branches. The comparisons are exact
    // float equality; 1.25 and 0.25 are exactly representable, so they match when gw was set from
    // those literals.
    int base_in_channel = (gw == 1.25) ? 80 : 64;
    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels);

    // output0 (from conv20)
    nvinfer1::IElementWiseLayer* conv30_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv30_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv30_cv2_0_2 =
            network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]);
    conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv30_cv3_0_0 =
            convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0");
    nvinfer1::IElementWiseLayer* conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.30.cv3.0.1");
    nvinfer1::IConvolutionLayer* conv30_cv3_0_2 =
            network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]);
    // Use the Nd setters like every other head conv (the non-Nd setStride/setPadding are deprecated).
    conv30_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_0 = network->addConcatenation(inputTensor30_0, 2);

    // output1 (from conv23)
    nvinfer1::IElementWiseLayer* conv30_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv30_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv30_cv2_1_2 =
            network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv2.1.2.weight"], weightMap["model.30.cv2.1.2.bias"]);
    conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv30_cv3_1_0 =
            convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0");
    nvinfer1::IElementWiseLayer* conv30_cv3_1_1 = convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.30.cv3.1.1");
    nvinfer1::IConvolutionLayer* conv30_cv3_1_2 =
            network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]);
    conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_1 = network->addConcatenation(inputTensor30_1, 2);

    // output2 (from conv26)
    nvinfer1::IElementWiseLayer* conv30_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv30_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1");
    // addConvolutionNd replaces the deprecated addConvolution so all heads use the same API.
    nvinfer1::IConvolutionLayer* conv30_cv2_2_2 =
            network->addConvolutionNd(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]);
    conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv30_cv3_2_0 =
            convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0");
    nvinfer1::IElementWiseLayer* conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.30.cv3.2.1");
    nvinfer1::IConvolutionLayer* conv30_cv3_2_2 =
            network->addConvolutionNd(*conv30_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]);
    conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_2 = network->addConcatenation(inputTensor30_2, 2);

    // output3 (from conv29)
    nvinfer1::IElementWiseLayer* conv30_cv2_3_0 =
            convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0");
    nvinfer1::IElementWiseLayer* conv30_cv2_3_1 =
            convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1");
    nvinfer1::IConvolutionLayer* conv30_cv2_3_2 =
            network->addConvolutionNd(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]);
    conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv30_cv3_3_0 =
            convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0");
    nvinfer1::IElementWiseLayer* conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.30.cv3.3.1");
    nvinfer1::IConvolutionLayer* conv30_cv3_3_2 =
            network->addConvolutionNd(*conv30_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]);
    conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_3 = network->addConcatenation(inputTensor30_3, 2);

    /*******************************************************************************************************
     *  YOLOV8 DETECT
     *  Per scale: reshape to [64 + kNumClass, gridH * gridW], slice off the first 64 rows and the
     *  following kNumClass rows, run DFL on the 64-row slice and concatenate the DFL output with the
     *  kNumClass-row slice. All four scales share the "model.30.dfl.conv.weight" weights.
     *******************************************************************************************************/
    // One stride per detection scale, computed by calculateStrides from these four backbone layers
    // and kInputH.
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // P3 (strides[0], input cat30_0)
    nvinfer1::IShuffleLayer* shuffle30_0 = network->addShuffle(*cat30_0->getOutput(0));
    shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split30_0_0 = network->addSlice(
            *shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split30_0_1 = network->addSlice(
            *shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl30_0 =
            DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.30.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 2);

    // P4 (strides[1], input cat30_1)
    nvinfer1::IShuffleLayer* shuffle30_1 = network->addShuffle(*cat30_1->getOutput(0));
    shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split30_1_0 = network->addSlice(
            *shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split30_1_1 = network->addSlice(
            *shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl30_1 =
            DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.30.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 2);

    // P5 (strides[2], input cat30_2)
    nvinfer1::IShuffleLayer* shuffle30_2 = network->addShuffle(*cat30_2->getOutput(0));
    shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split30_2_0 = network->addSlice(
            *shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split30_2_1 = network->addSlice(
            *shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl30_2 =
            DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.30.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 2);

    // P6 (strides[3], input cat30_3)
    nvinfer1::IShuffleLayer* shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0));
    shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])});
    nvinfer1::ISliceLayer* split30_3_0 = network->addSlice(
            *shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split30_3_1 = network->addSlice(
            *shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl30_3 =
            DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1,
                1, 0, "model.30.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 2);

    // Feed all four per-scale tensors to the YOLO plugin and expose its output as the network output.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
            network, std::vector<nvinfer1::IConcatenationLayer*>{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3},
            strides, stridesLength, kNumClass, false, false, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16 MiB

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is never deleted in this function; `config` keeps the raw pointer,
    // so releasing it here would need the caller's config lifetime to be taken into account.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // buildSerializedNetwork returns nullptr on failure; only claim success when it actually succeeded.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;

    // Release the host weight buffers held in weightMap (presumably malloc'd by loadWeights - TODO confirm).
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Builds the YOLOv8 detection network variant with four detection scales (fed from layers 18, 21, 24 and 27,
// labelled P2..P5 below) and returns it as a serialized TensorRT engine.
//
//   builder / config : caller-owned TensorRT builder and builder configuration; both are modified here
//                      (max batch size, workspace size, precision flag, optional INT8 calibrator).
//   dt               : data type of the network input tensor.
//   wts_path         : path handed to loadWeights() to obtain the name -> weights map.
//   gd / gw          : depth and width multipliers forwarded to get_depth() / get_width().
//   max_channels     : channel cap forwarded to get_width().
//
// Returns whatever IBuilder::buildSerializedNetwork() returns; callers must check it for nullptr.
// The network definition and the buffers held by the weight map are released before returning.
nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                              int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);

    /*******************************************************************************************************
  ******************************************  YOLOV8 INPUT
  ***********************************************
  *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
  *****************************************  YOLOV8 BACKBONE
  *********************************************
  *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1");
    // 11233
    nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                                             get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3");
    // 22466
    nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                                             get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5");
    // 22466
    nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                             get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 =
            convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7");
    // 11233
    nvinfer1::IElementWiseLayer* conv8 =
            C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 =
            SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.9");

    /*******************************************************************************************************
  *********************************************  YOLOV8 HEAD
  *********************************************
  *******************************************************************************************************/
    // Per-axis resize factors for a CHW tensor: channels untouched, height and width doubled.
    float scale[] = {1.0, 2.0, 2.0};

    // Nearest-neighbour resize of `input` by `scale`.
    auto upsample2x = [&](nvinfer1::ITensor& input) -> nvinfer1::IResizeLayer* {
        nvinfer1::IResizeLayer* resize = network->addResize(input);
        resize->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
        resize->setScales(scale, 3);
        return resize;
    };

    // Concatenation of exactly two tensors (default concatenation axis).
    auto concat2 = [&](nvinfer1::ITensor* first, nvinfer1::ITensor* second) -> nvinfer1::IConcatenationLayer* {
        nvinfer1::ITensor* inputs[] = {first, second};
        return network->addConcatenation(inputs, 2);
    };

    // Top-down path: upsample the deeper map and fuse it with the backbone map of the same resolution.
    // P4: SPPF output (conv9) + backbone conv6
    nvinfer1::IResizeLayer* upsample10 = upsample2x(*conv9->getOutput(0));
    nvinfer1::IConcatenationLayer* concat11 = concat2(upsample10->getOutput(0), conv6->getOutput(0));
    nvinfer1::IElementWiseLayer* conv12 =
            C2F(network, weightMap, *concat11->getOutput(0), get_width(512, gw, max_channels),
                get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12");

    // P3: conv12 + backbone conv4
    nvinfer1::IResizeLayer* upsample13 = upsample2x(*conv12->getOutput(0));
    nvinfer1::IConcatenationLayer* concat14 = concat2(upsample13->getOutput(0), conv4->getOutput(0));
    nvinfer1::IElementWiseLayer* conv15 =
            C2F(network, weightMap, *concat14->getOutput(0), get_width(256, gw, max_channels),
                get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15");

    // P2: conv15 + backbone conv2
    nvinfer1::IResizeLayer* upsample16 = upsample2x(*conv15->getOutput(0));
    nvinfer1::IConcatenationLayer* concat17 = concat2(upsample16->getOutput(0), conv2->getOutput(0));
    nvinfer1::IElementWiseLayer* conv18 =
            C2F(network, weightMap, *concat17->getOutput(0), get_width(128, gw, max_channels),
                get_width(128, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18");

    // Bottom-up path: stride-2 convolution of the finer map, fused with the top-down map of the same level.
    // P3: downsampled conv18 + conv15
    nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0),
                                                     get_width(128, gw, max_channels), 3, 2, 1, "model.19");
    nvinfer1::IConcatenationLayer* concat20 = concat2(conv19->getOutput(0), conv15->getOutput(0));
    nvinfer1::IElementWiseLayer* conv21 =
            C2F(network, weightMap, *concat20->getOutput(0), get_width(256, gw, max_channels),
                get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21");

    // P4: downsampled conv21 + conv12
    nvinfer1::IElementWiseLayer* conv22 = convBnSiLU(network, weightMap, *conv21->getOutput(0),
                                                     get_width(256, gw, max_channels), 3, 2, 1, "model.22");
    nvinfer1::IConcatenationLayer* concat23 = concat2(conv22->getOutput(0), conv12->getOutput(0));
    nvinfer1::IElementWiseLayer* conv24 =
            C2F(network, weightMap, *concat23->getOutput(0), get_width(512, gw, max_channels),
                get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.24");

    // P5: downsampled conv24 + SPPF output (conv9)
    nvinfer1::IElementWiseLayer* conv25 = convBnSiLU(network, weightMap, *conv24->getOutput(0),
                                                     get_width(512, gw, max_channels), 3, 2, 1, "model.25");
    nvinfer1::IConcatenationLayer* concat26 = concat2(conv25->getOutput(0), conv9->getOutput(0));
    nvinfer1::IElementWiseLayer* conv27 =
            C2F(network, weightMap, *concat26->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.27");

    /*******************************************************************************************************
  *********************************************  YOLOV8 OUTPUT
  *******************************************
  *******************************************************************************************************/
    // Channel count of the box-regression branch (cv2); also the size of its final 1x1 convolution.
    int base_in_channel = 64;
    // Hidden channel count of the classification branch (cv3).
    // NOTE(review): Ultralytics' Detect head appears to use max(ch[0], min(nc, 100)) with ch[0] being the
    // width of layer 18; this expression may differ from that for some gw / kNumClass combinations - confirm
    // against the exported weights before changing it.
    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(128, gw, max_channels);

    // Builds detection head `idx` on `feature`: a box branch (model.28.cv2.<idx>) producing base_in_channel maps
    // and a class branch (model.28.cv3.<idx>) producing kNumClass maps, each made of two 3x3 conv+BN+SiLU blocks
    // followed by a biased 1x1 convolution. The two branch outputs are concatenated (box first).
    auto detectHead = [&](nvinfer1::ITensor& feature, int idx) -> nvinfer1::IConcatenationLayer* {
        const std::string boxName = "model.28.cv2." + std::to_string(idx);
        const std::string clsName = "model.28.cv3." + std::to_string(idx);

        nvinfer1::IElementWiseLayer* box0 =
                convBnSiLU(network, weightMap, feature, base_in_channel, 3, 1, 1, boxName + ".0");
        nvinfer1::IElementWiseLayer* box1 =
                convBnSiLU(network, weightMap, *box0->getOutput(0), base_in_channel, 3, 1, 1, boxName + ".1");
        nvinfer1::IConvolutionLayer* box2 =
                network->addConvolutionNd(*box1->getOutput(0), base_in_channel, nvinfer1::DimsHW{1, 1},
                                          weightMap[boxName + ".2.weight"], weightMap[boxName + ".2.bias"]);
        box2->setStrideNd(nvinfer1::DimsHW{1, 1});
        box2->setPaddingNd(nvinfer1::DimsHW{0, 0});

        nvinfer1::IElementWiseLayer* cls0 =
                convBnSiLU(network, weightMap, feature, base_out_channel, 3, 1, 1, clsName + ".0");
        nvinfer1::IElementWiseLayer* cls1 =
                convBnSiLU(network, weightMap, *cls0->getOutput(0), base_out_channel, 3, 1, 1, clsName + ".1");
        nvinfer1::IConvolutionLayer* cls2 =
                network->addConvolutionNd(*cls1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                          weightMap[clsName + ".2.weight"], weightMap[clsName + ".2.bias"]);
        cls2->setStrideNd(nvinfer1::DimsHW{1, 1});
        cls2->setPaddingNd(nvinfer1::DimsHW{0, 0});

        return concat2(box2->getOutput(0), cls2->getOutput(0));
    };

    const int numScales = 4;
    nvinfer1::ITensor* headInputs[numScales] = {conv18->getOutput(0), conv21->getOutput(0), conv24->getOutput(0),
                                                conv27->getOutput(0)};
    std::vector<nvinfer1::IConcatenationLayer*> heads;
    for (int i = 0; i < numScales; ++i) {
        heads.push_back(detectHead(*headInputs[i], i));
    }

    /*******************************************************************************************************
  *********************************************  YOLOV8 DETECT
  *******************************************
  *******************************************************************************************************/

    // One stride per detection scale, derived by calculateStrides() from the listed backbone layers and kInputH.
    nvinfer1::IElementWiseLayer* conv_layers[numScales] = {conv1, conv3, conv5, conv7};
    int strides[numScales];
    calculateStrides(conv_layers, numScales, kInputH, strides);
    int stridesLength = numScales;

    // Per scale: flatten the head output to (64 + kNumClass, cells), split off the first 64 box rows, run them
    // through DFL, and re-attach the kNumClass class rows.
    std::vector<nvinfer1::IConcatenationLayer*> detections;
    for (int i = 0; i < numScales; ++i) {
        const int cells = (kInputH / strides[i]) * (kInputW / strides[i]);

        nvinfer1::IShuffleLayer* flat = network->addShuffle(*heads[i]->getOutput(0));
        flat->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, cells});

        nvinfer1::ISliceLayer* boxRows = network->addSlice(*flat->getOutput(0), nvinfer1::Dims2{0, 0},
                                                           nvinfer1::Dims2{64, cells}, nvinfer1::Dims2{1, 1});
        nvinfer1::ISliceLayer* clsRows = network->addSlice(*flat->getOutput(0), nvinfer1::Dims2{64, 0},
                                                           nvinfer1::Dims2{kNumClass, cells}, nvinfer1::Dims2{1, 1});

        // Every scale shares the same DFL convolution weights.
        nvinfer1::IShuffleLayer* dfl = DFL(network, weightMap, *boxRows->getOutput(0), 4, cells, 1, 1, 0,
                                           "model.28.dfl.conv.weight");

        detections.push_back(concat2(dfl->getOutput(0), clsRows->getOutput(0)));
    }

    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, detections, strides, stridesLength, kNumClass, false, false, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is handed to `config` and is not deleted anywhere in this function.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    if (serialized_model) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        // Do not claim success when the builder returned nothing; the nullptr is still passed to the caller.
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;

    // Release the buffers referenced by the weight map.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Builds the YOLOv8 classification network (backbone layers model.0 .. model.8 followed by the model.9
// classification head) and returns it as a serialized TensorRT engine.
//
//   builder / config : caller-owned TensorRT builder and builder configuration; both are modified here
//                      (max batch size, workspace size, precision flag, optional INT8 calibrator).
//   dt               : data type of the network input tensor.
//   wts_path         : path handed to loadWeights() to obtain the name -> weights map.
//   gd / gw          : depth and width multipliers forwarded to get_depth() / get_width().
//
// Returns whatever IBuilder::buildSerializedNetwork() returns; callers must check it for nullptr.
// The network definition and the buffers held by the weight map are released before returning.
nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
    // Channel cap passed to get_width(); fixed for the classification model instead of being a parameter.
    int max_channels = 1280;
    // ****************************************** YOLOV8 INPUT
    // **********************************************
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kClsInputH, kClsInputW});
    assert(data);

    // ***************************************** YOLOV8 BACKBONE
    // ********************************************
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1");
    // C2 Block (11233)
    nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                                             get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3");
    // C2 Block Sequence (22466)
    nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                                             get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5");
    // C2 Block Sequence (22466)
    nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                             get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 =
            convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7");
    // C2 Block (11233)
    nvinfer1::IElementWiseLayer* conv8 =
            C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8");

    // ********************************************* YOLOV8 HEAD
    // *********************************************

    // 1x1 conv + BN + SiLU to 1280 channels. A 1x1 kernel needs no padding: padding it by 1 would enlarge the
    // feature map with a border of constant activations that the global average pool below would then include.
    // NOTE(review): assumes the three integers after the channel count are kernel, stride, padding, as the
    // 3x3 / stride-2 / pad-1 calls above suggest - confirm against convBnSiLU's declaration.
    auto conv_class = convBnSiLU(network, weightMap, *conv8->getOutput(0), 1280, 1, 1, 0, "model.9.conv");

    // The pooling window is taken from the actual output shape so that it always covers the whole map.
    nvinfer1::Dims dims = conv_class->getOutput(0)->getDimensions();
    assert(dims.nbDims == 3);  // Make sure there are exactly 3 dimensions (channels, height, width)

    // Global average pooling: one window spanning the full height and width.
    nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), nvinfer1::PoolingType::kAVERAGE,
                                                           nvinfer1::DimsHW{dims.d[1], dims.d[2]});
    assert(pool2);

    // Linear classifier producing kClsNumClass outputs.
    nvinfer1::IFullyConnectedLayer* yolo = network->addFullyConnected(
            *pool2->getOutput(0), kClsNumClass, weightMap["model.9.linear.weight"], weightMap["model.9.linear.bias"]);
    assert(yolo);

    // Set the name for the output tensor and mark it as network output
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Set the maximum batch size and workspace size
    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));

    // Configuration according to the precision mode being used
#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform supports int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is handed to `config` and is not deleted anywhere in this function.
    auto* calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputH, kInputQuantizationFolder,
                                                  "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    // Begin building the engine; this may take a while
    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    if (serialized_model) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        // Do not claim success when the builder returned nothing; the nullptr is still passed to the caller.
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Cleanup the network definition and allocated weights
    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

/**
 * Builds the YOLOv8 segmentation network layer by layer and serializes it.
 *
 * The weights are read with loadWeights(), the backbone / neck / detect head are added to a freshly
 * created network definition, and two tensors are marked as network outputs: the output of the layer
 * returned by addYoLoLayer() (named kOutputTensorName) and the output of Proto() (named "proto").
 *
 * @param builder       TensorRT builder used to create the network and to build the serialized engine.
 * @param config        Builder configuration; workspace size, precision flags and (for INT8) the
 *                      calibrator are set on it by this function.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path handed to loadWeights().
 * @param gd            Depth multiple forwarded to get_depth().
 * @param gw            Width multiple forwarded to get_width(); also selects the head channel counts.
 * @param max_channels  Channel limit forwarded to get_width().
 * @return The pointer returned by buildSerializedNetwork(); nullptr when the build failed.
 */
nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    /*******************************************************************************************************
     ******************************************  YOLOV8 INPUT  *********************************************
     *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
     *****************************************  YOLOV8 BACKBONE  *******************************************
     *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1");
    nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                                             get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3");
    nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                                             get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5");
    nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                             get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 =
            convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7");
    nvinfer1::IElementWiseLayer* conv8 =
            C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 =
            SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.9");

    /*******************************************************************************************************
     *********************************************  YOLOV8 HEAD  *******************************************
     *******************************************************************************************************/
    // Resize scales per dimension of the 3-D tensors: first dimension unchanged, the other two doubled.
    float scale[] = {1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0));
    assert(upsample10);
    upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample10->setScales(scale, 3);

    nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2);
    nvinfer1::IElementWiseLayer* conv12 =
            C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels),
                get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12");

    nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0));
    assert(upsample13);
    upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample13->setScales(scale, 3);

    nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2);
    nvinfer1::IElementWiseLayer* conv15 =
            C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels),
                get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15");
    nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0),
                                                     get_width(256, gw, max_channels), 3, 2, 1, "model.16");
    nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2);
    nvinfer1::IElementWiseLayer* conv18 =
            C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels),
                get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18");
    nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0),
                                                     get_width(512, gw, max_channels), 3, 2, 1, "model.19");
    nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2);
    nvinfer1::IElementWiseLayer* conv21 =
            C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21");

    /*******************************************************************************************************
     *********************************************  YOLOV8 OUTPUT  *****************************************
     *******************************************************************************************************/
    // Channel counts of the cv2 / cv3 branches depend on the width multiple.
    int base_in_channel = (gw == 1.25) ? 80 : 64;
    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels);

    // All six final 1x1 convolutions below use the Nd API with an explicit unit stride and zero padding,
    // so the three output branches are configured identically.

    // output0
    nvinfer1::IElementWiseLayer* conv22_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv22_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv22_cv2_0_2 =
            network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]);
    conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv22_cv3_0_0 =
            convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0");
    nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.22.cv3.0.1");
    nvinfer1::IConvolutionLayer* conv22_cv3_0_2 =
            network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]);
    conv22_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2);

    // output1
    nvinfer1::IElementWiseLayer* conv22_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv22_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv22_cv2_1_2 =
            network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]);
    conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv22_cv3_1_0 =
            convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0");
    nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.22.cv3.1.1");
    nvinfer1::IConvolutionLayer* conv22_cv3_1_2 =
            network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]);
    conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2);

    // output2
    nvinfer1::IElementWiseLayer* conv22_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv22_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv22_cv2_2_2 =
            network->addConvolutionNd(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]);
    conv22_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv22_cv3_2_0 =
            convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0");
    nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.22.cv3.2.1");
    nvinfer1::IConvolutionLayer* conv22_cv3_2_2 =
            network->addConvolutionNd(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]);
    conv22_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2);

    /*******************************************************************************************************
     *********************************************  YOLOV8 DETECT  *****************************************
     *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // Number of grid positions of each output scale, i.e. the flattened second dimension used below.
    const auto cells0 = (kInputH / strides[0]) * (kInputW / strides[0]);
    const auto cells1 = (kInputH / strides[1]) * (kInputW / strides[1]);
    const auto cells2 = (kInputH / strides[2]) * (kInputW / strides[2]);

    // Each concatenated head output is reshaped to 2-D, then sliced into its first 64 rows
    // (fed to DFL) and the following kNumClass rows.
    nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0));
    shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, cells0});
    nvinfer1::ISliceLayer* split22_0_0 = network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0},
                                                           nvinfer1::Dims2{64, cells0}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split22_0_1 = network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0},
                                                           nvinfer1::Dims2{kNumClass, cells0}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl22_0 =
            DFL(network, weightMap, *split22_0_0->getOutput(0), 4, cells0, 1, 1, 0, "model.22.dfl.conv.weight");

    nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0));
    shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, cells1});
    nvinfer1::ISliceLayer* split22_1_0 = network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0},
                                                           nvinfer1::Dims2{64, cells1}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split22_1_1 = network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0},
                                                           nvinfer1::Dims2{kNumClass, cells1}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl22_1 =
            DFL(network, weightMap, *split22_1_0->getOutput(0), 4, cells1, 1, 1, 0, "model.22.dfl.conv.weight");

    nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0));
    shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, cells2});
    nvinfer1::ISliceLayer* split22_2_0 = network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0},
                                                           nvinfer1::Dims2{64, cells2}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split22_2_1 = network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0},
                                                           nvinfer1::Dims2{kNumClass, cells2}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl22_2 =
            DFL(network, weightMap, *split22_2_0->getOutput(0), 4, cells2, 1, 1, 0, "model.22.dfl.conv.weight");

    // det0
    auto proto_coef_0 =
            cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", cells0, gw, "seg");
    nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0),
                                                proto_coef_0->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3);

    // det1
    auto proto_coef_1 =
            cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", cells1, gw, "seg");
    nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0),
                                                proto_coef_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3);

    // det2
    auto proto_coef_2 =
            cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", cells2, gw, "seg");
    nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0),
                                                proto_coef_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3);

    // First network output: the plugin layer fed by the three per-scale concatenations.
    // NOTE(review): the three trailing booleans presumably select the segmentation / pose / other
    // mode of the plugin - confirm against addYoLoLayer().
    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2},
                         strides, stridesLength, kNumClass, true, false, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Second network output: the "proto" tensor computed from conv15.
    auto proto = Proto(network, weightMap, *conv15->getOutput(0), "model.22.proto", gw, max_channels);
    proto->getOutput(0)->setName("proto");
    network->markOutput(*proto->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator allocated here is never released in this function and stays
    // registered on `config` after returning - confirm who is expected to free it.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when the builder actually produced an engine; nullptr is still returned to
    // the caller after the cleanup below so it can handle the failure.
    if (serialized_model == nullptr) {
        std::cerr << "Build engine failed!" << std::endl;
    } else {
        std::cout << "Build engine successfully!" << std::endl;
    }

    delete network;

    // Release the weight buffers held by the map returned from loadWeights().
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

/**
 * Builds the YOLOv8 pose network layer by layer and serializes it.
 *
 * The weights are read with loadWeights(), the backbone / neck / detect head are added to a freshly
 * created network definition, and the output of the layer returned by addYoLoLayer() is marked as the
 * single network output (named kOutputTensorName).
 *
 * @param builder       TensorRT builder used to create the network and to build the serialized engine.
 * @param config        Builder configuration; workspace size, precision flags and (for INT8) the
 *                      calibrator are set on it by this function.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path handed to loadWeights().
 * @param gd            Depth multiple forwarded to get_depth().
 * @param gw            Width multiple forwarded to get_width(); also selects the head channel counts.
 * @param max_channels  Channel limit forwarded to get_width().
 * @return The pointer returned by buildSerializedNetwork(); nullptr when the build failed.
 */
nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                             int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);

    /*******************************************************************************************************
     ******************************************  YOLOV8 INPUT  *********************************************
     *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
     *****************************************  YOLOV8 BACKBONE  *******************************************
     *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1");
    nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                                             get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3");
    nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                                             get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5");
    nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                             get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 =
            convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7");
    nvinfer1::IElementWiseLayer* conv8 =
            C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 =
            SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.9");

    /*******************************************************************************************************
     *********************************************  YOLOV8 HEAD  *******************************************
     *******************************************************************************************************/
    // Resize scales per dimension of the 3-D tensors: first dimension unchanged, the other two doubled.
    float scale[] = {1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0));
    assert(upsample10);
    upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample10->setScales(scale, 3);

    nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2);
    nvinfer1::IElementWiseLayer* conv12 =
            C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels),
                get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12");

    nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0));
    assert(upsample13);
    upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample13->setScales(scale, 3);

    nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2);
    nvinfer1::IElementWiseLayer* conv15 =
            C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels),
                get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15");
    nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0),
                                                     get_width(256, gw, max_channels), 3, 2, 1, "model.16");
    nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2);
    nvinfer1::IElementWiseLayer* conv18 =
            C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels),
                get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18");
    nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0),
                                                     get_width(512, gw, max_channels), 3, 2, 1, "model.19");
    nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2);
    nvinfer1::IElementWiseLayer* conv21 =
            C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21");

    /*******************************************************************************************************
     *********************************************  YOLOV8 OUTPUT  *****************************************
     *******************************************************************************************************/
    // Channel counts of the cv2 / cv3 branches depend on the width multiple.
    int base_in_channel = (gw == 1.25) ? 80 : 64;
    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kPoseNumClass, 100)) : get_width(256, gw, max_channels);

    // All six final 1x1 convolutions below use the Nd API with an explicit unit stride and zero padding,
    // so the three output branches are configured identically.

    // output0
    nvinfer1::IElementWiseLayer* conv22_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv22_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv22_cv2_0_2 =
            network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]);
    conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv22_cv3_0_0 =
            convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0");
    nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.22.cv3.0.1");
    nvinfer1::IConvolutionLayer* conv22_cv3_0_2 =
            network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]);
    conv22_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2);

    // output1
    nvinfer1::IElementWiseLayer* conv22_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv22_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv22_cv2_1_2 =
            network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]);
    conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv22_cv3_1_0 =
            convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0");
    nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.22.cv3.1.1");
    nvinfer1::IConvolutionLayer* conv22_cv3_1_2 =
            network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]);
    conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2);

    // output2
    nvinfer1::IElementWiseLayer* conv22_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv22_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv22_cv2_2_2 =
            network->addConvolutionNd(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]);
    conv22_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv22_cv3_2_0 =
            convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0");
    nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.22.cv3.2.1");
    nvinfer1::IConvolutionLayer* conv22_cv3_2_2 =
            network->addConvolutionNd(*conv22_cv3_2_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]);
    conv22_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2);

    /*******************************************************************************************************
     *********************************************  YOLOV8 DETECT  *****************************************
     *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // Number of grid positions of each output scale, i.e. the flattened second dimension used below.
    const auto cells0 = (kInputH / strides[0]) * (kInputW / strides[0]);
    const auto cells1 = (kInputH / strides[1]) * (kInputW / strides[1]);
    const auto cells2 = (kInputH / strides[2]) * (kInputW / strides[2]);

    /********************************************  P3  *****************************************************/
    // The concatenated head output is reshaped to 2-D, then sliced into its first 64 rows
    // (fed to DFL) and the following kPoseNumClass rows.
    nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0));
    shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kPoseNumClass, cells0});
    nvinfer1::ISliceLayer* split22_0_0 = network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0},
                                                           nvinfer1::Dims2{64, cells0}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split22_0_1 =
            network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0},
                              nvinfer1::Dims2{kPoseNumClass, cells0}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl22_0 =
            DFL(network, weightMap, *split22_0_0->getOutput(0), 4, cells0, 1, 1, 0, "model.22.dfl.conv.weight");

    // det0
    auto shuffle_conv15 =
            cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", cells0, gw, "pose");

    nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0),
                                                shuffle_conv15->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3);

    /********************************************  P4  *****************************************************/
    nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0));
    shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kPoseNumClass, cells1});
    nvinfer1::ISliceLayer* split22_1_0 = network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0},
                                                           nvinfer1::Dims2{64, cells1}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split22_1_1 =
            network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0},
                              nvinfer1::Dims2{kPoseNumClass, cells1}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl22_1 =
            DFL(network, weightMap, *split22_1_0->getOutput(0), 4, cells1, 1, 1, 0, "model.22.dfl.conv.weight");

    // det1
    auto shuffle_conv18 =
            cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", cells1, gw, "pose");

    nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0),
                                                shuffle_conv18->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3);

    /********************************************  P5  *****************************************************/
    nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0));
    shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kPoseNumClass, cells2});
    nvinfer1::ISliceLayer* split22_2_0 = network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0},
                                                           nvinfer1::Dims2{64, cells2}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split22_2_1 =
            network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0},
                              nvinfer1::Dims2{kPoseNumClass, cells2}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl22_2 =
            DFL(network, weightMap, *split22_2_0->getOutput(0), 4, cells2, 1, 1, 0, "model.22.dfl.conv.weight");

    // det2
    auto shuffle_conv21 =
            cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", cells2, gw, "pose");
    nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0),
                                                shuffle_conv21->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3);

    // Network output: the plugin layer fed by the three per-scale concatenations.
    // NOTE(review): the three trailing booleans presumably select the segmentation / pose / other
    // mode of the plugin - confirm against addYoLoLayer().
    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2},
                         strides, stridesLength, kPoseNumClass, false, true, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator allocated here is never released in this function and stays
    // registered on `config` after returning - confirm who is expected to free it.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when the builder actually produced an engine; nullptr is still returned to
    // the caller after the cleanup below so it can handle the failure.
    if (serialized_model == nullptr) {
        std::cerr << "Build engine failed!" << std::endl;
    } else {
        std::cout << "Build engine successfully!" << std::endl;
    }

    delete network;

    // Release the weight buffers held by the map returned from loadWeights().
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Builds the YOLOv8-pose "P6" network (four output scales, taken from conv20, conv23, conv26 and
// conv29) on top of `builder`, then builds and serializes the engine.
//
// Parameters:
//   builder      - used to create the network definition and to build the serialized engine.
//   config       - builder configuration; workspace size and the FP16/INT8 flags are set on it here.
//   dt           - data type of the network input tensor.
//   wts_path     - path handed to loadWeights() to populate the weight map.
//   gd           - depth multiplier, forwarded to get_depth().
//   gw           - width multiplier, forwarded to get_width(); also selects the head channel widths.
//   max_channels - channel cap forwarded to get_width().
//
// Returns the pointer produced by IBuilder::buildSerializedNetwork(); it is returned as-is and is
// nullptr when the build fails, so callers must check it. The returned memory is not released here.
//
// Side effects: the network definition is deleted and every buffer in the weight map is free()'d
// before returning.
nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                               int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
    assert(network);
    /*******************************************************************************************************
     ******************************************  YOLOV8 INPUT  *********************************************
     *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
    assert(data);
    /*******************************************************************************************************
     *****************************************  YOLOV8 BACKBONE  *******************************************
     *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1");
    // 11233 (presumably the resulting repeat count for the n/s/m/l/x model scales - TODO confirm)
    nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                                             get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3");
    // 22466 (presumably the resulting repeat count for the n/s/m/l/x model scales - TODO confirm)
    nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                                             get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5");
    // 22466
    nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                             get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");

    nvinfer1::IElementWiseLayer* conv7 =
            convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7");
    nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels),
                                             get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8");

    nvinfer1::IElementWiseLayer* conv9 =
            convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9");
    nvinfer1::IElementWiseLayer* conv10 =
            C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10");

    nvinfer1::IElementWiseLayer* conv11 =
            SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.11");

    /*******************************************************************************************************
     *******************************************  YOLOV8 HEAD  *********************************************
     *******************************************************************************************************/
    // Per-dimension resize factors for the 3-D tensors of this network: first dimension kept, the
    // other two doubled.
    float scale[] = {1.0, 2.0, 2.0};

    // Top-down path: upsample and concatenate with the matching backbone output.
    // P5
    nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0));
    upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample12->setScales(scale, 3);
    nvinfer1::ITensor* concat13_inputs[] = {upsample12->getOutput(0), conv8->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat13 = network->addConcatenation(concat13_inputs, 2);
    nvinfer1::IElementWiseLayer* conv14 =
            C2(network, weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels),
               get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14");

    // P4
    nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0));
    upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample15->setScales(scale, 3);
    nvinfer1::ITensor* concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat16 = network->addConcatenation(concat16_inputs, 2);
    nvinfer1::IElementWiseLayer* conv17 =
            C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels),
               get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17");

    // P3
    nvinfer1::IResizeLayer* upsample18 = network->addResize(*conv17->getOutput(0));
    upsample18->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample18->setScales(scale, 3);
    nvinfer1::ITensor* concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat19 = network->addConcatenation(concat19_inputs, 2);
    nvinfer1::IElementWiseLayer* conv20 =
            C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels),
               get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20");

    // Bottom-up path: stride-2 conv, then concatenate with the top-down output of the same scale.
    // P4/16-medium
    nvinfer1::IElementWiseLayer* conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0),
                                                     get_width(256, gw, max_channels), 3, 2, 1, "model.21");
    nvinfer1::ITensor* concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat22 = network->addConcatenation(concat22_inputs, 2);
    nvinfer1::IElementWiseLayer* conv23 =
            C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels),
               get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23");

    // P5/32-large
    nvinfer1::IElementWiseLayer* conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0),
                                                     get_width(512, gw, max_channels), 3, 2, 1, "model.24");
    nvinfer1::ITensor* concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat25 = network->addConcatenation(concat25_inputs, 2);
    nvinfer1::IElementWiseLayer* conv26 =
            C2(network, weightMap, *concat25->getOutput(0), get_width(768, gw, max_channels),
               get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26");

    // P6/64-xlarge
    nvinfer1::IElementWiseLayer* conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0),
                                                     get_width(768, gw, max_channels), 3, 2, 1, "model.27");
    nvinfer1::ITensor* concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)};
    nvinfer1::IConcatenationLayer* concat28 = network->addConcatenation(concat28_inputs, 2);
    nvinfer1::IElementWiseLayer* conv29 =
            C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels),
               get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29");

    /*******************************************************************************************************
     ******************************************  YOLOV8 OUTPUT  ********************************************
     *******************************************************************************************************/
    // Hidden widths of the two per-scale branches: base_in_channel for the cv2 branch (64 output
    // channels), base_out_channel for the cv3 branch (kPoseNumClass output channels).
    int base_in_channel = (gw == 1.25) ? 80 : 64;
    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kPoseNumClass, 100)) : get_width(256, gw, max_channels);

    // output0 (from conv20)
    nvinfer1::IElementWiseLayer* conv30_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv30_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv30_cv2_0_2 =
            network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]);
    conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    nvinfer1::IElementWiseLayer* conv30_cv3_0_0 =
            convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0");
    nvinfer1::IElementWiseLayer* conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.30.cv3.0.1");
    nvinfer1::IConvolutionLayer* conv30_cv3_0_2 =
            network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]);
    // Use the Nd setters like every other head convolution (setStride/setPadding are deprecated).
    conv30_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_0 = network->addConcatenation(inputTensor30_0, 2);

    // output1 (from conv23)
    nvinfer1::IElementWiseLayer* conv30_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv30_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv30_cv2_1_2 =
            network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv2.1.2.weight"], weightMap["model.30.cv2.1.2.bias"]);
    conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv30_cv3_1_0 =
            convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0");
    nvinfer1::IElementWiseLayer* conv30_cv3_1_1 = convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.30.cv3.1.1");
    nvinfer1::IConvolutionLayer* conv30_cv3_1_2 =
            network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]);
    conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_1 = network->addConcatenation(inputTensor30_1, 2);

    // output2 (from conv26)
    nvinfer1::IElementWiseLayer* conv30_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv30_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1");
    // addConvolutionNd replaces the deprecated addConvolution; same arguments, same layer.
    nvinfer1::IConvolutionLayer* conv30_cv2_2_2 =
            network->addConvolutionNd(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]);
    conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv30_cv3_2_0 =
            convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0");
    nvinfer1::IElementWiseLayer* conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.30.cv3.2.1");
    nvinfer1::IConvolutionLayer* conv30_cv3_2_2 =
            network->addConvolutionNd(*conv30_cv3_2_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]);
    conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_2 = network->addConcatenation(inputTensor30_2, 2);

    // output3 (from conv29)
    nvinfer1::IElementWiseLayer* conv30_cv2_3_0 =
            convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0");
    nvinfer1::IElementWiseLayer* conv30_cv2_3_1 =
            convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1");
    nvinfer1::IConvolutionLayer* conv30_cv2_3_2 =
            network->addConvolutionNd(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]);
    conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv30_cv3_3_0 =
            convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0");
    nvinfer1::IElementWiseLayer* conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.30.cv3.3.1");
    nvinfer1::IConvolutionLayer* conv30_cv3_3_2 =
            network->addConvolutionNd(*conv30_cv3_3_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]);
    conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_3 = network->addConcatenation(inputTensor30_3, 2);

    /*******************************************************************************************************
     ******************************************  YOLOV8 DETECT  ********************************************
     *******************************************************************************************************/
    // One stride per output scale, derived by calculateStrides() from the four stride-2 backbone convs.
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // For every scale: flatten the grid, split the 64 cv2 rows from the kPoseNumClass cv3 rows, run
    // the 64 rows through DFL, and concatenate the result with the class rows and the cv4 branch.

    // P3 (strides[0]), input cat30_0
    nvinfer1::IShuffleLayer* shuffle30_0 = network->addShuffle(*cat30_0->getOutput(0));
    shuffle30_0->setReshapeDimensions(
            nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split30_0_0 = network->addSlice(
            *shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split30_0_1 = network->addSlice(
            *shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl30_0 =
            DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.30.dfl.conv.weight");

    // det0
    auto shuffle_conv20 = cv4_conv_combined(network, weightMap, *conv20->getOutput(0), "model.30.cv4.0",
                                            (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose");
    nvinfer1::ITensor* inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0),
                                                shuffle_conv20->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 3);

    // P4 (strides[1]), input cat30_1
    nvinfer1::IShuffleLayer* shuffle30_1 = network->addShuffle(*cat30_1->getOutput(0));
    shuffle30_1->setReshapeDimensions(
            nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split30_1_0 = network->addSlice(
            *shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split30_1_1 = network->addSlice(
            *shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl30_1 =
            DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.30.dfl.conv.weight");

    // det1
    auto shuffle_conv23 = cv4_conv_combined(network, weightMap, *conv23->getOutput(0), "model.30.cv4.1",
                                            (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose");
    nvinfer1::ITensor* inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0),
                                                shuffle_conv23->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 3);

    // P5 (strides[2]), input cat30_2
    nvinfer1::IShuffleLayer* shuffle30_2 = network->addShuffle(*cat30_2->getOutput(0));
    shuffle30_2->setReshapeDimensions(
            nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split30_2_0 = network->addSlice(
            *shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split30_2_1 = network->addSlice(
            *shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl30_2 =
            DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.30.dfl.conv.weight");

    // det2
    auto shuffle_conv26 = cv4_conv_combined(network, weightMap, *conv26->getOutput(0), "model.30.cv4.2",
                                            (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose");
    nvinfer1::ITensor* inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0),
                                                shuffle_conv26->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 3);

    // P6 (strides[3]), input cat30_3
    nvinfer1::IShuffleLayer* shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0));
    shuffle30_3->setReshapeDimensions(
            nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / strides[3]) * (kInputW / strides[3])});
    nvinfer1::ISliceLayer* split30_3_0 = network->addSlice(
            *shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split30_3_1 = network->addSlice(
            *shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl30_3 =
            DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1,
                1, 0, "model.30.dfl.conv.weight");

    // det3
    auto shuffle_conv29 = cv4_conv_combined(network, weightMap, *conv29->getOutput(0), "model.30.cv4.3",
                                            (kInputH / strides[3]) * (kInputW / strides[3]), gw, "pose");
    nvinfer1::ITensor* inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0),
                                                shuffle_conv29->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 3);

    // Final plugin layer over all four scales; its single output is the network output.
    // NOTE(review): the three trailing booleans presumably select segmentation / pose / obb mode
    // (only the middle one is true here) - confirm against addYoLoLayer's declaration.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
            network, std::vector<nvinfer1::IConcatenationLayer*>{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3},
            strides, stridesLength, kPoseNumClass, false, true, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16 MiB

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is allocated with new and is not deleted anywhere in this function.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only report success when the builder actually produced an engine; the (possibly null) pointer
    // is still returned unchanged so existing caller-side checks keep working.
    if (serialized_model == nullptr) {
        std::cerr << "Build engine failed!" << std::endl;
    } else {
        std::cout << "Build engine successfully!" << std::endl;
    }

    delete network;

    // Release the host buffers held by the weight map.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

/**
 * Builds and serializes a TensorRT engine for the YOLOv5u detection model (3 detection scales).
 *
 * The network is assembled layer by layer from the weights stored in `wts_path`:
 * backbone (model.0 - model.9), top-down / bottom-up head (model.10 - model.23) and the
 * decoupled detect head (model.24) followed by the YOLO plugin layer.
 *
 * @param builder       TensorRT builder used to create the network and build the engine.
 * @param config        Builder configuration; workspace size and precision flags are set here.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path of the weights file handed to loadWeights().
 * @param gd            Depth multiplier, forwarded to get_depth().
 * @param gw            Width multiplier, forwarded to get_width_5u(); also selects the detect-head widths.
 * @param max_channels  Not referenced by this function; kept so all builders share one signature.
 * @return The serialized engine, or nullptr when TensorRT fails to build the network.
 *         Ownership of the returned memory passes to the caller.
 */
nvinfer1::IHostMemory* buildEngineYolov8_5uDet(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                               int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);

    /*******************************************************************************************************
     ********************************************  YOLOV5U INPUT  ******************************************
     *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
     ******************************************  YOLOV5U BACKBONE  *****************************************
     *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width_5u(64, gw), 6, 2, calculateP(6), "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width_5u(128, gw), 3, 2, calculateP(3), "model.1");
    // 11233
    nvinfer1::IElementWiseLayer* conv2 = C3(network, weightMap, *conv1->getOutput(0), get_width_5u(128, gw),
                                            get_width_5u(128, gw), get_depth(3, gd), true, 0.5, "model.2");

    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width_5u(256, gw), 3, 2, calculateP(3), "model.3");
    // 22466
    nvinfer1::IElementWiseLayer* conv4 = C3(network, weightMap, *conv3->getOutput(0), get_width_5u(256, gw),
                                            get_width_5u(256, gw), get_depth(6, gd), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width_5u(512, gw), 3, 2, calculateP(3), "model.5");
    // 22466
    nvinfer1::IElementWiseLayer* conv6 = C3(network, weightMap, *conv5->getOutput(0), get_width_5u(512, gw),
                                            get_width_5u(512, gw), get_depth(6, gd), true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width_5u(1024, gw), 3,
                                                    2, calculateP(3), "model.7");
    // 11233
    nvinfer1::IElementWiseLayer* conv8 = C3(network, weightMap, *conv7->getOutput(0), get_width_5u(1024, gw),
                                            get_width_5u(1024, gw), get_depth(3, gd), true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width_5u(1024, gw),
                                              get_width_5u(1024, gw), 5, "model.9");

    /*******************************************************************************************************
     ********************************************  YOLOV5U HEAD  *******************************************
     *******************************************************************************************************/

    //*********************************************  cat backbone P4
    //********************************************
    // Reduce channels, upsample to the spatial size of conv6 and concatenate with it.
    nvinfer1::IElementWiseLayer* conv10 = convBnSiLU(network, weightMap, *conv9->getOutput(0), get_width_5u(512, gw), 1,
                                                     1, calculateP(1), "model.10");
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0));
    assert(upsample11);
    upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample11->setOutputDimensions(conv6->getOutput(0)->getDimensions());
    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);
    nvinfer1::IElementWiseLayer* conv13 = C3(network, weightMap, *cat12->getOutput(0), get_width_5u(512, gw),
                                             get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.13");
    //*********************************************  cat backbone P4
    //********************************************

    //*********************************************  cat backbone P3
    //********************************************
    // Reduce channels, upsample to the spatial size of conv4 and concatenate with it.
    nvinfer1::IElementWiseLayer* conv14 = convBnSiLU(network, weightMap, *conv13->getOutput(0), get_width_5u(256, gw),
                                                     1, 1, calculateP(1), "model.14");
    nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0));
    assert(upsample15);
    upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample15->setOutputDimensions(conv4->getOutput(0)->getDimensions());
    nvinfer1::ITensor* inputTensor16[] = {upsample15->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensor16, 2);
    nvinfer1::IElementWiseLayer* conv17 = C3(network, weightMap, *cat16->getOutput(0), get_width_5u(256, gw),
                                             get_width_5u(256, gw), get_depth(3, gd), false, 0.5, "model.17");
    //*********************************************  cat backbone P3
    //********************************************

    //*********************************************  cat head P4
    //********************************************
    // Stride-2 conv on the P3 head output, concatenated with the pre-upsample tensor conv14.
    nvinfer1::IElementWiseLayer* conv18 = convBnSiLU(network, weightMap, *conv17->getOutput(0), get_width_5u(256, gw),
                                                     3, 2, calculateP(3), "model.18");
    nvinfer1::ITensor* inputTensor19[] = {conv18->getOutput(0), conv14->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat19 = network->addConcatenation(inputTensor19, 2);
    nvinfer1::IElementWiseLayer* conv20 = C3(network, weightMap, *cat19->getOutput(0), get_width_5u(512, gw),
                                             get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.20");
    //*********************************************  cat head P4
    //********************************************

    //*********************************************  cat head P5
    //********************************************
    // Stride-2 conv on the P4 head output, concatenated with the pre-upsample tensor conv10.
    nvinfer1::IElementWiseLayer* conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), get_width_5u(512, gw),
                                                     3, 2, calculateP(3), "model.21");
    nvinfer1::ITensor* inputTensor22[] = {conv21->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22 = network->addConcatenation(inputTensor22, 2);
    nvinfer1::IElementWiseLayer* conv23 = C3(network, weightMap, *cat22->getOutput(0), get_width_5u(1024, gw),
                                             get_width_5u(1024, gw), get_depth(3, gd), false, 0.5, "model.23");
    //*********************************************  cat head P5
    //********************************************

    /*******************************************************************************************************
     *******************************************  YOLOV5U OUTPUT  ******************************************
     *******************************************************************************************************/
    // Hidden widths of the two detect-head branches: cv2 (box regression, 64 output channels)
    // and cv3 (classification, kNumClass output channels). 1.25 and 0.25 are exactly
    // representable, so the floating-point equality comparisons are well defined.
    int base_in_channel = (gw == 1.25) ? 80 : 64;
    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width_5u(256, gw);

    // output0: detect head on conv17
    nvinfer1::IElementWiseLayer* conv24_cv2_0_0 = convBnSiLU(network, weightMap, *conv17->getOutput(0), base_in_channel,
                                                             3, 1, calculateP(3), "model.24.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv24_cv2_0_1 = convBnSiLU(network, weightMap, *conv24_cv2_0_0->getOutput(0),
                                                             base_in_channel, 3, 1, calculateP(3), "model.24.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv24_cv2_0_2 =
            network->addConvolutionNd(*conv24_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.24.cv2.0.2.weight"], weightMap["model.24.cv2.0.2.bias"]);
    conv24_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv24_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv24_cv3_0_0 = convBnSiLU(network, weightMap, *conv17->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.0.0");
    nvinfer1::IElementWiseLayer* conv24_cv3_0_1 = convBnSiLU(network, weightMap, *conv24_cv3_0_0->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.0.1");
    nvinfer1::IConvolutionLayer* conv24_cv3_0_2 =
            network->addConvolutionNd(*conv24_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.24.cv3.0.2.weight"], weightMap["model.24.cv3.0.2.bias"]);
    // Use the Nd setters (as the cv2 branch does) instead of the deprecated setStride/setPadding.
    conv24_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv24_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor24_0[] = {conv24_cv2_0_2->getOutput(0), conv24_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat24_0 = network->addConcatenation(inputTensor24_0, 2);

    // output1: detect head on conv20
    nvinfer1::IElementWiseLayer* conv24_cv2_1_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel,
                                                             3, 1, calculateP(3), "model.24.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv24_cv2_1_1 = convBnSiLU(network, weightMap, *conv24_cv2_1_0->getOutput(0),
                                                             base_in_channel, 3, 1, calculateP(3), "model.24.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv24_cv2_1_2 =
            network->addConvolutionNd(*conv24_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.24.cv2.1.2.weight"], weightMap["model.24.cv2.1.2.bias"]);
    conv24_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv24_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv24_cv3_1_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.1.0");
    nvinfer1::IElementWiseLayer* conv24_cv3_1_1 = convBnSiLU(network, weightMap, *conv24_cv3_1_0->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.1.1");
    nvinfer1::IConvolutionLayer* conv24_cv3_1_2 =
            network->addConvolutionNd(*conv24_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.24.cv3.1.2.weight"], weightMap["model.24.cv3.1.2.bias"]);
    conv24_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv24_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor24_1[] = {conv24_cv2_1_2->getOutput(0), conv24_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat24_1 = network->addConcatenation(inputTensor24_1, 2);

    // output2: detect head on conv23
    nvinfer1::IElementWiseLayer* conv24_cv2_2_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel,
                                                             3, 1, calculateP(3), "model.24.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv24_cv2_2_1 = convBnSiLU(network, weightMap, *conv24_cv2_2_0->getOutput(0),
                                                             base_in_channel, 3, 1, calculateP(3), "model.24.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv24_cv2_2_2 =
            network->addConvolutionNd(*conv24_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.24.cv2.2.2.weight"], weightMap["model.24.cv2.2.2.bias"]);
    conv24_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv24_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv24_cv3_2_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.2.0");
    nvinfer1::IElementWiseLayer* conv24_cv3_2_1 = convBnSiLU(network, weightMap, *conv24_cv3_2_0->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.2.1");
    nvinfer1::IConvolutionLayer* conv24_cv3_2_2 =
            network->addConvolutionNd(*conv24_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.24.cv3.2.2.weight"], weightMap["model.24.cv3.2.2.bias"]);
    conv24_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv24_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor24_2[] = {conv24_cv2_2_2->getOutput(0), conv24_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat24_2 = network->addConcatenation(inputTensor24_2, 2);

    /*******************************************************************************************************
     *******************************************  YOLOV5U DETECT  ******************************************
     *******************************************************************************************************/

    // One stride per detection scale; calculateStrides() fills `strides` from these layers and kInputH.
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // For each scale: flatten the spatial dims, split the 64 box channels from the kNumClass
    // class channels, run DFL on the box channels and concatenate the result with the class channels.

    // det0
    nvinfer1::IShuffleLayer* shuffle24_0 = network->addShuffle(*cat24_0->getOutput(0));
    shuffle24_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split24_0_0 = network->addSlice(
            *shuffle24_0->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split24_0_1 = network->addSlice(
            *shuffle24_0->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl24_0 =
            DFL(network, weightMap, *split24_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.24.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor24_dfl_0[] = {dfl24_0->getOutput(0), split24_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat24_dfl_0 = network->addConcatenation(inputTensor24_dfl_0, 2);

    // det1
    nvinfer1::IShuffleLayer* shuffle24_1 = network->addShuffle(*cat24_1->getOutput(0));
    shuffle24_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split24_1_0 = network->addSlice(
            *shuffle24_1->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split24_1_1 = network->addSlice(
            *shuffle24_1->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl24_1 =
            DFL(network, weightMap, *split24_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.24.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor24_dfl_1[] = {dfl24_1->getOutput(0), split24_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat24_dfl_1 = network->addConcatenation(inputTensor24_dfl_1, 2);

    // det2
    nvinfer1::IShuffleLayer* shuffle24_2 = network->addShuffle(*cat24_2->getOutput(0));
    shuffle24_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split24_2_0 = network->addSlice(
            *shuffle24_2->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split24_2_1 = network->addSlice(
            *shuffle24_2->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl24_2 =
            DFL(network, weightMap, *split24_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.24.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor24_dfl_2[] = {dfl24_2->getOutput(0), split24_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat24_dfl_2 = network->addConcatenation(inputTensor24_dfl_2, 2);

    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat24_dfl_0, cat24_dfl_1, cat24_dfl_2},
                         strides, stridesLength, kNumClass, false, false, false);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16 MiB

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is never deleted in this function; `config` keeps the raw
    // pointer, so it must stay alive at least until the build below has finished.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // A failed build yields nullptr; do not report success in that case.
    if (serialized_model == nullptr) {
        std::cerr << "Build engine failed!" << std::endl;
    } else {
        std::cout << "Build engine successfully!" << std::endl;
    }

    delete network;

    // Release the host-side weight buffers (presumably malloc'ed by loadWeights - confirm there).
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

nvinfer1::IHostMemory* buildEngineYolov8_5uDetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                                 nvinfer1::DataType dt, const std::string& wts_path, float& gd,
                                                 float& gw, int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);

    /*******************************************************************************************************
  ******************************************  YOLOV5U-P6 INPUT
  ***********************************************
  *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
  *****************************************  YOLOV5U-P6 BACKBONE
  *********************************************
  *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width_5u(64, gw), 6, 2, calculateP(6), "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width_5u(128, gw), 3, 2, calculateP(3), "model.1");
    // 11233
    nvinfer1::IElementWiseLayer* conv2 = C3(network, weightMap, *conv1->getOutput(0), get_width_5u(128, gw),
                                            get_width_5u(128, gw), get_depth(3, gd), true, 0.5, "model.2");

    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width_5u(256, gw), 3, 2, calculateP(3), "model.3");
    // 22466
    nvinfer1::IElementWiseLayer* conv4 = C3(network, weightMap, *conv3->getOutput(0), get_width_5u(256, gw),
                                            get_width_5u(256, gw), get_depth(6, gd), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width_5u(512, gw), 3, 2, calculateP(3), "model.5");
    // 22466
    nvinfer1::IElementWiseLayer* conv6 = C3(network, weightMap, *conv5->getOutput(0), get_width_5u(512, gw),
                                            get_width_5u(512, gw), get_depth(6, gd), true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 =
            convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width_5u(768, gw), 3, 2, calculateP(3), "model.7");
    // 11233
    nvinfer1::IElementWiseLayer* conv8 = C3(network, weightMap, *conv7->getOutput(0), get_width_5u(768, gw),
                                            get_width_5u(768, gw), get_depth(3, gd), true, 0.5, "model.8");

    nvinfer1::IElementWiseLayer* conv9 = convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width_5u(1024, gw), 3,
                                                    2, calculateP(3), "model.9");
    // 11233
    nvinfer1::IElementWiseLayer* conv10 = C3(network, weightMap, *conv9->getOutput(0), get_width_5u(1024, gw),
                                             get_width_5u(1024, gw), get_depth(3, gd), true, 0.5, "model.10");

    nvinfer1::IElementWiseLayer* conv11 = SPPF(network, weightMap, *conv10->getOutput(0), get_width_5u(1024, gw),
                                               get_width_5u(1024, gw), 5, "model.11");
    /*******************************************************************************************************
  *********************************************  YOLOV5U-P6 HEAD
  *********************************************
  *******************************************************************************************************/

    //*********************************************  cat backbone P5
    //********************************************
    nvinfer1::IElementWiseLayer* conv12 = convBnSiLU(network, weightMap, *conv11->getOutput(0), get_width_5u(768, gw),
                                                     1, 1, calculateP(1), "model.12");
    nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0));
    assert(upsample13);
    upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample13->setOutputDimensions(conv8->getOutput(0)->getDimensions());
    nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv8->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2);
    nvinfer1::IElementWiseLayer* conv15 = C3(network, weightMap, *cat14->getOutput(0), get_width_5u(768, gw),
                                             get_width_5u(768, gw), get_depth(3, gd), false, 0.5, "model.15");
    //*********************************************  cat backbone P5
    //********************************************

    //*********************************************  cat backbone P4
    //********************************************
    nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), get_width_5u(512, gw),
                                                     1, 1, calculateP(1), "model.16");
    nvinfer1::IResizeLayer* upsample17 = network->addResize(*conv16->getOutput(0));
    assert(upsample17);
    upsample17->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample17->setOutputDimensions(conv6->getOutput(0)->getDimensions());
    nvinfer1::ITensor* inputTensor18[] = {upsample17->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2);
    nvinfer1::IElementWiseLayer* conv19 = C3(network, weightMap, *cat18->getOutput(0), get_width_5u(512, gw),
                                             get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.19");
    //*********************************************  cat backbone P4
    //********************************************

    //*********************************************  cat backbone P3
    //********************************************
    nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width_5u(256, gw),
                                                     1, 1, calculateP(1), "model.20");
    nvinfer1::IResizeLayer* upsample21 = network->addResize(*conv20->getOutput(0));
    assert(upsample21);
    upsample21->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample21->setOutputDimensions(conv4->getOutput(0)->getDimensions());
    nvinfer1::ITensor* inputTensor22[] = {upsample21->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22 = network->addConcatenation(inputTensor22, 2);
    nvinfer1::IElementWiseLayer* conv23 = C3(network, weightMap, *cat22->getOutput(0), get_width_5u(256, gw),
                                             get_width_5u(256, gw), get_depth(3, gd), false, 0.5, "model.23");
    //*********************************************  cat backbone P3
    //********************************************

    //*********************************************  cat head P4
    //********************************************
    nvinfer1::IElementWiseLayer* conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), get_width_5u(256, gw),
                                                     3, 2, calculateP(3), "model.24");
    nvinfer1::ITensor* inputTensor25[] = {conv24->getOutput(0), conv20->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat25 = network->addConcatenation(inputTensor25, 2);
    nvinfer1::IElementWiseLayer* conv26 = C3(network, weightMap, *cat25->getOutput(0), get_width_5u(512, gw),
                                             get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.26");
    //*********************************************  cat head P4
    //********************************************

    //*********************************************  cat head P5
    //********************************************
    nvinfer1::IElementWiseLayer* conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), get_width_5u(512, gw),
                                                     3, 2, calculateP(3), "model.27");
    nvinfer1::ITensor* inputTensor28[] = {conv27->getOutput(0), conv16->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat28 = network->addConcatenation(inputTensor28, 2);
    nvinfer1::IElementWiseLayer* conv29 = C3(network, weightMap, *cat28->getOutput(0), get_width_5u(768, gw),
                                             get_width_5u(768, gw), get_depth(3, gd), false, 0.5, "model.29");
    //*********************************************  cat head P5
    //********************************************

    //*********************************************  cat head P6
    //********************************************
    nvinfer1::IElementWiseLayer* conv30 = convBnSiLU(network, weightMap, *conv29->getOutput(0), get_width_5u(768, gw),
                                                     3, 2, calculateP(3), "model.30");
    nvinfer1::ITensor* inputTensor31[] = {conv30->getOutput(0), conv12->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat31 = network->addConcatenation(inputTensor31, 2);
    nvinfer1::IElementWiseLayer* conv32 = C3(network, weightMap, *cat31->getOutput(0), get_width_5u(768, gw),
                                             get_width_5u(1024, gw), get_depth(3, gd), false, 0.5, "model.32");
    //*********************************************  cat head P6
    //********************************************

    /*******************************************************************************************************
  *********************************************  YOLOV5U-P6 OUTPUT
  *******************************************
  *******************************************************************************************************/
    int base_in_channel = (gw == 1.25) ? 80 : 64;
    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width_5u(256, gw);

    // output0
    nvinfer1::IElementWiseLayer* conv33_cv2_0_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel,
                                                             3, 1, calculateP(3), "model.33.cv2.0.0");
    nvinfer1::IElementWiseLayer* conv33_cv2_0_1 = convBnSiLU(network, weightMap, *conv33_cv2_0_0->getOutput(0),
                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.0.1");
    nvinfer1::IConvolutionLayer* conv33_cv2_0_2 =
            network->addConvolutionNd(*conv33_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.33.cv2.0.2.weight"], weightMap["model.33.cv2.0.2.bias"]);
    conv33_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv33_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv33_cv3_0_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.0.0");
    nvinfer1::IElementWiseLayer* conv33_cv3_0_1 = convBnSiLU(network, weightMap, *conv33_cv3_0_0->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.0.1");
    nvinfer1::IConvolutionLayer* conv33_cv3_0_2 =
            network->addConvolutionNd(*conv33_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.33.cv3.0.2.weight"], weightMap["model.33.cv3.0.2.bias"]);
    conv33_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1});
    conv33_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor33_0[] = {conv33_cv2_0_2->getOutput(0), conv33_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat33_0 = network->addConcatenation(inputTensor33_0, 2);

    // output1
    nvinfer1::IElementWiseLayer* conv33_cv2_1_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel,
                                                             3, 1, calculateP(3), "model.33.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv33_cv2_1_1 = convBnSiLU(network, weightMap, *conv33_cv2_1_0->getOutput(0),
                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv33_cv2_1_2 =
            network->addConvolutionNd(*conv33_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.33.cv2.1.2.weight"], weightMap["model.33.cv2.1.2.bias"]);
    conv33_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv33_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv33_cv3_1_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.1.0");
    nvinfer1::IElementWiseLayer* conv33_cv3_1_1 = convBnSiLU(network, weightMap, *conv33_cv3_1_0->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.1.1");
    nvinfer1::IConvolutionLayer* conv33_cv3_1_2 =
            network->addConvolutionNd(*conv33_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.33.cv3.1.2.weight"], weightMap["model.33.cv3.1.2.bias"]);
    conv33_cv3_1_2->setStride(nvinfer1::DimsHW{1, 1});
    conv33_cv3_1_2->setPadding(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor33_1[] = {conv33_cv2_1_2->getOutput(0), conv33_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat33_1 = network->addConcatenation(inputTensor33_1, 2);

    // output2
    nvinfer1::IElementWiseLayer* conv33_cv2_2_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel,
                                                             3, 1, calculateP(3), "model.33.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv33_cv2_2_1 = convBnSiLU(network, weightMap, *conv33_cv2_2_0->getOutput(0),
                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.2.1");
    nvinfer1::IConvolutionLayer* conv33_cv2_2_2 =
            network->addConvolutionNd(*conv33_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.33.cv2.2.2.weight"], weightMap["model.33.cv2.2.2.bias"]);
    conv33_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv33_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv33_cv3_2_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.2.0");
    nvinfer1::IElementWiseLayer* conv33_cv3_2_1 = convBnSiLU(network, weightMap, *conv33_cv3_2_0->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.2.1");
    nvinfer1::IConvolutionLayer* conv33_cv3_2_2 =
            network->addConvolutionNd(*conv33_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.33.cv3.2.2.weight"], weightMap["model.33.cv3.2.2.bias"]);
    conv33_cv3_2_2->setStride(nvinfer1::DimsHW{1, 1});
    conv33_cv3_2_2->setPadding(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor33_2[] = {conv33_cv2_2_2->getOutput(0), conv33_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat33_2 = network->addConcatenation(inputTensor33_2, 2);

    // output3
    nvinfer1::IElementWiseLayer* conv33_cv2_3_0 = convBnSiLU(network, weightMap, *conv32->getOutput(0), base_in_channel,
                                                             3, 1, calculateP(3), "model.33.cv2.3.0");
    nvinfer1::IElementWiseLayer* conv33_cv2_3_1 = convBnSiLU(network, weightMap, *conv33_cv2_3_0->getOutput(0),
                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.3.1");
    nvinfer1::IConvolutionLayer* conv33_cv2_3_2 =
            network->addConvolutionNd(*conv33_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.33.cv2.3.2.weight"], weightMap["model.33.cv2.3.2.bias"]);
    conv33_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv33_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv33_cv3_3_0 = convBnSiLU(network, weightMap, *conv32->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.3.0");
    nvinfer1::IElementWiseLayer* conv33_cv3_3_1 = convBnSiLU(network, weightMap, *conv33_cv3_3_0->getOutput(0),
                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.3.1");
    nvinfer1::IConvolutionLayer* conv33_cv3_3_2 =
            network->addConvolutionNd(*conv33_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.33.cv3.3.2.weight"], weightMap["model.33.cv3.3.2.bias"]);
    conv33_cv3_3_2->setStride(nvinfer1::DimsHW{1, 1});
    conv33_cv3_3_2->setPadding(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor33_3[] = {conv33_cv2_3_2->getOutput(0), conv33_cv3_3_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat33_3 = network->addConcatenation(inputTensor33_3, 2);

    /*******************************************************************************************************
  *********************************************  YOLOV5U-P6 DETECT
  *******************************************
  *******************************************************************************************************/

    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // det0
    nvinfer1::IShuffleLayer* shuffle33_0 = network->addShuffle(*cat33_0->getOutput(0));
    shuffle33_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split33_0_0 = network->addSlice(
            *shuffle33_0->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split33_0_1 = network->addSlice(
            *shuffle33_0->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl33_0 =
            DFL(network, weightMap, *split33_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.33.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor33_dfl_0[] = {dfl33_0->getOutput(0), split33_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat33_dfl_0 = network->addConcatenation(inputTensor33_dfl_0, 2);

    // det1
    nvinfer1::IShuffleLayer* shuffle33_1 = network->addShuffle(*cat33_1->getOutput(0));
    shuffle33_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split33_1_0 = network->addSlice(
            *shuffle33_1->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split33_1_1 = network->addSlice(
            *shuffle33_1->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl33_1 =
            DFL(network, weightMap, *split33_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.33.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor33_dfl_1[] = {dfl33_1->getOutput(0), split33_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat33_dfl_1 = network->addConcatenation(inputTensor33_dfl_1, 2);

    // det2
    nvinfer1::IShuffleLayer* shuffle33_2 = network->addShuffle(*cat33_2->getOutput(0));
    shuffle33_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split33_2_0 = network->addSlice(
            *shuffle33_2->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split33_2_1 = network->addSlice(
            *shuffle33_2->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl33_2 =
            DFL(network, weightMap, *split33_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.33.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor33_dfl_2[] = {dfl33_2->getOutput(0), split33_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat33_dfl_2 = network->addConcatenation(inputTensor33_dfl_2, 2);

    // det3
    nvinfer1::IShuffleLayer* shuffle33_3 = network->addShuffle(*cat33_3->getOutput(0));
    shuffle33_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])});
    nvinfer1::ISliceLayer* split33_3_0 = network->addSlice(
            *shuffle33_3->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split33_3_1 = network->addSlice(
            *shuffle33_3->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl33_3 =
            DFL(network, weightMap, *split33_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1,
                1, 0, "model.33.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor33_dfl_3[] = {dfl33_3->getOutput(0), split33_3_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat33_dfl_3 = network->addConcatenation(inputTensor33_dfl_3, 2);

    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
            network, std::vector<nvinfer1::IConcatenationLayer*>{cat33_dfl_0, cat33_dfl_1, cat33_dfl_2, cat33_dfl_3},
            strides, stridesLength, kNumClass, false, false, false);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

/**
 * Builds the YOLOv8 oriented-bounding-box (OBB) network layer by layer and
 * returns it as a serialized TensorRT engine.
 *
 * The function loads the weights from `wts_path`, assembles backbone, head,
 * the three per-scale output branches (cv2 / cv3 / cv4) and the final plugin
 * layer created by addYoLoLayer(), then builds the engine with the supplied
 * builder/config. FP16 or INT8 is enabled when USE_FP16 / USE_INT8 is defined.
 *
 * @param builder       TensorRT builder used to create the network and build the engine.
 * @param config        Builder configuration; workspace size and precision flags are set here.
 * @param dt            Data type of the network input tensor.
 * @param wts_path      Path of the weight file handed to loadWeights().
 * @param gd            Depth multiplier forwarded to get_depth().
 * @param gw            Width multiplier forwarded to get_width(); also selects the head channel counts.
 * @param max_channels  Channel cap forwarded to get_width().
 * @return The result of IBuilder::buildSerializedNetwork(). The value is returned unchecked.
 *
 * The network definition is deleted and every weight buffer in the weight map
 * is released with free() before returning.
 */
nvinfer1::IHostMemory* buildEngineYolov8Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels) {
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);

    /*******************************************************************************************************
    ******************************************  YOLOV8 INPUT  **********************************************
    *******************************************************************************************************/
    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLOV8 BACKBONE  ********************************************
    *******************************************************************************************************/
    nvinfer1::IElementWiseLayer* conv0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0");
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1");
    nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels),
                                             get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3");
    nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels),
                                             get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    nvinfer1::IElementWiseLayer* conv5 =
            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5");
    nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels),
                                             get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* conv7 =
            convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7");
    nvinfer1::IElementWiseLayer* conv8 =
            C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8");
    nvinfer1::IElementWiseLayer* conv9 =
            SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), 5, "model.9");

    /*******************************************************************************************************
    *********************************************  YOLOV8 HEAD  ********************************************
    *******************************************************************************************************/
    // Nearest-neighbour upsampling: channels untouched, both spatial axes doubled.
    float scale[] = {1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0));
    assert(upsample10);
    upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample10->setScales(scale, 3);

    nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2);
    nvinfer1::IElementWiseLayer* conv12 =
            C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels),
                get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12");

    nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0));
    assert(upsample13);
    upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample13->setScales(scale, 3);

    nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2);
    nvinfer1::IElementWiseLayer* conv15 =
            C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels),
                get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15");
    nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0),
                                                     get_width(256, gw, max_channels), 3, 2, 1, "model.16");
    nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2);
    nvinfer1::IElementWiseLayer* conv18 =
            C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels),
                get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18");
    nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0),
                                                     get_width(512, gw, max_channels), 3, 2, 1, "model.19");
    nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2);
    nvinfer1::IElementWiseLayer* conv21 =
            C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels),
                get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21");

    /*******************************************************************************************************
    *********************************************  YOLOV8 OUTPUT  ******************************************
    *******************************************************************************************************/
    // Hidden channel counts of the cv2 (box) and cv3 (class) branches depend on the width multiplier.
    int base_in_channel = (gw == 1.25) ? 80 : 64;
    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kObbNumClass, 100)) : get_width(256, gw, max_channels);

    // output0
    nvinfer1::IElementWiseLayer* conv22_cv2_0_0 =
            convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0");

    nvinfer1::IElementWiseLayer* conv22_cv2_0_1 =
            convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1");

    nvinfer1::IConvolutionLayer* conv22_cv2_0_2 =
            network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]);
    conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});

    nvinfer1::IElementWiseLayer* conv22_cv3_0_0 =
            convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0");
    nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.22.cv3.0.1");

    nvinfer1::IConvolutionLayer* conv22_cv3_0_2 =
            network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]);
    // Use the Nd setters like every other 1x1 convolution in this function.
    conv22_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2);

    // output1
    nvinfer1::IElementWiseLayer* conv22_cv2_1_0 =
            convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0");
    nvinfer1::IElementWiseLayer* conv22_cv2_1_1 =
            convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1");
    nvinfer1::IConvolutionLayer* conv22_cv2_1_2 =
            network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]);
    conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv22_cv3_1_0 =
            convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0");
    nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.22.cv3.1.1");
    nvinfer1::IConvolutionLayer* conv22_cv3_1_2 =
            network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]);
    conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2);

    // output2
    nvinfer1::IElementWiseLayer* conv22_cv2_2_0 =
            convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0");
    nvinfer1::IElementWiseLayer* conv22_cv2_2_1 =
            convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1");
    // Same 1x1 / stride 1 / no padding convolution as output0 and output1, built through the
    // Nd API instead of the deprecated addConvolution(); the explicit values equal the defaults.
    nvinfer1::IConvolutionLayer* conv22_cv2_2_2 =
            network->addConvolutionNd(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]);
    conv22_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IElementWiseLayer* conv22_cv3_2_0 =
            convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0");
    nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0),
                                                             base_out_channel, 3, 1, 1, "model.22.cv3.2.1");
    nvinfer1::IConvolutionLayer* conv22_cv3_2_2 =
            network->addConvolutionNd(*conv22_cv3_2_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1},
                                      weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]);
    conv22_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv22_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2);

    /*******************************************************************************************************
    *********************************************  YOLOV8 DETECT  ******************************************
    *******************************************************************************************************/

    // One stride per detection scale, filled in by calculateStrides() from these layers and kInputH.
    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // Per scale: flatten the spatial axes, split the first 64 channels from the class
    // channels, and pass the 64-channel part through DFL().
    nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0));
    shuffle22_0->setReshapeDimensions(
            nvinfer1::Dims2{64 + kObbNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});

    nvinfer1::ISliceLayer* split22_0_0 = network->addSlice(
            *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split22_0_1 = network->addSlice(
            *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kObbNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl22_0 =
            DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.22.dfl.conv.weight");

    nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0));
    shuffle22_1->setReshapeDimensions(
            nvinfer1::Dims2{64 + kObbNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split22_1_0 = network->addSlice(
            *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split22_1_1 = network->addSlice(
            *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kObbNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl22_1 =
            DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.22.dfl.conv.weight");

    nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0));
    shuffle22_2->setReshapeDimensions(
            nvinfer1::Dims2{64 + kObbNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split22_2_0 = network->addSlice(
            *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0},
            nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::ISliceLayer* split22_2_1 = network->addSlice(
            *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0},
            nvinfer1::Dims2{kObbNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
    nvinfer1::IShuffleLayer* dfl22_2 =
            DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.22.dfl.conv.weight");

    // det0: DFL output + class channels + the cv4 ("obb") branch output.
    auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0",
                                            (kInputH / strides[0]) * (kInputW / strides[0]), gw, "obb");
    nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0),
                                                shuffle_conv15->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3);

    // det1
    auto shuffle_conv18 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1",
                                            (kInputH / strides[1]) * (kInputW / strides[1]), gw, "obb");
    nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0),
                                                shuffle_conv18->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3);

    // det2
    auto shuffle_conv21 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2",
                                            (kInputH / strides[2]) * (kInputW / strides[2]), gw, "obb");
    nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0),
                                                shuffle_conv21->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3);

    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2},
                         strides, stridesLength, kObbNumClass, false, false, true);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16 MiB

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    // The weight buffers are released with free() once the engine has been built.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}


================================================
FILE: yolov8/src/postprocess.cpp
================================================
#include "postprocess.h"
#include <algorithm>
#include <iostream>  // Include this header for printing
#include "utils.h"

// Converts a box given in network-input coordinates (kInputW x kInputH) into a
// pixel rectangle on `img`.
//
// The smaller of the two input/image ratios is used as the uniform scale; on the
// other axis the centred padding is subtracted before dividing by that scale.
// The left/top edges are clamped to zero and the width/height are clamped so the
// rectangle neither goes negative nor extends past the image.
//
// bbox holds {left, top, right, bottom}.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);

    // Exactly one axis carries padding; the other offset stays zero.
    float scale;
    float pad_x = 0.0f;
    float pad_y = 0.0f;
    if (r_h > r_w) {
        scale = r_w;
        pad_y = (kInputH - r_w * img.rows) / 2;
    } else {
        scale = r_h;
        pad_x = (kInputW - r_h * img.cols) / 2;
    }

    float left = (bbox[0] - pad_x) / scale;
    float right = (bbox[2] - pad_x) / scale;
    float top = (bbox[1] - pad_y) / scale;
    float bottom = (bbox[3] - pad_y) / scale;

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Same box conversion as get_rect(), and additionally rewrites the landmark
// array in place with the same transform.
//
// lmk is laid out as kNumberOfPoints triples; the first two values of each
// triple (x, y) are converted to image coordinates, the third is left untouched.
// bbox holds {left, top, right, bottom} in network-input coordinates.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);

    // Exactly one axis carries padding; the other offset stays zero.
    float scale;
    float pad_x = 0.0f;
    float pad_y = 0.0f;
    if (r_h > r_w) {
        scale = r_w;
        pad_y = (kInputH - r_w * img.rows) / 2;
    } else {
        scale = r_h;
        pad_x = (kInputW - r_h * img.cols) / 2;
    }

    float left = (bbox[0] - pad_x) / scale;
    float right = (bbox[2] - pad_x) / scale;
    float top = (bbox[1] - pad_y) / scale;
    float bottom = (bbox[3] - pad_y) / scale;

    for (int p = 0; p < kNumberOfPoints; ++p) {
        float* point = lmk + 3 * p;
        point[0] = (point[0] - pad_x) / scale;
        point[1] = (point[1] - pad_y) / scale;
        // point[2] is not modified
    }

    left = std::max(0.0f, left);
    top = std::max(0.0f, top);

    const int x = int(round(left));
    const int y = int(round(top));
    const int width = std::max(0, std::min(int(round(right - left)), img.cols - x));
    const int height = std::max(0, std::min(int(round(bottom - top)), img.rows - y));

    return cv::Rect(x, y, width, height);
}

// Intersection-over-union of two axis-aligned boxes given as
// {left, top, right, bottom}.
//
// Returns 0 when the boxes do not overlap, and also when the union area is not
// positive (e.g. two coincident zero-area boxes), so the result is never the
// NaN produced by 0/0.
static float iou(float lbox[4], float rbox[4]) {
    float interBox[] = {
            (std::max)(lbox[0], rbox[0]),  // left
            (std::min)(lbox[2], rbox[2]),  // right
            (std::max)(lbox[1], rbox[1]),  // top
            (std::min)(lbox[3], rbox[3]),  // bottom
    };

    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
        return 0.0f;

    float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
    float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS;
    // Degenerate boxes: avoid dividing by a zero (or negative) union area.
    if (unionBoxS <= 0.0f)
        return 0.0f;
    return interBoxS / unionBoxS;
}

// Ordering predicate for std::sort: higher `conf` first; when the two `conf`
// values compare equal, the detection with the smaller bbox[0] comes first.
static bool cmp(const Detection& a, const Detection& b) {
    return (a.conf != b.conf) ? (a.conf > b.conf) : (a.bbox[0] < b.bbox[0]);
}

void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4]))
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

// Runs nms() once per image of a batch.
//
// res_batch   : resized to batch_size; entry b receives the survivors of image b.
// output      : concatenated per-image buffers, output_size floats apart.
// conf_thresh, nms_thresh : forwarded unchanged to nms().
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = output + b * output_size;
        nms(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}

void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count) {
    Detection det;
    for (int i = 0; i < count; i++) {
        int basic_pos = 1 + i * bbox_element;
        int keep_flag = decode_ptr_host[basic_pos + 6];
        if (keep_flag == 1) {
            det.bbox[0] = decode_ptr_host[basic_pos + 0];
            det.bbox[1] = decode_ptr_host[basic_pos + 1];
            det.bbox[2] = decode_ptr_host[basic_pos + 2];
            det.bbox[3] = decode_ptr_host[basic_pos + 3];
            det.conf = decode_ptr_host[basic_pos + 4];
            det.class_id = decode_ptr_host[basic_pos + 5];
            res.push_back(det);
        }
    }
}

void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}

// Draws every detection's rectangle and integer class id onto its image.
//
// img_batch : images to draw on; res_batch[i] is used for img_batch[i].
// res_batch : detections whose bbox is passed through get_rect().
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

    for (size_t b = 0; b < img_batch.size(); ++b) {
        // cv::Mat copies share pixel data, so drawing lands in img_batch[b].
        cv::Mat canvas = img_batch[b];
        for (auto& det : res_batch[b]) {
            cv::Rect box = get_rect(canvas, det.bbox);
            cv::rectangle(canvas, box, box_color, 2);
            cv::putText(canvas, std::to_string((int)det.class_id), cv::Point(box.x, box.y - 1),
                        cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
        }
    }
}

// Draws box, class id, keypoints and skeleton lines for every detection.
//
// A keypoint is drawn when the third value of its triplet is > 0.5; a bone is
// drawn only when both of its endpoints pass that test.
//
// Side effect: get_rect_adapt_landmark() rescales each detection's keypoints
// in place, so res_batch is modified by this call.
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    // Pairs of keypoint indices joined by a line (indices run up to 16).
    const std::vector<std::pair<int, int>> skeleton_pairs = {
            {0, 1}, {0, 2},  {0, 5}, {0, 6},  {1, 2},   {1, 3},   {2, 4},   {5, 6},   {5, 7},  {5, 11},
            {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);
    const cv::Scalar joint_color(0, 0x27, 0xC1);

    for (size_t b = 0; b < img_batch.size(); ++b) {
        cv::Mat canvas = img_batch[b];
        for (auto& det : res_batch[b]) {
            cv::Rect box = get_rect_adapt_landmark(canvas, det.bbox, det.keypoints);
            cv::rectangle(canvas, box, box_color, 2);
            cv::putText(canvas, std::to_string((int)det.class_id), cv::Point(box.x, box.y - 1),
                        cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);

            const float* kps = det.keypoints;

            // Joints.
            for (int p = 0; p < kNumberOfPoints; ++p) {
                const float* pt = kps + p * 3;
                if (pt[2] > 0.5) {
                    cv::circle(canvas, cv::Point((int)pt[0], (int)pt[1]), 3, joint_color, -1);
                }
            }

            // Bones.
            for (const auto& bone : skeleton_pairs) {
                const float* from = kps + bone.first * 3;
                const float* to = kps + bone.second * 3;
                if (from[2] > 0.5 && to[2] > 0.5) {
                    cv::Point p1((int)from[0], (int)from[1]);
                    cv::Point p2((int)to[0], (int)to[1]);
                    cv::line(canvas, p1, p2, joint_color, 2);
                }
            }
        }
    }
}

// Crops the letterbox padding out of a network-sized mask and resizes the
// remaining region to the size of `img`.
//
// mask : mask in network-input resolution (kInputW x kInputH).
// img  : image whose size determines the crop and the output size.
// Returns the resized mask.
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    const float r_w = kInputW / (img.cols * 1.0);
    const float r_h = kInputH / (img.rows * 1.0);

    cv::Rect content;
    if (r_h > r_w) {
        // Image fills the full width; padding is above and below.
        content.width = kInputW;
        content.height = r_w * img.rows;
        content.x = 0;
        content.y = (kInputH - content.height) / 2;
    } else {
        // Image fills the full height; padding is left and right.
        content.width = r_h * img.cols;
        content.height = kInputH;
        content.x = (kInputW - content.width) / 2;
        content.y = 0;
    }

    cv::Mat scaled;
    cv::resize(mask(content), scaled, img.size());
    return scaled;
}

// Blends each detection's mask into the image, then draws its box and a
// filled label ("<name> <conf>").
//
// img        : image drawn on in place; pixels are accessed as cv::Vec3b.
// dets       : detections; dets[i] is paired with masks[i].
// masks      : masks read as float; values > 0.5 count as inside.
// labels_map : class id -> label text. Looked up with operator[], so an
//              unknown id inserts an empty label into the map.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < dets.size(); i++) {
        cv::Mat img_mask = scale_mask(masks[i], img);
        auto color = colors[(int)dets[i].class_id % colors.size()];
        auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

        cv::Rect r = get_rect(img, dets[i].bbox);
        // Walk row by row; every pixel is independent, so the result does not
        // depend on the traversal order.
        for (int y = r.y; y < r.y + r.height; y++) {
            for (int x = r.x; x < r.x + r.width; x++) {
                if (img_mask.at<float>(y, x) <= 0.5)
                    continue;
                cv::Vec3b& px = img.at<cv::Vec3b>(y, x);
                // 50/50 blend of the existing pixel and the class colour.
                px[0] = px[0] / 2 + bgr[0] / 2;
                px[1] = px[1] / 2 + bgr[1] / 2;
                px[2] = px[2] / 2 + bgr[2] / 2;
            }
        }

        cv::rectangle(img, r, bgr, 2);

        // Build the label once; it is used for both sizing and drawing.
        const std::string label = labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf);
        cv::Size textSize = cv::getTextSize(label, cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);

        // Filled background rectangle behind the label text.
        cv::Point topLeft(r.x, r.y - textSize.height);
        cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
        cv::rectangle(img, topLeft, bottomRight, bgr, -1);

        cv::putText(img, label, cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
    }
}

void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count) {
    Detection det;
    for (int i = 0; i < count; i++) {
        int basic_pos = 1 + i * bbox_element;
        int keep_flag = decode_ptr_host[basic_pos + 6];
        if (keep_flag == 1) {
            det.bbox[0] = decode_ptr_host[basic_pos + 0];
            det.bbox[1] = decode_ptr_host[basic_pos + 1];
            det.bbox[2] = decode_ptr_host[basic_pos + 2];
            det.bbox[3] = decode_ptr_host[basic_pos + 3];
            det.conf = decode_ptr_host[basic_pos + 4];
            det.class_id = decode_ptr_host[basic_pos + 5];
            det.angle = decode_ptr_host[basic_pos + 7];
            res.push_back(det);
        }
    }
}

void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}

// Computes the three covariance terms of an oriented box from its width
// (bbox[2]), height (bbox[3]) and angle.
//
// Returns (a, b, c) where, with A = w^2/12 and B = h^2/12:
//   a = A*cos^2 + B*sin^2,  b = A*sin^2 + B*cos^2,  c = (A - B)*cos*sin.
std::tuple<float, float, float> convariance_matrix(Detection res) {
    const float box_w = res.bbox[2];
    const float box_h = res.bbox[3];
    const float theta = res.angle;

    const float var_w = box_w * box_w / 12.0;
    const float var_h = box_h * box_h / 12.0;

    const float cos_t = std::cos(theta);
    const float sin_t = std::sin(theta);
    const float cos_sq = cos_t * cos_t;
    const float sin_sq = sin_t * sin_t;

    const float term_a = var_w * cos_sq + var_h * sin_sq;
    const float term_b = var_w * sin_sq + var_h * cos_sq;
    const float term_c = (var_w - var_h) * cos_t * sin_t;

    return std::make_tuple(term_a, term_b, term_c);
}

// Probabilistic IoU between two oriented boxes,
// https://arxiv.org/pdf/2106.06072v1.pdf.
//
// res1, res2 : detections; bbox[0], bbox[1] are read as the centre and the
//              covariance terms come from convariance_matrix().
// eps        : small constant added to denominators and logarithm arguments.
// Returns 1 - Hellinger distance, where the Bhattacharyya distance is clamped
// to [eps, 100] first.
static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) {
    // Unpack directly; no tuple is ever built from uninitialised values.
    float a1, b1, c1, a2, b2, c2;
    std::tie(a1, b1, c1) = convariance_matrix(res1);
    std::tie(a2, b2, c2) = convariance_matrix(res2);

    float x1 = res1.bbox[0], y1 = res1.bbox[1];
    float x2 = res2.bbox[0], y2 = res2.bbox[1];

    // Determinant of the summed covariance, shared by all three terms.
    // std::pow(float, int) yields double, so keep it in double.
    const double det = (a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2);

    float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) / (det + eps);
    float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / (det + eps);
    float t3 = std::log(
            det / (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) +
                   eps) +
            eps);

    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    bd = std::max(std::min(bd, 100.0f), eps);
    float hd = std::sqrt(1.0 - std::exp(-bd) + eps);

    return 1 - hd;
}

void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {

        if (output[1 + det_size * i + 4] <= conf_thresh)
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (probiou(item, dets[n]) >= nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

// Runs nms_obb() once per image of a batch.
//
// res_batch   : resized to batch_size; entry b receives the survivors of image b.
// output      : concatenated per-image buffers, output_size floats apart.
// conf_thresh, nms_thresh : forwarded unchanged to nms_obb().
void batch_nms_obb(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = output + b * output_size;
        nms_obb(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}

// Computes the four corner points of an oriented detection in image
// coordinates.
//
// img : image the box is mapped onto (via get_rect) and clamped against.
// box : detection; bbox is read as {cx, cy, w, h} and angle as radians.
// Returns four points, each clamped to the image bounds.
//
// Note: three diagnostic lines per box are written to std::cout.
static std::vector<cv::Point> get_corner(cv::Mat& img, const Detection& box) {
    float cos_value, sin_value;

    // Centre point and width/height
    float x1 = box.bbox[0];
    float y1 = box.bbox[1];
    float w = box.bbox[2];
    float h = box.bbox[3];
    float angle = box.angle * 180.0f / CV_PI;  // Convert radians to degrees

    // Print original angle
    std::cout << "Original angle: " << angle << std::endl;

    // Make the longer side the width, rotating the angle by 90 degrees to match
    if (h >= w) {
        std::swap(w, h);
        angle = fmod(angle + 90.0f, 180.0f);
    }

    // Bring the angle towards the 0..180 degree range
    if (angle < 0) {
        angle += 360.0f;
    }
    if (angle > 180.0f) {
        angle -= 180.0f;
    }

    // Print adjusted angle
    std::cout << "Adjusted angle: " << angle << std::endl;

    // Only used for the diagnostic line below
    float normal_angle = fmod(angle, 180.0f);
    if (normal_angle < 0) {
        normal_angle += 180.0f;
    }

    // Print normal angle value
    std::cout << "Normal angle: " << normal_angle << std::endl;

    cos_value = std::cos(angle * CV_PI / 180.0f);  // Convert to radians
    sin_value = std::sin(angle * CV_PI / 180.0f);

    // Axis-aligned extent of the unrotated box
    float l = x1 - w / 2;  // Left boundary
    float r = x1 + w / 2;  // Right boundary
    float t = y1 - h / 2;  // Top boundary
    float b = y1 + h / 2;  // Bottom boundary

    // Use get_rect function to scale the coordinates
    float bbox[4] = {l, t, r, b};
    cv::Rect rect = get_rect(img, bbox);

    // Divide in floating point: integer division would drop the half pixel of
    // an odd-sized rectangle and shift every corner.
    float x_ = (rect.x + rect.x + rect.width) / 2.0f;   // Center x
    float y_ = (rect.y + rect.y + rect.height) / 2.0f;  // Center y
    float width = rect.width;                           // Width
    float height = rect.height;                         // Height

    // Half-extent vectors along the rotated width and height axes
    std::vector<cv::Point> corner_points(4);
    float vec1x = width / 2 * cos_value;
    float vec1y = width / 2 * sin_value;
    float vec2x = -height / 2 * sin_value;
    float vec2y = height / 2 * cos_value;

    corner_points[0] = cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y)));
    corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y)));
    corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y)));
    corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y)));

    // Clamp every corner so it lies inside the image
    for (auto& point : corner_points) {
        point.x = std::max(0, std::min(point.x, img.cols - 1));
        point.y = std::max(0, std::min(point.y, img.rows - 1));
    }

    return corner_points;
}

// Draws every oriented detection as a closed polygon with a filled
// "<class id>:<conf>" label anchored at the polygon's first corner.
//
// img_batch : images drawn on in place; res_batch[i] is used for img_batch[i].
// res_batch : oriented detections, converted to corners by get_corner().
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t b = 0; b < img_batch.size(); ++b) {
        auto& canvas = img_batch[b];
        for (auto& det : res_batch[b]) {
            auto color = colors[(int)det.class_id % colors.size()];
            auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

            auto corners = get_corner(canvas, det);
            cv::polylines(canvas, std::vector<std::vector<cv::Point>>{corners}, true, bgr, 1);

            auto text = (std::to_string((int)(det.class_id)) + ":" + to_string_with_precision(det.conf));
            const cv::Size text_size = cv::getTextSize(text, 0, 0.3, 1, nullptr);
            const int text_w = text_size.width;
            const int text_h = text_size.height;

            // Put the label above the anchor when there is room, otherwise below it.
            const cv::Point anchor(corners[0].x, corners[0].y);
            const bool fits_above = anchor.y - text_h >= 3;

            cv::Point label_corner;
            label_corner.x = anchor.x + text_w;
            label_corner.y = fits_above ? anchor.y - text_h - 3 : anchor.y + text_h + 3;
            cv::rectangle(canvas, anchor, label_corner, bgr, -1, cv::LINE_AA);

            const int baseline_y = fits_above ? anchor.y - 2 : anchor.y + text_h + 2;
            cv::putText(canvas, text, cv::Point(anchor.x, baseline_y), 0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA);
        }
    }
}


================================================
FILE: yolov8/src/postprocess.cu
================================================
//
// Created by lindsay on 23-7-17.
//
#include "postprocess.h"
#include "types.h"

// Copies candidates that pass the confidence threshold from the raw
// prediction buffer into the compact output array (oriented-box variant).
// One thread handles one candidate.
//
// predict              : predict[0] is the candidate count, followed by
//                        packed Detection records.
// num_bboxes           : unused inside the kernel.
// confidence_threshold : candidates with a lower confidence are skipped.
// parray               : parray[0] is an atomic counter of accepted
//                        candidates; records of bbox_element floats follow,
//                        written as {cx, cy, w, h, conf, label, keep, angle}.
// max_objects          : maximum number of records written to parray.
static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                         int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    // Filter BEFORE reserving an output slot so that rejected candidates
    // neither consume one of the max_objects slots nor inflate parray[0].
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    //[center_x center_y w h conf class_id  mask[32] keypoints[51] angle]
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];
    float label = pitem[5];
    // 89 = 6 + 32 + 51 per the layout above; assumes that layout matches the
    // Detection struct -- TODO confirm if the keypoint count changes.
    float angle = pitem[89];

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = cx;
    *pout_item++ = cy;
    *pout_item++ = width;
    *pout_item++ = height;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
    *pout_item++ = angle;
}

// Copies candidates that pass the confidence threshold from the raw
// prediction buffer into the compact output array. One thread handles one
// candidate.
//
// predict              : predict[0] is the candidate count, followed by
//                        packed Detection records.
// num_bboxes           : unused inside the kernel.
// confidence_threshold : candidates with a lower confidence are skipped.
// parray               : parray[0] is an atomic counter of accepted
//                        candidates; records of bbox_element floats follow,
//                        written as {left, top, right, bottom, conf, label, keep}.
// max_objects          : maximum number of records written to parray.
static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                     int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    // Filter BEFORE reserving an output slot so that rejected candidates
    // neither consume one of the max_objects slots nor inflate parray[0].
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}

// Intersection-over-union of two axis-aligned boxes, each given as
// (left, top, right, bottom). Returns 0 when the intersection area is 0.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
                                float bright, float bbottom) {
    // Intersection rectangle.
    const float ileft = max(aleft, bleft);
    const float itop = max(atop, btop);
    const float iright = min(aright, bright);
    const float ibottom = min(abottom, bbottom);

    const float inter_area = max(iright - ileft, 0.0f) * max(ibottom - itop, 0.0f);
    if (inter_area == 0.0f)
        return 0.0f;

    const float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter_area / (area_a + area_b - inter_area);
}

// Non-maximum suppression over the decoded array; one thread per record.
//
// A record clears its own keep flag (element 6) when another record of the
// same class (element 5) has a confidence (element 4) that is higher -- or
// equal with a higher index -- and box_iou() with it exceeds `threshold`.
//
// bboxes      : bboxes[0] is the record count; records of bbox_element floats follow.
// max_objects : upper bound on the number of records examined.
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {
    const int self = (blockDim.x * blockIdx.x + threadIdx.x);
    const int total = min(static_cast<int>(bboxes[0]), max_objects);
    if (self >= total)
        return;

    float* mine = bboxes + 1 + self * bbox_element;
    for (int other = 0; other < total; ++other) {
        if (other == self)
            continue;
        float* theirs = bboxes + 1 + other * bbox_element;
        if (mine[5] != theirs[5])
            continue;
        // Only a record that ranks at least as high can suppress this one.
        if (!(theirs[4] >= mine[4]))
            continue;
        // Tie on confidence: the lower index does not suppress the higher one.
        if (theirs[4] == mine[4] && other < self)
            continue;

        const float overlap =
                box_iou(mine[0], mine[1], mine[2], mine[3], theirs[0], theirs[1], theirs[2], theirs[3]);
        if (overlap > threshold) {
            mine[6] = 0;
            return;
        }
    }
}

// Computes the three covariance terms of an oriented box.
//
// w, h, r : box width, height and rotation angle (passed to cosf/sinf).
// a, b, c : outputs. With A = w^2/12 and B = h^2/12:
//           a = A*cos^2 + B*sin^2, b = A*sin^2 + B*cos^2, c = (A - B)*sin*cos.
static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) {
    const float var_w = w * w / 12.0f;
    const float var_h = h * h / 12.0f;
    const float cs = cosf(r);
    const float sn = sinf(r);

    a = var_w * cs * cs + var_h * sn * sn;
    b = var_w * sn * sn + var_h * cs * cs;
    c = (var_w - var_h) * sn * cs;
}

// Probabilistic IoU between two oriented boxes,
// https://arxiv.org/pdf/2106.06072v1.pdf.
//
// (cx, cy, w, h, r) : centre, size and rotation of each box.
// eps               : small constant added to denominators and the log argument.
// Returns 1 - Hellinger distance, where the Bhattacharyya distance is clamped
// to [eps, 100] first.
static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2,
                                    float h2, float r2, float eps = 1e-7) {
    float a1, b1, c1;
    float a2, b2, c2;
    convariance_matrix(w1, h1, r1, a1, b1, c1);
    convariance_matrix(w2, h2, r2, a2, b2, c2);

    // Summed covariance terms and their determinant.
    const float sum_a = a1 + a2;
    const float sum_b = b1 + b2;
    const float sum_c = c1 + c2;
    const float det = sum_a * sum_b - powf(sum_c, 2);

    const float t1 = (sum_a * powf(cy1 - cy2, 2) + sum_b * powf(cx1 - cx2, 2)) / (det + eps);
    const float t2 = (sum_c * (cx2 - cx1) * (cy1 - cy2)) / (det + eps);

    const float root1 = sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f));
    const float root2 = sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f));
    const float t3 = logf(det / (4 * root1 * root2 + eps) + eps);

    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    bd = fmaxf(fminf(bd, 100.0f), eps);
    const float hd = sqrtf(1.0f - expf(-bd) + eps);
    return 1 - hd;
}

// Non-maximum suppression for oriented boxes; one thread per record.
//
// A record clears its own keep flag (element 6) when another record of the
// same class (element 5) has a confidence (element 4) that is higher -- or
// equal with a higher index -- and box_probiou() with it exceeds `threshold`.
// Element 7 of each record is passed as the rotation angle.
//
// bboxes      : bboxes[0] is the record count; records of bbox_element floats follow.
// max_objects : upper bound on the number of records examined.
static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) {
    const int self = (blockDim.x * blockIdx.x + threadIdx.x);
    const int total = min(static_cast<int>(bboxes[0]), max_objects);
    if (self >= total)
        return;

    float* mine = bboxes + 1 + self * bbox_element;
    for (int other = 0; other < total; ++other) {
        if (other == self)
            continue;
        float* theirs = bboxes + 1 + other * bbox_element;
        if (mine[5] != theirs[5])
            continue;
        // Only a record that ranks at least as high can suppress this one.
        if (!(theirs[4] >= mine[4]))
            continue;
        // Tie on confidence: the lower index does not suppress the higher one.
        if (theirs[4] == mine[4] && other < self)
            continue;

        const float overlap = box_probiou(mine[0], mine[1], mine[2], mine[3], mine[7], theirs[0], theirs[1],
                                          theirs[2], theirs[3], theirs[7]);
        if (overlap > threshold) {
            mine[6] = 0;
            return;
        }
    }
}

// Launches decode_kernel on `stream` with one thread per candidate.
//
// predict, confidence_threshold, parray, max_objects : forwarded to the kernel.
// num_bboxes : number of candidates used to size the grid; nothing is
//              launched when it is not positive.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream) {
    // A grid of zero blocks is an invalid launch configuration.
    if (num_bboxes <= 0)
        return;
    const int block = 256;
    const int grid = (num_bboxes + block - 1) / block;  // ceil(num_bboxes / block)
    decode_kernel<<<grid, block, 0, stream>>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects);
}

// Launches nms_kernel on `stream` with one thread per possible record.
//
// parray        : decoded array processed in place.
// nms_threshold : overlap threshold forwarded to the kernel.
// max_objects   : maximum number of records; nothing is launched when it is
//                 not positive.
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    // max_objects == 0 would give a block size of 0 (division by zero and an
    // invalid launch configuration).
    if (max_objects <= 0)
        return;
    const int block = max_objects < 256 ? max_objects : 256;
    const int grid = (max_objects + block - 1) / block;  // ceil(max_objects / block)
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}

// Launches decode_kernel_obb on `stream` with one thread per candidate.
//
// predict, confidence_threshold, parray, max_objects : forwarded to the kernel.
// num_bboxes : number of candidates used to size the grid; nothing is
//              launched when it is not positive.
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream) {
    // A grid of zero blocks is an invalid launch configuration.
    if (num_bboxes <= 0)
        return;
    const int block = 256;
    const int grid = (num_bboxes + block - 1) / block;  // ceil(num_bboxes / block)
    decode_kernel_obb<<<grid, block, 0, stream>>>((float*)predict, num_bboxes, confidence_threshold, parray,
                                                  max_objects);
}

// Launches nms_kernel_obb on `stream` with one thread per possible record.
//
// parray        : decoded array processed in place.
// nms_threshold : overlap threshold forwarded to the kernel.
// max_objects   : maximum number of records; nothing is launched when it is
//                 not positive.
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    // max_objects == 0 would give a block size of 0 (division by zero and an
    // invalid launch configuration).
    if (max_objects <= 0)
        return;
    const int block = max_objects < 256 ? max_objects : 256;
    const int grid = (max_objects + block - 1) / block;  // ceil(max_objects / block)
    nms_kernel_obb<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}


================================================
FILE: yolov8/src/preprocess.cu
================================================
#include "cuda_utils.h"
#include "preprocess.h"

// Host staging buffer for one source image; allocated with cudaMallocHost in
// cuda_preprocess_init() and released in cuda_preprocess_destroy().
static uint8_t* img_buffer_host = nullptr;
// Device copy of the source image; allocated with cudaMalloc in
// cuda_preprocess_init() and released in cuda_preprocess_destroy().
static uint8_t* img_buffer_device = nullptr;

// Affine-warps an interleaved 3-channel 8-bit image into a planar float
// image. Each thread produces one destination pixel using bilinear
// interpolation, swaps channels 0 and 2, and divides by 255.
//
// src            : source pixels, 3 bytes per pixel.
// src_line_size  : bytes between the starts of consecutive source rows.
// src_width/src_height : source size in pixels.
// dst            : destination buffer, written as three consecutive planes of
//                  dst_width * dst_height floats.
// const_value_st : fill value used where the sample falls outside the source.
// d2s            : 2x3 destination-to-source affine matrix (row-major in value[0..5]).
// edge           : number of destination pixels; threads at or beyond it exit.
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Destination pixel handled by this thread, and its source-space position.
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // The four neighbouring source pixels around (src_x, src_y).
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        // Bilinear weights: l* is the distance to the low neighbour, h* to the high one.
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Neighbours default to the fill colour and are replaced below only
        // when they lie inside the source image.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        // Weighted sum per channel.
        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

// Uploads one 3-channel 8-bit image and launches warpaffine_kernel to
// letterbox it into `dst` on `stream`.
//
// src        : source pixels; src_width * src_height * 3 bytes are copied, so
//              the rows are assumed to be tightly packed.
// dst        : device buffer receiving 3 * dst_width * dst_height floats.
// Requires cuda_preprocess_init() to have allocated the staging buffers.
// NOTE(review): the copy size is not checked against the size passed to
// cuda_preprocess_init().
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    // Stage the image in the host buffer, then copy it to the device.
    const int byte_count = src_width * src_height * 3;
    memcpy(img_buffer_host, src, byte_count);
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, byte_count, cudaMemcpyHostToDevice, stream));

    // Uniform scale that fits the source inside the destination, centred.
    const float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    AffineMatrix s2d, d2s;
    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel needs the inverse mapping (destination -> source).
    cv::Mat forward(2, 3, CV_32F, s2d.value);
    cv::Mat inverse(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(forward, inverse);
    memcpy(d2s.value, inverse.ptr<float>(0), sizeof(d2s.value));

    // One thread per destination pixel.
    const int pixel_count = dst_height * dst_width;
    const int threads_per_block = 256;
    const int block_count = ceil(pixel_count / (float)threads_per_block);
    warpaffine_kernel<<<block_count, threads_per_block, 0, stream>>>(img_buffer_device, src_width * 3, src_width,
                                                                     src_height, dst, dst_width, dst_height, 128, d2s,
                                                                     pixel_count);
}

// Preprocesses every image of a batch into consecutive slots of `dst`.
//
// img_batch : source images.
// dst       : device buffer; image i is written at float offset
//             i * 3 * dst_width * dst_height.
// The stream is synchronised after each image.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int floats_per_image = dst_width * dst_height * 3;
    for (size_t i = 0; i < img_batch.size(); i++) {
        cv::Mat& image = img_batch[i];
        float* slot = &dst[floats_per_image * i];
        cuda_preprocess(image.ptr(), image.cols, image.rows, slot, dst_width, dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

// Allocates the host and device staging buffers used by cuda_preprocess().
//
// max_image_size : largest image size to support; max_image_size * 3 bytes
//                  are allocated for each buffer.
void cuda_preprocess_init(int max_image_size) {
    const int buffer_bytes = max_image_size * 3;
    // Host staging buffer (cudaMallocHost).
    CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, buffer_bytes));
    // Device buffer the kernel reads from.
    CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, buffer_bytes));
}

// Releases the staging buffers allocated by cuda_preprocess_init().
// The pointers are reset afterwards, so calling this twice (or before init)
// frees nothing and leaves no dangling pointer behind.
void cuda_preprocess_destroy() {
    if (img_buffer_device != nullptr) {
        CUDA_CHECK(cudaFree(img_buffer_device));
        img_buffer_device = nullptr;
    }
    if (img_buffer_host != nullptr) {
        CUDA_CHECK(cudaFreeHost(img_buffer_host));
        img_buffer_host = nullptr;
    }
}


================================================
FILE: yolov8/yolov8_5u_det.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger instance passed to the TensorRT builder and runtime created below.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in one image's raw output: kMaxNumOutputBbox Detection
// records plus one extra float (presumably the leading detection count).
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                      float& gw, int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    if (is_p == 6) {
        serialized_engine =
                buildEngineYolov8_5uDetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else {
        serialized_engine = buildEngineYolov8_5uDet(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    }

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Allocates the buffers needed for one inference batch.
//   input_buffer_device / output_buffer_device : always allocated on the GPU.
//   cuda_post_process == "c" : allocates *output_buffer_host (kBatchSize * kOutputSize floats).
//   cuda_post_process == "g" : allocates *decode_ptr_host and *decode_ptr_device
//                              (1 + kMaxNumOutputBbox * bbox_element floats each).
// Any other mode leaves the host/decode pointers untouched.
// Terminates the process with a non-zero status if GPU post-processing is requested
// with kBatchSize > 1, which is not supported.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and
    // output tensors. Note that indices are guaranteed to be less than
    // IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Only read by the asserts above; keep release builds free of unused-variable warnings.
    (void)inputIndex;
    (void)outputIndex;
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            // This is an error path: exit with a failure status (it used to report success with 0).
            exit(1);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}

// Runs one batch through the engine on `stream` and waits for it to finish.
//   cuda_post_process == "c" : copies the raw engine output (buffers[1]) into `output`.
//   cuda_post_process == "g" : runs cuda_decode + cuda_nms on the GPU and copies the
//                              decoded boxes into `decode_ptr_host`.
// Any other mode only enqueues the engine and synchronizes.
// The printed time is taken after the stream has been synchronized, so it covers the
// queued GPU work and copies rather than just the time needed to enqueue them.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // steady_clock is monotonic, so the measured interval is not affected by wall-clock adjustments.
    const auto start = std::chrono::steady_clock::now();
    if (!context.enqueue(batchsize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed" << std::endl;
    }

    const char* timing_label = nullptr;
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        timing_label = "inference time: ";
    } else if (cuda_post_process == "g") {
        const size_t decode_bytes = sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element);
        CUDA_CHECK(cudaMemsetAsync(decode_ptr_device, 0, decode_bytes, stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox,
                 stream);  // cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, decode_bytes, cudaMemcpyDeviceToHost, stream));
        timing_label = "inference and gpu postprocess time: ";
    }

    // Everything above is asynchronous; wait for completion before reading the clock.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (timing_label != nullptr) {
        const auto end = std::chrono::steady_clock::now();
        std::cout << timing_label << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms"
                  << std::endl;
    }
}

// Parses the command line.
//   -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] : fills wts, engine, sub_type, gd, gw,
//                                                    max_channels and sets is_p = 6 for "?6" types.
//   -d [.engine] [image dir] [c/g]                 : fills engine, img_dir, cuda_post_process.
// Returns false when the arguments do not match either form, the model type is unknown,
// or the post-process mode is neither "c" nor "g".
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);
    if (mode == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // Assign to the output parameter. A local variable of the same name previously
        // shadowed it, so the caller never received the model type.
        sub_type = std::string(argv[4]);

        // operator[] on an empty string yields '\0', which falls through to the reject branch.
        if (sub_type[0] == 'n') {
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
        } else if (sub_type[0] == 's') {
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
        } else if (sub_type[0] == 'm') {
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
        } else if (sub_type[0] == 'x') {
            gd = 1.33;
            gw = 1.25;
            max_channels = 640;
        } else {
            return false;
        }
        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        }
    } else if (mode == "-d" && argc == 5) {
        // Reject unknown post-process modes here: downstream code only allocates and
        // fills result buffers for "c" (CPU) and "g" (GPU).
        const std::string post_process(argv[4]);
        if (post_process != "c" && post_process != "g") {
            return false;
        }
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = post_process;
    } else {
        return false;
    }
    return true;
}

int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    int model_bboxes;
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8_5u_det -s [.wts] [.engine] "
                     "[n/s/m/l/x//n6/s6/m6/l6/x6]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov8_5u_det -d [.engine] ../samples  [c/g]// deserialize "
                     "plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    // std::cout << "\nOutput:\n\n";
    // for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    // std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov8/yolov8_5u_det_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Confidence threshold passed to non_max_suppression as conf_thres.
CONF_THRESH = 0.5
# IoU threshold passed to non_max_suppression as nms_thres.
IOU_THRESHOLD = 0.4
# Float counts that post_process adds together to obtain the width of one
# detection row when it reshapes the flat engine output.
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into
                 consecutive lists of at most batch_size entries.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory to walk
    return:
        a list of lists of file paths; the last list may be shorter
    """
    all_paths = (
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(img_dir)
        for file_name in file_names
    )

    batches = []
    current = []
    for path in all_paths:
        # Close the running batch once it is full, before adding the next path.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag above
                 its top-left corner, directly onto img.
    param:
        x:      a box like [x1,y1,x2,y2]
        img:    an opencv image object, modified in place
        color:  color to draw the rectangle, such as (0,255,0); random when omitted
        label:  str, text to draw; nothing is written when empty or None
        line_thickness: int, derived from the image size when omitted
    return:
        no return

    """
    # line/font thickness
    thickness = line_thickness
    if not thickness:
        thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # Corner of the filled tag, which sits above the box's top-left corner.
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    text_origin = (top_left[0], top_left[1] - 2)
    cv2.putText(
        img,
        label,
        text_origin,
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov8TRT(object):
    """
    description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and allocate one
                     page-locked host buffer and one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # The host output buffer was sized for the whole batch above
        # (volume * max_batch_size), so divide to get one image's slice length.
        self.det_output_length = host_outputs[0].shape[0] // self.batch_size

    def infer(self, raw_image_generator):
        """
        description: Preprocess the given images, run the engine once and draw the
                     detections on the original images.
        param:
            raw_image_generator: iterable of BGR images, at most batch_size of them
        return:
            batch_image_raw: list of the original images with boxes drawn on them
            elapsed:         seconds spent on host->device copy, inference and
                             device->host copy
        """
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        # Zero-filled so the unused slots of a partial batch hold defined values.
        batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove the context from the top of the context stack even when
            # preprocessing or inference raised, so the stack stays balanced.
            self.ctx.pop()

        output = host_outputs[0]
        # Do postprocess, only for the images that were actually supplied: the last
        # batch of a directory can hold fewer than batch_size images.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context that __init__ created with make_context().
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read the images of a batch one by one
        param:
            image_path_batch: iterable of image paths
        return:
            a generator yielding what cv2.imread returns for each path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup: batch_size black images of the network input size
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: a BGR image as a numpy array of shape (h, w, c)
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        raise:
            ValueError when raw_bgr_image is None (e.g. the file could not be decoded)
        """
        if raw_bgr_image is None:
            raise ValueError("preprocess_image got None; the image file could not be read")
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map boxes from the padded network-input frame back to the original image.
                        Despite the name, the four columns are only shifted by the padding and
                        divided by the resize ratio, i.e. x is handled as [x1, y1, x2, y2].
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [x1, y1, x2, y2] in network-input pixels
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2] in original-image pixels
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded at top and bottom: remove the vertical offset.
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Image was padded at left and right: remove the horizontal offset.
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array: the number of boxes followed by fixed-width
                        detection rows whose first six values are box(4), conf, cls_id
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, one row each; columns 0-3 are the box, column 4 the
                        confidence and column 5 the class id (further columns are kept as is)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: rows kept after nms, boxes converted to original-image (x1, y1, x2, y2)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Transform bbox from the network-input frame to the original image
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            # The class id lives in column 5. Rows are wider than six values here, so the
            # last column is not the label and must not be used for the comparison.
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    description: Thread that runs one batch of image files through the wrapper and
                 writes every annotated image into the 'output' directory.
    """

    def __init__(self, yolov8_wrapper, image_path_batch):
        super(inferThread, self).__init__()
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov8_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        annotated_images, elapsed = wrapper.infer(raw_images)
        for index, source_path in enumerate(self.image_path_batch):
            # Keep only the file name; every result goes into the same folder.
            target_path = os.path.join('output', os.path.basename(source_path))
            # Save image
            cv2.imwrite(target_path, annotated_images[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, elapsed * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one warm-up inference using the images produced by
                 the wrapper's get_raw_image_zeros() and prints the elapsed time.
    """

    def __init__(self, yolov8_wrapper):
        super(warmUpThread, self).__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        warmup_images = wrapper.get_raw_image_zeros()
        batch_image_raw, elapsed = wrapper.infer(warmup_images)
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed * 1000))


if __name__ == "__main__":
    # Engine file and plugin library; each default can be overridden by the
    # first / second command line argument.
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "yolov5xu.engine"
    PLUGIN_LIBRARY = sys.argv[2] if len(sys.argv) > 2 else "./build/libmyplugins.so"

    # load custom plugin
    ctypes.CDLL(PLUGIN_LIBRARY)

    # coco labels, indexed by class id when the detections are drawn
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
        "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
        "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
        "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
        "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
        "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]

    # Start from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each in its own thread and joined before the next starts.
        for _ in range(10):
            warm_up = warmUpThread(yolov8_wrapper)
            warm_up.start()
            warm_up.join()

        # One inference thread per batch, run strictly one after another.
        for path_batch in image_path_batches:
            worker = inferThread(yolov8_wrapper, path_batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()


================================================
FILE: yolov8/yolov8_cls.cpp
================================================
#include "calibrator.h"
#include "config.h"
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "utils.h"

#include <chrono>
#include <cmath>
#include <iostream>
#include <numeric>
#include <opencv2/opencv.hpp>

using namespace nvinfer1;

// File-local logger instance passed to createInferBuilder in this file.
static Logger gLogger;
// Number of floats in one image's output; equal to kClsNumClass.
const static int kOutputSize = kClsNumClass;

// Converts each image of the batch into a planar float tensor written to `output`.
// Per image: centre-crop to a square, resize to dst_width x dst_height, BGR -> RGB,
// scale to [0, 1], then store the three channels one after another (CHW).
// `output` must hold imgs.size() * 3 * dst_height * dst_width floats.
void batch_preprocess(std::vector<cv::Mat>& imgs, float* output, int dst_width = 224, int dst_height = 224) {
    const int plane_size = dst_height * dst_width;

    for (size_t b = 0; b < imgs.size(); b++) {
        const cv::Mat& source = imgs[b];
        const int side = std::min(source.rows, source.cols);
        const cv::Rect centre_square((source.cols - side) / 2, (source.rows - side) / 2, side, side);

        cv::Mat img = source(centre_square);
        cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR);
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
        img.convertTo(img, CV_32F, 1 / 255.0);

        std::vector<cv::Mat> channels(3);
        cv::split(img, channels);

        // CHW format: each channel plane is written row by row.
        float* image_out = output + b * 3 * plane_size;
        for (int c = 0; c < 3; ++c) {
            float* plane_out = image_out + c * plane_size;
            for (int row = 0; row < dst_height; ++row) {
                for (int col = 0; col < dst_width; ++col) {
                    plane_out[row * dst_width + col] = channels[c].at<float>(row, col);
                }
            }
        }
    }
}

// Returns the softmax of the first n values of `prob`.
// The largest logit is subtracted before exponentiation; this leaves the result
// mathematically unchanged but keeps expf from overflowing to inf (and the result from
// becoming NaN) when logits are large.
// Returns an empty vector when prob is null or n <= 0.
std::vector<float> softmax(float* prob, int n) {
    std::vector<float> res;
    if (prob == nullptr || n <= 0) {
        return res;
    }
    res.reserve(n);

    float max_logit = prob[0];
    for (int i = 1; i < n; i++) {
        if (prob[i] > max_logit) {
            max_logit = prob[i];
        }
    }

    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        const float t = expf(prob[i] - max_logit);
        res.push_back(t);
        sum += t;
    }
    // sum >= 1 because the largest logit contributes exp(0), so the division is safe.
    for (float& value : res) {
        value /= sum;
    }
    return res;
}

// Returns the indices of the k largest values of `vec`, largest first.
// Fewer than k indices are returned when vec is shorter than k; none when k <= 0.
std::vector<int> topk(const std::vector<float>& vec, int k) {
    // Sort a list of positions by the value they point at, in descending order.
    std::vector<size_t> order(vec.size());
    std::iota(order.begin(), order.end(), 0);
    auto by_value_desc = [&vec](size_t lhs, size_t rhs) { return vec[lhs] > vec[rhs]; };
    std::sort(order.begin(), order.end(), by_value_desc);

    const int limit = std::min<int>(vec.size(), k);

    std::vector<int> best;
    for (int rank = 0; rank < limit; ++rank) {
        best.push_back(order[rank]);
    }
    return best;
}

// Reads `file_name` line by line and returns the lines in file order.
// A file that cannot be opened is reported on stderr and trips an assert.
std::vector<std::string> read_classes(std::string file_name) {
    std::ifstream label_file(file_name, std::ios::in);
    if (!label_file.is_open()) {
        std::cerr << file_name << " is not found, pls refer to README and download it." << std::endl;
        assert(0);
    }

    std::vector<std::string> labels;
    for (std::string line; std::getline(label_file, line);) {
        labels.push_back(line);
    }
    label_file.close();
    return labels;
}

// Parses the command line.
//   -s [.wts] [.engine] [n/s/m/l/x] : fills wts, engine and the gd / gw multipliers
//                                     chosen by the first letter of the model type.
//   -d [.engine] [image dir]        : fills engine and img_dir.
// Returns false for any other argument layout or an unknown model type.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw,
                std::string& img_dir) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);

    if (mode == "-s" && argc == 5) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string net(argv[4]);
        // net[0] is '\0' for an empty string, which lands in the default branch.
        switch (net[0]) {
            case 'n':
                gd = 0.33;
                gw = 0.25;
                break;
            case 's':
                gd = 0.33;
                gw = 0.50;
                break;
            case 'm':
                gd = 0.67;
                gw = 0.75;
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                break;
            case 'x':
                gd = 1.0;
                gw = 1.25;
                break;
            default:
                return false;
        }
        return true;
    }

    if (mode == "-d" && argc == 4) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        return true;
    }

    return false;
}

// Allocates the device and host buffers for one classification batch:
//   *gpu_input_buffer / *cpu_input_buffer : kBatchSize * 3 * kClsInputH * kClsInputW floats
//   *gpu_output_buffer / *output_buffer_host : kBatchSize * kOutputSize floats
// The engine is expected to expose exactly two bindings, input at index 0 and output at index 1.
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer,
                     float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);
    // The binding indices are looked up by tensor name and checked against the
    // order the caller uses for its buffer array.
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);

    // Device side
    CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float)));

    // Host side, same element counts as the device buffers
    *cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW];
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}

// Run one batch through the classifier.
//
// Copies `input` (host) to buffers[0], enqueues the engine on `stream`, copies buffers[1] back
// into `output` (host) and blocks until the stream has drained.
//   buffers   - device bindings, input at index 0 and output at index 1
//   batchSize - number of images the copies and the enqueue call are sized for
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output,
           int batchSize) {
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    // enqueue() reports failure through its return value; surface it instead of dropping it.
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed" << std::endl;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                               stream));
    // Errors from the asynchronous work above are reported here, so the result must be checked.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Build the YOLOv8 classification engine from a .wts file and write the serialized plan to disk.
//
//   max_batchsize - kept for interface compatibility; not used by this function
//   gd, gw        - depth / width multiples forwarded to buildEngineYolov8Cls
//   wts_name      - path of the weights file
//   engine_name   - path of the plan file to create
//
// Failures are reported on stderr. Debug builds still abort through assert(); builds with
// NDEBUG release the builder objects and return instead of dereferencing a null engine.
void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name,
                      std::string& engine_name) {
    (void)max_batchsize;

    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    IHostMemory* serialized_engine = buildEngineYolov8Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw);
    if (!serialized_engine) {
        std::cerr << "Failed to build serialized engine" << std::endl;
        assert(serialized_engine);
        delete config;
        delete builder;
        return;
    }

    // Save engine to file
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "Could not open plan output file" << std::endl;
        assert(false);
    } else {
        p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
        if (!p) {
            std::cerr << "Failed to write plan output file" << std::endl;
        }
    }

    // Close everything down
    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Entry point of the classification tool.
//
//   -s <wts> <engine> <n|s|m|l|x>  builds an engine and writes it to disk
//   -d <engine> <image dir>        loads an engine and classifies every image in the directory,
//                                  printing the top-3 classes per image
// Returns 0 on success and -1 on bad arguments or when the image directory cannot be read.
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);

    std::string wts_name = "";
    std::string engine_name = "";
    float gd = 0.0f, gw = 0.0f;
    std::string img_dir;

    if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) {
        std::cerr << "arguments not right!" << std::endl;
        // Only the single-letter model types are accepted by parse_args.
        std::cerr << "./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov8_cls -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(kBatchSize, gd, gw, wts_name, engine_name);
        return 0;
    }

    // Read images from directory. This is done before any engine, stream or buffer is created so
    // that the early return below does not leave them unreleased.
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* cpu_input_buffer = nullptr;
    float* output_buffer_host = nullptr;
    prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host);

    // Read imagenet labels
    auto classes = read_classes("imagenet_classes.txt");

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }

        // Preprocess
        batch_preprocess(img_batch, cpu_input_buffer);

        // Run inference (always sized for a full batch; only the first img_name_batch.size()
        // output rows are read below)
        auto start = std::chrono::system_clock::now();
        infer(*context, stream, (void**)device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize);
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;

        // Postprocess and get top-k result
        for (size_t b = 0; b < img_name_batch.size(); b++) {
            float* p = &output_buffer_host[b * kOutputSize];
            auto res = softmax(p, kOutputSize);
            auto topk_idx = topk(res, 3);
            std::cout << img_name_batch[b] << std::endl;
            for (auto idx : topk_idx) {
                std::cout << "  " << classes[idx] << " " << res[idx] << std::endl;
            }
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    delete[] cpu_input_buffer;
    delete[] output_buffer_host;
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
    return 0;
}


================================================
FILE: yolov8/yolov8_cls_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import os
import shutil
import sys
import threading
import time
import cv2
import numpy as np
import torch
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into batches.
    param:
        batch_size: int, maximum number of paths per batch
        img_dir:    str, directory to walk
    return:
        list of lists of paths; the last batch may hold fewer than batch_size paths
    """
    # Lazily enumerate every file below img_dir in os.walk order.
    all_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )
    batches = []
    current = []
    for path in all_paths:
        # Close the running batch once it is full, before adding the next path.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    # Keep the trailing, possibly partial, batch.
    if current:
        batches.append(current)
    return batches


# Class names for the classifier output, one label per line of imagenet_classes.txt
# (read from the current working directory at import time).
with open("imagenet_classes.txt") as f:
    classes = [line.strip() for line in f]


class YoLov8TRT(object):
    """
    description: A YOLOv8 classification class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Deserialize the engine and allocate one host/device buffer pair per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []
        # Kept as attributes; not applied by preprocess_cls_image.
        self.mean = (0.485, 0.456, 0.406)
        self.std = (0.229, 0.224, 0.225)

        for binding in engine:
            print('binding:', binding, engine.get_binding_shape(binding))
            # Per-sample volume times the engine's maximum batch size.
            size = trt.volume(engine.get_binding_shape(
                binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dimensions of the input binding are taken as height and width.
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Preprocess the images, run the engine once and annotate each image
                     with its top-1 class name.
        param:
            raw_image_generator: iterable yielding at most batch_size BGR images
        return:
            batch_image_raw: list of the input images with the class name drawn on them
            use_time:        float, seconds spent on upload, execution and download
        """
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess. Slots not filled by a partial batch stay zero.
            batch_image_raw = []
            batch_input_image = np.zeros(
                shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32)
            for i, image_raw in enumerate(raw_image_generator):
                batch_image_raw.append(image_raw)
                # Resize to the engine's own input size rather than a fixed 224x224.
                input_image = self.preprocess_cls_image(
                    image_raw, self.input_w, self.input_h)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size,
                                  bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in finally so a failure above does not leave the context pushed.
            self.ctx.pop()
        output = host_outputs[0]
        # Do postprocess once for the whole batch, then annotate only the images that were
        # actually supplied (the last batch may be smaller than batch_size).
        classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls(
            output)
        for i, image in enumerate(batch_image_raw):
            cv2.putText(image, str(classes_ls[i]), (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
            print(classes_ls[i], predicted_conf_ls[i])
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context that was made current in __init__.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_cls_image(self, raw_bgr_image, dst_width=224, dst_height=224):
        """
            description: Crop the center square frame of a BGR image,
                         resize it to target size, convert it to RGB,
                         scale to [0,1] and transform to CHW layout.
                         No mean/std normalization is applied.
            param:
                raw_bgr_image: numpy array, raw BGR image (H, W, C)
                dst_width: int, target image width
                dst_height: int, target image height
            return:
                batch_data: float32 numpy array of shape
                            (1, 1, 3, dst_height, dst_width)
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        # Crop the center square frame
        m = min(h, w)
        top = (h - m) // 2
        left = (w - m) // 2
        image = raw_bgr_image[top:top + m, left:left + m]

        # Resize the square crop to the target size
        image = cv2.resize(image, (dst_width, dst_height), interpolation=cv2.INTER_LINEAR)

        # Convert BGR to RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Normalize to [0,1]
        image = image.astype(np.float32) / 255.0

        # HWC to CHW format
        image = image.transpose(2, 0, 1)

        # CHW to NCHW format (add batch dimension)
        image = np.expand_dims(image, axis=0)

        # Convert the image to row-major order, also known as "C order"
        image = np.ascontiguousarray(image)

        # A second leading axis is added here; the result is 5-D.
        batch_data = np.expand_dims(image, axis=0)

        return batch_data

    def postprocess_cls(self, output_data):
        """
        description: Turn the raw engine output into the top-1 prediction of every batch row.
        param:
            output_data: flat numpy array holding batch_size rows of class scores
        return:
            classes_ls:        list of class names (looked up in the module-level classes list)
            predicted_conf_ls: list of softmax confidences of the top-1 class
            category_id_ls:    list of top-1 class indices
        """
        classes_ls = []
        predicted_conf_ls = []
        category_id_ls = []
        output_data = output_data.reshape(self.batch_size, -1)
        output_data = torch.Tensor(output_data)
        p = torch.nn.functional.softmax(output_data, dim=1)
        # Only the best entry is used below; cap k so models with fewer than 3 classes work.
        score, index = torch.topk(p, min(3, p.shape[1]))
        for ind in range(index.shape[0]):
            input_category_id = index[ind][0].item()
            category_id_ls.append(input_category_id)
            predicted_confidence = score[ind][0].item()
            predicted_conf_ls.append(predicted_confidence)
            classes_ls.append(classes[input_category_id])
        return classes_ls, predicted_conf_ls, category_id_ls


class inferThread(threading.Thread):
    """
    description: Thread that runs one batch of image paths through the wrapper
                 and writes the returned images into the output/ directory.
    """

    def __init__(self, yolov8_wrapper, image_path_batch):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov8_wrapper
        frames = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(frames)
        for idx in range(len(self.image_path_batch)):
            # Keep only the file name so every result lands directly in output/.
            filename = os.path.basename(self.image_path_batch[idx])
            save_name = os.path.join('output', filename)
            # Save image
            cv2.imwrite(save_name, batch_image_raw[idx])
        elapsed_ms = use_time * 1000
        print('input->{}, time->{:.2f}ms, saving into output/'.format(
            self.image_path_batch, elapsed_ms))


class warmUpThread(threading.Thread):
    """
    description: Thread that feeds one batch of all-zero images through the wrapper
                 and prints the shape of the first returned image and the elapsed time.
    """

    def __init__(self, yolov8_wrapper):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        blank_frames = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(blank_frames)
        first_shape = batch_image_raw[0].shape
        print(
            'warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # load custom plugin and engine
    # The engine path may be overridden by the first command line argument.
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "./yolov8x-cls-fp32.engine"

    # Start every run from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        image_dir = "samples/"
        image_path_batches = get_img_path_batches(
            yolov8_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread and run to completion before the next.
        for _ in range(10):
            warm_up = warmUpThread(yolov8_wrapper)
            warm_up.start()
            warm_up.join()

        # One thread per batch, again strictly one after another.
        for batch in image_path_batches:
            worker = inferThread(yolov8_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()


================================================
FILE: yolov8/yolov8_det.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger instance handed to createInferBuilder / createInferRuntime below.
Logger gLogger;
using namespace nvinfer1;
// Number of floats reserved per image in the engine output: space for kMaxNumOutputBbox
// Detection structs plus one extra float.
// NOTE(review): the extra float is presumably the detection count written ahead of the boxes;
// confirm against the decode plugin.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

// Build a YOLOv8 detection engine from a .wts file and write the serialized plan to disk.
//
//   wts_name     - path of the weights file
//   engine_name  - path of the plan file to create
//   is_p         - 6 selects the P6 builder, 2 the P2 builder, anything else the default one
//   sub_type     - kept for interface compatibility; not used by this function
//   gd, gw       - depth / width multiples forwarded to the builder
//   max_channels - channel cap forwarded to the builder
//
// Failures are reported on the console. Debug builds still abort through assert(); builds with
// NDEBUG release the builder objects and return instead of dereferencing a null engine.
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                      float& gw, int& max_channels) {
    (void)sub_type;

    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    if (is_p == 6) {
        serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else if (is_p == 2) {
        serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else {
        serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    }

    if (!serialized_engine) {
        std::cerr << "failed to build serialized engine" << std::endl;
        assert(serialized_engine);
        delete config;
        delete builder;
        return;
    }

    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    } else {
        p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
        if (!p) {
            std::cerr << "failed to write plan output file" << std::endl;
        }
    }

    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Allocate the buffers needed for detection inference.
//
// Always allocates the device input/output buffers. Depending on cuda_post_process:
//   "c" - allocates *output_buffer_host (new[]) for CPU post-processing
//   "g" - allocates *decode_ptr_host (new[]) and *decode_ptr_device (cudaMalloc) for GPU
//         decode/NMS; only supported for kBatchSize == 1
// Any other value allocates nothing beyond the device input/output buffers.
// Terminates the process with a non-zero status when GPU post-processing is requested for a
// batch size larger than one.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    // Reject the unsupported configuration before allocating anything, and report it as a failure.
    if (cuda_post_process == "g" && kBatchSize > 1) {
        std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
        exit(1);
    }

    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        // Host and device buffers for the decoded boxes: one leading float plus
        // kMaxNumOutputBbox entries of bbox_element floats each.
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}

// Run the engine on the data already present in buffers[0] and fetch the results.
//
//   cuda_post_process == "c": copies the raw engine output (buffers[1]) into `output` on the host
//   cuda_post_process == "g": decodes and applies NMS on the GPU, then copies the decoded boxes
//                             from decode_ptr_device into decode_ptr_host
// Any other mode only runs the engine. The function returns after the stream has been
// synchronized; the printed time covers the completed GPU work, not just the launch.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    const bool cpu_post = (cuda_post_process == "c");
    const bool gpu_post = (cuda_post_process == "g");

    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::steady_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);
    if (cpu_post) {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
    } else if (gpu_post) {
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
    }

    // Everything above is asynchronous; wait for it before reading the clock so the reported
    // duration includes the actual execution.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto end = std::chrono::steady_clock::now();
    const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

    if (cpu_post) {
        std::cout << "inference time: " << elapsed_ms << "ms" << std::endl;
    } else if (gpu_post) {
        std::cout << "inference and gpu postprocess time: " << elapsed_ms << "ms" << std::endl;
    }
}

// Parse the command line of the detection tool.
//
//   -s <wts> <engine> <type>        type is n/s/m/l/x, optionally followed by '2' or '6'.
//                                   Fills wts, engine, sub_type, gd, gw, max_channels and, for
//                                   the two-character forms, is_p (2 or 6). Seven arguments are
//                                   also accepted; the two extra ones are not read.
//   -d <engine> <image dir> <c|g>   fills engine, img_dir and cuda_post_process
// Returns true when the arguments match one of these forms, false otherwise.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);

    if (mode == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // Write through the output parameter so the caller actually receives the sub type.
        sub_type = std::string(argv[4]);

        if (sub_type[0] == 'n') {
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
        } else if (sub_type[0] == 's') {
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
        } else if (sub_type[0] == 'm') {
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
        } else if (sub_type[0] == 'x') {
            gd = 1.0;
            gw = 1.25;
            max_channels = 640;
        } else {
            return false;
        }
        // An optional second character selects the P6 or P2 variant.
        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        } else if (sub_type.size() == 2 && sub_type[1] == '2') {
            is_p = 2;
        }
        return true;
    }

    if (mode == "-d" && argc == 5) {
        const std::string post_mode(argv[4]);
        // Only "c" (CPU) and "g" (GPU) post-processing are handled downstream; any other value
        // would run inference without producing detections, so reject it here.
        if (post_mode != "c" && post_mode != "g") {
            return false;
        }
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = post_mode;
        return true;
    }

    return false;
}

// Entry point of the detection tool.
//
//   -s <wts> <engine> <type>        builds an engine and writes it to disk
//   -d <engine> <image dir> <c|g>   loads an engine, runs detection on every image in the
//                                   directory and writes each annotated image as "_<name>"
// Returns 0 on success and -1 on bad arguments or when the image directory cannot be read.
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    int model_bboxes = 0;
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov8 -d [.engine] ../samples  [c/g]// deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
        return 0;
    }

    // Read images from directory. Done before the engine, stream and preprocess buffers are
    // created so the early return below does not leave any of them unreleased.
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // The first dimension of the output binding is passed to the GPU decode step as the box count.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            //Process gpu decode and nms results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers. Pointers that were never allocated for the selected
    // post-processing mode are still nullptr, which cudaFree and delete[] accept.
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    return 0;
}


================================================
FILE: yolov8/yolov8_det_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Module-level tuning constants.
# NOTE(review): their consumers are not visible in this part of the file; the descriptions
# below are inferred from the names and should be confirmed against the post-processing code.
# Presumably the minimum confidence and the IoU threshold used when filtering detections.
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
# Presumably per-detection element counts for the pose / detection / segmentation / OBB heads
# (POSE_NUM looks like 17 keypoints with 3 values each).
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1


def get_img_path_batches(batch_size, img_dir):
    """
    description: Collect every file below img_dir (recursively) and split the paths into batches.
    param:
        batch_size: int, maximum number of paths in one batch
        img_dir:    str, root directory to scan
    return:
        list of batches, each a list of file paths; the final batch may be shorter
    """
    grouped = []
    pending = []
    for dirpath, _dirnames, filenames in os.walk(img_dir):
        for filename in filenames:
            # A full batch is flushed just before the next path is added to a fresh one.
            if len(pending) == batch_size:
                grouped.append(pending)
                pending = []
            pending.append(os.path.join(dirpath, filename))
    # Whatever is left over forms the last batch.
    if len(pending) > 0:
        grouped.append(pending)
    return grouped


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag above its
                 top-left corner, directly onto img.
    param:
        x:      a box like [x1,y1,x2,y2]
        img:    an opencv image object, modified in place
        color:  color to draw the rectangle, such as (0,255,0); random when omitted
        label:  str, text to draw; nothing is written when omitted
        line_thickness: int, derived from the image size when omitted
    return:
        no return

    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    box_color = color or [random.randint(0, 255) for _ in range(3)]
    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, box_color, thickness=thickness, lineType=cv2.LINE_AA)
    if not label:
        return

    font_thickness = max(thickness - 1, 1)  # font thickness
    font_scale = thickness / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # Opposite corner of the label background, placed above the box's top-left corner.
    tag_corner = top_left[0] + text_w, top_left[1] - text_h - 3
    cv2.rectangle(img, top_left, tag_corner, box_color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov8TRT(object):
    """
    description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and allocate
                     page-locked host buffers plus device buffers for every binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            # Every buffer is sized for max_batch_size images.
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # Per-image output length. The host buffer was allocated with the
        # max_batch_size factor included, so divide it out again; otherwise the
        # per-image slices taken in infer() fall outside the buffer for batch > 1.
        self.det_output_length = host_outputs[0].shape[0] // max(self.batch_size, 1)

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine and draw the
                     detections on the original images.
        param:
            raw_image_generator: iterable yielding at most batch_size BGR images
        return:
            batch_image_raw: list of the original images with boxes drawn on them
            elapsed:         seconds spent on host->device copy, inference and device->host copy
        """
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess. Zero-filled so that the unused slots of a short
            # final batch hold defined values instead of uninitialised memory.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Always deactivate the context, even when preprocessing or inference raised,
            # so the context stack stays balanced.
            self.ctx.pop()
        # Flat output buffer holding det_output_length floats per image.
        output = host_outputs[0]
        # Do postprocess only for the images that were actually supplied; the last
        # batch of a directory may contain fewer than batch_size images.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context created in __init__ off the context stack.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of image paths
        return:
            generator yielding the result of cv2.imread for every path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup; yields batch_size all-zero images of the
                     network input size. image_path_batch is ignored.
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: BGR image as a numpy array of shape (h, w, c)
        return:
            image:  the processed image, float32, shape (1, 3, input_h, input_w)
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map nx4 boxes from network-input (letterboxed) coordinates back to
                        original-image coordinates. Despite the name, columns 0/2 are
                        treated as x values and columns 1/3 as y values: only the
                        padding offset is removed and the resize ratio undone, no
                        centre/size to corner conversion takes place.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [x1, y1, x2, y2] in input coordinates
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2] in original coordinates
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded vertically: remove the top padding from the y values.
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Image was padded horizontally: remove the left padding from the x values.
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array [num_boxes, det_0..., det_1..., ...] where every
                        detection occupies DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM floats and
                        starts with box (4 values), conf, cls_id
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray and keep the rows that are actually filled
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (boxes are treated as inclusive pixel ranges, hence the +1)
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon keeps the division defined for degenerate boxes.
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, one row per detection; columns 0-3 are the box,
                        column 4 the confidence, column 5 the class id, any further
                        columns are carried along untouched
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: rows kept after nms (same column layout as prediction, box mapped to
                   original-image [x1, y1, x2, y2]); an empty array when nothing is kept
        """
        # Get the boxes that score >= conf_thres (boolean indexing returns a copy,
        # so the caller's array is not modified below)
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the box from network-input coordinates to original-image coordinates
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            # Compare class ids (column 5, the same column post_process reports as the
            # class id). The last column is not the class id once rows carry extra
            # per-detection values, so it must not be used for the label match.
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    Thread that runs one inference batch through the wrapper and writes the
    annotated images into the 'output' directory.
    """

    def __init__(self, yolov8_wrapper, image_path_batch):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov8_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(raw_images)
        for idx in range(len(self.image_path_batch)):
            # Keep only the file name; every result goes flat into 'output'.
            save_name = os.path.join('output', os.path.basename(self.image_path_batch[idx]))
            # Save image
            cv2.imwrite(save_name, batch_image_raw[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    Thread that pushes one batch of all-zero images through the wrapper to warm
    the engine up, then prints the image shape and the elapsed time.
    """

    def __init__(self, yolov8_wrapper):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        dummy_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(dummy_images)
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Usage: <script> [engine file] [plugin library]; both arguments are optional.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "yolov8n.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "./build/libmyplugins.so"

    # Load the custom plugin library before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # COCO labels; read as a module-level name by YoLov8TRT.infer when labelling boxes.
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
        "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
        "toothbrush",
    ]

    # Start from an empty output directory on every run.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread and run to completion.
        for _ in range(10):
            warm_up = warmUpThread(yolov8_wrapper)
            warm_up.start()
            warm_up.join()
        # One inference thread per batch, executed strictly one after another.
        for batch in image_path_batches:
            worker = inferThread(yolov8_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()


================================================
FILE: yolov8/yolov8_obb.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger instance handed to createInferBuilder() and createInferRuntime() in this file.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in one image's raw output buffer: kMaxNumOutputBbox Detection
// records expressed in floats, plus one extra leading float
// (presumably the detection count -- confirm against the decode plugin).
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

// Build the YOLOv8-OBB network from a .wts file and write the serialized engine
// (plan) to engine_name.
//
//   wts_name      path of the weights file passed to buildEngineYolov8Obb
//   engine_name   path of the plan file to create
//   is_p          6 or 2 select the p6 / p2 variants, which are not supported here
//   sub_type      currently unused; kept so the signature matches the callers
//   gd, gw        depth / width multiples forwarded to the builder
//   max_channels  channel cap forwarded to the builder
//
// Every failure is reported on stderr and terminates the process with a non-zero
// status. The checks are explicit instead of assert() so that they also hold in
// builds compiled with NDEBUG, where the asserts would vanish and a null engine
// would be dereferenced.
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                      float& gw, int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    if (!builder) {
        std::cerr << "could not create TensorRT builder" << std::endl;
        exit(1);
    }
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    if (is_p == 6) {
        std::cout << "p6 is not supported right now" << std::endl;
    } else if (is_p == 2) {
        std::cout << "p2 is not supported right now" << std::endl;
    } else {
        serialized_engine = buildEngineYolov8Obb(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    }

    if (!serialized_engine) {
        // Either an unsupported variant was requested or the build itself failed.
        std::cerr << "could not build serialized engine" << std::endl;
        delete config;
        delete builder;
        exit(1);
    }

    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        delete serialized_engine;
        delete config;
        delete builder;
        exit(1);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    const bool write_ok = p.good();

    delete serialized_engine;
    delete config;
    delete builder;

    if (!write_ok) {
        std::cerr << "could not write plan output file" << std::endl;
        exit(1);
    }
}

// Load a serialized plan from engine_name and create the runtime, engine and
// execution context from it. The three objects are returned through the out
// pointers; the caller owns them. Failures are guarded by assert().
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream plan_file(engine_name, std::ios::binary);
    if (!plan_file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }

    // Determine the file length by seeking to the end, then rewind.
    plan_file.seekg(0, plan_file.end);
    size_t plan_size = plan_file.tellg();
    plan_file.seekg(0, plan_file.beg);

    // Read the whole plan into a temporary heap buffer.
    char* plan_bytes = new char[plan_size];
    assert(plan_bytes);
    plan_file.read(plan_bytes, plan_size);
    plan_file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);

    *engine = (*runtime)->deserializeCudaEngine(plan_bytes, plan_size);
    assert(*engine);

    *context = (*engine)->createExecutionContext();
    assert(*context);

    // The buffer is released once the engine and context have been created.
    delete[] plan_bytes;
}

// Allocate the device input/output buffers for the engine and, depending on the
// post-processing mode, the host-side result buffers.
//
//   engine                engine whose two bindings (input at 0, output at 1) are used
//   input_buffer_device   receives a device buffer of kBatchSize * 3 * kInputH * kInputW floats
//   output_buffer_device  receives a device buffer of kBatchSize * kOutputSize floats
//   output_buffer_host    "c" mode: receives a host array of kBatchSize * kOutputSize floats
//   decode_ptr_host       "g" mode: receives a host array of 1 + kMaxNumOutputBbox * bbox_element floats
//   decode_ptr_device     "g" mode: receives the matching device buffer
//   cuda_post_process     "c" for CPU post-processing, "g" for GPU post-processing;
//                         for any other value only the two device buffers are allocated
//
// GPU post-processing with kBatchSize > 1 is rejected; the process then exits with
// a non-zero status so that callers and scripts can see the failure.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            // This is an error path: report failure instead of exit status 0.
            exit(1);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}

// Run one batch through the engine and fetch the results.
//
//   context            execution context used for enqueue()
//   stream             CUDA stream on which all work of this call is issued
//   buffers            device bindings: [0] input, [1] raw network output
//   output             "c" mode: host buffer receiving batchsize * kOutputSize floats
//   batchsize          batch size passed to enqueue()
//   decode_ptr_host    "g" mode: host buffer receiving the decoded and NMS-filtered boxes
//   decode_ptr_device  "g" mode: device scratch buffer for decode and NMS
//   model_bboxes       number of candidate boxes passed to cuda_decode_obb
//   cuda_post_process  "c" copies the raw output to the host, "g" decodes and runs NMS
//                      on the GPU first; any other value only runs the network
//
// All work is queued asynchronously, so the stream is synchronized BEFORE the end
// timestamp is taken. Otherwise the printed time would only cover the cost of
// queueing the work, not the inference itself. The stream is always synchronized
// when this function returns.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        // Wait for inference and the copy to finish so the measurement is meaningful.
        CUDA_CHECK(cudaStreamSynchronize(stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    } else if (cuda_post_process == "g") {
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode_obb((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms_obb(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        // Wait for inference, decode, NMS and the copy to finish before stopping the clock.
        CUDA_CHECK(cudaStreamSynchronize(stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference and gpu postprocess time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    } else {
        // Unknown mode: nothing is copied back, but still leave the stream idle on return.
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

// Parse the command line.
//
// Serialize mode:   -s <wts> <engine> <sub_type>      (argc == 5 or argc == 7)
//   fills wts, engine and sub_type, and derives gd / gw / max_channels from the
//   first character of sub_type (n, s, m, l, x). A second character '6' or '2'
//   sets is_p to 6 or 2; otherwise is_p is left untouched.
// Deserialize mode: -d <engine> <img_dir> <c|g>       (argc == 5)
//   fills engine, img_dir and cuda_post_process.
//
// Returns true when the arguments match one of the two forms, false otherwise.
// Output parameters may already be partially written when false is returned.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;
    if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // Write to the out-parameter itself. A local declared with the same name
        // would shadow it and leave the caller's sub_type empty.
        sub_type = std::string(argv[4]);

        // For an empty string, operator[](0) yields '\0' and falls into the default case.
        switch (sub_type[0]) {
            case 'n':
                gd = 0.33;
                gw = 0.25;
                max_channels = 1024;
                break;
            case 's':
                gd = 0.33;
                gw = 0.50;
                max_channels = 1024;
                break;
            case 'm':
                gd = 0.67;
                gw = 0.75;
                max_channels = 576;
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                max_channels = 512;
                break;
            case 'x':
                gd = 1.0;
                gw = 1.25;
                max_channels = 640;
                break;
            default:
                return false;
        }
        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        } else if (sub_type.size() == 2 && sub_type[1] == '2') {
            is_p = 2;
        }
    } else if (std::string(argv[1]) == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
    } else {
        return false;
    }
    return true;
}

// Entry point. Two modes, selected by parse_args():
//   -s : build the engine from a .wts file, write the plan file and exit;
//   -d : load a plan file, run every image found in img_dir through the network in
//        batches of kBatchSize, draw the oriented boxes and save each result as
//        "_<original file name>" in the current working directory.
// Returns 0 on success and -1 on bad arguments or when the image directory cannot be read.
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    int model_bboxes;
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov8 -d [.engine] ../samples  [c/g]// deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    // (wts_name is only filled in "-s" mode, so a non-empty value selects serialization)
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // First dimension of the output binding (index 1); forwarded to infer() as model_bboxes.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    // NOTE(review): this early return skips the release of stream, engine, context and runtime below.
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        // (the last batch may hold fewer than kBatchSize images)
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            // NOTE(review): the result of cv::imread is not checked for an empty image.
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        // (always enqueued with kBatchSize, even when img_batch is shorter)
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms_obb(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            //Process gpu decode and nms results
            batch_process_obb(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw bounding boxes
        draw_bbox_obb(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    // (pointers that were never allocated for the chosen mode are still nullptr here)
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov8/yolov8_obb_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import sys
import threading
import time
import cv2
import math
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Detection thresholds and per-detection field counts.
# NOTE(review): their consumers are outside the visible part of this file; the
# meanings below are presumed from the names -- confirm against post-processing.
# presumably the minimum confidence for a detection to be kept
CONF_THRESH = 0.5
# presumably the IoU threshold used by non-maximum suppression
IOU_THRESHOLD = 0.4
# presumably 17 keypoints x (x, y, conf)
POSE_NUM = 17 * 3
# presumably 4 box values + confidence + class id
DET_NUM = 6
# presumably 32 segmentation mask coefficients
SEG_NUM = 32
# presumably one rotation angle per oriented box
OBB_NUM = 1

# Network input size in pixels; get_rect_obb() uses these to undo the letterbox scaling.
INPUT_W = 640
INPUT_H = 640


class Detection:
    """
    Plain container for one oriented detection.

    Attributes:
        bbox:     box values; get_corner() unpacks them as (cx, cy, w, h)
        score:    confidence score
        class_id: class index
        angle:    rotation angle; get_corner() treats it as radians
    """

    def __init__(self, bbox, score, class_id, angle):
        self.bbox, self.score = bbox, score
        self.class_id, self.angle = class_id, angle


def get_img_path_batches(batch_size, img_dir):
    """
    description: Collect every file below img_dir (recursively) and split the paths into batches.
    param:
        batch_size: int, number of paths per batch (the last batch may be shorter)
        img_dir:    str, directory walked with os.walk
    return:
        list of batches, each batch being a list of file paths in os.walk order
    """
    grouped = []
    pending = []
    for folder, _subdirs, fnames in os.walk(img_dir):
        for full_path in (os.path.join(folder, fname) for fname in fnames):
            # A full batch is flushed only when one more path shows up.
            if len(pending) == batch_size:
                grouped.append(pending)
                pending = []
            pending.append(full_path)
    if pending:
        grouped.append(pending)
    return grouped


def get_corner(img, box: Detection):
    """
    description: Get the four corner points of the rotated bounding box
    param:
        img:    an opencv image object (numpy array)
        box:    a Detection object containing bbox [cx,cy,w,h] and angle (radians)
    return:
        corners: four corner points of the rotated bounding box as an int32 numpy array
                 [[x1,y1], [x2,y2], [x3,y3], [x4,y4]], clipped to the image
    """
    # Extract box parameters
    cx, cy, w, h = box.bbox
    angle = box.angle * 180.0 / math.pi  # Convert radians to degrees

    # Make the longer side the width; a swap rotates the box by 90 degrees
    if h >= w:
        w, h = h, w
        angle = (angle + 90.0) % 180.0  # Adjust angle

    # Bring the angle back towards the 0-180 degree range
    if angle < 0:
        angle += 360.0
    if angle > 180.0:
        angle -= 180.0

    # Convert back to radians for calculation
    angle_rad = angle * math.pi / 180.0
    cos_val = math.cos(angle_rad)
    sin_val = math.sin(angle_rad)

    # Axis-aligned extent of the unrotated box in network-input coordinates
    l_x = cx - w / 2
    r_x = cx + w / 2
    t_y = cy - h / 2
    b_y = cy + h / 2

    # Scale coordinates using get_rect_obb (matching C++ version)
    bbox = [l_x, t_y, r_x, b_y]
    rect = get_rect_obb(img, bbox)

    # Calculate center and dimensions of scaled box
    x_ = (rect[0] + rect[0] + rect[2]) / 2  # rect.x + rect.width/2
    y_ = (rect[1] + rect[1] + rect[3]) / 2  # rect.y + rect.height/2
    width = rect[2]
    height = rect[3]

    # Half-extent vectors along the rotated width (vec1) and height (vec2) axes
    vec1x = width / 2 * cos_val
    vec1y = width / 2 * sin_val
    vec2x = -height / 2 * sin_val
    vec2y = height / 2 * cos_val

    # Four corners in drawing order. The labels give the position each corner has
    # at angle 0 in image coordinates (y grows downwards).
    corners = np.array([
        [int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y))],  # bottom-right at angle 0
        [int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y))],  # top-right at angle 0
        [int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y))],  # top-left at angle 0
        [int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y))]  # bottom-left at angle 0
    ], dtype=np.int32)

    # Clip to image boundaries (separate names so the box w/h are not overwritten)
    img_h, img_w = img.shape[:2]
    corners[:, 0] = np.clip(corners[:, 0], 0, img_w - 1)
    corners[:, 1] = np.clip(corners[:, 1], 0, img_h - 1)

    return corners


def get_rect_obb(img, bbox):
    """
    Map a box from network-input (letterboxed) coordinates back to the original image
    (matching C++ version).
    param:
        img: OpenCV image (numpy array)
        bbox: [left, top, right, bottom] in network-input coordinates
    return:
        [x, y, width, height] as ints, limited to the image area
    """
    left, top, right, bottom = bbox
    img_h, img_w = img.shape[0], img.shape[1]
    ratio_w = INPUT_W / img_w  # INPUT_W should be your model input width
    ratio_h = INPUT_H / img_h  # INPUT_H should be your model input height

    if ratio_h > ratio_w:
        # Padding was added above and below: remove the vertical offset only.
        offset = (INPUT_H - ratio_w * img_h) / 2
        top = top - offset
        bottom = bottom - offset
        scale = ratio_w
    else:
        # Padding was added left and right: remove the horizontal offset only.
        offset = (INPUT_W - ratio_h * img_w) / 2
        left = left - offset
        right = right - offset
        scale = ratio_h

    # Undo the resize.
    left = left / scale
    right = right / scale
    top = top / scale
    bottom = bottom / scale

    # Keep the origin inside the image and limit the size to what remains.
    left = max(0.0, left)
    top = max(0.0, top)
    x0 = int(round(left))
    y0 = int(round(top))
    width = max(0, min(int(round(right - left)), img_w - x0))
    height = max(0, min(int(round(bottom - top)), img_h - y0))

    return [x0, y0, width, height]


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one rotated bounding box (and optional label) on img, in place.
    param:
        x:      the box description handed unchanged to get_corner(img, x)
                (the original docs describe it as [cx, cy, w, h, angle] — confirm
                against get_corner)
        img:    an opencv image object
        color:  color used for the polygon outline and the label background
        label:  str, text drawn next to the first corner; nothing is drawn when falsy
        line_thickness: int, outline thickness; derived from the image size when falsy
    """
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1

    # Four corner points of the rotated rectangle, as an integer polygon
    polygon = get_corner(img, x).astype(int)
    cv2.polylines(img, [polygon], isClosed=True, color=color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    # The label is anchored at the first corner point
    anchor = tuple(polygon[0])
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]

    # Put the label above the anchor when there is room, otherwise below it
    if anchor[1] - text_h >= 3:
        box_end = (anchor[0] + text_w, anchor[1] - text_h - 3)
        text_origin = (anchor[0], anchor[1] - 2)
    else:
        box_end = (anchor[0] + text_w, anchor[1] + text_h + 3)
        text_origin = (anchor[0], anchor[1] + text_h + 2)

    cv2.rectangle(img, anchor, box_end, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        text_origin,
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA
    )


class YoLov8TRT(object):
    """
    description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Deserialize the engine and allocate a host/device buffer pair per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            shape = engine.get_binding_shape(binding)
            print('bingding:', binding, shape)
            size = trt.volume(shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = shape[-1]
                self.input_h = shape[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # The host output buffer was sized for the whole batch (volume * max_batch_size),
        # so the per-image length used to slice it in infer() is that size divided by
        # the batch size. For batch size 1 this is the full buffer length, as before.
        self.det_output_length = host_outputs[0].shape[0] // engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Run preprocess, inference and postprocess for one batch and draw the results.
        param:
            raw_image_generator: iterable of BGR images; at most batch_size images are expected
        return:
            batch_image_raw: list of the input images with detections drawn on them in place
            use_time:        float, seconds spent on host->device copy, execution and
                             device->host copy (as measured with time.time())
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Always remove the context pushed above, even when preprocessing or
            # inference raised, so the context stack stays balanced.
            self.ctx.pop()

        output = host_outputs[0]
        # Do postprocess. Only the images that were actually supplied are handled:
        # the last batch of a directory may hold fewer than batch_size images.
        for i in range(len(batch_image_raw)):
            keep = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for box in keep:
                # Seeding with the class id gives every class a stable color
                np.random.seed(int(box.class_id))
                color = [np.random.randint(0, 255) for _ in range(3)]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(box.class_id)], box.score
                    ),
                    color=color,
                    line_thickness=1
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context that was created in __init__.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read images from the given paths, one at a time
        param:
            image_path_batch: iterable of image paths
        return:
            generator yielding the result of cv2.imread for each path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        param:
            image_path_batch: unused
        return:
            generator yielding batch_size all-zero uint8 images of the input size
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: numpy array, the BGR image (height x width x channels)
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map the first four columns of x from network-input coordinates back to
                        the original image: the letterbox padding is subtracted on the padded
                        axis and the whole result is divided by the resize ratio.
                        NOTE(review): despite the name, no center/size to corner conversion is
                        done here — confirm the expected input layout with the caller.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, one box per row, at least four columns
        return:
            y:          A numpy array shaped like x holding the rescaled values
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def covariance_matrix(self, res: Detection):
        """
        description: Generating covariance matrix from obbs.
        param:
            res: a Detection; bbox[2], bbox[3] are read as width and height and
                 angle as the rotation passed to math.cos / math.sin

        return:
            tuple: (a, b, c) values of covariance matrix
        """
        w = res.bbox[2]
        h = res.bbox[3]
        angle = res.angle

        a = w * w / 12.0
        b = h * h / 12.0
        c = angle

        cos_r = math.cos(c)
        sin_r = math.sin(c)

        cos_r2 = cos_r * cos_r
        sin_r2 = sin_r * sin_r

        a_val = a * cos_r2 + b * sin_r2
        b_val = a * sin_r2 + b * cos_r2
        c_val = (a - b) * cos_r * sin_r

        return a_val, b_val, c_val

    def probiou(self, box1: Detection, box2: Detection, eps=1e-7):
        """
        description: Calculate the prob IoU between oriented bounding boxes.
        param:
            box1: first Detection (bbox[0], bbox[1] are read as its center)
            box2: second Detection
            eps (float): Small value to avoid division by zero
        return:
            float: 1 - hd, where hd = sqrt(1 - exp(-bd) + eps) and bd is the sum
                   t1 + t2 + t3 clamped to [eps, 100]
        """
        a1, b1, c1 = self.covariance_matrix(box1)
        a2, b2, c2 = self.covariance_matrix(box2)

        x1, y1 = box1.bbox[0], box1.bbox[1]
        x2, y2 = box2.bbox[0], box2.bbox[1]

        t1 = ((a1 + a2) * (y1 - y2) ** 2 + (b1 + b2) * (x1 - x2) ** 2) / \
             ((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2 + eps)
        t1 *= 0.25

        t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / \
             ((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2 + eps)
        t2 *= 0.5

        t3 = ((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2) / \
             (4 * math.sqrt(max(a1 * b1 - c1 * c1, 0.0)) *
              math.sqrt(max(a2 * b2 - c2 * c2, 0.0)) + eps)
        t3 = math.log(t3 + eps) * 0.5

        bd = max(min(t1 + t2 + t3, 100.0), eps)
        hd = math.sqrt(1.0 - math.exp(-bd) + eps)

        return 1 - hd

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction: confidence filtering followed by
                     per-class NMS based on probiou
        param:
            output:     A flat numpy array whose first element is the number of boxes, followed by
                        fixed-size records of DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM values each.
                        A record starts with cx, cy, w, h, conf, cls_id; the angle is read from the
                        slot that follows the DET/SEG/POSE parts.
            origin_h:   height of original image (currently unused)
            origin_w:   width of original image (currently unused)
        return:
            keep: list of Detection objects that survived NMS and whose score is
                  strictly greater than CONF_THRESH (empty list when nothing is detected)
        """
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # The angle follows the detection, mask and keypoint parts of a record
        # (index 89 for DET_NUM=6, SEG_NUM=32, POSE_NUM=51).
        angle_index = DET_NUM + SEG_NUM + POSE_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]

        # Filter by confidence threshold
        mask = pred[:, 4] >= CONF_THRESH
        pred = pred[mask]

        if len(pred) == 0:
            return []

        # Group the detections by class id
        m_map = {}
        for row in pred:
            class_id = int(row[5])
            m_map.setdefault(class_id, []).append(
                Detection(row[:4], row[4], class_id, row[angle_index])
            )

        # Per-class greedy NMS: a suppressed detection is marked with score 0.0
        res = []
        for dets in m_map.values():
            dets = sorted(dets, key=lambda x: x.score, reverse=True)
            for m in range(len(dets)):
                if dets[m].score == 0.0:
                    continue
                item = dets[m]
                res.append(item)
                for n in range(m + 1, len(dets)):
                    if dets[n].score == 0.0:
                        continue
                    if self.probiou(item, dets[n]) > IOU_THRESHOLD:
                        dets[n].score = 0.0

        return [det for det in res if det.score > CONF_THRESH]


class inferThread(threading.Thread):
    """
    Thread that runs inference on one batch of image paths and writes the
    annotated images into the 'output' directory under their original file names.
    """

    def __init__(self, yolov8_wrapper, image_path_batch):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov8_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(raw_images)
        for index in range(len(self.image_path_batch)):
            # Keep only the file name; every result goes into 'output'
            filename = os.path.basename(self.image_path_batch[index])
            save_name = os.path.join('output', filename)
            # Save image
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    Thread that runs one inference pass on the wrapper's all-zero warm-up
    images and prints the image shape and the elapsed time.
    """

    def __init__(self, yolov8_wrapper):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        dummy_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(dummy_images)
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Engine path and custom plugin library; both can be overridden on the
    # command line: <script> [engine_file_path] [plugin_library]
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "yolov8n-obb.engine"
    PLUGIN_LIBRARY = sys.argv[2] if len(sys.argv) > 2 else "./build/libmyplugins.so"

    # The plugin library has to be loaded before the engine is deserialized
    ctypes.CDLL(PLUGIN_LIBRARY)

    # DOTAV 1.5 labels, indexed by class id when labels are drawn
    categories = ["plane", "ship", "storage tank", "baseball diamond", "tennis court",
                  "basketball court", "ground track field", "harbor",
                  "bridge", "large vehicle", "small vehicle", "helicopter",
                  "roundabout", "soccer ball field", "swimming pool", "container crane"]

    # Start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread and run to completion
        for i in range(10):
            warmup_worker = warmUpThread(yolov8_wrapper)
            warmup_worker.start()
            warmup_worker.join()

        # One inference thread per batch, executed one after another
        for batch in image_path_batches:
            infer_worker = inferThread(yolov8_wrapper, batch)
            infer_worker.start()
            infer_worker.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()


================================================
FILE: yolov8/yolov8_pose.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger instance passed to createInferBuilder / createInferRuntime in this file.
Logger gLogger;
using namespace nvinfer1;
// Number of floats in one image's output buffer: kMaxNumOutputBbox Detection records expressed
// in float units, plus one extra float (presumably the detection count — confirm against the
// decode plugin).
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

/**
 * Builds a YOLOv8-pose engine from a .wts file and writes the serialized plan to disk.
 *
 * @param wts_name      path of the weights file handed to the engine builder
 * @param engine_name   path of the plan file to write
 * @param is_p          6 selects the P6 variant, 2 (P2) is not supported, anything else the default model
 * @param sub_type      unused here; kept for interface compatibility
 * @param gd, gw        scaling factors forwarded to the engine builder
 * @param max_channels  channel limit forwarded to the engine builder
 *
 * Any failure (unsupported variant, build failure, unwritable output file) is reported on stderr
 * and terminates the process with a non-zero status. The checks are explicit instead of assert()
 * so they are still active when NDEBUG is defined.
 */
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                      float& gw, int& max_channels) {
    (void)sub_type;

    IBuilder* builder = createInferBuilder(gLogger);
    if (builder == nullptr) {
        std::cerr << "could not create the TensorRT builder" << std::endl;
        exit(1);
    }
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    if (is_p == 6) {
        serialized_engine = buildEngineYolov8PoseP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else if (is_p == 2) {
        std::cout << "p2 is not supported right now" << std::endl;
    } else {
        serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    }

    // Nothing was built (unsupported variant or build failure): release what we own and stop
    // instead of dereferencing a null pointer below.
    if (serialized_engine == nullptr) {
        std::cerr << "could not build the serialized engine" << std::endl;
        delete config;
        delete builder;
        exit(1);
    }

    bool written = false;
    {
        std::ofstream p(engine_name, std::ios::binary);
        if (!p) {
            std::cout << "could not open plan output file" << std::endl;
        } else {
            p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
            written = p.good();
            if (!written) {
                std::cerr << "could not write plan output file" << std::endl;
            }
        }
    }

    delete serialized_engine;
    delete config;
    delete builder;

    if (!written) {
        exit(1);
    }
}

/**
 * Loads a serialized plan file and creates the runtime, engine and execution context from it.
 *
 * @param engine_name  path of the plan file to read
 * @param runtime      receives the created IRuntime
 * @param engine       receives the deserialized ICudaEngine
 * @param context      receives the execution context created from the engine
 *
 * Any failure (unreadable or empty file, runtime/engine/context creation failure) is reported on
 * stderr and terminates the process with a non-zero status. The checks are explicit instead of
 * assert() so they are still active when NDEBUG is defined.
 */
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }

    // Determine the file size by seeking to the end; tellg() reports -1 on failure.
    file.seekg(0, file.end);
    const std::streamoff end_pos = file.tellg();
    file.seekg(0, file.beg);
    if (end_pos <= 0) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }
    const size_t size = static_cast<size_t>(end_pos);

    // The vector owns the plan bytes and releases them on every exit path.
    std::vector<char> serialized_engine(size);
    file.read(serialized_engine.data(), static_cast<std::streamsize>(size));
    if (!file) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        exit(1);
    }
    file.close();

    *runtime = createInferRuntime(gLogger);
    if (*runtime == nullptr) {
        std::cerr << "could not create the TensorRT runtime" << std::endl;
        exit(1);
    }
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
    if (*engine == nullptr) {
        std::cerr << "could not deserialize engine " << engine_name << std::endl;
        exit(1);
    }
    *context = (*engine)->createExecutionContext();
    if (*context == nullptr) {
        std::cerr << "could not create the execution context" << std::endl;
        exit(1);
    }
}

/**
 * Allocates the device input/output buffers and the host-side buffers needed by the selected
 * post-processing mode.
 *
 * @param engine                engine whose two bindings (input at index 0, output at index 1) are checked
 * @param input_buffer_device   receives the device input buffer (kBatchSize * 3 * kInputH * kInputW floats)
 * @param output_buffer_device  receives the device output buffer (kBatchSize * kOutputSize floats)
 * @param output_buffer_host    receives the host output buffer; allocated only for mode "c"
 * @param decode_ptr_host       receives the host decode buffer; allocated only for mode "g"
 * @param decode_ptr_device     receives the device decode buffer; allocated only for mode "g"
 * @param cuda_post_process     "c" or "g"; for any other value only the device input/output
 *                              buffers are allocated
 *
 * Mode "g" with kBatchSize > 1 is not supported: the process is terminated with a non-zero
 * status (the request cannot be served, so this must not look like a successful run).
 */
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(1);
        }
        // One leading float followed by kMaxNumOutputBbox records of bbox_element floats;
        // the same element count is used for the host and the device buffer.
        const size_t decode_count = 1 + kMaxNumOutputBbox * bbox_element;
        *decode_ptr_host = new float[decode_count];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * decode_count));
    }
}

/**
 * Enqueues inference for one batch, queues the mode-specific post-processing / copies on the
 * same stream, waits for the stream and prints the elapsed time.
 *
 * @param context            execution context used to enqueue the batch
 * @param stream             CUDA stream all work is queued on
 * @param buffers            device bindings; buffers[1] is read as the network output
 * @param output             host buffer receiving the raw output (mode "c")
 * @param batchsize          batch size passed to enqueue and used to size the output copy
 * @param decode_ptr_host    host buffer receiving the decoded + NMS results (mode "g")
 * @param decode_ptr_device  device scratch buffer for decode / NMS (mode "g")
 * @param model_bboxes       number of boxes passed to cuda_decode (mode "g")
 * @param cuda_post_process  "c" copies the raw output to the host; "g" runs decode and NMS on the
 *                           stream first; any other value only runs inference and prints nothing
 *
 * The work queued here is asynchronous, so the end timestamp is taken after the stream has been
 * synchronized; otherwise only the time needed to enqueue the work would be reported.
 */
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    const auto start = std::chrono::steady_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);

    const char* timing_label = nullptr;
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        timing_label = "inference time: ";
    } else if (cuda_post_process == "g") {
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        timing_label = "inference and gpu postprocess time: ";
    }

    // Wait until everything queued above has finished before stopping the clock.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (timing_label != nullptr) {
        const auto end = std::chrono::steady_clock::now();
        std::cout << timing_label << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    }
}

/**
 * Parses the command line.
 *
 * Two modes are accepted:
 *   -s <wts> <engine> <sub_type>   (argc of 5 or 7) serialize mode; fills wts, engine, sub_type,
 *                                   gd, gw, max_channels and, for a "6"/"2" suffix, is_p
 *   -d <engine> <img_dir> <c|g>    (argc of 5) inference mode; fills engine, img_dir and
 *                                   cuda_post_process
 *
 * The first character of sub_type (n/s/m/l/x) selects the scaling parameters; an optional second
 * character '6' or '2' sets is_p to 6 or 2. is_p is left untouched otherwise.
 *
 * @return true when the arguments match one of the two modes, false otherwise
 */
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);
    if (mode == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // Parsed into a local first; it is published through the sub_type output parameter
        // below (a local named sub_type would shadow the parameter and leave it unset).
        const std::string type(argv[4]);

        switch (type.empty() ? '\0' : type[0]) {
            case 'n':
                gd = 0.33f;
                gw = 0.25f;
                max_channels = 1024;
                break;
            case 's':
                gd = 0.33f;
                gw = 0.50f;
                max_channels = 1024;
                break;
            case 'm':
                gd = 0.67f;
                gw = 0.75f;
                max_channels = 576;
                break;
            case 'l':
                gd = 1.0f;
                gw = 1.0f;
                max_channels = 512;
                break;
            case 'x':
                gd = 1.0f;
                gw = 1.25f;
                max_channels = 640;
                break;
            default:
                return false;
        }
        sub_type = type;

        if (type.size() == 2 && type[1] == '6') {
            is_p = 6;
        } else if (type.size() == 2 && type[1] == '2') {
            is_p = 2;
        }
        return true;
    }

    if (mode == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
        return true;
    }

    return false;
}

// Entry point. With "-s" the engine is built from a .wts file and written to disk; with "-d" the
// engine is loaded and every file found in the image directory is processed in batches of
// kBatchSize, with the annotated images written to the working directory under a "_" prefix.
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    int model_bboxes;
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov8 -d [.engine] ../samples  [c/g]// deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    // (wts_name is only filled in by parse_args in "-s" mode, so a non-empty value selects it)
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // First dimension of binding 1 (the output binding), forwarded to infer() as model_bboxes
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    // NOTE(review): this early return skips the release of the stream, preprocess buffers,
    // context, engine and runtime created above.
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        // NOTE(review): the result of cv::imread is not checked; confirm that the directory
        // only contains readable images.
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        // (always enqueued with kBatchSize, even when the last batch holds fewer images)
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            // todo pose in gpu
            std::cerr << "pose_postprocess is not support in gpu right now" << std::endl;
        }
        // Draw bounding boxes
        // NOTE(review): in "g" mode res_batch is still empty at this point; verify that
        // draw_bbox_keypoints_line tolerates a result vector shorter than img_batch.
        draw_bbox_keypoints_line(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov8/yolov8_pose_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Detection thresholds (presumably the minimum confidence to keep a box and the
# overlap above which a box is suppressed — confirm against post-processing).
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
# Sizes, in floats, of the parts of one detection record.
# POSE_NUM is 17 * 3 — presumably 17 keypoints with 3 values each (TODO confirm).
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1
# Pairs of keypoint indices (0..16); presumably the skeleton segments drawn
# between keypoints — verify against the drawing code.
keypoint_pairs = [
    (0, 1), (0, 2), (0, 5), (0, 6), (1, 2),
    (1, 3), (2, 4), (5, 6), (5, 7), (5, 11),
    (6, 8), (6, 12), (7, 9), (8, 10), (11, 12),
    (11, 13), (12, 14), (13, 15), (14, 16)
]


def get_img_path_batches(batch_size, img_dir):
    """
    description: Collect every file below img_dir (recursively, in os.walk order)
                 and group the paths into consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory to walk
    return:
        list of lists of paths; every batch holds batch_size paths except
        possibly the last one, and no trailing empty batch is produced
    """
    all_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )

    batches = []
    current = []
    for path in all_paths:
        # Close the running batch as soon as it is full, then start a new one
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one axis-aligned bounding box, and optionally a label
                 above its top-left corner, on img in place.
    param:
        x:      a box likes [x1,y1,x2,y2]
        img:    a opencv image object
        color:  color to draw rectangle, such as (0,255,0); a random color is
                picked when it is falsy
        label:  str, drawn on a filled background when truthy
        line_thickness: int, derived from the image size when falsy
    return:
        no return

    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    color = color or [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The label background extends from the top-left corner up and to the right
    label_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, label_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov8TRT(object):
    """
    description: A YOLOv8 pose class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and allocate
                     one host/device buffer pair per engine binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            # Per-sample volume times the maximum batch size the engine was built for.
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The two trailing dimensions of the input binding are read as (H, W).
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # Length of the first output buffer; infer() uses it as the per-image
        # stride when slicing that buffer.
        self.det_output_size = host_outputs[0].shape[0]

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine on it and
                     draw the resulting boxes and skeletons on the images.
        param:
            raw_image_generator: iterable yielding at most batch_size BGR images
        return:
            batch_image_raw: list of the input images with detections drawn in place
            elapsed:         seconds spent on host-to-device copy, execution and
                             device-to-host copy
        note:
            Box labels are looked up in the module-level `categories` list.
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that the unused slots of a partial batch hold
            # defined values instead of uninitialised memory.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in `finally` so a failure above cannot leave it pushed.
            self.ctx.pop()

        output = host_outputs[0]
        # Do postprocess. Only the images actually supplied are processed: the
        # last batch of a directory may contain fewer than batch_size images.
        for i in range(len(batch_image_raw)):

            result_boxes, result_scores, result_classid, keypoints = self.post_process(
                output[i * (self.det_output_size): (i + 1) * (self.det_output_size)],
                batch_origin_h[i], batch_origin_w[i]
            )

            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )

                # Keypoints are stored as flat (x, y, confidence) triples.
                num_keypoints = len(keypoints[j]) // 3
                points = []
                for k in range(num_keypoints):
                    x = keypoints[j][k * 3]
                    y = keypoints[j][k * 3 + 1]
                    confidence = keypoints[j][k * 3 + 2]
                    if confidence > 0:
                        points.append((int(x), int(y)))
                    else:
                        points.append(None)

                # Draw a line for every keypoint index pair whose two ends are present
                for pair in keypoint_pairs:
                    partA, partB = pair
                    if points[partA] and points[partB]:
                        cv2.line(batch_image_raw[i], points[partA], points[partB], (0, 255, 0), 2)

        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context that __init__ created and left active.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of image paths
        return:
            a generator yielding the result of cv2.imread for each path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        param:
            image_path_batch: unused
        return:
            a generator yielding batch_size black images of the engine input size
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: a BGR image array of shape (h, w, c)
        return:
            image:  the processed image, shape (1, 3, input_h, input_w), float32
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints):
        """
        description: Map boxes and keypoints from the padded network input back
                     to original-image coordinates by removing the padding
                     offset and dividing by the resize ratio.
                     Despite the name, no center/size conversion is performed:
                     columns 0 and 2 are handled as x values and columns 1 and 3
                     as y values.
        param:
            origin_h:  height of the original image
            origin_w:  width of the original image
            boxes:     array of shape (n, 4)
            keypoints: array of shape (n, 3 * k), flat (x, y, confidence) triples
        return:
            box_array:      rescaled boxes, same shape and dtype as boxes
            keypoint_array: rescaled keypoints, confidences copied unchanged
        """
        n = len(boxes)
        box_array = np.zeros_like(boxes)
        keypoint_array = np.zeros_like(keypoints)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        for i in range(n):
            box = boxes[i]
            lmk = keypoints[i]
            if r_h > r_w:
                # Image was padded top and bottom: shift y, scale by r_w.
                pad_y = (self.input_h - r_w * origin_h) / 2
                box_array[i, 0] = box[0] / r_w
                box_array[i, 2] = box[2] / r_w
                box_array[i, 1] = (box[1] - pad_y) / r_w
                box_array[i, 3] = (box[3] - pad_y) / r_w

                for j in range(0, len(lmk), 3):
                    keypoint_array[i, j] = lmk[j] / r_w
                    keypoint_array[i, j + 1] = (lmk[j + 1] - pad_y) / r_w
                    keypoint_array[i, j + 2] = lmk[j + 2]
            else:
                # Image was padded left and right: shift x, scale by r_h.
                pad_x = (self.input_w - r_h * origin_w) / 2
                box_array[i, 0] = (box[0] - pad_x) / r_h
                box_array[i, 2] = (box[2] - pad_x) / r_h
                box_array[i, 1] = box[1] / r_h
                box_array[i, 3] = box[3] / r_h

                for j in range(0, len(lmk), 3):
                    keypoint_array[i, j] = (lmk[j] - pad_x) / r_h
                    keypoint_array[i, j + 1] = lmk[j + 1] / r_h
                    keypoint_array[i, j + 2] = lmk[j + 2]

        return box_array, keypoint_array

    def post_process(self, output, origin_h, origin_w):
        """
        description: Post-process the prediction to include pose keypoints
        param:
            output:     A flat numpy array: element 0 is the number of
                        detections, followed by fixed-size detection records of
                        DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM floats each
                        (4 box values, conf, cls_id, ..., keypoint triples, angle)
            origin_h:   Height of original image
            origin_w:   Width of original image
        return:
            result_boxes:    Final boxes, a numpy array, each row is a box [x1, y1, x2, y2]
            result_scores:   Final scores, a numpy array, each element is the score corresponding to box
            result_classid:  Final classID, a numpy array, each element is the classid corresponding to box
            result_keypoints: Final keypoints, a numpy array with one row of
                        POSE_NUM values (x, y, confidence triples) per box
            All four are empty arrays when nothing survives filtering.
        """
        # Number of values per detection: 38 base values + 17 keypoints * 3 values each + angle
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the number of boxes detected
        num = int(output[0])
        # Reshape to a two-dimensional ndarray with the full detection shape
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]

        # Perform non-maximum suppression to filter the detections
        boxes = self.non_max_suppression(
            pred[:, :num_values_per_detection], origin_h, origin_w,
            conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)

        # Extract the bounding boxes, confidence scores, and class IDs
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        # Keypoints are the POSE_NUM values right before the final (angle) column.
        result_keypoints = boxes[:, -POSE_NUM-1:-1] if len(boxes) else np.array([])

        # Return the post-processed results including keypoints
        return result_boxes, result_scores, result_classid, result_keypoints

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A 2-D array of boxes (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A 2-D array of boxes (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou, broadcast over the rows of box1 and box2
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 treats coordinates as inclusive pixel indices)
        inter_area = np.clip(
            inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon guards against a zero denominator.
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, one full record per row; columns 0-3 are the
                        box, 4 the confidence, 5 the class id, and the POSE_NUM
                        columns before the last one the keypoints
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: the surviving rows, with box and keypoint columns mapped to
                   original-image coordinates; an empty array when none survive
        """
        # Get the boxes that score >= conf_thres
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map boxes and keypoints from network-input to original-image coordinates
        res_array = np.copy(boxes)
        box_pred_deep_copy = np.copy(boxes[:, :4])
        keypoints_pred_deep_copy = np.copy(boxes[:, -POSE_NUM-1:-1])
        res_box, res_keypoints = self.xywh2xyxy_with_keypoints(
            origin_h, origin_w, box_pred_deep_copy, keypoints_pred_deep_copy)
        res_array[:, :4] = res_box
        res_array[:, -POSE_NUM-1:-1] = res_keypoints
        # clip the coordinates
        res_array[:, 0] = np.clip(res_array[:, 0], 0, origin_w - 1)
        res_array[:, 2] = np.clip(res_array[:, 2], 0, origin_w - 1)
        res_array[:, 1] = np.clip(res_array[:, 1], 0, origin_h - 1)
        res_array[:, 3] = np.clip(res_array[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = res_array[:, 4]
        # Sort by the confs
        res_array = res_array[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_res_array = []
        while res_array.shape[0]:
            # Row 0 always overlaps itself, so it is removed on every pass and
            # the loop terminates.
            large_overlap = self.bbox_iou(np.expand_dims(res_array[0, :4], 0), res_array[:, :4]) > nms_thres
            label_match = res_array[0, 5] == res_array[:, 5]
            invalid = large_overlap & label_match
            keep_res_array.append(res_array[0])
            res_array = res_array[~invalid]

        res_array = np.stack(keep_res_array, 0) if len(keep_res_array) else np.array([])
        return res_array


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and
                 writes every annotated image into the 'output' directory.
    """

    def __init__(self, yolov8_wrapper, image_path_batch):
        super(inferThread, self).__init__()
        self.image_path_batch = image_path_batch
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = wrapper.infer(raw_images)
        for index, img_path in enumerate(self.image_path_batch):
            # Keep only the file name; results are saved flat under output/.
            filename = os.path.basename(img_path)
            save_name = os.path.join('output', filename)
            cv2.imwrite(save_name, batch_image_raw[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference pass on all-zero images and
                 prints the shape of the first image and the elapsed time.
    """

    def __init__(self, yolov8_wrapper):
        super(warmUpThread, self).__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        blank_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(blank_images)
        first_shape = batch_image_raw[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, use_time * 1000))


if __name__ == "__main__":
    # Usage: <script> [engine file] [plugin library]; both arguments are optional.
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "yolov8n-pose.engine"
    PLUGIN_LIBRARY = sys.argv[2] if len(sys.argv) > 2 else "./build/libmyplugins.so"

    # Load the custom plugin library before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # Class names used for box labels; read by YoLov8TRT.infer as a module global.
    categories = ["person"]

    # Start from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread and run to completion.
        for _ in range(10):
            warmup_thread = warmUpThread(yolov8_wrapper)
            warmup_thread.start()
            warmup_thread.join()

        # One thread per batch, run sequentially.
        for batch in image_path_batches:
            infer_thread = inferThread(yolov8_wrapper, batch)
            infer_thread.start()
            infer_thread.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()


================================================
FILE: yolov8/yolov8_seg.cpp
================================================

#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

// Logger handed to the TensorRT builder and runtime created in this file.
Logger gLogger;
using namespace nvinfer1;
// Floats in one image's detection output: room for kMaxNumOutputBbox Detection
// records plus one extra float (presumably the detection count -- TODO confirm
// against the decode plugin).
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Floats in one image's mask prototype output: 32 planes, each a quarter of the
// network input size in both dimensions.
const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4);

// Builds the integer rectangle that a box occupies after dividing its
// coordinates by `scale`.
// The box is read as bbox[0], bbox[1] = left/top corner and bbox[2], bbox[3] =
// extents added to that corner. The left/top edges are clamped at 0 and the
// right/bottom edges at kInputW/kInputH before scaling.
static cv::Rect get_downscale_rect(float bbox[4], float scale) {
    float x_min = bbox[0];
    float y_min = bbox[1];
    float x_max = bbox[0] + bbox[2];
    float y_max = bbox[1] + bbox[3];

    // Clamp to the network input area.
    if (x_min < 0) {
        x_min = 0;
    }
    if (y_min < 0) {
        y_min = 0;
    }
    if (x_max > kInputW) {
        x_max = kInputW;
    }
    if (y_max > kInputH) {
        y_max = kInputH;
    }

    // Move into the downscaled coordinate system.
    x_min /= scale;
    y_min /= scale;
    x_max /= scale;
    y_max /= scale;

    const int rect_x = int(x_min);
    const int rect_y = int(y_min);
    const int rect_w = int(x_max - x_min);
    const int rect_h = int(y_max - y_min);
    return cv::Rect(rect_x, rect_y, rect_w, rect_h);
}

std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets) {

    std::vector<cv::Mat> masks;
    for (size_t i = 0; i < dets.size(); i++) {

        cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1);
        auto r = get_downscale_rect(dets[i].bbox, 4);

        for (int x = r.x; x < r.x + r.width; x++) {
            for (int y = r.y; y < r.y + r.height; y++) {
                float e = 0.0f;
                for (int j = 0; j < 32; j++) {
                    e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
                }
                e = 1.0f / (1.0f + expf(-e));
                mask_mat.at<float>(y, x) = e;
            }
        }
        cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH));
        masks.push_back(mask_mat);
    }
    return masks;
}

void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& sub_type, float& gd, float& gw,
                      int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    serialized_engine = buildEngineYolov8Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);

    assert(serialized_engine);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

// Allocates the device buffers for the three engine bindings and the host (or
// decode) buffers required by the selected post-processing mode.
//   engine:                   engine expected to expose exactly 3 bindings:
//                             input (0), detection output (1), "proto" (2)
//   input_buffer_device:      receives kBatchSize * 3 * kInputH * kInputW floats
//   output_buffer_device:     receives kBatchSize * kOutputSize floats
//   output_seg_buffer_device: receives kBatchSize * kOutputSegSize floats
//   output_buffer_host / output_seg_buffer_host:
//                             allocated with new[] only when cuda_post_process == "c"
//   decode_ptr_host / decode_ptr_device:
//                             allocated only when cuda_post_process == "g"
//   cuda_post_process:        "c" for CPU post-processing, "g" for GPU
// Pointers that the selected mode does not need are left untouched. The process
// exits with a failure status when GPU post-processing is requested with
// kBatchSize > 1.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host,
                    float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) {
    assert(engine->getNbBindings() == 3);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    const int outputIndex_seg = engine->getBindingIndex("proto");

    assert(inputIndex == 0);
    assert(outputIndex == 1);
    assert(outputIndex_seg == 2);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float)));

    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
        *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            // Unsupported configuration: report failure to the caller of the
            // program instead of a success status.
            exit(1);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}

// Runs the engine on one batch and fetches the results needed by the selected
// post-processing mode, then prints the elapsed time.
//   context:           execution context used to enqueue the batch
//   stream:            CUDA stream all work is queued on
//   buffers:           device bindings; [1] is the detection output, [2] the proto output
//   output:            host destination for buffers[1] ("c" mode)
//   output_seg:        host destination for buffers[2] ("c" mode)
//   batchsize:         batch size passed to enqueue and used to size the copies
//   decode_ptr_host / decode_ptr_device:
//                      host/device buffers for decoded boxes ("g" mode)
//   model_bboxes:      forwarded to cuda_decode ("g" mode)
//   cuda_post_process: "c" copies raw outputs to the host; "g" decodes and runs
//                      NMS on the device and copies the decoded boxes back;
//                      any other value only runs the engine
// The function returns after the stream has been synchronized. The elapsed time
// is read after that synchronization so that it covers the queued GPU work and
// copies rather than only the time needed to enqueue them.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg,
           int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes,
           std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);

    // Label of the timing line; stays null for unknown modes, which print nothing.
    const char* timing_label = nullptr;
    if (cuda_post_process == "c") {

        std::cout << "kOutputSize:" << kOutputSize << std::endl;
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        std::cout << "kOutputSegSize:" << kOutputSegSize << std::endl;
        CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream));
        timing_label = "inference time: ";
    } else if (cuda_post_process == "g") {
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        timing_label = "inference and gpu postprocess time: ";
    }

    // Wait for everything queued above before stopping the clock.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (timing_label != nullptr) {
        auto end = std::chrono::system_clock::now();
        std::cout << timing_label << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    }
}

// Parses the command line into the output parameters.
// Two forms are accepted:
//   -s <wts> <engine> <n|s|m|l|x>             fills wts, engine, sub_type and the
//                                             gd / gw / max_channels of that sub-type
//   -d <engine> <img_dir> <c|g> <labels file> fills engine, img_dir,
//                                             cuda_post_process and labels_filename
// Returns true when the arguments match one of the forms, false otherwise.
// Output parameters that the matched form does not cover are left untouched.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, std::string& labels_filename, float& gd,
                float& gw, int& max_channels) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);

    if (mode == "-s" && argc == 5) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        sub_type = std::string(argv[4]);

        // Every valid sub-type is exactly one character long.
        if (sub_type.size() != 1)
            return false;

        switch (sub_type[0]) {
            case 'n':
                gd = 0.33;
                gw = 0.25;
                max_channels = 1024;
                break;
            case 's':
                gd = 0.33;
                gw = 0.50;
                max_channels = 1024;
                break;
            case 'm':
                gd = 0.67;
                gw = 0.75;
                max_channels = 576;
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                max_channels = 512;
                break;
            case 'x':
                gd = 1.0;
                gw = 1.25;
                max_channels = 640;
                break;
            default:
                return false;
        }
        return true;
    }

    if (mode == "-d" && argc == 6) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
        labels_filename = std::string(argv[5]);
        return true;
    }

    return false;
}

// Entry point.
// With "-s" the engine is built from a weights file, written to disk, and the
// program returns. With "-d" a plan file is loaded and every image found in the
// given directory is processed in batches of kBatchSize; in "c" mode the result
// image with masks and boxes is written to the current directory as
// "_<original file name>".
// Returns 0 on success and -1 on bad arguments or when the image directory
// cannot be read.
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    std::string labels_filename = "../coco.txt";
    int model_bboxes;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov8 -d [.engine] ../samples  [c/g] coco_file// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    // (wts_name is only filled by the "-s" form of the command line).
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, sub_type, gd, gw, max_channels);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // First dimension of binding 1 (the detection output); forwarded to infer(),
    // which hands it to cuda_decode in "g" mode.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    // device_buffers[0..2] = input, detection output, proto output.
    float* device_buffers[3];
    float* output_buffer_host = nullptr;
    float* output_seg_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    // NOTE(review): this early return skips the release code at the end of main.
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    std::unordered_map<int, std::string> labels_map;
    read_labels(labels_filename, labels_map);
    // The labels file must provide exactly kNumClass entries.
    assert(kNumClass == labels_map.size());

    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host,
                   &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        // (the last batch may contain fewer than kBatchSize images)
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        // (always with kBatchSize, even when img_batch is smaller)
        infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize,
              decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
            for (size_t b = 0; b < img_batch.size(); b++) {
                auto& res = res_batch[b];
                cv::Mat img = img_batch[b];
                // Each image owns kOutputSegSize floats of the proto output.
                auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res);
                draw_mask_bbox(img, res, masks, labels_map);
                cv::imwrite("_" + img_name_batch[b], img);
            }
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            // batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
            // todo seg in gpu
            std::cerr << "seg_postprocess is not support in gpu right now" << std::endl;
        }
    }

    // Release stream and buffers
    // (pointers the selected mode never allocated are still nullptr here)
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(device_buffers[2]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    delete[] output_seg_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    // std::cout << "\nOutput:\n\n";
    // for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    // std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov8/yolov8_seg_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Detection filtering thresholds.
# NOTE(review): the code that reads these constants lies beyond the visible part
# of this file; the descriptions below follow the names -- confirm against usage.
CONF_THRESH = 0.5  # presumably the minimum confidence for a detection to be kept
IOU_THRESHOLD = 0.4  # presumably the IoU threshold used by non-maximum suppression
# Presumably the sizes (in floats) of the parts of one detection record.
POSE_NUM = 17 * 3  # 17 keypoints times 3 values each
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into consecutive lists of at most batch_size entries.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory to scan
    return:
        a list of batches, each batch being a list of file paths;
        the last batch may hold fewer than batch_size paths
    """
    all_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )
    batches = []
    current = []
    for path in all_paths:
        # Flush the running batch once it is full, before adding the next path
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag
                 above its top-left corner, directly onto img.
    param:
        x:      a box like [x1, y1, x2, y2]
        img:    an opencv image object, modified in place
        color:  color of the rectangle, such as (0,255,0); random when omitted
        label:  str, text to draw; nothing is written when empty or None
        line_thickness: int, derived from the image size when omitted
    return:
        no return

    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]
    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)
    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # Filled background for the text, sitting just above the box corner
    tag_corner = top_left[0] + text_w, top_left[1] - text_h - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class YoLov8TRT(object):
    """
    description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the TensorRT engine and
                     allocate one page-locked host buffer plus one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized engine file
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            binding_shape = engine.get_binding_shape(binding)
            print('bingding:', binding, binding_shape)
            # Every buffer holds the whole batch: per-image volume times the max batch size
            size = trt.volume(binding_shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = binding_shape[-1]
                self.input_h = binding_shape[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

        # Data length, per image. The host buffers were sized for the whole batch above,
        # so divide by the batch size; infer() uses these values as a per-image stride.
        self.det_output_length = host_outputs[0].shape[0] // self.batch_size
        self.seg_output_length = host_outputs[1].shape[0] // self.batch_size
        self.seg_w = int(self.input_w / 4)
        self.seg_h = int(self.input_h / 4)
        # Channel count = per-image length / (height * width); using seg_w twice was only
        # correct for square network inputs.
        self.seg_c = int(self.seg_output_length / (self.seg_h * self.seg_w))
        self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + OBB_NUM

        # Draw mask
        self.colors_obj = Colors()

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine on it, then decode boxes and
                     masks and draw them onto the original images.
        param:
            raw_image_generator: iterable yielding BGR images (at most batch_size of them)
        return:
            batch_image_raw: list of the original images with masks, boxes and labels drawn
            use_time:        seconds spent on host->device copy, inference and device->host copy
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that slots left unused by a short final batch hold defined values
            batch_input_image = np.zeros(
                shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32
            )
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)

            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it,
            # even when preprocessing or inference raised.
            self.ctx.pop()

        output = host_outputs[0]
        output_proto_mask = host_outputs[1]
        # The host buffers hold the whole batch back to back; derive the per-image stride
        # from the buffer itself so each image gets its own slice.
        det_len = output.shape[0] // self.batch_size
        seg_len = output_proto_mask.shape[0] // self.batch_size
        # Do postprocess, only for the images that were actually supplied
        # (the last batch of a directory may be shorter than batch_size).
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid, result_proto_coef = self.post_process(
                output[i * det_len: (i + 1) * det_len], batch_origin_h[i],
                batch_origin_w[i]
            )

            if result_proto_coef.shape[0] == 0:
                continue
            result_masks = self.process_mask(output_proto_mask[i * seg_len: (i + 1) * seg_len],
                                             result_proto_coef, result_boxes, batch_origin_h[i],
                                             batch_origin_w[i])

            self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid],
                           im_src=batch_image_raw[i])

            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert a BGR image to RGB, resize it so it fits the network input
                     while keeping its aspect ratio, pad the remainder with gray
                     (128,128,128), normalize to [0,1] and lay it out as NCHW.
        param:
            raw_bgr_image: a 3-channel BGR image array (H, W, C)
        return:
            image:  the processed image, float32, shape (1, 3, input_h, input_w)
            image_raw: the original image, untouched
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, _channels = image_raw.shape
        rgb = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)

        # The smaller of the two ratios decides the scale; the other axis gets padded
        ratio_w = self.input_w / w
        ratio_h = self.input_h / h
        if ratio_h > ratio_w:
            new_w, new_h = self.input_w, int(ratio_w * h)
            pad_left = pad_right = 0
            pad_top = int((self.input_h - new_h) / 2)
            pad_bottom = self.input_h - new_h - pad_top
        else:
            new_w, new_h = int(ratio_h * w), self.input_h
            pad_left = int((self.input_w - new_w) / 2)
            pad_right = self.input_w - new_w - pad_left
            pad_top = pad_bottom = 0

        resized = cv2.resize(rgb, (new_w, new_h))
        padded = cv2.copyMakeBorder(
            resized, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )

        # float32 in [0,1], then HWC -> CHW -> NCHW, stored in C order
        tensor = padded.astype(np.float32)
        tensor /= 255.0
        tensor = np.expand_dims(np.transpose(tensor, [2, 0, 1]), axis=0)
        tensor = np.ascontiguousarray(tensor)
        return tensor, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: Decode the flat detection output of one image and run NMS on it
        param:
            output:     A flat numpy array: element 0 is the number of detections, followed by
                        fixed-length detection rows of det_row_output_length values each
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score of the matching box
            result_classid: final class ids, a numpy, each element is the class id of the matching box
            result_proto_coef: mask coefficients of the kept boxes, SEG_NUM values per row
            (all four are empty arrays when nothing survives NMS)
        """
        # The leading element tells how many rows are valid
        count = int(output[0])
        rows = np.reshape(output[1:], (-1, self.det_row_output_length))
        pred = rows[:count, :]

        kept = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        if not len(kept):
            return np.array([]), np.array([]), np.array([]), np.array([])

        coef_end = int(DET_NUM + SEG_NUM)
        return kept[:, :4], kept[:, 4], kept[:, 5], kept[:, DET_NUM:coef_end]

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def scale_mask(self, mask, ih, iw):
        """
        description: Bring one mask to original-image size: resize it to the network input
                     size, cut away the padded border, then resize the remainder to (iw, ih)
        param:
            mask: a single 2-D mask
            ih:   rows of original image
            iw:   cols of original image
        return:
            the mask resized to ih rows by iw cols
        """
        full = cv2.resize(mask, (self.input_w, self.input_h))
        ratio_w = self.input_w / (iw * 1.0)
        ratio_h = self.input_h / (ih * 1.0)
        if ratio_h > ratio_w:
            # Image content spans the full width; padding is on top and bottom
            roi_w, roi_h = self.input_w, int(ratio_w * ih)
            left, top = 0, int((self.input_h - roi_h) / 2)
        else:
            # Image content spans the full height; padding is on left and right
            roi_w, roi_h = int(ratio_h * iw), self.input_h
            left, top = int((self.input_w - roi_w) / 2), 0
        content = full[top:top + roi_h, left:left + roi_w]
        return cv2.resize(content, (iw, ih))

    def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
        """
        description: Mask pred by yolov8 instance segmentation ,
        param:
            output_proto_mask: prototype mask e.g. (32, 160, 160) for 640x640 input
            result_proto_coef: prototype mask coefficients (n, 32), n represents n results
            result_boxes     :
            ih: rows of original image
            iw: cols of original image
        return:
            mask_result: (n, ih, iw)
        """
        result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
        c, mh, mw = result_proto_masks.shape
        print(result_proto_masks.shape)
        print(result_proto_coef.shape)
        masks = self.sigmoid((result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1))).reshape(-1, mh,
                                                                                                                 mw)

        mask_result = []
        for mask, box in zip(masks, result_boxes):
            mask_s = np.zeros((ih, iw))
            crop_mask = self.scale_mask(mask, ih, iw)
            x1 = int(box[0])
            y1 = int(box[1])
            x2 = int(box[2])
            y2 = int(box[3])
            crop = crop_mask[y1:y2, x1:x2]
            crop = np.where(crop >= 0.5, 1, 0)
            crop = crop.astype(np.uint8)
            mask_s[y1:y2, x1:x2] = crop

            mask_result.append(mask_s)
        mask_result = np.array(mask_result)
        return mask_result

    def draw_mask(self, masks, colors_, im_src, alpha=0.5):
        """
        description: Draw mask on image ,
        param:
            masks  : result_mask
            colors_: color to draw mask
            im_src : original image
            alpha  : scale between original  image and mask
        return:
            no return
        """
        if len(masks) == 0:
            return
        masks = np.asarray(masks, dtype=np.uint8)
        masks = np.ascontiguousarray(masks.transpose(1, 2, 0))
        masks = np.asarray(masks, dtype=np.float32)
        colors_ = np.asarray(colors_, dtype=np.float32)
        s = masks.sum(2, keepdims=True).clip(0, 1)
        masks = (masks @ colors_).clip(0, 255)
        im_src[:] = masks * alpha + im_src * (1 - s * alpha)


class inferThread(threading.Thread):
    """Thread that runs one batch of image paths through the wrapper and saves the results."""

    def __init__(self, yolov8_wrapper, image_path_batch):
        super().__init__()
        self.image_path_batch = image_path_batch
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        """Infer on the batch, then write each annotated image into the 'output' directory."""
        wrapper = self.yolov8_wrapper
        frames = wrapper.get_raw_image(self.image_path_batch)
        annotated, elapsed = wrapper.infer(frames)
        for index, source_path in enumerate(self.image_path_batch):
            # Save image under its original file name
            target_path = os.path.join('output', os.path.basename(source_path))
            cv2.imwrite(target_path, annotated[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, elapsed * 1000))


class warmUpThread(threading.Thread):
    """Thread that pushes one batch of blank frames through the wrapper as a warm-up."""

    def __init__(self, yolov8_wrapper):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        """Run inference on all-zero frames and report the frame shape and elapsed time."""
        wrapper = self.yolov8_wrapper
        blank_frames = wrapper.get_raw_image_zeros()
        results, elapsed = wrapper.infer(blank_frames)
        first_shape = results[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed * 1000))


class Colors:
    """Fixed palette of 20 colors, looked up by index with wrap-around."""

    def __init__(self):
        hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A',
                '92CC17', '3DDB86', '1A9334', '00D4BB', '2C99A8', '00C2FF',
                '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF',
                'FF95C8', 'FF37C7')
        self.palette = [self.hex2rgb('#' + code) for code in hexs]
        self.n = len(self.palette)

    def __call__(self, i, bgr=False):
        """Return palette entry i (modulo the palette size) as (r, g, b), or (b, g, r) when bgr is set."""
        rgb = self.palette[int(i) % self.n]
        if bgr:
            return (rgb[2], rgb[1], rgb[0])
        return rgb

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        """Parse a '#RRGGBB' string into an (r, g, b) tuple of ints."""
        return tuple(int(h[start:start + 2], 16) for start in (1, 3, 5))


if __name__ == "__main__":
    # Usage: <script> [engine_file] [plugin_library]
    # load custom plugin and engine, falling back to the defaults when not given
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "yolov8n-seg.engine"
    PLUGIN_LIBRARY = sys.argv[2] if len(sys.argv) > 2 else "./build/libmyplugins.so"

    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels; YoLov8TRT.infer looks this name up as a module global
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]

    # start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)

        # ten warm-up rounds, each in its own thread and run to completion
        for _ in range(10):
            warmup_worker = warmUpThread(yolov8_wrapper)
            warmup_worker.start()
            warmup_worker.join()

        # one thread per batch, run one after another
        for batch in image_path_batches:
            infer_worker = inferThread(yolov8_wrapper, batch)
            infer_worker.start()
            infer_worker.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()


================================================
FILE: yolov9/CMakeLists.txt
================================================
# Build script for the yolov9 demo: one shared library with the TensorRT plugins
# (myplugins) and one executable (yolov9) linking against it.
cmake_minimum_required(VERSION 3.10)

project(TRTCreater)

# -w silences every compiler warning
add_definitions(-w)
add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CUDA_ARCHITECTURES 75 86 89)

# Per-platform toolchain locations. The paths below are machine-specific and
# must be adapted to the local CUDA / TensorRT / OpenCV installation.
MESSAGE(STATUS "operation system is ${CMAKE_SYSTEM}") 
IF (CMAKE_SYSTEM_NAME MATCHES "Linux")
    MESSAGE(STATUS "current platform: Linux ")
    set(CUDA_COMPILER_PATH "/usr/local/cuda/bin/nvcc")
    set(TENSORRT_PATH "/home/benol/Package/TensorRT-8.6.1.6")
    include_directories(/usr/local/cuda/include)
    link_directories(/usr/local/cuda/lib64)
    link_directories(/usr/local/cuda/lib)
ELSEIF (CMAKE_SYSTEM_NAME MATCHES "Windows")
    MESSAGE(STATUS "current platform: Windows")
    set(CUDA_COMPILER_PATH "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe")
    set(TENSORRT_PATH "D:\\Program Files\\TensorRT-8.6.1.6")
    set(OpenCV_DIR "D:\\Program Files\\opencv\\build")
    include_directories(${PROJECT_SOURCE_DIR}/windows)
    find_package(CUDA REQUIRED)
    include_directories(${CUDA_INCLUDE_DIRS})
    # NOTE(review): CUDA_LIBRARIES presumably holds library files rather than
    # directories - confirm whether link_directories is the intended command here.
    link_directories(${CUDA_LIBRARIES})
# NOTE(review): else() does not test a condition, so this branch is taken on every
# system that is neither Linux nor Windows - confirm whether ELSEIF was intended.
# This branch sets neither CUDA_COMPILER_PATH nor TENSORRT_PATH, both used below.
ELSE (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    MESSAGE(STATUS "other platform: ${CMAKE_SYSTEM_PROCESSOR}")
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
ENDIF (CMAKE_SYSTEM_NAME MATCHES "Linux")
# The CUDA compiler must be chosen before the language is enabled
set(CMAKE_CUDA_COMPILER ${CUDA_COMPILER_PATH})
enable_language(CUDA)

# tensorrt
include_directories(${TENSORRT_PATH}/include)
link_directories(${TENSORRT_PATH}/lib)

# OpenCV is looked up without REQUIRED, so configuration continues when it is missing
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

include_directories(${PROJECT_SOURCE_DIR}/include/)
include_directories(${PROJECT_SOURCE_DIR}/plugin/)

# Sources are collected by glob: src/ feeds the executable, plugin/*.cu feeds the plugin library
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)

# add_library(myplugins SHARED ${PLUGIN_SRCS})
add_library(myplugins SHARED ${PLUGIN_SRCS})
target_link_libraries(myplugins nvinfer cudart)

add_executable(yolov9 demo.cpp ${SRCS})
target_link_libraries(yolov9 nvinfer cudart myplugins ${OpenCV_LIBS})


================================================
FILE: yolov9/README.md
================================================
# YOLOv9

The Pytorch implementation is [WongKinYiu/yolov9](https://github.com/WongKinYiu/yolov9).

## Contributors

<a href="https://github.com/WuxinrongY"><img src="https://avatars.githubusercontent.com/u/53141838?v=4?s=48" width="40px;" alt=""/></a>

## Progress
- [x] YOLOv9-t
- [x] YOLOv9-t-convert(gelan)
- [x] YOLOv9-s
- [x] YOLOv9-s-convert(gelan)
- [x] YOLOv9-m
- [x] YOLOv9-m-convert(gelan)
- [x] YOLOv9-c
- [x] YOLOv9-c-convert(gelan)
- [x] YOLOv9-e
- [x] YOLOv9-e-convert(gelan)

## Requirements

- TensorRT 8.0+
- OpenCV 3.4.0+

## Speed Test

The speed test was run on a desktop with an R7-5700G CPU and an RTX 4060Ti GPU. The input size is 640x640. FP32, FP16 and INT8 models were tested. The reported time covers inference only (pre-processing and post-processing are excluded) and is the average over 1000 inference runs.

| frame  | Model | FP32 | FP16 | INT8 |
| --- | --- | --- | --- | --- |
| tensorrt | YOLOv5-n | -ms | 0.58ms | -ms |
| tensorrt | YOLOv5-s | -ms | 0.90ms | -ms |
| tensorrt | YOLOv5-m | -ms | 1.9ms | -ms |
| tensorrt | YOLOv5-l | -ms | 2.8ms | -ms |
| tensorrt | YOLOv5-x | -ms | 5.1ms | -ms |
| tensorrt | YOLOv9-t-convert | -ms | 1.37ms | -ms |
| tensorrt | YOLOv9-s | -ms | 1.78ms | -ms |
| tensorrt | YOLOv9-s-convert | -ms | 1.78ms | -ms |
| tensorrt | YOLOv9-m | -ms | 3.1ms | -ms |
| tensorrt | YOLOv9-m-convert | -ms | 2.8ms | -ms |
| tensorrt | YOLOv9-c | 13.5ms | 4.6ms | 3.0ms |
| tensorrt | YOLOv9-e | 8.3ms | 3.2ms | 2.15ms |

**GELAN will be updated later.**

YOLOv9-e is faster than YOLOv9-c in TensorRT because YOLOv9-e requires fewer layers for inference.

```
YOLOv9-c:
[[31, 34, 37, 16, 19, 22], 1, DualDDetect, [nc]] # [A3, A4, A5, P3, P4, P5]

YOLOv9-e:
[[35, 32, 29, 42, 45, 48], 1, DualDDetect, [nc]]

```

In DualDDetect, the A3, A4, A5, P3, P4, P5 are the output of the backbone. The first 3 layers are used for the inference of the final result.

The YOLOv9-c requires 37 layers of inference, but YOLOv9-e requires 35 layers of inference.

## How to Run, yolov9 as example

1. Generate the .wts file from the PyTorch .pt weights, or download a .wts file from the model zoo

```
// download https://github.com/WongKinYiu/yolov9
cp {tensorrtx}/yolov9/gen_wts.py {yolov9}/yolov9
cd {yolov9}/yolov9
python gen_wts.py
// a file 'yolov9.wts' will be generated.
```
2. build tensorrtx/yolov9 and run

```
cd {tensorrtx}/yolov9/
// update kNumClass in config.h if your model is trained on custom dataset
mkdir build
cd build
cp {yolov9}/yolov9/yolov9.wts {tensorrtx}/yolov9/build
cmake ..
make
sudo ./yolov9 -s [.wts] [.engine] [t/s/m/c/e/gt/gs/gm/gc/ge]  // serialize model to plan file
sudo ./yolov9 -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed.
// For example yolov9
sudo ./yolov9 -s yolov9-c.wts yolov9-c.engine c
sudo ./yolov9 -d yolov9-c.engine ../images
```

3. check the images generated, as follows. _zidane.jpg and _bus.jpg

4. optional, load and run the tensorrt model in python

```
// install python-tensorrt, pycuda, etc.
// ensure the yolov9.engine and libmyplugins.so have been built
python yolov9_trt.py
```

# INT8 Quantization

1. Prepare calibration images: you can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh

2. unzip it in yolov9/build

3. set the macro `USE_INT8` in config.h and change the path of calibration images in config.h, such as 'gCalibTablePath="./coco_calib/";'

4. serialize the model and test

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg" height="360px;">
</p>

## More Information

See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)


================================================
FILE: yolov9/demo.cpp
================================================
#include <chrono>
#include <fstream>
#include "config.h"
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

using namespace nvinfer1;
// Number of floats in the output buffer of one image: room for kMaxNumOutputBbox
// Detection records plus one extra float (presumably the leading detection
// count - TODO confirm against the decode plugin).
const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Logger handed to the TensorRT builder and runtime created in this file.
static Logger gLogger;
// Builds the network selected by sub_type from the weights in wts_name and writes
// the serialized engine (plan) to engine_name.
//   max_batchsize: maximum batch size the engine is built for
//   wts_name:      path of the .wts weight file
//   sub_type:      model variant: t/s/m/c/e, or gt/gs/gm/gc/ge
//   engine_name:   path of the plan file to write
// An unsupported sub_type is reported on stderr and nothing is written.
void serialize_engine(unsigned int max_batchsize, std::string& wts_name, std::string& sub_type,
                      std::string& engine_name) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    IHostMemory* serialized_engine = nullptr;
    if (sub_type == "t") {
        serialized_engine = build_engine_yolov9_t(max_batchsize, builder, config, DataType::kFLOAT, wts_name, false);
    } else if (sub_type == "s") {
        serialized_engine = build_engine_yolov9_s(max_batchsize, builder, config, DataType::kFLOAT, wts_name, false);
    } else if (sub_type == "m") {
        serialized_engine = build_engine_yolov9_m(max_batchsize, builder, config, DataType::kFLOAT, wts_name, false);
    } else if (sub_type == "c") {
        serialized_engine = build_engine_yolov9_c(max_batchsize, builder, config, DataType::kFLOAT, wts_name);
    } else if (sub_type == "e") {
        serialized_engine = build_engine_yolov9_e(max_batchsize, builder, config, DataType::kFLOAT, wts_name);
    } else if (sub_type == "gt") {
        serialized_engine = build_engine_yolov9_t(max_batchsize, builder, config, DataType::kFLOAT, wts_name, true);
    } else if (sub_type == "gs") {
        serialized_engine = build_engine_yolov9_s(max_batchsize, builder, config, DataType::kFLOAT, wts_name, true);
    } else if (sub_type == "gm") {
        serialized_engine = build_engine_yolov9_m(max_batchsize, builder, config, DataType::kFLOAT, wts_name, true);
    } else if (sub_type == "gc") {
        serialized_engine = build_engine_gelan_c(max_batchsize, builder, config, DataType::kFLOAT, wts_name);
    } else if (sub_type == "ge") {
        serialized_engine = build_engine_gelan_e(max_batchsize, builder, config, DataType::kFLOAT, wts_name);
    } else {
        // Unknown variant: say so and release what was created so far, instead of
        // returning silently with the builder and its config still allocated.
        std::cerr << "unsupported sub_type: " << sub_type << std::endl;
        delete config;
        delete builder;
        return;
    }

    assert(serialized_engine != nullptr);

    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete config;
    delete serialized_engine;
    delete builder;
}

// Reads a serialized engine from `engine_name` and creates the TensorRT runtime,
// engine and execution context from it.
//
// engine_name: path of the serialized engine (plan) file.
// runtime, engine, context: output pointers; each is written only after the
//     file has been read successfully.
//
// Failures are reported on stderr and trip an assert, matching the rest of this
// file. When asserts are compiled out the function returns early and leaves the
// output pointers untouched, so callers should check them.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
        return;
    }

    // tellg() reports -1 on failure; keep the signed value until it has been
    // validated so it can never turn into a huge unsigned allocation size.
    file.seekg(0, file.end);
    const std::streamoff file_size = file.tellg();
    file.seekg(0, file.beg);
    if (file_size <= 0) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
        return;
    }
    const size_t size = static_cast<size_t>(file_size);

    // The vector owns the bytes, so they are released on every exit path.
    std::vector<char> serialized_engine(size);
    file.read(serialized_engine.data(), file_size);
    if (!file || file.gcount() != file_size) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
        return;
    }
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
}

// Allocates the device input/output buffers and the host output buffer used
// for inference.
//
// engine: engine whose bindings are checked (exactly two are expected, the
//     input tensor at index 0 and the output tensor at index 1).
// input_buffer_device / output_buffer_device: receive cudaMalloc'ed buffers.
// output_buffer_host: receives a new[]-allocated float array; the caller
//     releases it with delete[].
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);

    // Bindings are looked up by tensor name; the rest of the program indexes
    // the buffer array positionally, hence the fixed-index checks.
    const int input_idx = engine->getBindingIndex(kInputTensorName);
    const int output_idx = engine->getBindingIndex(kOutputTensorName);
    assert(input_idx == 0);
    assert(output_idx == 1);

    // Device-side storage for one full batch of inputs and outputs.
    const size_t input_bytes = kBatchSize * 3 * kInputH * kInputW * sizeof(float);
    const size_t output_bytes = kBatchSize * kOutputSize * sizeof(float);
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(input_buffer_device), input_bytes));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(output_buffer_device), output_bytes));

    // Host-side copy target for the network output.
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}

// Runs one batch through the execution context and copies the result to the host.
//
// buffers: device binding array; buffers[1] is read back as the output.
// output: host destination holding at least batchSize * kOutputSize floats.
// The call returns only after `stream` has been synchronized.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchSize) {
    // Queue the forward pass, then the device-to-host copy, on the same stream.
    context.enqueue(batchSize, buffers, stream, nullptr);

    const size_t output_bytes = batchSize * kOutputSize * sizeof(float);
    void* device_output = buffers[1];
    CUDA_CHECK(cudaMemcpyAsync(output, device_output, output_bytes, cudaMemcpyDeviceToHost, stream));

    // Block until both queued operations have finished.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Parses the command line.
//
// Accepted forms:
//   -s <wts> <engine> <sub_type>   (argc == 5) fills wts, engine and sub_type
//   -d <engine> <img_dir>          (argc == 4) fills engine and img_dir
//
// Returns true when one of the forms matched; otherwise returns false and
// leaves every output argument untouched.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir,
                std::string& sub_type) {
    if (argc < 4) {
        return false;
    }

    const std::string mode(argv[1]);

    if (mode == "-s" && argc == 5) {
        wts = argv[2];
        engine = argv[3];
        sub_type = argv[4];
        return true;
    }

    if (mode == "-d" && argc == 4) {
        engine = argv[2];
        img_dir = argv[3];
        return true;
    }

    return false;
}

int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);

    std::string wts_name = "";
    std::string engine_name = "../yolov9-m-converted.engine";
    std::string img_dir = "../images";
    std::string sub_type = "m";
    // speed test or inference
    const int speed_test_iter = 1000;
    // const int speed_test_iter = 1;

    // if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type)) {
    //     std::cerr << "Arguments not right!" << std::endl;
    //     std::cerr << "./yolov9 -s [.wts] [.engine] [s/m/c/e/gt/gs/gm/gc/ge]  // serialize model to plan file" << std::endl;
    //     std::cerr << "./yolov9 -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
    //     return -1;
    // }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(kBatchSize, wts_name, sub_type, engine_name);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    cuda_preprocess_init(kMaxInputImageSize);

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host);

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }

        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);

        // Run inference
        auto start = std::chrono::system_clock::now();
        for (int j = 0; j < speed_test_iter; j++) {
            infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize);
        }
        // infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize);
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: "
                  << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / 1000.0 /
                             speed_test_iter
                  << "ms" << std::endl;

        // NMS
        std::vector<std::vector<Detection>> res_batch;
        batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);

        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);

        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}


================================================
FILE: yolov9/gen_wts.py
================================================
import sys  # noqa: F401
import argparse
import os
import struct
import torch
from utils.torch_utils import select_device


def parse_args():
    """Parse the command line and resolve the input and output paths.

    Returns:
        A ``(weights, output, type)`` tuple: the input ``.pt`` path, the
        ``.wts`` path to write and the model type.  When no output is given,
        the output is the weights path with its extension replaced by
        ``.wts``; when the output is an existing directory, the file is
        placed inside it under the weights file's base name.

    Raises:
        SystemExit: if the weights path is not an existing file.
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', default='yolov9-e.pt',
                     help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output',
                     help='Output (.wts) file path (optional)')
    cli.add_argument('-t', '--type', type=str, default='detect',
                     choices=['detect', 'cls', 'seg'],
                     help='determines the model is detection/classification')
    opts = cli.parse_args()

    weights = opts.weights
    if not os.path.isfile(weights):
        raise SystemExit('Invalid input file')

    output = opts.output
    if not output:
        # No output given: write next to the weights file.
        output = os.path.splitext(weights)[0] + '.wts'
    elif os.path.isdir(output):
        # Output is a directory: derive the file name from the weights file.
        stem = os.path.splitext(os.path.basename(weights))[0]
        output = os.path.join(output, stem + '.wts')

    return weights, output, opts.type


pt_file, wts_file, m_type = parse_args()
print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')
device = select_device('cpu')
# NOTE(review): weights_only=False unpickles arbitrary objects from the
# checkpoint; only load .pt files that come from a trusted source.
model = torch.load(pt_file, map_location=device, weights_only=False)  # Load FP32 weights
model = model['ema' if model.get('ema') else 'model'].float()

if m_type in ['detect', 'seg']:
    # update anchor_grid info
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
    # Registering the tensor as a buffer places it in the state dict, so it is
    # written to the weight file together with the other entries.
    model.model[-1].register_buffer("anchor_grid", anchor_grid)

model.to(device).eval()

# Save the textual description of model.model to a txt file
with open('model.txt', 'w') as f:
    f.write(str(model.model))

# The state dict is fetched once and used for both output files.
state_dict = model.state_dict()

# Weight file layout: first line is the entry count, then one line per entry:
# "<name> <element count> <big-endian float32 values as hex>".
print(f'Writing into {wts_file}')
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(state_dict)))
    for k, v in state_dict.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

# Derive the key-listing path from the extension split rather than a substring
# replace: with str.replace an output name without ".wts" maps to itself and
# the listing would overwrite the weight file that was just written.
wts_file_key = os.path.splitext(wts_file)[0] + '_key.txt'
print(f'Writing into {wts_file_key}')
with open(wts_file_key, 'w') as f:
    f.write('{}\n'.format(len(state_dict)))
    for k, v in state_dict.items():
        f.write('{} {} '.format(k, v.numel()))
        f.write('\n')


================================================
FILE: yolov9/include/block.h
================================================
#pragma once
// Include guard: several declarations below carry default arguments, and
// repeating such a declaration in one translation unit is a compile error,
// so this header must not be processed twice.

#include "config.h"
#include "yololayer.h"

#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using namespace nvinfer1;

// Declarations of the helpers used to assemble the YOLOv9 / GELAN networks with
// the TensorRT network-definition API. The implementations are not part of this
// header. Throughout, `weightMap` maps a weight name to its TensorRT Weights and
// `lname` is a name string passed by the caller (presumably the weight-name
// prefix of the layer being built - confirm in the implementation).

// ---- Utilities --------------------------------------------------

// Debug helper taking a layer and an optional label (presumably prints the
// layer's output dimensions - confirm in the implementation).
void PrintDim(const ILayer* layer, std::string log = "");
// TensorRT weight files have a simple space delimited format:
// [type] [size] <data x size in hex>
std::map<std::string, Weights> loadWeights(const std::string file);
int get_width(int x, float gw, int divisor = 8);
int get_depth(int x, float gd);
ILayer* Proto(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c_, int c2,
              std::string lname);
std::vector<std::vector<float>> getAnchors(std::map<std::string, Weights>& weightMap, std::string lname);

// ---- Network building blocks ------------------------------------
// Each function receives the network being built plus its input tensor(s) or
// layer(s), and returns the layer(s) it produced.

// ch/k/s/p/g: presumably output channels, kernel size, stride, padding and
// group count - TODO confirm against the implementation.
nvinfer1::ILayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap,
                             nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname, int g = 1);
ILayer* ELAN1(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2,
              int c3, int c4, std::string lname);
ILayer* RepNCSPELAN4(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1,
                     int c2, int c3, int c4, int c5, std::string lname);
ILayer* ADown(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c2,
              std::string lname);
ILayer* AConv(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c2,
              std::string lname);
std::vector<ILayer*> CBLinear(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                              std::vector<int> c2s, int k, int s, int p, int g, std::string lname);
// Unlike the other blocks, CBFuse takes no weight map.
ILayer* CBFuse(INetworkDefinition* network, std::vector<std::vector<ILayer*>> input, std::vector<int> idx,
               std::vector<int> strides);
ILayer* SPPELAN(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2,
                int c3, std::string lname);

// ---- Detection heads and output plugin --------------------------

std::vector<IConcatenationLayer*> DualDDetect(INetworkDefinition* network, std::map<std::string, Weights>& weightMap,
                                              std::vector<ILayer*> dets, int cls, std::vector<int> ch,
                                              std::string lname);
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, bool is_segmentation);
// NOTE(review): weightMap is taken by value here, unlike every other function in
// this header, so the whole map is copied on each call. Switching to a reference
// requires changing the definition as well.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname);
// Same parameter list as convBnSiLU, except that `g` has no default here.
nvinfer1::ILayer* convBnNoAct(nvinfer1::INetworkDefinition* network,
                              std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int ch,
                              int k, int s, int p, std::string lname, int g);
std::vector<IConcatenationLayer*> DDetect(INetworkDefinition* network, std::map<std::string, Weights>& weightMap,
                                          std::vector<ILayer*> dets, int cls, std::vector<int> ch, std::string lname);


================================================
FILE: yolov9/include/calibrator.h
================================================
#pragma once

#include "macros.h"
#include <string>
#include <vector>

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
//! Only the interface is declared here; the member functions are defined
//! outside this header. The four overridden methods are the virtual interface
//! of nvinfer1::IInt8EntropyCalibrator2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
public:
    //! \param batchsize        calibration batch size
    //! \param input_w          network input width
    //! \param input_h          network input height
    //! \param img_dir          directory holding the calibration images
    //! \param calib_table_name path of the calibration table (cache) file
    //! \param input_blob_name  name of the network input tensor
    //! \param read_cache       presumably: reuse an existing calibration table
    //!                         when true - confirm in the implementation
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);

    virtual ~Int8EntropyCalibrator2();
    // Overrides of the calibrator interface.
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

private:
    // NOTE(review): the member comments below are inferred from names and types
    // only; verify them against the implementation file.
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;  // presumably the index of the next image to feed
    std::string img_dir_;
    std::vector<std::string> img_files_;  // presumably the file names found in img_dir_
    size_t input_count_;  // presumably the number of input elements per batch
    std::string calib_table_name_;
    // Raw pointer, not a copy: if the constructor stores its argument as-is, the
    // caller's string must outlive this object - TODO confirm.
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;  // presumably a device buffer owned by this object
    std::vector<char> calib_cache_;
};


================================================
FILE: yolov9/include/config.h
================================================
#pragma once

/* --------------------------------------------------------
 * Settings in this first section are tied to the TensorRT model.
 * After changing any of them, re-compile and re-serialize the
 * TensorRT model.
 * --------------------------------------------------------*/

// Precision switch: set USE_INT8 or USE_FP16 or USE_FP32.
// INT8 requires a calibration dataset, see
// https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5#int8-quantization
#define USE_FP16
#ifdef USE_INT8
static const char* gCalibTablePath = "./calib";
#endif

// Names given to the input and output tensors; any names may be used.
static const char* kInputTensorName = "images";
static const char* kOutputTensorName = "output";

// Number of classes of the detection and segmentation models
static constexpr int kNumClass = 80;

// Number of classes of the classification model
static constexpr int kClsNumClass = 1000;

static constexpr int kBatchSize = 1;

// YOLO input height and width; both must be divisible by 32
static constexpr int kInputH = 640;
static constexpr int kInputW = 640;

// Input shape of the classification model
static constexpr int kClsInputH = 224;
static constexpr int kClsInputW = 224;

// Upper bound on the number of bounding boxes the yololayer plugin outputs,
// i.e. the maximum number of boxes before NMS.
static constexpr int kMaxNumOutputBbox = 2000;

static constexpr int kNumAnchor = 3;

// Boxes with a confidence below kIgnoreThresh are dropped inside the yololayer plugin.
static constexpr float kIgnoreThresh = 0.05f;

/* --------------------------------------------------------
 * Settings in this second section are NOT tied to the TensorRT model.
 * After changing them, re-compile; the TensorRT model does not need
 * to be re-serialized.
 * --------------------------------------------------------*/

// NMS overlap threshold and confidence threshold for the final detections
static const float kNmsThresh = 0.45f;
static const float kConfThresh = 0.1f;

static const int kGpuId = 0;

// Increase this value for images larger than 4096 * 3112
static const int kMaxInputImageSize = 4096 * 3112;


================================================
FILE: yolov9/include/cuda_utils.h
================================================
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

// The macro below expands to code that uses assert and std::cerr, so the
// header pulls both in itself instead of relying on the includer.
#include <cassert>
#include <iostream>

// CUDA_CHECK(call): evaluates a CUDA runtime call once and, when it does not
// return cudaSuccess, prints the error code, its description and the source
// location to stderr, then trips an assert.
//
// The body is wrapped in do { ... } while (0) so the macro behaves like a single
// statement: `if (c) CUDA_CHECK(x); else ...` parses as intended. Call sites end
// the statement with a semicolon.
//
// Note: with NDEBUG defined the assert is compiled out, so the error is only
// reported and execution continues.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)                                                                     \
    do {                                                                                        \
        cudaError_t error_code = (callstr);                                                     \
        if (error_code != cudaSuccess) {                                                        \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code)  \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                   \
            assert(0);                                                                          \
        }                                                                                       \
    } while (0)
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_


================================================
FILE: yolov9/include/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream when it is synchronized.
//!
//! Each emitted chunk is written to the target stream as
//! "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>". Nothing is written while logging is disabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream that receives the formatted output
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog whether output is emitted at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the full configuration of \p other. Every member is initialized, so the new object
    //! never reads an indeterminate mShouldLog or logs with an empty prefix. Text still buffered in
    //! \p other stays there and is emitted by \p other itself.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer() override
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if they differ there is text that has not been emitted yet, so log it before going away
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    //! Synchronizes the stream buffer: inserts the buffer contents into the stream, resets the buffer
    //! and flushes the stream.
    //! \return always 0 (success)
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and the buffered text to the target stream, then clears the buffer.
    //! Does nothing while logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so both stay together
            // when the target is not std::cout
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                // the fill character is sticky stream state: set it for the zero-padded fields only
                // and restore it afterwards so later output on this stream is unaffected
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents preceded by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent synchronizations.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Holder for the LogStreamConsumerBuffer of a LogStreamConsumer.
//!  Being a base class, it is constructed before the std::ostream base of LogStreamConsumer.
//!
class LogStreamConsumerBase
{
public:
    //! Forwards all three arguments to the buffer's constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer{stream, prefix, shouldLog}
    {
    }

protected:
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Output stream for logging one message of a fixed severity with C++ stream syntax.
//!  The base-class order (LogStreamConsumerBase first, std::ostream second) is deliberate:
//!  the buffer held by LogStreamConsumerBase has to be constructed before its address is
//!  handed to std::ostream. Do not reorder the base classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a consumer that logs messages of level \p severity.
    //!  Output is produced only when \p severity is at most \p reportableSeverity.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(
              severityOstream(severity), severityPrefix(severity), isReportable(severity, reportableSeverity))
        , std::ostream(&mBuffer) // attach the stream to the buffer built above
        , mShouldLog(isReportable(severity, reportableSeverity))
        , mSeverity(severity)
    {
    }

    //! \brief Builds a fresh buffer from the severity and logging state of \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // attach the stream to the buffer built above
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether this consumer logs, given a new reportable severity.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = isReportable(mSeverity, reportableSeverity);
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! A message is logged when its severity value does not exceed the reportable one.
    static bool isReportable(Severity severity, Severity reportableSeverity)
    {
        return severity <= reportableSeverity;
    }

    //! kINFO and above (by enum value) go to std::cout, everything below to std::cerr.
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //! Prefix tag printed in front of a message of the given severity.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: break;
        }
        // unknown severity value
        assert(0);
        return "";
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Console logger for TensorRT tools and samples.
//!
//! \details Two kinds of messages are supported:
//!
//! - messages with a severity (verbose, info, warning, error, internal error), filtered by the
//!   reportable severity of the logger;
//! - test status lines (running / passed / failed / waived).
//!
//! TODO: the class derives directly from nvinfer1::ILogger for backwards compatibility, which mixes
//! messages coming from the TensorRT library with messages coming from the sample. Once every sample
//! obtains the ILogger through Logger::getTRTLogger(), the inheritance can be replaced by a member.
class Logger : public nvinfer1::ILogger
{
public:
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief State of a test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible accessor for the nvinfer1::ILogger of this Logger
    //! \return this object, seen as nvinfer1::ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] " followed by \p msg through a LogStreamConsumer of the given severity.
    //! Samples should not call this directly.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        LogStreamConsumer consumer(mReportableSeverity, severity);
        consumer << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Sets the reportable severity used to filter messages.
    //!
    //! \param severity messages are emitted when their severity value does not exceed this one
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle holding the logging information of one test
    //!
    //! Obtained from Logger::defineTest() and passed to Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test, e.g. "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a not-yet-started TestAtom for use with Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief Overload of defineTest() taking the command line as (argc, argv)
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom for use with Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        return defineTest(name, genCmdlineString(argc, argv));
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result One of TestResult::kPASSED, TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed. \return EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed. \return EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived. \return EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        if (pass)
        {
            return reportPass(testAtom);
        }
        return reportFail(testAtom);
    }

    //! \brief Returns the current reportable severity.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief Prefix tag for a log message of the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: break;
        }
        // unknown severity value
        assert(0);
        return "";
    }

    //!
    //! \brief Text printed in a test status line for the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: break;
        }
        // unknown result value
        assert(0);
        return "";
    }

    //!
    //! \brief Output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        if (severity >= Severity::kINFO)
        {
            return std::cout;
        }
        return std::cerr;
    }

    //!
    //! \brief Prints the test status line "&&&& <RESULT> <name> # <cmdline>"
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        std::ostream& out = severityOstream(Severity::kINFO);
        out << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline
            << std::endl;
    }

    //!
    //! \brief Joins the (argc, argv) values into one space-separated command line string
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream joined;
        for (int i = 0; i < argc; ++i)
        {
            joined << (i == 0 ? "" : " ") << argv[i];
        }
        return joined.str();
    }

    Severity mReportableSeverity;
};

namespace
{

// Each helper below returns a LogStreamConsumer bound to one fixed severity and
// to the logger's current reportable severity. The result is used like a
// stream, for example:
//
//     LOG_INFO(logger) << "hello world" << std::endl;

//!
//! \brief returns a LogStreamConsumer for messages of severity kVERBOSE
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kVERBOSE};
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kINFO
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kINFO};
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kWARNING
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kWARNING};
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kERROR
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kERROR};
}

//!
//! \brief returns a LogStreamConsumer for messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return {logger.getReportableSeverity(), Severity::kINTERNAL_ERROR};
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H


================================================
FILE: yolov9/include/macros.h
================================================
// Portability macros shared by the plugin sources: symbol export/import (API) and
// TensorRT-version-dependent qualifiers (TRT_NOEXCEPT, TRT_CONST_ENQUEUE).
// NOTE(review): identifiers containing a double underscore (such as this include guard)
// are reserved for the implementation in C++ — consider renaming the guard.
#ifndef __MACROS_H
#define __MACROS_H

// Provides NV_TENSORRT_MAJOR, which is tested below.
#include <NvInfer.h>

// API: applied to classes that must be visible outside the plugin library.
// When API_EXPORTS is defined the symbols are exported; otherwise they are imported
// (MSVC) or left undecorated (other compilers).
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// For TensorRT major version >= 8 these expand to `noexcept` and `const`;
// for older versions they expand to nothing.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H


================================================
FILE: yolov9/include/model.h
================================================
#pragma once

#include <NvInfer.h>
#include <string>
// yolov9
// Engine builder declarations, one function per model variant. All share the same
// leading parameters (maxBatchSize, builder, config, dt, wts_name) and return an
// nvinfer1::IHostMemory*. The definitions are not part of this header.
// NOTE(review): each builder presumably constructs the network from the weights file named
// by wts_name and returns the serialized engine — confirm against the definitions.
// NOTE(review): only the yolov9 t/s/m variants take isConvert (default false); its meaning
// is not visible from this header — confirm.
nvinfer1::IHostMemory* build_engine_yolov9_t(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                             nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                             std::string& wts_name, bool isConvert = false);
nvinfer1::IHostMemory* build_engine_yolov9_s(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                             nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                             std::string& wts_name, bool isConvert = false);
nvinfer1::IHostMemory* build_engine_yolov9_m(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                             nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                             std::string& wts_name, bool isConvert = false);
nvinfer1::IHostMemory* build_engine_yolov9_c(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                             nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                             std::string& wts_name);
nvinfer1::IHostMemory* build_engine_yolov9_e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                             nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                             std::string& wts_name);
// gelan
nvinfer1::IHostMemory* build_engine_gelan_t(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                            nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                            std::string& wts_name);
nvinfer1::IHostMemory* build_engine_gelan_m(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                            nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                            std::string& wts_name);
nvinfer1::IHostMemory* build_engine_gelan_c(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                            nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                            std::string& wts_name);
nvinfer1::IHostMemory* build_engine_gelan_e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
                                            nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
                                            std::string& wts_name);


================================================
FILE: yolov9/include/postprocess.h
================================================
#pragma once

#include "types.h"
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
// Post-processing entry points. Only declarations live here.
// NOTE(review): the one-line descriptions below are inferred from names and signatures;
// confirm each against its definition.
// NOTE(review): this header uses std::vector, std::unordered_map and std::string without
// including <vector>, <unordered_map> or <string> directly — it relies on transitive includes.

// Presumably converts a 4-float box into a cv::Rect in the coordinates of img.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Presumably filters the raw output by conf_thresh and applies NMS with nms_thresh, filling res.
void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5);

// Batched counterpart of nms(); output_size is presumably the per-image stride in floats.
void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);

// Presumably draws the detections of res_batch[i] onto img_batch[i].
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

// Presumably produces one mask per detection from the prototype tensor.
std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets);

// Presumably overlays masks and labelled boxes onto img.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks, std::unordered_map<int, std::string>& labels_map);
// cuda NMS
// The functions below take a cudaStream_t; presumably they enqueue GPU work on that stream.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
// Presumably converts the host copy of the decoded buffer into Detection lists per image.
void batch_process(std::vector<std::vector<Detection>> &res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector<cv::Mat>& img_batch);

================================================
FILE: yolov9/include/preprocess.h
================================================
#pragma once

#include <cuda_runtime.h>
#include <cstdint>
#include <opencv2/opencv.hpp>

// GPU pre-processing entry points. Only declarations live here.
// NOTE(review): descriptions are inferred from names and signatures — confirm against the
// definitions.
// NOTE(review): std::vector is used without including <vector> directly.

// Presumably allocates the buffers used by cuda_preprocess for images of up to
// max_image_size — units (pixels vs. bytes) are not visible here; confirm.
void cuda_preprocess_init(int max_image_size);
// Presumably releases whatever cuda_preprocess_init allocated.
void cuda_preprocess_destroy();
// Presumably converts one 8-bit source image of src_width x src_height into the float
// destination of dst_width x dst_height, using the given stream.
void cuda_preprocess(uint8_t* src, int src_width, int src_height,
                     float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);
// Batched counterpart taking the images as cv::Mat objects.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
                           float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);


================================================
FILE: yolov9/include/types.h
================================================
#pragma once

#include "config.h"

// Plain aggregate describing one YOLO output scale.
// kNumAnchor comes from config.h.
struct YoloKernel {
    int width;   // NOTE(review): presumably the grid/feature-map width — confirm
    int height;  // NOTE(review): presumably the grid/feature-map height — confirm
    float anchors[kNumAnchor * 2];  // two floats per anchor — presumably (w, h) pairs; confirm
};

// One detection record, laid out as 4 + 1 + 1 + 32 = 38 consecutive floats.
// The YoloLayer plugin sizes its output buffer with sizeof(Detection) / sizeof(float),
// so adding or removing a field changes the plugin's output dimensions.
struct alignas(float) Detection {
    // NOTE(review): the original comment says "center_x center_y w h", but CalDetection in
    // plugin/yololayer.cu fills these four values as
    //   (col + 0.5 - in0) * stride, (row + 0.5 - in1) * stride,
    //   (col + 0.5 + in2) * stride, (row + 0.5 + in3) * stride
    // which looks like left/top/right/bottom corners — confirm which convention consumers expect.
    float bbox[4];  // center_x center_y w h
    float conf;  // bbox_conf * cls_conf
    float class_id;  // class index stored as a float
    float mask[32];  // 32 extra values per detection; written by CalDetection only when is_segmentation is set
};
// NOTE(review): number of floats per decoded box in the CUDA decode/NMS buffers, presumably;
// the meaning of the trailing "obj" slot is not visible here — confirm.
const int bbox_element = 7; // center_x, center_y, w, h, conf, cls, obj


================================================
FILE: yolov9/include/utils.h
================================================
#pragma once

#include <dirent.h>
#include <cstring>
#include <fstream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

// Appends the name of every entry in directory `p_dir_name` (except "." and "..") to
// `file_names`. The names are the bare entry names; the directory path is NOT prepended.
// Returns 0 on success, or -1 if the directory cannot be opened (file_names is then untouched).
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const char* entry_name = entry->d_name;
        const bool is_self = strcmp(entry_name, ".") == 0;
        const bool is_parent = strcmp(entry_name, "..") == 0;
        if (is_self || is_parent) {
            continue;
        }
        file_names.push_back(std::string(entry_name));
    }

    closedir(dir_handle);
    return 0;
}
// Returns a copy of `str` with leading AND trailing whitespace removed (despite the name).
// Whitespace here means space, tab, carriage return, line feed, vertical tab and form feed,
// so a line read from a CRLF file loses its trailing '\r'.
// A string that is empty or consists only of whitespace yields an empty string.
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char* const kWhitespace = " \t\r\n\v\f";

    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        // Nothing but whitespace (or empty input): the trimmed result is empty.
        return std::string();
    }
    // A non-whitespace character exists, so `last` is valid and >= first.
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}

// Src: https://stackoverflow.com/questions/16605967
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    // Format in fixed-point notation with exactly `n` digits after the decimal point.
    std::ostringstream formatted;
    formatted.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatted.precision(n);
    formatted << a_value;
    return formatted.str();
}

// Reads one label per line from `labels_filename` into `labels_map`.
// Line i (0-based) is trimmed with trim_leading_whitespace() and stored under key i;
// existing entries with the same key are overwritten.
// Returns 0 on success, or -1 if the file cannot be opened (labels_map is then untouched),
// mirroring the error convention of read_files_in_dir().
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        return -1;
    }

    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        labels_map[index] = trim_leading_whitespace(line);
        index++;
    }

    // The ifstream destructor closes the file.
    return 0;
}


================================================
FILE: yolov9/plugin/yololayer.cu
================================================
#include "yololayer.h"
#include "types.h"
#include <assert.h>
#include <math.h>
#include "cuda_utils.h"
#include <vector>
#include <iostream>

// Minimal helpers for (de)serializing trivially copyable values through a moving byte cursor.
namespace Tn {
    // Stores `val` at the position `buffer` points to, then advances `buffer` by sizeof(T).
    // The caller must guarantee the buffer has room and is suitably aligned for T.
    template<typename T>
    void write(char*& buffer, const T& val) {
        T* slot = reinterpret_cast<T*>(buffer);
        *slot = val;
        buffer += sizeof(T);
    }

    // Loads a T from the position `buffer` points to into `val`, then advances `buffer`
    // by sizeof(T).
    template<typename T>
    void read(const char*& buffer, T& val) {
        const T* slot = reinterpret_cast<const T*>(buffer);
        val = *slot;
        buffer += sizeof(T);
    }
}  // namespace Tn


namespace nvinfer1 {
// Builds a plugin from explicit parameters. mThreadCount keeps its in-class default and
// the plugin namespace is left for setPluginNamespace() to fill in.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation)
    : mClassCount(classCount),
      mYoloV8NetWidth(netWidth),
      mYoloV8netHeight(netHeight),
      mMaxOutObject(maxOut),
      is_segmentation_(is_segmentation) {}

// Nothing to release: the destructor body is empty.
YoloLayerPlugin::~YoloLayerPlugin() {}

// Rebuilds a plugin from the byte stream produced by serialize().
// Fields are read in exactly the order serialize() writes them; in debug builds the
// assertion checks that precisely `length` bytes were consumed.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    const char* const begin = reinterpret_cast<const char*>(data);
    const char* cursor = begin;

    Tn::read(cursor, mClassCount);
    Tn::read(cursor, mThreadCount);
    Tn::read(cursor, mYoloV8NetWidth);
    Tn::read(cursor, mYoloV8netHeight);
    Tn::read(cursor, mMaxOutObject);
    Tn::read(cursor, is_segmentation_);

    assert(cursor == begin + length);
}

// Writes the plugin state into `buffer`. The field order here must stay in sync with the
// deserializing constructor; in debug builds the assertion checks that exactly
// getSerializationSize() bytes were produced.
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    char* const begin = static_cast<char*>(buffer);
    char* cursor = begin;

    Tn::write(cursor, mClassCount);
    Tn::write(cursor, mThreadCount);
    Tn::write(cursor, mYoloV8NetWidth);
    Tn::write(cursor, mYoloV8netHeight);
    Tn::write(cursor, mMaxOutObject);
    Tn::write(cursor, is_segmentation_);

    assert(cursor == begin + getSerializationSize());
}

// Number of bytes serialize() writes: the sum of the sizes of the six serialized fields.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    size_t bytes = 0;
    bytes += sizeof(mClassCount);
    bytes += sizeof(mThreadCount);
    bytes += sizeof(mYoloV8netHeight);
    bytes += sizeof(mYoloV8NetWidth);
    bytes += sizeof(mMaxOutObject);
    bytes += sizeof(is_segmentation_);
    return bytes;
}

// No set-up work is performed; always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// The single output holds one leading float (the detection counter) followed by room for
// mMaxOutObject Detection records, expressed in floats. All three arguments are ignored.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT {
    const int detection_floats = mMaxOutObject * sizeof(Detection) / sizeof(float);
    const int with_counter = detection_floats + 1;
    return nvinfer1::Dims3(with_counter, 1, 1);
}

// Stores the pointer itself (no copy of the characters is made), so the caller's string
// must outlive every later use of getPluginNamespace().
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Returns the pointer last passed to setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The output is always reported as kFLOAT; all arguments are ignored.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// Always answers false; all arguments are ignored.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {

    return false;
}

// Always answers false; the argument is ignored.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {

    return false;
}

// Intentionally empty: the plugin keeps no state derived from the tensor descriptions.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {
}

// Intentionally empty: none of the supplied context objects are used by this plugin.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
}

// Intentionally empty: attachToContext() acquires nothing, so there is nothing to release.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Returns the same literal as YoloPluginCreator::getPluginName().
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {

    return "YoloLayer_TRT";
}

// Returns the same literal as YoloPluginCreator::getPluginVersion().
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Deletes this object; it must have been allocated with `new` (as clone(), createPlugin()
// and deserializePlugin() do) and must not be used afterwards.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {

    delete this;
}

// Returns a heap-allocated copy configured with the same constructor parameters and the
// same namespace pointer. mThreadCount is not copied: the copy starts from the in-class default.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    YoloLayerPlugin* copy =
            new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_);
    copy->setPluginNamespace(mPluginNamespace);
    return copy;
}

// Launches the detection kernels for one batch on `stream`.
//   inputs  - one device pointer per detection scale (three are consumed by forwardGpu)
//   outputs - outputs[0] receives the detection counter followed by the Detection records
//   workspace is unused. Always returns 0.
//
// The parameter qualifiers match the declaration in yololayer.h: `inputs` is always
// `const void* const*`, and TRT_CONST_ENQUEUE applies to `outputs`. The previous definition
// had the macro on `inputs` instead, which only matched the declaration when the macro
// expands to `const` (TensorRT >= 8).
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    const float* const* scale_inputs = reinterpret_cast<const float* const*>(inputs);
    float* detections = static_cast<float*>(outputs[0]);

    forwardGpu(scale_inputs, detections, stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
    return 0;
}


// Logistic (sigmoid) function: 1 / (1 + e^(-data)).
__device__ float Logist(float data) {
    const float decay = expf(-data);
    return 1.0f / (1.0f + decay);
}

// Decodes one detection scale. Each thread handles one (image, grid cell) pair.
//
//   input        - per image, info_len planes of total_grid values each; value c of cell e
//                  is read at curInput[e + c * total_grid]
//   output       - per image, outputElem floats: one counter followed by Detection records
//   numElements  - number of threads that have work (threads with idx beyond it exit)
//   maxoutobject - maximum number of Detection records stored per image
//   classes      - number of class-score planes (planes 4 .. 4 + classes - 1)
//   is_segmentation - when true, 32 further planes are copied into Detection::mask
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject,
                             const int grid_h, int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= numElements) return;

    int total_grid = grid_h * grid_w;
    int info_len = 4 + classes;
    if (is_segmentation) info_len += 32;
    // Split the flat thread index into image index and cell index within that image.
    int batchIdx = idx / total_grid;
    int elemIdx = idx % total_grid;
    const float* curInput = input + batchIdx * total_grid * info_len;
    int outputIdx = batchIdx * outputElem;

    // Pick the class with the highest sigmoid score for this cell.
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 4; i < 4 + classes; i++) {
        float p = Logist(curInput[elemIdx + i * total_grid]);
        if (p > max_cls_prob) {
            max_cls_prob = p;
            class_id = i - 4;
        }
    }

    // Hard-coded score floor: cells whose best class score is below 0.1 produce no record.
    if (max_cls_prob < 0.1) return;

    // Reserve a slot by atomically incrementing this image's counter (stored as a float).
    // The counter keeps growing even when the slot is rejected below, so it can end up
    // larger than maxoutobject; readers must clamp it.
    int count = (int)atomicAdd(output + outputIdx, 1);
    if (count >= maxoutobject) return;
    // Records start one float after the counter.
    char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
    Detection* det = (Detection*)(data);

    int row = elemIdx / grid_w;
    int col = elemIdx % grid_w;

    det->conf = max_cls_prob;
    det->class_id = class_id;
    // Planes 0..3 are subtracted from / added to the cell centre and scaled by the stride.
    // NOTE(review): this looks like left/top/right/bottom distances, giving corner
    // coordinates in input-image pixels — confirm.
    det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
    det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
    det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
    det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;

    // Copy the 32 planes that follow the class scores; skipped entirely when not segmenting.
    for (int k = 0; is_segmentation && k < 32; k++) {
        det->mask[k] = curInput[elemIdx + (k + 4 + classes) * total_grid];
    }
}

// Resets the per-image detection counters and launches CalDetection once per detection
// scale (strides 8, 16 and 32) on `stream`.
//
//   inputs  - inputs[i] is the device tensor for scale i
//   output  - per image: one counter float followed by mMaxOutObject Detection records
//   mYoloV8netHeight / mYoloV8NetWidth - network input size; these parameters shadow the
//             members of the same name
//   batchSize - number of images in the batch
//
// The block size is computed per launch in a local variable. The member mThreadCount is
// no longer modified here: the previous code shrank it permanently whenever a scale had
// fewer cells than threads, which leaked into later launches and into serialize().
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,int mYoloV8NetWidth, int batchSize) {
    const int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    // Zero the counter float at the start of each image's output slice.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    const int strides[3] = { 8, 16, 32 };
    for (unsigned int i = 0; i < 3; i++) {
        const int stride = strides[i];
        const int grid_h = mYoloV8netHeight / stride;
        const int grid_w = mYoloV8NetWidth / stride;
        const int numElem = grid_h * grid_w * batchSize;

        // Never launch more threads per block than there are elements.
        const int threads = numElem < mThreadCount ? numElem : mThreadCount;
        if (threads <= 0) {
            // Empty scale (or invalid thread count): nothing to launch, and the block-count
            // division below would otherwise divide by zero.
            continue;
        }
        const int blocks = (numElem + threads - 1) / threads;

        CalDetection<<<blocks, threads, 0, stream>>>(
                inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, is_segmentation_);
    }
}

// Storage for the creator's static members.
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// The creator advertises no plugin attributes: the attribute list is emptied and the
// field collection is pointed at that (empty) list.
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();

    mFC.fields = mPluginAttributes.data();
    mFC.nbFields = mPluginAttributes.size();
}

// Returns the same literal as YoloLayerPlugin::getPluginType().
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Returns the same literal as YoloLayerPlugin::getPluginVersion().
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Returns the address of the static field collection, which the constructor leaves empty.
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Creates a plugin from a field collection that must contain exactly one field named
// "netinfo". That field's data is read as five ints:
//   [0] class count, [1] input width, [2] input height,
//   [3] maximum number of output objects, [4] segmentation flag (non-zero = true).
// The preconditions are checked with assert only, so they are not enforced when NDEBUG is
// defined. `name` is unused.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    assert(fc->nbFields == 1);
    const PluginField& netinfo_field = fc->fields[0];
    assert(strcmp(netinfo_field.name, "netinfo") == 0);

    const int* netinfo = (const int*)(netinfo_field.data);
    const bool segmentation = netinfo[4] != 0;

    YoloLayerPlugin* plugin = new YoloLayerPlugin(netinfo[0], netinfo[1], netinfo[2], netinfo[3], segmentation);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}

// Rebuilds a plugin from bytes previously produced by YoloLayerPlugin::serialize() and
// gives it this creator's namespace. `name` is unused.
// Note that the plugin stores the pointer returned by mNamespace.c_str(), not a copy.
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
    // This object will be deleted when the network is destroyed, which will
    // call YoloLayerPlugin::destroy()
    YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

} // namespace nvinfer1


================================================
FILE: yolov9/plugin/yololayer.h
================================================
#pragma once
#include "macros.h"
#include "NvInfer.h"
#include <string>
#include <vector>
#include "macros.h"
namespace nvinfer1 {
// TensorRT plugin layer that decodes the raw YOLO head tensors into a flat float buffer:
// one detection counter followed by up to maxOut Detection records per image.
class API YoloLayerPlugin : public IPluginV2IOExt {
public:
        // Builds a plugin from explicit parameters (class count, network input size,
        // maximum number of detections per image, segmentation flag).
        YoloLayerPlugin(int classCount, int netWdith, int netHeight, int maxOut, bool is_segmentation);
        // Rebuilds a plugin from `length` bytes previously written by serialize().
        YoloLayerPlugin(const void* data, size_t length);
        ~YoloLayerPlugin();

        // The plugin has exactly one output tensor.
        int getNbOutputs() const TRT_NOEXCEPT override {
            return 1;
        }

        nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

        int initialize() TRT_NOEXCEPT override;

        // Nothing to tear down.
        virtual void terminate() TRT_NOEXCEPT override {}

        // No scratch memory is requested.
        virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

        // TRT_CONST_ENQUEUE applies to `outputs`: it expands to `const` for TensorRT >= 8
        // and to nothing for older versions.
        virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

        virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

        virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

        // Only linear-format float tensors are accepted, for inputs and outputs alike.
        bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
            return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
        }


        const char* getPluginType() const TRT_NOEXCEPT override;

        const char* getPluginVersion() const TRT_NOEXCEPT override;

        // Deletes this object.
        void destroy() TRT_NOEXCEPT override;

        IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

        // Stores the pointer only; the string must outlive the plugin's use of it.
        void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

        const char* getPluginNamespace() const TRT_NOEXCEPT override;

        nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT override;

        bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;

        bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

        void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

        void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override;

        void detachFromContext() TRT_NOEXCEPT override;

    private:
        // Launches the decode kernels for every detection scale on `stream`.
        void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize);
        // Upper bound on threads per block for the decode kernels; also serialized.
        int mThreadCount = 256;
        // Defaults to the empty string so getPluginNamespace() and clone() never read an
        // indeterminate pointer when setPluginNamespace() has not been called yet.
        const char* mPluginNamespace = "";
        int mClassCount;
        int mYoloV8NetWidth;
        int mYoloV8netHeight;
        int mMaxOutObject;
        bool is_segmentation_;
    };

// Factory that creates YoloLayerPlugin instances, either from a plugin field collection
// (createPlugin) or from serialized bytes (deserializePlugin).
class API YoloPluginCreator : public IPluginCreator {
public:
        YoloPluginCreator();
        ~YoloPluginCreator() override = default;

        const char* getPluginName() const TRT_NOEXCEPT override;

        const char* getPluginVersion() const TRT_NOEXCEPT override;

        const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

        nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;

        nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;

        // Copies the namespace into an owned std::string.
        void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override {
            mNamespace = libNamespace;
        }

        // The returned pointer refers to mNamespace's buffer and is invalidated when the
        // namespace is changed or the creator is destroyed.
        const char* getPluginNamespace() const TRT_NOEXCEPT override {
            return mNamespace.c_str();
        }

    private:
        std::string mNamespace;
        // Shared by all creator instances; defined in yololayer.cu.
        static PluginFieldCollection mFC;
        static std::vector<PluginField> mPluginAttributes;
    };
    REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
} // namespace nvinfer1


================================================
FILE: yolov9/src/block.cpp
================================================
#include "block.h"
#include "calibrator.h"
#include "config.h"
#include "yololayer.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <numeric>
#include <vector>

using namespace nvinfer1;
// TensorRT weight files (.wts) have a simple whitespace-delimited text format, as read by
// loadWeights() below: a leading blob count, then for each blob
// [name] [size] <data x size in hex>
void PrintDim(const ILayer* layer, std::string log) {
    Dims dim = layer->getOutput(0)->getDimensions();
    std::cout << log << ": "
              << "\t\t\t\t";
    for (int i = 0; i < dim.nbDims; i++) {
        std::cout << dim.d[i] << " ";
    }
    std::cout << std::endl;
}

// Loads a .wts weight file into a name -> Weights map.
//
// File layout (whitespace separated): a decimal blob count, then for each blob its name,
// a decimal element count, and that many 32-bit values written in hexadecimal.
// Every blob is reported as DataType::kFLOAT; the 32-bit words are stored as read.
//
// Each blob's storage is allocated with malloc and owned by the returned map's entries;
// the caller is responsible for freeing the `values` pointers.
// An unopenable file or a non-positive count is caught by assert only (debug builds).
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. The allocation is sized by the element type (uint32_t); the previous
        // code used sizeof(val), i.e. the size of the pointer, and over-allocated.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0; x < size; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;

        weightMap[name] = wt;
    }

    return weightMap;
}

// Scales channel count `x` by width multiplier `gw` and rounds the result UP to the next
// multiple of `divisor`.
int get_width(int x, float gw, int divisor) {
    const float scaled = x * gw;
    const int groups = int(ceil(scaled / divisor));
    return groups * divisor;
}

// Scales repeat count `x` by depth multiplier `gd`, rounds to the nearest integer and
// clamps the result to at least 1. x == 1 is returned unchanged.
// When the scaled value is exactly halfway and its integer part is even, the rounded-up
// value is decremented, so such ties land on the even neighbour.
int get_depth(int x, float gd) {
    if (x == 1)
        return 1;

    const float scaled = x * gd;
    const int whole = int(scaled);
    int rounded = round(scaled);

    const bool exact_half = (scaled - whole == 0.5);
    const bool whole_is_even = (whole % 2) == 0;
    if (exact_half && whole_is_even) {
        --rounded;
    }
    return std::max<int>(rounded, 1);
}
// Fold an inference-time BatchNorm2d into a per-channel TensorRT scale layer.
//
// Reads `<lname>.weight` (gamma), `<lname>.bias` (beta), `<lname>.running_mean` and
// `<lname>.running_var` from `weightMap` and emits
//     scale = gamma / sqrt(var + eps)
//     shift = beta - mean * gamma / sqrt(var + eps)
//     power = 1
// The three derived buffers are malloc'ed here and registered in `weightMap` under
// `<lname>.scale/.shift/.power`. The map is taken by reference so that these
// registrations reach the caller's map (a by-value map would drop them, leaving the
// buffers unreachable) and so the whole map is not copied for every BN layer.
static nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                             std::map<std::string, nvinfer1::Weights>& weightMap,
                                             nvinfer1::ITensor& input, std::string lname, float eps) {
    const float* gamma = (const float*)weightMap[lname + ".weight"].values;
    const float* beta = (const float*)weightMap[lname + ".bias"].values;
    const float* mean = (const float*)weightMap[lname + ".running_mean"].values;
    const float* var = (const float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    assert((len <= 0) || (scval && shval && pval));

    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
        pval[i] = 1.0;
    }

    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};

    // Keep the buffers reachable from the caller's map so they can be freed with the other weights.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(output);
    return output;
}
// Conv (k x k, stride s, padding p, g groups, no bias) -> BatchNorm (eps 1e-3) -> SiLU.
// SiLU is assembled as bn_out * sigmoid(bn_out). Weights are looked up under
// `<lname>.conv.weight` and `<lname>.bn.*`. Returns the final element-wise product layer.
nvinfer1::ILayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap,
                             nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname, int g) {
    const nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* conv_layer = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k},
                                                 weightMap[lname + ".conv.weight"], no_bias);
    assert(conv_layer);
    conv_layer->setStrideNd(nvinfer1::DimsHW{s, s});
    conv_layer->setPaddingNd(nvinfer1::DimsHW{p, p});
    conv_layer->setNbGroups(g);

    auto* norm = addBatchNorm2d(network, weightMap, *conv_layer->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::ITensor& normed = *norm->getOutput(0);

    auto* gate = network->addActivation(normed, nvinfer1::ActivationType::kSIGMOID);
    assert(gate);
    auto* silu = network->addElementWise(normed, *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}
// Conv (k x k, stride s, padding p, g groups, no bias) -> BatchNorm (eps 1e-3), without activation.
// Weights are looked up under `<lname>.conv.weight` and `<lname>.bn.*`. Returns the scale (BN) layer.
nvinfer1::ILayer* convBnNoAct(nvinfer1::INetworkDefinition* network,
                              std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int ch,
                              int k, int s, int p, std::string lname, int g) {
    const nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* conv_layer = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k},
                                                 weightMap[lname + ".conv.weight"], no_bias);
    assert(conv_layer);
    conv_layer->setStrideNd(nvinfer1::DimsHW{s, s});
    conv_layer->setPaddingNd(nvinfer1::DimsHW{p, p});
    conv_layer->setNbGroups(g);

    return addBatchNorm2d(network, weightMap, *conv_layer->getOutput(0), lname + ".bn", 1e-3);
}

std::vector<std::vector<float>> getAnchors(std::map<std::string, Weights>& weightMap, std::string lname) {
    std::vector<std::vector<float>> anchors;
    Weights wts = weightMap[lname + ".anchor_grid"];
    int anchor_len = kNumAnchor * 2;
    for (int i = 0; i < wts.count / anchor_len; i++) {
        auto* p = (const float*)wts.values + i * anchor_len;
        std::vector<float> anchor(p, p + anchor_len);
        anchors.push_back(anchor);
    }
    return anchors;
}

// RepConvN in its un-fused form: a 3x3 conv+BN branch (`<lname>.conv1`) and a 1x1 conv+BN
// branch (`<lname>.conv2`) are summed and passed through SiLU (x * sigmoid(x)).
// Only k == 3 with p == 1 is accepted. c1, d, act, bn and deploy are not used by this body.
ILayer* RepConvN(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2,
                 int k, int s, int p, int g, int d, bool act, bool bn, bool deploy, std::string lname) {
    assert(k == 3 && p == 1);

    ILayer* branch3x3 = convBnNoAct(network, weightMap, input, c2, k, s, p, lname + ".conv1", g);
    // The 1x1 branch uses padding p - k / 2 so both branches produce the same spatial size.
    ILayer* branch1x1 = convBnNoAct(network, weightMap, input, c2, 1, s, p - k / 2, lname + ".conv2", g);

    ILayer* merged = network->addElementWise(*branch3x3->getOutput(0), *branch1x1->getOutput(0),
                                             ElementWiseOperation::kSUM);
    ITensor& pre_act = *merged->getOutput(0);

    auto* gate = network->addActivation(pre_act, ActivationType::kSIGMOID);
    assert(gate);
    auto* silu = network->addElementWise(pre_act, *gate->getOutput(0), ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}

// Bottleneck made of RepConvN (`<lname>.cv1`, int(c2 * e) channels) followed by
// Conv-BN-SiLU (`<lname>.cv2`, c2 channels). The input is added back to the result
// only when `shortcut` is set and c1 == c2.
ILayer* RepNBottleneck(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1,
                       int c2, bool shortcut, int k, int g, float e, std::string lname) {
    const int hidden = int(c2 * e);
    assert(k == 3 && "RepVGG only support kernel size 3");

    auto rep = RepConvN(network, weightMap, input, c1, hidden, k, 1, 1, g, 1, true, false, false, lname + ".cv1");
    auto out = convBnSiLU(network, weightMap, *rep->getOutput(0), c2, k, 1, 1, lname + ".cv2", g);

    const bool add_residual = shortcut && c1 == c2;
    if (!add_residual) {
        return out;
    }
    return network->addElementWise(input, *out->getOutput(0), ElementWiseOperation::kSUM);
}

// CSP block: `<lname>.cv1` (1x1) feeds a chain of `n` RepNBottleneck blocks (`<lname>.m.<i>`),
// `<lname>.cv2` (1x1) is a parallel branch on the same input, and the two results are
// concatenated and projected to c2 channels by `<lname>.cv3` (1x1).
// Hidden width is int(c2 * e). c1 is not used by this body.
ILayer* RepNCSP(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2,
                int n, bool shortcut, int g, float e, std::string lname) {
    const int hidden = int(c2 * e);

    ILayer* chain = convBnSiLU(network, weightMap, input, hidden, 1, 1, 0, lname + ".cv1", 1);
    int stage = 0;
    while (stage < n) {
        chain = RepNBottleneck(network, weightMap, *chain->getOutput(0), hidden, hidden, shortcut, 3, g, 1.0,
                               lname + ".m." + std::to_string(stage));
        ++stage;
    }

    auto bypass = convBnSiLU(network, weightMap, input, hidden, 1, 1, 0, lname + ".cv2", 1);

    ITensor* branches[] = {chain->getOutput(0), bypass->getOutput(0)};
    auto joined = network->addConcatenation(branches, 2);

    return convBnSiLU(network, weightMap, *joined->getOutput(0), c2, 1, 1, 0, lname + ".cv3", 1);
}

// ELAN1 block: `<lname>.cv1` (1x1, c3 channels) is split into two halves along dimension 0;
// the second half runs through two 3x3 convs (`.cv2`, `.cv3`, c4 channels each); both halves
// and both conv outputs are concatenated and projected to c2 channels by `<lname>.cv4`.
// c1 is not used by this body.
ILayer* ELAN1(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2,
              int c3, int c4, std::string lname) {
    auto stem = convBnSiLU(network, weightMap, input, c3, 1, 1, 0, lname + ".cv1", 1);
    ITensor& stem_out = *stem->getOutput(0);

    // chunk(2, 1): two equal slices along the first (channel) dimension
    const Dims dims = stem_out.getDimensions();
    const auto half = dims.d[0] / 2;
    const Dims3 chunk_size{half, dims.d[1], dims.d[2]};
    const Dims3 unit_stride{1, 1, 1};
    ISliceLayer* lower = network->addSlice(stem_out, Dims3{0, 0, 0}, chunk_size, unit_stride);
    ISliceLayer* upper = network->addSlice(stem_out, Dims3{half, 0, 0}, chunk_size, unit_stride);

    auto conv_a = convBnSiLU(network, weightMap, *upper->getOutput(0), c4, 3, 1, 1, lname + ".cv2", 1);
    auto conv_b = convBnSiLU(network, weightMap, *conv_a->getOutput(0), c4, 3, 1, 1, lname + ".cv3", 1);

    ITensor* parts[] = {lower->getOutput(0), upper->getOutput(0), conv_a->getOutput(0), conv_b->getOutput(0)};
    auto joined = network->addConcatenation(parts, 4);
    return convBnSiLU(network, weightMap, *joined->getOutput(0), c2, 1, 1, 0, lname + ".cv4", 1);
}

// RepNCSPELAN4 block: `<lname>.cv1` (1x1, c3 channels) is split into two halves along
// dimension 0; the second half goes through RepNCSP + 3x3 conv twice (`.cv2.0/.cv2.1`, then
// `.cv3.0/.cv3.1`, c4 channels, c5 bottleneck repeats); both halves and both stage outputs are
// concatenated and projected to c2 channels by `<lname>.cv4`. c1 is not used by this body.
ILayer* RepNCSPELAN4(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1,
                     int c2, int c3, int c4, int c5, std::string lname) {
    auto stem = convBnSiLU(network, weightMap, input, c3, 1, 1, 0, lname + ".cv1", 1);
    ITensor& stem_out = *stem->getOutput(0);

    // chunk(2, 1): two equal slices along the first (channel) dimension
    const Dims dims = stem_out.getDimensions();
    const auto half = dims.d[0] / 2;
    const Dims3 chunk_size{half, dims.d[1], dims.d[2]};
    const Dims3 unit_stride{1, 1, 1};
    ISliceLayer* lower = network->addSlice(stem_out, Dims3{0, 0, 0}, chunk_size, unit_stride);
    ISliceLayer* upper = network->addSlice(stem_out, Dims3{half, 0, 0}, chunk_size, unit_stride);

    auto stage1_csp = RepNCSP(network, weightMap, *upper->getOutput(0), c3 / 2, c4, c5, true, 1, 0.5, lname + ".cv2.0");
    auto stage1 = convBnSiLU(network, weightMap, *stage1_csp->getOutput(0), c4, 3, 1, 1, lname + ".cv2.1", 1);

    auto stage2_csp = RepNCSP(network, weightMap, *stage1->getOutput(0), c4, c4, c5, true, 1, 0.5, lname + ".cv3.0");
    auto stage2 = convBnSiLU(network, weightMap, *stage2_csp->getOutput(0), c4, 3, 1, 1, lname + ".cv3.1", 1);

    ITensor* parts[] = {lower->getOutput(0), upper->getOutput(0), stage1->getOutput(0), stage2->getOutput(0)};
    auto joined = network->addConcatenation(parts, 4);
    return convBnSiLU(network, weightMap, *joined->getOutput(0), c2, 1, 1, 0, lname + ".cv4", 1);
}

// Down-sampling block: 2x2 average pooling (stride 1, no padding) followed by a
// 3x3 stride-2 Conv-BN-SiLU (`<lname>.cv1`) producing c2 channels.
ILayer* AConv(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c2,
              std::string lname) {
    auto smooth = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{2, 2});
    smooth->setStrideNd(DimsHW{1, 1});
    smooth->setPaddingNd(DimsHW{0, 0});
    return convBnSiLU(network, weightMap, *smooth->getOutput(0), c2, 3, 2, 1, lname + ".cv1", 1);
}
// Down-sampling block with two branches. The input is average-pooled (2x2, stride 1) and split
// into two halves along dimension 0: the first half goes through a 3x3 stride-2 Conv-BN-SiLU
// (`<lname>.cv1`), the second through 3x3 stride-2 max pooling and a 1x1 Conv-BN-SiLU
// (`<lname>.cv2`). Each branch outputs c2 / 2 channels; the concatenation layer is returned.
ILayer* ADown(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c2,
              std::string lname) {
    const int branch_ch = c2 / 2;

    auto smooth = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{2, 2});
    smooth->setStrideNd(DimsHW{1, 1});
    smooth->setPaddingNd(DimsHW{0, 0});
    ITensor& smoothed = *smooth->getOutput(0);

    const Dims dims = smoothed.getDimensions();
    const auto half = dims.d[0] / 2;
    const Dims3 chunk_size{half, dims.d[1], dims.d[2]};
    const Dims3 unit_stride{1, 1, 1};
    ISliceLayer* lower = network->addSlice(smoothed, Dims3{0, 0, 0}, chunk_size, unit_stride);
    ISliceLayer* upper = network->addSlice(smoothed, Dims3{half, 0, 0}, chunk_size, unit_stride);

    auto conv_branch = convBnSiLU(network, weightMap, *lower->getOutput(0), branch_ch, 3, 2, 1, lname + ".cv1", 1);

    auto max_pool = network->addPoolingNd(*upper->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    max_pool->setStrideNd(DimsHW{2, 2});
    max_pool->setPaddingNd(DimsHW{1, 1});
    auto pool_branch = convBnSiLU(network, weightMap, *max_pool->getOutput(0), branch_ch, 1, 1, 0, lname + ".cv2", 1);

    ITensor* branches[] = {conv_branch->getOutput(0), pool_branch->getOutput(0)};
    return network->addConcatenation(branches, 2);
}

// CBLinear: one biased convolution producing sum(c2s) channels, then split along
// dimension 0 into consecutive slices of c2s[0], c2s[1], ... channels.
//
// Weights come from `<lname>.conv.weight` and `<lname>.conv.bias`.
// Returns one slice layer per entry of `c2s`, in order.
//
// The slice extents are taken from the convolution *output*, so they stay correct when
// k / s / p change the spatial size (using the input size is only valid for
// size-preserving settings such as k=1, s=1, p=0).
// NOTE(review): `g` is accepted but not applied to the convolution — confirm whether
// grouped CBLinear is ever expected by callers.
std::vector<ILayer*> CBLinear(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input,
                              std::vector<int> c2s, int k, int s, int p, int g, std::string lname) {

    IConvolutionLayer* conv1 =
            network->addConvolutionNd(input, std::accumulate(c2s.begin(), c2s.end(), 0), DimsHW{k, k},
                                      weightMap[lname + ".conv.weight"], weightMap[lname + ".conv.bias"]);
    assert(conv1);
    conv1->setName((lname + ".conv").c_str());
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{p, p});

    // Query the size after stride/padding have been configured.
    const Dims out_dims = conv1->getOutput(0)->getDimensions();
    const auto h = out_dims.d[1];
    const auto w = out_dims.d[2];

    std::vector<ILayer*> slices;
    slices.reserve(c2s.size());
    int start = 0;
    for (int channels : c2s) {
        ILayer* slice =
                network->addSlice(*conv1->getOutput(0), Dims3{start, 0, 0}, Dims3{channels, h, w}, Dims3{1, 1, 1});
        assert(slice);
        slices.push_back(slice);
        start += channels;
    }
    return slices;
}

// CBFuse: resize every input group but the last (nearest neighbour) and sum everything.
//
// For i < input.size() - 1 the tensor `input[i][idx[i]]` is resized spatially by
// strides[i] / strides.back(); the last group contributes `input.back()[0]` unchanged.
// The results are accumulated left to right with element-wise sums and the final
// layer is returned (or `input[0][0]` itself when there is a single group).
//
// Compared with a raw `new[]` scratch array and integer division, this version
//  - owns its scratch storage in a std::vector (nothing to leak),
//  - computes the scale in floating point, so non-integral ratios are not truncated,
//  - rejects empty / too-short arguments instead of indexing out of range.
ILayer* CBFuse(INetworkDefinition* network, std::vector<std::vector<ILayer*>> input, std::vector<int> idx,
               std::vector<int> strides) {
    assert(!input.empty() && "CBFuse needs at least one input group");
    const size_t last = input.size() - 1;
    assert(!input[last].empty());
    assert(last == 0 || (idx.size() >= last && strides.size() >= last && strides.back() != 0));

    std::vector<ILayer*> res(input.size(), nullptr);
    res[last] = input[last][0];

    // Walk from the second-to-last group down to the first, as the layers were originally created.
    for (size_t i = last; i-- > 0;) {
        auto upsample = network->addResize(*input[i][idx[i]]->getOutput(0));
        assert(upsample);
        upsample->setResizeMode(ResizeMode::kNEAREST);
        const float ratio = static_cast<float>(strides[i]) / static_cast<float>(strides.back());
        const float scales[] = {1.0f, ratio, ratio};
        upsample->setScales(scales, 3);
        res[i] = upsample;
    }

    ILayer* fused = res[0];
    for (size_t i = 1; i < res.size(); i++) {
        fused = network->addElementWise(*fused->getOutput(0), *res[i]->getOutput(0), ElementWiseOperation::kSUM);
        assert(fused);
    }
    return fused;
}

// Max pooling with a k x k window, stride s and padding k / 2. `weightMap` is not used.
ILayer* SP(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int k, int s) {
    const int pad = k / 2;
    auto max_pool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{k, k});
    max_pool->setPaddingNd(DimsHW{pad, pad});
    max_pool->setStrideNd(DimsHW{s, s});
    return max_pool;
}

// SPP-ELAN block: `<lname>.cv1` (1x1, c3 channels) followed by three chained 5x5 stride-1
// max-pool stages; the conv output and all three pooled tensors are concatenated and
// projected to c2 channels by `<lname>.cv5` (1x1). c1 is not used by this body.
ILayer* SPPELAN(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2,
                int c3, std::string lname) {
    auto stem = convBnSiLU(network, weightMap, input, c3, 1, 1, 0, lname + ".cv1", 1);
    auto pool_a = SP(network, weightMap, *stem->getOutput(0), 5, 1);
    auto pool_b = SP(network, weightMap, *pool_a->getOutput(0), 5, 1);
    auto pool_c = SP(network, weightMap, *pool_b->getOutput(0), 5, 1);

    ITensor* pyramid[] = {stem->getOutput(0), pool_a->getOutput(0), pool_b->getOutput(0), pool_c->getOutput(0)};
    auto joined = network->addConcatenation(pyramid, 4);
    return convBnSiLU(network, weightMap, *joined->getOutput(0), c2, 1, 1, 0, lname + ".cv5", 1);
}

// Box-regression branch of the detect head:
//   Conv-BN-SiLU 3x3 (`<lname>.0`) -> Conv-BN-SiLU 3x3 with 4 groups (`<lname>.1`)
//   -> biased 1x1 conv with 4 groups producing reg_max * 4 channels (`<lname>.2.*`).
// Returns the final convolution layer, which is named `<lname>.conv`.
ILayer* DetectBbox_Conv(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c2,
                        int reg_max, std::string lname) {
    auto first = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".0", 1);
    auto second = convBnSiLU(network, weightMap, *first->getOutput(0), c2, 3, 1, 1, lname + ".1", 4);

    const int out_channels = reg_max * 4;
    auto proj = network->addConvolutionNd(*second->getOutput(0), out_channels, DimsHW{1, 1},
                                          weightMap[lname + ".2.weight"], weightMap[lname + ".2.bias"]);
    const std::string proj_name = lname + ".conv";
    proj->setName(proj_name.c_str());
    proj->setStrideNd(DimsHW{1, 1});
    proj->setPaddingNd(DimsHW{0, 0});
    proj->setNbGroups(4);
    return proj;
}

// Classification branch of the detect head:
//   Conv-BN-SiLU 3x3 (`<lname>.0`) -> Conv-BN-SiLU 3x3 (`<lname>.1`)
//   -> biased 1x1 conv producing `cls` channels (`<lname>.2.*`).
// Returns the final convolution layer, which is named `<lname>.conv`.
ILayer* DetectCls_Conv(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c2,
                       int cls, std::string lname) {
    auto first = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".0", 1);
    auto second = convBnSiLU(network, weightMap, *first->getOutput(0), c2, 3, 1, 1, lname + ".1", 1);

    auto proj = network->addConvolutionNd(*second->getOutput(0), cls, DimsHW{1, 1}, weightMap[lname + ".2.weight"],
                                          weightMap[lname + ".2.bias"]);
    const std::string proj_name = lname + ".conv";
    proj->setName(proj_name.c_str());
    proj->setStrideNd(DimsHW{1, 1});
    proj->setPaddingNd(DimsHW{0, 0});
    return proj;
}

// Distribution Focal Loss decoding.
// The (C, H, W) input is reshaped to (C / ch, ch, H * W) and transposed to (ch, C / ch, H * W),
// passed through a softmax layer (layer default axis), reduced to a single channel by a
// bias-free 1x1 convolution with `<lname>.conv.weight`, and finally reshaped to (4, H * W).
// `k` is not used: the convolution kernel is fixed at 1x1.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname) {
    const auto in_dims = input.getDimensions();
    const int channels = in_dims.d[0];
    const int cells = in_dims.d[1] * in_dims.d[2];
    const int groups = channels / ch;

    auto* to_bins = network->addShuffle(input);
    to_bins->setReshapeDimensions(nvinfer1::Dims3{groups, ch, cells});
    to_bins->setSecondTranspose(nvinfer1::Permutation{1, 0, 2});

    auto* probs = network->addSoftMax(*to_bins->getOutput(0));

    const nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* expectation = network->addConvolutionNd(*probs->getOutput(0), 1, nvinfer1::DimsHW{1, 1},
                                                  weightMap[lname + ".conv.weight"], no_bias);
    expectation->setStrideNd(nvinfer1::DimsHW{s, s});
    expectation->setPaddingNd(nvinfer1::DimsHW{p, p});

    auto* flat = network->addShuffle(*expectation->getOutput(0));
    flat->setReshapeDimensions(nvinfer1::Dims2{4, cells});
    return flat;
}

// Append the "YoloLayer_TRT" (version "1") plugin to the network.
// The plugin is created with a single "netinfo" field holding five ints:
// {kNumClass, kInputW, kInputH, kMaxNumOutputBbox, is_segmentation}, and receives the
// first output of every layer in `dets` as its inputs, in order.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, bool is_segmentation) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");

    int netinfo[5] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox, is_segmentation};

    nvinfer1::PluginField field;
    field.data = netinfo;
    field.length = 5;
    field.name = "netinfo";
    // The field is tagged kFLOAT32 although the payload is an int array.
    field.type = nvinfer1::PluginFieldType::kFLOAT32;

    nvinfer1::PluginFieldCollection collection;
    collection.nbFields = 1;
    collection.fields = &field;
    nvinfer1::IPluginV2* plugin_obj = creator->createPlugin("yololayer", &collection);

    std::vector<nvinfer1::ITensor*> plugin_inputs;
    for (size_t n = 0; n < dets.size(); ++n) {
        plugin_inputs.push_back(dets[n]->getOutput(0));
    }
    return network->addPluginV2(&plugin_inputs[0], plugin_inputs.size(), *plugin_obj);
}

// Detect head (DualDDetect variant). For every feature layer in `dets`:
//   - a box branch (`<lname>.cv2.<i>`, width max(ch[0] / 4, 64), reg_max 16), decoded by DFL
//     with `<lname>.dfl`,
//   - a class branch (`<lname>.cv3.<i>`, width max(ch[0], min(cls * 2, 128))), reshaped to
//     (kNumClass, H * W).
// Returns, per feature layer, the concatenation of the decoded box tensor and the class tensor.
std::vector<IConcatenationLayer*> DualDDetect(INetworkDefinition* network, std::map<std::string, Weights>& weightMap,
                                              std::vector<ILayer*> dets, int cls, std::vector<int> ch,
                                              std::string lname) {
    const int box_width = std::max(int(ch[0] / 4), int(16 * 4));
    const int cls_width = std::max(ch[0], std::min(cls * 2, 128));
    const int reg_max = 16;
    const size_t num_levels = dets.size();

    std::vector<ILayer*> box_branches;
    std::vector<ILayer*> cls_branches;
    for (size_t lvl = 0; lvl < num_levels; ++lvl) {
        ITensor& feature = *dets[lvl]->getOutput(0);
        const std::string level_id = std::to_string(lvl);

        // Conv(x, c2, 3), Conv(c2, c2, 3, g=4), nn.Conv2d(c2, 4 * self.reg_max, 1, groups=4)
        box_branches.push_back(
                DetectBbox_Conv(network, weightMap, feature, box_width, reg_max, lname + ".cv2." + level_id));

        // Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)
        auto cls_conv = DetectCls_Conv(network, weightMap, feature, cls_width, cls, lname + ".cv3." + level_id);
        const auto cls_dims = cls_conv->getOutput(0)->getDimensions();
        IShuffleLayer* flat_cls = network->addShuffle(*cls_conv->getOutput(0));
        flat_cls->setReshapeDimensions(Dims2{kNumClass, cls_dims.d[1] * cls_dims.d[2]});
        cls_branches.push_back(flat_cls);
    }

    std::vector<IConcatenationLayer*> outputs;
    for (size_t lvl = 0; lvl < num_levels; ++lvl) {
        // softmax 16*4, w, h => 16, 4, w, h
        auto boxes = DFL(network, weightMap, *box_branches[lvl]->getOutput(0), reg_max, 1, 1, 0, lname + ".dfl");
        ITensor* pair[] = {boxes->getOutput(0), cls_branches[lvl]->getOutput(0)};
        outputs.push_back(network->addConcatenation(pair, 2));
    }
    return outputs;
}

// Detect head (DDetect variant). For every feature layer in `dets`:
//   - a box branch (`<lname>.cv2.<i>`, width max(ch[0] / 4, 64), reg_max 16), decoded by DFL
//     with `<lname>.dfl`,
//   - a class branch (`<lname>.cv3.<i>`, width max(ch[0], min(cls, 128))), reshaped to
//     (kNumClass, H * W).
// Note the class-branch width uses min(cls, 128), not the min(cls * 2, 128) used by DualDDetect.
// Returns, per feature layer, the concatenation of the decoded box tensor and the class tensor.
std::vector<IConcatenationLayer*> DDetect(INetworkDefinition* network, std::map<std::string, Weights>& weightMap,
                                          std::vector<ILayer*> dets, int cls, std::vector<int> ch, std::string lname) {
    const int box_width = std::max(int(ch[0] / 4), int(16 * 4));
    const int cls_width = std::max(ch[0], std::min(cls, 128));
    const int reg_max = 16;
    const size_t num_levels = dets.size();

    std::vector<ILayer*> box_branches;
    std::vector<ILayer*> cls_branches;
    for (size_t lvl = 0; lvl < num_levels; ++lvl) {
        ITensor& feature = *dets[lvl]->getOutput(0);
        const std::string level_id = std::to_string(lvl);

        // Conv(x, c2, 3), Conv(c2, c2, 3, g=4), nn.Conv2d(c2, 4 * self.reg_max, 1, groups=4)
        box_branches.push_back(
                DetectBbox_Conv(network, weightMap, feature, box_width, reg_max, lname + ".cv2." + level_id));

        // Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)
        auto cls_conv = DetectCls_Conv(network, weightMap, feature, cls_width, cls, lname + ".cv3." + level_id);
        const auto cls_dims = cls_conv->getOutput(0)->getDimensions();
        IShuffleLayer* flat_cls = network->addShuffle(*cls_conv->getOutput(0));
        flat_cls->setReshapeDimensions(Dims2{kNumClass, cls_dims.d[1] * cls_dims.d[2]});
        cls_branches.push_back(flat_cls);
    }

    std::vector<IConcatenationLayer*> outputs;
    for (size_t lvl = 0; lvl < num_levels; ++lvl) {
        // softmax 16*4, w, h => 16, 4, w, h
        auto boxes = DFL(network, weightMap, *box_branches[lvl]->getOutput(0), reg_max, 1, 1, 0, lname + ".dfl");
        ITensor* pair[] = {boxes->getOutput(0), cls_branches[lvl]->getOutput(0)};
        outputs.push_back(network->addConcatenation(pair, 2));
    }
    return outputs;
}


================================================
FILE: yolov9/src/calibrator.cpp
================================================
#include "calibrator.h"
#include "cuda_utils.h"
#include "utils.h"

#include <fstream>
#include <iostream>
#include <iterator>
#include <opencv2/dnn/dnn.hpp>
#include <opencv2/opencv.hpp>
// Letterbox `img` into an input_w x input_h canvas: the image is resized with its aspect
// ratio preserved (bilinear), centred along the shorter axis, and the remaining area is
// filled with the colour (128, 128, 128).
static cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float ratio_w = input_w / (img.cols * 1.0);
    const float ratio_h = input_h / (img.rows * 1.0);

    int new_w = input_w;
    int new_h = input_h;
    int off_x = 0;
    int off_y = 0;
    if (ratio_h > ratio_w) {
        // Width is the limiting side: pad top and bottom.
        new_h = ratio_w * img.rows;
        off_y = (input_h - new_h) / 2;
    } else {
        // Height is the limiting side: pad left and right.
        new_w = ratio_h * img.cols;
        off_x = (input_w - new_w) / 2;
    }

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    resized.copyTo(canvas(cv::Rect(off_x, off_y, resized.cols, resized.rows)));
    return canvas;
}

// Set up the calibrator: store the configuration, allocate a device buffer large enough for
// one batch of 3 x input_h x input_w float images, and collect the file names found in `img_dir`.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
    // Number of floats in one full batch.
    input_count_ = 3 * input_w * input_h * batchsize;

    const auto batch_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMalloc(&device_input_, batch_bytes));

    read_files_in_dir(img_dir, img_files_);
}

// Release the device buffer allocated by the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    void* const buffer = device_input_;
    CUDA_CHECK(cudaFree(buffer));
}

// Report the batch size this calibrator was constructed with.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    const int configured = batchsize_;
    return configured;
}

// Load the next `batchsize_` images, letterbox them, convert them into a single float blob
// (scaled by 1/255, R and B channels swapped) and copy it to the device buffer, which is
// then published through bindings[0].
// Returns false when fewer than `batchsize_` images remain or when an image cannot be read.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    const int batch_end = img_idx_ + batchsize_;
    if (batch_end > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> batch_imgs;
    for (int idx = img_idx_; idx < batch_end; ++idx) {
        std::cout << img_files_[idx] << "  " << idx << std::endl;
        cv::Mat raw = cv::imread(img_dir_ + img_files_[idx]);
        if (raw.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        batch_imgs.push_back(preprocess_img(raw, input_w_, input_h_));
    }
    img_idx_ = batch_end;

    const cv::Size target(input_w_, input_h_);
    cv::Mat blob = cv::dnn::blobFromImages(batch_imgs, 1.0 / 255.0, target, cv::Scalar(0, 0, 0), true, false);

    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Load the calibration table from `calib_table_name_` into `calib_cache_`.
// The file is only read when `read_cache_` is set and the stream opened successfully.
// `length` receives the number of bytes loaded; nullptr is returned when nothing was loaded.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream input(calib_table_name_, std::ios::binary);
    // Keep whitespace bytes: the table is copied verbatim.
    input >> std::noskipws;
    if (read_cache_ && input.good()) {
        std::istream_iterator<char> first(input);
        std::istream_iterator<char> end_of_stream;
        std::copy(first, end_of_stream, std::back_inserter(calib_cache_));
    }

    length = calib_cache_.size();
    if (length == 0) {
        return nullptr;
    }
    return calib_cache_.data();
}

// Write `length` bytes of calibration data to the file named `calib_table_name_` (binary mode).
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;

    const char* bytes = reinterpret_cast<const char*>(cache);
    std::ofstream output(calib_table_name_, std::ios::binary);
    output.write(bytes, length);
}


================================================
FILE: yolov9/src/model.cpp
================================================
#include "model.h"
#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include "block.h"
#include "calibrator.h"
#include "config.h"
#include "yololayer.h"

using namespace nvinfer1;
#ifdef USE_INT8
// Enable INT8 mode on `config` and attach an entropy calibrator (batch size 1) that reads
// images from gCalibTablePath and uses "int8calib.table" as its cache file.
// Asserts that the platform reports fast INT8 support.
void Calibrator(IBuilder* builder, IBuilderConfig* config) {
    const bool fast_int8 = builder->platformHasFastInt8();
    std::cout << "Your platform support int8: " << (fast_int8 ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());

    config->setFlag(BuilderFlag::kINT8);
    auto* int8_calibrator =
            new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(int8_calibrator);
}
#endif

// Build the YOLOv9-t network layer by layer and return the serialized engine.
//
// Parameters:
//   maxBatchSize - NOTE(review): not read by this function; the builder is configured
//                  with kBatchSize instead. Confirm that this is intended.
//   builder      - TensorRT builder used to create and build the network.
//   config       - builder configuration; workspace size and precision flags are set here.
//   dt           - data type of the network input tensor.
//   wts_name     - path of the .wts weight file loaded through loadWeights().
//   isConvert    - selects the detect head: true  -> DDetect with weights "model.22",
//                                           false -> DualDDetect with weights "model.29"
//                  (plus the extra "model.22".."model.28" layers built below).
//
// Returns the IHostMemory produced by buildSerializedNetwork(). The network definition is
// deleted and every buffer referenced by the weight map is free()'d before returning.
// With USE_INT8 the calibrator is allocated with `new` and is not deleted in this function.
//
// The bracketed "[-1, 1, ...]" comments below appear to be carried over from a model yaml and
// their channel numbers do not match the values passed in the calls — treat them as layer
// markers only.
IHostMemory* build_engine_yolov9_t(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt,
                                   std::string& wts_name, bool isConvert) {
    /* ------ Create the network definition (creation flags 0U) ------ */
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Input tensor: 3 x kInputH x kInputW
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW});
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    // # conv down
    auto conv_1 = convBnSiLU(network, weightMap, *data, 16, 3, 2, 1, "model.0", 1);
    // # conv down
    auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 32, 3, 2, 1, "model.1");
    // # elan-1 block
    auto repncspelan_3 = ELAN1(network, weightMap, *conv_2->getOutput(0), 32, 32, 32, 16, "model.2");
    // # avg-conv down
    // [-1, 1, ADown, [256]],  # 4-P3/8
    auto adown_4 = AConv(network, weightMap, *repncspelan_3->getOutput(0), 64, "model.3");
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]],  # 5
    auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 64, 64, 64, 32, 3, "model.4");
    // # avg-conv down
    // [-1, 1, ADown, [512]],  # 6-P4/16
    auto adown_6 = AConv(network, weightMap, *repncspelan_5->getOutput(0), 96, "model.5");
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 7
    auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 96, 96, 96, 48, 3, "model.6");
    // # avg-conv down
    // [-1, 1, ADown, [512]],  # 8-P5/32
    auto adown_8 = AConv(network, weightMap, *repncspelan_7->getOutput(0), 128, "model.7");
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 9
    auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 128, 128, 128, 64, 3, "model.8");
    // # elan-spp block
    // [-1, 1, SPPELAN, [512, 256]],  # 10
    auto sppelan_10 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 128, 128, 64, "model.9");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_11 = network->addResize(*sppelan_10->getOutput(0));
    upsample_11->setResizeMode(ResizeMode::kNEAREST);
    const float scales_11[] = {1.0, 2.0, 2.0};
    upsample_11->setScales(scales_11, 3);
    // [[-1, 7], 1, Concat, [1]],  # cat backbone P4
    ITensor* input_tensor_12[] = {upsample_11->getOutput(0), repncspelan_7->getOutput(0)};
    auto cat_12 = network->addConcatenation(input_tensor_12, 2);

    // # elan-2 block
    auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 288, 96, 96, 48, 3, "model.12");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0));
    upsample_14->setResizeMode(ResizeMode::kNEAREST);
    const float scales_14[] = {1.0, 2.0, 2.0};
    upsample_14->setScales(scales_14, 3);
    // [[-1, 5], 1, Concat, [1]],  # cat backbone P3
    ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)};
    auto cat_15 = network->addConcatenation(input_tensor_15, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [256, 256, 128, 1]],  # 16 (P3/8-small)
    auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 192, 64, 64, 32, 3, "model.15");

    // # avg-conv-down merge
    // [-1, 1, ADown, [256]],
    auto adown_17 = AConv(network, weightMap, *repncspelan_16->getOutput(0), 48, "model.16");
    // [[-1, 13], 1, Concat, [1]],  # cat head P4
    ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)};
    auto cat_18 = network->addConcatenation(input_tensor_18, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 19 (P4/16-medium)
    auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 144, 96, 96, 48, 3, "model.18");

    // # avg-conv-down merge
    // [-1, 1, ADown, [512]],
    auto adown_20 = AConv(network, weightMap, *repncspelan_19->getOutput(0), 64, "model.19");
    // [[-1, 10], 1, Concat, [1]],  # cat head P5
    ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)};
    auto cat_21 = network->addConcatenation(input_tensor_21, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 22 (P5/32-large)
    auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 256, 128, 128, 64, 3, "model.21");

    std::vector<IConcatenationLayer*> head;
    if (!isConvert) {
        // NOTE(review): the layers built in this branch up to repncspelan_29 are added to the
        // network, but none of their outputs is passed to DualDDetect below (it receives
        // repncspelan_16/19/22 only) — presumably an auxiliary branch; confirm it is needed.
        // # elan-spp block
        auto sppelan_23 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 128, 64, "model.22");

        // # up-concat merge
        auto upsample_24 = network->addResize(*sppelan_23->getOutput(0));
        upsample_24->setResizeMode(ResizeMode::kNEAREST);
        const float scales_24[] = {1.0, 2.0, 2.0};
        upsample_24->setScales(scales_24, 3);
        // [[-1, 6], 1, Concat, [1]],  # cat backbone P4
        ITensor* input_tensor_25[] = {upsample_24->getOutput(0), repncspelan_7->getOutput(0)};
        auto cat_25 = network->addConcatenation(input_tensor_25, 2);

        // # elan-2 block
        auto repncspelan_26 = RepNCSPELAN4(network, weightMap, *cat_25->getOutput(0), 384, 96, 96, 48, 3, "model.25");

        // # up-concat merge
        auto upsample_27 = network->addResize(*repncspelan_26->getOutput(0));
        upsample_27->setResizeMode(ResizeMode::kNEAREST);
        const float scales_27[] = {1.0, 2.0, 2.0};
        upsample_27->setScales(scales_27, 3);
        // [[-1, 4], 1, Concat, [1]],  # cat backbone P3
        ITensor* input_tensor_28[] = {upsample_27->getOutput(0), repncspelan_5->getOutput(0)};
        auto cat_28 = network->addConcatenation(input_tensor_28, 2);

        // # elan-2 block
        auto repncspelan_29 = RepNCSPELAN4(network, weightMap, *cat_28->getOutput(0), 256, 64, 64, 32, 3, "model.28");
        head = DualDDetect(network, weightMap, std::vector<ILayer*>{repncspelan_16, repncspelan_19, repncspelan_22},
                           kNumClass, {64, 96, 128}, "model.29");
    } else {
        head = DDetect(network, weightMap, std::vector<ILayer*>{repncspelan_16, repncspelan_19, repncspelan_22},
                       kNumClass, {64, 96, 128}, "model.22");
    }

    // Decode the three head outputs with the YoloLayer plugin (is_segmentation = false)
    // and expose its first output as the network output.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, head, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Uses the global kBatchSize rather than the maxBatchSize argument.
    builder->setMaxBatchSize(kBatchSize);
    // 16 * 2^20 = 16 MiB of builder workspace
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator =
            new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}

// Builds the YOLOv9-s network definition layer by layer, then builds and returns the serialized engine.
//
// Parameters:
//   maxBatchSize - not read by this function; the builder's max batch size is set from kBatchSize.
//   builder      - creates the network definition and builds the serialized engine.
//   config       - receives the workspace size and, depending on USE_FP16 / USE_INT8, a precision flag
//                  (and, for INT8, a calibrator).
//   dt           - data type of the input tensor.
//   wts_name     - passed to loadWeights() to obtain the name -> Weights map.
//   isConvert    - true:  the head is DDetect with weight prefix "model.22".
//                  false: extra layers with weight prefixes "model.22", "model.25" and "model.28" are added
//                         and the head is DualDDetect with weight prefix "model.29".
//
// Returns the pointer produced by IBuilder::buildSerializedNetwork(). A null result is reported on std::cerr
// and returned as-is, so callers must check it. The network definition and the loaded weight buffers are
// released before returning in either case.
//
// Local variable suffixes (_1, _2, ...) are one higher than the numeric part of the weight prefix each layer
// uses ("model.0", "model.1", ...). Bracketed comments list the values passed by the adjacent call.
//
// NOTE(review): the INT8 calibrator is allocated with new and is not released in this function.
IHostMemory* build_engine_yolov9_s(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt,
                                   std::string& wts_name, bool isConvert) {
    /* ------ Create the builder ------ */
    INetworkDefinition* network = builder->createNetworkV2(0U);

    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW});
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    // # conv down
    auto conv_1 = convBnSiLU(network, weightMap, *data, 32, 3, 2, 1, "model.0", 1);
    // # conv down
    auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 64, 3, 2, 1, "model.1");
    // # elan-1 block
    auto repncspelan_3 = ELAN1(network, weightMap, *conv_2->getOutput(0), 32, 64, 64, 32, "model.2");
    // # avg-conv down
    auto adown_4 = AConv(network, weightMap, *repncspelan_3->getOutput(0), 128, "model.3");
    // # elan-2 block
    auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 128, 128, 128, 64, 3, "model.4");
    // # avg-conv down
    auto adown_6 = AConv(network, weightMap, *repncspelan_5->getOutput(0), 192, "model.5");
    // # elan-2 block
    auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 192, 192, 192, 96, 3, "model.6");
    // # avg-conv down
    auto adown_8 = AConv(network, weightMap, *repncspelan_7->getOutput(0), 256, "model.7");
    // # elan-2 block
    auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 256, 256, 256, 128, 3, "model.8");
    // # elan-spp block
    auto sppelan_10 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 256, 128, "model.9");

    // # up-concat merge
    // Nearest-neighbour resize with scales {1, 2, 2}: the first axis keeps its size, the other two double.
    auto upsample_11 = network->addResize(*sppelan_10->getOutput(0));
    upsample_11->setResizeMode(ResizeMode::kNEAREST);
    const float scales_11[] = {1.0, 2.0, 2.0};
    upsample_11->setScales(scales_11, 3);
    // Concat with repncspelan_7  # cat backbone P4
    ITensor* input_tensor_12[] = {upsample_11->getOutput(0), repncspelan_7->getOutput(0)};
    auto cat_12 = network->addConcatenation(input_tensor_12, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [192, 192, 96, 3]],  # weights: model.12
    auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 192, 192, 192, 96, 3, "model.12");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0));
    upsample_14->setResizeMode(ResizeMode::kNEAREST);
    const float scales_14[] = {1.0, 2.0, 2.0};
    upsample_14->setScales(scales_14, 3);
    // Concat with repncspelan_5  # cat backbone P3
    ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)};
    auto cat_15 = network->addConcatenation(input_tensor_15, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [128, 128, 64, 3]],  # weights: model.15 (P3/8-small)
    auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 128, 128, 128, 64, 3, "model.15");

    // # avg-conv-down merge
    // [-1, 1, AConv, [96]],  # weights: model.16
    auto adown_17 = AConv(network, weightMap, *repncspelan_16->getOutput(0), 96, "model.16");
    // Concat with repncspelan_13  # cat head P4
    ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)};
    auto cat_18 = network->addConcatenation(input_tensor_18, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [192, 192, 96, 3]],  # weights: model.18 (P4/16-medium)
    auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 768, 192, 192, 96, 3, "model.18");

    // # avg-conv-down merge
    // [-1, 1, AConv, [128]],  # weights: model.19
    auto adown_20 = AConv(network, weightMap, *repncspelan_19->getOutput(0), 128, "model.19");
    // Concat with sppelan_10  # cat head P5
    ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)};
    auto cat_21 = network->addConcatenation(input_tensor_21, 2);

    // # elan-2 block
    // NOTE(review): every other RepNCSPELAN4 call in this function passes 3 as the last integer argument;
    // this one passes 1. Confirm against the yolov9-s model definition.
    auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 1024, 256, 256, 128, 1, "model.21");
    std::vector<IConcatenationLayer*> head;
    if (!isConvert) {
        // NOTE(review): none of the layers created in this branch (sppelan_23 .. repncspelan_29) is passed to
        // DualDDetect below; the head is fed from repncspelan_16/19/22. Presumably this mirrors the auxiliary
        // branch of the unconverted checkpoint -- confirm whether these layers are needed at all.

        // # elan-spp block
        auto sppelan_23 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 256, 128, "model.22");

        // # up-concat merge
        auto upsample_24 = network->addResize(*sppelan_23->getOutput(0));
        upsample_24->setResizeMode(ResizeMode::kNEAREST);
        const float scales_24[] = {1.0, 2.0, 2.0};
        upsample_24->setScales(scales_24, 3);
        // Concat with repncspelan_7  # cat backbone P4
        ITensor* input_tensor_25[] = {upsample_24->getOutput(0), repncspelan_7->getOutput(0)};
        auto cat_25 = network->addConcatenation(input_tensor_25, 2);

        // # elan-2 block
        auto repncspelan_26 = RepNCSPELAN4(network, weightMap, *cat_25->getOutput(0), 384, 192, 192, 96, 3, "model.25");

        // # up-concat merge
        auto upsample_27 = network->addResize(*repncspelan_26->getOutput(0));
        upsample_27->setResizeMode(ResizeMode::kNEAREST);
        const float scales_27[] = {1.0, 2.0, 2.0};
        upsample_27->setScales(scales_27, 3);
        // Concat with repncspelan_5  # cat backbone P3
        ITensor* input_tensor_28[] = {upsample_27->getOutput(0), repncspelan_5->getOutput(0)};
        auto cat_28 = network->addConcatenation(input_tensor_28, 2);

        // # elan-2 block
        auto repncspelan_29 = RepNCSPELAN4(network, weightMap, *cat_28->getOutput(0), 256, 128, 128, 64, 3, "model.28");
        head = DualDDetect(network, weightMap, std::vector<ILayer*>{repncspelan_16, repncspelan_19, repncspelan_22},
                           kNumClass, {128, 192, 256}, "model.29");
    } else {
        head = DDetect(network, weightMap, std::vector<ILayer*>{repncspelan_16, repncspelan_19, repncspelan_22},
                       kNumClass, {128, 192, 256}, "model.22");
    }

    // Decode plugin on top of the detection head; its first output is the network output.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, head, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16 MiB

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator =
            new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only claim success when the builder actually produced a serialized engine.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}
// Builds the YOLOv9-m network definition layer by layer, then builds and returns the serialized engine.
//
// Parameters:
//   maxBatchSize - not read by this function; the builder's max batch size is set from kBatchSize.
//   builder      - creates the network definition and builds the serialized engine.
//   config       - receives the workspace size and, depending on USE_FP16 / USE_INT8, a precision flag
//                  (and, for INT8, a calibrator).
//   dt           - data type of the input tensor.
//   wts_name     - passed to loadWeights() to obtain the name -> Weights map.
//   isConvert    - true:  weight prefixes start at "model.0" and the head is DDetect ("model.22").
//                  false: weight prefixes start at "model.1", the second branch (CBLinear / CBFuse,
//                         "model.23" .. "model.37") is added and the head is DualDDetect ("model.38").
//
// Returns the pointer produced by IBuilder::buildSerializedNetwork(). A null result is reported on std::cerr
// and returned as-is, so callers must check it. The network definition and the loaded weight buffers are
// released before returning in either case.
//
// Weight prefixes are generated from the running counter `begin`. Increments larger than 1 step over layers
// that are built without a weight lookup here (resize, concatenation, CBFuse). The "# N" indices in the
// comments, like the local variable suffixes, correspond to the isConvert == false numbering. Bracketed
// comments list the values passed by the adjacent call.
//
// NOTE(review): the INT8 calibrator is allocated with new and is not released in this function.
IHostMemory* build_engine_yolov9_m(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt,
                                   std::string& wts_name, bool isConvert) {
    /* ------ Create the builder ------ */
    INetworkDefinition* network = builder->createNetworkV2(0U);

    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW});
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    // Index of the first weighted layer: 0 for converted weights, 1 otherwise.
    int begin = isConvert ? 0 : 1;

    // # conv down
    // [-1, 1, Conv, [32, 3, 2]],  # 1-P1/2
    auto conv_1 = convBnSiLU(network, weightMap, *data, 32, 3, 2, 1, "model." + std::to_string(begin), 1);
    begin += 1;
    // # conv down
    // [-1, 1, Conv, [64, 3, 2]],  # 2-P2/4
    auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 64, 3, 2, 1, "model." + std::to_string(begin));
    begin += 1;
    // # elan-1 block
    // [-1, 1, RepNCSPELAN4, [128, 128, 64, 1]],  # 3
    auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 128, 128, 64, 1,
                                      "model." + std::to_string(begin));
    begin += 1;
    // # avg-conv down
    // [-1, 1, AConv, [240]],  # 4-P3/8
    auto adown_4 = AConv(network, weightMap, *repncspelan_3->getOutput(0), 240, "model." + std::to_string(begin));
    begin += 1;
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [240, 240, 120, 1]],  # 5
    auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 256, 240, 240, 120, 1,
                                      "model." + std::to_string(begin));
    begin += 1;
    // # avg-conv down
    // [-1, 1, AConv, [360]],  # 6-P4/16
    auto adown_6 = AConv(network, weightMap, *repncspelan_5->getOutput(0), 360, "model." + std::to_string(begin));
    begin += 1;
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [360, 360, 180, 1]],  # 7
    auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 360, 360, 180, 1,
                                      "model." + std::to_string(begin));
    begin += 1;
    // # avg-conv down
    // [-1, 1, AConv, [480]],  # 8-P5/32
    auto adown_8 = AConv(network, weightMap, *repncspelan_7->getOutput(0), 480, "model." + std::to_string(begin));
    begin += 1;
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [480, 480, 240, 1]],  # 9
    auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 480, 480, 240, 1,
                                      "model." + std::to_string(begin));
    begin += 1;
    // # elan-spp block
    // [-1, 1, SPPELAN, [480, 240]],  # 10
    auto sppelan_10 =
            SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 480, 240, "model." + std::to_string(begin));
    begin += 3;  // steps over the resize and concatenation layers below

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_11 = network->addResize(*sppelan_10->getOutput(0));
    upsample_11->setResizeMode(ResizeMode::kNEAREST);
    const float scales_11[] = {1.0, 2.0, 2.0};
    upsample_11->setScales(scales_11, 3);
    // [[-1, 7], 1, Concat, [1]],  # cat backbone P4
    ITensor* input_tensor_12[] = {upsample_11->getOutput(0), repncspelan_7->getOutput(0)};
    auto cat_12 = network->addConcatenation(input_tensor_12, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [360, 360, 180, 1]],  # 13
    auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 1536, 360, 360, 180, 1,
                                       "model." + std::to_string(begin));
    begin += 3;  // steps over the resize and concatenation layers below

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0));
    upsample_14->setResizeMode(ResizeMode::kNEAREST);
    const float scales_14[] = {1.0, 2.0, 2.0};
    upsample_14->setScales(scales_14, 3);
    // [[-1, 5], 1, Concat, [1]],  # cat backbone P3
    ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)};
    auto cat_15 = network->addConcatenation(input_tensor_15, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [240, 240, 120, 1]],  # 16 (P3/8-small)
    auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 1024, 240, 240, 120, 1,
                                       "model." + std::to_string(begin));
    begin += 1;

    // # avg-conv-down merge
    // [-1, 1, AConv, [184]],  # 17
    auto adown_17 = AConv(network, weightMap, *repncspelan_16->getOutput(0), 184, "model." + std::to_string(begin));
    begin += 2;  // steps over the concatenation layer below
    // [[-1, 13], 1, Concat, [1]],  # cat head P4
    ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)};
    auto cat_18 = network->addConcatenation(input_tensor_18, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [360, 360, 180, 1]],  # 19 (P4/16-medium)
    auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 768, 360, 360, 180, 1,
                                       "model." + std::to_string(begin));
    begin += 1;

    // # avg-conv-down merge
    // [-1, 1, AConv, [240]],  # 20
    auto adown_20 = AConv(network, weightMap, *repncspelan_19->getOutput(0), 240, "model." + std::to_string(begin));
    begin += 2;  // steps over the concatenation layer below
    // [[-1, 10], 1, Concat, [1]],  # cat head P5
    ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)};
    auto cat_21 = network->addConcatenation(input_tensor_21, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [480, 480, 240, 1]],  # 22 (P5/32-large)
    auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 1024, 480, 480, 240, 1,
                                       "model." + std::to_string(begin));
    begin += 1;
    std::vector<IConcatenationLayer*> head;
    if (!isConvert) {
        // # routing
        // [5, 1, CBLinear, [[240]]], # 23
        auto cblinear_23 = CBLinear(network, weightMap, *repncspelan_5->getOutput(0), {240}, 1, 1, 0, 1,
                                    "model." + std::to_string(begin));
        begin += 1;
        // [7, 1, CBLinear, [[240, 360]]], # 24
        auto cblinear_24 = CBLinear(network, weightMap, *repncspelan_7->getOutput(0), {240, 360}, 1, 1, 0, 1,
                                    "model." + std::to_string(begin));
        begin += 1;
        // [9, 1, CBLinear, [[240, 360, 480]]], # 25
        auto cblinear_25 = CBLinear(network, weightMap, *repncspelan_9->getOutput(0), {240, 360, 480}, 1, 1, 0, 1,
                                    "model." + std::to_string(begin));
        begin += 1;

        // # conv down
        // [0, 1, Conv, [32, 3, 2]],  # 26-P1/2
        auto conv_26 = convBnSiLU(network, weightMap, *data, 32, 3, 2, 1, "model." + std::to_string(begin), 1);
        begin += 1;

        // # conv down
        // [-1, 1, Conv, [64, 3, 2]],  # 27-P2/4
        auto conv_27 =
                convBnSiLU(network, weightMap, *conv_26->getOutput(0), 64, 3, 2, 1, "model." + std::to_string(begin));
        begin += 1;

        // # elan-1 block
        // [-1, 1, RepNCSPELAN4, [128, 128, 64, 1]],  # 28
        auto repncspelan_28 = RepNCSPELAN4(network, weightMap, *conv_27->getOutput(0), 128, 128, 128, 64, 1,
                                           "model." + std::to_string(begin));
        begin += 1;

        // # avg-conv down fuse
        // [-1, 1, AConv, [240]],  # 29-P3/8
        auto adown_29 = AConv(network, weightMap, *repncspelan_28->getOutput(0), 240, "model." + std::to_string(begin));
        begin += 2;  // steps over the CBFuse layer below
        // [[23, 24, 25, -1], 1, CBFuse, [[0, 0, 0]]], # 30
        auto cbfuse = CBFuse(network, {cblinear_23, cblinear_24, cblinear_25, std::vector<ILayer*>{adown_29}},
                             {0, 0, 0, 0}, {8, 16, 32, 8});

        // # elan-2 block
        // [-1, 1, RepNCSPELAN4, [240, 240, 120, 1]],  # 31
        auto repncspelan_31 = RepNCSPELAN4(network, weightMap, *cbfuse->getOutput(0), 256, 240, 240, 120, 1,
                                           "model." + std::to_string(begin));
        begin += 1;

        // # avg-conv down fuse
        // [-1, 1, AConv, [360]],  # 32-P4/16
        auto adown_32 = AConv(network, weightMap, *repncspelan_31->getOutput(0), 360, "model." + std::to_string(begin));
        begin += 2;  // steps over the CBFuse layer below
        // [[24, 25, -1], 1, CBFuse, [[1, 1]]], # 33
        auto cbfuse_33 =
                CBFuse(network, {cblinear_24, cblinear_25, std::vector<ILayer*>{adown_32}}, {1, 1, 0}, {16, 32, 16});

        // # elan-2 block
        // [-1, 1, RepNCSPELAN4, [360, 360, 180, 1]],  # 34
        auto repncspelan_34 = RepNCSPELAN4(network, weightMap, *cbfuse_33->getOutput(0), 512, 360, 360, 180, 1,
                                           "model." + std::to_string(begin));
        begin += 1;

        // # avg-conv down fuse
        // [-1, 1, AConv, [480]],  # 35-P5/32
        auto adown_35 = AConv(network, weightMap, *repncspelan_34->getOutput(0), 480, "model." + std::to_string(begin));
        begin += 2;  // steps over the CBFuse layer below

        // [[25, -1], 1, CBFuse, [[2]]], # 36
        auto cbfuse_36 = CBFuse(network, {cblinear_25, std::vector<ILayer*>{adown_35}}, {2, 0}, {32, 32});

        // # elan-2 block
        // [-1, 1, RepNCSPELAN4, [480, 480, 240, 1]],  # 37
        auto repncspelan_37 = RepNCSPELAN4(network, weightMap, *cbfuse_36->getOutput(0), 512, 480, 480, 240, 1,
                                           "model." + std::to_string(begin));
        begin += 1;

        // # detection head
        // # detect
        // DualDDetect is given layers 31, 34 and 37 only; weights are read under "model.38".
        head = DualDDetect(network, weightMap, std::vector<ILayer*>{repncspelan_31, repncspelan_34, repncspelan_37},
                           kNumClass, {240, 360, 480}, "model." + std::to_string(begin));
    } else {
        // # detection head
        // # detect
        // DDetect is given repncspelan_16, repncspelan_19 and repncspelan_22; weights are read under "model.22".
        head = DDetect(network, weightMap, std::vector<ILayer*>{repncspelan_16, repncspelan_19, repncspelan_22},
                       kNumClass, {240, 360, 480}, "model." + std::to_string(begin));
    }

    // Decode plugin on top of the detection head; its first output is the network output.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, head, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16 MiB

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator =
            new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only claim success when the builder actually produced a serialized engine.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}

// Builds the YOLOv9-c network definition layer by layer, then builds and returns the serialized engine.
//
// Parameters:
//   maxBatchSize - not read by this function; the builder's max batch size is set from kBatchSize.
//   builder      - creates the network definition and builds the serialized engine.
//   config       - receives the workspace size and, depending on USE_FP16 / USE_INT8, a precision flag
//                  (and, for INT8, a calibrator).
//   dt           - data type of the input tensor.
//   wts_name     - passed to loadWeights() to obtain the name -> Weights map.
//
// Returns the pointer produced by IBuilder::buildSerializedNetwork(). A null result is reported on std::cerr
// and returned as-is, so callers must check it. The network definition and the loaded weight buffers are
// released before returning in either case.
//
// Local variable suffixes match the numeric part of the weight prefix each layer uses ("model.1" ..
// "model.38") and the "# N" indices in the comments.
//
// NOTE(review): the INT8 calibrator is allocated with new and is not released in this function.
IHostMemory* build_engine_yolov9_c(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt,
                                   std::string& wts_name) {
    /* ------ Create the builder ------ */
    INetworkDefinition* network = builder->createNetworkV2(0U);

    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW});
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    // # conv down
    // [-1, 1, Conv, [64, 3, 2]],  # 1-P1/2
    auto conv_1 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.1", 1);
    // # conv down
    // [-1, 1, Conv, [128, 3, 2]],  # 2-P2/4
    auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 128, 3, 2, 1, "model.2");
    // # elan-1 block
    // [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]],  # 3
    auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 256, 128, 64, 1, "model.3");
    // # avg-conv down
    // [-1, 1, ADown, [256]],  # 4-P3/8
    auto adown_4 = ADown(network, weightMap, *repncspelan_3->getOutput(0), 256, "model.4");
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]],  # 5
    auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 256, 512, 256, 128, 1, "model.5");
    // # avg-conv down
    // [-1, 1, ADown, [512]],  # 6-P4/16
    auto adown_6 = ADown(network, weightMap, *repncspelan_5->getOutput(0), 512, "model.6");
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 7
    auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 512, 512, 256, 1, "model.7");
    // # avg-conv down
    // [-1, 1, ADown, [512]],  # 8-P5/32
    auto adown_8 = ADown(network, weightMap, *repncspelan_7->getOutput(0), 512, "model.8");
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 9
    auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 512, 512, 256, 1, "model.9");
    // # elan-spp block
    // [-1, 1, SPPELAN, [512, 256]],  # 10
    auto sppelan_10 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 512, 256, "model.10");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    // Nearest-neighbour resize with scales {1, 2, 2}: the first axis keeps its size, the other two double.
    auto upsample_11 = network->addResize(*sppelan_10->getOutput(0));
    upsample_11->setResizeMode(ResizeMode::kNEAREST);
    const float scales_11[] = {1.0, 2.0, 2.0};
    upsample_11->setScales(scales_11, 3);
    // [[-1, 7], 1, Concat, [1]],  # cat backbone P4
    ITensor* input_tensor_12[] = {upsample_11->getOutput(0), repncspelan_7->getOutput(0)};
    auto cat_12 = network->addConcatenation(input_tensor_12, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 13
    auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 1536, 512, 512, 256, 1, "model.13");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0));
    upsample_14->setResizeMode(ResizeMode::kNEAREST);
    const float scales_14[] = {1.0, 2.0, 2.0};
    upsample_14->setScales(scales_14, 3);
    // [[-1, 5], 1, Concat, [1]],  # cat backbone P3
    ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)};
    auto cat_15 = network->addConcatenation(input_tensor_15, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [256, 256, 128, 1]],  # 16 (P3/8-small)
    auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 1024, 256, 256, 128, 1, "model.16");

    // # avg-conv-down merge
    // [-1, 1, ADown, [256]],
    auto adown_17 = ADown(network, weightMap, *repncspelan_16->getOutput(0), 256, "model.17");
    // [[-1, 13], 1, Concat, [1]],  # cat head P4
    ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)};
    auto cat_18 = network->addConcatenation(input_tensor_18, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 19 (P4/16-medium)
    auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 768, 512, 512, 256, 1, "model.19");

    // # avg-conv-down merge
    // [-1, 1, ADown, [512]],
    auto adown_20 = ADown(network, weightMap, *repncspelan_19->getOutput(0), 512, "model.20");
    // [[-1, 10], 1, Concat, [1]],  # cat head P5
    ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)};
    auto cat_21 = network->addConcatenation(input_tensor_21, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 22 (P5/32-large)
    auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 1024, 512, 512, 256, 1, "model.22");

    // # multi-level reversible auxiliary branch

    // # routing
    // [5, 1, CBLinear, [[256]]], # 23
    auto cblinear_23 = CBLinear(network, weightMap, *repncspelan_5->getOutput(0), {256}, 1, 1, 0, 1, "model.23");
    // [7, 1, CBLinear, [[256, 512]]], # 24
    auto cblinear_24 = CBLinear(network, weightMap, *repncspelan_7->getOutput(0), {256, 512}, 1, 1, 0, 1, "model.24");
    // [9, 1, CBLinear, [[256, 512, 512]]], # 25
    auto cblinear_25 =
            CBLinear(network, weightMap, *repncspelan_9->getOutput(0), {256, 512, 512}, 1, 1, 0, 1, "model.25");

    // # conv down
    // [0, 1, Conv, [64, 3, 2]],  # 26-P1/2
    auto conv_26 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.26", 1);

    // # conv down
    // [-1, 1, Conv, [128, 3, 2]],  # 27-P2/4
    auto conv_27 = convBnSiLU(network, weightMap, *conv_26->getOutput(0), 128, 3, 2, 1, "model.27");

    // # elan-1 block
    // [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]],  # 28
    auto repncspelan_28 = RepNCSPELAN4(network, weightMap, *conv_27->getOutput(0), 128, 256, 128, 64, 1, "model.28");

    // # avg-conv down fuse
    // [-1, 1, ADown, [256]],  # 29-P3/8
    auto adown_29 = ADown(network, weightMap, *repncspelan_28->getOutput(0), 256, "model.29");
    // [[23, 24, 25, -1], 1, CBFuse, [[0, 0, 0]]], # 30
    auto cbfuse = CBFuse(network, {cblinear_23, cblinear_24, cblinear_25, std::vector<ILayer*>{adown_29}}, {0, 0, 0, 0},
                         {8, 16, 32, 8});

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]],  # 31
    auto repncspelan_31 = RepNCSPELAN4(network, weightMap, *cbfuse->getOutput(0), 256, 512, 256, 128, 1, "model.31");

    // # avg-conv down fuse
    // [-1, 1, ADown, [512]],  # 32-P4/16
    auto adown_32 = ADown(network, weightMap, *repncspelan_31->getOutput(0), 512, "model.32");
    // [[24, 25, -1], 1, CBFuse, [[1, 1]]], # 33
    auto cbfuse_33 =
            CBFuse(network, {cblinear_24, cblinear_25, std::vector<ILayer*>{adown_32}}, {1, 1, 0}, {16, 32, 16});

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 34
    auto repncspelan_34 = RepNCSPELAN4(network, weightMap, *cbfuse_33->getOutput(0), 512, 512, 512, 256, 1, "model.34");

    // # avg-conv down fuse
    // [-1, 1, ADown, [512]],  # 35-P5/32
    auto adown_35 = ADown(network, weightMap, *repncspelan_34->getOutput(0), 512, "model.35");

    // [[25, -1], 1, CBFuse, [[2]]], # 36
    auto cbfuse_36 = CBFuse(network, {cblinear_25, std::vector<ILayer*>{adown_35}}, {2, 0}, {32, 32});

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 37
    auto repncspelan_37 = RepNCSPELAN4(network, weightMap, *cbfuse_36->getOutput(0), 512, 512, 512, 256, 1, "model.37");

    // # detection head
    // # detect
    // [[31, 34, 37, 16, 19, 22], 1, DualDDetect, [nc]],  # DualDDetect(A3, A4, A5, P3, P4, P5)
    // Only layers 31, 34 and 37 are handed to DualDDetect here.
    auto dualddetect_38 =
            DualDDetect(network, weightMap, std::vector<ILayer*>{repncspelan_31, repncspelan_34, repncspelan_37},
                        kNumClass, {512, 512, 512}, "model.38");

    // Decode plugin on top of the detection head; its first output is the network output.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, dualddetect_38, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16 MiB

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    auto* calibrator =
            new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // Only claim success when the builder actually produced a serialized engine.
    if (serialized_model != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}

// Builds the network definition for the "yolov9_e" model variant and returns it serialized.
//
// The network is assembled layer by layer; the YAML-style comment above each call quotes the
// model-config entry that the call is meant to reproduce. Weights are obtained from
// loadWeights(wts_name) and looked up by the "model.N" prefixes passed to the block helpers.
//
// Parameters:
//   maxBatchSize - not read anywhere in this function; the batch size handed to the builder
//                  is the constant kBatchSize.
//   builder      - used to create the network definition and to build the serialized network.
//   config       - builder configuration; this function sets its workspace size and, depending
//                  on USE_FP16 / USE_INT8, its precision flag and INT8 calibrator.
//   dt           - data type of the input tensor.
//   wts_name     - forwarded to loadWeights() (presumably the path of a .wts file - confirm).
//
// Returns the pointer produced by builder->buildSerializedNetwork(); it is not checked for
// null here. Before returning, the network definition is deleted and every value buffer in
// the weight map is released with free().
IHostMemory* build_engine_yolov9_e(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt,
                                   std::string& wts_name) {
    /* ------ Create the network definition (creation flags = 0U) ------ */
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Single input tensor of dimensions 3 x kInputH x kInputW.
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW});
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    /* ------backbone------ */
    // [-1, 1, Conv, [64, 3, 2]],  # 1-P1/2
    auto conv_1 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.1", 1);
    assert(conv_1);
    // [-1, 1, Conv, [128, 3, 2]],  # 2-P2/4
    auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 128, 3, 2, 1, "model.2");
    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]],  # 3
    auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 256, 128, 64, 2, "model.3");
    // avg-conv down
    // [-1, 1, ADown, [256]],  # 4-P3/8
    auto adown_4 = ADown(network, weightMap, *repncspelan_3->getOutput(0), 256, "model.4");
    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]],  # 5
    auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 256, 512, 256, 128, 2, "model.5");
    // avg-conv down
    // [-1, 1, ADown, [512]],  # 6-P4/16
    auto adown_6 = ADown(network, weightMap, *repncspelan_5->getOutput(0), 512, "model.6");
    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]],  # 7
    auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 1024, 512, 256, 2, "model.7");
    // avg-conv down
    // [-1, 1, ADown, [1024]],  # 8-P5/32
    auto adown_8 = ADown(network, weightMap, *repncspelan_7->getOutput(0), 1024, "model.8");
    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]],  # 9
    auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 1024, 512, 256, 2, "model.9");

    // CBLinear branches taken from conv_1 and from backbone layers 3, 5, 7 and 9; they are
    // consumed by the CBFuse calls below.
    // [1, 1, CBLinear, [[64]]], # 10
    auto cblinear_10 = CBLinear(network, weightMap, *conv_1->getOutput(0), {64}, 1, 1, 0, 1, "model.10");
    // [3, 1, CBLinear, [[64, 128]]], # 11
    auto cblinear_11 = CBLinear(network, weightMap, *repncspelan_3->getOutput(0), {64, 128}, 1, 1, 0, 1, "model.11");
    // [5, 1, CBLinear, [[64, 128, 256]]], # 12
    auto cblinear_12 =
            CBLinear(network, weightMap, *repncspelan_5->getOutput(0), {64, 128, 256}, 1, 1, 0, 1, "model.12");
    // [7, 1, CBLinear, [[64, 128, 256, 512]]], # 13
    auto cblinear_13 =
            CBLinear(network, weightMap, *repncspelan_7->getOutput(0), {64, 128, 256, 512}, 1, 1, 0, 1, "model.13");
    // [9, 1, CBLinear, [[64, 128, 256, 512, 1024]]], # 14
    auto cblinear_14 = CBLinear(network, weightMap, *repncspelan_9->getOutput(0), {64, 128, 256, 512, 1024}, 1, 1, 0, 1,
                                "model.14");

    // conv down (second pass over the raw input tensor)
    // [0, 1, Conv, [64, 3, 2]],  # 15-P1/2
    auto conv_15 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.15", 1);
    // [[10, 11, 12, 13, 14, -1], 1, CBFuse, [[0, 0, 0, 0, 0]]], # 16
    auto cbfuse_16 = CBFuse(
            network, {cblinear_10, cblinear_11, cblinear_12, cblinear_13, cblinear_14, std::vector<ILayer*>{conv_15}},
            {0, 0, 0, 0, 0, 0}, {2, 4, 8, 16, 32, 2});

    // conv down
    // [-1, 1, Conv, [128, 3, 2]],  # 17-P2/4
    auto conv_17 = convBnSiLU(network, weightMap, *cbfuse_16->getOutput(0), 128, 3, 2, 1, "model.17");
    // [[11, 12, 13, 14, -1], 1, CBFuse, [[1, 1, 1, 1]]], # 18
    auto cbfuse_18 =
            CBFuse(network, {cblinear_11, cblinear_12, cblinear_13, cblinear_14, std::vector<ILayer*>{conv_17}},
                   {1, 1, 1, 1, 0}, {4, 8, 16, 32, 4});

    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]],  # 19
    auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cbfuse_18->getOutput(0), 128, 256, 128, 64, 2, "model.19");

    // avg-conv down fuse
    // [-1, 1, ADown, [256]],  # 20-P3/8
    auto adown_20 = ADown(network, weightMap, *repncspelan_19->getOutput(0), 256, "model.20");
    // [[12, 13, 14, -1], 1, CBFuse, [[2, 2, 2]]], # 21
    auto cbfuse_21 = CBFuse(network, {cblinear_12, cblinear_13, cblinear_14, std::vector<ILayer*>{adown_20}},
                            {2, 2, 2, 0}, {8, 16, 32, 8});

    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]],  # 22
    auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cbfuse_21->getOutput(0), 256, 512, 256, 128, 2, "model.22");

    // avg-conv down fuse
    // [-1, 1, ADown, [512]],  # 23-P4/16
    auto adown_23 = ADown(network, weightMap, *repncspelan_22->getOutput(0), 512, "model.23");
    // [[13, 14, -1], 1, CBFuse, [[3, 3]]], # 24
    auto cbfuse_24 =
            CBFuse(network, {cblinear_13, cblinear_14, std::vector<ILayer*>{adown_23}}, {3, 3, 0}, {16, 32, 16});

    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]],  # 25
    auto repncspelan_25 =
            RepNCSPELAN4(network, weightMap, *cbfuse_24->getOutput(0), 512, 1024, 512, 256, 2, "model.25");

    // avg-conv down fuse
    // [-1, 1, ADown, [1024]],  # 26-P5/32
    auto adown_26 = ADown(network, weightMap, *repncspelan_25->getOutput(0), 1024, "model.26");
    // [[14, -1], 1, CBFuse, [[4]]], # 27
    auto cbfuse_27 = CBFuse(network, {cblinear_14, std::vector<ILayer*>{adown_26}}, {4, 0}, {32, 32});

    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]],  # 28
    auto repncspelan_28 =
            RepNCSPELAN4(network, weightMap, *cbfuse_27->getOutput(0), 512, 1024, 512, 256, 2, "model.28");

    // elan-spp block (fed from backbone layer 9)
    // [9, 1, SPPELAN, [512, 256]],  # 29
    auto sppelan_29 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 1024, 512, 256, "model.29");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    // Nearest-neighbour resize with scales {1, 2, 2}: first dimension unchanged, the other two doubled.
    auto upsample_30 = network->addResize(*sppelan_29->getOutput(0));
    upsample_30->setResizeMode(ResizeMode::kNEAREST);
    const float scales_30[] = {1.0, 2.0, 2.0};
    upsample_30->setScales(scales_30, 3);
    // [[-1, 7], 1, Concat, [1]],  # cat backbone P4
    ITensor* input_tensor_31[] = {upsample_30->getOutput(0), repncspelan_7->getOutput(0)};
    auto cat_31 = network->addConcatenation(input_tensor_31, 2);

    // # csp-elan block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]],  # 32
    auto repncspelan_32 = RepNCSPELAN4(network, weightMap, *cat_31->getOutput(0), 1536, 512, 512, 256, 2, "model.32");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_33 = network->addResize(*repncspelan_32->getOutput(0));
    upsample_33->setResizeMode(ResizeMode::kNEAREST);
    const float scales_33[] = {1.0, 2.0, 2.0};
    upsample_33->setScales(scales_33, 3);
    // [[-1, 5], 1, Concat, [1]],  # cat backbone P3
    ITensor* input_tensor_34[] = {upsample_33->getOutput(0), repncspelan_5->getOutput(0)};
    auto cat_34 = network->addConcatenation(input_tensor_34, 2);

    // # csp-elan block
    // [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]],  # 35
    auto repncspelan_35 = RepNCSPELAN4(network, weightMap, *cat_34->getOutput(0), 1024, 256, 256, 128, 2, "model.35");

    // # elan-spp block (fed from layer 28)
    // [28, 1, SPPELAN, [512, 256]],  # 36
    auto sppelan_36 = SPPELAN(network, weightMap, *repncspelan_28->getOutput(0), 1024, 512, 256, "model.36");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_37 = network->addResize(*sppelan_36->getOutput(0));
    upsample_37->setResizeMode(ResizeMode::kNEAREST);
    const float scales_37[] = {1.0, 2.0, 2.0};
    upsample_37->setScales(scales_37, 3);
    // [[-1, 25], 1, Concat, [1]],  # cat backbone P4
    ITensor* input_tensor_38[] = {upsample_37->getOutput(0), repncspelan_25->getOutput(0)};
    auto cat_38 = network->addConcatenation(input_tensor_38, 2);

    // # csp-elan block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]],  # 39
    auto repncspelan_39 = RepNCSPELAN4(network, weightMap, *cat_38->getOutput(0), 1536, 512, 512, 256, 2, "model.39");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_40 = network->addResize(*repncspelan_39->getOutput(0));
    upsample_40->setResizeMode(ResizeMode::kNEAREST);
    const float scales_40[] = {1.0, 2.0, 2.0};
    upsample_40->setScales(scales_40, 3);
    // [[-1, 22], 1, Concat, [1]],  # cat backbone P3
    ITensor* input_tensor_41[] = {upsample_40->getOutput(0), repncspelan_22->getOutput(0)};
    auto cat_41 = network->addConcatenation(input_tensor_41, 2);

    // # csp-elan block
    // [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]],  # 42 (P3/8-small)
    auto repncspelan_42 = RepNCSPELAN4(network, weightMap, *cat_41->getOutput(0), 1024, 256, 256, 128, 2, "model.42");
    // # avg-conv-down merge
    // [-1, 1, ADown, [256]],
    auto adown_43 = ADown(network, weightMap, *repncspelan_42->getOutput(0), 256, "model.43");
    // [[-1, 39], 1, Concat, [1]],  # cat head P4
    ITensor* input_tensor_44[] = {adown_43->getOutput(0), repncspelan_39->getOutput(0)};
    auto cat_44 = network->addConcatenation(input_tensor_44, 2);

    // # csp-elan block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]],  # 45 (P4/16-medium)
    auto repncspelan_45 = RepNCSPELAN4(network, weightMap, *cat_44->getOutput(0), 768, 512, 512, 256, 2, "model.45");
    // # avg-conv-down merge
    // [-1, 1, ADown, [512]],
    auto adown_46 = ADown(network, weightMap, *repncspelan_45->getOutput(0), 512, "model.46");
    // [[-1, 36], 1, Concat, [1]],  # cat head P5
    ITensor* input_tensor_47[] = {adown_46->getOutput(0), sppelan_36->getOutput(0)};
    auto cat_47 = network->addConcatenation(input_tensor_47, 2);

    // # csp-elan block
    // [-1, 1, RepNCSPELAN4, [512, 1024, 512, 2]],  # 48 (P5/32-large)
    auto repncspelan_48 = RepNCSPELAN4(network, weightMap, *cat_47->getOutput(0), 1024, 512, 1024, 512, 2, "model.48");

    // Detection head.
    // NOTE(review): the head below is fed repncspelan_35 / repncspelan_32 / sppelan_29. The
    // layers repncspelan_42 / repncspelan_45 / repncspelan_48 built above are not passed to it
    // (the commented-out call shows the alternative wiring), and repncspelan_48 is not consumed
    // by any later layer in this function. Confirm this selection is intentional.
    // auto DualDDetect_49 = DualDDetect(network, weightMap, std::vector<ILayer*>{RepNCSPELAN_42, RepNCSPELAN_45, RepNCSPELAN_48}, kNumClass, {256, 512, 512}, "model.49");
    auto dualddetect_49 =
            DualDDetect(network, weightMap, std::vector<ILayer*>{repncspelan_35, repncspelan_32, sppelan_29}, kNumClass,
                        {256, 512, 512}, "model.49");

    // The plugin layer's first output is named kOutputTensorName and is the only network output.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, dualddetect_49, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Builder limits: the batch size comes from kBatchSize (the maxBatchSize parameter is not
    // used); the workspace limit is 16 * 2^20 (16 MiB, assuming the API unit is bytes).
    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));

    // Precision is selected at compile time: FP16 if USE_FP16 is defined, otherwise INT8 with an
    // entropy calibrator if USE_INT8 is defined, otherwise no precision flag is set.
#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is heap-allocated and never deleted in this function.
    auto* calibrator =
            new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    // NOTE(review): the result is not checked, so the success message below is printed
    // regardless of what buildSerializedNetwork() returned.
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}

// Builds the network definition for the "gelan_e" model variant and returns it serialized.
//
// The network is assembled layer by layer; the YAML-style comment above each call quotes the
// model-config entry that the call is meant to reproduce. Weights are obtained from
// loadWeights(wts_name) and looked up by the "model.N" prefixes passed to the block helpers.
// The head is a single DDetect over repncspelan_35 / repncspelan_38 / repncspelan_41.
//
// Parameters:
//   maxBatchSize - not read anywhere in this function; the batch size handed to the builder
//                  is the constant kBatchSize.
//   builder      - used to create the network definition and to build the serialized network.
//   config       - builder configuration; this function sets its workspace size and, depending
//                  on USE_FP16 / USE_INT8, its precision flag and INT8 calibrator.
//   dt           - data type of the input tensor.
//   wts_name     - forwarded to loadWeights() (presumably the path of a .wts file - confirm).
//
// Returns the pointer produced by builder->buildSerializedNetwork(); it is not checked for
// null here. Before returning, the network definition is deleted and every value buffer in
// the weight map is released with free().
IHostMemory* build_engine_gelan_e(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt,
                                  std::string& wts_name) {
    /* ------ Create the network definition (creation flags = 0U) ------ */
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Single input tensor of dimensions 3 x kInputH x kInputW.
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW});
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    /* ------backbone------ */
    // [-1, 1, Conv, [64, 3, 2]],  # 1-P1/2
    auto conv_1 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.1", 1);
    assert(conv_1);
    // [-1, 1, Conv, [128, 3, 2]],  # 2-P2/4
    auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 128, 3, 2, 1, "model.2");
    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]],  # 3
    auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 256, 128, 64, 2, "model.3");
    // avg-conv down
    // [-1, 1, ADown, [256]],  # 4-P3/8
    auto adown_4 = ADown(network, weightMap, *repncspelan_3->getOutput(0), 256, "model.4");
    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]],  # 5
    auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 256, 512, 256, 128, 2, "model.5");
    // avg-conv down
    // [-1, 1, ADown, [512]],  # 6-P4/16
    auto adown_6 = ADown(network, weightMap, *repncspelan_5->getOutput(0), 512, "model.6");
    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]],  # 7
    auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 1024, 512, 256, 2, "model.7");
    // avg-conv down
    // [-1, 1, ADown, [1024]],  # 8-P5/32
    auto adown_8 = ADown(network, weightMap, *repncspelan_7->getOutput(0), 1024, "model.8");
    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]],  # 9
    auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 1024, 512, 256, 2, "model.9");

    // CBLinear branches taken from conv_1 and from backbone layers 3, 5, 7 and 9; they are
    // consumed by the CBFuse calls below.
    // [1, 1, CBLinear, [[64]]], # 10
    auto cblinear_10 = CBLinear(network, weightMap, *conv_1->getOutput(0), {64}, 1, 1, 0, 1, "model.10");
    // [3, 1, CBLinear, [[64, 128]]], # 11
    auto cblinear_11 = CBLinear(network, weightMap, *repncspelan_3->getOutput(0), {64, 128}, 1, 1, 0, 1, "model.11");
    // [5, 1, CBLinear, [[64, 128, 256]]], # 12
    auto cblinear_12 =
            CBLinear(network, weightMap, *repncspelan_5->getOutput(0), {64, 128, 256}, 1, 1, 0, 1, "model.12");
    // [7, 1, CBLinear, [[64, 128, 256, 512]]], # 13
    auto cblinear_13 =
            CBLinear(network, weightMap, *repncspelan_7->getOutput(0), {64, 128, 256, 512}, 1, 1, 0, 1, "model.13");
    // [9, 1, CBLinear, [[64, 128, 256, 512, 1024]]], # 14
    auto cblinear_14 = CBLinear(network, weightMap, *repncspelan_9->getOutput(0), {64, 128, 256, 512, 1024}, 1, 1, 0, 1,
                                "model.14");

    // conv down (second pass over the raw input tensor)
    // [0, 1, Conv, [64, 3, 2]],  # 15-P1/2
    auto conv_15 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.15", 1);
    // [[10, 11, 12, 13, 14, -1], 1, CBFuse, [[0, 0, 0, 0, 0]]], # 16
    auto cbfuse_16 = CBFuse(
            network, {cblinear_10, cblinear_11, cblinear_12, cblinear_13, cblinear_14, std::vector<ILayer*>{conv_15}},
            {0, 0, 0, 0, 0, 0}, {2, 4, 8, 16, 32, 2});

    // conv down
    // [-1, 1, Conv, [128, 3, 2]],  # 17-P2/4
    auto conv_17 = convBnSiLU(network, weightMap, *cbfuse_16->getOutput(0), 128, 3, 2, 1, "model.17");
    // [[11, 12, 13, 14, -1], 1, CBFuse, [[1, 1, 1, 1]]], # 18
    auto cbfuse_18 =
            CBFuse(network, {cblinear_11, cblinear_12, cblinear_13, cblinear_14, std::vector<ILayer*>{conv_17}},
                   {1, 1, 1, 1, 0}, {4, 8, 16, 32, 4});

    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]],  # 19
    auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cbfuse_18->getOutput(0), 128, 256, 128, 64, 2, "model.19");

    // avg-conv down fuse
    // [-1, 1, ADown, [256]],  # 20-P3/8
    auto adown_20 = ADown(network, weightMap, *repncspelan_19->getOutput(0), 256, "model.20");
    // [[12, 13, 14, -1], 1, CBFuse, [[2, 2, 2]]], # 21
    auto cbfuse_21 = CBFuse(network, {cblinear_12, cblinear_13, cblinear_14, std::vector<ILayer*>{adown_20}},
                            {2, 2, 2, 0}, {8, 16, 32, 8});

    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]],  # 22
    auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cbfuse_21->getOutput(0), 256, 512, 256, 128, 2, "model.22");

    // avg-conv down fuse
    // [-1, 1, ADown, [512]],  # 23-P4/16
    auto adown_23 = ADown(network, weightMap, *repncspelan_22->getOutput(0), 512, "model.23");
    // [[13, 14, -1], 1, CBFuse, [[3, 3]]], # 24
    auto cbfuse_24 =
            CBFuse(network, {cblinear_13, cblinear_14, std::vector<ILayer*>{adown_23}}, {3, 3, 0}, {16, 32, 16});

    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]],  # 25
    auto repncspelan_25 =
            RepNCSPELAN4(network, weightMap, *cbfuse_24->getOutput(0), 512, 1024, 512, 256, 2, "model.25");

    // avg-conv down fuse
    // [-1, 1, ADown, [1024]],  # 26-P5/32
    auto adown_26 = ADown(network, weightMap, *repncspelan_25->getOutput(0), 1024, "model.26");
    // [[14, -1], 1, CBFuse, [[4]]], # 27
    auto cbfuse_27 = CBFuse(network, {cblinear_14, std::vector<ILayer*>{adown_26}}, {4, 0}, {32, 32});

    // csp-elan block
    // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]],  # 28
    auto repncspelan_28 =
            RepNCSPELAN4(network, weightMap, *cbfuse_27->getOutput(0), 512, 1024, 512, 256, 2, "model.28");

    // elan-spp block (fed from layer 28)
    // [28, 1, SPPELAN, [512, 256]],  # 29
    auto sppelan_29 = SPPELAN(network, weightMap, *repncspelan_28->getOutput(0), 1024, 512, 256, "model.29");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    // Nearest-neighbour resize with scales {1, 2, 2}: first dimension unchanged, the other two doubled.
    auto upsample_30 = network->addResize(*sppelan_29->getOutput(0));
    upsample_30->setResizeMode(ResizeMode::kNEAREST);
    const float scales_30[] = {1.0, 2.0, 2.0};
    upsample_30->setScales(scales_30, 3);
    // [[-1, 25], 1, Concat, [1]],  # cat backbone P4
    ITensor* input_tensor_31[] = {upsample_30->getOutput(0), repncspelan_25->getOutput(0)};
    auto cat_31 = network->addConcatenation(input_tensor_31, 2);

    // # csp-elan block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]],  # 32
    auto repncspelan_32 = RepNCSPELAN4(network, weightMap, *cat_31->getOutput(0), 1536, 512, 512, 256, 2, "model.32");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_33 = network->addResize(*repncspelan_32->getOutput(0));
    upsample_33->setResizeMode(ResizeMode::kNEAREST);
    const float scales_33[] = {1.0, 2.0, 2.0};
    upsample_33->setScales(scales_33, 3);
    // [[-1, 22], 1, Concat, [1]],  # cat backbone P3
    ITensor* input_tensor_34[] = {upsample_33->getOutput(0), repncspelan_22->getOutput(0)};
    auto cat_34 = network->addConcatenation(input_tensor_34, 2);

    // # csp-elan block
    // [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]],  # 35
    auto repncspelan_35 = RepNCSPELAN4(network, weightMap, *cat_34->getOutput(0), 1024, 256, 256, 128, 2, "model.35");

    // # avg-conv-down merge
    // [-1, 1, ADown, [256]],
    auto adown_36 = ADown(network, weightMap, *repncspelan_35->getOutput(0), 256, "model.36");
    // [[-1, 32], 1, Concat, [1]],  # cat head P4
    ITensor* input_tensor_37[] = {adown_36->getOutput(0), repncspelan_32->getOutput(0)};
    auto cat_37 = network->addConcatenation(input_tensor_37, 2);

    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]],  # 38 (P4/16-medium)
    auto repncspelan_38 = RepNCSPELAN4(network, weightMap, *cat_37->getOutput(0), 768, 512, 512, 256, 2, "model.38");

    // # avg-conv-down merge
    // [-1, 1, ADown, [512]],
    auto adown_39 = ADown(network, weightMap, *repncspelan_38->getOutput(0), 512, "model.39");
    // [[-1, 29], 1, Concat, [1]],  # cat head P5
    ITensor* input_tensor_40[] = {adown_39->getOutput(0), sppelan_29->getOutput(0)};
    auto cat_40 = network->addConcatenation(input_tensor_40, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 1024, 512, 2]],  # 41 (P5/32-large)
    auto repncspelan_41 = RepNCSPELAN4(network, weightMap, *cat_40->getOutput(0), 1024, 512, 1024, 512, 2, "model.41");

    // Detection head: a single DDetect over the three feature maps produced by layers 35, 38 and 41.
    auto ddetect_42 = DDetect(network, weightMap, std::vector<ILayer*>{repncspelan_35, repncspelan_38, repncspelan_41},
                              kNumClass, {256, 512, 512}, "model.42");

    // The plugin layer's first output is named kOutputTensorName and is the only network output.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, ddetect_42, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Builder limits: the batch size comes from kBatchSize (the maxBatchSize parameter is not
    // used); the workspace limit is 16 * 2^20 (16 MiB, assuming the API unit is bytes).
    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));

    // Precision is selected at compile time: FP16 if USE_FP16 is defined, otherwise INT8 with an
    // entropy calibrator if USE_INT8 is defined, otherwise no precision flag is set.
#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is heap-allocated and never deleted in this function.
    auto* calibrator =
            new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    // NOTE(review): the result is not checked, so the success message below is printed
    // regardless of what buildSerializedNetwork() returned.
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}
// Builds the network definition for the "gelan_c" model variant and returns it serialized.
//
// The network is assembled layer by layer; the YAML-style comment above each call quotes the
// model-config entry that the call is meant to reproduce. Note that those comments (and the
// local variable names) number layers from 1, whereas the weight-name prefixes passed to the
// helpers start at "model.0", i.e. variable `foo_N` loads weights "model.(N-1)".
//
// Parameters:
//   maxBatchSize - not read anywhere in this function; the batch size handed to the builder
//                  is the constant kBatchSize.
//   builder      - used to create the network definition and to build the serialized network.
//   config       - builder configuration; this function sets its workspace size and, depending
//                  on USE_FP16 / USE_INT8, its precision flag and INT8 calibrator.
//   dt           - data type of the input tensor.
//   wts_name     - forwarded to loadWeights() (presumably the path of a .wts file - confirm).
//
// Returns the pointer produced by builder->buildSerializedNetwork(); it is not checked for
// null here. Before returning, the network definition is deleted and every value buffer in
// the weight map is released with free().
IHostMemory* build_engine_gelan_c(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt,
                                  std::string& wts_name) {
    /* ------ Create the network definition (creation flags = 0U) ------ */
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Single input tensor of dimensions 3 x kInputH x kInputW.
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW});
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    // # conv down
    // [-1, 1, Conv, [64, 3, 2]],  # 1-P1/2
    auto conv_1 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.0", 1);
    // # conv down
    // [-1, 1, Conv, [128, 3, 2]],  # 2-P2/4
    auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 128, 3, 2, 1, "model.1");
    // # elan-1 block
    // [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]],  # 3
    auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 256, 128, 64, 1, "model.2");
    // # avg-conv down
    // [-1, 1, ADown, [256]],  # 4-P3/8
    auto adown_4 = ADown(network, weightMap, *repncspelan_3->getOutput(0), 256, "model.3");
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]],  # 5
    auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 256, 512, 256, 128, 1, "model.4");
    // # avg-conv down
    // [-1, 1, ADown, [512]],  # 6-P4/16
    auto adown_6 = ADown(network, weightMap, *repncspelan_5->getOutput(0), 512, "model.5");
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 7
    auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 512, 512, 256, 1, "model.6");
    // # avg-conv down
    // [-1, 1, ADown, [512]],  # 8-P5/32
    auto adown_8 = ADown(network, weightMap, *repncspelan_7->getOutput(0), 512, "model.7");
    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 9
    auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 512, 512, 256, 1, "model.8");
    // # elan-spp block
    // [-1, 1, SPPELAN, [512, 256]],  # 10
    auto sppelan_10 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 512, 256, "model.9");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    // Nearest-neighbour resize with scales {1, 2, 2}: first dimension unchanged, the other two doubled.
    auto upsample_11 = network->addResize(*sppelan_10->getOutput(0));
    upsample_11->setResizeMode(ResizeMode::kNEAREST);
    const float scales_11[] = {1.0, 2.0, 2.0};
    upsample_11->setScales(scales_11, 3);
    // [[-1, 7], 1, Concat, [1]],  # cat backbone P4
    ITensor* input_tensor_12[] = {upsample_11->getOutput(0), repncspelan_7->getOutput(0)};
    auto cat_12 = network->addConcatenation(input_tensor_12, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 13
    auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 1536, 512, 512, 256, 1, "model.12");

    // # up-concat merge
    // [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0));
    upsample_14->setResizeMode(ResizeMode::kNEAREST);
    const float scales_14[] = {1.0, 2.0, 2.0};
    upsample_14->setScales(scales_14, 3);
    // [[-1, 5], 1, Concat, [1]],  # cat backbone P3
    ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)};
    auto cat_15 = network->addConcatenation(input_tensor_15, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [256, 256, 128, 1]],  # 16 (P3/8-small)
    auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 1024, 256, 256, 128, 1, "model.15");

    // # avg-conv-down merge
    // [-1, 1, ADown, [256]],
    auto adown_17 = ADown(network, weightMap, *repncspelan_16->getOutput(0), 256, "model.16");
    // [[-1, 13], 1, Concat, [1]],  # cat head P4
    ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)};
    auto cat_18 = network->addConcatenation(input_tensor_18, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 19 (P4/16-medium)
    auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 768, 512, 512, 256, 1, "model.18");

    // # avg-conv-down merge
    // [-1, 1, ADown, [512]],
    auto adown_20 = ADown(network, weightMap, *repncspelan_19->getOutput(0), 512, "model.19");
    // [[-1, 10], 1, Concat, [1]],  # cat head P5
    ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)};
    auto cat_21 = network->addConcatenation(input_tensor_21, 2);

    // # elan-2 block
    // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]],  # 22 (P5/32-large)
    auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 1024, 512, 512, 256, 1, "model.21");

    // # detection head
    // # detect
    // NOTE(review): the config line previously quoted here described a DualDDetect over six
    // inputs; the code builds a single DDetect over repncspelan_16 / repncspelan_19 /
    // repncspelan_22, loading its weights from the "model.22" prefix.
    auto ddetect_23 = DDetect(network, weightMap, std::vector<ILayer*>{repncspelan_16, repncspelan_19, repncspelan_22},
                              kNumClass, {256, 512, 512}, "model.22");

    // The plugin layer's first output is named kOutputTensorName and is the only network output.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, ddetect_23, false);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // Builder limits: the batch size comes from kBatchSize (the maxBatchSize parameter is not
    // used); the workspace limit is 16 * 2^20 (16 MiB, assuming the API unit is bytes).
    builder->setMaxBatchSize(kBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));

    // Precision is selected at compile time: FP16 if USE_FP16 is defined, otherwise INT8 with an
    // entropy calibrator if USE_INT8 is defined, otherwise no precision flag is set.
#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is heap-allocated and never deleted in this function.
    auto* calibrator =
            new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    // NOTE(review): the result is not checked, so the success message below is printed
    // regardless of what buildSerializedNetwork() returned.
    IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    return serialized_model;
}


================================================
FILE: yolov9/src/postprocess.cpp
================================================
#include "postprocess.h"
#include "utils.h"
// Maps a box given as {center_x, center_y, width, height} in network-input coordinates
// (kInputW x kInputH) back onto `img`.
//
// The smaller of the two input/image ratios is used as the uniform scale. The padding that
// remains along the other axis is split evenly, so half of it is subtracted from the box
// before dividing by the scale. The result is rounded to integer pixels and is not clamped
// to the image bounds.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float ratio_w = kInputW / (img.cols * 1.0);
    const float ratio_h = kInputH / (img.rows * 1.0);
    const float half_w = bbox[2] / 2.f;
    const float half_h = bbox[3] / 2.f;

    float left, right, top, bottom, scale;
    if (ratio_h > ratio_w) {
        // Width is the limiting side: padding is on the top and bottom.
        left = bbox[0] - half_w;
        right = bbox[0] + half_w;
        top = bbox[1] - half_h - (kInputH - ratio_w * img.rows) / 2;
        bottom = bbox[1] + half_h - (kInputH - ratio_w * img.rows) / 2;
        scale = ratio_w;
    } else {
        // Height is the limiting side: padding is on the left and right.
        left = bbox[0] - half_w - (kInputW - ratio_h * img.cols) / 2;
        right = bbox[0] + half_w - (kInputW - ratio_h * img.cols) / 2;
        top = bbox[1] - half_h;
        bottom = bbox[1] + half_h;
        scale = ratio_h;
    }

    left = left / scale;
    right = right / scale;
    top = top / scale;
    bottom = bottom / scale;
    return cv::Rect(round(left), round(top), round(right - left), round(bottom - top));
}

// Intersection-over-union of two boxes, each given as {center_x, center_y, width, height}.
//
// Returns 0 when the boxes do not overlap. Also returns 0 when the union area is not
// positive (e.g. two coincident zero-size boxes), where the plain quotient would be 0/0 = NaN.
static float iou(float lbox[4], float rbox[4]) {
    const float left = (std::max)(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
    const float right = (std::min)(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
    const float top = (std::max)(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
    const float bottom = (std::min)(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);

    // Empty intersection on either axis.
    if (top > bottom || left > right)
        return 0.0f;

    const float inter_area = (right - left) * (bottom - top);
    const float union_area = lbox[2] * lbox[3] + rbox[2] * rbox[3] - inter_area;

    // Degenerate boxes: avoid dividing by zero (written so that a NaN union also yields 0).
    if (!(union_area > 0.0f))
        return 0.0f;

    return inter_area / union_area;
}

// Ordering predicate for std::sort: places detections with higher `conf` first.
static bool cmp(const Detection& a, const Detection& b) {
    return b.conf < a.conf;
}

void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;
    for (int i = 0; i < output[0] && i < kMaxNumOutputBbox; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh)
            continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0)
            m.emplace(det.class_id, std::vector<Detection>());
        // x1x2y1y2 -> xywh
        float c_x = (det.bbox[0] + det.bbox[2]) / 2;
        float c_y = (det.bbox[1] + det.bbox[3]) / 2;
        float w = det.bbox[2] - det.bbox[0];
        float h = det.bbox[3] - det.bbox[1];
        det.bbox[0] = c_x;
        det.bbox[1] = c_y;
        det.bbox[2] = w;
        det.bbox[3] = h;
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

// Runs nms() once per batch item.
//
// `output` holds `batch_size` consecutive slices of `output_size` floats each; slice k is
// filtered into res_batch[k]. `res_batch` is resized to `batch_size` first.
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    int item = 0;
    while (item < batch_size) {
        float* slice = &output[item * output_size];
        nms(res_batch[item], slice, conf_thresh, nms_thresh);
        ++item;
    }
}

// Draws every detection of every batch item onto its image, then stamps the number of
// detections in the top-left corner of each image.
//
// For each detection a rectangle (obtained via get_rect) is drawn and the integer class id is
// written just above its top-left corner. res_batch[i] is indexed for every i < img_batch.size().
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

    // First pass: boxes and class labels.
    for (size_t idx = 0; idx < img_batch.size(); idx++) {
        auto& detections = res_batch[idx];
        cv::Mat canvas = img_batch[idx];
        for (auto& det : detections) {
            cv::Rect box = get_rect(canvas, det.bbox);
            cv::rectangle(canvas, box, box_color, 2);
            const std::string label = std::to_string((int)det.class_id);
            cv::putText(canvas, label, cv::Point(box.x, box.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
        }
    }

    // Second pass: draw the number of objects on each image.
    for (size_t idx = 0; idx < img_batch.size(); idx++) {
        const std::string count = std::to_string(res_batch[idx].size());
        cv::putText(img_batch[idx], count, cv::Point(0, 20), cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
    }
}

// Converts a centre-format box (bbox = {cx, cy, w, h}) into a cv::Rect whose
// coordinates are divided by `scale` and rounded to integers.
static cv::Rect get_downscale_rect(float bbox[4], float scale) {
    const float half_w = bbox[2] / 2;
    const float half_h = bbox[3] / 2;

    // Corner coordinates in the down-scaled space.
    const float left = (bbox[0] - half_w) / scale;
    const float top = (bbox[1] - half_h) / scale;
    const float right = (bbox[0] + half_w) / scale;
    const float bottom = (bbox[1] + half_h) / scale;

    return cv::Rect(round(left), round(top), round(right - left), round(bottom - top));
}

// std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets) {
//     std::vector<cv::Mat> masks;
//     for (size_t i = 0; i < dets.size(); i++) {
//         cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1);
//         auto r = get_downscale_rect(dets[i].bbox, 4);
//         for (int x = r.x; x < r.x + r.width; x++) {
//             for (int y = r.y; y < r.y + r.height; y++) {
//                 float e = 0.0f;
//                 for (int j = 0; j < 32; j++) {
//                     e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
//                 }
//                 e = 1.0f / (1.0f + expf(-e));
//                 mask_mat.at<float>(y, x) = e;
//             }
//         }
//         cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH));
//         masks.push_back(mask_mat);
//     }
//     return masks;
// }

// Crops the region of a network-sized mask (kInputW x kInputH) that corresponds
// to the aspect-ratio-preserving, centred placement of `img`, and resizes that
// crop to the size of `img`.
//
// mask : mask at network input resolution.
// img  : image whose size determines the crop and the output size.
// Returns the resized mask (same size as `img`).
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    const float ratio_w = kInputW / (img.cols * 1.0);
    const float ratio_h = kInputH / (img.rows * 1.0);

    cv::Rect roi;
    if (ratio_h > ratio_w) {
        // Width is the limiting side: full width, vertically centred band.
        roi.width = kInputW;
        roi.height = ratio_w * img.rows;
        roi.x = 0;
        roi.y = (kInputH - roi.height) / 2;
    } else {
        // Height is the limiting side: full height, horizontally centred band.
        roi.width = ratio_h * img.cols;
        roi.height = kInputH;
        roi.x = (kInputW - roi.width) / 2;
        roi.y = 0;
    }

    cv::Mat scaled;
    cv::resize(mask(roi), scaled, img.size());
    return scaled;
}

// Overlays instance masks, bounding boxes and "<label> <confidence>" captions on `img`.
//
// img        : image to draw on (modified in place); pixels are accessed as cv::Vec3b.
// dets       : detections; dets[i] is paired with masks[i].
// masks      : one float mask per detection, passed through scale_mask() to match `img`.
// labels_map : class id -> label text. Looked up with operator[], so an unknown id
//              inserts an empty label into the map (unchanged from the original behaviour).
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    const cv::Rect img_bounds(0, 0, img.cols, img.rows);

    for (size_t i = 0; i < dets.size(); i++) {
        cv::Mat img_mask = scale_mask(masks[i], img);
        // Palette entries are 0xRRGGBB; unpack them into an OpenCV BGR scalar.
        auto color = colors[(int)dets[i].class_id % colors.size()];
        auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

        cv::Rect r = get_rect(img, dets[i].bbox);

        // Mat::at<>() is unchecked in release builds, so restrict the blend to the part of
        // the box that lies inside the image. get_rect() is defined elsewhere; this guards
        // against it returning a box that extends past the image borders.
        const cv::Rect visible = r & img_bounds;
        for (int y = visible.y; y < visible.y + visible.height; y++) {
            for (int x = visible.x; x < visible.x + visible.width; x++) {
                float val = img_mask.at<float>(y, x);
                if (val <= 0.5)
                    continue;
                // 50/50 blend of the existing pixel with the class colour.
                cv::Vec3b& px = img.at<cv::Vec3b>(y, x);
                px[0] = px[0] / 2 + bgr[0] / 2;
                px[1] = px[1] / 2 + bgr[1] / 2;
                px[2] = px[2] / 2 + bgr[2] / 2;
            }
        }

        cv::rectangle(img, r, bgr, 2);

        // Build the caption once; it is needed both for measuring and for drawing.
        const std::string caption =
                labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf);
        cv::Size textSize = cv::getTextSize(caption, cv::FONT_HERSHEY_PLAIN, 1.2, 2, nullptr);

        // Filled background rectangle for the caption, anchored at the box's top-left corner.
        cv::Point topLeft(r.x, r.y - textSize.height);
        cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
        cv::rectangle(img, topLeft, bottomRight, bgr, -1);

        cv::putText(img, caption, cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
    }
}
// Collects the kept detections from a decoded host buffer.
//
// res             : detections are appended here.
// decode_ptr_host : buffer whose element 0 is skipped; records of `bbox_element` floats
//                   follow, read as {bbox[0..3], conf, class_id, keep_flag}.
// bbox_element    : number of floats per record.
// img             : not used by this function.
// count           : number of records to inspect.
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count) {
    Detection det;
    for (int k = 0; k < count; ++k) {
        const float* record = decode_ptr_host + 1 + k * bbox_element;

        // Only records whose keep flag truncates to exactly 1 are kept.
        const int keep_flag = record[6];
        if (keep_flag != 1) {
            continue;
        }

        for (int c = 0; c < 4; ++c) {
            det.bbox[c] = record[c];
        }
        det.conf = record[4];
        det.class_id = record[5];
        res.push_back(det);
    }
}
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    int count = static_cast<int>(*decode_ptr_host);
    count = count > kMaxNumOutputBbox ? kMaxNumOutputBbox : count;
    // std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}


================================================
FILE: yolov9/src/postprocess.cu
================================================
//
// Created by lindsay on 23-7-17.
//
#include "postprocess.h"

// Copies boxes that pass the confidence threshold from `predict` into `parray`.
//
// predict              : predict[0] is the number of candidates; candidate k occupies the
//                        6 floats starting at predict + 1 + k * 6, read as
//                        {left, top, right, bottom, confidence, label}.
// num_bboxes           : not used inside the kernel (the host uses it to size the grid).
// confidence_threshold : candidates with confidence below this value are dropped.
// parray               : parray[0] is an atomically incremented counter of accepted boxes;
//                        accepted box n is written to the bbox_element floats starting at
//                        parray + 1 + n * bbox_element as
//                        {left, top, right, bottom, confidence, label, keep_flag = 1}.
// max_objects          : capacity of parray in boxes; accepted boxes beyond it are dropped
//                        (the counter may still exceed max_objects, so readers must clamp).
//
// The confidence test runs BEFORE a slot is reserved. Reserving first would let rejected
// candidates consume output slots (starving valid boxes once max_objects is reached) and
// would leave unwritten records inside the counted range.
static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                     int max_objects) {

    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * 6;
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    // Reserve an output slot only for candidates that will actually be written.
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    float* pout_item = parray + 1 + index * bbox_element;
    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}

// Intersection-over-union of two axis-aligned boxes given as (left, top, right, bottom).
// Returns 0 when the boxes do not overlap.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
                                float bright, float bbottom) {

    // Extent of the overlap along each axis, clamped at zero for disjoint boxes.
    const float overlap_w = max(min(aright, bright) - max(aleft, bleft), 0.0f);
    const float overlap_h = max(min(abottom, bbottom) - max(atop, btop), 0.0f);

    const float inter_area = overlap_w * overlap_h;
    if (inter_area == 0.0f)
        return 0.0f;

    const float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter_area / (a_area + b_area - inter_area);
}

// Non-maximum suppression over the boxes stored in `bboxes`; one thread per box.
//
// bboxes      : bboxes[0] is the box count; box k occupies bbox_element floats starting at
//               bboxes + 1 + k * bbox_element. Fields 0..3 are passed to box_iou() as
//               (left, top, right, bottom), field 4 is used as the score, field 5 as the
//               class label and field 6 is the keep flag that this kernel clears.
// max_objects : upper bound applied to the box count.
// threshold   : IoU above which a box is suppressed.
//
// A box is suppressed when another box of the same label overlaps it by more than
// `threshold` and either has a higher score, or has an equal score and a higher index
// (the index tie-break ensures exactly one of two equal-score boxes survives).
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {

    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    int count = min(static_cast<int>(bboxes[0]), max_objects);

    if (position >= count)
        return;

    float* pcurrent = bboxes + 1 + position * bbox_element;
    // Start at 0: every other box, including the one in slot 0, must be considered as a
    // potential suppressor. The box itself is skipped by the `i == position` test below.
    for (int i = 0; i < count; ++i) {
        float* pitem = bboxes + 1 + i * bbox_element;
        if (i == position || pcurrent[5] != pitem[5])
            continue;

        if (pitem[4] >= pcurrent[4]) {
            if (pitem[4] == pcurrent[4] && i < position)
                continue;

            float iou =
                    box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]);

            if (iou > threshold) {
                pcurrent[6] = 0;  // 1 = keep, 0 = ignore
                return;
            }
        }
    }
}
// Confidence filtering: launches decode_kernel on `stream` with one thread per candidate box.
//
// predict              : device buffer of candidate boxes (see decode_kernel).
// num_bboxes           : number of threads to cover; used to size the grid.
// confidence_threshold : minimum confidence for a box to be kept.
// parray               : device output buffer (counter followed by box records).
// max_objects          : capacity of parray in boxes.
// stream               : CUDA stream the kernel is enqueued on.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream) {
    // A zero-sized grid is an invalid launch configuration; there is nothing to decode anyway.
    if (num_bboxes <= 0)
        return;

    const int block = 256;
    const int grid = (num_bboxes + block - 1) / block;  // ceil(num_bboxes / block) without float rounding
    decode_kernel<<<grid, block, 0, stream>>>(predict, num_bboxes, confidence_threshold, parray, max_objects);
}

// Launches nms_kernel on `stream` with one thread per potential box.
//
// parray        : device buffer of decoded boxes (counter followed by box records).
// nms_threshold : IoU threshold forwarded to the kernel.
// max_objects   : capacity of parray in boxes; also the number of threads launched.
// stream        : CUDA stream the kernel is enqueued on.
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    // With max_objects == 0 the block size would be 0 and the grid computation would divide
    // by zero; there is nothing to suppress in that case.
    if (max_objects <= 0)
        return;

    const int block = max_objects < 256 ? max_objects : 256;
    const int grid = (max_objects + block - 1) / block;  // ceil(max_objects / block)
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}


================================================
FILE: yolov9/src/preprocess.cu
================================================
#include "preprocess.h"
#include "cuda_utils.h"

// Staging buffers shared by every cuda_preprocess() call. Both are allocated by
// cuda_preprocess_init() and released by cuda_preprocess_destroy().
// Host-side staging buffer, allocated with cudaMallocHost().
static uint8_t* img_buffer_host = nullptr;
// Device-side copy of the source image, allocated with cudaMalloc().
static uint8_t* img_buffer_device = nullptr;

// 2x3 affine transform stored row by row in six floats. warpaffine_kernel applies it as
//   x' = value[0] * x + value[1] * y + value[2]
//   y' = value[3] * x + value[4] * y + value[5]
// It is a plain struct so it can be passed to the kernel by value.
struct AffineMatrix {
    float value[6];
};

// Resamples an interleaved 3-channel 8-bit image into a planar float image using an
// affine mapping and bilinear interpolation. One thread produces one destination pixel.
//
// src            : source pixels, 3 bytes per pixel, `src_line_size` bytes per row.
// src_width/height : source size in pixels.
// dst            : destination, three consecutive planes of dst_width * dst_height floats.
// const_value_st : value used for every channel wherever the source has no pixel.
// d2s            : destination -> source affine transform.
// edge           : number of destination pixels; threads at or beyond it do nothing.
//
// The first and third channels are swapped on output (BGR -> RGB per the original
// comments) and every channel is divided by 255.
__global__ void warpaffine_kernel(
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width,
    int dst_height, uint8_t const_value_st,
    AffineMatrix d2s, int edge) {
    const int pixel = blockDim.x * blockIdx.x + threadIdx.x;
    if (pixel >= edge) return;

    // Destination coordinates handled by this thread.
    const int dx = pixel % dst_width;
    const int dy = pixel / dst_width;

    // Corresponding (fractional) source coordinates.
    const float src_x = d2s.value[0] * dx + d2s.value[1] * dy + d2s.value[2] + 0.5f;
    const float src_y = d2s.value[3] * dx + d2s.value[4] * dy + d2s.value[5] + 0.5f;

    // Default to the padding value; overwritten below when the sample touches the source.
    float ch0 = const_value_st;
    float ch1 = const_value_st;
    float ch2 = const_value_st;

    const bool out_of_range = src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height;
    if (!out_of_range) {
        const int y_low = floorf(src_y);
        const int x_low = floorf(src_x);
        const int y_high = y_low + 1;
        const int x_high = x_low + 1;

        // Bilinear weights of the four neighbours.
        const float ly = src_y - y_low;
        const float lx = src_x - x_low;
        const float hy = 1 - ly;
        const float hx = 1 - lx;
        const float w_tl = hy * hx;
        const float w_tr = hy * lx;
        const float w_bl = ly * hx;
        const float w_br = ly * lx;

        // Neighbours that fall outside the source keep pointing at the padding pixel.
        uint8_t padding[] = {const_value_st, const_value_st, const_value_st};
        uint8_t* tl = padding;
        uint8_t* tr = padding;
        uint8_t* bl = padding;
        uint8_t* br = padding;

        if (y_low >= 0) {
            if (x_low >= 0)
                tl = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                tr = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                bl = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                br = src + y_high * src_line_size + x_high * 3;
        }

        ch0 = w_tl * tl[0] + w_tr * tr[0] + w_bl * bl[0] + w_br * br[0];
        ch1 = w_tl * tl[1] + w_tr * tr[1] + w_bl * bl[1] + w_br * br[1];
        ch2 = w_tl * tl[2] + w_tr * tr[2] + w_bl * bl[2] + w_br * br[2];
    }

    // Interleaved -> planar: plane 0 receives source channel 2 and plane 2 receives source
    // channel 0 (the channel swap), each scaled to [0, 1].
    const int area = dst_width * dst_height;
    float* plane0 = dst + dy * dst_width + dx;
    float* plane1 = plane0 + area;
    float* plane2 = plane1 + area;
    *plane0 = ch2 / 255.0f;
    *plane1 = ch1 / 255.0f;
    *plane2 = ch0 / 255.0f;
}

// Uploads one interleaved 3-channel 8-bit image and enqueues warpaffine_kernel to
// letterbox it into `dst` (aspect-ratio-preserving scale, centred, padded with 128).
//
// src                  : host pixels, read as src_width * src_height * 3 contiguous bytes.
// src_width/src_height : source size in pixels.
// dst                  : device output, 3 planes of dst_width * dst_height floats.
// dst_width/dst_height : output size in pixels.
// stream               : CUDA stream used for the upload and the kernel.
//
// NOTE(review): the image size is not checked against the size passed to
// cuda_preprocess_init(), and the staging buffers must already be allocated — confirm
// that callers guarantee both.
void cuda_preprocess(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    cudaStream_t stream) {

    // Stage the image in the host buffer, then start the asynchronous upload.
    const int src_bytes = src_width * src_height * 3;
    memcpy(img_buffer_host, src, src_bytes);
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, src_bytes, cudaMemcpyHostToDevice, stream));

    // Uniform scale that makes the source fit inside the destination.
    const float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    // Source -> destination: scale about the image centres.
    AffineMatrix src_to_dst, dst_to_src;
    src_to_dst.value[0] = scale;
    src_to_dst.value[1] = 0;
    src_to_dst.value[2] = -scale * src_width  * 0.5  + dst_width * 0.5;
    src_to_dst.value[3] = 0;
    src_to_dst.value[4] = scale;
    src_to_dst.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel needs the inverse mapping (destination -> source).
    cv::Mat forward(2, 3, CV_32F, src_to_dst.value);
    cv::Mat inverse(2, 3, CV_32F, dst_to_src.value);
    cv::invertAffineTransform(forward, inverse);

    memcpy(dst_to_src.value, inverse.ptr<float>(0), sizeof(dst_to_src.value));

    // One thread per destination pixel.
    const int pixel_count = dst_height * dst_width;
    const int threads_per_block = 256;
    const int block_count = ceil(pixel_count / (float)threads_per_block);

    warpaffine_kernel<<<block_count, threads_per_block, 0, stream>>>(
        img_buffer_device, src_width * 3, src_width,
        src_height, dst, dst_width,
        dst_height, 128, dst_to_src, pixel_count);
}

// Preprocesses every image of a batch into consecutive slices of `dst`.
//
// img_batch            : input images; each is passed as (ptr(), cols, rows).
// dst                  : device output; image i is written at dst + i * dst_width * dst_height * 3.
// dst_width/dst_height : output size per image in pixels.
// stream               : CUDA stream; synchronised after each image because
//                        cuda_preprocess() reuses the same staging buffers for every call.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
                           float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int floats_per_image = dst_width * dst_height * 3;
    for (size_t idx = 0; idx < img_batch.size(); ++idx) {
        cv::Mat& image = img_batch[idx];
        float* image_dst = dst + floats_per_image * idx;
        cuda_preprocess(image.ptr(), image.cols, image.rows, image_dst, dst_width, dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

// Allocates the staging buffers used by cuda_preprocess().
//
// max_image_size : largest supported image size in pixels; each buffer holds
//                  max_image_size * 3 bytes.
void cuda_preprocess_init(int max_image_size) {
    const int buffer_bytes = max_image_size * 3;
    // Host staging buffer (cudaMallocHost).
    CUDA_CHECK(cudaMallocHost(reinterpret_cast<void**>(&img_buffer_host), buffer_bytes));
    // Device buffer that receives the uploaded image.
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&img_buffer_device), buffer_bytes));
}

// Releases the staging buffers allocated by cuda_preprocess_init().
//
// The pointers are reset to nullptr after being freed so that calling this function
// again (or before any init) does not free a dangling pointer a second time.
void cuda_preprocess_destroy() {
    if (img_buffer_device != nullptr) {
        CUDA_CHECK(cudaFree(img_buffer_device));
        img_buffer_device = nullptr;
    }
    if (img_buffer_host != nullptr) {
        CUDA_CHECK(cudaFreeHost(img_buffer_host));
        img_buffer_host = nullptr;
    }
}


================================================
FILE: yolov9/windows/dirent.h
================================================
/*
 * Dirent interface for Microsoft Visual Studio
 *
 * Copyright (C) 1998-2019 Toni Ronkko
 * This file is part of dirent.  Dirent may be freely distributed
 * under the MIT license.  For all details and documentation, see
 * https://github.com/tronkko/dirent
 */
#ifndef DIRENT_H
#define DIRENT_H

/* Hide warnings about unreferenced local functions */
#if defined(__clang__)
#   pragma clang diagnostic ignored "-Wunused-function"
#elif defined(_MSC_VER)
#   pragma warning(disable:4505)
#elif defined(__GNUC__)
#   pragma GCC diagnostic ignored "-Wunused-function"
#endif

/*
 * Include windows.h without Windows Sockets 1.1 to prevent conflicts with
 * Windows Sockets 2.0.
 */
#ifndef WIN32_LEAN_AND_MEAN
#   define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>

#include <stdio.h>
#include <stdarg.h>
#include <wchar.h>
#include <string.h>
#include <stdlib.h>
#include <malloc.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>

/* Indicates that d_type field is available in dirent structure */
#define _DIRENT_HAVE_D_TYPE

/* Indicates that d_namlen field is available in dirent structure */
#define _DIRENT_HAVE_D_NAMLEN

/* Entries missing from MSVC 6.0 */
#if !defined(FILE_ATTRIBUTE_DEVICE)
#   define FILE_ATTRIBUTE_DEVICE 0x40
#endif

/* File type and permission flags for stat(), general mask */
#if !defined(S_IFMT)
#   define S_IFMT _S_IFMT
#endif

/* Directory bit */
#if !defined(S_IFDIR)
#   define S_IFDIR _S_IFDIR
#endif

/* Character device bit */
#if !defined(S_IFCHR)
#   define S_IFCHR _S_IFCHR
#endif

/* Pipe bit */
#if !defined(S_IFFIFO)
#   define S_IFFIFO _S_IFFIFO
#endif

/* Regular file bit */
#if !defined(S_IFREG)
#   define S_IFREG _S_IFREG
#endif

/* Read permission */
#if !defined(S_IREAD)
#   define S_IREAD _S_IREAD
#endif

/* Write permission */
#if !defined(S_IWRITE)
#   define S_IWRITE _S_IWRITE
#endif

/* Execute permission */
#if !defined(S_IEXEC)
#   define S_IEXEC _S_IEXEC
#endif

/* Pipe */
#if !defined(S_IFIFO)
#   define S_IFIFO _S_IFIFO
#endif

/* Block device */
#if !defined(S_IFBLK)
#   define S_IFBLK 0
#endif

/* Link */
#if !defined(S_IFLNK)
#   define S_IFLNK 0
#endif

/* Socket */
#if !defined(S_IFSOCK)
#   define S_IFSOCK 0
#endif

/* Read user permission */
#if !defined(S_IRUSR)
#   define S_IRUSR S_IREAD
#endif

/* Write user permission */
#if !defined(S_IWUSR)
#   define S_IWUSR S_IWRITE
#endif

/* Execute user permission */
#if !defined(S_IXUSR)
#   define S_IXUSR 0
#endif

/* Read group permission */
#if !defined(S_IRGRP)
#   define S_IRGRP 0
#endif

/* Write group permission */
#if !defined(S_IWGRP)
#   define S_IWGRP 0
#endif

/* Execute group permission */
#if !defined(S_IXGRP)
#   define S_IXGRP 0
#endif

/* Read others permission */
#if !defined(S_IROTH)
#   define S_IROTH 0
#endif

/* Write others permission */
#if !defined(S_IWOTH)
#   define S_IWOTH 0
#endif

/* Execute others permission */
#if !defined(S_IXOTH)
#   define S_IXOTH 0
#endif

/* Maximum length of file name */
#if !defined(PATH_MAX)
#   define PATH_MAX MAX_PATH
#endif
#if !defined(FILENAME_MAX)
#   define FILENAME_MAX MAX_PATH
#endif
#if !defined(NAME_MAX)
#   define NAME_MAX FILENAME_MAX
#endif

/* File type flags for d_type */
#define DT_UNKNOWN 0
#define DT_REG S_IFREG
#define DT_DIR S_IFDIR
#define DT_FIFO S_IFIFO
#define DT_SOCK S_IFSOCK
#define DT_CHR S_IFCHR
#define DT_BLK S_IFBLK
#define DT_LNK S_IFLNK

/* Macros for converting between st_mode and d_type */
#define IFTODT(mode) ((mode) & S_IFMT)
#define DTTOIF(type) (type)

/*
 * File type macros.  Note that block devices, sockets and links cannot be
 * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are
 * only defined for compatibility.  These macros should always return false
 * on Windows.
 */
#if !defined(S_ISFIFO)
#   define S_ISFIFO(mode) (((mode) & S_IFMT) == S_IFIFO)
#endif
#if !defined(S_ISDIR)
#   define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR)
#endif
#if !defined(S_ISREG)
#   define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG)
#endif
#if !defined(S_ISLNK)
#   define S_ISLNK(mode) (((mode) & S_IFMT) == S_IFLNK)
#endif
#if !defined(S_ISSOCK)
#   define S_ISSOCK(mode) (((mode) & S_IFMT) == S_IFSOCK)
#endif
#if !defined(S_ISCHR)
#   define S_ISCHR(mode) (((mode) & S_IFMT) == S_IFCHR)
#endif
#if !defined(S_ISBLK)
#   define S_ISBLK(mode) (((mode) & S_IFMT) == S_IFBLK)
#endif

/* Return the exact length of the file name without zero terminator */
#define _D_EXACT_NAMLEN(p) ((p)->d_namlen)

/* Return the maximum size of a file name */
#define _D_ALLOC_NAMLEN(p) ((PATH_MAX)+1)


#ifdef __cplusplus
extern "C" {
#endif


/*
 * Wide-character version of a directory entry, as filled in by
 * _wreaddir_r().
 */
struct _wdirent {
    /* Always zero */
    long d_ino;

    /* File position within stream; _wreaddir_r() always resets this to zero */
    long d_off;

    /* Structure size; set to sizeof (struct _wdirent) */
    unsigned short d_reclen;

    /* Length of name without \0 */
    size_t d_namlen;

    /* File type; one of DT_CHR, DT_DIR or DT_REG */
    int d_type;

    /* File name, zero-terminated and truncated to PATH_MAX characters */
    wchar_t d_name[PATH_MAX+1];
};
typedef struct _wdirent _wdirent;

/* Wide-character directory stream created by _wopendir() */
struct _WDIR {
    /* Current directory entry; buffer returned by _wreaddir() */
    struct _wdirent ent;

    /* Private file data filled in by FindFirstFileExW/FindNextFileW */
    WIN32_FIND_DATAW data;

    /* True if data holds an entry that has not been returned yet */
    int cached;

    /* Win32 search handle, or INVALID_HANDLE_VALUE when no search is open */
    HANDLE handle;

    /*
     * Search pattern: the directory name given to _wopendir() (made
     * absolute on desktop builds) with \* appended.  Heap-allocated and
     * released by _wclosedir().
     */
    wchar_t *patt;
};
typedef struct _WDIR _WDIR;

/*
 * Multi-byte character version of a directory entry, as filled in by
 * readdir_r().
 */
struct dirent {
    /* Always zero */
    long d_ino;

    /* File position within stream; readdir_r() resets this to zero */
    long d_off;

    /* Structure size; set to sizeof (struct dirent) */
    unsigned short d_reclen;

    /* Length of name without \0 */
    size_t d_namlen;

    /* File type; one of DT_CHR, DT_DIR or DT_REG */
    int d_type;

    /* File name as a multi-byte string */
    char d_name[PATH_MAX+1];
};
typedef struct dirent dirent;

/* Multi-byte directory stream created by opendir() */
struct DIR {
    /* Current directory entry; buffer returned by readdir() */
    struct dirent ent;

    /* Underlying wide-character stream opened with _wopendir() */
    struct _WDIR *wdirp;
};
typedef struct DIR DIR;


/* Dirent functions */
static DIR *opendir (const char *dirname);
static _WDIR *_wopendir (const wchar_t *dirname);

static struct dirent *readdir (DIR *dirp);
static struct _wdirent *_wreaddir (_WDIR *dirp);

static int readdir_r(
        DIR *dirp, struct dirent *entry, struct dirent **result);
static int _wreaddir_r(
        _WDIR *dirp, struct _wdirent *entry, struct _wdirent **result);

static int closedir (DIR *dirp);
static int _wclosedir (_WDIR *dirp);

static void rewinddir (DIR* dirp);
static void _wrewinddir (_WDIR* dirp);

static int scandir (const char *dirname, struct dirent ***namelist,
                    int (*filter)(const struct dirent*),
                    int (*compare)(const struct dirent**, const struct dirent**));

static int alphasort (const struct dirent **a, const struct dirent **b);

static int versionsort (const struct dirent **a, const struct dirent **b);


/* For compatibility with Symbian */
#define wdirent _wdirent
#define WDIR _WDIR
#define wopendir _wopendir
#define wreaddir _wreaddir
#define wclosedir _wclosedir
#define wrewinddir _wrewinddir


/* Internal utility functions */
static WIN32_FIND_DATAW *dirent_first (_WDIR *dirp);
static WIN32_FIND_DATAW *dirent_next (_WDIR *dirp);

static int dirent_mbstowcs_s(
        size_t *pReturnValue,
        wchar_t *wcstr,
        size_t sizeInWords,
        const char *mbstr,
        size_t count);

static int dirent_wcstombs_s(
        size_t *pReturnValue,
        char *mbstr,
        size_t sizeInBytes,
        const wchar_t *wcstr,
        size_t count);

static void dirent_set_errno (int error);


/*
 * Open directory stream DIRNAME for read and return a pointer to the
 * internal working area that is used to retrieve individual directory
 * entries.
 *
 * Returns NULL on failure.  errno is set to ENOENT for a NULL or empty
 * name; other failures release everything allocated so far through
 * _wclosedir() and leave errno as set by the failing step.
 */
static _WDIR*
_wopendir(
        const wchar_t *dirname)
{
    _WDIR *dirp;
    /* n holds a path length; its type follows the API used to compute it */
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
    /* Desktop */
    DWORD n;
#else
    /* WinRT */
    size_t n;
#endif
    wchar_t *p;

    /* Must have directory name */
    if (dirname == NULL  ||  dirname[0] == '\0') {
        dirent_set_errno (ENOENT);
        return NULL;
    }

    /* Allocate new _WDIR structure */
    dirp = (_WDIR*) malloc (sizeof (struct _WDIR));
    if (!dirp) {
        return NULL;
    }

    /*
     * Reset _WDIR structure so that _wclosedir() can be called safely
     * from the failure path below at any point
     */
    dirp->handle = INVALID_HANDLE_VALUE;
    dirp->patt = NULL;
    dirp->cached = 0;

    /*
     * Compute the length of full path plus zero terminator
     *
     * Note that on WinRT there's no way to convert relative paths
     * into absolute paths, so just assume it is an absolute path.
     */
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
    /* Desktop */
    n = GetFullPathNameW (dirname, 0, NULL, NULL);
#else
    /* WinRT */
    n = wcslen (dirname);
#endif

    /*
     * Allocate room for absolute directory name and search pattern.  The
     * extra 16 bytes leave space for the separator, '*' and terminator
     * appended below.
     */
    dirp->patt = (wchar_t*) malloc (sizeof (wchar_t) * n + 16);
    if (dirp->patt == NULL) {
        goto exit_closedir;
    }

    /*
     * Convert relative directory name to an absolute one.  This
     * allows rewinddir() to function correctly even when current
     * working directory is changed between opendir() and rewinddir().
     *
     * Note that on WinRT there's no way to convert relative paths
     * into absolute paths, so just assume it is an absolute path.
     */
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
    /* Desktop */
    n = GetFullPathNameW (dirname, n, dirp->patt, NULL);
    /*
     * n is unsigned here, so this test only catches n == 0.
     * NOTE(review): a return value >= the buffer length passed in (path
     * grew between the two calls) is not detected - confirm whether that
     * case needs handling.
     */
    if (n <= 0) {
        goto exit_closedir;
    }
#else
    /* WinRT */
    wcsncpy_s (dirp->patt, n+1, dirname, n);
#endif

    /* Append search pattern \* to the directory name */
    p = dirp->patt + n;
    switch (p[-1]) {
        case '\\':
        case '/':
        case ':':
            /* Directory ends in path separator, e.g. c:\temp\ */
            /*NOP*/;
            break;

        default:
            /* Directory name doesn't end in path separator */
            *p++ = '\\';
    }
    *p++ = '*';
    *p = '\0';

    /* Open directory stream and retrieve the first entry */
    if (!dirent_first (dirp)) {
        goto exit_closedir;
    }

    /* Success */
    return dirp;

    /* Failure: release the search pattern and the stream structure */
    exit_closedir:
    _wclosedir (dirp);
    return NULL;
}

/*
 * Read next directory entry.
 *
 * Returns a pointer to the entry buffer embedded in DIRP, which is
 * overwritten by subsequent calls to _wreaddir() on the same stream, or
 * NULL when there are no more entries.
 */
static struct _wdirent*
_wreaddir(
        _WDIR *dirp)
{
    struct _wdirent *current = NULL;

    /*
     * Fill the stream's own entry buffer.  The return value carries no
     * extra information: _wreaddir_r() reports the outcome through
     * current, which it sets to NULL when nothing was read.
     */
    (void) _wreaddir_r (dirp, &dirp->ent, &current);

    return current;
}

/*
 * Read next directory entry into the caller-supplied buffer ENTRY.
 *
 * Always returns zero.  On success *RESULT is set to ENTRY; when the end
 * of the directory stream is reached *RESULT is set to NULL.
 */
static int
_wreaddir_r(
        _WDIR *dirp,
        struct _wdirent *entry,
        struct _wdirent **result)
{
    WIN32_FIND_DATAW *found;
    size_t len;
    DWORD attributes;

    /* Fetch the next entry; NULL means the stream is exhausted */
    found = dirent_next (dirp);
    if (found == NULL) {
        *result = NULL;
        return /*OK*/0;
    }

    /*
     * Copy the wide-character file name, truncating it to PATH_MAX
     * characters if necessary, and zero-terminate the buffer.
     */
    for (len = 0; len < PATH_MAX  &&  found->cFileName[len] != 0; len++) {
        entry->d_name[len] = found->cFileName[len];
    }
    entry->d_name[len] = 0;

    /* Length of file name excluding zero terminator */
    entry->d_namlen = len;

    /* Derive the file type from the Win32 attributes */
    attributes = found->dwFileAttributes;
    if ((attributes & FILE_ATTRIBUTE_DEVICE) != 0) {
        entry->d_type = DT_CHR;
    } else if ((attributes & FILE_ATTRIBUTE_DIRECTORY) != 0) {
        entry->d_type = DT_DIR;
    } else {
        entry->d_type = DT_REG;
    }

    /* Fields that carry no real information on Windows */
    entry->d_ino = 0;
    entry->d_off = 0;
    entry->d_reclen = sizeof (struct _wdirent);

    *result = entry;
    return /*OK*/0;
}

/*
 * Close directory stream opened by _wopendir().  This invalidates the
 * _WDIR structure as well as any directory entry read previously by
 * _wreaddir().
 *
 * Returns zero on success.  If DIRP is NULL, sets errno to EBADF and
 * returns -1.
 */
static int
_wclosedir(
        _WDIR *dirp)
{
    /* Invalid directory stream */
    if (dirp == NULL) {
        dirent_set_errno (EBADF);
        return /*failure*/-1;
    }

    /* Release search handle, if a search is still open */
    if (dirp->handle != INVALID_HANDLE_VALUE) {
        FindClose (dirp->handle);
    }

    /* Release search pattern, then the directory structure itself */
    free (dirp->patt);
    free (dirp);

    return /*success*/0;
}

/*
 * Rewind directory stream such that _wreaddir() returns the very first
 * file name again.  Does nothing when DIRP is NULL.
 */
static void
_wrewinddir(
        _WDIR* dirp)
{
    if (dirp == NULL) {
        return;
    }

    /* Release existing search handle */
    if (dirp->handle != INVALID_HANDLE_VALUE) {
        FindClose (dirp->handle);
    }

    /* Restart the search from the stored pattern */
    dirent_first (dirp);
}

/*
 * Get first directory entry (internal).
 *
 * Starts a new search with the pattern stored in DIRP.  On success the
 * first entry is left cached in DIRP and a pointer to it is returned.  On
 * failure returns NULL and sets errno to EACCES, ENOTDIR or ENOENT
 * according to the Win32 error code.
 */
static WIN32_FIND_DATAW*
dirent_first(
        _WDIR *dirp)
{
    /* Open directory and retrieve the first entry */
    dirp->handle = FindFirstFileExW(
            dirp->patt, FindExInfoStandard, &dirp->data,
            FindExSearchNameMatch, NULL, 0);

    if (dirp->handle != INVALID_HANDLE_VALUE) {
        /* A directory entry is now waiting in memory */
        dirp->cached = 1;
        return &dirp->data;
    }

    /* Failed to open directory: no directory entry in memory */
    dirp->cached = 0;

    /* Translate the Win32 error into an errno value */
    switch (GetLastError ()) {
        case ERROR_ACCESS_DENIED:
            /* No read access to directory */
            dirent_set_errno (EACCES);
            break;

        case ERROR_DIRECTORY:
            /* Directory name is invalid */
            dirent_set_errno (ENOTDIR);
            break;

        case ERROR_PATH_NOT_FOUND:
        default:
            /* Cannot find the file */
            dirent_set_errno (ENOENT);
            break;
    }

    return NULL;
}

/*
 * Get next directory entry (internal).
 *
 * Returns a pointer to the find data stored in DIRP, or NULL when the
 * stream is exhausted or reading the next entry fails.  In the latter
 * case the search handle is closed.
 */
static WIN32_FIND_DATAW*
dirent_next(
        _WDIR *dirp)
{
    /* An entry fetched earlier is still waiting: hand it out once */
    if (dirp->cached != 0) {
        dirp->cached = 0;
        return &dirp->data;
    }

    /* No open search: end of directory stream reached */
    if (dirp->handle == INVALID_HANDLE_VALUE) {
        return NULL;
    }

    /* Get the next directory entry from stream */
    if (FindNextFileW (dirp->handle, &dirp->data) != FALSE) {
        return &dirp->data;
    }

    /* The very last entry has been processed or an error occurred */
    FindClose (dirp->handle);
    dirp->handle = INVALID_HANDLE_VALUE;
    return NULL;
}

/*
 * Open directory stream using plain old C-string.
 *
 * Converts DIRNAME to a wide-character string and opens it with
 * _wopendir().  Returns NULL on failure; errno is set to ENOENT for a
 * NULL or empty name.
 */
static DIR*
opendir(
        const char *dirname)
{
    struct DIR *stream;
    wchar_t wide_name[PATH_MAX + 1];
    size_t converted;

    /* Must have directory name */
    if (dirname == NULL  ||  dirname[0] == '\0') {
        dirent_set_errno (ENOENT);
        return NULL;
    }

    /* Allocate memory for DIR structure */
    stream = (DIR*) malloc (sizeof (struct DIR));
    if (stream == NULL) {
        return NULL;
    }

    /*
     * Convert directory name to wide-character string.  The conversion
     * fails if the string contains invalid multi-byte sequences or the
     * output buffer is too small to contain the resulting string.
     */
    if (dirent_mbstowcs_s(
            &converted, wide_name, PATH_MAX + 1, dirname, PATH_MAX + 1) == 0) {

        /* Open directory stream using wide-character name */
        stream->wdirp = _wopendir (wide_name);
        if (stream->wdirp != NULL) {
            /* Success */
            return stream;
        }
    }

    /* Failure */
    free (stream);
    return NULL;
}

/*
 * Read next directory entry.
 *
 * Returns a pointer to the entry buffer embedded in DIRP, which is
 * overwritten by subsequent calls to readdir() on the same stream, or
 * NULL when there are no more entries.
 */
static struct dirent*
readdir(
        DIR *dirp)
{
    struct dirent *current = NULL;

    /*
     * Fill the stream's own entry buffer.  The return value is ignored
     * on purpose: the outcome is reported through current.
     */
    (void) readdir_r (dirp, &dirp->ent, &current);

    return current;
}

/*
 * Read next directory entry into caller-allocated buffer.
 *
 * Returns zero on success.  If the end of directory stream is reached, then
 * sets result to NULL and returns zero.
 */
static int
readdir_r(
        DIR *dirp,
        struct dirent *entry,
        struct dirent **result)
{
    WIN32_FIND_DATAW *datap;

    /* Read next directory entry */
    datap = dirent_next (dirp->wdirp);
    if (datap) {
        size_t n;
        int error;

        /* Attempt to convert file name to multi-byte string */
        error = dirent_wcstombs_s(
                &n, entry->d_name, PATH_MAX + 1, datap->cFileName, PATH_MAX + 1);

        /*
         * If the file name cannot be represented by a multi-byte string,
         * then attempt to use old 8+3 file name.  This allows traditional
         * Unix-code to access some file names despite of unicode
         * characters, although file names may seem unfamiliar to the user.
         *
         * Be ware that the code below cannot come up with a short file
         * name unless the file system provides one.  At least
         * VirtualBox shared folders fail to do this.
         */
        if (error  &&  datap->cAlternateFileName[0] != '\0') {
            error = dirent_wcstombs_s(
                    &n, entry->d_name, PATH_MAX + 1,
                    datap->cAlternateFileName, PATH_MAX + 1);
        }

        if (!error) {
            DWORD attr;

            /* Length of file name excluding zero terminator */
            entry->d_namlen = n - 1;

            /* File attributes */
            attr = datap->dwFileAttributes;
            if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) {
                entry->d_type = DT_CHR;
            } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) {
                entry->d_type = DT_DIR;
            } else {
                entry->d_type = DT_REG;
            }

            /* Reset dummy fields */
            entry->d_ino = 0;
            entry->d_off = 0;
            entry->d_reclen = sizeof (struct dirent);

        } else {

            /*
             * Cannot convert file name to multi-byte string so construct
             * an erroneous directory entry and return that.  Note that
             * we cannot return NULL as that would stop the processing
             * of directory entries completely.
             */
            entry->d_name[0] = '?';
            entry->d_name[1] = '\0';
            entry->d_namlen = 1;
            entry->d_type = DT_UNKNOWN;
            entry->d_ino = 0;
            entry->d_off = -1;
            entry->d_reclen = 0;

        }

        /* Return pointer to directory entry */
        *result = entry;

    } else {

        /* No more directory entries */
        *result = NULL;

    }

    return /*OK*/0;
}

/*
 * Close directory stream.
 *
 * Returns the status of _wclosedir() for a valid stream.  For a NULL
 * stream, sets errno to EBADF and returns -1.
 */
static int
closedir(
        DIR *dirp)
{
    int status;

    /* Reject an invalid directory stream up front */
    if (dirp == NULL) {
        dirent_set_errno (EBADF);
        return /*failure*/-1;
    }

    /* Close the underlying wide-character stream */
    status = _wclosedir (dirp->wdirp);
    dirp->wdirp = NULL;

    /* Release the multi-byte character version */
    free (dirp);

    return status;
}

/*
 * Rewind directory stream to beginning.
 *
 * Delegates to _wrewinddir() on the wide-character stream held by dirp.
 * Note that dirp itself is dereferenced without a NULL check.
 */
static void
rewinddir(
        DIR* dirp)
{
    /* Rewind wide-character string directory stream */
    _wrewinddir (dirp->wdirp);
}

/*
 * Scan directory for entries.
 *
 * Reads every entry of dirname, keeps those accepted by filter (all of them
 * if filter is NULL) and sorts the kept entries with compare (no sorting if
 * compare is NULL).  On success, returns the number of kept entries and
 * stores a malloc()ed table of malloc()ed entries to *namelist; the caller
 * releases each entry and the table with free().  On failure, returns -1 and
 * stores NULL to *namelist.  If namelist itself is NULL, nothing is handed
 * out and all memory is released before returning.
 */
static int
scandir(
        const char *dirname,
        struct dirent ***namelist,
        int (*filter)(const struct dirent*),
        int (*compare)(const struct dirent**, const struct dirent**))
{
    struct dirent **files = NULL;
    size_t size = 0;
    size_t allocated = 0;
    const size_t init_size = 1;
    DIR *dir = NULL;
    struct dirent *entry;
    struct dirent *tmp = NULL;
    size_t i;
    int result = 0;

    /* Open directory stream */
    dir = opendir (dirname);
    if (dir) {

        /* Read directory entries to memory */
        while (1) {

            /* Enlarge pointer table to make room for another pointer */
            if (size >= allocated) {
                void *p;
                size_t num_entries;

                /* Compute number of entries in the enlarged pointer table */
                if (size < init_size) {
                    /* Allocate initial pointer table */
                    num_entries = init_size;
                } else {
                    /* Double the size */
                    num_entries = size * 2;
                }

                /* Allocate first pointer table or enlarge existing table */
                p = realloc (files, sizeof (void*) * num_entries);
                if (p != NULL) {
                    /* Got the memory */
                    files = (dirent**) p;
                    allocated = num_entries;
                } else {
                    /* Out of memory */
                    result = -1;
                    break;
                }

            }

            /* Allocate room for temporary directory entry */
            if (tmp == NULL) {
                tmp = (struct dirent*) malloc (sizeof (struct dirent));
                if (tmp == NULL) {
                    /* Cannot allocate temporary directory entry */
                    result = -1;
                    break;
                }
            }

            /* Read directory entry to temporary area */
            if (readdir_r (dir, tmp, &entry) == /*OK*/0) {

                /* Did we get an entry? */
                if (entry != NULL) {
                    int pass;

                    /* Determine whether to include the entry in result */
                    if (filter) {
                        /* Let the filter function decide */
                        pass = filter (tmp);
                    } else {
                        /* No filter function, include everything */
                        pass = 1;
                    }

                    if (pass) {
                        /*
                         * Store the temporary entry to pointer table.  The
                         * table now owns it, so a fresh temporary entry is
                         * allocated on the next round.
                         */
                        files[size++] = tmp;
                        tmp = NULL;

                        /* Keep up with the number of files */
                        result++;
                    }

                } else {

                    /*
                     * End of directory stream reached => sort entries and
                     * exit.  Sorting is skipped when no comparison function
                     * is given, as calling qsort() with a NULL comparator
                     * would crash.
                     */
                    if (compare != NULL) {
                        qsort (files, size, sizeof (void*),
                               (int (*) (const void*, const void*)) compare);
                    }
                    break;

                }

            } else {
                /* Error reading directory entry */
                result = /*Error*/ -1;
                break;
            }

        }

    } else {
        /* Cannot open directory */
        result = /*Error*/ -1;
    }

    /* Release temporary directory entry */
    free (tmp);

    /*
     * Release allocated memory on error, and also when the caller provided
     * no place to store the table: the entries would otherwise be leaked.
     */
    if (result < 0  ||  namelist == NULL) {
        for (i = 0; i < size; i++) {
            free (files[i]);
        }
        free (files);
        files = NULL;
    }

    /* Close directory stream */
    if (dir) {
        closedir (dir);
    }

    /* Pass pointer table to caller */
    if (namelist) {
        *namelist = files;
    }
    return result;
}

/*
 * Alphabetical sorting.
 *
 * Comparison function for scandir(): orders two directory entries by
 * comparing their names with strcoll().
 */
static int
alphasort(
        const struct dirent **a, const struct dirent **b)
{
    const char *lhs = (*a)->d_name;
    const char *rhs = (*b)->d_name;

    return strcoll (lhs, rhs);
}

/*
 * Sort versions.
 *
 * Currently identical to alphasort(); no version-aware comparison is
 * performed.
 */
static int
versionsort(
        const struct dirent **a, const struct dirent **b)
{
    int order;

    /* FIXME: implement strverscmp and use that */
    order = alphasort (a, b);
    return order;
}

/*
 * Convert multi-byte string to wide character string.
 *
 * Returns zero on success and non-zero on failure.  On success, the number
 * of wide characters INCLUDING the zero terminator is stored to
 * *pReturnValue (if non-NULL).  With the fallback implementation, passing
 * NULL as wcstr only counts the characters.
 */
static int
dirent_mbstowcs_s(
        size_t *pReturnValue,
        wchar_t *wcstr,
        size_t sizeInWords,
        const char *mbstr,
        size_t count)
{
#if defined(_MSC_VER)  &&  _MSC_VER >= 1400

    /* Microsoft Visual Studio 2005 or later: use the secure CRT function */
    return mbstowcs_s (pReturnValue, wcstr, sizeInWords, mbstr, count);

#else

    /* Older Visual Studio or non-Microsoft compiler */
    size_t len = mbstowcs (wcstr, mbstr, sizeInWords);

    /*
     * With a real output buffer, the conversion is accepted only if the
     * number of converted characters stays below count.
     */
    if (wcstr != NULL  &&  len >= count) {
        /* Could not convert string */
        return 1;
    }

    /* Zero-terminate output buffer, clamping to its last element */
    if (wcstr != NULL  &&  sizeInWords != 0) {
        if (len >= sizeInWords) {
            len = sizeInWords - 1;
        }
        wcstr[len] = 0;
    }

    /* Length of resulting wide-character string WITH zero terminator */
    if (pReturnValue != NULL) {
        *pReturnValue = len + 1;
    }

    /* Success */
    return 0;

#endif
}

/*
 * Convert wide-character string to multi-byte string.
 *
 * Returns zero on success and non-zero on failure.  On success, the number
 * of bytes INCLUDING the zero terminator is stored to *pReturnValue (if
 * non-NULL).  With the fallback implementation, passing NULL as mbstr only
 * counts the bytes needed.
 */
static int
dirent_wcstombs_s(
        size_t *pReturnValue,
        char *mbstr,
        size_t sizeInBytes, /* max size of mbstr */
        const wchar_t *wcstr,
        size_t count)
{
#if defined(_MSC_VER)  &&  _MSC_VER >= 1400

    /* Microsoft Visual Studio 2005 or later: use the secure CRT function */
    return wcstombs_s (pReturnValue, mbstr, sizeInBytes, wcstr, count);

#else

    /* Older Visual Studio or non-Microsoft compiler */
    size_t len = wcstombs (mbstr, wcstr, sizeInBytes);

    /*
     * With a real output buffer, the conversion is accepted only if the
     * number of produced bytes stays below count.
     */
    if (mbstr != NULL  &&  len >= count) {
        /* Cannot convert string */
        return 1;
    }

    /* Zero-terminate output buffer, clamping to its last byte */
    if (mbstr != NULL  &&  sizeInBytes != 0) {
        if (len >= sizeInBytes) {
            len = sizeInBytes - 1;
        }
        mbstr[len] = '\0';
    }

    /* Length of resulting multi-byte string WITH zero terminator */
    if (pReturnValue != NULL) {
        *pReturnValue = len + 1;
    }

    /* Success */
    return 0;

#endif
}

/*
 * Set errno variable.
 *
 * Uses _set_errno() on Microsoft Visual Studio 2005 and later, and a plain
 * assignment to errno everywhere else.
 */
static void
dirent_set_errno(
        int error)
{
#if !defined(_MSC_VER)  ||  _MSC_VER < 1400

    /* Non-Microsoft compiler or older Microsoft compiler */
    errno = error;

#else

    /* Microsoft Visual Studio 2005 and later */
    _set_errno (error);

#endif
}


#ifdef __cplusplus
}
#endif
#endif /*DIRENT_H*/


================================================
FILE: yolov9/yolov9_trt.py
================================================
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

# Minimum confidence a detection needs to survive filtering; passed as
# `conf_thres` to yolov9TRT.non_max_suppression by yolov9TRT.post_process.
CONF_THRESH = 0.5
# IoU above which a lower-scored box is suppressed; passed as `nms_thres`
# to yolov9TRT.non_max_suppression by yolov9TRT.post_process.
IOU_THRESHOLD = 0.4


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory to walk
    return:
        a list of lists of file paths; the last batch may hold fewer
        than batch_size paths
    """
    all_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )

    batches = []
    current = []
    for path in all_paths:
        # Start a new batch once the current one is full
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)

    # Keep the trailing, possibly partial, batch
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw a single bounding box, and optionally a filled label
                 tag above its top-left corner, onto img in place.
    param:
        x:      a box likes [x1,y1,x2,y2]
        img:    a opencv image object
        color:  color to draw rectangle, such as (0,255,0);
                a random color is picked when omitted
        label:  str, text of the tag; no tag is drawn when omitted
        line_thickness: int; derived from the image size when omitted
    return:
        no return

    """
    # Line/font thickness
    if line_thickness:
        thickness = line_thickness
    else:
        thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1

    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The tag is a filled rectangle that grows upwards from the top-left corner
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


class yolov9TRT(object):
    """
    description: A yolov9 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    # Number of float values per detection row in the engine output.  Columns
    # 0-3 are the box, 4 is the confidence and 5 is the class id (see
    # post_process); this script does not read the remaining columns.
    DET_ROW_LEN = 38
    # Column of the class id inside one detection row.
    CLS_ID_COL = 5
    # Number of output floats per image: one leading detection count followed
    # by the detection rows.
    OUTPUT_LEN_PER_IMAGE = 38001

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the engine
                     and allocate host/device buffers for every binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dimensions of the input binding are taken as
                # the network input height and width.
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Preprocess the images, run the engine on them, then draw
                     the detections that survive NMS onto the raw images.
        param:
            raw_image_generator: iterable yielding at most batch_size BGR images
        return:
            batch_image_raw: list of the raw images with boxes drawn on them
            use_time:        seconds spent on host->device copy, inference and
                             device->host copy
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that the unused slots of a partial batch hold
            # defined values instead of uninitialised memory.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in `finally` so a failure above cannot leave the context pushed.
            self.ctx.pop()
        # The first output buffer holds one fixed-size slice per batch slot
        output = host_outputs[0]
        stride = self.OUTPUT_LEN_PER_IMAGE
        # Do postprocess.  Only the slots that actually received an image are
        # processed: the last batch of a directory may be shorter than
        # batch_size, and indexing the per-image lists by batch_size would
        # raise IndexError.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * stride: (i + 1) * stride], batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                # NOTE: `categories` is a module-level list that is only
                # defined when this file is run as a script.
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context from the context stack.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: the BGR image as an HxWxC array
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map nx4 boxes from network-input coordinates back to
                        original-image coordinates.  Despite its name, this
                        does not convert from center/size format: every row
                        is treated as [x1, y1, x2, y2]; the padding offset is
                        subtracted on the padded axis and the result is
                        divided by the resize ratio.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [x1, y1, x2, y2]
                        in network-input coordinates
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
                        in original-image coordinates
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Padding was added above and below the image
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Padding was added left and right of the image
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array: the detection count followed by
                        rows of DET_ROW_LEN values each, of which columns
                        0-3 are the box, 4 the confidence and 5 the class id
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, self.DET_ROW_LEN))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, self.CLS_ID_COL] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 treats coordinates as inclusive pixel indices)
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
            np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon guards against division by zero
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, one row each; columns 0-5 are
                        (x1, y1, x2, y2, conf, cls_id) with the box in
                        network-input coordinates, further columns are
                        carried through untouched
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, same column layout as prediction but
                   with the box in original-image coordinates
        """
        # Get the boxes that score >= conf_thres (boolean indexing copies,
        # so the caller's array is not modified below)
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the boxes from network-input to original-image coordinates
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        cls_col = self.CLS_ID_COL
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            # Compare class ids.  Rows are wider than six columns, so the
            # last column is NOT the class id and must not be used here.
            label_match = boxes[0, cls_col] == boxes[:, cls_col]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and
                 writes the annotated images into the 'output' directory.
    """

    def __init__(self, yolov9_wrapper, image_path_batch):
        super(inferThread, self).__init__()
        self.yolov9_wrapper = yolov9_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov9_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        annotated, elapsed = wrapper.infer(raw_images)
        for idx, img_path in enumerate(self.image_path_batch):
            # Save each image under its own file name inside 'output'
            target = os.path.join('output', os.path.basename(img_path))
            cv2.imwrite(target, annotated[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, elapsed * 1000))


class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference pass on all-zero images and
                 prints the shape of the first image and the elapsed time.
    """

    def __init__(self, yolov9_wrapper):
        super(warmUpThread, self).__init__()
        self.yolov9_wrapper = yolov9_wrapper

    def run(self):
        wrapper = self.yolov9_wrapper
        dummy_images = wrapper.get_raw_image_zeros()
        results, elapsed = wrapper.infer(dummy_images)
        first_shape = results[0].shape
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed * 1000))


if __name__ == "__main__":
    # Usage: yolov9_trt.py [engine_file] [plugin_library]
    # Both arguments are optional and fall back to the defaults below.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "yolov9-c.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "build/libmyplugins.so"

    # Load the custom plugin library before the engine is deserialized
    ctypes.CDLL(PLUGIN_LIBRARY)

    # COCO labels, indexed by class id; yolov9TRT.infer reads this module-level list
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
        "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
        "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
        "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
        "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
        "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]

    # Start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a yolov9TRT instance
    yolov9_wrapper = yolov9TRT(engine_file_path)
    try:
        print('batch size is', yolov9_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov9_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each on its own thread and run to completion
        for _ in range(10):
            thread1 = warmUpThread(yolov9_wrapper)
            thread1.start()
            thread1.join()

        # One inference thread per batch, run one after another
        for batch in image_path_batches:
            thread1 = inferThread(yolov9_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance
        yolov9_wrapper.destroy()