Repository: Tencent/ncnn Branch: master Commit: 939f24fc2b44 Files: 3805 Total size: 33.0 MB Directory structure: gitextract_nmtq5ath/ ├── .astylerc ├── .clang-format ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug.md │ │ ├── model-convert.md │ │ ├── others.md │ │ └── quantization.md │ ├── dependabot.yml │ ├── labeler.yml │ └── workflows/ │ ├── android.yml │ ├── code-format-msg.yml │ ├── code-format.yml │ ├── codeql-analysis.yml │ ├── compare-binary-size-pr-comment.yml │ ├── compare-binary-size.yml │ ├── elf-riscv32.yml │ ├── elf-riscv64.yml │ ├── esp32.yml │ ├── harmonyos.yml │ ├── ios.yml │ ├── labeler.yml │ ├── linux-aarch64.yml │ ├── linux-arm.yml │ ├── linux-loongarch64.yml │ ├── linux-mips.yml │ ├── linux-mips64.yml │ ├── linux-ppc64.yml │ ├── linux-riscv32.yml │ ├── linux-riscv64.yml │ ├── linux-x64-cpu-clang.yml │ ├── linux-x64-cpu-gcc-musl.yml │ ├── linux-x64-cpu-gcc.yml │ ├── linux-x64-gpu-clang.yml │ ├── linux-x64-gpu-gcc.yml │ ├── linux-x64-sde.yml │ ├── linux-x86-cpu-clang.yml │ ├── linux-x86-cpu-gcc.yml │ ├── mac-catalyst.yml │ ├── macos.yml │ ├── pnnx.yml │ ├── python.yml │ ├── release-python.yml │ ├── release.yml │ ├── sync-wiki.yml │ ├── test-coverage.yml │ ├── tvos.yml │ ├── visionos.yml │ ├── watchos.yml │ ├── web-assembly.yml │ ├── windows-arm.yml │ ├── windows-clang.yml │ ├── windows-mingw.yml │ ├── windows-xp.yml │ └── windows.yml ├── .gitignore ├── .gitmodules ├── CITATION.cff ├── CMakeLists.txt ├── CONTRIBUTING.md ├── Info.plist ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── benchmark/ │ ├── CMakeLists.txt │ ├── FastestDet.param │ ├── README.md │ ├── RankCards/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── Rcards.h │ │ └── main.cpp │ ├── alexnet.param │ ├── benchncnn.cpp │ ├── benchncnn_param_data.h.in │ ├── blazeface.param │ ├── efficientnet_b0.param │ ├── efficientnetv2_b0.param │ ├── googlenet.param │ ├── googlenet_int8.param │ ├── mnasnet.param │ ├── mobilenet.param │ ├── mobilenet_int8.param │ ├── 
mobilenet_ssd.param │ ├── mobilenet_ssd_int8.param │ ├── mobilenet_v2.param │ ├── mobilenet_v3.param │ ├── mobilenet_yolo.param │ ├── mobilenetv2_yolov3.param │ ├── nanodet_m.param │ ├── proxylessnasnet.param │ ├── regnety_400m.param │ ├── resnet18.param │ ├── resnet18_int8.param │ ├── resnet50.param │ ├── resnet50_int8.param │ ├── shufflenet.param │ ├── shufflenet_v2.param │ ├── squeezenet.param │ ├── squeezenet_int8.param │ ├── squeezenet_ssd.param │ ├── squeezenet_ssd_int8.param │ ├── vgg16.param │ ├── vgg16_int8.param │ ├── vision_transformer.param │ ├── yolo-fastest-1.1.param │ ├── yolo-fastestv2.param │ └── yolov4-tiny.param ├── build-android.cmd ├── build.sh ├── cmake/ │ ├── ncnnConfig.cmake.in │ ├── ncnn_add_layer.cmake │ ├── ncnn_add_param.cmake │ ├── ncnn_add_shader.cmake │ ├── ncnn_generate_avx512_source.cmake │ ├── ncnn_generate_avx_source.cmake │ ├── ncnn_generate_fma_source.cmake │ ├── ncnn_generate_lasx_source.cmake │ ├── ncnn_generate_lsx_source.cmake │ ├── ncnn_generate_msa_source.cmake │ ├── ncnn_generate_param_header.cmake │ ├── ncnn_generate_rvv_source.cmake │ ├── ncnn_generate_shader_comp_header.cmake │ ├── ncnn_generate_xtheadvector_source.cmake │ └── run_test.cmake ├── codeformat.sh ├── docs/ │ ├── Home.md │ ├── application-with-ncnn-inside.md │ ├── benchmark/ │ │ ├── the-benchmark-of-caffe-android-lib,-mini-caffe,-and-ncnn.md │ │ └── vulkan-conformance-test.md │ ├── developer-guide/ │ │ ├── aarch64-mix-assembly-and-intrinsic.md │ │ ├── add-custom-layer.zh.md │ │ ├── arm-a53-a55-dual-issue.md │ │ ├── armv7-mix-assembly-and-intrinsic.md │ │ ├── binaryop-broadcasting.md │ │ ├── build-ncnn-on-windows-xp.zh.md │ │ ├── custom-allocator.md │ │ ├── element-packing.md │ │ ├── expression.md │ │ ├── glsl-extension.md │ │ ├── glsl-extension.zh.md │ │ ├── how-to-be-a-contributor.zh.md │ │ ├── how-to-implement-custom-layer-step-by-step.md │ │ ├── how-to-write-a-neon-optimized-op-kernel.md │ │ ├── how-to-write-a-sse-optimized-op-kernel.zh.md │ │ ├── 
kvcache.md │ │ ├── layer-feat-mask.md │ │ ├── layer-support-behavior.md │ │ ├── low-level-operation-api.md │ │ ├── ncnn-tips-and-tricks.zh.md │ │ ├── new-model-load-api.md │ │ ├── new-param-load-api.md │ │ ├── operation-param-weight-table.md │ │ ├── operators.md │ │ ├── param-and-model-file-structure.md │ │ ├── preload-practice.zh.md │ │ ├── tensorflow-op-combination.md │ │ └── vulkan-driver-loader.md │ ├── faq.en.md │ ├── faq.md │ ├── how-to-build/ │ │ ├── build-mlir2ncnn.md │ │ └── how-to-build.md │ └── how-to-use-and-FAQ/ │ ├── FAQ-ncnn-produce-wrong-result.md │ ├── FAQ-ncnn-protobuf-problem.zh.md │ ├── FAQ-ncnn-throw-error.md │ ├── FAQ-ncnn-vulkan.md │ ├── build-minimal-library.md │ ├── efficient-roi-resize-rotate.md │ ├── ncnn-load-model.md │ ├── openmp-best-practice.md │ ├── openmp-best-practice.zh.md │ ├── quantized-int8-inference.md │ ├── use-ncnn-with-alexnet.md │ ├── use-ncnn-with-alexnet.zh.md │ ├── use-ncnn-with-opencv.md │ ├── use-ncnn-with-own-project.md │ ├── use-ncnn-with-pytorch-or-onnx.md │ ├── use-ncnnoptimize-to-optimize-model.md │ └── vulkan-notes.md ├── examples/ │ ├── CMakeLists.txt │ ├── arcface.cpp │ ├── fasterrcnn.cpp │ ├── mobilenetssd.cpp │ ├── mobilenetv2ssdlite.cpp │ ├── mobilenetv3ssdlite.cpp │ ├── nanodet.cpp │ ├── nanodetplus_pnnx.cpp │ ├── p2pnet.cpp │ ├── peleenetssd_seg.cpp │ ├── piper.cpp │ ├── ppocrv5.cpp │ ├── ppocrv5_dict.h │ ├── retinaface.cpp │ ├── rfcn.cpp │ ├── rvm.cpp │ ├── scrfd.cpp │ ├── scrfd_crowdhuman.cpp │ ├── shufflenetv2.cpp │ ├── simplepose.cpp │ ├── squeezencnn/ │ │ └── README.md │ ├── squeezenet.cpp │ ├── squeezenet_c_api.cpp │ ├── squeezenet_v1.1.caffemodel │ ├── squeezenet_v1.1.param │ ├── squeezenet_v1.1.prototxt │ ├── squeezenetssd.cpp │ ├── synset_words.txt │ ├── whisper.cpp │ ├── yolact.cpp │ ├── yolo11.cpp │ ├── yolo11_cls.cpp │ ├── yolo11_obb.cpp │ ├── yolo11_pose.cpp │ ├── yolo11_seg.cpp │ ├── yolov2.cpp │ ├── yolov3.cpp │ ├── yolov4.cpp │ ├── yolov5.cpp │ ├── yolov5_pnnx.cpp │ ├── yolov7.cpp │ ├── 
yolov7_pnnx.cpp │ ├── yolov8.cpp │ ├── yolov8_cls.cpp │ ├── yolov8_obb.cpp │ ├── yolov8_pose.cpp │ ├── yolov8_seg.cpp │ ├── yoloworld.cpp │ └── yolox.cpp ├── package.sh ├── pyproject.toml ├── python/ │ ├── CMakeLists.txt │ ├── README.md │ ├── examples/ │ │ ├── fasterrcnn.py │ │ ├── mobilenetssd.py │ │ ├── mobilenetv2ssdlite.py │ │ ├── mobilenetv3ssdlite.py │ │ ├── model_zoo.py │ │ ├── nanodet.py │ │ ├── peleenetssd.py │ │ ├── retinaface.py │ │ ├── rfcn.py │ │ ├── shufflenetv2.py │ │ ├── simplepose.py │ │ ├── squeezenet.py │ │ ├── squeezenetssd.py │ │ ├── yolact.py │ │ ├── yolov2.py │ │ ├── yolov3.py │ │ ├── yolov4.py │ │ ├── yolov5.py │ │ └── yolov8.py │ ├── ncnn/ │ │ ├── __init__.py │ │ ├── model_zoo/ │ │ │ ├── __init__.py │ │ │ ├── fasterrcnn.py │ │ │ ├── mobilenetssd.py │ │ │ ├── mobilenetv2ssdlite.py │ │ │ ├── mobilenetv3ssdlite.py │ │ │ ├── model_store.py │ │ │ ├── model_zoo.py │ │ │ ├── nanodet.py │ │ │ ├── peleenetssd.py │ │ │ ├── retinaface.py │ │ │ ├── rfcn.py │ │ │ ├── shufflenetv2.py │ │ │ ├── simplepose.py │ │ │ ├── squeezenet.py │ │ │ ├── squeezenetssd.py │ │ │ ├── yolact.py │ │ │ ├── yolov2.py │ │ │ ├── yolov3.py │ │ │ ├── yolov4.py │ │ │ ├── yolov5.py │ │ │ ├── yolov7.py │ │ │ └── yolov8.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── download.py │ │ ├── functional.py │ │ ├── objects.py │ │ └── visual.py │ ├── requirements.txt │ ├── setup.py.i │ ├── src/ │ │ ├── main.cpp │ │ ├── pybind11_allocator.h │ │ ├── pybind11_bind.h │ │ ├── pybind11_datareader.h │ │ ├── pybind11_layer.h │ │ ├── pybind11_mat.h │ │ └── pybind11_modelbin.h │ └── tests/ │ ├── benchmark.py │ ├── custom_layer.param │ ├── test.param │ ├── test_allocator.py │ ├── test_blob.py │ ├── test_extractor.py │ ├── test_mat.py │ ├── test_net.py │ ├── test_option.py │ ├── test_paramdict.py │ ├── test_vulkan_allocator.py │ └── test_vulkan_device.py ├── setup.py ├── src/ │ ├── CMakeLists.txt │ ├── allocator.cpp │ ├── allocator.h │ ├── benchmark.cpp │ ├── benchmark.h │ ├── blob.cpp │ ├── blob.h │ ├── 
c_api.cpp │ ├── c_api.h │ ├── command.cpp │ ├── command.h │ ├── convert_ycbcr.comp │ ├── cpu.cpp │ ├── cpu.h │ ├── datareader.cpp │ ├── datareader.h │ ├── expression.cpp │ ├── expression.h │ ├── gpu.cpp │ ├── gpu.h │ ├── layer/ │ │ ├── absval.cpp │ │ ├── absval.h │ │ ├── argmax.cpp │ │ ├── argmax.h │ │ ├── arm/ │ │ │ ├── absval_arm.cpp │ │ │ ├── absval_arm.h │ │ │ ├── arm_activation.h │ │ │ ├── arm_usability.h │ │ │ ├── batchnorm_arm.cpp │ │ │ ├── batchnorm_arm.h │ │ │ ├── batchnorm_arm_asimdhp.cpp │ │ │ ├── bias_arm.cpp │ │ │ ├── bias_arm.h │ │ │ ├── binaryop_arm.cpp │ │ │ ├── binaryop_arm.h │ │ │ ├── binaryop_arm_asimdhp.cpp │ │ │ ├── cast_arm.cpp │ │ │ ├── cast_arm.h │ │ │ ├── cast_arm_bf16.cpp │ │ │ ├── cast_arm_vfpv4.cpp │ │ │ ├── cast_bf16.h │ │ │ ├── cast_fp16.h │ │ │ ├── clip_arm.cpp │ │ │ ├── clip_arm.h │ │ │ ├── clip_arm_asimdhp.cpp │ │ │ ├── concat_arm.cpp │ │ │ ├── concat_arm.h │ │ │ ├── convolution1d_arm.cpp │ │ │ ├── convolution1d_arm.h │ │ │ ├── convolution1d_arm_asimdhp.cpp │ │ │ ├── convolution1d_packed.h │ │ │ ├── convolution1d_packed_bf16s.h │ │ │ ├── convolution1d_packed_fp16s.h │ │ │ ├── convolution_1x1.h │ │ │ ├── convolution_2x2.h │ │ │ ├── convolution_3x3.h │ │ │ ├── convolution_3x3_int8.h │ │ │ ├── convolution_3x3_pack1to4.h │ │ │ ├── convolution_3x3_pack1to4_bf16s.h │ │ │ ├── convolution_3x3_pack1to4_fp16s.h │ │ │ ├── convolution_3x3_pack1to8_fp16s.h │ │ │ ├── convolution_3x3_pack4.h │ │ │ ├── convolution_3x3_pack4_bf16s.h │ │ │ ├── convolution_3x3_pack4_fp16s.h │ │ │ ├── convolution_3x3_pack4to1.h │ │ │ ├── convolution_3x3_pack8_fp16s.h │ │ │ ├── convolution_3x3_winograd.h │ │ │ ├── convolution_3x3_winograd_bf16s.h │ │ │ ├── convolution_3x3_winograd_fp16s.h │ │ │ ├── convolution_3x3_winograd_int8.h │ │ │ ├── convolution_4x4.h │ │ │ ├── convolution_5x5.h │ │ │ ├── convolution_5x5_pack4.h │ │ │ ├── convolution_5x5_pack4_bf16s.h │ │ │ ├── convolution_5x5_pack8_fp16s.h │ │ │ ├── convolution_7x7.h │ │ │ ├── convolution_7x7_pack1to4.h │ │ │ ├── 
convolution_7x7_pack1to4_bf16s.h │ │ │ ├── convolution_7x7_pack1to8_fp16s.h │ │ │ ├── convolution_arm.cpp │ │ │ ├── convolution_arm.h │ │ │ ├── convolution_arm_asimddp.cpp │ │ │ ├── convolution_arm_asimdhp.cpp │ │ │ ├── convolution_arm_i8mm.cpp │ │ │ ├── convolution_im2col_gemm.h │ │ │ ├── convolution_im2col_gemm_bf16s.h │ │ │ ├── convolution_im2col_gemm_bf16s_fp16s.h │ │ │ ├── convolution_im2col_gemm_fp16s.h │ │ │ ├── convolution_im2col_gemm_int8.h │ │ │ ├── convolution_packed.h │ │ │ ├── convolution_packed_bf16s.h │ │ │ ├── convolution_packed_fp16s.h │ │ │ ├── convolution_packed_int8.h │ │ │ ├── convolutiondepthwise_3x3.h │ │ │ ├── convolutiondepthwise_3x3_fp16s.h │ │ │ ├── convolutiondepthwise_3x3_int8.h │ │ │ ├── convolutiondepthwise_3x3_pack4.h │ │ │ ├── convolutiondepthwise_3x3_pack4_bf16s.h │ │ │ ├── convolutiondepthwise_3x3_pack8_fp16s.h │ │ │ ├── convolutiondepthwise_3x3_pack8_int8.h │ │ │ ├── convolutiondepthwise_5x5.h │ │ │ ├── convolutiondepthwise_5x5_pack4.h │ │ │ ├── convolutiondepthwise_5x5_pack4_bf16s.h │ │ │ ├── convolutiondepthwise_5x5_pack8_fp16s.h │ │ │ ├── convolutiondepthwise_arm.cpp │ │ │ ├── convolutiondepthwise_arm.h │ │ │ ├── convolutiondepthwise_arm_asimdhp.cpp │ │ │ ├── crop_arm.cpp │ │ │ ├── crop_arm.h │ │ │ ├── deconvolution_3x3.h │ │ │ ├── deconvolution_4x4.h │ │ │ ├── deconvolution_4x4_fp16s.h │ │ │ ├── deconvolution_arm.cpp │ │ │ ├── deconvolution_arm.h │ │ │ ├── deconvolution_arm_asimdhp.cpp │ │ │ ├── deconvolutiondepthwise_arm.cpp │ │ │ ├── deconvolutiondepthwise_arm.h │ │ │ ├── deconvolutiondepthwise_arm_asimdhp.cpp │ │ │ ├── dequantize_arm.cpp │ │ │ ├── dequantize_arm.h │ │ │ ├── dequantize_arm_asimdhp.cpp │ │ │ ├── dropout_arm.cpp │ │ │ ├── dropout_arm.h │ │ │ ├── eltwise_arm.cpp │ │ │ ├── eltwise_arm.h │ │ │ ├── eltwise_arm_asimdhp.cpp │ │ │ ├── flatten_arm.cpp │ │ │ ├── flatten_arm.h │ │ │ ├── gelu_arm.cpp │ │ │ ├── gelu_arm.h │ │ │ ├── gelu_arm_asimdhp.cpp │ │ │ ├── gemm_arm.cpp │ │ │ ├── gemm_arm.h │ │ │ ├── 
gemm_arm_asimddp.cpp │ │ │ ├── gemm_arm_asimdfhm.cpp │ │ │ ├── gemm_arm_asimdhp.cpp │ │ │ ├── gemm_arm_i8mm.cpp │ │ │ ├── gemm_arm_vfpv4.cpp │ │ │ ├── gemm_bf16s.h │ │ │ ├── gemm_bf16s_fp16s.h │ │ │ ├── gemm_fp16s.h │ │ │ ├── gemm_int8.h │ │ │ ├── gemm_int8_bf16s.h │ │ │ ├── gemm_int8_fp16s.h │ │ │ ├── groupnorm_arm.cpp │ │ │ ├── groupnorm_arm.h │ │ │ ├── groupnorm_arm_asimdhp.cpp │ │ │ ├── gru_arm.cpp │ │ │ ├── gru_arm.h │ │ │ ├── gru_arm_asimddp.cpp │ │ │ ├── gru_arm_asimdhp.cpp │ │ │ ├── gru_arm_vfpv4.cpp │ │ │ ├── gru_int8.h │ │ │ ├── hardsigmoid_arm.cpp │ │ │ ├── hardsigmoid_arm.h │ │ │ ├── hardsigmoid_arm_asimdhp.cpp │ │ │ ├── hardswish_arm.cpp │ │ │ ├── hardswish_arm.h │ │ │ ├── hardswish_arm_asimdhp.cpp │ │ │ ├── innerproduct_arm.cpp │ │ │ ├── innerproduct_arm.h │ │ │ ├── innerproduct_arm_asimdfhm.cpp │ │ │ ├── innerproduct_arm_asimdhp.cpp │ │ │ ├── innerproduct_arm_vfpv4.cpp │ │ │ ├── innerproduct_fp16s.h │ │ │ ├── innerproduct_gemm_fp16s.h │ │ │ ├── instancenorm_arm.cpp │ │ │ ├── instancenorm_arm.h │ │ │ ├── instancenorm_arm_asimdhp.cpp │ │ │ ├── interp_arm.cpp │ │ │ ├── interp_arm.h │ │ │ ├── interp_arm_asimdhp.cpp │ │ │ ├── interp_bicubic.h │ │ │ ├── interp_bicubic_bf16s.h │ │ │ ├── interp_bicubic_fp16s.h │ │ │ ├── interp_bicubic_pack4.h │ │ │ ├── interp_bicubic_pack4_bf16s.h │ │ │ ├── interp_bicubic_pack4_fp16s.h │ │ │ ├── interp_bicubic_pack8_fp16s.h │ │ │ ├── interp_bilinear.h │ │ │ ├── interp_bilinear_bf16s.h │ │ │ ├── interp_bilinear_fp16s.h │ │ │ ├── interp_bilinear_pack4.h │ │ │ ├── interp_bilinear_pack4_bf16s.h │ │ │ ├── interp_bilinear_pack4_fp16s.h │ │ │ ├── interp_bilinear_pack8_fp16s.h │ │ │ ├── layernorm_arm.cpp │ │ │ ├── layernorm_arm.h │ │ │ ├── layernorm_arm_asimdhp.cpp │ │ │ ├── lrn_arm.cpp │ │ │ ├── lrn_arm.h │ │ │ ├── lstm_arm.cpp │ │ │ ├── lstm_arm.h │ │ │ ├── lstm_arm_asimddp.cpp │ │ │ ├── lstm_arm_asimdhp.cpp │ │ │ ├── lstm_arm_vfpv4.cpp │ │ │ ├── lstm_int8.h │ │ │ ├── matmul_arm.cpp │ │ │ ├── matmul_arm.h │ │ │ ├── mish_arm.cpp │ 
│ │ ├── mish_arm.h │ │ │ ├── mish_arm_asimdhp.cpp │ │ │ ├── multiheadattention_arm.cpp │ │ │ ├── multiheadattention_arm.h │ │ │ ├── neon_mathfun.h │ │ │ ├── neon_mathfun_fp16s.h │ │ │ ├── neon_mathfun_tanh.h │ │ │ ├── packing_arm.cpp │ │ │ ├── packing_arm.h │ │ │ ├── padding_arm.cpp │ │ │ ├── padding_arm.h │ │ │ ├── padding_pack4.h │ │ │ ├── padding_pack4_bf16s_fp16s.h │ │ │ ├── padding_pack8_fp16s.h │ │ │ ├── padding_pack8_int8.h │ │ │ ├── pixelshuffle_arm.cpp │ │ │ ├── pixelshuffle_arm.h │ │ │ ├── pooling_2x2.h │ │ │ ├── pooling_2x2_pack4.h │ │ │ ├── pooling_2x2_pack4_bf16s.h │ │ │ ├── pooling_3x3.h │ │ │ ├── pooling_3x3_pack4.h │ │ │ ├── pooling_3x3_pack4_bf16s.h │ │ │ ├── pooling_arm.cpp │ │ │ ├── pooling_arm.h │ │ │ ├── pooling_arm_asimdhp.cpp │ │ │ ├── prelu_arm.cpp │ │ │ ├── prelu_arm.h │ │ │ ├── prelu_arm_asimdhp.cpp │ │ │ ├── quantize_arm.cpp │ │ │ ├── quantize_arm.h │ │ │ ├── quantize_arm_asimdhp.cpp │ │ │ ├── relu_arm.cpp │ │ │ ├── relu_arm.h │ │ │ ├── relu_arm_asimdhp.cpp │ │ │ ├── requantize_arm.cpp │ │ │ ├── requantize_arm.h │ │ │ ├── reshape_arm.cpp │ │ │ ├── reshape_arm.h │ │ │ ├── rmsnorm_arm.cpp │ │ │ ├── rmsnorm_arm.h │ │ │ ├── rmsnorm_arm_asimdhp.cpp │ │ │ ├── rnn_arm.cpp │ │ │ ├── rnn_arm.h │ │ │ ├── rnn_arm_asimddp.cpp │ │ │ ├── rnn_arm_asimdhp.cpp │ │ │ ├── rnn_arm_vfpv4.cpp │ │ │ ├── rnn_int8.h │ │ │ ├── scale_arm.cpp │ │ │ ├── scale_arm.h │ │ │ ├── selu_arm.cpp │ │ │ ├── selu_arm.h │ │ │ ├── shufflechannel_arm.cpp │ │ │ ├── shufflechannel_arm.h │ │ │ ├── sigmoid_arm.cpp │ │ │ ├── sigmoid_arm.h │ │ │ ├── sigmoid_arm_asimdhp.cpp │ │ │ ├── slice_arm.cpp │ │ │ ├── slice_arm.h │ │ │ ├── softmax_arm.cpp │ │ │ ├── softmax_arm.h │ │ │ ├── softmax_arm_asimdhp.cpp │ │ │ ├── swish_arm.cpp │ │ │ ├── swish_arm.h │ │ │ ├── swish_arm_asimdhp.cpp │ │ │ ├── tanh_arm.cpp │ │ │ ├── tanh_arm.h │ │ │ ├── tanh_arm_asimdhp.cpp │ │ │ ├── unaryop_arm.cpp │ │ │ ├── unaryop_arm.h │ │ │ └── unaryop_arm_asimdhp.cpp │ │ ├── batchnorm.cpp │ │ ├── batchnorm.h │ │ ├── 
bias.cpp │ │ ├── bias.h │ │ ├── binaryop.cpp │ │ ├── binaryop.h │ │ ├── bnll.cpp │ │ ├── bnll.h │ │ ├── cast.cpp │ │ ├── cast.h │ │ ├── celu.cpp │ │ ├── celu.h │ │ ├── clip.cpp │ │ ├── clip.h │ │ ├── concat.cpp │ │ ├── concat.h │ │ ├── convolution.cpp │ │ ├── convolution.h │ │ ├── convolution1d.cpp │ │ ├── convolution1d.h │ │ ├── convolution3d.cpp │ │ ├── convolution3d.h │ │ ├── convolutiondepthwise.cpp │ │ ├── convolutiondepthwise.h │ │ ├── convolutiondepthwise1d.cpp │ │ ├── convolutiondepthwise1d.h │ │ ├── convolutiondepthwise3d.cpp │ │ ├── convolutiondepthwise3d.h │ │ ├── copyto.cpp │ │ ├── copyto.h │ │ ├── crop.cpp │ │ ├── crop.h │ │ ├── cumulativesum.cpp │ │ ├── cumulativesum.h │ │ ├── deconvolution.cpp │ │ ├── deconvolution.h │ │ ├── deconvolution1d.cpp │ │ ├── deconvolution1d.h │ │ ├── deconvolution3d.cpp │ │ ├── deconvolution3d.h │ │ ├── deconvolutiondepthwise.cpp │ │ ├── deconvolutiondepthwise.h │ │ ├── deconvolutiondepthwise1d.cpp │ │ ├── deconvolutiondepthwise1d.h │ │ ├── deconvolutiondepthwise3d.cpp │ │ ├── deconvolutiondepthwise3d.h │ │ ├── deepcopy.cpp │ │ ├── deepcopy.h │ │ ├── deformableconv2d.cpp │ │ ├── deformableconv2d.h │ │ ├── dequantize.cpp │ │ ├── dequantize.h │ │ ├── detectionoutput.cpp │ │ ├── detectionoutput.h │ │ ├── diag.cpp │ │ ├── diag.h │ │ ├── dropout.cpp │ │ ├── dropout.h │ │ ├── einsum.cpp │ │ ├── einsum.h │ │ ├── eltwise.cpp │ │ ├── eltwise.h │ │ ├── elu.cpp │ │ ├── elu.h │ │ ├── embed.cpp │ │ ├── embed.h │ │ ├── erf.cpp │ │ ├── erf.h │ │ ├── exp.cpp │ │ ├── exp.h │ │ ├── expanddims.cpp │ │ ├── expanddims.h │ │ ├── flatten.cpp │ │ ├── flatten.h │ │ ├── flip.cpp │ │ ├── flip.h │ │ ├── fold.cpp │ │ ├── fold.h │ │ ├── fused_activation.h │ │ ├── gelu.cpp │ │ ├── gelu.h │ │ ├── gemm.cpp │ │ ├── gemm.h │ │ ├── glu.cpp │ │ ├── glu.h │ │ ├── gridsample.cpp │ │ ├── gridsample.h │ │ ├── groupnorm.cpp │ │ ├── groupnorm.h │ │ ├── gru.cpp │ │ ├── gru.h │ │ ├── hardsigmoid.cpp │ │ ├── hardsigmoid.h │ │ ├── hardswish.cpp │ │ ├── hardswish.h │ │ 
├── innerproduct.cpp │ │ ├── innerproduct.h │ │ ├── input.cpp │ │ ├── input.h │ │ ├── instancenorm.cpp │ │ ├── instancenorm.h │ │ ├── interp.cpp │ │ ├── interp.h │ │ ├── inversespectrogram.cpp │ │ ├── inversespectrogram.h │ │ ├── layernorm.cpp │ │ ├── layernorm.h │ │ ├── log.cpp │ │ ├── log.h │ │ ├── loongarch/ │ │ │ ├── absval_loongarch.cpp │ │ │ ├── absval_loongarch.h │ │ │ ├── batchnorm_loongarch.cpp │ │ │ ├── batchnorm_loongarch.h │ │ │ ├── bias_loongarch.cpp │ │ │ ├── bias_loongarch.h │ │ │ ├── binaryop_loongarch.cpp │ │ │ ├── binaryop_loongarch.h │ │ │ ├── cast_loongarch.cpp │ │ │ ├── cast_loongarch.h │ │ │ ├── clip_loongarch.cpp │ │ │ ├── clip_loongarch.h │ │ │ ├── concat_loongarch.cpp │ │ │ ├── concat_loongarch.h │ │ │ ├── convolution1d_loongarch.cpp │ │ │ ├── convolution1d_loongarch.h │ │ │ ├── convolution_1x1.h │ │ │ ├── convolution_1x1_int8.h │ │ │ ├── convolution_1x1_pack1to4_int8.h │ │ │ ├── convolution_1x1_pack4.h │ │ │ ├── convolution_1x1_pack4to1.h │ │ │ ├── convolution_1x1_pack8to1_int8.h │ │ │ ├── convolution_1x1_pack8to4_int8.h │ │ │ ├── convolution_3x3.h │ │ │ ├── convolution_3x3_int8.h │ │ │ ├── convolution_3x3_pack1to4.h │ │ │ ├── convolution_3x3_pack4.h │ │ │ ├── convolution_3x3_pack8to1_int8.h │ │ │ ├── convolution_3x3_pack8to4_int8.h │ │ │ ├── convolution_7x7_pack1to4.h │ │ │ ├── convolution_int8.h │ │ │ ├── convolution_loongarch.cpp │ │ │ ├── convolution_loongarch.h │ │ │ ├── convolution_pack1to4.h │ │ │ ├── convolution_pack1to4_int8.h │ │ │ ├── convolution_pack4.h │ │ │ ├── convolution_pack4to1.h │ │ │ ├── convolution_pack8to1_int8.h │ │ │ ├── convolution_pack8to4_int8.h │ │ │ ├── convolution_sgemm.h │ │ │ ├── convolution_sgemm_int8.h │ │ │ ├── convolution_sgemm_pack1to4_int8.h │ │ │ ├── convolution_sgemm_pack4.h │ │ │ ├── convolution_sgemm_pack4to1.h │ │ │ ├── convolution_sgemm_pack8to1_int8.h │ │ │ ├── convolution_sgemm_pack8to4_int8.h │ │ │ ├── convolution_winograd_dot.h │ │ │ ├── convolution_winograd_dot_int8.h │ │ │ ├── 
convolution_winograd_dot_pack4.h │ │ │ ├── convolution_winograd_dot_pack8to1_int8.h │ │ │ ├── convolution_winograd_dot_pack8to4_int8.h │ │ │ ├── convolution_winograd_transform.h │ │ │ ├── convolution_winograd_transform_int8.h │ │ │ ├── convolution_winograd_transform_pack4.h │ │ │ ├── convolution_winograd_transform_pack4_int8.h │ │ │ ├── convolution_winograd_transform_pack8_int8.h │ │ │ ├── convolutiondepthwise_3x3.h │ │ │ ├── convolutiondepthwise_3x3_pack4.h │ │ │ ├── convolutiondepthwise_5x5_pack4.h │ │ │ ├── convolutiondepthwise_loongarch.cpp │ │ │ ├── convolutiondepthwise_loongarch.h │ │ │ ├── crop_loongarch.cpp │ │ │ ├── crop_loongarch.h │ │ │ ├── deconvolution_loongarch.cpp │ │ │ ├── deconvolution_loongarch.h │ │ │ ├── deconvolution_pack1to4.h │ │ │ ├── deconvolution_pack4.h │ │ │ ├── deconvolution_pack4to1.h │ │ │ ├── deconvolutiondepthwise_loongarch.cpp │ │ │ ├── deconvolutiondepthwise_loongarch.h │ │ │ ├── dequantize_loongarch.cpp │ │ │ ├── dequantize_loongarch.h │ │ │ ├── dropout_loongarch.cpp │ │ │ ├── dropout_loongarch.h │ │ │ ├── eltwise_loongarch.cpp │ │ │ ├── eltwise_loongarch.h │ │ │ ├── flatten_loongarch.cpp │ │ │ ├── flatten_loongarch.h │ │ │ ├── hardsigmoid_loongarch.cpp │ │ │ ├── hardsigmoid_loongarch.h │ │ │ ├── hardswish_loongarch.cpp │ │ │ ├── hardswish_loongarch.h │ │ │ ├── innerproduct_loongarch.cpp │ │ │ ├── innerproduct_loongarch.h │ │ │ ├── interp_bicubic.h │ │ │ ├── interp_bicubic_pack4.h │ │ │ ├── interp_bilinear.h │ │ │ ├── interp_bilinear_pack4.h │ │ │ ├── interp_loongarch.cpp │ │ │ ├── interp_loongarch.h │ │ │ ├── lasx_mathfun.h │ │ │ ├── loongarch_activation.h │ │ │ ├── loongarch_usability.h │ │ │ ├── lsx_mathfun.h │ │ │ ├── mish_loongarch.cpp │ │ │ ├── mish_loongarch.h │ │ │ ├── packing_loongarch.cpp │ │ │ ├── packing_loongarch.h │ │ │ ├── padding_loongarch.cpp │ │ │ ├── padding_loongarch.h │ │ │ ├── padding_pack4.h │ │ │ ├── padding_pack8_int8.h │ │ │ ├── pooling_loongarch.cpp │ │ │ ├── pooling_loongarch.h │ │ │ ├── 
prelu_loongarch.cpp │ │ │ ├── prelu_loongarch.h │ │ │ ├── quantize_loongarch.cpp │ │ │ ├── quantize_loongarch.h │ │ │ ├── relu_loongarch.cpp │ │ │ ├── relu_loongarch.h │ │ │ ├── requantize_loongarch.cpp │ │ │ ├── requantize_loongarch.h │ │ │ ├── sigmoid_loongarch.cpp │ │ │ ├── sigmoid_loongarch.h │ │ │ ├── slice_loongarch.cpp │ │ │ ├── slice_loongarch.h │ │ │ ├── softmax_loongarch.cpp │ │ │ ├── softmax_loongarch.h │ │ │ ├── swish_loongarch.cpp │ │ │ ├── swish_loongarch.h │ │ │ ├── tanh_loongarch.cpp │ │ │ ├── tanh_loongarch.h │ │ │ ├── unaryop_loongarch.cpp │ │ │ └── unaryop_loongarch.h │ │ ├── lrn.cpp │ │ ├── lrn.h │ │ ├── lstm.cpp │ │ ├── lstm.h │ │ ├── matmul.cpp │ │ ├── matmul.h │ │ ├── memorydata.cpp │ │ ├── memorydata.h │ │ ├── mips/ │ │ │ ├── absval_mips.cpp │ │ │ ├── absval_mips.h │ │ │ ├── batchnorm_mips.cpp │ │ │ ├── batchnorm_mips.h │ │ │ ├── bias_mips.cpp │ │ │ ├── bias_mips.h │ │ │ ├── binaryop_mips.cpp │ │ │ ├── binaryop_mips.h │ │ │ ├── cast_mips.cpp │ │ │ ├── cast_mips.h │ │ │ ├── clip_mips.cpp │ │ │ ├── clip_mips.h │ │ │ ├── concat_mips.cpp │ │ │ ├── concat_mips.h │ │ │ ├── convolution1d_mips.cpp │ │ │ ├── convolution1d_mips.h │ │ │ ├── convolution_1x1.h │ │ │ ├── convolution_1x1_int8.h │ │ │ ├── convolution_1x1_pack1to4_int8.h │ │ │ ├── convolution_1x1_pack4.h │ │ │ ├── convolution_1x1_pack4to1.h │ │ │ ├── convolution_1x1_pack8to1_int8.h │ │ │ ├── convolution_1x1_pack8to4_int8.h │ │ │ ├── convolution_3x3.h │ │ │ ├── convolution_3x3_int8.h │ │ │ ├── convolution_3x3_pack1to4.h │ │ │ ├── convolution_3x3_pack4.h │ │ │ ├── convolution_3x3_pack8to1_int8.h │ │ │ ├── convolution_3x3_pack8to4_int8.h │ │ │ ├── convolution_7x7_pack1to4.h │ │ │ ├── convolution_int8.h │ │ │ ├── convolution_mips.cpp │ │ │ ├── convolution_mips.h │ │ │ ├── convolution_mips_mmi.cpp │ │ │ ├── convolution_pack1to4.h │ │ │ ├── convolution_pack1to4_int8.h │ │ │ ├── convolution_pack4.h │ │ │ ├── convolution_pack4to1.h │ │ │ ├── convolution_pack8to1_int8.h │ │ │ ├── 
convolution_pack8to4_int8.h │ │ │ ├── convolution_sgemm.h │ │ │ ├── convolution_sgemm_int8.h │ │ │ ├── convolution_sgemm_pack1to4_int8.h │ │ │ ├── convolution_sgemm_pack4.h │ │ │ ├── convolution_sgemm_pack4to1.h │ │ │ ├── convolution_sgemm_pack8to1_int8.h │ │ │ ├── convolution_sgemm_pack8to4_int8.h │ │ │ ├── convolution_winograd_dot.h │ │ │ ├── convolution_winograd_dot_int8.h │ │ │ ├── convolution_winograd_dot_pack4.h │ │ │ ├── convolution_winograd_dot_pack8to1_int8.h │ │ │ ├── convolution_winograd_dot_pack8to4_int8.h │ │ │ ├── convolution_winograd_transform.h │ │ │ ├── convolution_winograd_transform_int8.h │ │ │ ├── convolution_winograd_transform_pack4.h │ │ │ ├── convolution_winograd_transform_pack4_int8.h │ │ │ ├── convolution_winograd_transform_pack8_int8.h │ │ │ ├── convolutiondepthwise_3x3.h │ │ │ ├── convolutiondepthwise_3x3_pack4.h │ │ │ ├── convolutiondepthwise_5x5_pack4.h │ │ │ ├── convolutiondepthwise_mips.cpp │ │ │ ├── convolutiondepthwise_mips.h │ │ │ ├── crop_mips.cpp │ │ │ ├── crop_mips.h │ │ │ ├── deconvolution_mips.cpp │ │ │ ├── deconvolution_mips.h │ │ │ ├── deconvolution_pack1to4.h │ │ │ ├── deconvolution_pack4.h │ │ │ ├── deconvolution_pack4to1.h │ │ │ ├── deconvolutiondepthwise_mips.cpp │ │ │ ├── deconvolutiondepthwise_mips.h │ │ │ ├── dequantize_mips.cpp │ │ │ ├── dequantize_mips.h │ │ │ ├── dropout_mips.cpp │ │ │ ├── dropout_mips.h │ │ │ ├── eltwise_mips.cpp │ │ │ ├── eltwise_mips.h │ │ │ ├── elu_mips.cpp │ │ │ ├── elu_mips.h │ │ │ ├── erf_mips.cpp │ │ │ ├── erf_mips.h │ │ │ ├── flatten_mips.cpp │ │ │ ├── flatten_mips.h │ │ │ ├── gelu_mips.cpp │ │ │ ├── gelu_mips.h │ │ │ ├── hardsigmoid_mips.cpp │ │ │ ├── hardsigmoid_mips.h │ │ │ ├── hardswish_mips.cpp │ │ │ ├── hardswish_mips.h │ │ │ ├── innerproduct_mips.cpp │ │ │ ├── innerproduct_mips.h │ │ │ ├── interp_bicubic.h │ │ │ ├── interp_bicubic_pack4.h │ │ │ ├── interp_bilinear.h │ │ │ ├── interp_bilinear_pack4.h │ │ │ ├── interp_mips.cpp │ │ │ ├── interp_mips.h │ │ │ ├── loongson_mmi.h │ │ │ ├── 
mips_activation.h │ │ │ ├── mips_usability.h │ │ │ ├── mish_mips.cpp │ │ │ ├── mish_mips.h │ │ │ ├── msa_mathfun.h │ │ │ ├── packing_mips.cpp │ │ │ ├── packing_mips.h │ │ │ ├── padding_mips.cpp │ │ │ ├── padding_mips.h │ │ │ ├── padding_pack4.h │ │ │ ├── padding_pack8_int8.h │ │ │ ├── pooling_mips.cpp │ │ │ ├── pooling_mips.h │ │ │ ├── prelu_mips.cpp │ │ │ ├── prelu_mips.h │ │ │ ├── quantize_mips.cpp │ │ │ ├── quantize_mips.h │ │ │ ├── relu_mips.cpp │ │ │ ├── relu_mips.h │ │ │ ├── requantize_mips.cpp │ │ │ ├── requantize_mips.h │ │ │ ├── selu_mips.cpp │ │ │ ├── selu_mips.h │ │ │ ├── sigmoid_mips.cpp │ │ │ ├── sigmoid_mips.h │ │ │ ├── slice_mips.cpp │ │ │ ├── slice_mips.h │ │ │ ├── softmax_mips.cpp │ │ │ ├── softmax_mips.h │ │ │ ├── swish_mips.cpp │ │ │ ├── swish_mips.h │ │ │ ├── tanh_mips.cpp │ │ │ ├── tanh_mips.h │ │ │ ├── unaryop_mips.cpp │ │ │ └── unaryop_mips.h │ │ ├── mish.cpp │ │ ├── mish.h │ │ ├── multiheadattention.cpp │ │ ├── multiheadattention.h │ │ ├── mvn.cpp │ │ ├── mvn.h │ │ ├── noop.cpp │ │ ├── noop.h │ │ ├── normalize.cpp │ │ ├── normalize.h │ │ ├── packing.cpp │ │ ├── packing.h │ │ ├── padding.cpp │ │ ├── padding.h │ │ ├── permute.cpp │ │ ├── permute.h │ │ ├── pixelshuffle.cpp │ │ ├── pixelshuffle.h │ │ ├── pooling.cpp │ │ ├── pooling.h │ │ ├── pooling1d.cpp │ │ ├── pooling1d.h │ │ ├── pooling3d.cpp │ │ ├── pooling3d.h │ │ ├── power.cpp │ │ ├── power.h │ │ ├── prelu.cpp │ │ ├── prelu.h │ │ ├── priorbox.cpp │ │ ├── priorbox.h │ │ ├── proposal.cpp │ │ ├── proposal.h │ │ ├── psroipooling.cpp │ │ ├── psroipooling.h │ │ ├── quantize.cpp │ │ ├── quantize.h │ │ ├── reduction.cpp │ │ ├── reduction.h │ │ ├── relu.cpp │ │ ├── relu.h │ │ ├── reorg.cpp │ │ ├── reorg.h │ │ ├── requantize.cpp │ │ ├── requantize.h │ │ ├── reshape.cpp │ │ ├── reshape.h │ │ ├── riscv/ │ │ │ ├── absval_riscv.cpp │ │ │ ├── absval_riscv.h │ │ │ ├── absval_riscv_zfh.cpp │ │ │ ├── batchnorm_riscv.cpp │ │ │ ├── batchnorm_riscv.h │ │ │ ├── batchnorm_riscv_zfh.cpp │ │ │ ├── bias_riscv.cpp 
│ │ │ ├── bias_riscv.h │ │ │ ├── bias_riscv_zfh.cpp │ │ │ ├── binaryop_riscv.cpp │ │ │ ├── binaryop_riscv.h │ │ │ ├── binaryop_riscv_zfh.cpp │ │ │ ├── bnll_riscv.cpp │ │ │ ├── bnll_riscv.h │ │ │ ├── bnll_riscv_zfh.cpp │ │ │ ├── cast_riscv.cpp │ │ │ ├── cast_riscv.h │ │ │ ├── cast_riscv_zfh.cpp │ │ │ ├── celu_riscv.cpp │ │ │ ├── celu_riscv.h │ │ │ ├── celu_riscv_zfh.cpp │ │ │ ├── clip_riscv.cpp │ │ │ ├── clip_riscv.h │ │ │ ├── clip_riscv_zfh.cpp │ │ │ ├── concat_riscv.cpp │ │ │ ├── concat_riscv.h │ │ │ ├── convolution1d_riscv.cpp │ │ │ ├── convolution1d_riscv.h │ │ │ ├── convolution1d_riscv_zfh.cpp │ │ │ ├── convolution_1x1.h │ │ │ ├── convolution_1x1_fp16s.h │ │ │ ├── convolution_1x1_pack1ton.h │ │ │ ├── convolution_1x1_pack1ton_fp16s.h │ │ │ ├── convolution_1x1_packn.h │ │ │ ├── convolution_1x1_packn_fp16s.h │ │ │ ├── convolution_1x1_packnto1.h │ │ │ ├── convolution_1x1_packnto1_fp16s.h │ │ │ ├── convolution_3x3.h │ │ │ ├── convolution_3x3_pack1ton.h │ │ │ ├── convolution_3x3_pack1ton_fp16s.h │ │ │ ├── convolution_3x3_packn.h │ │ │ ├── convolution_3x3_packn_fp16s.h │ │ │ ├── convolution_7x7_pack1ton.h │ │ │ ├── convolution_7x7_pack1ton_fp16s.h │ │ │ ├── convolution_fp16s.h │ │ │ ├── convolution_pack1ton.h │ │ │ ├── convolution_pack1ton_fp16s.h │ │ │ ├── convolution_packn.h │ │ │ ├── convolution_packn_fp16s.h │ │ │ ├── convolution_packnto1.h │ │ │ ├── convolution_packnto1_fp16s.h │ │ │ ├── convolution_riscv.cpp │ │ │ ├── convolution_riscv.h │ │ │ ├── convolution_riscv_zfh.cpp │ │ │ ├── convolution_sgemm.h │ │ │ ├── convolution_sgemm_fp16s.h │ │ │ ├── convolution_sgemm_pack1ton.h │ │ │ ├── convolution_sgemm_pack1ton_fp16s.h │ │ │ ├── convolution_sgemm_packn.h │ │ │ ├── convolution_sgemm_packn_fp16s.h │ │ │ ├── convolution_sgemm_packnto1.h │ │ │ ├── convolution_sgemm_packnto1_fp16s.h │ │ │ ├── convolution_winograd_dot.h │ │ │ ├── convolution_winograd_dot_packn.h │ │ │ ├── convolution_winograd_dot_packn_fp16s.h │ │ │ ├── convolution_winograd_transform.h │ │ │ ├── 
convolution_winograd_transform_packn.h │ │ │ ├── convolution_winograd_transform_packn_fp16s.h │ │ │ ├── convolutiondepthwise_3x3.h │ │ │ ├── convolutiondepthwise_3x3_packn.h │ │ │ ├── convolutiondepthwise_3x3_packn_fp16s.h │ │ │ ├── convolutiondepthwise_5x5_packn.h │ │ │ ├── convolutiondepthwise_5x5_packn_fp16s.h │ │ │ ├── convolutiondepthwise_riscv.cpp │ │ │ ├── convolutiondepthwise_riscv.h │ │ │ ├── convolutiondepthwise_riscv_zfh.cpp │ │ │ ├── crop_riscv.cpp │ │ │ ├── crop_riscv.h │ │ │ ├── deconvolution_fp16s.h │ │ │ ├── deconvolution_pack1ton.h │ │ │ ├── deconvolution_pack1ton_fp16s.h │ │ │ ├── deconvolution_packn.h │ │ │ ├── deconvolution_packn_fp16s.h │ │ │ ├── deconvolution_packnto1.h │ │ │ ├── deconvolution_packnto1_fp16s.h │ │ │ ├── deconvolution_riscv.cpp │ │ │ ├── deconvolution_riscv.h │ │ │ ├── deconvolution_riscv_zfh.cpp │ │ │ ├── deconvolutiondepthwise_riscv.cpp │ │ │ ├── deconvolutiondepthwise_riscv.h │ │ │ ├── deconvolutiondepthwise_riscv_zfh.cpp │ │ │ ├── deformableconv2d_pack1ton.h │ │ │ ├── deformableconv2d_packn.h │ │ │ ├── deformableconv2d_packnto1.h │ │ │ ├── deformableconv2d_riscv.cpp │ │ │ ├── deformableconv2d_riscv.h │ │ │ ├── dropout_riscv.cpp │ │ │ ├── dropout_riscv.h │ │ │ ├── eltwise_riscv.cpp │ │ │ ├── eltwise_riscv.h │ │ │ ├── eltwise_riscv_zfh.cpp │ │ │ ├── flatten_riscv.cpp │ │ │ ├── flatten_riscv.h │ │ │ ├── gelu_riscv.cpp │ │ │ ├── gelu_riscv.h │ │ │ ├── gemm_bf16s_fp16s.h │ │ │ ├── gemm_fp16s.h │ │ │ ├── gemm_riscv.cpp │ │ │ ├── gemm_riscv.h │ │ │ ├── gemm_riscv_zfh.cpp │ │ │ ├── gru_riscv.cpp │ │ │ ├── gru_riscv.h │ │ │ ├── gru_riscv_zfh.cpp │ │ │ ├── hardsigmoid_riscv.cpp │ │ │ ├── hardsigmoid_riscv.h │ │ │ ├── hardsigmoid_riscv_zfh.cpp │ │ │ ├── hardswish_riscv.cpp │ │ │ ├── hardswish_riscv.h │ │ │ ├── hardswish_riscv_zfh.cpp │ │ │ ├── innerproduct_riscv.cpp │ │ │ ├── innerproduct_riscv.h │ │ │ ├── innerproduct_riscv_zfh.cpp │ │ │ ├── instancenorm_riscv.cpp │ │ │ ├── instancenorm_riscv.h │ │ │ ├── instancenorm_riscv_zfh.cpp │ 
│ │ ├── interp_bicubic.h │ │ │ ├── interp_bicubic_fp16s.h │ │ │ ├── interp_bicubic_packn.h │ │ │ ├── interp_bicubic_packn_fp16s.h │ │ │ ├── interp_bilinear.h │ │ │ ├── interp_bilinear_fp16s.h │ │ │ ├── interp_bilinear_packn.h │ │ │ ├── interp_bilinear_packn_fp16s.h │ │ │ ├── interp_riscv.cpp │ │ │ ├── interp_riscv.h │ │ │ ├── interp_riscv_zfh.cpp │ │ │ ├── layernorm_riscv.cpp │ │ │ ├── layernorm_riscv.h │ │ │ ├── layernorm_riscv_zfh.cpp │ │ │ ├── mish_riscv.cpp │ │ │ ├── mish_riscv.h │ │ │ ├── mish_riscv_zfh.cpp │ │ │ ├── packing_riscv.cpp │ │ │ ├── packing_riscv.h │ │ │ ├── padding_packn.h │ │ │ ├── padding_riscv.cpp │ │ │ ├── padding_riscv.h │ │ │ ├── pooling_riscv.cpp │ │ │ ├── pooling_riscv.h │ │ │ ├── pooling_riscv_zfh.cpp │ │ │ ├── prelu_riscv.cpp │ │ │ ├── prelu_riscv.h │ │ │ ├── prelu_riscv_zfh.cpp │ │ │ ├── relu_riscv.cpp │ │ │ ├── relu_riscv.h │ │ │ ├── relu_riscv_zfh.cpp │ │ │ ├── riscv_activation.h │ │ │ ├── riscv_usability.h │ │ │ ├── rvv_mathfun.h │ │ │ ├── rvv_mathfun_fp16s.h │ │ │ ├── selu_riscv.cpp │ │ │ ├── selu_riscv.h │ │ │ ├── shufflechannel_riscv.cpp │ │ │ ├── shufflechannel_riscv.h │ │ │ ├── sigmoid_riscv.cpp │ │ │ ├── sigmoid_riscv.h │ │ │ ├── sigmoid_riscv_zfh.cpp │ │ │ ├── softmax_riscv.cpp │ │ │ ├── softmax_riscv.h │ │ │ ├── swish_riscv.cpp │ │ │ ├── swish_riscv.h │ │ │ ├── swish_riscv_zfh.cpp │ │ │ ├── tanh_riscv.cpp │ │ │ ├── tanh_riscv.h │ │ │ ├── tanh_riscv_zfh.cpp │ │ │ ├── unaryop_riscv.cpp │ │ │ ├── unaryop_riscv.h │ │ │ └── unaryop_riscv_zfh.cpp │ │ ├── rmsnorm.cpp │ │ ├── rmsnorm.h │ │ ├── rnn.cpp │ │ ├── rnn.h │ │ ├── roialign.cpp │ │ ├── roialign.h │ │ ├── roipooling.cpp │ │ ├── roipooling.h │ │ ├── rotaryembed.cpp │ │ ├── rotaryembed.h │ │ ├── scale.cpp │ │ ├── scale.h │ │ ├── sdpa.cpp │ │ ├── sdpa.h │ │ ├── selu.cpp │ │ ├── selu.h │ │ ├── shrink.cpp │ │ ├── shrink.h │ │ ├── shufflechannel.cpp │ │ ├── shufflechannel.h │ │ ├── sigmoid.cpp │ │ ├── sigmoid.h │ │ ├── slice.cpp │ │ ├── slice.h │ │ ├── softmax.cpp │ │ ├── softmax.h 
│ │ ├── softplus.cpp │ │ ├── softplus.h │ │ ├── spectrogram.cpp │ │ ├── spectrogram.h │ │ ├── split.cpp │ │ ├── split.h │ │ ├── spp.cpp │ │ ├── spp.h │ │ ├── squeeze.cpp │ │ ├── squeeze.h │ │ ├── statisticspooling.cpp │ │ ├── statisticspooling.h │ │ ├── swish.cpp │ │ ├── swish.h │ │ ├── tanh.cpp │ │ ├── tanh.h │ │ ├── threshold.cpp │ │ ├── threshold.h │ │ ├── tile.cpp │ │ ├── tile.h │ │ ├── unaryop.cpp │ │ ├── unaryop.h │ │ ├── unfold.cpp │ │ ├── unfold.h │ │ ├── vulkan/ │ │ │ ├── absval_vulkan.cpp │ │ │ ├── absval_vulkan.h │ │ │ ├── batchnorm_vulkan.cpp │ │ │ ├── batchnorm_vulkan.h │ │ │ ├── binaryop_vulkan.cpp │ │ │ ├── binaryop_vulkan.h │ │ │ ├── cast_vulkan.cpp │ │ │ ├── cast_vulkan.h │ │ │ ├── celu_vulkan.cpp │ │ │ ├── celu_vulkan.h │ │ │ ├── clip_vulkan.cpp │ │ │ ├── clip_vulkan.h │ │ │ ├── concat_vulkan.cpp │ │ │ ├── concat_vulkan.h │ │ │ ├── convolution1d_vulkan.cpp │ │ │ ├── convolution1d_vulkan.h │ │ │ ├── convolution_vulkan.cpp │ │ │ ├── convolution_vulkan.h │ │ │ ├── convolutiondepthwise_vulkan.cpp │ │ │ ├── convolutiondepthwise_vulkan.h │ │ │ ├── crop_vulkan.cpp │ │ │ ├── crop_vulkan.h │ │ │ ├── deconvolution_vulkan.cpp │ │ │ ├── deconvolution_vulkan.h │ │ │ ├── deconvolutiondepthwise_vulkan.cpp │ │ │ ├── deconvolutiondepthwise_vulkan.h │ │ │ ├── deepcopy_vulkan.cpp │ │ │ ├── deepcopy_vulkan.h │ │ │ ├── dequantize_vulkan.cpp │ │ │ ├── dequantize_vulkan.h │ │ │ ├── dropout_vulkan.cpp │ │ │ ├── dropout_vulkan.h │ │ │ ├── eltwise_vulkan.cpp │ │ │ ├── eltwise_vulkan.h │ │ │ ├── elu_vulkan.cpp │ │ │ ├── elu_vulkan.h │ │ │ ├── erf_vulkan.cpp │ │ │ ├── erf_vulkan.h │ │ │ ├── flatten_vulkan.cpp │ │ │ ├── flatten_vulkan.h │ │ │ ├── gelu_vulkan.cpp │ │ │ ├── gelu_vulkan.h │ │ │ ├── gemm_vulkan.cpp │ │ │ ├── gemm_vulkan.h │ │ │ ├── groupnorm_vulkan.cpp │ │ │ ├── groupnorm_vulkan.h │ │ │ ├── hardsigmoid_vulkan.cpp │ │ │ ├── hardsigmoid_vulkan.h │ │ │ ├── hardswish_vulkan.cpp │ │ │ ├── hardswish_vulkan.h │ │ │ ├── innerproduct_vulkan.cpp │ │ │ ├── 
innerproduct_vulkan.h │ │ │ ├── instancenorm_vulkan.cpp │ │ │ ├── instancenorm_vulkan.h │ │ │ ├── interp_vulkan.cpp │ │ │ ├── interp_vulkan.h │ │ │ ├── layernorm_vulkan.cpp │ │ │ ├── layernorm_vulkan.h │ │ │ ├── lrn_vulkan.cpp │ │ │ ├── lrn_vulkan.h │ │ │ ├── memorydata_vulkan.cpp │ │ │ ├── memorydata_vulkan.h │ │ │ ├── mish_vulkan.cpp │ │ │ ├── mish_vulkan.h │ │ │ ├── multiheadattention_vulkan.cpp │ │ │ ├── multiheadattention_vulkan.h │ │ │ ├── noop_vulkan.cpp │ │ │ ├── noop_vulkan.h │ │ │ ├── normalize_vulkan.cpp │ │ │ ├── normalize_vulkan.h │ │ │ ├── packing_vulkan.cpp │ │ │ ├── packing_vulkan.h │ │ │ ├── padding_vulkan.cpp │ │ │ ├── padding_vulkan.h │ │ │ ├── permute_vulkan.cpp │ │ │ ├── permute_vulkan.h │ │ │ ├── pixelshuffle_vulkan.cpp │ │ │ ├── pixelshuffle_vulkan.h │ │ │ ├── pooling_vulkan.cpp │ │ │ ├── pooling_vulkan.h │ │ │ ├── prelu_vulkan.cpp │ │ │ ├── prelu_vulkan.h │ │ │ ├── priorbox_vulkan.cpp │ │ │ ├── priorbox_vulkan.h │ │ │ ├── quantize_vulkan.cpp │ │ │ ├── quantize_vulkan.h │ │ │ ├── reduction_vulkan.cpp │ │ │ ├── reduction_vulkan.h │ │ │ ├── relu_vulkan.cpp │ │ │ ├── relu_vulkan.h │ │ │ ├── reorg_vulkan.cpp │ │ │ ├── reorg_vulkan.h │ │ │ ├── requantize_vulkan.cpp │ │ │ ├── requantize_vulkan.h │ │ │ ├── reshape_vulkan.cpp │ │ │ ├── reshape_vulkan.h │ │ │ ├── rmsnorm_vulkan.cpp │ │ │ ├── rmsnorm_vulkan.h │ │ │ ├── rotaryembed_vulkan.cpp │ │ │ ├── rotaryembed_vulkan.h │ │ │ ├── scale_vulkan.cpp │ │ │ ├── scale_vulkan.h │ │ │ ├── sdpa_vulkan.cpp │ │ │ ├── sdpa_vulkan.h │ │ │ ├── selu_vulkan.cpp │ │ │ ├── selu_vulkan.h │ │ │ ├── shader/ │ │ │ │ ├── .clang-format │ │ │ │ ├── absval.comp │ │ │ │ ├── batchnorm.comp │ │ │ │ ├── batchnorm_pack4.comp │ │ │ │ ├── binaryop.comp │ │ │ │ ├── binaryop_broadcast.comp │ │ │ │ ├── binaryop_broadcast_pack1to4.comp │ │ │ │ ├── binaryop_broadcast_pack4.comp │ │ │ │ ├── binaryop_pack4.comp │ │ │ │ ├── cast_fp16_to_fp32.comp │ │ │ │ ├── cast_fp16_to_fp32_pack4.comp │ │ │ │ ├── cast_fp32_to_fp16.comp │ │ │ │ ├── 
cast_fp32_to_fp16_pack4.comp │ │ │ │ ├── celu.comp │ │ │ │ ├── clip.comp │ │ │ │ ├── concat.comp │ │ │ │ ├── concat_pack4.comp │ │ │ │ ├── concat_pack4to1.comp │ │ │ │ ├── convolution1d_packed.comp │ │ │ │ ├── convolution_1x1s1d1_cm.comp │ │ │ │ ├── convolution_3x3s1d1_winograd23_transform_input.comp │ │ │ │ ├── convolution_3x3s1d1_winograd23_transform_output.comp │ │ │ │ ├── convolution_3x3s1d1_winograd43_transform_input.comp │ │ │ │ ├── convolution_3x3s1d1_winograd43_transform_output.comp │ │ │ │ ├── convolution_3x3s1d1_winograd_gemm.comp │ │ │ │ ├── convolution_gemm_cm.comp │ │ │ │ ├── convolution_pack1to4_3x3s1d1_winograd_gemm.comp │ │ │ │ ├── convolution_pack4_3x3s1d1_winograd23_transform_input.comp │ │ │ │ ├── convolution_pack4_3x3s1d1_winograd23_transform_output.comp │ │ │ │ ├── convolution_pack4_3x3s1d1_winograd43_transform_input.comp │ │ │ │ ├── convolution_pack4_3x3s1d1_winograd43_transform_output.comp │ │ │ │ ├── convolution_pack4_3x3s1d1_winograd_gemm.comp │ │ │ │ ├── convolution_pack4to1_3x3s1d1_winograd_gemm.comp │ │ │ │ ├── convolution_packed.comp │ │ │ │ ├── convolution_packed_1x1s1d1.comp │ │ │ │ ├── convolution_packed_gemm.comp │ │ │ │ ├── convolution_winograd_gemm_cm.comp │ │ │ │ ├── convolutiondepthwise.comp │ │ │ │ ├── convolutiondepthwise_group.comp │ │ │ │ ├── convolutiondepthwise_group_pack1to4.comp │ │ │ │ ├── convolutiondepthwise_group_pack4.comp │ │ │ │ ├── convolutiondepthwise_group_pack4to1.comp │ │ │ │ ├── convolutiondepthwise_pack4.comp │ │ │ │ ├── crop.comp │ │ │ │ ├── crop_pack1to4.comp │ │ │ │ ├── crop_pack4.comp │ │ │ │ ├── crop_pack4to1.comp │ │ │ │ ├── deconvolution_col2im.comp │ │ │ │ ├── deconvolution_gemm_cm.comp │ │ │ │ ├── deconvolution_gemm_packed.comp │ │ │ │ ├── deconvolution_pack4_col2im.comp │ │ │ │ ├── deconvolution_packed.comp │ │ │ │ ├── deconvolutiondepthwise.comp │ │ │ │ ├── deconvolutiondepthwise_group.comp │ │ │ │ ├── deconvolutiondepthwise_group_pack1to4.comp │ │ │ │ ├── deconvolutiondepthwise_group_pack4.comp 
│ │ │ │ ├── deconvolutiondepthwise_group_pack4to1.comp │ │ │ │ ├── deconvolutiondepthwise_pack4.comp │ │ │ │ ├── deepcopy.comp │ │ │ │ ├── deepcopy_pack4.comp │ │ │ │ ├── dequantize.comp │ │ │ │ ├── dequantize_pack4.comp │ │ │ │ ├── dropout.comp │ │ │ │ ├── eltwise.comp │ │ │ │ ├── elu.comp │ │ │ │ ├── erf.comp │ │ │ │ ├── flatten.comp │ │ │ │ ├── flatten_pack1to4.comp │ │ │ │ ├── flatten_pack4.comp │ │ │ │ ├── gelu.comp │ │ │ │ ├── gemm.comp │ │ │ │ ├── gemm_cm.comp │ │ │ │ ├── gemm_sg.comp │ │ │ │ ├── groupnorm_coeffs.comp │ │ │ │ ├── groupnorm_coeffs_pack4.comp │ │ │ │ ├── groupnorm_norm.comp │ │ │ │ ├── groupnorm_norm_pack4.comp │ │ │ │ ├── groupnorm_reduce_mean.comp │ │ │ │ ├── groupnorm_reduce_mean_pack4.comp │ │ │ │ ├── groupnorm_reduce_sum4_fp16_to_fp32.comp │ │ │ │ ├── groupnorm_reduce_sum4_fp16_to_fp32_pack4.comp │ │ │ │ ├── groupnorm_reduce_sum4_fp32.comp │ │ │ │ ├── groupnorm_reduce_sum4_fp32_pack4.comp │ │ │ │ ├── groupnorm_sub_mean_square.comp │ │ │ │ ├── groupnorm_sub_mean_square_pack4.comp │ │ │ │ ├── hardsigmoid.comp │ │ │ │ ├── hardswish.comp │ │ │ │ ├── innerproduct.comp │ │ │ │ ├── innerproduct_gemm.comp │ │ │ │ ├── innerproduct_gemm_wp1to4.comp │ │ │ │ ├── innerproduct_gemm_wp4.comp │ │ │ │ ├── innerproduct_gemm_wp4to1.comp │ │ │ │ ├── innerproduct_pack1to4.comp │ │ │ │ ├── innerproduct_pack4.comp │ │ │ │ ├── innerproduct_pack4to1.comp │ │ │ │ ├── innerproduct_reduce_sum8.comp │ │ │ │ ├── innerproduct_reduce_sum8_pack4.comp │ │ │ │ ├── innerproduct_sum8.comp │ │ │ │ ├── innerproduct_sum8_pack1to4.comp │ │ │ │ ├── innerproduct_sum8_pack4.comp │ │ │ │ ├── innerproduct_sum8_pack4to1.comp │ │ │ │ ├── instancenorm_coeffs.comp │ │ │ │ ├── instancenorm_coeffs_pack4.comp │ │ │ │ ├── instancenorm_norm.comp │ │ │ │ ├── instancenorm_norm_pack4.comp │ │ │ │ ├── instancenorm_reduce_mean.comp │ │ │ │ ├── instancenorm_reduce_mean_pack4.comp │ │ │ │ ├── instancenorm_reduce_sum4_fp16_to_fp32.comp │ │ │ │ ├── instancenorm_reduce_sum4_fp16_to_fp32_pack4.comp │ │ 
│ │ ├── instancenorm_reduce_sum4_fp32.comp │ │ │ │ ├── instancenorm_reduce_sum4_fp32_pack4.comp │ │ │ │ ├── instancenorm_sub_mean_square.comp │ │ │ │ ├── instancenorm_sub_mean_square_pack4.comp │ │ │ │ ├── interp.comp │ │ │ │ ├── interp_bicubic.comp │ │ │ │ ├── interp_bicubic_coeffs.comp │ │ │ │ ├── interp_bicubic_pack4.comp │ │ │ │ ├── interp_pack4.comp │ │ │ │ ├── layernorm_coeffs.comp │ │ │ │ ├── layernorm_coeffs_pack4.comp │ │ │ │ ├── layernorm_norm.comp │ │ │ │ ├── layernorm_norm_pack4.comp │ │ │ │ ├── layernorm_reduce_mean.comp │ │ │ │ ├── layernorm_reduce_mean_pack4.comp │ │ │ │ ├── layernorm_reduce_sum4_fp16_to_fp32.comp │ │ │ │ ├── layernorm_reduce_sum4_fp16_to_fp32_pack4.comp │ │ │ │ ├── layernorm_reduce_sum4_fp32.comp │ │ │ │ ├── layernorm_reduce_sum4_fp32_pack4.comp │ │ │ │ ├── layernorm_sub_mean_square.comp │ │ │ │ ├── layernorm_sub_mean_square_pack4.comp │ │ │ │ ├── lrn_norm.comp │ │ │ │ ├── lrn_norm_across_channel_pack4.comp │ │ │ │ ├── lrn_norm_within_channel_pack4.comp │ │ │ │ ├── lrn_square_pad.comp │ │ │ │ ├── lrn_square_pad_across_channel_pack4.comp │ │ │ │ ├── lrn_square_pad_within_channel_pack4.comp │ │ │ │ ├── mish.comp │ │ │ │ ├── multiheadattention_qk_cross.comp │ │ │ │ ├── multiheadattention_qk_cross_pack1to4.comp │ │ │ │ ├── multiheadattention_qk_cross_pack4.comp │ │ │ │ ├── multiheadattention_qk_cross_pack4to1.comp │ │ │ │ ├── multiheadattention_qkv_cross.comp │ │ │ │ ├── multiheadattention_qkv_cross_pack1to4.comp │ │ │ │ ├── multiheadattention_qkv_cross_pack4.comp │ │ │ │ ├── multiheadattention_qkv_cross_pack4to1.comp │ │ │ │ ├── normalize_coeffs.comp │ │ │ │ ├── normalize_coeffs_pack4.comp │ │ │ │ ├── normalize_norm.comp │ │ │ │ ├── normalize_norm_pack4.comp │ │ │ │ ├── normalize_reduce_sum4_fp16_to_fp32.comp │ │ │ │ ├── normalize_reduce_sum4_fp16_to_fp32_pack4.comp │ │ │ │ ├── normalize_reduce_sum4_fp32.comp │ │ │ │ ├── normalize_reduce_sum4_fp32_pack4.comp │ │ │ │ ├── packing.comp │ │ │ │ ├── packing_int8.comp │ │ │ │ ├── 
packing_pack1to4.comp │ │ │ │ ├── packing_pack1to4_int8.comp │ │ │ │ ├── packing_pack4to1.comp │ │ │ │ ├── packing_pack4to1_int8.comp │ │ │ │ ├── padding.comp │ │ │ │ ├── padding_3d.comp │ │ │ │ ├── padding_3d_pack4.comp │ │ │ │ ├── padding_pack1to4.comp │ │ │ │ ├── padding_pack4.comp │ │ │ │ ├── padding_pack4to1.comp │ │ │ │ ├── permute.comp │ │ │ │ ├── permute_pack1to4.comp │ │ │ │ ├── permute_pack4.comp │ │ │ │ ├── permute_pack4to1.comp │ │ │ │ ├── pixelshuffle.comp │ │ │ │ ├── pixelshuffle_pack4.comp │ │ │ │ ├── pixelshuffle_pack4to1.comp │ │ │ │ ├── pooling.comp │ │ │ │ ├── pooling_adaptive.comp │ │ │ │ ├── pooling_adaptive_pack4.comp │ │ │ │ ├── pooling_global_reduce_max.comp │ │ │ │ ├── pooling_global_reduce_max_first.comp │ │ │ │ ├── pooling_global_reduce_max_first_pack4.comp │ │ │ │ ├── pooling_global_reduce_max_last.comp │ │ │ │ ├── pooling_global_reduce_max_last_pack4.comp │ │ │ │ ├── pooling_global_reduce_max_pack4.comp │ │ │ │ ├── pooling_global_reduce_sum.comp │ │ │ │ ├── pooling_global_reduce_sum_first.comp │ │ │ │ ├── pooling_global_reduce_sum_first_pack4.comp │ │ │ │ ├── pooling_global_reduce_sum_last.comp │ │ │ │ ├── pooling_global_reduce_sum_last_pack4.comp │ │ │ │ ├── pooling_global_reduce_sum_pack4.comp │ │ │ │ ├── pooling_pack4.comp │ │ │ │ ├── prelu.comp │ │ │ │ ├── prelu_pack4.comp │ │ │ │ ├── priorbox.comp │ │ │ │ ├── priorbox_mxnet.comp │ │ │ │ ├── quantize.comp │ │ │ │ ├── quantize_pack4.comp │ │ │ │ ├── reduction.comp │ │ │ │ ├── relu.comp │ │ │ │ ├── reorg.comp │ │ │ │ ├── reorg_pack1to4.comp │ │ │ │ ├── reorg_pack4.comp │ │ │ │ ├── requantize.comp │ │ │ │ ├── requantize_pack4.comp │ │ │ │ ├── reshape.comp │ │ │ │ ├── reshape_pack1to4.comp │ │ │ │ ├── reshape_pack4.comp │ │ │ │ ├── reshape_pack4to1.comp │ │ │ │ ├── rmsnorm_coeffs.comp │ │ │ │ ├── rmsnorm_coeffs_pack4.comp │ │ │ │ ├── rmsnorm_norm.comp │ │ │ │ ├── rmsnorm_norm_pack4.comp │ │ │ │ ├── rmsnorm_square.comp │ │ │ │ ├── rmsnorm_square_pack4.comp │ │ │ │ ├── rotaryembed.comp │ 
│ │ │ ├── rotaryembed_pack4.comp │ │ │ │ ├── scale.comp │ │ │ │ ├── scale_pack4.comp │ │ │ │ ├── sdpa_cross.comp │ │ │ │ ├── sdpa_cross_cm.comp │ │ │ │ ├── sdpa_fa.comp │ │ │ │ ├── sdpa_fa_cm.comp │ │ │ │ ├── selu.comp │ │ │ │ ├── shrink.comp │ │ │ │ ├── shufflechannel.comp │ │ │ │ ├── shufflechannel_pack4.comp │ │ │ │ ├── sigmoid.comp │ │ │ │ ├── slice.comp │ │ │ │ ├── slice_pack1to4.comp │ │ │ │ ├── slice_pack4.comp │ │ │ │ ├── softmax_div_sum.comp │ │ │ │ ├── softmax_div_sum_pack4.comp │ │ │ │ ├── softmax_exp_sub_max.comp │ │ │ │ ├── softmax_exp_sub_max_pack4.comp │ │ │ │ ├── softmax_reduce_max.comp │ │ │ │ ├── softmax_reduce_max_pack4.comp │ │ │ │ ├── softmax_reduce_sum.comp │ │ │ │ ├── softmax_reduce_sum_pack4.comp │ │ │ │ ├── softplus.comp │ │ │ │ ├── swish.comp │ │ │ │ ├── tanh.comp │ │ │ │ ├── unaryop.comp │ │ │ │ ├── unfold_im2col.comp │ │ │ │ ├── unfold_im2col_pack1to4.comp │ │ │ │ ├── unfold_im2col_pack4.comp │ │ │ │ ├── unfold_im2col_pack4to1.comp │ │ │ │ └── vulkan_activation.comp │ │ │ ├── shrink_vulkan.cpp │ │ │ ├── shrink_vulkan.h │ │ │ ├── shufflechannel_vulkan.cpp │ │ │ ├── shufflechannel_vulkan.h │ │ │ ├── sigmoid_vulkan.cpp │ │ │ ├── sigmoid_vulkan.h │ │ │ ├── slice_vulkan.cpp │ │ │ ├── slice_vulkan.h │ │ │ ├── softmax_vulkan.cpp │ │ │ ├── softmax_vulkan.h │ │ │ ├── softplus_vulkan.cpp │ │ │ ├── softplus_vulkan.h │ │ │ ├── split_vulkan.cpp │ │ │ ├── split_vulkan.h │ │ │ ├── swish_vulkan.cpp │ │ │ ├── swish_vulkan.h │ │ │ ├── tanh_vulkan.cpp │ │ │ ├── tanh_vulkan.h │ │ │ ├── unaryop_vulkan.cpp │ │ │ ├── unaryop_vulkan.h │ │ │ ├── unfold_vulkan.cpp │ │ │ └── unfold_vulkan.h │ │ ├── x86/ │ │ │ ├── absval_x86.cpp │ │ │ ├── absval_x86.h │ │ │ ├── avx512_mathfun.h │ │ │ ├── avx_mathfun.h │ │ │ ├── batchnorm_bf16s.h │ │ │ ├── batchnorm_x86.cpp │ │ │ ├── batchnorm_x86.h │ │ │ ├── batchnorm_x86_avx512bf16.cpp │ │ │ ├── bias_x86.cpp │ │ │ ├── bias_x86.h │ │ │ ├── binaryop_bf16s.h │ │ │ ├── binaryop_functor.h │ │ │ ├── binaryop_x86.cpp │ │ │ ├── 
binaryop_x86.h │ │ │ ├── binaryop_x86_avx512bf16.cpp │ │ │ ├── bnll_x86.cpp │ │ │ ├── bnll_x86.h │ │ │ ├── cast_bf16.h │ │ │ ├── cast_fp16.h │ │ │ ├── cast_x86.cpp │ │ │ ├── cast_x86.h │ │ │ ├── cast_x86_avx2.cpp │ │ │ ├── cast_x86_avx512bf16.cpp │ │ │ ├── cast_x86_f16c.cpp │ │ │ ├── clip_bf16s.h │ │ │ ├── clip_x86.cpp │ │ │ ├── clip_x86.h │ │ │ ├── clip_x86_avx512bf16.cpp │ │ │ ├── concat_x86.cpp │ │ │ ├── concat_x86.h │ │ │ ├── convolution1d_packed.h │ │ │ ├── convolution1d_x86.cpp │ │ │ ├── convolution1d_x86.h │ │ │ ├── convolution_1x1.h │ │ │ ├── convolution_2x2_pack8.h │ │ │ ├── convolution_3x3.h │ │ │ ├── convolution_3x3_int8.h │ │ │ ├── convolution_3x3_pack16to1.h │ │ │ ├── convolution_3x3_pack1to4.h │ │ │ ├── convolution_3x3_pack1to8.h │ │ │ ├── convolution_3x3_pack8.h │ │ │ ├── convolution_3x3_pack8to1.h │ │ │ ├── convolution_3x3_winograd.h │ │ │ ├── convolution_3x3_winograd_int8.h │ │ │ ├── convolution_5x5.h │ │ │ ├── convolution_im2col_gemm.h │ │ │ ├── convolution_im2col_gemm_int8.h │ │ │ ├── convolution_packed.h │ │ │ ├── convolution_packed_int8.h │ │ │ ├── convolution_x86.cpp │ │ │ ├── convolution_x86.h │ │ │ ├── convolution_x86_avx2.cpp │ │ │ ├── convolution_x86_avx512vnni.cpp │ │ │ ├── convolution_x86_avxvnni.cpp │ │ │ ├── convolution_x86_avxvnniint8.cpp │ │ │ ├── convolution_x86_xop.cpp │ │ │ ├── convolutiondepthwise_3x3.h │ │ │ ├── convolutiondepthwise_3x3_int8.h │ │ │ ├── convolutiondepthwise_3x3_pack16.h │ │ │ ├── convolutiondepthwise_3x3_pack4.h │ │ │ ├── convolutiondepthwise_3x3_pack8.h │ │ │ ├── convolutiondepthwise_5x5_pack16.h │ │ │ ├── convolutiondepthwise_5x5_pack4.h │ │ │ ├── convolutiondepthwise_5x5_pack8.h │ │ │ ├── convolutiondepthwise_x86.cpp │ │ │ ├── convolutiondepthwise_x86.h │ │ │ ├── crop_x86.cpp │ │ │ ├── crop_x86.h │ │ │ ├── deconvolution_packed.h │ │ │ ├── deconvolution_x86.cpp │ │ │ ├── deconvolution_x86.h │ │ │ ├── deconvolutiondepthwise_x86.cpp │ │ │ ├── deconvolutiondepthwise_x86.h │ │ │ ├── deformableconv2d_packed.h │ │ │ 
├── deformableconv2d_x86.cpp │ │ │ ├── deformableconv2d_x86.h │ │ │ ├── dequantize_x86.cpp │ │ │ ├── dequantize_x86.h │ │ │ ├── dropout_x86.cpp │ │ │ ├── dropout_x86.h │ │ │ ├── eltwise_x86.cpp │ │ │ ├── eltwise_x86.h │ │ │ ├── elu_x86.cpp │ │ │ ├── elu_x86.h │ │ │ ├── erf_x86.cpp │ │ │ ├── erf_x86.h │ │ │ ├── flatten_x86.cpp │ │ │ ├── flatten_x86.h │ │ │ ├── gelu_x86.cpp │ │ │ ├── gelu_x86.h │ │ │ ├── gemm_bf16s.h │ │ │ ├── gemm_int8.h │ │ │ ├── gemm_x86.cpp │ │ │ ├── gemm_x86.h │ │ │ ├── gemm_x86_avx2.cpp │ │ │ ├── gemm_x86_avx512vnni.cpp │ │ │ ├── gemm_x86_avxvnni.cpp │ │ │ ├── gemm_x86_avxvnniint8.cpp │ │ │ ├── gemm_x86_xop.cpp │ │ │ ├── gridsample_bicubic_apply_interpolation.h │ │ │ ├── gridsample_bicubic_compute_blob.h │ │ │ ├── gridsample_bilinear_apply_interpolation.h │ │ │ ├── gridsample_bilinear_compute_blob.h │ │ │ ├── gridsample_compute_blob.h │ │ │ ├── gridsample_nearest_apply_interpolation.h │ │ │ ├── gridsample_nearest_compute_blob.h │ │ │ ├── gridsample_x86.cpp │ │ │ ├── gridsample_x86.h │ │ │ ├── groupnorm_bf16s.h │ │ │ ├── groupnorm_x86.cpp │ │ │ ├── groupnorm_x86.h │ │ │ ├── groupnorm_x86_avx512bf16.cpp │ │ │ ├── hardsigmoid_x86.cpp │ │ │ ├── hardsigmoid_x86.h │ │ │ ├── hardswish_x86.cpp │ │ │ ├── hardswish_x86.h │ │ │ ├── innerproduct_fp.h │ │ │ ├── innerproduct_gemm_fp.h │ │ │ ├── innerproduct_x86.cpp │ │ │ ├── innerproduct_x86.h │ │ │ ├── innerproduct_x86_f16c.cpp │ │ │ ├── instancenorm_bf16s.h │ │ │ ├── instancenorm_x86.cpp │ │ │ ├── instancenorm_x86.h │ │ │ ├── instancenorm_x86_avx512bf16.cpp │ │ │ ├── interp_bicubic.h │ │ │ ├── interp_bicubic_pack16.h │ │ │ ├── interp_bicubic_pack4.h │ │ │ ├── interp_bicubic_pack8.h │ │ │ ├── interp_bilinear.h │ │ │ ├── interp_bilinear_pack16.h │ │ │ ├── interp_bilinear_pack4.h │ │ │ ├── interp_bilinear_pack8.h │ │ │ ├── interp_x86.cpp │ │ │ ├── interp_x86.h │ │ │ ├── interp_x86_avx2.cpp │ │ │ ├── layernorm_bf16s.h │ │ │ ├── layernorm_x86.cpp │ │ │ ├── layernorm_x86.h │ │ │ ├── layernorm_x86_avx512bf16.cpp 
│ │ │ ├── lrn_x86.cpp │ │ │ ├── lrn_x86.h │ │ │ ├── lstm_int8.h │ │ │ ├── lstm_x86.cpp │ │ │ ├── lstm_x86.h │ │ │ ├── lstm_x86_avx2.cpp │ │ │ ├── lstm_x86_avx512vnni.cpp │ │ │ ├── lstm_x86_avxvnni.cpp │ │ │ ├── lstm_x86_xop.cpp │ │ │ ├── matmul_x86.cpp │ │ │ ├── matmul_x86.h │ │ │ ├── mish_x86.cpp │ │ │ ├── mish_x86.h │ │ │ ├── multiheadattention_x86.cpp │ │ │ ├── multiheadattention_x86.h │ │ │ ├── packing_x86.cpp │ │ │ ├── packing_x86.h │ │ │ ├── padding_pack16.h │ │ │ ├── padding_pack16_bf16s_fp16s.h │ │ │ ├── padding_pack4.h │ │ │ ├── padding_pack4_bf16s_fp16s.h │ │ │ ├── padding_pack8.h │ │ │ ├── padding_pack8_bf16s_fp16s.h │ │ │ ├── padding_pack8_int8.h │ │ │ ├── padding_x86.cpp │ │ │ ├── padding_x86.h │ │ │ ├── pooling_2x2.h │ │ │ ├── pooling_2x2_pack16.h │ │ │ ├── pooling_2x2_pack4.h │ │ │ ├── pooling_2x2_pack8.h │ │ │ ├── pooling_3x3_pack16.h │ │ │ ├── pooling_3x3_pack4.h │ │ │ ├── pooling_3x3_pack8.h │ │ │ ├── pooling_x86.cpp │ │ │ ├── pooling_x86.h │ │ │ ├── prelu_bf16s.h │ │ │ ├── prelu_x86.cpp │ │ │ ├── prelu_x86.h │ │ │ ├── prelu_x86_avx512bf16.cpp │ │ │ ├── quantize_x86.cpp │ │ │ ├── quantize_x86.h │ │ │ ├── relu_bf16s.h │ │ │ ├── relu_x86.cpp │ │ │ ├── relu_x86.h │ │ │ ├── relu_x86_avx512bf16.cpp │ │ │ ├── requantize_x86.cpp │ │ │ ├── requantize_x86.h │ │ │ ├── reshape_x86.cpp │ │ │ ├── reshape_x86.h │ │ │ ├── rmsnorm_bf16s.h │ │ │ ├── rmsnorm_x86.cpp │ │ │ ├── rmsnorm_x86.h │ │ │ ├── rmsnorm_x86_avx512bf16.cpp │ │ │ ├── roialign_x86.cpp │ │ │ ├── roialign_x86.h │ │ │ ├── rotaryembed_x86.cpp │ │ │ ├── rotaryembed_x86.h │ │ │ ├── scale_bf16s.h │ │ │ ├── scale_x86.cpp │ │ │ ├── scale_x86.h │ │ │ ├── scale_x86_avx512bf16.cpp │ │ │ ├── sdpa_x86.cpp │ │ │ ├── sdpa_x86.h │ │ │ ├── selu_x86.cpp │ │ │ ├── selu_x86.h │ │ │ ├── shufflechannel_x86.cpp │ │ │ ├── shufflechannel_x86.h │ │ │ ├── sigmoid_bf16s.h │ │ │ ├── sigmoid_x86.cpp │ │ │ ├── sigmoid_x86.h │ │ │ ├── sigmoid_x86_avx512bf16.cpp │ │ │ ├── slice_x86.cpp │ │ │ ├── slice_x86.h │ │ │ ├── 
softmax_bf16s.h │ │ │ ├── softmax_x86.cpp │ │ │ ├── softmax_x86.h │ │ │ ├── softmax_x86_avx512bf16.cpp │ │ │ ├── sse_mathfun.h │ │ │ ├── swish_bf16s.h │ │ │ ├── swish_x86.cpp │ │ │ ├── swish_x86.h │ │ │ ├── swish_x86_avx512bf16.cpp │ │ │ ├── tanh_x86.cpp │ │ │ ├── tanh_x86.h │ │ │ ├── unaryop_bf16s.h │ │ │ ├── unaryop_functor.h │ │ │ ├── unaryop_x86.cpp │ │ │ ├── unaryop_x86.h │ │ │ ├── unaryop_x86_avx512bf16.cpp │ │ │ ├── x86_activation.h │ │ │ ├── x86_usability.h │ │ │ ├── yolov3detectionoutput_x86.cpp │ │ │ └── yolov3detectionoutput_x86.h │ │ ├── yolodetectionoutput.cpp │ │ ├── yolodetectionoutput.h │ │ ├── yolov3detectionoutput.cpp │ │ └── yolov3detectionoutput.h │ ├── layer.cpp │ ├── layer.h │ ├── layer_declaration.h.in │ ├── layer_registry.h.in │ ├── layer_shader_registry.h.in │ ├── layer_shader_spv_data.h.in │ ├── layer_shader_type.h │ ├── layer_shader_type_enum.h.in │ ├── layer_type.h │ ├── layer_type_enum.h.in │ ├── mat.cpp │ ├── mat.h │ ├── mat_pixel.cpp │ ├── mat_pixel_affine.cpp │ ├── mat_pixel_android.cpp │ ├── mat_pixel_drawing.cpp │ ├── mat_pixel_drawing_font.h │ ├── mat_pixel_resize.cpp │ ├── mat_pixel_rotate.cpp │ ├── modelbin.cpp │ ├── modelbin.h │ ├── ncnn.pc.in │ ├── net.cpp │ ├── net.h │ ├── option.cpp │ ├── option.h │ ├── paramdict.cpp │ ├── paramdict.h │ ├── pipeline.cpp │ ├── pipeline.h │ ├── pipelinecache.cpp │ ├── pipelinecache.h │ ├── platform.h.in │ ├── ruapu.h │ ├── simplemath.cpp │ ├── simplemath.h │ ├── simpleocv.cpp │ ├── simpleocv.h │ ├── simpleomp.cpp │ ├── simpleomp.h │ ├── simplestl.cpp │ ├── simplestl.h │ ├── simplevk.cpp │ ├── simplevk.h │ ├── simplevk.tbd │ ├── stb_image.h │ ├── stb_image_write.h │ └── vulkan_header_fix.h ├── tests/ │ ├── CMakeLists.txt │ ├── perf/ │ │ ├── CMakeLists.txt │ │ ├── perf_batchnorm.cpp │ │ ├── perf_binaryop.cpp │ │ ├── perf_concat.cpp │ │ ├── perf_convolution.cpp │ │ ├── perf_convolutiondepthwise.cpp │ │ ├── perf_deconvolution.cpp │ │ ├── perf_innerproduct.cpp │ │ ├── perf_pooling.cpp │ │ ├── 
perf_relu.cpp │ │ ├── perf_sigmoid.cpp │ │ ├── perf_softmax.cpp │ │ ├── perfutil.cpp │ │ └── perfutil.h │ ├── prng.h │ ├── test_absval.cpp │ ├── test_batchnorm.cpp │ ├── test_bias.cpp │ ├── test_binaryop.cpp │ ├── test_binaryop_1.cpp │ ├── test_binaryop_2.cpp │ ├── test_binaryop_3.cpp │ ├── test_binaryop_4.cpp │ ├── test_bnll.cpp │ ├── test_c_api.cpp │ ├── test_cast.cpp │ ├── test_celu.cpp │ ├── test_clip.cpp │ ├── test_command.cpp │ ├── test_concat.cpp │ ├── test_concat_oom.cpp │ ├── test_convolution.cpp │ ├── test_convolution1d.cpp │ ├── test_convolution3d.cpp │ ├── test_convolution_1.cpp │ ├── test_convolution_2.cpp │ ├── test_convolution_3.cpp │ ├── test_convolution_oom.cpp │ ├── test_convolutiondepthwise.cpp │ ├── test_convolutiondepthwise1d.cpp │ ├── test_convolutiondepthwise3d.cpp │ ├── test_convolutiondepthwise_1.cpp │ ├── test_copyto.cpp │ ├── test_copyto_1.cpp │ ├── test_cpu.cpp │ ├── test_crop.cpp │ ├── test_crop_1.cpp │ ├── test_crop_2.cpp │ ├── test_crop_3.cpp │ ├── test_crop_oom.cpp │ ├── test_cumulativesum.cpp │ ├── test_deconvolution.cpp │ ├── test_deconvolution1d.cpp │ ├── test_deconvolution3d.cpp │ ├── test_deconvolutiondepthwise.cpp │ ├── test_deconvolutiondepthwise1d.cpp │ ├── test_deconvolutiondepthwise3d.cpp │ ├── test_deconvolutiondepthwise_1.cpp │ ├── test_deepcopy.cpp │ ├── test_deformableconv2d.cpp │ ├── test_deformableconv2d_1.cpp │ ├── test_deformableconv2d_2.cpp │ ├── test_deformableconv2d_3.cpp │ ├── test_deformableconv2d_4.cpp │ ├── test_dequantize.cpp │ ├── test_diag.cpp │ ├── test_dropout.cpp │ ├── test_einsum.cpp │ ├── test_eltwise.cpp │ ├── test_elu.cpp │ ├── test_embed.cpp │ ├── test_erf.cpp │ ├── test_expanddims.cpp │ ├── test_expression.cpp │ ├── test_flatten.cpp │ ├── test_flip.cpp │ ├── test_fold.cpp │ ├── test_gelu.cpp │ ├── test_gemm_0.h │ ├── test_gemm_0a.cpp │ ├── test_gemm_0b.cpp │ ├── test_gemm_0c.cpp │ ├── test_gemm_0d.cpp │ ├── test_gemm_0e.cpp │ ├── test_gemm_0f.cpp │ ├── test_gemm_1.h │ ├── test_gemm_1a.cpp │ ├── 
test_gemm_1b.cpp │ ├── test_gemm_2.h │ ├── test_gemm_2a.cpp │ ├── test_gemm_2b.cpp │ ├── test_gemm_2c.cpp │ ├── test_gemm_2d.cpp │ ├── test_gemm_2e.cpp │ ├── test_gemm_3.cpp │ ├── test_gemm_4.cpp │ ├── test_gemm_nt.cpp │ ├── test_gemm_oom.cpp │ ├── test_glu.cpp │ ├── test_gridsample.cpp │ ├── test_groupnorm.cpp │ ├── test_gru.cpp │ ├── test_hardsigmoid.cpp │ ├── test_hardswish.cpp │ ├── test_innerproduct.cpp │ ├── test_instancenorm.cpp │ ├── test_interp.cpp │ ├── test_interp_1.cpp │ ├── test_inversespectrogram.cpp │ ├── test_layernorm.cpp │ ├── test_lrn.cpp │ ├── test_lstm.cpp │ ├── test_mat_pixel.cpp │ ├── test_mat_pixel_affine.cpp │ ├── test_mat_pixel_drawing.cpp │ ├── test_mat_pixel_resize.cpp │ ├── test_mat_pixel_rotate.cpp │ ├── test_matmul.cpp │ ├── test_memorydata.cpp │ ├── test_mish.cpp │ ├── test_multiheadattention.cpp │ ├── test_multiheadattention_1.cpp │ ├── test_multiheadattention_kvcache.cpp │ ├── test_multiheadattention_oom.cpp │ ├── test_noop.cpp │ ├── test_normalize.cpp │ ├── test_packing.cpp │ ├── test_padding.cpp │ ├── test_paramdict.cpp │ ├── test_permute.cpp │ ├── test_pixelshuffle.cpp │ ├── test_pooling.cpp │ ├── test_pooling1d.cpp │ ├── test_pooling3d.cpp │ ├── test_power.cpp │ ├── test_prelu.cpp │ ├── test_priorbox.cpp │ ├── test_quantize.cpp │ ├── test_quantize_oom.cpp │ ├── test_reduction.cpp │ ├── test_relu.cpp │ ├── test_reorg.cpp │ ├── test_requantize.cpp │ ├── test_requantize_oom.cpp │ ├── test_reshape.cpp │ ├── test_reshape_1.cpp │ ├── test_reshape_oom.cpp │ ├── test_rmsnorm.cpp │ ├── test_rnn.cpp │ ├── test_roialign.cpp │ ├── test_roipooling.cpp │ ├── test_rotaryembed.cpp │ ├── test_rotaryembed_oom.cpp │ ├── test_scale.cpp │ ├── test_sdpa.cpp │ ├── test_sdpa_kvcache.cpp │ ├── test_sdpa_oom.cpp │ ├── test_selu.cpp │ ├── test_shrink.cpp │ ├── test_shufflechannel.cpp │ ├── test_sigmoid.cpp │ ├── test_slice.cpp │ ├── test_slice_oom.cpp │ ├── test_softmax.cpp │ ├── test_softmax_oom.cpp │ ├── test_softplus.cpp │ ├── test_spectrogram.cpp │ 
├── test_squeeze.cpp │ ├── test_squeezenet.cpp │ ├── test_swish.cpp │ ├── test_tanh.cpp │ ├── test_tile.cpp │ ├── test_tile_oom.cpp │ ├── test_unaryop.cpp │ ├── test_unfold.cpp │ ├── test_yolov3detectionoutput.cpp │ ├── testutil.cpp │ └── testutil.h ├── toolchains/ │ ├── aarch64-linux-gnu-c.toolchain.cmake │ ├── aarch64-linux-gnu.toolchain.cmake │ ├── aarch64-qnx.toolchain.cmake │ ├── anykav500.toolchain.cmake │ ├── arm-linux-gnueabi-c.toolchain.cmake │ ├── arm-linux-gnueabi.toolchain.cmake │ ├── arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake │ ├── arm-linux-gnueabihf.toolchain.cmake │ ├── c906-v310.toolchain.cmake │ ├── c907-rv32-v310.toolchain.cmake │ ├── c907-v310.toolchain.cmake │ ├── c908-v310.toolchain.cmake │ ├── c910-v310.toolchain.cmake │ ├── esp32.toolchain.cmake │ ├── himix100.toolchain.cmake │ ├── himix200.toolchain.cmake │ ├── himix210.toolchain.cmake │ ├── hisiv300.toolchain.cmake │ ├── hisiv500.toolchain.cmake │ ├── hisiv600.toolchain.cmake │ ├── host-c.clang.toolchain.cmake │ ├── host-c.gcc.toolchain.cmake │ ├── host.clang-m32.toolchain.cmake │ ├── host.gcc-c++03.toolchain.cmake │ ├── host.gcc-m32.toolchain.cmake │ ├── host.gcc.toolchain.cmake │ ├── ingenic-x2000.toolchain.cmake │ ├── ios.toolchain.cmake │ ├── iossimxc-x64.toolchain.cmake │ ├── iossimxc.toolchain.cmake │ ├── iosxc-arm64.toolchain.cmake │ ├── iosxc.toolchain.cmake │ ├── jetson.toolchain.cmake │ ├── k1.llvm.toolchain.cmake │ ├── k1.toolchain.cmake │ ├── loongarch64-linux-gnu.toolchain.cmake │ ├── loongarch64-unknown-linux-gnu.toolchain.cmake │ ├── loongson2f-linux-gnuabi64.toolchain.cmake │ ├── mips-mti-linux-gnu.toolchain.cmake │ ├── mips32r2-linux-gnu.toolchain.cmake │ ├── mips64el-linux-gnuabi64.toolchain.cmake │ ├── mipsel-linux-gnu.toolchain.cmake │ ├── mipsisa32r6el-linux-gnu.toolchain.cmake │ ├── mipsisa64r6el-linux-gnuabi64.toolchain.cmake │ ├── pi3.toolchain.cmake │ ├── power8le-linux-gnu-vsx.clang.toolchain.cmake │ ├── power8le-linux-gnu-vsx.toolchain.cmake │ ├── 
power9le-linux-gnu-vsx.clang.toolchain.cmake │ ├── power9le-linux-gnu-vsx.toolchain.cmake │ ├── powerpc-linux-gnu.toolchain.cmake │ ├── powerpc64le-linux-gnu.toolchain.cmake │ ├── riscv32-unknown-elf.toolchain.cmake │ ├── riscv64-linux-gnu.toolchain.cmake │ ├── riscv64-unknown-elf.toolchain.cmake │ ├── riscv64-unknown-linux-gnu.llvm-toolchain.cmake │ ├── riscv64-unknown-linux-gnu.toolchain.cmake │ ├── v831.toolchain.cmake │ ├── windows-xp-clang.toolchain.cmake │ ├── windows-xp-mingw.toolchain.cmake │ └── windows-xp-msvc.toolchain.cmake └── tools/ ├── CMakeLists.txt ├── caffe/ │ ├── CMakeLists.txt │ ├── caffe.proto │ └── caffe2ncnn.cpp ├── darknet/ │ ├── CMakeLists.txt │ ├── README.md │ └── darknet2ncnn.cpp ├── keras/ │ └── readme.md ├── mlir/ │ ├── CMakeLists.txt │ ├── fix_td.sh │ ├── mlir2ncnn.cpp │ ├── ncnn_dialect.cpp │ ├── ncnn_dialect.h │ ├── ncnn_ops.td │ ├── ncnn_rewriter.cpp │ ├── ncnn_rewriter.td │ ├── tf_attributes.cc │ ├── tf_attributes.h │ ├── tf_dialect.cpp │ ├── tf_dialect.h │ ├── tf_generated_ops.td │ ├── tf_op_base.td │ ├── tf_ops.td │ ├── tf_side_effects.h │ ├── tf_traits.h │ ├── tf_types.cc │ ├── tf_types.def │ └── tf_types.h ├── modelwriter.h ├── mxnet/ │ ├── CMakeLists.txt │ └── mxnet2ncnn.cpp ├── ncnn2mem.cpp ├── ncnnmerge.cpp ├── ncnnoptimize.cpp ├── onnx/ │ ├── CMakeLists.txt │ ├── README.md │ ├── onnx.proto │ └── onnx2ncnn.cpp ├── plugin/ │ ├── ImageWatchNCNN.natvis │ ├── ImageWatchNNIE.natvis │ └── README.md ├── pnnx/ │ ├── CMakeLists.txt │ ├── README.md │ ├── cmake/ │ │ └── PNNXPyTorch.cmake │ ├── python/ │ │ ├── README.md │ │ ├── examples/ │ │ │ ├── convert.py │ │ │ └── export.py │ │ ├── pnnx/ │ │ │ ├── __init__.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── convert.py │ │ │ ├── export.py │ │ │ └── utils.py │ │ ├── requirements.txt │ │ ├── setup.py │ │ └── tests/ │ │ ├── test_convert.py │ │ ├── test_dynamicinput_convert.py │ │ ├── test_dynamicinput_export.py │ │ ├── test_export.py │ │ ├── test_naiveinput_convert.py │ │ └── 
test_naiveinput_export.py │ ├── src/ │ │ ├── CMakeLists.txt │ │ ├── ir.cpp │ │ ├── ir.h │ │ ├── load_onnx.cpp │ │ ├── load_onnx.h │ │ ├── load_tnn.cpp │ │ ├── load_tnn.h │ │ ├── load_torchscript.cpp │ │ ├── load_torchscript.h │ │ ├── main.cpp │ │ ├── onnx-data.proto │ │ ├── onnx-ml.proto │ │ ├── onnx-operators-ml.proto │ │ ├── pass_level0/ │ │ │ ├── constant_unpooling.cpp │ │ │ ├── constant_unpooling.h │ │ │ ├── convert_half_to_float.cpp │ │ │ ├── convert_half_to_float.h │ │ │ ├── flatten_input.cpp │ │ │ ├── flatten_input.h │ │ │ ├── inline_block.cpp │ │ │ ├── inline_block.h │ │ │ ├── reset_device.cpp │ │ │ ├── reset_device.h │ │ │ ├── shape_inference.cpp │ │ │ └── shape_inference.h │ │ ├── pass_level0.cpp │ │ ├── pass_level0.h │ │ ├── pass_level1/ │ │ │ ├── fuse_module_pass.cpp │ │ │ ├── fuse_module_pass.h │ │ │ ├── nn_AdaptiveAvgPool1d.cpp │ │ │ ├── nn_AdaptiveAvgPool2d.cpp │ │ │ ├── nn_AdaptiveAvgPool3d.cpp │ │ │ ├── nn_AdaptiveMaxPool1d.cpp │ │ │ ├── nn_AdaptiveMaxPool2d.cpp │ │ │ ├── nn_AdaptiveMaxPool3d.cpp │ │ │ ├── nn_AlphaDropout.cpp │ │ │ ├── nn_AvgPool1d.cpp │ │ │ ├── nn_AvgPool2d.cpp │ │ │ ├── nn_AvgPool3d.cpp │ │ │ ├── nn_BatchNorm1d.cpp │ │ │ ├── nn_BatchNorm2d.cpp │ │ │ ├── nn_BatchNorm3d.cpp │ │ │ ├── nn_CELU.cpp │ │ │ ├── nn_ChannelShuffle.cpp │ │ │ ├── nn_ConstantPad1d.cpp │ │ │ ├── nn_ConstantPad2d.cpp │ │ │ ├── nn_ConstantPad3d.cpp │ │ │ ├── nn_Conv1d.cpp │ │ │ ├── nn_Conv2d.cpp │ │ │ ├── nn_Conv3d.cpp │ │ │ ├── nn_ConvTranspose1d.cpp │ │ │ ├── nn_ConvTranspose2d.cpp │ │ │ ├── nn_ConvTranspose3d.cpp │ │ │ ├── nn_Dropout.cpp │ │ │ ├── nn_Dropout2d.cpp │ │ │ ├── nn_Dropout3d.cpp │ │ │ ├── nn_ELU.cpp │ │ │ ├── nn_Embedding.cpp │ │ │ ├── nn_Fold.cpp │ │ │ ├── nn_GELU.cpp │ │ │ ├── nn_GLU.cpp │ │ │ ├── nn_GRU.cpp │ │ │ ├── nn_GroupNorm.cpp │ │ │ ├── nn_Hardshrink.cpp │ │ │ ├── nn_Hardsigmoid.cpp │ │ │ ├── nn_Hardswish.cpp │ │ │ ├── nn_Hardtanh.cpp │ │ │ ├── nn_InstanceNorm1d.cpp │ │ │ ├── nn_InstanceNorm2d.cpp │ │ │ ├── nn_InstanceNorm3d.cpp │ │ │ 
├── nn_LPPool1d.cpp │ │ │ ├── nn_LPPool2d.cpp │ │ │ ├── nn_LSTM.cpp │ │ │ ├── nn_LayerNorm.cpp │ │ │ ├── nn_LeakyReLU.cpp │ │ │ ├── nn_Linear.cpp │ │ │ ├── nn_LocalResponseNorm.cpp │ │ │ ├── nn_LogSigmoid.cpp │ │ │ ├── nn_LogSoftmax.cpp │ │ │ ├── nn_MaxPool1d.cpp │ │ │ ├── nn_MaxPool2d.cpp │ │ │ ├── nn_MaxPool3d.cpp │ │ │ ├── nn_Mish.cpp │ │ │ ├── nn_MultiheadAttention.cpp │ │ │ ├── nn_PReLU.cpp │ │ │ ├── nn_PixelShuffle.cpp │ │ │ ├── nn_PixelUnshuffle.cpp │ │ │ ├── nn_RMSNorm.cpp │ │ │ ├── nn_RNN.cpp │ │ │ ├── nn_RReLU.cpp │ │ │ ├── nn_ReLU.cpp │ │ │ ├── nn_ReLU6.cpp │ │ │ ├── nn_ReflectionPad1d.cpp │ │ │ ├── nn_ReflectionPad2d.cpp │ │ │ ├── nn_ReplicationPad1d.cpp │ │ │ ├── nn_ReplicationPad2d.cpp │ │ │ ├── nn_ReplicationPad3d.cpp │ │ │ ├── nn_SELU.cpp │ │ │ ├── nn_SiLU.cpp │ │ │ ├── nn_Sigmoid.cpp │ │ │ ├── nn_Softmax.cpp │ │ │ ├── nn_Softmax2d.cpp │ │ │ ├── nn_Softmin.cpp │ │ │ ├── nn_Softplus.cpp │ │ │ ├── nn_Softshrink.cpp │ │ │ ├── nn_Softsign.cpp │ │ │ ├── nn_Tanh.cpp │ │ │ ├── nn_Tanhshrink.cpp │ │ │ ├── nn_Threshold.cpp │ │ │ ├── nn_Unfold.cpp │ │ │ ├── nn_Upsample.cpp │ │ │ ├── nn_UpsamplingBilinear2d.cpp │ │ │ ├── nn_UpsamplingNearest2d.cpp │ │ │ ├── nn_ZeroPad2d.cpp │ │ │ ├── nn_maxunpool2d.cpp │ │ │ ├── nn_quantized_Conv2d.cpp │ │ │ ├── nn_quantized_DeQuantize.cpp │ │ │ ├── nn_quantized_Linear.cpp │ │ │ ├── nn_quantized_Quantize.cpp │ │ │ ├── torchvision_DeformConv2d.cpp │ │ │ └── torchvision_RoIAlign.cpp │ │ ├── pass_level1.cpp │ │ ├── pass_level1.h │ │ ├── pass_level2/ │ │ │ ├── F_adaptive_avg_pool1d.cpp │ │ │ ├── F_adaptive_avg_pool2d.cpp │ │ │ ├── F_adaptive_avg_pool3d.cpp │ │ │ ├── F_adaptive_max_pool1d.cpp │ │ │ ├── F_adaptive_max_pool2d.cpp │ │ │ ├── F_adaptive_max_pool3d.cpp │ │ │ ├── F_affine_grid.cpp │ │ │ ├── F_alpha_dropout.cpp │ │ │ ├── F_avg_pool1d.cpp │ │ │ ├── F_avg_pool2d.cpp │ │ │ ├── F_avg_pool3d.cpp │ │ │ ├── F_batch_norm.cpp │ │ │ ├── F_celu.cpp │ │ │ ├── F_conv1d.cpp │ │ │ ├── F_conv2d.cpp │ │ │ ├── F_conv3d.cpp │ │ │ ├── 
F_conv_transpose1d.cpp │ │ │ ├── F_conv_transpose2d.cpp │ │ │ ├── F_conv_transpose3d.cpp │ │ │ ├── F_dropout.cpp │ │ │ ├── F_dropout23d.cpp │ │ │ ├── F_elu.cpp │ │ │ ├── F_embedding.cpp │ │ │ ├── F_feature_alpha_dropout.cpp │ │ │ ├── F_fold.cpp │ │ │ ├── F_gelu.cpp │ │ │ ├── F_glu.cpp │ │ │ ├── F_grid_sample.cpp │ │ │ ├── F_group_norm.cpp │ │ │ ├── F_hardshrink.cpp │ │ │ ├── F_hardsigmoid.cpp │ │ │ ├── F_hardswish.cpp │ │ │ ├── F_hardtanh.cpp │ │ │ ├── F_instance_norm.cpp │ │ │ ├── F_interpolate.cpp │ │ │ ├── F_layer_norm.cpp │ │ │ ├── F_leaky_relu.cpp │ │ │ ├── F_linear.cpp │ │ │ ├── F_local_response_norm.cpp │ │ │ ├── F_log_softmax.cpp │ │ │ ├── F_logsigmoid.cpp │ │ │ ├── F_lp_pool1d.cpp │ │ │ ├── F_lp_pool2d.cpp │ │ │ ├── F_max_pool1d.cpp │ │ │ ├── F_max_pool2d.cpp │ │ │ ├── F_max_pool3d.cpp │ │ │ ├── F_mish.cpp │ │ │ ├── F_normalize.cpp │ │ │ ├── F_pad.cpp │ │ │ ├── F_pairwise_distance.cpp │ │ │ ├── F_pixel_shuffle.cpp │ │ │ ├── F_pixel_unshuffle.cpp │ │ │ ├── F_prelu.cpp │ │ │ ├── F_relu.cpp │ │ │ ├── F_relu6.cpp │ │ │ ├── F_rms_norm.cpp │ │ │ ├── F_rrelu.cpp │ │ │ ├── F_scaled_dot_product_attention.cpp │ │ │ ├── F_selu.cpp │ │ │ ├── F_sigmoid.cpp │ │ │ ├── F_silu.cpp │ │ │ ├── F_softmax.cpp │ │ │ ├── F_softmin.cpp │ │ │ ├── F_softplus.cpp │ │ │ ├── F_softshrink.cpp │ │ │ ├── F_softsign.cpp │ │ │ ├── F_tanh.cpp │ │ │ ├── F_tanhshrink.cpp │ │ │ ├── F_threshold.cpp │ │ │ ├── F_unfold.cpp │ │ │ ├── F_upsample.cpp │ │ │ ├── F_upsample_bilinear.cpp │ │ │ ├── F_upsample_nearest.cpp │ │ │ ├── README.md │ │ │ ├── Tensor_copy.cpp │ │ │ ├── Tensor_expand.cpp │ │ │ ├── Tensor_expand_as.cpp │ │ │ ├── Tensor_fill.cpp │ │ │ ├── Tensor_index.cpp │ │ │ ├── Tensor_index_put.cpp │ │ │ ├── Tensor_masked_fill.cpp │ │ │ ├── Tensor_new_empty.cpp │ │ │ ├── Tensor_new_ones.cpp │ │ │ ├── Tensor_new_zeros.cpp │ │ │ ├── Tensor_permute.cpp │ │ │ ├── Tensor_repeat.cpp │ │ │ ├── Tensor_reshape.cpp │ │ │ ├── Tensor_reshape_as.cpp │ │ │ ├── Tensor_select.cpp │ │ │ ├── Tensor_size.cpp │ │ │ 
├── Tensor_slice.cpp │ │ │ ├── Tensor_to.cpp │ │ │ ├── Tensor_type_as.cpp │ │ │ ├── Tensor_unflatten.cpp │ │ │ ├── eliminate_contiguous.cpp │ │ │ ├── eliminate_contiguous.h │ │ │ ├── eliminate_size_numtotensor_int.cpp │ │ │ ├── eliminate_size_numtotensor_int.h │ │ │ ├── functionize.cpp │ │ │ ├── functionize.h │ │ │ ├── fuse_constantlist.cpp │ │ │ ├── fuse_constantlist.h │ │ │ ├── nn_GRU.cpp │ │ │ ├── nn_LSTM.cpp │ │ │ ├── nn_RNN.cpp │ │ │ ├── nn_quantized_FloatFunctional.cpp │ │ │ ├── torch_addmm.cpp │ │ │ ├── torch_amax.cpp │ │ │ ├── torch_amin.cpp │ │ │ ├── torch_arange.cpp │ │ │ ├── torch_argmax.cpp │ │ │ ├── torch_argmin.cpp │ │ │ ├── torch_as_strided.cpp │ │ │ ├── torch_baddbmm.cpp │ │ │ ├── torch_bitwise_and.cpp │ │ │ ├── torch_bitwise_left_shift.cpp │ │ │ ├── torch_bitwise_not.cpp │ │ │ ├── torch_bitwise_or.cpp │ │ │ ├── torch_bitwise_right_shift.cpp │ │ │ ├── torch_bitwise_xor.cpp │ │ │ ├── torch_bmm.cpp │ │ │ ├── torch_cat.cpp │ │ │ ├── torch_chunk.cpp │ │ │ ├── torch_clamp.cpp │ │ │ ├── torch_clone.cpp │ │ │ ├── torch_complex.cpp │ │ │ ├── torch_cross.cpp │ │ │ ├── torch_cumprod.cpp │ │ │ ├── torch_cumsum.cpp │ │ │ ├── torch_dequantize.cpp │ │ │ ├── torch_diag.cpp │ │ │ ├── torch_einsum.cpp │ │ │ ├── torch_empty.cpp │ │ │ ├── torch_empty_like.cpp │ │ │ ├── torch_eq.cpp │ │ │ ├── torch_fft_fft.cpp │ │ │ ├── torch_fft_fft2.cpp │ │ │ ├── torch_fft_fftn.cpp │ │ │ ├── torch_fft_hfft.cpp │ │ │ ├── torch_fft_hfft2.cpp │ │ │ ├── torch_fft_hfftn.cpp │ │ │ ├── torch_fft_ifft.cpp │ │ │ ├── torch_fft_ifft2.cpp │ │ │ ├── torch_fft_ifftn.cpp │ │ │ ├── torch_fft_ihfft.cpp │ │ │ ├── torch_fft_ihfft2.cpp │ │ │ ├── torch_fft_ihfftn.cpp │ │ │ ├── torch_fft_irfft.cpp │ │ │ ├── torch_fft_irfft2.cpp │ │ │ ├── torch_fft_irfftn.cpp │ │ │ ├── torch_fft_rfft.cpp │ │ │ ├── torch_fft_rfft2.cpp │ │ │ ├── torch_fft_rfftn.cpp │ │ │ ├── torch_flatten.cpp │ │ │ ├── torch_flip.cpp │ │ │ ├── torch_full.cpp │ │ │ ├── torch_full_like.cpp │ │ │ ├── torch_gather.cpp │ │ │ ├── torch_ge.cpp │ │ 
│ ├── torch_gt.cpp │ │ │ ├── torch_imag.cpp │ │ │ ├── torch_index_select.cpp │ │ │ ├── torch_istft.cpp │ │ │ ├── torch_le.cpp │ │ │ ├── torch_lgamma.cpp │ │ │ ├── torch_logical_and.cpp │ │ │ ├── torch_logical_not.cpp │ │ │ ├── torch_logical_or.cpp │ │ │ ├── torch_logical_xor.cpp │ │ │ ├── torch_logsumexp.cpp │ │ │ ├── torch_lt.cpp │ │ │ ├── torch_masked_select.cpp │ │ │ ├── torch_matmul.cpp │ │ │ ├── torch_max.cpp │ │ │ ├── torch_mean.cpp │ │ │ ├── torch_min.cpp │ │ │ ├── torch_mm.cpp │ │ │ ├── torch_mv.cpp │ │ │ ├── torch_narrow.cpp │ │ │ ├── torch_ne.cpp │ │ │ ├── torch_norm.cpp │ │ │ ├── torch_normal.cpp │ │ │ ├── torch_ones.cpp │ │ │ ├── torch_ones_like.cpp │ │ │ ├── torch_positive.cpp │ │ │ ├── torch_prod.cpp │ │ │ ├── torch_quantize_per_tensor.cpp │ │ │ ├── torch_randn.cpp │ │ │ ├── torch_randn_like.cpp │ │ │ ├── torch_real.cpp │ │ │ ├── torch_repeat_interleave.cpp │ │ │ ├── torch_roll.cpp │ │ │ ├── torch_scatter_add.cpp │ │ │ ├── torch_slice_scatter.cpp │ │ │ ├── torch_split.cpp │ │ │ ├── torch_squeeze.cpp │ │ │ ├── torch_stack.cpp │ │ │ ├── torch_std.cpp │ │ │ ├── torch_stft.cpp │ │ │ ├── torch_sum.cpp │ │ │ ├── torch_t.cpp │ │ │ ├── torch_tensor_split.cpp │ │ │ ├── torch_tile.cpp │ │ │ ├── torch_topk.cpp │ │ │ ├── torch_transpose.cpp │ │ │ ├── torch_unbind.cpp │ │ │ ├── torch_unsqueeze.cpp │ │ │ ├── torch_var.cpp │ │ │ ├── torch_view_as_complex.cpp │ │ │ ├── torch_view_as_real.cpp │ │ │ ├── torch_where.cpp │ │ │ ├── torch_zeros.cpp │ │ │ ├── torch_zeros_like.cpp │ │ │ ├── torchaudio_F_inverse_spectrogram.cpp │ │ │ └── torchaudio_F_spectrogram.cpp │ │ ├── pass_level2.cpp │ │ ├── pass_level2.h │ │ ├── pass_level3/ │ │ │ ├── assign_unique_name.cpp │ │ │ ├── assign_unique_name.h │ │ │ ├── eliminate_noop_math.cpp │ │ │ ├── eliminate_noop_math.h │ │ │ ├── eliminate_squeeze_unsqueeze_pair.cpp │ │ │ ├── eliminate_squeeze_unsqueeze_pair.h │ │ │ ├── eliminate_tuple_pair.cpp │ │ │ ├── eliminate_tuple_pair.h │ │ │ ├── expand_quantization_modules.cpp │ │ │ ├── 
expand_quantization_modules.h │ │ │ ├── fuse_dynamic_adaptive_pool.cpp │ │ │ ├── fuse_dynamic_adaptive_pool.h │ │ │ ├── fuse_einsum_operands.cpp │ │ │ ├── fuse_einsum_operands.h │ │ │ ├── fuse_expression.cpp │ │ │ ├── fuse_expression.h │ │ │ ├── fuse_index_expression.cpp │ │ │ ├── fuse_index_expression.h │ │ │ ├── fuse_maxpool_unpack.cpp │ │ │ ├── fuse_maxpool_unpack.h │ │ │ ├── fuse_multiheadattention_unpack.cpp │ │ │ ├── fuse_multiheadattention_unpack.h │ │ │ ├── fuse_op1ton_unpack.cpp │ │ │ ├── fuse_op1ton_unpack.h │ │ │ ├── fuse_opnto1_tensors.cpp │ │ │ ├── fuse_opnto1_tensors.h │ │ │ ├── fuse_rnn_unpack.cpp │ │ │ ├── fuse_rnn_unpack.h │ │ │ ├── rename_F_dropoutnd.cpp │ │ │ └── rename_F_dropoutnd.h │ │ ├── pass_level3.cpp │ │ ├── pass_level3.h │ │ ├── pass_level4/ │ │ │ ├── attribute_pooling.cpp │ │ │ ├── attribute_pooling.h │ │ │ ├── canonicalize.cpp │ │ │ ├── canonicalize.h │ │ │ ├── dead_code_elimination.cpp │ │ │ ├── dead_code_elimination.h │ │ │ ├── fuse_custom_op.cpp │ │ │ └── fuse_custom_op.h │ │ ├── pass_level4.cpp │ │ ├── pass_level4.h │ │ ├── pass_level5/ │ │ │ ├── attribute_unpooling.cpp │ │ │ ├── attribute_unpooling.h │ │ │ ├── eliminate_dropout.cpp │ │ │ ├── eliminate_dropout.h │ │ │ ├── eliminate_identity_operator.cpp │ │ │ ├── eliminate_identity_operator.h │ │ │ ├── eliminate_maxpool_indices.cpp │ │ │ ├── eliminate_maxpool_indices.h │ │ │ ├── eliminate_noop_cat.cpp │ │ │ ├── eliminate_noop_cat.h │ │ │ ├── eliminate_noop_einsum.cpp │ │ │ ├── eliminate_noop_einsum.h │ │ │ ├── eliminate_noop_expand.cpp │ │ │ ├── eliminate_noop_expand.h │ │ │ ├── eliminate_noop_expression.cpp │ │ │ ├── eliminate_noop_expression.h │ │ │ ├── eliminate_noop_pad.cpp │ │ │ ├── eliminate_noop_pad.h │ │ │ ├── eliminate_noop_permute.cpp │ │ │ ├── eliminate_noop_permute.h │ │ │ ├── eliminate_noop_reshape.cpp │ │ │ ├── eliminate_noop_reshape.h │ │ │ ├── eliminate_noop_slice.cpp │ │ │ ├── eliminate_noop_slice.h │ │ │ ├── eliminate_noop_upsample.cpp │ │ │ ├── 
eliminate_noop_upsample.h │ │ │ ├── eliminate_reshape_shape_expression.cpp │ │ │ ├── eliminate_reshape_shape_expression.h │ │ │ ├── eliminate_type_as.cpp │ │ │ ├── eliminate_type_as.h │ │ │ ├── eval_expression.cpp │ │ │ ├── eval_expression.h │ │ │ ├── fold_constants.cpp │ │ │ ├── fold_constants.h │ │ │ ├── fuse_adjacent_permute.cpp │ │ │ ├── fuse_adjacent_permute.h │ │ │ ├── fuse_adjacent_reshape.cpp │ │ │ ├── fuse_adjacent_reshape.h │ │ │ ├── fuse_channel_shuffle.cpp │ │ │ ├── fuse_channel_shuffle.h │ │ │ ├── fuse_constant_expression.cpp │ │ │ ├── fuse_constant_expression.h │ │ │ ├── fuse_conv1d_batchnorm1d.cpp │ │ │ ├── fuse_conv1d_batchnorm1d.h │ │ │ ├── fuse_conv2d_batchnorm2d.cpp │ │ │ ├── fuse_conv2d_batchnorm2d.h │ │ │ ├── fuse_conv3d_batchnorm3d.cpp │ │ │ ├── fuse_conv3d_batchnorm3d.h │ │ │ ├── fuse_convtranspose1d_batchnorm1d.cpp │ │ │ ├── fuse_convtranspose1d_batchnorm1d.h │ │ │ ├── fuse_convtranspose2d_batchnorm2d.cpp │ │ │ ├── fuse_convtranspose2d_batchnorm2d.h │ │ │ ├── fuse_convtranspose3d_batchnorm3d.cpp │ │ │ ├── fuse_convtranspose3d_batchnorm3d.h │ │ │ ├── fuse_layernorm.cpp │ │ │ ├── fuse_layernorm.h │ │ │ ├── fuse_linear_batchnorm1d.cpp │ │ │ ├── fuse_linear_batchnorm1d.h │ │ │ ├── fuse_multiheadattention.cpp │ │ │ ├── fuse_multiheadattention.h │ │ │ ├── fuse_multiheadattention_sameqkv.cpp │ │ │ ├── fuse_multiheadattention_sameqkv.h │ │ │ ├── fuse_pad_conv1d.cpp │ │ │ ├── fuse_pad_conv1d.h │ │ │ ├── fuse_pad_conv2d.cpp │ │ │ ├── fuse_pad_conv2d.h │ │ │ ├── fuse_pixel_shuffle.cpp │ │ │ ├── fuse_pixel_shuffle.h │ │ │ ├── fuse_pixel_unshuffle.cpp │ │ │ ├── fuse_pixel_unshuffle.h │ │ │ ├── fuse_rmsnorm.cpp │ │ │ ├── fuse_rmsnorm.h │ │ │ ├── fuse_scaled_dot_product_attention.cpp │ │ │ ├── fuse_scaled_dot_product_attention.h │ │ │ ├── fuse_select_to_unbind.cpp │ │ │ ├── fuse_select_to_unbind.h │ │ │ ├── fuse_silu.cpp │ │ │ ├── fuse_silu.h │ │ │ ├── fuse_slice_copy.cpp │ │ │ ├── fuse_slice_copy.h │ │ │ ├── fuse_slice_indices.cpp │ │ │ ├── 
fuse_slice_indices.h │ │ │ ├── fuse_slice_squeeze_to_select.cpp │ │ │ ├── fuse_slice_squeeze_to_select.h │ │ │ ├── fuse_slice_to_tensor_split.cpp │ │ │ ├── fuse_slice_to_tensor_split.h │ │ │ ├── fuse_static_batchnorm.cpp │ │ │ ├── fuse_static_batchnorm.h │ │ │ ├── fuse_static_conv.cpp │ │ │ ├── fuse_static_conv.h │ │ │ ├── fuse_static_convtranspose.cpp │ │ │ ├── fuse_static_convtranspose.h │ │ │ ├── fuse_static_embedding.cpp │ │ │ ├── fuse_static_embedding.h │ │ │ ├── fuse_static_groupnorm.cpp │ │ │ ├── fuse_static_groupnorm.h │ │ │ ├── fuse_static_instancenorm.cpp │ │ │ ├── fuse_static_instancenorm.h │ │ │ ├── fuse_static_layernorm.cpp │ │ │ ├── fuse_static_layernorm.h │ │ │ ├── fuse_static_linear.cpp │ │ │ ├── fuse_static_linear.h │ │ │ ├── fuse_static_prelu.cpp │ │ │ ├── fuse_static_prelu.h │ │ │ ├── fuse_static_rmsnorm.cpp │ │ │ ├── fuse_static_rmsnorm.h │ │ │ ├── fuse_transformers_multiheadattention.cpp │ │ │ ├── fuse_transformers_multiheadattention.h │ │ │ ├── fuse_transformers_scaled_dot_product_attention.cpp │ │ │ ├── fuse_transformers_scaled_dot_product_attention.h │ │ │ ├── normalize_einsum_equation.cpp │ │ │ ├── normalize_einsum_equation.h │ │ │ ├── unroll_rnn_op.cpp │ │ │ └── unroll_rnn_op.h │ │ ├── pass_level5.cpp │ │ ├── pass_level5.h │ │ ├── pass_ncnn/ │ │ │ ├── F_adaptive_avg_pool1d.cpp │ │ │ ├── F_adaptive_avg_pool2d.cpp │ │ │ ├── F_adaptive_avg_pool3d.cpp │ │ │ ├── F_adaptive_max_pool1d.cpp │ │ │ ├── F_adaptive_max_pool2d.cpp │ │ │ ├── F_adaptive_max_pool3d.cpp │ │ │ ├── F_avg_pool1d.cpp │ │ │ ├── F_avg_pool2d.cpp │ │ │ ├── F_avg_pool3d.cpp │ │ │ ├── F_batch_norm.cpp │ │ │ ├── F_celu.cpp │ │ │ ├── F_conv1d.cpp │ │ │ ├── F_conv2d.cpp │ │ │ ├── F_conv3d.cpp │ │ │ ├── F_conv_transpose1d.cpp │ │ │ ├── F_conv_transpose2d.cpp │ │ │ ├── F_conv_transpose3d.cpp │ │ │ ├── F_elu.cpp │ │ │ ├── F_embedding.cpp │ │ │ ├── F_fold.cpp │ │ │ ├── F_gelu.cpp │ │ │ ├── F_glu.cpp │ │ │ ├── F_grid_sample.cpp │ │ │ ├── F_group_norm.cpp │ │ │ ├── F_hardshrink.cpp │ │ │ 
├── F_hardsigmoid.cpp │ │ │ ├── F_hardswish.cpp │ │ │ ├── F_hardtanh.cpp │ │ │ ├── F_instance_norm.cpp │ │ │ ├── F_interpolate.cpp │ │ │ ├── F_layer_norm.cpp │ │ │ ├── F_leaky_relu.cpp │ │ │ ├── F_linear.cpp │ │ │ ├── F_local_response_norm.cpp │ │ │ ├── F_log_softmax.cpp │ │ │ ├── F_logsigmoid.cpp │ │ │ ├── F_max_pool1d.cpp │ │ │ ├── F_max_pool2d.cpp │ │ │ ├── F_max_pool3d.cpp │ │ │ ├── F_mish.cpp │ │ │ ├── F_normalize.cpp │ │ │ ├── F_pad.cpp │ │ │ ├── F_pixel_shuffle.cpp │ │ │ ├── F_pixel_unshuffle.cpp │ │ │ ├── F_prelu.cpp │ │ │ ├── F_relu.cpp │ │ │ ├── F_relu6.cpp │ │ │ ├── F_rms_norm.cpp │ │ │ ├── F_scaled_dot_product_attention.cpp │ │ │ ├── F_selu.cpp │ │ │ ├── F_sigmoid.cpp │ │ │ ├── F_silu.cpp │ │ │ ├── F_softmax.cpp │ │ │ ├── F_softplus.cpp │ │ │ ├── F_softshrink.cpp │ │ │ ├── F_tanh.cpp │ │ │ ├── F_unfold.cpp │ │ │ ├── F_upsample.cpp │ │ │ ├── F_upsample_bilinear.cpp │ │ │ ├── F_upsample_nearest.cpp │ │ │ ├── Tensor_expand.cpp │ │ │ ├── Tensor_permute.cpp │ │ │ ├── Tensor_repeat.cpp │ │ │ ├── Tensor_reshape.cpp │ │ │ ├── Tensor_reshape_as.cpp │ │ │ ├── Tensor_unflatten.cpp │ │ │ ├── chain_multi_output.cpp │ │ │ ├── chain_multi_output.h │ │ │ ├── convert_Tensor_select.cpp │ │ │ ├── convert_Tensor_select.h │ │ │ ├── convert_Tensor_slice.cpp │ │ │ ├── convert_Tensor_slice.h │ │ │ ├── convert_Tensor_slice_copy.cpp │ │ │ ├── convert_Tensor_slice_copy.h │ │ │ ├── convert_attribute.cpp │ │ │ ├── convert_attribute.h │ │ │ ├── convert_custom_op.cpp │ │ │ ├── convert_custom_op.h │ │ │ ├── convert_half_to_float.cpp │ │ │ ├── convert_half_to_float.h │ │ │ ├── convert_input.cpp │ │ │ ├── convert_input.h │ │ │ ├── convert_module_op.cpp │ │ │ ├── convert_module_op.h │ │ │ ├── convert_reshape_interp_expression.cpp │ │ │ ├── convert_reshape_interp_expression.h │ │ │ ├── convert_slice_expression.cpp │ │ │ ├── convert_slice_expression.h │ │ │ ├── convert_torch_cat.cpp │ │ │ ├── convert_torch_cat.h │ │ │ ├── convert_torch_chunk.cpp │ │ │ ├── convert_torch_chunk.h │ │ │ ├── 
convert_torch_einsum.cpp │ │ │ ├── convert_torch_einsum.h │ │ │ ├── convert_torch_split.cpp │ │ │ ├── convert_torch_split.h │ │ │ ├── convert_torch_stack.cpp │ │ │ ├── convert_torch_stack.h │ │ │ ├── convert_torch_tensor_split.cpp │ │ │ ├── convert_torch_tensor_split.h │ │ │ ├── convert_torch_unbind.cpp │ │ │ ├── convert_torch_unbind.h │ │ │ ├── eliminate_noop.cpp │ │ │ ├── eliminate_noop.h │ │ │ ├── eliminate_output.cpp │ │ │ ├── eliminate_output.h │ │ │ ├── expand_expression.cpp │ │ │ ├── expand_expression.h │ │ │ ├── fuse_binaryop_eltwise.cpp │ │ │ ├── fuse_binaryop_eltwise.h │ │ │ ├── fuse_convert_rotaryembed.cpp │ │ │ ├── fuse_convert_rotaryembed.h │ │ │ ├── fuse_convert_shufflechannel_slice.cpp │ │ │ ├── fuse_convert_shufflechannel_slice.h │ │ │ ├── fuse_convolution1d_activation.cpp │ │ │ ├── fuse_convolution1d_activation.h │ │ │ ├── fuse_convolution_activation.cpp │ │ │ ├── fuse_convolution_activation.h │ │ │ ├── fuse_convolutiondepthwise1d_activation.cpp │ │ │ ├── fuse_convolutiondepthwise1d_activation.h │ │ │ ├── fuse_convolutiondepthwise_activation.cpp │ │ │ ├── fuse_convolutiondepthwise_activation.h │ │ │ ├── fuse_deconvolution_activation.cpp │ │ │ ├── fuse_deconvolution_activation.h │ │ │ ├── fuse_deconvolutiondepthwise_activation.cpp │ │ │ ├── fuse_deconvolutiondepthwise_activation.h │ │ │ ├── fuse_innerproduct_activation.cpp │ │ │ ├── fuse_innerproduct_activation.h │ │ │ ├── fuse_padding_convolution.cpp │ │ │ ├── fuse_padding_convolution.h │ │ │ ├── fuse_padding_convolutiondepthwise.cpp │ │ │ ├── fuse_padding_convolutiondepthwise.h │ │ │ ├── fuse_transpose_matmul.cpp │ │ │ ├── fuse_transpose_matmul.h │ │ │ ├── insert_reshape_global_pooling.cpp │ │ │ ├── insert_reshape_global_pooling.h │ │ │ ├── insert_reshape_linear.cpp │ │ │ ├── insert_reshape_linear.h │ │ │ ├── insert_reshape_numpy_binaryop_broadcast.cpp │ │ │ ├── insert_reshape_numpy_binaryop_broadcast.h │ │ │ ├── insert_reshape_pooling.cpp │ │ │ ├── insert_reshape_pooling.h │ │ │ ├── 
insert_split.cpp │ │ │ ├── insert_split.h │ │ │ ├── nn_AdaptiveAvgPool1d.cpp │ │ │ ├── nn_AdaptiveAvgPool2d.cpp │ │ │ ├── nn_AdaptiveAvgPool3d.cpp │ │ │ ├── nn_AdaptiveMaxPool1d.cpp │ │ │ ├── nn_AdaptiveMaxPool2d.cpp │ │ │ ├── nn_AdaptiveMaxPool3d.cpp │ │ │ ├── nn_AvgPool1d.cpp │ │ │ ├── nn_AvgPool2d.cpp │ │ │ ├── nn_AvgPool3d.cpp │ │ │ ├── nn_BatchNorm1d.cpp │ │ │ ├── nn_BatchNorm2d.cpp │ │ │ ├── nn_BatchNorm3d.cpp │ │ │ ├── nn_CELU.cpp │ │ │ ├── nn_ChannelShuffle.cpp │ │ │ ├── nn_ConstantPad1d.cpp │ │ │ ├── nn_ConstantPad2d.cpp │ │ │ ├── nn_ConstantPad3d.cpp │ │ │ ├── nn_Conv1d.cpp │ │ │ ├── nn_Conv2d.cpp │ │ │ ├── nn_Conv3d.cpp │ │ │ ├── nn_ConvTranspose1d.cpp │ │ │ ├── nn_ConvTranspose2d.cpp │ │ │ ├── nn_ConvTranspose3d.cpp │ │ │ ├── nn_ELU.cpp │ │ │ ├── nn_Embedding.cpp │ │ │ ├── nn_Fold.cpp │ │ │ ├── nn_GELU.cpp │ │ │ ├── nn_GLU.cpp │ │ │ ├── nn_GRU.cpp │ │ │ ├── nn_GroupNorm.cpp │ │ │ ├── nn_Hardshrink.cpp │ │ │ ├── nn_Hardsigmoid.cpp │ │ │ ├── nn_Hardswish.cpp │ │ │ ├── nn_Hardtanh.cpp │ │ │ ├── nn_InstanceNorm2d.cpp │ │ │ ├── nn_LSTM.cpp │ │ │ ├── nn_LayerNorm.cpp │ │ │ ├── nn_LeakyReLU.cpp │ │ │ ├── nn_Linear.cpp │ │ │ ├── nn_LocalResponseNorm.cpp │ │ │ ├── nn_LogSigmoid.cpp │ │ │ ├── nn_LogSoftmax.cpp │ │ │ ├── nn_MaxPool1d.cpp │ │ │ ├── nn_MaxPool2d.cpp │ │ │ ├── nn_MaxPool3d.cpp │ │ │ ├── nn_Mish.cpp │ │ │ ├── nn_MultiheadAttention.cpp │ │ │ ├── nn_PReLU.cpp │ │ │ ├── nn_PixelShuffle.cpp │ │ │ ├── nn_PixelUnshuffle.cpp │ │ │ ├── nn_RMSNorm.cpp │ │ │ ├── nn_RNN.cpp │ │ │ ├── nn_ReLU.cpp │ │ │ ├── nn_ReLU6.cpp │ │ │ ├── nn_ReflectionPad1d.cpp │ │ │ ├── nn_ReflectionPad2d.cpp │ │ │ ├── nn_ReplicationPad1d.cpp │ │ │ ├── nn_ReplicationPad2d.cpp │ │ │ ├── nn_ReplicationPad3d.cpp │ │ │ ├── nn_SELU.cpp │ │ │ ├── nn_SiLU.cpp │ │ │ ├── nn_Sigmoid.cpp │ │ │ ├── nn_Softmax.cpp │ │ │ ├── nn_Softmax2d.cpp │ │ │ ├── nn_Softplus.cpp │ │ │ ├── nn_Softshrink.cpp │ │ │ ├── nn_Tanh.cpp │ │ │ ├── nn_Unfold.cpp │ │ │ ├── nn_Upsample.cpp │ │ │ ├── nn_UpsamplingBilinear2d.cpp 
│ │ │ ├── nn_UpsamplingNearest2d.cpp │ │ │ ├── nn_ZeroPad2d.cpp │ │ │ ├── solve_batch_index.cpp │ │ │ ├── solve_batch_index.h │ │ │ ├── torch_addmm.cpp │ │ │ ├── torch_amax.cpp │ │ │ ├── torch_amin.cpp │ │ │ ├── torch_bmm.cpp │ │ │ ├── torch_clamp.cpp │ │ │ ├── torch_clone.cpp │ │ │ ├── torch_cumsum.cpp │ │ │ ├── torch_diag.cpp │ │ │ ├── torch_flatten.cpp │ │ │ ├── torch_flip.cpp │ │ │ ├── torch_istft.cpp │ │ │ ├── torch_logsumexp.cpp │ │ │ ├── torch_matmul.cpp │ │ │ ├── torch_max.cpp │ │ │ ├── torch_mean.cpp │ │ │ ├── torch_min.cpp │ │ │ ├── torch_mm.cpp │ │ │ ├── torch_norm.cpp │ │ │ ├── torch_prod.cpp │ │ │ ├── torch_roll.cpp │ │ │ ├── torch_slice_scatter.cpp │ │ │ ├── torch_squeeze.cpp │ │ │ ├── torch_stft.cpp │ │ │ ├── torch_sum.cpp │ │ │ ├── torch_t.cpp │ │ │ ├── torch_transpose.cpp │ │ │ ├── torch_unsqueeze.cpp │ │ │ ├── torchaudio_F_inverse_spectrogram.cpp │ │ │ ├── torchaudio_F_spectrogram.cpp │ │ │ └── torchvision_DeformConv2d.cpp │ │ ├── pass_ncnn.cpp │ │ ├── pass_ncnn.h │ │ ├── pass_onnx/ │ │ │ ├── canonicalize.cpp │ │ │ ├── canonicalize.h │ │ │ ├── dead_code_elimination.cpp │ │ │ ├── dead_code_elimination.h │ │ │ ├── eliminate_initializer_input.cpp │ │ │ ├── eliminate_initializer_input.h │ │ │ ├── eliminate_noop.cpp │ │ │ ├── eliminate_noop.h │ │ │ ├── fold_constants.cpp │ │ │ ├── fold_constants.h │ │ │ ├── fuse_constant_as_attribute.cpp │ │ │ ├── fuse_constant_as_attribute.h │ │ │ ├── inline_containers.cpp │ │ │ ├── inline_containers.h │ │ │ ├── inline_if_graph.cpp │ │ │ ├── inline_if_graph.h │ │ │ ├── model_stat.cpp │ │ │ ├── model_stat.h │ │ │ ├── shape_inference.cpp │ │ │ └── shape_inference.h │ │ ├── pass_onnx.cpp │ │ ├── pass_onnx.h │ │ ├── pass_tnn/ │ │ │ ├── fuse_shape_list_construct.cpp │ │ │ ├── fuse_shape_list_construct.h │ │ │ ├── fuse_shape_size.cpp │ │ │ ├── fuse_shape_size.h │ │ │ ├── lower_concat.cpp │ │ │ ├── lower_concat.h │ │ │ ├── lower_convolution_activation.cpp │ │ │ ├── lower_convolution_activation.h │ │ │ ├── lower_power.cpp │ │ 
│ └── lower_power.h │ │ ├── save_ncnn.cpp │ │ ├── save_ncnn.h │ │ ├── save_onnx.cpp │ │ ├── save_onnx.h │ │ ├── storezip.cpp │ │ ├── storezip.h │ │ ├── utils.cpp │ │ └── utils.h │ └── tests/ │ ├── CMakeLists.txt │ ├── ncnn/ │ │ ├── CMakeLists.txt │ │ ├── test_F_adaptive_avg_pool1d.py │ │ ├── test_F_adaptive_avg_pool2d.py │ │ ├── test_F_adaptive_avg_pool3d.py │ │ ├── test_F_adaptive_max_pool1d.py │ │ ├── test_F_adaptive_max_pool2d.py │ │ ├── test_F_adaptive_max_pool3d.py │ │ ├── test_F_alpha_dropout.py │ │ ├── test_F_avg_pool1d.py │ │ ├── test_F_avg_pool2d.py │ │ ├── test_F_avg_pool3d.py │ │ ├── test_F_batch_norm.py │ │ ├── test_F_celu.py │ │ ├── test_F_conv1d.py │ │ ├── test_F_conv2d.py │ │ ├── test_F_conv3d.py │ │ ├── test_F_conv_transpose1d.py │ │ ├── test_F_conv_transpose2d.py │ │ ├── test_F_conv_transpose3d.py │ │ ├── test_F_dropout.py │ │ ├── test_F_dropout2d.py │ │ ├── test_F_dropout3d.py │ │ ├── test_F_elu.py │ │ ├── test_F_embedding.py │ │ ├── test_F_feature_alpha_dropout.py │ │ ├── test_F_fold.py │ │ ├── test_F_gelu.py │ │ ├── test_F_glu.py │ │ ├── test_F_grid_sample.py │ │ ├── test_F_group_norm.py │ │ ├── test_F_hardshrink.py │ │ ├── test_F_hardsigmoid.py │ │ ├── test_F_hardswish.py │ │ ├── test_F_hardtanh.py │ │ ├── test_F_interpolate.py │ │ ├── test_F_layer_norm.py │ │ ├── test_F_leaky_relu.py │ │ ├── test_F_local_response_norm.py │ │ ├── test_F_log_softmax.py │ │ ├── test_F_logsigmoid.py │ │ ├── test_F_max_pool1d.py │ │ ├── test_F_max_pool2d.py │ │ ├── test_F_max_pool3d.py │ │ ├── test_F_mish.py │ │ ├── test_F_normalize.py │ │ ├── test_F_pad.py │ │ ├── test_F_pixel_shuffle.py │ │ ├── test_F_pixel_unshuffle.py │ │ ├── test_F_prelu.py │ │ ├── test_F_relu.py │ │ ├── test_F_relu6.py │ │ ├── test_F_rms_norm.py │ │ ├── test_F_scaled_dot_product_attention.py │ │ ├── test_F_selu.py │ │ ├── test_F_sigmoid.py │ │ ├── test_F_silu.py │ │ ├── test_F_softmax.py │ │ ├── test_F_softshrink.py │ │ ├── test_F_tanh.py │ │ ├── test_F_unfold.py │ │ ├── test_F_upsample.py │ 
│ ├── test_F_upsample_bilinear.py │ │ ├── test_F_upsample_nearest.py │ │ ├── test_Tensor_expand.py │ │ ├── test_Tensor_permute.py │ │ ├── test_Tensor_repeat.py │ │ ├── test_Tensor_reshape.py │ │ ├── test_Tensor_reshape_as.py │ │ ├── test_Tensor_slice.py │ │ ├── test_Tensor_slice_copy.py │ │ ├── test_Tensor_unflatten.py │ │ ├── test_Tensor_view.py │ │ ├── test_convnext_tiny.py │ │ ├── test_mobilenet_v2.py │ │ ├── test_mobilenet_v3_small.py │ │ ├── test_ncnn_fuse_binaryop_eltwise.py │ │ ├── test_ncnn_fuse_pad_conv.py │ │ ├── test_ncnn_fuse_shufflechannel_slice.py │ │ ├── test_ncnn_fuse_transpose_matmul.py │ │ ├── test_ncnn_interp_expr.py │ │ ├── test_ncnn_numpy_binaryop_broadcast.py │ │ ├── test_ncnn_reshape_expr.py │ │ ├── test_ncnn_slice_expr.py │ │ ├── test_ncnn_solve_batch_index.py │ │ ├── test_nn_AdaptiveAvgPool1d.py │ │ ├── test_nn_AdaptiveAvgPool2d.py │ │ ├── test_nn_AdaptiveAvgPool3d.py │ │ ├── test_nn_AdaptiveMaxPool1d.py │ │ ├── test_nn_AdaptiveMaxPool2d.py │ │ ├── test_nn_AdaptiveMaxPool3d.py │ │ ├── test_nn_AlphaDropout.py │ │ ├── test_nn_AvgPool1d.py │ │ ├── test_nn_AvgPool2d.py │ │ ├── test_nn_AvgPool3d.py │ │ ├── test_nn_BatchNorm1d.py │ │ ├── test_nn_BatchNorm2d.py │ │ ├── test_nn_BatchNorm3d.py │ │ ├── test_nn_CELU.py │ │ ├── test_nn_ChannelShuffle.py │ │ ├── test_nn_ConstantPad1d.py │ │ ├── test_nn_ConstantPad2d.py │ │ ├── test_nn_ConstantPad3d.py │ │ ├── test_nn_Conv1d.py │ │ ├── test_nn_Conv2d.py │ │ ├── test_nn_Conv3d.py │ │ ├── test_nn_ConvTranspose1d.py │ │ ├── test_nn_ConvTranspose2d.py │ │ ├── test_nn_ConvTranspose3d.py │ │ ├── test_nn_Dropout.py │ │ ├── test_nn_Dropout2d.py │ │ ├── test_nn_Dropout3d.py │ │ ├── test_nn_ELU.py │ │ ├── test_nn_Embedding.py │ │ ├── test_nn_Fold.py │ │ ├── test_nn_GELU.py │ │ ├── test_nn_GLU.py │ │ ├── test_nn_GRU.py │ │ ├── test_nn_GroupNorm.py │ │ ├── test_nn_Hardshrink.py │ │ ├── test_nn_Hardsigmoid.py │ │ ├── test_nn_Hardswish.py │ │ ├── test_nn_Hardtanh.py │ │ ├── test_nn_Identity.py │ │ ├── 
test_nn_InstanceNorm2d.py │ │ ├── test_nn_LSTM.py │ │ ├── test_nn_LayerNorm.py │ │ ├── test_nn_LeakyReLU.py │ │ ├── test_nn_Linear.py │ │ ├── test_nn_LocalResponseNorm.py │ │ ├── test_nn_LogSigmoid.py │ │ ├── test_nn_LogSoftmax.py │ │ ├── test_nn_MaxPool1d.py │ │ ├── test_nn_MaxPool2d.py │ │ ├── test_nn_MaxPool3d.py │ │ ├── test_nn_Mish.py │ │ ├── test_nn_MultiheadAttention.py │ │ ├── test_nn_PReLU.py │ │ ├── test_nn_PixelShuffle.py │ │ ├── test_nn_PixelUnshuffle.py │ │ ├── test_nn_RMSNorm.py │ │ ├── test_nn_RNN.py │ │ ├── test_nn_ReLU.py │ │ ├── test_nn_ReLU6.py │ │ ├── test_nn_ReflectionPad1d.py │ │ ├── test_nn_ReflectionPad2d.py │ │ ├── test_nn_ReplicationPad1d.py │ │ ├── test_nn_ReplicationPad2d.py │ │ ├── test_nn_ReplicationPad3d.py │ │ ├── test_nn_SELU.py │ │ ├── test_nn_SiLU.py │ │ ├── test_nn_Sigmoid.py │ │ ├── test_nn_Softmax.py │ │ ├── test_nn_Softmax2d.py │ │ ├── test_nn_Softshrink.py │ │ ├── test_nn_Tanh.py │ │ ├── test_nn_Unfold.py │ │ ├── test_nn_Upsample.py │ │ ├── test_nn_UpsamplingBilinear2d.py │ │ ├── test_nn_UpsamplingNearest2d.py │ │ ├── test_nn_ZeroPad2d.py │ │ ├── test_resnet18.py │ │ ├── test_shufflenet_v2_x1_0.py │ │ ├── test_squeezenet1_1.py │ │ ├── test_torch_abs.py │ │ ├── test_torch_acos.py │ │ ├── test_torch_addmm.py │ │ ├── test_torch_amax.py │ │ ├── test_torch_amin.py │ │ ├── test_torch_asin.py │ │ ├── test_torch_atan.py │ │ ├── test_torch_atan2.py │ │ ├── test_torch_bmm.py │ │ ├── test_torch_cat.py │ │ ├── test_torch_ceil.py │ │ ├── test_torch_chunk.py │ │ ├── test_torch_clamp.py │ │ ├── test_torch_clone.py │ │ ├── test_torch_cos.py │ │ ├── test_torch_cumsum.py │ │ ├── test_torch_diag.py │ │ ├── test_torch_einsum.py │ │ ├── test_torch_exp.py │ │ ├── test_torch_flatten.py │ │ ├── test_torch_flip.py │ │ ├── test_torch_floor.py │ │ ├── test_torch_istft.py │ │ ├── test_torch_log.py │ │ ├── test_torch_log10.py │ │ ├── test_torch_logsumexp.py │ │ ├── test_torch_matmul.py │ │ ├── test_torch_max.py │ │ ├── test_torch_maximum.py │ │ ├── 
test_torch_mean.py │ │ ├── test_torch_min.py │ │ ├── test_torch_minimum.py │ │ ├── test_torch_mm.py │ │ ├── test_torch_neg.py │ │ ├── test_torch_norm.py │ │ ├── test_torch_pow.py │ │ ├── test_torch_prod.py │ │ ├── test_torch_reciprocal.py │ │ ├── test_torch_roll.py │ │ ├── test_torch_round.py │ │ ├── test_torch_rsqrt.py │ │ ├── test_torch_sin.py │ │ ├── test_torch_slice_scatter.py │ │ ├── test_torch_sqrt.py │ │ ├── test_torch_square.py │ │ ├── test_torch_squeeze.py │ │ ├── test_torch_stack.py │ │ ├── test_torch_stft.py │ │ ├── test_torch_sum.py │ │ ├── test_torch_t.py │ │ ├── test_torch_tan.py │ │ ├── test_torch_tanh.py │ │ ├── test_torch_tensor_split.py │ │ ├── test_torch_transpose.py │ │ ├── test_torch_trunc.py │ │ ├── test_torch_unbind.py │ │ ├── test_torch_unsqueeze.py │ │ ├── test_torchaudio_F_inverse_spectrogram.py │ │ ├── test_torchaudio_F_spectrogram.py │ │ ├── test_torchaudio_InverseSpectrogram.py │ │ ├── test_torchaudio_Spectrogram.py │ │ ├── test_torchvision_DeformConv2d.py │ │ ├── test_transformers_deepseek_v3_attention.py │ │ ├── test_transformers_qwen2_attention.py │ │ ├── test_transformers_qwen3_attention.py │ │ └── test_vit_b_32.py │ ├── onnx/ │ │ ├── CMakeLists.txt │ │ ├── test_F_adaptive_avg_pool1d.py │ │ ├── test_F_adaptive_avg_pool2d.py │ │ ├── test_F_adaptive_avg_pool3d.py │ │ ├── test_F_adaptive_max_pool1d.py │ │ ├── test_F_adaptive_max_pool2d.py │ │ ├── test_F_adaptive_max_pool3d.py │ │ ├── test_F_avg_pool1d.py │ │ ├── test_F_avg_pool2d.py │ │ ├── test_F_avg_pool3d.py │ │ ├── test_F_batch_norm.py │ │ ├── test_F_celu.py │ │ ├── test_F_conv1d.py │ │ ├── test_F_conv2d.py │ │ ├── test_F_conv3d.py │ │ ├── test_F_conv_transpose1d.py │ │ ├── test_F_conv_transpose2d.py │ │ ├── test_F_conv_transpose3d.py │ │ ├── test_F_elu.py │ │ ├── test_F_gelu.py │ │ ├── test_F_group_norm.py │ │ ├── test_F_hardshrink.py │ │ ├── test_F_hardsigmoid.py │ │ ├── test_F_hardswish.py │ │ ├── test_F_hardtanh.py │ │ ├── test_F_interpolate.py │ │ ├── test_F_layer_norm.py │ │ 
├── test_F_leaky_relu.py │ │ ├── test_F_linear.py │ │ ├── test_F_local_response_norm.py │ │ ├── test_F_log_softmax.py │ │ ├── test_F_logsigmoid.py │ │ ├── test_F_max_pool1d.py │ │ ├── test_F_max_pool2d.py │ │ ├── test_F_max_pool3d.py │ │ ├── test_F_mish.py │ │ ├── test_F_normalize.py │ │ ├── test_F_pad.py │ │ ├── test_F_pixel_shuffle.py │ │ ├── test_F_pixel_unshuffle.py │ │ ├── test_F_prelu.py │ │ ├── test_F_relu.py │ │ ├── test_F_relu6.py │ │ ├── test_F_scaled_dot_product_attention.py │ │ ├── test_F_selu.py │ │ ├── test_F_sigmoid.py │ │ ├── test_F_silu.py │ │ ├── test_F_softmax.py │ │ ├── test_F_softmin.py │ │ ├── test_F_softplus.py │ │ ├── test_F_softshrink.py │ │ ├── test_F_softsign.py │ │ ├── test_F_tanh.py │ │ ├── test_F_tanhshrink.py │ │ ├── test_F_upsample.py │ │ ├── test_F_upsample_bilinear.py │ │ ├── test_F_upsample_nearest.py │ │ ├── test_Tensor_expand.py │ │ ├── test_Tensor_permute.py │ │ ├── test_Tensor_repeat.py │ │ ├── test_Tensor_reshape.py │ │ ├── test_Tensor_reshape_as.py │ │ ├── test_Tensor_select.py │ │ ├── test_Tensor_slice.py │ │ ├── test_Tensor_unflatten.py │ │ ├── test_Tensor_view.py │ │ ├── test_convnext_tiny.py │ │ ├── test_mobilenet_v2.py │ │ ├── test_mobilenet_v3_small.py │ │ ├── test_nn_AdaptiveAvgPool1d.py │ │ ├── test_nn_AdaptiveAvgPool2d.py │ │ ├── test_nn_AdaptiveAvgPool3d.py │ │ ├── test_nn_AdaptiveMaxPool1d.py │ │ ├── test_nn_AdaptiveMaxPool2d.py │ │ ├── test_nn_AdaptiveMaxPool3d.py │ │ ├── test_nn_AvgPool1d.py │ │ ├── test_nn_AvgPool2d.py │ │ ├── test_nn_AvgPool3d.py │ │ ├── test_nn_BatchNorm1d.py │ │ ├── test_nn_BatchNorm2d.py │ │ ├── test_nn_BatchNorm3d.py │ │ ├── test_nn_CELU.py │ │ ├── test_nn_ConstantPad1d.py │ │ ├── test_nn_ConstantPad2d.py │ │ ├── test_nn_ConstantPad3d.py │ │ ├── test_nn_Conv1d.py │ │ ├── test_nn_Conv2d.py │ │ ├── test_nn_Conv3d.py │ │ ├── test_nn_ConvTranspose1d.py │ │ ├── test_nn_ConvTranspose2d.py │ │ ├── test_nn_ConvTranspose3d.py │ │ ├── test_nn_ELU.py │ │ ├── test_nn_GELU.py │ │ ├── test_nn_GRU.py │ │ 
├── test_nn_GroupNorm.py │ │ ├── test_nn_Hardshrink.py │ │ ├── test_nn_Hardsigmoid.py │ │ ├── test_nn_Hardswish.py │ │ ├── test_nn_Hardtanh.py │ │ ├── test_nn_InstanceNorm1d.py │ │ ├── test_nn_InstanceNorm2d.py │ │ ├── test_nn_InstanceNorm3d.py │ │ ├── test_nn_LSTM.py │ │ ├── test_nn_LayerNorm.py │ │ ├── test_nn_LeakyReLU.py │ │ ├── test_nn_Linear.py │ │ ├── test_nn_LocalResponseNorm.py │ │ ├── test_nn_LogSigmoid.py │ │ ├── test_nn_LogSoftmax.py │ │ ├── test_nn_MaxPool1d.py │ │ ├── test_nn_MaxPool2d.py │ │ ├── test_nn_MaxPool3d.py │ │ ├── test_nn_Mish.py │ │ ├── test_nn_MultiheadAttention.py │ │ ├── test_nn_PReLU.py │ │ ├── test_nn_PixelShuffle.py │ │ ├── test_nn_PixelUnshuffle.py │ │ ├── test_nn_RNN.py │ │ ├── test_nn_ReLU.py │ │ ├── test_nn_ReLU6.py │ │ ├── test_nn_ReflectionPad1d.py │ │ ├── test_nn_ReflectionPad2d.py │ │ ├── test_nn_ReplicationPad1d.py │ │ ├── test_nn_ReplicationPad2d.py │ │ ├── test_nn_ReplicationPad3d.py │ │ ├── test_nn_SELU.py │ │ ├── test_nn_SiLU.py │ │ ├── test_nn_Sigmoid.py │ │ ├── test_nn_Softmax.py │ │ ├── test_nn_Softmin.py │ │ ├── test_nn_Softplus.py │ │ ├── test_nn_Softshrink.py │ │ ├── test_nn_Softsign.py │ │ ├── test_nn_Tanh.py │ │ ├── test_nn_Tanhshrink.py │ │ ├── test_nn_Upsample.py │ │ ├── test_nn_UpsamplingBilinear2d.py │ │ ├── test_nn_UpsamplingNearest2d.py │ │ ├── test_nn_ZeroPad2d.py │ │ ├── test_onnx_activation_ops.py │ │ ├── test_onnx_conv_ops.py │ │ ├── test_onnx_dense_ops.py │ │ ├── test_onnx_fuse_channel_shuffle.py │ │ ├── test_onnx_fuse_pixel_shuffle.py │ │ ├── test_onnx_fuse_pixel_unshuffle.py │ │ ├── test_onnx_layout_ops.py │ │ ├── test_onnx_math_ops.py │ │ ├── test_onnx_normalize_ops.py │ │ ├── test_onnx_opset21_ops.py │ │ ├── test_onnx_pool_ops.py │ │ ├── test_onnx_reduce_ops.py │ │ ├── test_onnx_rnn_ops.py │ │ ├── test_resnet18.py │ │ ├── test_shufflenet_v2_x1_0.py │ │ ├── test_squeezenet1_1.py │ │ ├── test_swin_t.py │ │ ├── test_torch_cat.py │ │ ├── test_torch_ceil.py │ │ ├── test_torch_chunk.py │ │ ├── 
test_torch_clamp.py │ │ ├── test_torch_flatten.py │ │ ├── test_torch_flip.py │ │ ├── test_torch_floor.py │ │ ├── test_torch_logical_and.py │ │ ├── test_torch_logical_not.py │ │ ├── test_torch_logical_or.py │ │ ├── test_torch_logical_xor.py │ │ ├── test_torch_max.py │ │ ├── test_torch_maximum.py │ │ ├── test_torch_mean.py │ │ ├── test_torch_min.py │ │ ├── test_torch_minimum.py │ │ ├── test_torch_norm.py │ │ ├── test_torch_prod.py │ │ ├── test_torch_roll.py │ │ ├── test_torch_split.py │ │ ├── test_torch_squeeze.py │ │ ├── test_torch_stack.py │ │ ├── test_torch_sum.py │ │ ├── test_torch_transpose.py │ │ ├── test_torch_unbind.py │ │ ├── test_torch_unsqueeze.py │ │ ├── test_transformers_albert_attention.py │ │ ├── test_transformers_bart_attention.py │ │ ├── test_transformers_bert_attention.py │ │ ├── test_transformers_bert_generation_attention.py │ │ ├── test_transformers_blenderbot_attention.py │ │ ├── test_transformers_camembert_attention.py │ │ ├── test_transformers_chinese_clip_attention.py │ │ ├── test_transformers_clip_attention.py │ │ ├── test_transformers_ctrl_attention.py │ │ ├── test_transformers_deberta_attention.py │ │ ├── test_transformers_distilbert_attention.py │ │ ├── test_transformers_electra_attention.py │ │ ├── test_transformers_flaubert_attention.py │ │ ├── test_transformers_fsmt_attention.py │ │ ├── test_transformers_funnel_attention.py │ │ ├── test_transformers_gpt2_attention.py │ │ ├── test_transformers_layoutlm_attention.py │ │ ├── test_transformers_lxmert_attention.py │ │ ├── test_transformers_m2m_100_attention.py │ │ ├── test_transformers_marian_attention.py │ │ ├── test_transformers_mbart_attention.py │ │ ├── test_transformers_mobilebert_attention.py │ │ ├── test_transformers_mt5_attention.py │ │ ├── test_transformers_openai_attention.py │ │ ├── test_transformers_pegasus_attention.py │ │ ├── test_transformers_prophetnet_attention.py │ │ ├── test_transformers_reformer_attention.py │ │ ├── test_transformers_roberta_attention.py │ │ ├── 
test_transformers_squeezebert_attention.py │ │ ├── test_transformers_t5_attention.py │ │ ├── test_transformers_xlm_attention.py │ │ ├── test_transformers_xlm_roberta_attention.py │ │ └── test_vit_b_32.py │ ├── run_test.cmake │ ├── test_F_adaptive_avg_pool1d.py │ ├── test_F_adaptive_avg_pool2d.py │ ├── test_F_adaptive_avg_pool3d.py │ ├── test_F_adaptive_max_pool1d.py │ ├── test_F_adaptive_max_pool2d.py │ ├── test_F_adaptive_max_pool3d.py │ ├── test_F_affine_grid.py │ ├── test_F_alpha_dropout.py │ ├── test_F_avg_pool1d.py │ ├── test_F_avg_pool2d.py │ ├── test_F_avg_pool3d.py │ ├── test_F_batch_norm.py │ ├── test_F_celu.py │ ├── test_F_conv1d.py │ ├── test_F_conv2d.py │ ├── test_F_conv3d.py │ ├── test_F_conv_transpose1d.py │ ├── test_F_conv_transpose2d.py │ ├── test_F_conv_transpose3d.py │ ├── test_F_dropout.py │ ├── test_F_dropout2d.py │ ├── test_F_dropout3d.py │ ├── test_F_elu.py │ ├── test_F_embedding.py │ ├── test_F_feature_alpha_dropout.py │ ├── test_F_fold.py │ ├── test_F_gelu.py │ ├── test_F_glu.py │ ├── test_F_grid_sample.py │ ├── test_F_group_norm.py │ ├── test_F_hardshrink.py │ ├── test_F_hardsigmoid.py │ ├── test_F_hardswish.py │ ├── test_F_hardtanh.py │ ├── test_F_instance_norm.py │ ├── test_F_interpolate.py │ ├── test_F_layer_norm.py │ ├── test_F_leaky_relu.py │ ├── test_F_linear.py │ ├── test_F_local_response_norm.py │ ├── test_F_log_softmax.py │ ├── test_F_logsigmoid.py │ ├── test_F_lp_pool1d.py │ ├── test_F_lp_pool2d.py │ ├── test_F_max_pool1d.py │ ├── test_F_max_pool2d.py │ ├── test_F_max_pool3d.py │ ├── test_F_mish.py │ ├── test_F_normalize.py │ ├── test_F_pad.py │ ├── test_F_pairwise_distance.py │ ├── test_F_pixel_shuffle.py │ ├── test_F_pixel_unshuffle.py │ ├── test_F_prelu.py │ ├── test_F_relu.py │ ├── test_F_relu6.py │ ├── test_F_rms_norm.py │ ├── test_F_rrelu.py │ ├── test_F_scaled_dot_product_attention.py │ ├── test_F_selu.py │ ├── test_F_sigmoid.py │ ├── test_F_silu.py │ ├── test_F_softmax.py │ ├── test_F_softmin.py │ ├── test_F_softplus.py │ 
├── test_F_softshrink.py │ ├── test_F_softsign.py │ ├── test_F_tanh.py │ ├── test_F_tanhshrink.py │ ├── test_F_threshold.py │ ├── test_F_unfold.py │ ├── test_F_upsample.py │ ├── test_F_upsample_bilinear.py │ ├── test_F_upsample_nearest.py │ ├── test_Tensor_expand.py │ ├── test_Tensor_fill.py │ ├── test_Tensor_index.py │ ├── test_Tensor_index_put.py │ ├── test_Tensor_masked_fill.py │ ├── test_Tensor_new_empty.py │ ├── test_Tensor_new_full.py │ ├── test_Tensor_new_ones.py │ ├── test_Tensor_new_zeros.py │ ├── test_Tensor_permute.py │ ├── test_Tensor_repeat.py │ ├── test_Tensor_reshape.py │ ├── test_Tensor_reshape_as.py │ ├── test_Tensor_select.py │ ├── test_Tensor_slice.py │ ├── test_Tensor_slice_copy.py │ ├── test_Tensor_to.py │ ├── test_Tensor_type_as.py │ ├── test_Tensor_unflatten.py │ ├── test_Tensor_view.py │ ├── test_convnext_tiny.py │ ├── test_ir_complex.py │ ├── test_mobilenet_v2.py │ ├── test_mobilenet_v3_small.py │ ├── test_nn_AdaptiveAvgPool1d.py │ ├── test_nn_AdaptiveAvgPool2d.py │ ├── test_nn_AdaptiveAvgPool3d.py │ ├── test_nn_AdaptiveMaxPool1d.py │ ├── test_nn_AdaptiveMaxPool2d.py │ ├── test_nn_AdaptiveMaxPool3d.py │ ├── test_nn_AlphaDropout.py │ ├── test_nn_AvgPool1d.py │ ├── test_nn_AvgPool2d.py │ ├── test_nn_AvgPool3d.py │ ├── test_nn_BatchNorm1d.py │ ├── test_nn_BatchNorm2d.py │ ├── test_nn_BatchNorm3d.py │ ├── test_nn_CELU.py │ ├── test_nn_ChannelShuffle.py │ ├── test_nn_ConstantPad1d.py │ ├── test_nn_ConstantPad2d.py │ ├── test_nn_ConstantPad3d.py │ ├── test_nn_Conv1d.py │ ├── test_nn_Conv2d.py │ ├── test_nn_Conv3d.py │ ├── test_nn_ConvTranspose1d.py │ ├── test_nn_ConvTranspose2d.py │ ├── test_nn_ConvTranspose3d.py │ ├── test_nn_Dropout.py │ ├── test_nn_Dropout2d.py │ ├── test_nn_Dropout3d.py │ ├── test_nn_ELU.py │ ├── test_nn_Embedding.py │ ├── test_nn_Fold.py │ ├── test_nn_GELU.py │ ├── test_nn_GLU.py │ ├── test_nn_GRU.py │ ├── test_nn_GroupNorm.py │ ├── test_nn_Hardshrink.py │ ├── test_nn_Hardsigmoid.py │ ├── test_nn_Hardswish.py │ ├── 
test_nn_Hardtanh.py │ ├── test_nn_Identity.py │ ├── test_nn_InstanceNorm1d.py │ ├── test_nn_InstanceNorm2d.py │ ├── test_nn_InstanceNorm3d.py │ ├── test_nn_LPPool1d.py │ ├── test_nn_LPPool2d.py │ ├── test_nn_LSTM.py │ ├── test_nn_LayerNorm.py │ ├── test_nn_LeakyReLU.py │ ├── test_nn_Linear.py │ ├── test_nn_LocalResponseNorm.py │ ├── test_nn_LogSigmoid.py │ ├── test_nn_LogSoftmax.py │ ├── test_nn_MaxPool1d.py │ ├── test_nn_MaxPool2d.py │ ├── test_nn_MaxPool3d.py │ ├── test_nn_Mish.py │ ├── test_nn_MultiheadAttention.py │ ├── test_nn_PReLU.py │ ├── test_nn_PixelShuffle.py │ ├── test_nn_PixelUnshuffle.py │ ├── test_nn_RMSNorm.py │ ├── test_nn_RNN.py │ ├── test_nn_RReLU.py │ ├── test_nn_ReLU.py │ ├── test_nn_ReLU6.py │ ├── test_nn_ReflectionPad1d.py │ ├── test_nn_ReflectionPad2d.py │ ├── test_nn_ReplicationPad1d.py │ ├── test_nn_ReplicationPad2d.py │ ├── test_nn_ReplicationPad3d.py │ ├── test_nn_SELU.py │ ├── test_nn_SiLU.py │ ├── test_nn_Sigmoid.py │ ├── test_nn_Softmax.py │ ├── test_nn_Softmax2d.py │ ├── test_nn_Softmin.py │ ├── test_nn_Softplus.py │ ├── test_nn_Softshrink.py │ ├── test_nn_Softsign.py │ ├── test_nn_Tanh.py │ ├── test_nn_Tanhshrink.py │ ├── test_nn_Threshold.py │ ├── test_nn_Unfold.py │ ├── test_nn_Upsample.py │ ├── test_nn_UpsamplingBilinear2d.py │ ├── test_nn_UpsamplingNearest2d.py │ ├── test_nn_ZeroPad2d.py │ ├── test_pnnx_eliminate_noop_cat.py │ ├── test_pnnx_eliminate_noop_expand.py │ ├── test_pnnx_eliminate_noop_math.py │ ├── test_pnnx_eliminate_noop_upsample.py │ ├── test_pnnx_expression.py │ ├── test_pnnx_fold_constant.py │ ├── test_pnnx_fuse_adjacent_permute.py │ ├── test_pnnx_fuse_adjacent_reshape.py │ ├── test_pnnx_fuse_channel_shuffle.py │ ├── test_pnnx_fuse_conv1d_batchnorm1d.py │ ├── test_pnnx_fuse_conv2d_batchnorm2d.py │ ├── test_pnnx_fuse_conv3d_batchnorm3d.py │ ├── test_pnnx_fuse_convtranspose1d_batchnorm1d.py │ ├── test_pnnx_fuse_convtranspose2d_batchnorm2d.py │ ├── test_pnnx_fuse_convtranspose3d_batchnorm3d.py │ ├── 
test_pnnx_fuse_input_unpack.py │ ├── test_pnnx_fuse_layernorm.py │ ├── test_pnnx_fuse_linear_batchnorm1d.py │ ├── test_pnnx_fuse_multiheadattention.py │ ├── test_pnnx_fuse_pad_conv1d.py │ ├── test_pnnx_fuse_pad_conv2d.py │ ├── test_pnnx_fuse_pixel_shuffle.py │ ├── test_pnnx_fuse_pixel_unshuffle.py │ ├── test_pnnx_fuse_rmsnorm.py │ ├── test_pnnx_fuse_scaled_dot_product_attention.py │ ├── test_pnnx_fuse_select_to_unbind.py │ ├── test_pnnx_fuse_slice_to_tensor_split.py │ ├── test_quantization_shufflenet_v2_x1_0.py │ ├── test_resnet18.py │ ├── test_shufflenet_v2_x1_0.py │ ├── test_squeezenet1_1.py │ ├── test_swin_t.py │ ├── test_torch_abs.py │ ├── test_torch_acos.py │ ├── test_torch_acosh.py │ ├── test_torch_addmm.py │ ├── test_torch_amax.py │ ├── test_torch_amin.py │ ├── test_torch_arange.py │ ├── test_torch_argmax.py │ ├── test_torch_argmin.py │ ├── test_torch_asin.py │ ├── test_torch_asinh.py │ ├── test_torch_atan.py │ ├── test_torch_atan2.py │ ├── test_torch_atanh.py │ ├── test_torch_bitwise_and.py │ ├── test_torch_bitwise_left_shift.py │ ├── test_torch_bitwise_not.py │ ├── test_torch_bitwise_or.py │ ├── test_torch_bitwise_right_shift.py │ ├── test_torch_bitwise_xor.py │ ├── test_torch_bmm.py │ ├── test_torch_cat.py │ ├── test_torch_ceil.py │ ├── test_torch_chunk.py │ ├── test_torch_clamp.py │ ├── test_torch_clone.py │ ├── test_torch_complex.py │ ├── test_torch_cos.py │ ├── test_torch_cosh.py │ ├── test_torch_cross.py │ ├── test_torch_cumprod.py │ ├── test_torch_cumsum.py │ ├── test_torch_diag.py │ ├── test_torch_einsum.py │ ├── test_torch_eq.py │ ├── test_torch_exp.py │ ├── test_torch_fft_fft.py │ ├── test_torch_fft_fft2.py │ ├── test_torch_fft_fftn.py │ ├── test_torch_fft_hfft.py │ ├── test_torch_fft_hfft2.py │ ├── test_torch_fft_hfftn.py │ ├── test_torch_fft_ifft.py │ ├── test_torch_fft_ifft2.py │ ├── test_torch_fft_ifftn.py │ ├── test_torch_fft_ihfft.py │ ├── test_torch_fft_ihfft2.py │ ├── test_torch_fft_ihfftn.py │ ├── test_torch_fft_irfft.py │ ├── 
test_torch_fft_irfft2.py │ ├── test_torch_fft_irfftn.py │ ├── test_torch_fft_rfft.py │ ├── test_torch_fft_rfft2.py │ ├── test_torch_fft_rfftn.py │ ├── test_torch_flatten.py │ ├── test_torch_flip.py │ ├── test_torch_floor.py │ ├── test_torch_full.py │ ├── test_torch_full_like.py │ ├── test_torch_gather.py │ ├── test_torch_ge.py │ ├── test_torch_gt.py │ ├── test_torch_imag.py │ ├── test_torch_index_select.py │ ├── test_torch_istft.py │ ├── test_torch_le.py │ ├── test_torch_lgamma.py │ ├── test_torch_log.py │ ├── test_torch_log10.py │ ├── test_torch_logaddexp.py │ ├── test_torch_logical_and.py │ ├── test_torch_logical_not.py │ ├── test_torch_logical_or.py │ ├── test_torch_logical_xor.py │ ├── test_torch_logsumexp.py │ ├── test_torch_lt.py │ ├── test_torch_masked_select.py │ ├── test_torch_matmul.py │ ├── test_torch_max.py │ ├── test_torch_maximum.py │ ├── test_torch_mean.py │ ├── test_torch_min.py │ ├── test_torch_minimum.py │ ├── test_torch_mm.py │ ├── test_torch_mv.py │ ├── test_torch_narrow.py │ ├── test_torch_ne.py │ ├── test_torch_neg.py │ ├── test_torch_norm.py │ ├── test_torch_ones.py │ ├── test_torch_ones_like.py │ ├── test_torch_positive.py │ ├── test_torch_pow.py │ ├── test_torch_prod.py │ ├── test_torch_real.py │ ├── test_torch_reciprocal.py │ ├── test_torch_repeat_interleave.py │ ├── test_torch_roll.py │ ├── test_torch_round.py │ ├── test_torch_rsqrt.py │ ├── test_torch_scatter_add.py │ ├── test_torch_sign.py │ ├── test_torch_sin.py │ ├── test_torch_sinh.py │ ├── test_torch_slice_scatter.py │ ├── test_torch_split.py │ ├── test_torch_sqrt.py │ ├── test_torch_square.py │ ├── test_torch_squeeze.py │ ├── test_torch_stack.py │ ├── test_torch_std.py │ ├── test_torch_stft.py │ ├── test_torch_sum.py │ ├── test_torch_t.py │ ├── test_torch_tan.py │ ├── test_torch_tanh.py │ ├── test_torch_tensor_split.py │ ├── test_torch_tile.py │ ├── test_torch_topk.py │ ├── test_torch_transpose.py │ ├── test_torch_trunc.py │ ├── test_torch_unbind.py │ ├── test_torch_unsqueeze.py │ 
├── test_torch_view_as_complex.py │ ├── test_torch_view_as_real.py │ ├── test_torch_where.py │ ├── test_torch_zeros.py │ ├── test_torch_zeros_like.py │ ├── test_torchaudio_F_inverse_spectrogram.py │ ├── test_torchaudio_F_spectrogram.py │ ├── test_torchaudio_InverseSpectrogram.py │ ├── test_torchaudio_Spectrogram.py │ ├── test_torchvision_DeformConv2d.py │ ├── test_torchvision_RoIAlign.py │ ├── test_transformers_albert_attention.py │ ├── test_transformers_bart_attention.py │ ├── test_transformers_bert_attention.py │ ├── test_transformers_bert_generation_attention.py │ ├── test_transformers_blenderbot_attention.py │ ├── test_transformers_camembert_attention.py │ ├── test_transformers_chinese_clip_attention.py │ ├── test_transformers_clip_attention.py │ ├── test_transformers_ctrl_attention.py │ ├── test_transformers_deberta_attention.py │ ├── test_transformers_deepseek_v3_attention.py │ ├── test_transformers_distilbert_attention.py │ ├── test_transformers_electra_attention.py │ ├── test_transformers_flaubert_attention.py │ ├── test_transformers_fsmt_attention.py │ ├── test_transformers_funnel_attention.py │ ├── test_transformers_gpt2_attention.py │ ├── test_transformers_layoutlm_attention.py │ ├── test_transformers_longformer_attention.py │ ├── test_transformers_lxmert_attention.py │ ├── test_transformers_m2m_100_attention.py │ ├── test_transformers_marian_attention.py │ ├── test_transformers_mbart_attention.py │ ├── test_transformers_mobilebert_attention.py │ ├── test_transformers_mt5_attention.py │ ├── test_transformers_openai_attention.py │ ├── test_transformers_pegasus_attention.py │ ├── test_transformers_prophetnet_attention.py │ ├── test_transformers_qwen2_attention.py │ ├── test_transformers_qwen3_attention.py │ ├── test_transformers_reformer_attention.py │ ├── test_transformers_roberta_attention.py │ ├── test_transformers_squeezebert_attention.py │ ├── test_transformers_t5_attention.py │ ├── test_transformers_xlm_attention.py │ ├── 
test_transformers_xlm_roberta_attention.py │ ├── test_transformers_xlnet_attention.py │ └── test_vit_b_32.py ├── pytorch/ │ └── README.md ├── quantize/ │ ├── CMakeLists.txt │ ├── README.md │ ├── imreadwrite.cpp │ ├── imreadwrite.h │ ├── ncnn2int8.cpp │ ├── ncnn2table.cpp │ └── npy.hpp └── tensorflow/ └── readme.txt ================================================ FILE CONTENTS ================================================ ================================================ FILE: .astylerc ================================================ # astyle -n -r "benchmark/*.h,*.cpp" "src/*.h,*.cpp" "tests/*.h,*.cpp" "tools/*.h,*.cpp" "examples/*.h,*.cpp" # brace style --style=allman # tab --attach-namespaces --attach-extern-c --attach-closing-while # indentation --indent-preproc-define --indent-col1-comments --min-conditional-indent=0 --max-continuation-indent=120 # padding --pad-oper --pad-comma --pad-header --align-pointer=type --align-reference=type # formatting --break-closing-braces --attach-return-type --attach-return-type-decl --keep-one-line-blocks --keep-one-line-statements --convert-tabs --max-code-length=200 --mode=c # other --lineend=linux ================================================ FILE: .clang-format ================================================ # find src/ tools/ tests/ examples/ benchmark/ -type f -name '*.c' -o -name '*.cpp' -o -name '*.h' | xargs -i clang-format -i {} # need clang-format >= 10.0 AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false # AlignConsecutiveBitFields: true AlignConsecutiveDeclarations: false AlignConsecutiveMacros: true AlignEscapedNewlines: Left # AlignOperands: AlignAfterOperator AlignTrailingComments: true AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: Always AllowShortCaseLabelsOnASingleLine: true # AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: None 
AllowShortIfStatementsOnASingleLine: WithoutElse AllowShortLambdasOnASingleLine: All AllowShortLoopsOnASingleLine: true AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackArguments: true BinPackParameters: true BraceWrapping: AfterCaseLabel: true AfterClass: true AfterControlStatement: Always AfterEnum: true AfterFunction: true AfterNamespace: false AfterObjCDeclaration: false AfterStruct: true AfterUnion: true AfterExternBlock: false BeforeCatch: true BeforeElse: true # BeforeLambdaBody: false # BeforeWhile: false IndentBraces: false SplitEmptyFunction: true SplitEmptyRecord: true SplitEmptyNamespace: false BreakAfterJavaFieldAnnotations: true BreakBeforeBinaryOperators: All BreakBeforeBraces: Custom BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon BreakInheritanceList: BeforeColon BreakStringLiterals: false ColumnLimit: 0 # CommentPragmas: CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DeriveLineEnding: false DerivePointerAlignment: false # DisableFormat: # ExperimentalAutoDetectBinPacking: FixNamespaceComments: true # ForEachMacros: IncludeBlocks: Regroup # IncludeCategories: # IncludeIsMainRegex: # IncludeIsMainSourceRegex: # IndentCaseBlocks: false IndentCaseLabels: false # IndentExternBlock: NoIndent IndentGotoLabels: false IndentPPDirectives: None IndentWidth: 4 # IndentWrappedFunctionNames: 4 # InsertTrailingCommas: None # JavaImportGroups: # JavaScriptQuotes # JavaScriptWrapImports: KeepEmptyLinesAtTheStartOfBlocks: false Language: Cpp # MacroBlockBegin: # MacroBlockEnd: MaxEmptyLinesToKeep: 1 NamespaceIndentation: None # NamespaceMacros: # ObjCBinPackProtocolList: # ObjCBlockIndentWidth: # ObjCBreakBeforeNestedBlockParam: # ObjCSpaceAfterProperty: # ObjCSpaceBeforeProtocolList: # PenaltyBreakAssignment: # PenaltyBreakBeforeFirstCallParameter: 
# PenaltyBreakComment: # PenaltyBreakFirstLessLess: # PenaltyBreakString: # PenaltyBreakTemplateDeclaration: # PenaltyExcessCharacter: # PenaltyReturnTypeOnItsOwnLine: PointerAlignment: Left # RawStringFormats: ReflowComments: false SortIncludes: false SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false SpaceBeforeAssignmentOperators: true SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInCStyleCastParentheses: false SpacesInConditionalStatement: false SpacesInContainerLiterals: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: c++03 #StatementMacros: TabWidth: 4 # TypenameMacros: UseCRLF: false UseTab: Never ================================================ FILE: .gitattributes ================================================ *.comp linguist-language=GLSL ================================================ FILE: .github/ISSUE_TEMPLATE/bug.md ================================================ --- name: "\U0001F41B bug issue" about: submit a bug report +_+ --- ## error log | 日志或报错信息 | ログ ## context | 编译/运行环境 | バックグラウンド ## how to reproduce | 复现步骤 | 再現方法 1. 2. 3. ## more | 其他 | その他 ================================================ FILE: .github/ISSUE_TEMPLATE/model-convert.md ================================================ --- name: "\U0001F6B8 model convert issue" about: "Life is Short, Use pnnx and convertmodel.com" --- ## error log | 日志或报错信息 | ログ ## model | 模型 | モデル 1. original model ## how to reproduce | 复现步骤 | 再現方法 1. 2. 3. 
================================================ FILE: .github/ISSUE_TEMPLATE/others.md ================================================ --- name: "\U0001F4DD others" about: discussion, suggestion and question --- ## detail | 详细描述 | 詳細な説明 ================================================ FILE: .github/ISSUE_TEMPLATE/quantization.md ================================================ --- name: "\U0001F4C8 quantization" about: best wishes that your low bit quantization has a low accuracy loss...\(^▽^)/...2333... --- ## expectation | 诉求 | 期待する 1. speed 2. precision ## model | 模型 | モデル 1. model.param and model.bin ## detail | 详细描述 | 詳細な説明 ================================================ FILE: .github/dependabot.yml ================================================ version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "daily" ================================================ FILE: .github/labeler.yml ================================================ cmake: - changed-files: - any-glob-to-any-file: ['cmake/**', 'toolchains/**'] doc: - changed-files: - any-glob-to-any-file: docs/** python: - changed-files: - any-glob-to-any-file: python/** example: - changed-files: - any-glob-to-any-file: examples/** test: - changed-files: - any-glob-to-any-file: tests/** tool: - changed-files: - any-glob-to-any-file: tools/** pnnx: - changed-files: - any-glob-to-any-file: tools/pnnx/** core: - changed-files: - any-glob-to-any-file: src/* layer: - changed-files: - any-glob-to-any-file: src/layer/* arm: - changed-files: - any-glob-to-any-file: src/layer/arm/** loongarch: - changed-files: - any-glob-to-any-file: src/layer/loongarch/** mips: - changed-files: - any-glob-to-any-file: src/layer/mips/** riscv: - changed-files: - any-glob-to-any-file: src/layer/riscv/** vulkan: - changed-files: - any-glob-to-any-file: src/layer/vulkan/** x86: - changed-files: - any-glob-to-any-file: src/layer/x86/** ================================================ FILE: 
.github/workflows/android.yml ================================================ name: android on: push: branches: [master] paths: - '.github/workflows/android.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/riscv/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/android.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/riscv/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'glslang' concurrency: group: android-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: build: runs-on: ubuntu-latest env: NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake \ -DANDROID_PLATFORM=android-21 \ -DANDROID_SUPPORT_FLEXIBLE_PAGE_SIZES=ON \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DNCNN_VULKAN=ON \ steps: - uses: actions/checkout@v6 with: submodules: true - name: armeabi-v7a run: | mkdir build-armeabi-v7a && cd build-armeabi-v7a cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON .. cmake --build . -j $(nproc) - name: arm64-v8a run: | mkdir build-arm64-v8a && cd build-arm64-v8a cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" .. cmake --build . -j $(nproc) - name: x86 run: | mkdir build-x86 && cd build-x86 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86" .. cmake --build . -j $(nproc) - name: x86_64 run: | mkdir build-x86_64 && cd build-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" .. cmake --build . -j $(nproc) - name: riscv64 run: | mkdir build-riscv64 && cd build-riscv64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" .. cmake --build . 
-j $(nproc) - name: armeabi-v7a-shared run: | mkdir build-armeabi-v7a-shared && cd build-armeabi-v7a-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) - name: arm64-v8a-shared run: | mkdir build-arm64-v8a-shared && cd build-arm64-v8a-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) - name: x86-shared run: | mkdir build-x86-shared && cd build-x86-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86" -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) - name: x86_64-shared run: | mkdir build-x86_64-shared && cd build-x86_64-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) - name: riscv64-shared run: | mkdir build-riscv64-shared && cd build-riscv64-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) ndk-r16b: runs-on: ubuntu-latest env: NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake \ -DANDROID_PLATFORM=android-21 \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DNCNN_VULKAN=ON \ steps: - uses: actions/checkout@v6 with: submodules: true - name: ndk-r16b env: DEBIAN_FRONTEND: noninteractive run: | pushd /usr/lib/x86_64-linux-gnu/ sudo ln -s libncurses.so.6 libncurses.so.5 sudo ln -s libtinfo.so.6 libtinfo.so.5 popd wget -q https://dl.google.com/android/repository/android-ndk-r16b-linux-x86_64.zip -O $GITHUB_WORKSPACE/android-ndk-r16b-linux-x86_64.zip cd $GITHUB_WORKSPACE && unzip -q android-ndk-r16b-linux-x86_64.zip - name: armeabi-v7a run: | mkdir build-armeabi-v7a && cd build-armeabi-v7a cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON .. cmake --build . 
-j $(nproc) - name: armeabi-v7a-no-neon run: | mkdir build-armeabi-v7a-no-neon && cd build-armeabi-v7a-no-neon cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF .. cmake --build . -j $(nproc) - name: arm64-v8a run: | mkdir build-arm64-v8a && cd build-arm64-v8a cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" .. cmake --build . -j $(nproc) - name: armeabi-v7a-shared run: | mkdir build-armeabi-v7a-shared && cd build-armeabi-v7a-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) - name: armeabi-v7a-no-neon-shared run: | mkdir build-armeabi-v7a-no-neon-shared && cd build-armeabi-v7a-no-neon-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) - name: arm64-v8a-shared run: | mkdir build-arm64-v8a-shared && cd build-arm64-v8a-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" -DNCNN_SHARED_LIB=ON .. cmake --build . 
-j $(nproc) ================================================ FILE: .github/workflows/code-format-msg.yml ================================================ name: code-format-msg on: workflow_run: workflows: [code-format] types: [completed] concurrency: group: code-format-msg-${{ github.head_ref || github.run_id }} permissions: contents: read pull-requests: write jobs: pr-context: name: acquire-pr-context runs-on: ubuntu-latest outputs: PR_HEADSHA: ${{ steps.set-pr-context.outputs.head-sha }} PR_NUMBER: ${{ steps.set-pr-context.outputs.number }} if: ${{ github.event.workflow_run.event == 'pull_request' }} steps: - name: get-pr-context id: set-pr-context env: GH_TOKEN: ${{ github.token }} PR_TARGET_REPO: ${{ github.repository }} PR_BRANCH: |- ${{ (github.event.workflow_run.head_repository.owner.login != github.event.workflow_run.repository.owner.login) && format('{0}:{1}', github.event.workflow_run.head_repository.owner.login, github.event.workflow_run.head_branch) || github.event.workflow_run.head_branch }} run: | gh pr view --repo "${PR_TARGET_REPO}" "${PR_BRANCH}" \ --json 'number,headRefOid' \ --jq '"number=\(.number)\nhead-sha=\(.headRefOid)"' \ >> $GITHUB_OUTPUT remove-comment-if-success: if: ${{ github.event.workflow_run.conclusion == 'success' }} runs-on: ubuntu-latest needs: [pr-context] env: PR_HEADSHA: ${{ needs.pr-context.outputs.PR_HEADSHA }} PR_NUMBER: ${{ needs.pr-context.outputs.PR_NUMBER }} steps: - name: Remove existing "format check failed" comment uses: actions/github-script@v8 with: script: | const owner = context.repo.owner; const repo = context.repo.repo; const { data: comments } = await github.rest.issues.listComments({ owner, repo, issue_number: ${{ env.PR_NUMBER }}, }); const targetComment = comments.find(comment => comment.body.includes("Please enable github action in **YOUR FORKED REPO** to make code-format workflow work") ); if (targetComment) { await github.rest.issues.deleteComment({ owner, repo, comment_id: targetComment.id, }); 
console.log("Removed existing code-format failure comment."); } else { console.log("No existing format failure comment to remove."); } post-comment-if-failure: if: ${{ github.event.workflow_run.conclusion == 'failure' }} runs-on: ubuntu-latest needs: [pr-context] env: PR_HEADSHA: ${{ needs.pr-context.outputs.PR_HEADSHA }} PR_NUMBER: ${{ needs.pr-context.outputs.PR_NUMBER }} steps: - name: Post comment on failed code-format if not existing uses: actions/github-script@v8 with: script: | const owner = context.repo.owner; const repo = context.repo.repo; const { data: comments } = await github.rest.issues.listComments({ owner, repo, issue_number: ${{ env.PR_NUMBER }}, }); const existingComment = comments.find(comment => comment.body.includes("Please enable github action in **YOUR FORKED REPO** to make code-format workflow work") ); if (existingComment) { console.log("A code-format failure comment already exists."); } else { await github.rest.issues.createComment({ owner, repo, issue_number: ${{ env.PR_NUMBER }}, body: "Please enable github action in **YOUR FORKED REPO** to make code-format workflow work", }); console.log("Created code-format failure comment."); } ================================================ FILE: .github/workflows/code-format.yml ================================================ name: code-format on: [push, pull_request] concurrency: group: code-format-${{ github.ref }} cancel-in-progress: true permissions: contents: write jobs: code-format: runs-on: ubuntu-latest container: ubuntu:20.04 steps: - name: astyle run: | export DEBIAN_FRONTEND=noninteractive apt-get update -y apt-get install -y astyle git - uses: actions/checkout@v6 - name: cache-clang-format id: cache-clang-format uses: actions/cache@v5 with: path: clang-format-install key: clang-format-install-5 - name: clang-format if: steps.cache-clang-format.outputs.cache-hit != 'true' run: | export DEBIAN_FRONTEND=noninteractive apt-get update -y apt-get install -y build-essential wget curl cmake 
unzip zip python3-pip wget https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.1/llvm-project-10.0.1.tar.xz tar -xf llvm-project-10.0.1.tar.xz cd llvm-project-10.0.1 mkdir build cd build cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_DOCS=OFF ../llvm/ make -j4 clang-format mkdir $GITHUB_WORKSPACE/clang-format-install cp -r bin/clang-format $GITHUB_WORKSPACE/clang-format-install cd ../../ rm -rf llvm-project-10.0.1 rm llvm-project-10.0.1.tar.xz - name: cache-clang-format-21 id: cache-clang-format-21 uses: actions/cache@v5 with: path: clang-format-21-install key: clang-format-21-install - name: clang-format-21 if: steps.cache-clang-format-21.outputs.cache-hit != 'true' run: | export DEBIAN_FRONTEND=noninteractive apt-get update -y apt-get install -y build-essential wget curl cmake unzip zip python3-pip pip install cmake wget https://github.com/llvm/llvm-project/releases/download/llvmorg-21.1.8/llvm-project-21.1.8.src.tar.xz tar -xf llvm-project-21.1.8.src.tar.xz cd llvm-project-21.1.8.src mkdir build cd build cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_DOCS=OFF ../llvm/ make -j4 clang-format mkdir $GITHUB_WORKSPACE/clang-format-21-install cp -r bin/clang-format $GITHUB_WORKSPACE/clang-format-21-install cd ../../ rm -rf llvm-project-21.1.8.src rm llvm-project-21.1.8.src.tar.xz - name: code-format run: | mv $GITHUB_WORKSPACE/clang-format-install/clang-format /usr/local/bin/clang-format rm -rf $GITHUB_WORKSPACE/clang-format-install sh codeformat.sh - name: code-format-glsl run: | mv $GITHUB_WORKSPACE/clang-format-21-install/clang-format /usr/local/bin/clang-format-21 rm -rf $GITHUB_WORKSPACE/clang-format-21-install cd 
src/layer/vulkan/shader find . -type f -name '*.comp' | xargs -i clang-format-21 -i -assume-filename=main.cpp {} - name: configure-git-safe-directory run: git config --global --add safe.directory /__w/ncnn/ncnn - uses: stefanzweifel/git-auto-commit-action@v7 with: commit_message: apply code-format changes - name: restore-clang-format-cache run: | mkdir $GITHUB_WORKSPACE/clang-format-install cp -r /usr/local/bin/clang-format $GITHUB_WORKSPACE/clang-format-install mkdir $GITHUB_WORKSPACE/clang-format-21-install cp -r /usr/local/bin/clang-format-21 $GITHUB_WORKSPACE/clang-format-21-install/clang-format ================================================ FILE: .github/workflows/codeql-analysis.yml ================================================ # For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. # # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. name: "CodeQL" on: push: branches: [master] paths-ignore: ['**.md'] pull_request: # The branches below must be a subset of the branches above branches: [master] paths-ignore: ['**.md'] schedule: - cron: '0 20 * * 4' concurrency: group: CodeQL-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: analyze: permissions: actions: read # for github/codeql-action/init to get workflow details contents: read # for actions/checkout to fetch code security-events: write # for github/codeql-action/autobuild to send a status report name: Analyze runs-on: ubuntu-latest strategy: fail-fast: false matrix: # Override automatic language detection by changing the below list # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] language: ['cpp'] # Learn more... 
# https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection steps: - name: Checkout repository uses: actions/checkout@v6 with: # We must fetch at least the immediate parents so that if this is # a pull request then we can checkout the head. fetch-depth: 2 # If this run was triggered by a pull request event, then checkout # the head of the pull request instead of the merge commit. - run: git checkout HEAD^2 if: ${{ github.event_name == 'pull_request' }} # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. # queries: ./path/to/local/query, your-org/your-repo/queries@main # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@v4 # ℹ️ Command-line programs to run using the OS shell. 
# 📚 https://git.io/JvXDl # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines # and modify them (or add more) to build your code if your project # uses a compiled language #- run: | # make bootstrap # make release - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v4 ================================================ FILE: .github/workflows/compare-binary-size-pr-comment.yml ================================================ name: compare-binary-size-pr-comment on: workflow_run: workflows: ["compare-binary-size"] types: - completed permissions: actions: read contents: read pull-requests: write jobs: pr-comment: runs-on: ubuntu-latest steps: - name: Setup tools run: | sudo apt-get update sudo apt-get install -y jq unzip - name: Ensure workflow_run is for a PR id: validate run: | # Use the event payload file provided by GitHub Actions directly echo "Using event payload from: $GITHUB_EVENT_PATH" echo "Event file size: $(wc -c < "$GITHUB_EVENT_PATH") bytes" # Safely compute number of associated PRs (use // 0 to default if missing) PR_COUNT=$(jq -r '.workflow_run.pull_requests | length // 0' "$GITHUB_EVENT_PATH") echo "Associated pull_request count: $PR_COUNT" if [ "$PR_COUNT" -eq 0 ]; then echo "No pull_request associated with this workflow_run; nothing to do." echo "skip=true" >> $GITHUB_OUTPUT exit 0 fi echo "skip=false" >> $GITHUB_OUTPUT - name: Download artifact zip for this run if: steps.validate.outputs.skip != 'true' env: RUN_ID: ${{ github.event.workflow_run.id }} OWNER: ${{ github.repository_owner }} REPO: ${{ github.repository }} TOKEN: ${{ secrets.COMMENTER_PAT }} ART_NAME: "compare-binary-size.md" run: | echo "Listing artifacts for run $RUN_ID" API="https://api.github.com/repos/$OWNER/${REPO#*/}/actions/runs/$RUN_ID/artifacts" # Save artifact list to a file (avoid pipe/echo issues) curl -s -H "Authorization: token $TOKEN" "$API" -o /tmp/art_list.json echo "Art list size: $(wc -c < /tmp/art_list.json) bytes" if ! 
jq . /tmp/art_list.json; then echo "Failed to parse /tmp/art_list.json with jq; aborting for safety." exit 1 fi # find artifact archive_download_url by name (first match) ARCHIVE_URL=$(jq -r --arg name "$ART_NAME" '.artifacts[] | select(.name==$name) | .archive_download_url' /tmp/art_list.json | head -n1) if [ -z "$ARCHIVE_URL" ] || [ "$ARCHIVE_URL" = "null" ]; then echo "Artifact named '$ART_NAME' not found for run $RUN_ID. Exiting." exit 0 fi echo "Downloading artifact from: $ARCHIVE_URL" # download and unzip to temp dir mkdir -p /tmp/artifact_contents curl -L -H "Authorization: token $TOKEN" -o /tmp/artifact.zip "$ARCHIVE_URL" if ! unzip -q /tmp/artifact.zip -d /tmp/artifact_contents; then echo "Failed to unzip /tmp/artifact.zip"; exit 1 fi ls -la /tmp/artifact_contents - name: Read compare-binary-size.md content if: steps.validate.outputs.skip != 'true' id: read run: | # find file inside artifact_contents FILE=$(find /tmp/artifact_contents -type f -name "compare-binary-size.md" | head -n1 || true) if [ -z "$FILE" ]; then # If artifact name matched but internal filename differs, try any .md FILE=$(find /tmp/artifact_contents -type f -name "*.md" | head -n1 || true) fi if [ -z "$FILE" ]; then echo "compare_content<<EOF" >> $GITHUB_OUTPUT echo "No compare-binary-size.md found in artifact."
>> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT else # Truncate to avoid overly long comments (adjust lines as needed) head -n 1000 "$FILE" > /tmp/compare-truncated.md || true echo "compare_content<<EOF" >> $GITHUB_OUTPUT cat /tmp/compare-truncated.md >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT fi - name: Post or update PR comment via actions/github-script if: steps.validate.outputs.skip != 'true' uses: actions/github-script@v8 with: github-token: ${{ secrets.COMMENTER_PAT }} script: | const pr = context.payload.workflow_run.pull_requests[0]; if (!pr) { core.info("No pull request found in workflow_run payload; skipping."); return; } const owner = context.repo.owner; const repo = context.repo.repo; const issue_number = pr.number; const marker = '<!-- compare-binary-size -->'; // Read the compare content from env (set in previous step outputs) const compare = process.env.COMPARE_CONTENT || ""; const body = `${marker}\n**Binary size comparison** (from artifact)\n\n\`\`\`markdown\n${compare}\n\`\`\``; // List existing comments and find our bot comment (by marker) const { data: comments } = await github.rest.issues.listComments({ owner, repo, issue_number, per_page: 100 }); const existing = comments.find(c => c.body && c.body.includes(marker)); if (existing) { await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body }); core.info(`Updated comment id=${existing.id}`); } else { await github.rest.issues.createComment({ owner, repo, issue_number, body }); core.info("Created new comment"); } env: # pass the content from previous step into the github-script environment COMPARE_CONTENT: ${{ steps.read.outputs.compare_content }} ================================================ FILE: .github/workflows/compare-binary-size.yml ================================================ name: compare-binary-size on: pull_request: branches: [master] paths: - '.github/workflows/compare-binary-size.yml' - 'toolchains/**' - 'CMakeLists.txt' - 'cmake/**' - 'src/**' - 'glslang' concurrency: group: 
compare-binary-size-${{ github.ref }} cancel-in-progress: true permissions: contents: read actions: read jobs: compare-size: runs-on: ubuntu-latest steps: - name: checkout-pr-branch uses: actions/checkout@v6 with: ref: refs/pull/${{ github.event.pull_request.number }}/merge submodules: true path: pr - name: checkout-base-branch uses: actions/checkout@v6 with: ref: ${{ github.event.pull_request.base.ref }} repository: ${{ github.event.pull_request.base.repo.full_name }} submodules: true path: base - name: install-toolchain run: | sudo apt-get update sudo apt-get install g++-arm-linux-gnueabihf g++-aarch64-linux-gnu - name: compare-sizes env: COMMON_CMAKE_ARGS: -DNCNN_SHARED_LIB=ON -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF run: | # define target architectures archs=("x86_64" "armhf" "aarch64") # generate table echo "The binary size change of libncnn.so (bytes)" >> compare-binary-size.md echo "| architecture | base size | pr size | difference |" >> compare-binary-size.md echo "|--------------|-----------|---------|------------|" >> compare-binary-size.md for arch in "${archs[@]}"; do mkdir -p pr/build_$arch pushd pr/build_$arch if [ "$arch" = "armhf" ]; then cmake ${{env.COMMON_CMAKE_ARGS}} -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake .. elif [ "$arch" = "aarch64" ]; then cmake ${{env.COMMON_CMAKE_ARGS}} -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake .. else cmake ${{env.COMMON_CMAKE_ARGS}} .. fi cmake --build . -j $(nproc) PR_SIZE=$(stat -c%s $(readlink -f src/libncnn.so)) popd mkdir -p base/build_$arch pushd base/build_$arch if [ "$arch" = "armhf" ]; then cmake ${{env.COMMON_CMAKE_ARGS}} -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake .. elif [ "$arch" = "aarch64" ]; then cmake ${{env.COMMON_CMAKE_ARGS}} -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake .. else cmake ${{env.COMMON_CMAKE_ARGS}} .. fi cmake --build . 
-j $(nproc) BASE_SIZE=$(stat -c%s $(readlink -f src/libncnn.so)) popd DIFF=$(($PR_SIZE - $BASE_SIZE)) if [ $DIFF -gt 0 ]; then DIFF_STR="+$DIFF :warning:" else DIFF_STR="$DIFF :kissing_heart:" fi echo "| $arch | $BASE_SIZE | $PR_SIZE | $DIFF_STR |" >> compare-binary-size.md done cat compare-binary-size.md - name: upload-compare-binary-size-md uses: actions/upload-artifact@v6 with: name: compare-binary-size.md path: compare-binary-size.md ================================================ FILE: .github/workflows/elf-riscv32.yml ================================================ name: elf-riscv32 on: push: branches: [master] paths: - '.github/workflows/elf-riscv32.yml' - 'toolchains/riscv32-unknown-elf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/riscv/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/elf-riscv32.yml' - 'toolchains/riscv32-unknown-elf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/riscv/**' - 'tests/**' concurrency: group: elf-riscv32-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: rv32gc: runs-on: [self-hosted, linux, centos] steps: - uses: actions/checkout@v6 #- name: riscv-gnu-toolchain #run: | #wget -c https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2025.01.20/riscv32-elf-ubuntu-22.04-gcc-nightly-2025.01.20-nightly.tar.xz #tar -xf riscv32-elf-ubuntu-22.04-gcc-nightly-2025.01.20-nightly.tar.xz #mv riscv riscv32-elf #- name: checkout-riscv-pk #uses: actions/checkout@v6 #with: #repository: riscv/riscv-pk #path: riscv-pk #ref: d8659a4e8e888bdc9caf840ad17bfe83239b1d64 #- name: riscv-pk #run: | #cd riscv-pk #mkdir build && cd build #export PATH=$GITHUB_WORKSPACE/riscv32-elf/bin:$PATH #export CFLAGS="-O3" #export CXXFLAGS="-O3" #../configure --prefix=$GITHUB_WORKSPACE/riscv32-elf --with-arch=rv32gc_zicsr_zifencei --host=riscv32-unknown-elf --with-abi=ilp32d #make -j4 #make install #- name: 
checkout-riscv-isa-sim #uses: actions/checkout@v6 #with: #repository: riscv-software-src/riscv-isa-sim #path: riscv-isa-sim #ref: 5ef9a61f5fecdb9bf77da155172c8018ce820308 #- name: riscv-isa-sim #run: | #cd riscv-isa-sim #mkdir build && cd build #export PATH=$GITHUB_WORKSPACE/riscv32-elf/bin:$PATH #export CFLAGS="-O3" #export CXXFLAGS="-O3" #../configure --prefix=$GITHUB_WORKSPACE/riscv32-elf #make -j4 #make install #- name: riscv-strip-install #run: find $GITHUB_WORKSPACE/riscv32-elf -type f | xargs -i strip -g {} || true - name: build run: | export RISCV_ROOT_PATH=/data/action/osd/riscv32-elf mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv32-unknown-elf.toolchain.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_RVV=OFF -DNCNN_XTHEADVECTOR=OFF -DNCNN_ZFH=OFF -DNCNN_ZVFH=OFF .. cmake --build . -j 4 - name: test run: | export PATH=/data/action/osd/riscv32-elf/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=spike TESTS_EXECUTABLE_LOADER_ARGUMENTS="--isa=rv32gc;/data/action/osd/riscv32-elf/riscv32-unknown-elf/bin/pk" ctest --output-on-failure -j 4 ================================================ FILE: .github/workflows/elf-riscv64.yml ================================================ name: elf-riscv64 on: push: branches: [master] paths: - '.github/workflows/elf-riscv64.yml' - 'toolchains/riscv64-unknown-elf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/riscv/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/elf-riscv64.yml' - 'toolchains/riscv64-unknown-elf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/riscv/**' - 'tests/**' concurrency: group: elf-riscv64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: rv64gc: runs-on: [self-hosted, linux, centos] steps: - uses: actions/checkout@v6 #- name: riscv-gnu-toolchain #run: | #wget -c 
https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2025.01.20/riscv64-elf-ubuntu-22.04-gcc-nightly-2025.01.20-nightly.tar.xz #tar -xf riscv64-elf-ubuntu-22.04-gcc-nightly-2025.01.20-nightly.tar.xz #mv riscv riscv64-elf #- name: checkout-riscv-pk #uses: actions/checkout@v6 #with: #repository: riscv/riscv-pk #path: riscv-pk #ref: d8659a4e8e888bdc9caf840ad17bfe83239b1d64 #- name: riscv-pk #run: | #cd riscv-pk #mkdir build && cd build #export PATH=$GITHUB_WORKSPACE/riscv64-elf/bin:$PATH #export CFLAGS="-O3" #export CXXFLAGS="-O3" #../configure --prefix=$GITHUB_WORKSPACE/riscv64-elf --with-arch=rv64gc_zicsr_zifencei --host=riscv64-unknown-elf --with-abi=lp64d #make -j4 #make install #- name: checkout-riscv-isa-sim #uses: actions/checkout@v6 #with: #repository: riscv-software-src/riscv-isa-sim #path: riscv-isa-sim #ref: 5ef9a61f5fecdb9bf77da155172c8018ce820308 #- name: riscv-isa-sim #run: | #cd riscv-isa-sim #mkdir build && cd build #export PATH=$GITHUB_WORKSPACE/riscv64-elf/bin:$PATH #export CFLAGS="-O3" #export CXXFLAGS="-O3" #../configure --prefix=$GITHUB_WORKSPACE/riscv64-elf #make -j4 #make install #- name: riscv-strip-install #run: find $GITHUB_WORKSPACE/riscv64-elf -type f | xargs -i strip -g {} || true - name: build run: | export RISCV_ROOT_PATH=/data/action/osd/riscv64-elf mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-elf.toolchain.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_XTHEADVECTOR=OFF .. cmake --build . 
-j 4 - name: test run: | export PATH=/data/action/osd/riscv64-elf/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=spike TESTS_EXECUTABLE_LOADER_ARGUMENTS="--isa=rv64gc;/data/action/osd/riscv64-elf/riscv64-unknown-elf/bin/pk" ctest --output-on-failure -j 4 ================================================ FILE: .github/workflows/esp32.yml ================================================ name: ESP32 on: push: branches: [master] paths: - '.github/workflows/esp32.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' pull_request: branches: [master] paths: - '.github/workflows/esp32.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' concurrency: group: esp32-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: build: name: ESP32 runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 with: submodules: true - name: Setup Python uses: actions/setup-python@v6 with: python-version: '3.8' - name: Install dependencies run: | sudo apt-get update sudo apt-get install -y cmake ninja-build ccache - name: Checkout ESP-IDF uses: actions/checkout@v6 with: repository: espressif/esp-idf path: esp-idf-install ref: release/v5.3 - name: Install ESP-IDF run: | cd esp-idf-install git submodule update --init --recursive ./install.sh - name: Set environment and build NCNN for ESP32 run: | source esp-idf-install/export.sh echo "IDF_PATH=$IDF_PATH" >> $GITHUB_ENV echo "${IDF_PATH}/tools" >> $GITHUB_PATH echo "${IDF_PATH}/components" >> $GITHUB_PATH mkdir -p build-esp32 && cd build-esp32 cmake -DCMAKE_TOOLCHAIN_FILE="../toolchains/esp32.toolchain.cmake" -DCMAKE_BUILD_TYPE=Release -DNCNN_BUILD_EXAMPLES=OFF .. 
make -j 4 make install ================================================ FILE: .github/workflows/harmonyos.yml ================================================ name: harmonyos on: push: branches: [master] paths: - '.github/workflows/harmonyos.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/harmonyos.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'glslang' concurrency: group: harmonyos-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: build: runs-on: [self-hosted, linux, centos] env: OHOS_NDK_HOME: /data/action/osd/ohos-sdk/linux/native OHOS_NDK_CMAKE: /data/action/osd/ohos-sdk/linux/native/build-tools/cmake/bin/cmake NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=/data/action/osd/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DNCNN_SIMPLEOMP=ON \ -DNCNN_VULKAN=ON \ steps: - uses: actions/checkout@v6 with: submodules: true # - name: setup-sdk # run: | # cd /data/action/osd # wget -q https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz # tar -xf ohos-sdk-windows_linux-public.tar.gz # cd ohos-sdk/linux # unzip -q native-linux-x64-4.1.7.8-Release.zip - name: armeabi-v7a run: | mkdir build-armeabi-v7a && cd build-armeabi-v7a ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="armeabi-v7a" .. ${{ env.OHOS_NDK_CMAKE }} --build . -j 4 - name: arm64-v8a run: | mkdir build-arm64-v8a && cd build-arm64-v8a ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" .. ${{ env.OHOS_NDK_CMAKE }} --build . -j 4 - name: x86_64 run: | mkdir build-x86_64 && cd build-x86_64 ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="x86_64" .. 
${{ env.OHOS_NDK_CMAKE }} --build . -j 4 - name: armeabi-v7a-shared run: | mkdir build-armeabi-v7a-shared && cd build-armeabi-v7a-shared ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="armeabi-v7a" -DNCNN_SHARED_LIB=ON .. ${{ env.OHOS_NDK_CMAKE }} --build . -j 4 - name: arm64-v8a-shared run: | mkdir build-arm64-v8a-shared && cd build-arm64-v8a-shared ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" -DNCNN_SHARED_LIB=ON .. ${{ env.OHOS_NDK_CMAKE }} --build . -j 4 - name: x86_64-shared run: | mkdir build-x86_64-shared && cd build-x86_64-shared ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="x86_64" -DNCNN_SHARED_LIB=ON .. ${{ env.OHOS_NDK_CMAKE }} --build . -j 4 ================================================ FILE: .github/workflows/ios.yml ================================================ name: ios on: push: branches: [master] paths: - '.github/workflows/ios.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/ios.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'glslang' concurrency: group: ios-${{ github.ref }} cancel-in-progress: true env: DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer IOS_DEPLOYMENT_TARGET: '13.0' ENABLE_BITCODE: OFF ENABLE_ARC: OFF ENABLE_VISIBILITY: OFF permissions: contents: read jobs: build: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ 
-DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VULKAN=ON \ steps: - uses: actions/checkout@v6 with: submodules: true - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-ios-install-20251004 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: openmp-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install - name: openmp-simulator-x86_64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-simulator-x86_64 && cd build-simulator-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-simulator-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-simulator-arm64 && cd build-simulator-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | mkdir -p $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/ios mkdir -p $GITHUB_WORKSPACE/openmp-install/ios-simulator cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install/ios mkdir -p $GITHUB_WORKSPACE/openmp-install/ios/lib cp openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a $GITHUB_WORKSPACE/openmp-install/ios/lib/libomp.a cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/ios-simulator mkdir -p $GITHUB_WORKSPACE/openmp-install/ios-simulator/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/ios-simulator/lib/libomp.a - name: install-openmp run: | sudo cp $GITHUB_WORKSPACE/openmp-install/ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib sudo cp $GITHUB_WORKSPACE/openmp-install/ios-simulator/include/* 
$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/ios-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - name: arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" .. cmake --build . -j 4 - name: simulator-x86_64 run: | mkdir build-simulator-x86_64 && cd build-simulator-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" .. cmake --build . -j 4 - name: simulator-arm64 run: | mkdir build-simulator-arm64 && cd build-simulator-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" .. cmake --build . -j 4 ================================================ FILE: .github/workflows/labeler.yml ================================================ name: labeler on: [pull_request_target] permissions: contents: read pull-requests: write jobs: label: runs-on: ubuntu-latest steps: - uses: actions/labeler@v6 ================================================ FILE: .github/workflows/linux-aarch64.yml ================================================ name: linux-aarch64 on: push: branches: [master] paths: - '.github/workflows/linux-aarch64.yml' - 'toolchains/aarch64-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-aarch64.yml' - 'toolchains/aarch64-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'tests/**' concurrency: group: linux-aarch64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: aarch64-native: runs-on: ubuntu-24.04-arm steps: - uses: actions/checkout@v6 - name: build run: | mkdir build && cd build cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
cmake --build . -j $(nproc) - name: test run: cd build && ctest --output-on-failure -j $(nproc) - name: build-noint8 run: | mkdir build-noint8 && cd build-noint8 cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. cmake --build . -j $(nproc) - name: test-noint8 run: cd build-noint8 && ctest --output-on-failure -j $(nproc) - name: build-simplestl-simplemath run: | mkdir build-simplestl-simplemath && cd build-simplestl-simplemath cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: test-simplestl-simplemath run: cd build-simplestl-simplemath && ctest --output-on-failure -j $(nproc) asan: runs-on: ubuntu-24.04-arm steps: - uses: actions/checkout@v6 - name: build run: | mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=relwithdebinfo -DNCNN_ASAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_SHARED_LIB=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: test run: | cd build ctest --output-on-failure -j $(nproc) aarch64: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v6 - name: aarch64-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-aarch64-linux-gnu qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j $(nproc) - name: test-a53 run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a53" ctest --output-on-failure -j $(nproc) - name: test-a55 run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a55" ctest --output-on-failure -j $(nproc) - name: test-a72 run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a72" ctest --output-on-failure -j $(nproc) - name: test-a76 run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a76" ctest --output-on-failure -j $(nproc) - name: test-a710 run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a710" ctest --output-on-failure -j $(nproc) - name: test-max run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;max" ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/linux-arm.yml ================================================ name: linux-arm on: push: branches: [master] paths: - '.github/workflows/linux-arm.yml' - 'toolchains/arm-linux-gnueabi.toolchain.cmake' - 'toolchains/arm-linux-gnueabihf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-arm.yml' - 'toolchains/arm-linux-gnueabi.toolchain.cmake' - 'toolchains/arm-linux-gnueabihf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'tests/**' concurrency: group: linux-arm-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: arm: runs-on: ubuntu-latest steps: - 
uses: actions/checkout@v6 - name: arm-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-arm-linux-gnueabi qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc) - name: build-noint8 run: | mkdir build-noint8 && cd build-noint8 cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. cmake --build . -j $(nproc) - name: test-noint8 run: | cd build-noint8 TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc) armhf: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: arm-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-arm-linux-gnueabihf qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc) - name: build-noint8 run: | mkdir build-noint8 && cd build-noint8 cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. cmake --build . 
-j $(nproc) - name: test-noint8 run: | cd build-noint8 TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc) armhf-vfpv3-d16: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: arm-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-arm-linux-gnueabihf qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc) - name: build-noint8 run: | mkdir build-noint8 && cd build-noint8 cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. cmake --build . 
-j $(nproc) - name: test-noint8 run: | cd build-noint8 TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/linux-loongarch64.yml ================================================ name: linux-loongarch64 on: push: branches: [master] paths: - '.github/workflows/linux-loongarch64.yml' - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/loongarch/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-loongarch64.yml' - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/loongarch/**' - 'tests/**' concurrency: group: linux-loongarch64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: gcc-loongarch64: runs-on: [self-hosted, linux, centos] steps: - uses: actions/checkout@v6 # - name: qemu # run: | # sudo apt-get update # sudo apt-get install -y qemu-user-static # - name: loongarch64-toolchain # run: | # wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.0/loongarch64-clfs-8.0-cross-tools-gcc-full.tar.xz # tar -xf loongarch64-clfs-8.0-cross-tools-gcc-full.tar.xz - name: build run: | export LOONGARCH64_ROOT_PATH=/data/action/osd/cross-tools mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j 4 - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-loongarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/data/action/osd/cross-tools/target" ctest --output-on-failure -j 4 ================================================ FILE: .github/workflows/linux-mips.yml ================================================ name: linux-mips on: push: branches: [master] paths: - '.github/workflows/linux-mips.yml' - 'toolchains/mipsel-linux-gnu.toolchain.cmake' - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/mips/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-mips.yml' - 'toolchains/mipsel-linux-gnu.toolchain.cmake' - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/mips/**' - 'tests/**' concurrency: group: linux-mips-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: mipsel: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: mipsel-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-mipsel-linux-gnu qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsel-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-mipsel-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsel-linux-gnu" ctest --output-on-failure -j $(nproc) mipsisa32r6el: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: mipsisa32r6el-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-mipsisa32r6el-linux-gnu qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-mipsel-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/linux-mips64.yml ================================================ name: linux-mips64 on: push: branches: [master] paths: - '.github/workflows/linux-mips64.yml' - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake' - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/mips/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-mips64.yml' - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake' - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/mips/**' - 'tests/**' concurrency: group: linux-mips64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: mips64el: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: mips64el-gnuabi64-toolchain run: | sudo apt-get update sudo apt-get install g++-mips64el-linux-gnuabi64 qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips64el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-mips64el-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mips64el-linux-gnuabi64" ctest --output-on-failure -j $(nproc) mipsisa64r6el: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: mipsisa64r6el-gnuabi64-toolchain run: | sudo apt-get update sudo apt-get install g++-mipsisa64r6el-linux-gnuabi64 qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-mips64el-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/linux-ppc64.yml ================================================ name: linux-ppc64 on: push: branches: [master] paths: - '.github/workflows/linux-ppc64.yml' - 'toolchains/powerpc-linux-gnu.toolchain.cmake' - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake' - 'toolchains/power8le-linux-gnu-vsx.toolchain.cmake' - 'toolchains/power9le-linux-gnu-vsx.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/*' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-ppc64.yml' - 'toolchains/powerpc-linux-gnu.toolchain.cmake' - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake' - 'toolchains/power8le-linux-gnu-vsx.toolchain.cmake' - 'toolchains/power9le-linux-gnu-vsx.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/*' - 'tests/**' concurrency: group: linux-ppc64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: ppc: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: powerpc-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-powerpc-linux-gnu qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build .
-j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-ppc-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc-linux-gnu" ctest --output-on-failure -j $(nproc) ppc64le: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: powerpc64le-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-powerpc64le-linux-gnu qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc64le-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-ppc64le-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j $(nproc) power8le-vsx: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: powerpc64le-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-powerpc64le-linux-gnu qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/power8le-linux-gnu-vsx.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-ppc64le-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j $(nproc) power9le-vsx: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: powerpc64le-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-powerpc64le-linux-gnu qemu-user-static - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/power9le-linux-gnu-vsx.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=qemu-ppc64le-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power9_v2.0" ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/linux-riscv32.yml ================================================ name: linux-riscv32 on: push: branches: [master] paths: - '.github/workflows/linux-riscv32.yml' - 'toolchains/c907-rv32-v310.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/riscv/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-riscv32.yml' - 'toolchains/c907-rv32-v310.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/riscv/**' - 'tests/**' concurrency: group: linux-riscv32-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: xuantie: name: xuantie-${{ matrix.cpu }} runs-on: [self-hosted, linux, ubuntu] strategy: fail-fast: false matrix: include: - { cpu: c907-rv32, QEMU_CPU: c907fdv-rv32, OPENMP: ON, RVV: ON, XTHEADVECTOR: OFF, ZFH: ON, ZVFH: ON } steps: - uses: actions/checkout@v6 - name: build run: | export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-6.6.36-glibc-x86_64-V3.3.0 mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/${{ matrix.cpu }}-v310.toolchain.cmake -DCMAKE_BUILD_TYPE=release \ -DNCNN_OPENMP=${{ matrix.OPENMP }} -DNCNN_THREADS=${{ matrix.OPENMP }} \ -DNCNN_RUNTIME_CPU=OFF \ -DNCNN_RVV=${{ matrix.RVV }} \ -DNCNN_XTHEADVECTOR=${{ matrix.XTHEADVECTOR }} \ -DNCNN_ZFH=${{ matrix.ZFH }} \ -DNCNN_ZVFH=${{ matrix.ZVFH }} \ -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j 8 - name: test run: | export PATH=/data/action/osd/Xuantie-qemu-x86_64-Ubuntu-20.04-V5.2.8-B20250721-0303/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=qemu-riscv32 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;${{ matrix.QEMU_CPU }}" ctest --output-on-failure -j 8 ================================================ FILE: .github/workflows/linux-riscv64.yml ================================================ name: linux-riscv64 on: push: branches: [master] paths: - '.github/workflows/linux-riscv64.yml' - 'toolchains/riscv64-linux-gnu.toolchain.cmake' - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' - 'toolchains/c906-v310.toolchain.cmake' - 'toolchains/c907-v310.toolchain.cmake' - 'toolchains/c908-v310.toolchain.cmake' - 'toolchains/c910-v310.toolchain.cmake' - 'toolchains/k1.toolchain.cmake' - 'toolchains/k1.llvm.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/riscv/**' - 'tests/**' - 'examples/**' pull_request: branches: [master] paths: - '.github/workflows/linux-riscv64.yml' - 'toolchains/riscv64-linux-gnu.toolchain.cmake' - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' - 'toolchains/c906-v310.toolchain.cmake' - 'toolchains/c907-v310.toolchain.cmake' - 'toolchains/c908-v310.toolchain.cmake' - 'toolchains/c910-v310.toolchain.cmake' - 'toolchains/k1.toolchain.cmake' - 'toolchains/k1.llvm.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/riscv/**' - 'tests/**' - 'examples/**' concurrency: group: linux-riscv64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: gcc-riscv64: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: cache-qemu id: cache-qemu uses: actions/cache@v5 with: path: qemu-install key: qemu-riscv64-install-20220502-4 - name: install-qemu-build-deps if: steps.cache-qemu.outputs.cache-hit != 'true' run: | sudo apt-get update sudo apt-get install autoconf automake autotools-dev ninja-build
build-essential pkg-config libglib2.0-dev libpixman-1-dev zlib1g-dev python3 - name: checkout-qemu if: steps.cache-qemu.outputs.cache-hit != 'true' uses: actions/checkout@v6 with: repository: qemu/qemu path: qemu ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - name: qemu if: steps.cache-qemu.outputs.cache-hit != 'true' run: | cd qemu wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system make -j$(nproc) make install - name: riscv64-gnu-toolchain run: | sudo apt-get update sudo apt-get install g++-riscv64-linux-gnu - name: configure run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - name: build run: cmake --build build -j $(nproc) - name: test run: | export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j $(nproc) xuantie: name: xuantie-${{ matrix.cpu }} runs-on: [self-hosted, linux, ubuntu] strategy: fail-fast: false matrix: include: - { cpu: c906, QEMU_CPU: c906fdv, OPENMP: OFF, RVV: OFF, XTHEADVECTOR: ON, ZFH: ON, ZVFH: OFF } - { cpu: c910, QEMU_CPU: c910v, OPENMP: ON, RVV: OFF, XTHEADVECTOR: ON, ZFH: ON, ZVFH: OFF } - { cpu: c908, QEMU_CPU: c908v, OPENMP: ON, RVV: ON, XTHEADVECTOR: OFF, ZFH: ON, ZVFH: ON } - { cpu: c907, QEMU_CPU: c907fdv-rv64, OPENMP: ON, RVV: ON, XTHEADVECTOR: OFF, ZFH: ON, ZVFH: ON } steps: - uses: actions/checkout@v6 - name: build run: | export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-6.6.36-glibc-x86_64-V3.3.0 mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/${{ matrix.cpu 
}}-v310.toolchain.cmake -DCMAKE_BUILD_TYPE=release \ -DNCNN_OPENMP=${{ matrix.OPENMP }} -DNCNN_THREADS=${{ matrix.OPENMP }} \ -DNCNN_RUNTIME_CPU=OFF \ -DNCNN_RVV=${{ matrix.RVV }} \ -DNCNN_XTHEADVECTOR=${{ matrix.XTHEADVECTOR }} \ -DNCNN_ZFH=${{ matrix.ZFH }} \ -DNCNN_ZVFH=${{ matrix.ZVFH }} \ -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . -j 8 - name: test run: | export PATH=/data/action/osd/Xuantie-qemu-x86_64-Ubuntu-20.04-V5.2.8-B20250721-0303/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;${{ matrix.QEMU_CPU }}" ctest --output-on-failure -j 8 spacemit: name: spacemit-${{ matrix.cpu }} runs-on: [self-hosted, linux, ubuntu] strategy: fail-fast: false matrix: include: - { cpu: x60, QEMU_CPU: "max,vlen=256,elen=64,vext_spec=v1.0", OPENMP: ON, RVV: ON, XTHEADVECTOR: OFF, ZFH: ON, ZVFH: ON } steps: - uses: actions/checkout@v6 # https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz - name: build-gcc run: | export RISCV_ROOT_PATH=/data/action/osd/spacemit-toolchain-linux-glibc-x86_64-v1.1.2 mkdir build-gcc && cd build-gcc cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/k1.toolchain.cmake -DCMAKE_BUILD_TYPE=release \ -DNCNN_OPENMP=${{ matrix.OPENMP }} -DNCNN_THREADS=${{ matrix.OPENMP }} \ -DNCNN_RUNTIME_CPU=OFF \ -DNCNN_RVV=${{ matrix.RVV }} \ -DNCNN_XTHEADVECTOR=${{ matrix.XTHEADVECTOR }} \ -DNCNN_ZFH=${{ matrix.ZFH }} \ -DNCNN_ZVFH=${{ matrix.ZVFH }} \ -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j 8 - name: build-llvm run: | export RISCV_ROOT_PATH=/data/action/osd/spacemit-toolchain-linux-glibc-x86_64-v1.1.2 mkdir build-llvm && cd build-llvm cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/k1.llvm.toolchain.cmake -DCMAKE_BUILD_TYPE=release \ -DNCNN_OPENMP=${{ matrix.OPENMP }} -DNCNN_THREADS=${{ matrix.OPENMP }} \ -DNCNN_RUNTIME_CPU=OFF \ -DNCNN_RVV=${{ matrix.RVV }} \ -DNCNN_XTHEADVECTOR=${{ matrix.XTHEADVECTOR }} \ -DNCNN_ZFH=${{ matrix.ZFH }} \ -DNCNN_ZVFH=${{ matrix.ZVFH }} \ -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . -j 8 # https://archive.spacemit.com/spacemit-ai/qemu/jdsk-qemu-v0.0.14.tar.gz - name: test-gcc run: | export RISCV_ROOT_PATH=/data/action/osd/spacemit-toolchain-linux-glibc-x86_64-v1.1.2 export PATH=/data/action/osd/jdsk-qemu/bin:$PATH cd build-gcc TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;${{ matrix.QEMU_CPU }};-L;${RISCV_ROOT_PATH}/sysroot" ctest --output-on-failure -j 8 - name: test-llvm run: | export RISCV_ROOT_PATH=/data/action/osd/spacemit-toolchain-linux-glibc-x86_64-v1.1.2 export PATH=/data/action/osd/jdsk-qemu/bin:$PATH cd build-llvm TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;${{ matrix.QEMU_CPU }};-L;${RISCV_ROOT_PATH}/sysroot" ctest --output-on-failure -j 8 gcc-rvv: runs-on: [self-hosted, linux, ubuntu] steps: - uses: actions/checkout@v6 #- name: cache-qemu #id: cache-qemu #uses: actions/cache@v5 #with: #path: qemu-install #key: qemu-riscv64-install-20241202 #- name: install-qemu-build-deps #if: steps.cache-qemu.outputs.cache-hit != 'true' #run: | #sudo apt-get update #sudo apt-get install autoconf automake autotools-dev ninja-build #- name: checkout-qemu #if: steps.cache-qemu.outputs.cache-hit != 'true' #uses: actions/checkout@v6 #with: #repository: qemu/qemu #path: qemu #ref: 72b88908d12ee9347d13539c7dd9a252625158d1 #- name: qemu #if: steps.cache-qemu.outputs.cache-hit != 'true' #run: | #cd qemu #./configure 
--prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system #make -j4 #make install #- name: cache-riscv #id: cache-riscv #uses: actions/cache@v5 #with: #path: riscv-install #key: riscv-linux-install-20241202 #- name: install-riscv-build-deps #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #sudo apt-get update #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler #- name: checkout-riscv-gnu-toolchain #if: steps.cache-riscv.outputs.cache-hit != 'true' #uses: actions/checkout@v6 #with: #repository: riscv-collab/riscv-gnu-toolchain #path: riscv-gnu-toolchain #ref: 20f615317e2ce888dfc11b29ccde4a649494b654 #- name: checkout-riscv-gnu-toolchain-submodules #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain #git submodule update --init --recursive --depth 1 glibc #git submodule update --init --recursive --depth 1 newlib #git submodule update --init --recursive --depth 1 riscv-binutils #git submodule update --init --recursive --depth 1 riscv-gcc #git submodule update --init --recursive --depth 1 riscv-dejagnu #git submodule update --init --recursive --depth 1 riscv-gdb #- name: riscv-gnu-toolchain #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain #./configure --prefix=$GITHUB_WORKSPACE/riscv #make linux -j4 #- name: riscv-strip-install #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: find $GITHUB_WORKSPACE/riscv -type f | xargs -i strip -g {} || true - name: configure run: export RISCV_ROOT_PATH=/data/action/osd/riscv && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- name: build run: cmake --build build -j 8 - name: test-vlen256 run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 - name: test-vlen128 run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 clang-rvv: runs-on: [self-hosted, linux, ubuntu] steps: - uses: actions/checkout@v6 #- name: cache-qemu #id: cache-qemu #uses: actions/cache@v5 #with: #path: qemu-install #key: qemu-riscv64-install-20241202 #- name: install-qemu-build-deps #if: steps.cache-qemu.outputs.cache-hit != 'true' #run: | #sudo apt-get update #sudo apt-get install autoconf automake autotools-dev ninja-build #- name: checkout-qemu #if: steps.cache-qemu.outputs.cache-hit != 'true' #uses: actions/checkout@v6 #with: #repository: qemu/qemu #path: qemu #ref: 72b88908d12ee9347d13539c7dd9a252625158d1 #- name: qemu #if: steps.cache-qemu.outputs.cache-hit != 'true' #run: | #cd qemu #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system #make -j4 #make install #- name: cache-riscv #id: cache-riscv #uses: actions/cache@v5 #with: #path: riscv-install #key: riscv-linux-install-20241202 #- name: install-riscv-build-deps #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #sudo apt-get update #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler #- name: checkout-riscv-gnu-toolchain #if: steps.cache-riscv.outputs.cache-hit != 'true' #uses: actions/checkout@v6 #with: 
#repository: riscv-collab/riscv-gnu-toolchain #path: riscv-gnu-toolchain #ref: 20f615317e2ce888dfc11b29ccde4a649494b654 #- name: checkout-riscv-gnu-toolchain-submodules #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain #git submodule update --init --recursive --depth 1 glibc #git submodule update --init --recursive --depth 1 newlib #git submodule update --init --recursive --depth 1 riscv-binutils #git submodule update --init --recursive --depth 1 riscv-gcc #git submodule update --init --recursive --depth 1 riscv-dejagnu #git submodule update --init --recursive --depth 1 riscv-gdb #- name: riscv-gnu-toolchain #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain #./configure --prefix=$GITHUB_WORKSPACE/riscv #make linux -j4 #- name: riscv-strip-install #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: find $GITHUB_WORKSPACE/riscv -type f | xargs -i strip -g {} || true # - name: install-clang # run: | # wget https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.4/llvm-project-19.1.4.src.tar.xz # tar -xf llvm-project-19.1.4.src.tar.xz # cd llvm-project-19.1.4.src # mkdir build # cd build # cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/riscv -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ # make -j16 # make install - name: build run: | export RISCV_ROOT_PATH=/data/action/osd/riscv mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j 8 - name: test-vlen256 run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 - name: test-vlen128 run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 ================================================ FILE: .github/workflows/linux-x64-cpu-clang.yml ================================================ name: linux-x64-cpu-clang on: push: branches: [master] paths: - '.github/workflows/linux-x64-cpu-clang.yml' - 'toolchains/host-c.clang.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' pull_request: branches: [master] paths: - '.github/workflows/linux-x64-cpu-clang.yml' - 'toolchains/host-c.clang.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' concurrency: group: linux-x64-cpu-clang-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: linux-clang: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: update run: sudo apt-get update - name: protobuf run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev - name: build-sse2 env: CC: clang CXX: clang++ run: | mkdir build-sse2 && cd build-sse2 cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j $(nproc) - name: test-sse2 run: cd build-sse2 && ctest --output-on-failure -j $(nproc) - name: build-shared env: CC: clang CXX: clang++ run: | mkdir build-shared && cd build-shared cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) - name: build-avx2 env: CC: clang CXX: clang++ run: | mkdir build-avx2 && cd build-avx2 cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-avx2 run: cd build-avx2 && ctest --output-on-failure -j $(nproc) - name: build-avx env: CC: clang CXX: clang++ run: | mkdir build-avx && cd build-avx cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-avx run: cd build-avx && ctest --output-on-failure -j $(nproc) - name: build-avx1-2 env: CC: clang CXX: clang++ run: | mkdir build-avx1-2 && cd build-avx1-2 cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-avx1-2 run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) - name: build-noint8 env: CC: clang CXX: clang++ run: | mkdir build-noint8 && cd build-noint8 cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-noint8 run: cd build-noint8 && ctest --output-on-failure -j $(nproc) linux-clang-simplestl: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: build-simplestl env: CC: clang CXX: clang++ run: | mkdir build-simplestl && cd build-simplestl cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . 
-j $(nproc) - name: test-simplestl run: cd build-simplestl && ctest --output-on-failure -j $(nproc) - name: build-simplestl-simpleomp env: CC: clang CXX: clang++ run: | mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: test-simplestl-simpleomp run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/linux-x64-cpu-gcc-musl.yml ================================================ name: linux-x64-cpu-gcc-musl on: push: branches: [master] paths: - '.github/workflows/linux-x64-cpu-gcc-musl.yml' - 'toolchains/host-c.gcc.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' pull_request: branches: [master] paths: - '.github/workflows/linux-x64-cpu-gcc-musl.yml' - 'toolchains/host-c.gcc.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' concurrency: group: linux-x64-cpu-gcc-musl-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: linux-gcc-musl: runs-on: ubuntu-latest steps: - uses: jirutka/setup-alpine@v1 with: packages: > cmake clang clang-dev make gcc g++ libc-dev linux-headers - uses: actions/checkout@v6 - name: build shell: alpine.sh {0} run: | mkdir build && cd build cmake -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test shell: alpine.sh {0} run: cd build && ctest --output-on-failure -j $(nproc) - name: build-shared shell: alpine.sh {0} run: | mkdir build-shared && cd build-shared cmake -DNCNN_SHARED_LIB=ON .. cmake --build .
-j $(nproc) ================================================ FILE: .github/workflows/linux-x64-cpu-gcc.yml ================================================ name: linux-x64-cpu-gcc on: push: branches: [master] paths: - '.github/workflows/linux-x64-cpu-gcc.yml' - 'toolchains/host-c.gcc.toolchain.cmake' - 'toolchains/host.gcc-c++03.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' pull_request: branches: [master] paths: - '.github/workflows/linux-x64-cpu-gcc.yml' - 'toolchains/host-c.gcc.toolchain.cmake' - 'toolchains/host.gcc-c++03.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' concurrency: group: linux-x64-cpu-gcc-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: linux-gcc: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: update run: sudo apt-get update - name: protobuf run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev - name: build-sse2 run: | mkdir build-sse2 && cd build-sse2 cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-sse2 run: cd build-sse2 && ctest --output-on-failure -j $(nproc) - name: build-shared run: | mkdir build-shared && cd build-shared cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) - name: build-avx2 run: | mkdir build-avx2 && cd build-avx2 cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-avx2 run: cd build-avx2 && ctest --output-on-failure -j $(nproc) - name: build-avx run: | mkdir build-avx && cd build-avx cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-avx run: cd build-avx && ctest --output-on-failure -j $(nproc) - name: build-avx1-2 run: | mkdir build-avx1-2 && cd build-avx1-2 cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. cmake --build .
-j $(nproc) - name: test-avx1-2 run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) - name: build-noint8 run: | mkdir build-noint8 && cd build-noint8 cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-noint8 run: cd build-noint8 && ctest --output-on-failure -j $(nproc) asan: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: build run: | mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=relwithdebinfo -DNCNN_ASAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_SHARED_LIB=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: test run: | cd build ctest --output-on-failure -j $(nproc) linux-gcc-cpp03-nostdio-nostring-simplestl: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: build-nostdio run: | mkdir build-nostdio && cd build-nostdio cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: test-nostdio run: cd build-nostdio && ctest --output-on-failure -j $(nproc) - name: build-nostdio-nostring run: | mkdir build-nostdio-nostring && cd build-nostdio-nostring cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: build-simplestl run: | mkdir build-simplestl && cd build-simplestl cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . 
-j $(nproc) - name: test-simplestl run: cd build-simplestl && ctest --output-on-failure -j $(nproc) - name: build-simplestl-simpleomp run: | mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: test-simplestl-simpleomp run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) linux-gcc-avx512: runs-on: [self-hosted, linux, t4] steps: - uses: actions/checkout@v6 - name: build env: CC: gcc CXX: g++ LD_LIBRARY_PATH: /data/action/install/lib64 run: | mkdir build && cd build cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j 4 - name: test env: LD_LIBRARY_PATH: /data/action/install/lib64 run: cd build && ctest --output-on-failure -j 4 ================================================ FILE: .github/workflows/linux-x64-gpu-clang.yml ================================================ name: linux-x64-gpu-clang on: push: branches: [master] paths: - '.github/workflows/linux-x64-gpu-clang.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/linux-x64-gpu-clang.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' - 'glslang' concurrency: group: linux-x64-gpu-clang-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: linux-clang-gpu: runs-on: [self-hosted, linux, ubuntu25] steps: - uses: actions/checkout@v6 with: submodules: true - name: cache-swiftshader 
id: cache-swiftshader uses: actions/cache@v5 with: path: swiftshader-install key: swiftshader-linux-install-20250508 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v6 with: repository: google/swiftshader path: swiftshader ref: 930d46d31b5d637f313fd5ef55da2bbf053c26c1 - name: swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' run: | cd swiftshader git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive mkdir -p build; cd build cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. cmake --build . -j 8 mkdir $GITHUB_WORKSPACE/swiftshader-install cp Linux/* $GITHUB_WORKSPACE/swiftshader-install - name: build env: CC: clang CXX: clang++ run: | mkdir build && cd build cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . -j 8 - name: test run: | printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" cd build && ctest --output-on-failure -j 8 - name: build-shared env: CC: clang CXX: clang++ run: | mkdir build-shared && cd build-shared cmake -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. cmake --build . 
-j 8 ================================================ FILE: .github/workflows/linux-x64-gpu-gcc.yml ================================================ name: linux-x64-gpu-gcc on: push: branches: [master] paths: - '.github/workflows/linux-x64-gpu-gcc.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/linux-x64-gpu-gcc.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' - 'glslang' concurrency: group: linux-x64-gpu-gcc-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: linux-gcc-gpu: runs-on: [self-hosted, linux, ubuntu25] steps: - uses: actions/checkout@v6 with: submodules: true - name: cache-swiftshader id: cache-swiftshader uses: actions/cache@v5 with: path: swiftshader-install key: swiftshader-linux-install-20250508 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v6 with: repository: google/swiftshader path: swiftshader ref: 930d46d31b5d637f313fd5ef55da2bbf053c26c1 - name: swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' run: | cd swiftshader git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive mkdir -p build; cd build cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. cmake --build . -j 8 mkdir $GITHUB_WORKSPACE/swiftshader-install cp Linux/* $GITHUB_WORKSPACE/swiftshader-install - name: build run: | mkdir build && cd build cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j 8 - name: test run: | printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" cd build && ctest --output-on-failure -j 8 - name: build-shared run: | mkdir build-shared && cd build-shared cmake -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. cmake --build . -j 8 linux-gcc-gpu-system-glslang: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: install-deps run: | sudo apt-get update sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev libvulkan-dev glslang-dev glslang-tools spirv-tools - name: build run: | mkdir build && cd build cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake .. cmake --build . -j $(nproc) - name: build-shared run: | mkdir build-shared && cd build-shared cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) linux-gcc-gpu-t4: runs-on: [self-hosted, linux, t4] steps: - uses: actions/checkout@v6 with: submodules: true - name: build env: CC: gcc CXX: g++ LD_LIBRARY_PATH: /data/action/install/lib64 run: | mkdir build && cd build cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . 
-j 4 - name: test env: LD_LIBRARY_PATH: /data/action/install/lib64 run: | cd build && ctest --output-on-failure -j 4 ================================================ FILE: .github/workflows/linux-x64-sde.yml ================================================ name: linux-x64-sde on: push: branches: [master] paths: - '.github/workflows/linux-x64-sde.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' pull_request: branches: [master] paths: - '.github/workflows/linux-x64-sde.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' concurrency: group: linux-x64-sde-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: gcc-sde: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v6 - name: update run: sudo apt-get update - name: gcc14 run: sudo apt-get install gcc-14 g++-14 - name: Setup SDE binaries uses: petarpetrovt/setup-sde@v3.0 - name: build env: CC: gcc-14 CXX: g++-14 run: | mkdir build && cd build cmake -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j $(nproc) - name: test-p4p run: | cd build TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-p4p;--" ctest --output-on-failure -j $(nproc) - name: test-snb run: | cd build TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-snb;--" ctest --output-on-failure -j $(nproc) - name: test-hsw run: | cd build TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-hsw;--" ctest --output-on-failure -j $(nproc) - name: test-adl run: | cd build TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-adl;--" ctest --output-on-failure -j $(nproc) - name: test-arl run: | cd build TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-arl;--" ctest --output-on-failure -j $(nproc) - name: test-skx run: | cd build TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-skx;--" ctest --output-on-failure -j $(nproc) - name: test-spr run: | cd build TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j $(nproc) - name: test-gnr run: | cd build TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-gnr;--" ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/linux-x86-cpu-clang.yml ================================================ name: linux-x86-cpu-clang on: push: branches: [master] paths: - '.github/workflows/linux-x86-cpu-clang.yml' - 'toolchains/host.clang-m32.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-x86-cpu-clang.yml' - 'toolchains/host.clang-m32.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' concurrency: group: linux-x86-cpu-clang-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: linux-clang: 
runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: update run: sudo apt-get update - name: gcc-multilib run: sudo apt-get install gcc-multilib g++-multilib - name: build env: CC: clang CXX: clang++ run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: test run: cd build && ctest --output-on-failure -j $(nproc) - name: build-shared env: CC: clang CXX: clang++ run: | mkdir build-shared && cd build-shared cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) - name: build-noint8 env: CC: clang CXX: clang++ run: | mkdir build-noint8 && cd build-noint8 cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF .. cmake --build . 
-j $(nproc) - name: test-noint8 run: cd build-noint8 && ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/linux-x86-cpu-gcc.yml ================================================ name: linux-x86-cpu-gcc on: push: branches: [master] paths: - '.github/workflows/linux-x86-cpu-gcc.yml' - 'toolchains/host.gcc-m32.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-x86-cpu-gcc.yml' - 'toolchains/host.gcc-m32.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' concurrency: group: linux-x86-cpu-gcc-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: linux-gcc: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: update run: sudo apt-get update - name: gcc-multilib run: sudo apt-get install gcc-multilib g++-multilib - name: build run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: test run: cd build && ctest --output-on-failure -j $(nproc) - name: build-nosse run: | mkdir build-nosse && cd build-nosse cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j $(nproc) - name: test-nosse run: cd build-nosse && ctest --output-on-failure -j $(nproc) - name: build-shared run: | mkdir build-shared && cd build-shared cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. cmake --build . 
-j $(nproc) - name: build-noint8 run: | mkdir build-noint8 && cd build-noint8 cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF .. cmake --build . -j $(nproc) - name: test-noint8 run: cd build-noint8 && ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/mac-catalyst.yml ================================================ name: mac-catalyst on: push: branches: [master] paths: - '.github/workflows/mac-catalyst.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/mac-catalyst.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'glslang' concurrency: group: mac-catalyst-${{ github.ref }} cancel-in-progress: true env: DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer MAC_CATALYST_DEPLOYMENT_TARGET: '13.1' ENABLE_BITCODE: OFF ENABLE_ARC: OFF ENABLE_VISIBILITY: OFF permissions: contents: read jobs: build: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install 
\ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VULKAN=ON \ steps: - uses: actions/checkout@v6 with: submodules: true - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-mac-catalyst-install-20251004 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: openmp-x86_64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-x86_64 && cd build-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST_ARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install - name: openmp-merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | mkdir -p $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/mac-catalyst cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/mac-catalyst mkdir -p $GITHUB_WORKSPACE/openmp-install/mac-catalyst/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/mac-catalyst/lib/libomp.a - name: install-openmp run: | sudo cp $GITHUB_WORKSPACE/openmp-install/mac-catalyst/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/mac-catalyst/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - name: x86_64 run: | mkdir build-x86_64 && cd build-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" .. cmake --build . -j 4 - name: arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST_ARM64 -DARCHS="arm64" .. cmake --build . 
-j 4 ================================================ FILE: .github/workflows/macos.yml ================================================ name: macos on: push: branches: [master] paths: - '.github/workflows/macos.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/macos.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' - 'glslang' concurrency: group: macos-${{ github.ref }} cancel-in-progress: true env: DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer MAC_DEPLOYMENT_TARGET: '11.0' ENABLE_BITCODE: OFF ENABLE_ARC: OFF ENABLE_VISIBILITY: OFF permissions: contents: read jobs: build: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VULKAN=ON \ steps: - uses: actions/checkout@v6 with: submodules: true 
- name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-macos-install-20251004 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: openmp-x86_64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-x86_64 && cd build-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install - name: openmp-merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | mkdir -p $GITHUB_WORKSPACE/openmp-install cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - name: install-openmp run: | sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - name: cache-swiftshader id: cache-swiftshader uses: actions/cache@v5 with: path: swiftshader-install key: swiftshader-macos-install-20251004 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v6 with: repository: google/swiftshader path: swiftshader ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986 - name: checkout-swiftshader-submodules if: steps.cache-swiftshader.outputs.cache-hit != 'true' run: | cd swiftshader git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive - name: swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' run: | cd swiftshader mkdir -p build; cd build cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. cmake --build . 
-j 4 mkdir $GITHUB_WORKSPACE/swiftshader-install cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install - name: arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" .. cmake --build . -j 4 - name: x86_64 run: | mkdir build-x86_64 && cd build-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" -DNCNN_BUILD_TESTS=ON .. cmake --build . -j 4 - name: arm64-shared run: | mkdir build-arm64-shared && cd build-arm64-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" -DNCNN_SHARED_LIB=ON .. cmake --build . -j 4 - name: x86_64-shared run: | mkdir build-x86_64-shared && cd build-x86_64-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" -DNCNN_SHARED_LIB=ON .. cmake --build . -j 4 - name: x86_64-test run: | printf "[Processor]\nThreadCount=1\n" > build-x86_64/tests/SwiftShader.ini export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" cd build-x86_64 && ctest --output-on-failure -j 4 ================================================ FILE: .github/workflows/pnnx.yml ================================================ name: pnnx on: push: branches: [master] paths: - '.github/workflows/pnnx.yml' - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' pull_request: branches: [master] paths: - '.github/workflows/pnnx.yml' - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' concurrency: group: pnnx-${{ github.ref }} cancel-in-progress: true permissions: contents: read env: LIBTORCH_VERSION: 2.10.0 TORCHVISION_VERSION: 0.25.0 PROTOBUF_VERSION: 21.12 ONNXRUNTIME_VERSION: 1.24.3 CACHE_DATE: 20260309 SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15 jobs: quick-test: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] env: PYTHONUSERBASE: ${{ github.workspace }}/torch UseMultiToolTask: true steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: 
python-version: 3.12 - name: setup-pytorch run: | python3 -m pip config set global.break-system-packages true pip3 install --user torch --index-url https://download.pytorch.org/whl/cpu pip3 install --user numpy packaging - name: build-pnnx run: | cd tools/pnnx mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Release .. cmake --build . --config Release -j 4 - name: quick-test if: matrix.os != 'windows-latest' run: | cd tools/pnnx cd build && ctest -C Release --output-on-failure -R test_nn_Conv build: runs-on: [self-hosted, linux, ubuntu25] steps: - uses: actions/checkout@v6 - name: local-cache-libtorch id: local-cache-libtorch uses: maxnowack/local-cache@v2 with: path: libtorch-${{ env.LIBTORCH_VERSION }}-install key: libtorch-${{ env.LIBTORCH_VERSION }}-linux-install-${{ env.CACHE_DATE }} - name: local-cache-torchvision id: local-cache-torchvision uses: maxnowack/local-cache@v2 with: path: torchvision-${{ env.TORCHVISION_VERSION }}-install key: torchvision-${{ env.TORCHVISION_VERSION }}-linux-install-${{ env.CACHE_DATE }} - name: local-cache-onnxruntime id: local-cache-onnxruntime uses: maxnowack/local-cache@v2 with: path: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install key: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-linux-install-${{ env.CACHE_DATE }} - name: cache-libtorch id: cache-libtorch uses: actions/cache@v4 with: path: libtorch-${{ env.LIBTORCH_VERSION }}-install key: libtorch-${{ env.LIBTORCH_VERSION }}-linux-install-${{ env.CACHE_DATE }} - name: cache-torchvision id: cache-torchvision uses: actions/cache@v4 with: path: torchvision-${{ env.TORCHVISION_VERSION }}-install key: torchvision-${{ env.TORCHVISION_VERSION }}-linux-install-${{ env.CACHE_DATE }} - name: cache-onnxruntime id: cache-onnxruntime uses: actions/cache@v4 with: path: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install key: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-linux-install-${{ env.CACHE_DATE }} - name: pnnx-patches if: (steps.local-cache-libtorch.outputs.cache-hit != 'true' && 
steps.cache-libtorch.outputs.cache-hit != 'true') || (steps.local-cache-torchvision.outputs.cache-hit != 'true' && steps.cache-torchvision.outputs.cache-hit != 'true') || (steps.local-cache-onnxruntime.outputs.cache-hit != 'true' && steps.cache-onnxruntime.outputs.cache-hit != 'true') uses: actions/checkout@v6 with: repository: pnnx/pnnx path: pnnx-patches - name: libtorch if: steps.local-cache-libtorch.outputs.cache-hit != 'true' && steps.cache-libtorch.outputs.cache-hit != 'true' run: | wget -q https://github.com/pytorch/pytorch/releases/download/v${{ env.LIBTORCH_VERSION }}/pytorch-v${{ env.LIBTORCH_VERSION }}.tar.gz tar -xf pytorch-v${{ env.LIBTORCH_VERSION }}.tar.gz cd pytorch-v${{ env.LIBTORCH_VERSION }} pip3 install -r requirements.txt --break-system-packages patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-fix-mobile-build.patch patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-no-link-system-lib.patch patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-fix-eigen-build.patch patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-fix-link-local-sleef.patch patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-revert-nativert-api.patch mkdir -p build && cd build cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/libtorch-${{ env.LIBTORCH_VERSION }}-install \ -DCMAKE_BUILD_TYPE=MinSizeRel \ -DBUILD_SHARED_LIBS=OFF \ -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ -DBUILD_CUSTOM_PROTOBUF=OFF \ -DBUILD_LITE_INTERPRETER=OFF \ -DBUILD_PYTHON=OFF \ -DINTERN_BUILD_MOBILE=ON \ -DINTERN_DISABLE_AUTOGRAD=ON \ -DINTERN_DISABLE_ONNX=ON \ -DUSE_CUDA=OFF \ -DUSE_DISTRIBUTED=OFF \ -DUSE_ITT=OFF \ -DUSE_KINETO=OFF \ -DUSE_LITE_INTERPRETER_PROFILER=OFF \ -DUSE_MKLDNN=OFF \ -DUSE_MPS=OFF \ -DUSE_NUMPY=OFF \ -DUSE_OPENMP=OFF \ -DUSE_SOURCE_DEBUG_ON_MOBILE=OFF \ -DUSE_XNNPACK=OFF \ -DBUILD_TEST=OFF \ -DATEN_NO_TEST=ON \ .. cmake --build . 
-j 8 cmake --build . -j 8 --target install/strip - name: torchvision if: steps.local-cache-torchvision.outputs.cache-hit != 'true' && steps.cache-torchvision.outputs.cache-hit != 'true' run: | wget -q https://github.com/pytorch/vision/archive/v${{ env.TORCHVISION_VERSION }}.zip -O vision-${{ env.TORCHVISION_VERSION }}.zip unzip -q vision-${{ env.TORCHVISION_VERSION }}.zip cd vision-${{ env.TORCHVISION_VERSION }} patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/vision-${{ env.TORCHVISION_VERSION }}-ops-only.patch patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/vision-${{ env.TORCHVISION_VERSION }}-no-cuda-version.patch mkdir -p build && cd build cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/torchvision-${{ env.TORCHVISION_VERSION }}-install \ -DTorch_DIR=$GITHUB_WORKSPACE/libtorch-${{ env.LIBTORCH_VERSION }}-install/share/cmake/Torch \ -DCMAKE_BUILD_TYPE=MinSizeRel \ -DWITH_PNG=OFF \ -DWITH_JPEG=OFF .. cmake --build . -j 8 cmake --build . -j 8 --target install/strip - name: onnxruntime if: steps.local-cache-onnxruntime.outputs.cache-hit != 'true' && steps.cache-onnxruntime.outputs.cache-hit != 'true' run: | wget -q https://github.com/protocolbuffers/protobuf/archive/v${{ env.PROTOBUF_VERSION }}.zip -O protobuf-${{ env.PROTOBUF_VERSION }}.zip unzip -q protobuf-${{ env.PROTOBUF_VERSION }}.zip cd protobuf-${{ env.PROTOBUF_VERSION }} mkdir -p build2 && cd build2 cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install \ -Dprotobuf_BUILD_TESTS=OFF \ -DCMAKE_BUILD_TYPE=MinSizeRel \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. cmake --build . -j 8 cmake --build . 
-j 8 --target install/strip cd ../../ wget -q https://github.com/microsoft/onnxruntime/archive/v${{ env.ONNXRUNTIME_VERSION }}.zip -O onnxruntime-${{ env.ONNXRUNTIME_VERSION }}.zip unzip -q onnxruntime-${{ env.ONNXRUNTIME_VERSION }}.zip cd onnxruntime-${{ env.ONNXRUNTIME_VERSION }} patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-less-mlas-features.patch patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-monolithic-static-library.patch patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-use-clog.patch mkdir -p build2 && cd build2 cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install \ -DCMAKE_BUILD_TYPE=MinSizeRel \ -Donnxruntime_USE_FULL_PROTOBUF=ON \ -Donnxruntime_BUILD_SHARED_LIB=ON \ -Donnxruntime_BUILD_UNIT_TESTS=OFF \ -Donnxruntime_ENABLE_CPUINFO=OFF \ -Donnxruntime_DISABLE_CONTRIB_OPS=ON \ -Donnxruntime_DISABLE_ML_OPS=ON \ -Donnxruntime_DISABLE_SPARSE_TENSORS=ON \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ --compile-no-warning-as-error ../cmake cmake --build . -j 8 cmake --build . -j 8 --target install/strip - name: pnnx run: | cd tools/pnnx mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=MinSizeRel \ -DTorch_INSTALL_DIR=$GITHUB_WORKSPACE/libtorch-${{ env.LIBTORCH_VERSION }}-install \ -DTorchVision_INSTALL_DIR=$GITHUB_WORKSPACE/torchvision-${{ env.TORCHVISION_VERSION }}-install \ -Donnxruntime_INSTALL_DIR=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install \ -Dprotobuf_DIR=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install/lib/cmake/protobuf .. cmake --build . 
-j 8 strip src/pnnx - name: upload-pnnx uses: actions/upload-artifact@v5 with: name: pnnx path: tools/pnnx/build/src/pnnx compression-level: 9 test: needs: [build] runs-on: [self-hosted, linux, ubuntu25] strategy: fail-fast: false matrix: include: - { python: '3.8', numpy: '1.24.4', opencv: '4.5.*', torch: '1.8.1', torchvision: '0.9.1', torchaudio: '0.8.1', transformers: '4.52.1' } - { python: '3.8', numpy: '1.24.4', opencv: '4.5.*', torch: '1.9.1', torchvision: '0.10.1', torchaudio: '0.9.1', transformers: '4.52.1' } - { python: '3.8', numpy: '1.24.4', opencv: '4.6.*', torch: '1.10.0', torchvision: '0.11.1', torchaudio: '0.10.0+cpu', transformers: '4.52.1' } - { python: '3.9', numpy: '1.26.4', opencv: '4.6.*', torch: '1.11.0', torchvision: '0.12.0', torchaudio: '0.11.0+cpu', transformers: '4.52.1' } - { python: '3.9', numpy: '1.26.4', opencv: '4.7.*', torch: '1.12.0', torchvision: '0.13.0', torchaudio: '0.12.0+cpu', transformers: '4.52.1' } - { python: '3.10', numpy: '1.26.4', opencv: '4.7.*', torch: '1.13.0', torchvision: '0.14.0', torchaudio: '0.13.0+cpu', transformers: '4.52.1' } - { python: '3.10', numpy: '1.26.4', opencv: '4.8.*', torch: '2.0.0', torchvision: '0.15.1', torchaudio: '2.0.0+cpu', transformers: '4.52.1' } - { python: '3.10', numpy: '1.26.4', opencv: '4.8.*', torch: '2.1.0', torchvision: '0.16.0', torchaudio: '2.1.0+cpu', transformers: '4.52.1' } - { python: '3.11', numpy: '1.26.4', opencv: '4.9.*', torch: '2.2.1', torchvision: '0.17.1', torchaudio: '2.2.1+cpu', transformers: '4.52.1' } - { python: '3.11', numpy: '1.26.4', opencv: '4.9.*', torch: '2.3.0', torchvision: '0.18.0', torchaudio: '2.3.0+cpu', transformers: '4.52.1' } - { python: '3.11', numpy: '2.2.5', opencv: '4.10.*', torch: '2.4.0', torchvision: '0.19.0', torchaudio: '2.4.0+cpu', transformers: '4.52.1' } - { python: '3.12', numpy: '2.2.5', opencv: '4.10.*', torch: '2.5.0', torchvision: '0.20.0', torchaudio: '2.5.0+cpu', transformers: '4.52.1' } - { python: '3.12', numpy: '2.2.5', 
opencv: '4.11.*', torch: '2.6.0', torchvision: '0.21.0', torchaudio: '2.6.0+cpu', transformers: '4.52.1' } - { python: '3.12', numpy: '2.2.5', opencv: '4.11.*', torch: '2.7.0', torchvision: '0.22.0', torchaudio: '2.7.0+cpu', transformers: '4.52.1' } - { python: '3.13', numpy: '2.2.5', opencv: '4.12.*', torch: '2.8.0', torchvision: '0.23.0', torchaudio: '2.8.0+cpu', transformers: '4.56.2' } - { python: '3.13', numpy: '2.2.5', opencv: '4.12.*', torch: '2.9.0', torchvision: '0.24.0', torchaudio: '2.9.0+cpu', transformers: '4.56.2' } - { python: '3.13', numpy: '2.2.5', opencv: '4.12.*', torch: '2.10.0', torchvision: '0.25.0', torchaudio: '2.10.0+cpu', transformers: '4.56.2' } name: test-${{ matrix.torch }}-py${{ matrix.python }} env: PYTHONUSERBASE: ${{ github.workspace }}/python-${{ matrix.python }} steps: - uses: actions/checkout@v6 with: submodules: true - name: local-cache-libtorch id: local-cache-libtorch uses: maxnowack/local-cache@v2 with: path: libtorch-${{ env.LIBTORCH_VERSION }}-install key: libtorch-${{ env.LIBTORCH_VERSION }}-linux-install-${{ env.CACHE_DATE }} - name: local-cache-torchvision id: local-cache-torchvision uses: maxnowack/local-cache@v2 with: path: torchvision-${{ env.TORCHVISION_VERSION }}-install key: torchvision-${{ env.TORCHVISION_VERSION }}-linux-install-${{ env.CACHE_DATE }} - name: local-cache-onnxruntime id: local-cache-onnxruntime uses: maxnowack/local-cache@v2 with: path: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install key: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-linux-install-${{ env.CACHE_DATE }} - name: cache-libtorch if: steps.local-cache-libtorch.outputs.cache-hit != 'true' id: cache-libtorch uses: actions/cache/restore@v5 with: path: libtorch-${{ env.LIBTORCH_VERSION }}-install key: libtorch-${{ env.LIBTORCH_VERSION }}-linux-install-${{ env.CACHE_DATE }} fail-on-cache-miss: true - name: cache-torchvision if: steps.local-cache-torchvision.outputs.cache-hit != 'true' id: cache-torchvision uses: actions/cache/restore@v5 
with: path: torchvision-${{ env.TORCHVISION_VERSION }}-install key: torchvision-${{ env.TORCHVISION_VERSION }}-linux-install-${{ env.CACHE_DATE }} fail-on-cache-miss: true - name: cache-onnxruntime if: steps.local-cache-onnxruntime.outputs.cache-hit != 'true' id: cache-onnxruntime uses: actions/cache/restore@v5 with: path: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install key: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-linux-install-${{ env.CACHE_DATE }} fail-on-cache-miss: true - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python }} - name: setup-pytorch run: | export PATH=${{ env.PYTHONUSERBASE }}/bin:$PATH pip3 install --user pytest wheel twine requests einops numpy==${{ matrix.numpy }} opencv-python==${{ matrix.opencv }} pip3 install --user torch==${{ matrix.torch }}+cpu torchvision==${{ matrix.torchvision }}+cpu torchaudio==${{ matrix.torchaudio }} --index-url https://download.pytorch.org/whl/cpu pip3 install --user onnx onnxscript onnxruntime pip3 install --user "transformers<=${{ matrix.transformers }}" diffusers "safetensors<=0.6.2" - name: setup-pytorch-execstack-or-patchelf if: matrix.python == '3.8' || matrix.python == '3.9' run: | execstack -c ${{ env.PYTHONUSERBASE }}/lib/python${{ matrix.python }}/site-packages/torch/lib/libtorch_cpu.so || true patchelf --clear-execstack ${{ env.PYTHONUSERBASE }}/lib/python${{ matrix.python }}/site-packages/torch/lib/libtorch_cpu.so || true - name: python-ncnn run: | export CMAKE_BUILD_PARALLEL_LEVEL=8 pip3 install --user .
--verbose - name: pnnx run: | cd tools/pnnx mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Release \ -DTorch_INSTALL_DIR=$GITHUB_WORKSPACE/libtorch-${{ env.LIBTORCH_VERSION }}-install \ -DTorchVision_INSTALL_DIR=$GITHUB_WORKSPACE/torchvision-${{ env.TORCHVISION_VERSION }}-install \ -Donnxruntime_INSTALL_DIR=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install \ -Dprotobuf_DIR=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install/lib/cmake/protobuf .. - name: download-pnnx uses: actions/download-artifact@v8 with: name: pnnx path: tools/pnnx/build/src - name: test run: | export PATH=${{ env.PYTHONUSERBASE }}/bin:$PATH chmod +x tools/pnnx/build/src/pnnx export OMP_THREAD_LIMIT=1 export OMP_NUM_THREADS=1 export MKL_NUM_THREADS=1 export MKL_ENABLE_INSTRUCTIONS=SSE4_2 cd tools/pnnx/build ctest --output-on-failure -j 8 - name: python-pnnx run: | export PATH=${{ env.PYTHONUSERBASE }}/bin:$PATH export PNNX_WHEEL_WITHOUT_BUILD=ON cd tools/pnnx/python cp ../build/src/pnnx pnnx/ python3 setup.py install --user pytest tests ================================================ FILE: .github/workflows/python.yml ================================================ name: python on: push: branches: [master] paths: - '.github/workflows/python.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'python/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/python.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'python/**' - 'glslang' concurrency: group: python-${{ github.ref }} cancel-in-progress: true env: DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer MAC_DEPLOYMENT_TARGET: '11.0' ENABLE_BITCODE: OFF ENABLE_ARC: OFF ENABLE_VISIBILITY: OFF CMAKE_BUILD_PARALLEL_LEVEL: 4 UseMultiToolTask: true permissions: contents: read jobs: build: strategy: matrix: os: [ubuntu-latest, macos-15-intel, windows-latest] 
python-version: [3.9, 3.12] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v6 with: submodules: true - name: cache-swiftshader if: matrix.os == 'ubuntu-latest' id: cache-swiftshader uses: actions/cache@v5 with: path: swiftshader-install key: swiftshader-linux-install-20240622 - name: checkout-swiftshader if: matrix.os == 'ubuntu-latest' && steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v6 with: repository: google/swiftshader path: swiftshader ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986 - name: checkout-swiftshader-submodules if: matrix.os == 'ubuntu-latest' && steps.cache-swiftshader.outputs.cache-hit != 'true' run: | cd swiftshader git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive - name: swiftshader if: matrix.os == 'ubuntu-latest' && steps.cache-swiftshader.outputs.cache-hit != 'true' run: | cd swiftshader mkdir -p build; cd build cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. cmake --build . -j $(nproc) mkdir $GITHUB_WORKSPACE/swiftshader-install cp Linux/* $GITHUB_WORKSPACE/swiftshader-install - name: setup-python uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: install-deps run: | python -m pip install --upgrade pip pip install pytest setuptools wheel twine importlib-metadata - name: build if: matrix.os == 'ubuntu-latest' env: CC: clang CXX: clang++ run: | mkdir build && cd build cmake -DNCNN_VULKAN=ON -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . 
-j $(nproc) - name: build if: matrix.os == 'macos-15-intel' run: | mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=MAC -DARCHS="x86_64" \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET -DENABLE_BITCODE=$ENABLE_BITCODE -DENABLE_ARC=$ENABLE_ARC -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DNCNN_VULKAN=OFF -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j 4 - name: build if: matrix.os == 'windows-latest' run: | mkdir build; cd build cmake -T v142,host=x64 -A x64 -DNCNN_VULKAN=OFF -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . --config Release -j 4 - name: build-python run: cd python && pip install . - name: test if: matrix.os == 'ubuntu-latest' run: | export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" cd python && pytest tests - name: test if: matrix.os != 'ubuntu-latest' run: | cd python && pytest tests ================================================ FILE: .github/workflows/release-python.yml ================================================ name: release-python on: push: tags: - '*' workflow_dispatch: env: DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer MAC_DEPLOYMENT_TARGET: '11.0' ENABLE_BITCODE: OFF ENABLE_ARC: OFF ENABLE_VISIBILITY: OFF CIBW_SKIP: "cp3??t-*" jobs: build_sdist: name: Build SDist runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 with: submodules: true - uses: actions/setup-python@v6 with: python-version: '3.x' - name: Install deps run: python -m pip install twine build - name: Build SDist run: python -m build -s - name: Check metadata run: twine check dist/* - uses: actions/upload-artifact@v6 with: name: sdist path: dist/*.tar.gz build_wheels: name: ${{ matrix.arch }} ${{ matrix.build_id }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: - { os: ubuntu-24.04, arch: x86_64, build: 'cp*-manylinux*', build_id: cp-manylinux } - { 
os: ubuntu-24.04, arch: x86_64, build: 'cp*-musllinux*', build_id: cp-musllinux } - { os: ubuntu-24.04, arch: x86_64, build: 'pp*', build_id: pp } - { os: ubuntu-24.04, arch: i686, build: 'cp*-manylinux*', build_id: cp-manylinux } - { os: ubuntu-24.04, arch: i686, build: 'cp*-musllinux*', build_id: cp-musllinux } - { os: ubuntu-24.04, arch: i686, build: 'pp*', build_id: pp } - { os: windows-2025, arch: x86, build: 'cp*', build_id: cp } - { os: windows-2025, arch: AMD64, build: 'cp*', build_id: cp } - { os: windows-2025, arch: AMD64, build: 'pp*', build_id: pp } - { os: windows-11-arm, arch: ARM64, build: 'cp*', build_id: cp } - { os: macos-15-intel, arch: x86_64, build: 'cp*', build_id: cp } - { os: macos-15, arch: arm64, build: 'cp*', build_id: cp } - { os: ubuntu-24.04-arm, arch: armv7l, build: 'cp*-manylinux*', build_id: cp-manylinux } - { os: ubuntu-24.04-arm, arch: armv7l, build: 'cp*-musllinux*', build_id: cp-musllinux } - { os: ubuntu-24.04-arm, arch: aarch64, build: 'cp*-manylinux*', build_id: cp-manylinux } - { os: ubuntu-24.04-arm, arch: aarch64, build: 'cp*-musllinux*', build_id: cp-musllinux } - { os: ubuntu-24.04-arm, arch: aarch64, build: 'pp*', build_id: pp } env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ steps: - uses: actions/checkout@v6 with: submodules: true # build wheels for ubuntu - name: Build wheels for ubuntu if: matrix.os == 'ubuntu-24.04' uses: pypa/cibuildwheel@v3.3.1 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_ENABLE: pypy CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4 with: output-dir: wheelhouse # build wheels for ubuntu 
armv7l - name: Build wheels for ubuntu armv7l if: matrix.os == 'ubuntu-24.04-arm' && (matrix.arch == 'armv7l') uses: pypa/cibuildwheel@v3.3.1 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_ENABLE: pypy CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4 CFLAGS="-mfpu=neon" CXXFLAGS="-mfpu=neon" with: output-dir: wheelhouse # build wheels for ubuntu aarch64 - name: Build wheels for ubuntu aarch64 if: matrix.os == 'ubuntu-24.04-arm' && (matrix.arch == 'aarch64') uses: pypa/cibuildwheel@v3.3.1 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_ENABLE: pypy CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4 with: output-dir: wheelhouse # build wheels for windows - name: Build wheels for windows if: matrix.os == 'windows-2025' && (matrix.arch == 'AMD64' || matrix.arch == 'x86') uses: pypa/cibuildwheel@v3.3.1 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_ENABLE: pypy CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=4 CIBW_BEFORE_BUILD: pip install delvewheel CIBW_REPAIR_WHEEL_COMMAND: delvewheel repair -w {dest_dir} {wheel} with: output-dir: wheelhouse - name: Build wheels for windows ARM64 if: matrix.os == 'windows-11-arm' && matrix.arch == 'ARM64' uses: pypa/cibuildwheel@v3.3.1 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_ENABLE: pypy CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=4 CIBW_BEFORE_BUILD: pip install delvewheel CIBW_REPAIR_WHEEL_COMMAND: delvewheel repair -w {dest_dir} {wheel} --no-dll "msvcp140.dll;vcomp140.dll" with: output-dir: wheelhouse # build wheels for macos - name: cache-openmp for macos if: matrix.os == 'macos-15-intel' || matrix.os == 'macos-15' id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-macos-install-20251004 - name: openmp for macos if: (matrix.os == 'macos-15-intel' || matrix.os == 
'macos-15') && steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: openmp-build-x86_64 for macos if: (matrix.os == 'macos-15-intel' || matrix.os == 'macos-15') && steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-x86_64 && cd build-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-build-arm64 for macos if: (matrix.os == 'macos-15-intel' || matrix.os == 'macos-15') && steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install - name: openmp-merge-fat-library for macos if: (matrix.os == 'macos-15-intel' || matrix.os == 'macos-15') && steps.cache-openmp.outputs.cache-hit != 'true' run: | mkdir -p $GITHUB_WORKSPACE/openmp-install cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - name: install-openmp for macos if: matrix.os == 'macos-15-intel' || matrix.os == 'macos-15' run: | sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - name: vulkansdk for macos if: matrix.os == 'macos-15-intel' || matrix.os == 'macos-15' run: | wget -q https://sdk.lunarg.com/sdk/download/1.4.335.1/mac/vulkansdk-macos-1.4.335.1.zip?Human=true -O vulkansdk-macos-1.4.335.1.zip unzip vulkansdk-macos-1.4.335.1.zip sudo vulkansdk-macOS-1.4.335.1.app/Contents/MacOS/vulkansdk-macOS-1.4.335.1 --root $GITHUB_WORKSPACE/vulkansdk-macos-1.4.335.1 --accept-licenses --default-answer --confirm-command install - name: Build wheels for macos x86_64 if: matrix.os == 'macos-15-intel' && matrix.arch == 'x86_64' uses: pypa/cibuildwheel@v3.3.1 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_ENABLE: pypy CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4 CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC ARCHS="x86_64" DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" 
OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.4.335.1/macOS/lib/libMoltenVK.dylib MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse - name: Build wheels for macos arm64 if: matrix.os == 'macos-15' && matrix.arch == 'arm64' uses: pypa/cibuildwheel@v3.3.1 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_ENABLE: pypy CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4 CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC_ARM64 ARCHS="arm64" DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.4.335.1/macOS/lib/libMoltenVK.dylib MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse - name: Show files run: ls -lh wheelhouse shell: bash - name: Verify clean directory run: git diff --exit-code shell: bash - name: Upload wheels uses: actions/upload-artifact@v6 with: name: wheels-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.build_id }} path: wheelhouse/*.whl build_wheels_qemu_cp: name: ${{ matrix.arch }} ${{ matrix.build_cp }} ${{ matrix.build_sub }} runs-on: ubuntu-24.04 strategy: fail-fast: false matrix: arch: [riscv64] build_cp: [cp38, cp39, cp310, cp311, cp312, cp313, cp314] build_sub: [manylinux, musllinux] steps: - uses: actions/checkout@v6 with: submodules: true - name: Set up QEMU uses: docker/setup-qemu-action@v3 with: platforms: all - name: Build wheels with qemu uses: pypa/cibuildwheel@v3.3.1 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build_cp }}-${{ matrix.build_sub }}* CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4 EXTRA_CMAKE_ARGS="-DNCNN_XTHEADVECTOR=OFF" with: 
output-dir: wheelhouse - name: Show files run: ls -lh wheelhouse shell: bash - name: Verify clean directory run: git diff --exit-code shell: bash - name: Upload wheels uses: actions/upload-artifact@v6 with: name: wheels_qemu_cp-${{ matrix.arch }}-${{ matrix.build_cp }}-${{ matrix.build_sub }} path: wheelhouse/*.whl upload_all: permissions: contents: none name: Upload needs: [build_wheels, build_wheels_qemu_cp, build_sdist] runs-on: ubuntu-latest steps: - uses: actions/download-artifact@v8 with: path: dist merge-multiple: true - uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} ================================================ FILE: .github/workflows/release.yml ================================================ name: release on: push: tags: - '*' env: DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer IOS_DEPLOYMENT_TARGET: '13.0' MAC_DEPLOYMENT_TARGET: '11.0' MAC_CATALYST_DEPLOYMENT_TARGET: '13.1' WATCHOS_DEPLOYMENT_TARGET: '6.0' TVOS_DEPLOYMENT_TARGET: '11.0' VISIONOS_DEPLOYMENT_TARGET: '1.0' ENABLE_BITCODE: OFF ENABLE_ARC: OFF ENABLE_VISIBILITY: OFF EMSCRIPTEN_VERSION: 3.1.28 permissions: contents: read jobs: setup: permissions: contents: none runs-on: ubuntu-latest outputs: VERSION: ${{ steps.get_version.outputs.VERSION }} steps: - name: get-version id: get_version run: echo "VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_OUTPUT full-source: needs: [setup] runs-on: ubuntu-latest env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-full-source steps: - uses: actions/checkout@v6 with: submodules: true - name: package run: | rm -rf .git rm -f /tmp/${{ env.PACKAGENAME }}.zip zip -9 -y -r /tmp/${{ env.PACKAGENAME }}.zip . 
- name: upload-zip uses: actions/upload-artifact@v6 with: name: ${{ env.PACKAGENAME }} path: /tmp/${{ env.PACKAGENAME }}.zip ubuntu: needs: [setup] strategy: matrix: opt: - { shared-lib: OFF, os: ubuntu-22.04, id: ubuntu-2204 } - { shared-lib: OFF, os: ubuntu-24.04, id: ubuntu-2404 } - { shared-lib: ON, os: ubuntu-22.04, id: ubuntu-2204-shared } - { shared-lib: ON, os: ubuntu-24.04, id: ubuntu-2404-shared } runs-on: ${{ matrix.opt.os }} env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} steps: - uses: actions/checkout@v6 with: submodules: true - name: apt run: | sudo apt-get install -y libprotobuf-dev protobuf-compiler - name: build run: | mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - name: package run: | rm -rf ${{ env.PACKAGENAME }} mkdir -p ${{ env.PACKAGENAME }} cp -a build/install/* ${{ env.PACKAGENAME }} rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - name: upload-zip uses: actions/upload-artifact@v6 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip openmp-macos: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ steps: - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-macos-release-18.1.2-20251004 - name: checkout if: steps.cache-openmp.outputs.cache-hit != 'true' uses: actions/checkout@v6 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: build-x86_64 if: 
steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-x86_64 && cd build-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: build-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | rm -rf $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - name: upload uses: actions/upload-artifact@v6 with: name: openmp-macos path: openmp-install macos: needs: [setup, openmp-macos] strategy: matrix: opt: - { vulkan: OFF, id: macos } - { vulkan: ON, id: macos-vulkan } runs-on: macos-15-intel env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_TOOLS=OFF \ -DNCNN_BUILD_EXAMPLES=OFF \ 
-DNCNN_BUILD_BENCHMARK=OFF \ -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v6 with: submodules: true - name: download-openmp-macos uses: actions/download-artifact@v8 with: name: openmp-macos path: openmp-macos - name: install-openmp run: | sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - name: build-x86_64 run: | mkdir build-x86_64 && cd build-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: build-arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: package-openmp run: | rm -rf openmp.framework mkdir -p openmp.framework/Versions/A/Headers mkdir -p openmp.framework/Versions/A/Resources ln -s A openmp.framework/Versions/Current ln -s Versions/Current/Headers openmp.framework/Headers ln -s Versions/Current/Resources openmp.framework/Resources ln -s Versions/Current/openmp openmp.framework/openmp cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers mkdir -p glslang.framework/Versions/A/Resources ln -s A glslang.framework/Versions/Current ln -s Versions/Current/Headers glslang.framework/Headers ln -s Versions/Current/Resources glslang.framework/Resources ln -s Versions/Current/glslang glslang.framework/glslang libtool -static \ build-x86_64/install/lib/libglslang.a \ 
build-x86_64/install/lib/libSPIRV.a \ -o build-x86_64/install/lib/libglslang_combined.a libtool -static \ build-arm64/install/lib/libglslang.a \ build-arm64/install/lib/libSPIRV.a \ -o build-arm64/install/lib/libglslang_combined.a lipo -create build-x86_64/install/lib/libglslang_combined.a build-arm64/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers mkdir -p ncnn.framework/Versions/A/Resources ln -s A ncnn.framework/Versions/Current ln -s Versions/Current/Headers ncnn.framework/Headers ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - name: package if: matrix.opt.vulkan == 'OFF' run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - name: package if: matrix.opt.vulkan == 'ON' run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v6 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip openmp-ios: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ 
-DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ steps: - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-ios-release-18.1.2-20251004 - name: checkout if: steps.cache-openmp.outputs.cache-hit != 'true' uses: actions/checkout@v6 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: build-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install/strip - name: merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | rm -rf $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/lib cp openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - name: upload uses: actions/upload-artifact@v6 with: name: openmp-ios path: openmp-install ios: needs: [setup, openmp-ios] strategy: matrix: opt: - { vulkan: OFF, id: ios } - { vulkan: ON, id: ios-vulkan } runs-on: macos-15-intel env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v6 with: submodules: true - name: download-openmp-ios uses: actions/download-artifact@v8 with: name: openmp-ios path: openmp-ios - name: install-openmp run: | sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - name: build-arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install/strip - name: package-openmp run: | rm -rf openmp.framework mkdir -p openmp.framework/Versions/A/Headers mkdir -p openmp.framework/Versions/A/Resources ln -s A openmp.framework/Versions/Current ln -s Versions/Current/Headers openmp.framework/Headers ln -s Versions/Current/Resources openmp.framework/Resources ln -s Versions/Current/openmp openmp.framework/openmp cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers mkdir -p glslang.framework/Versions/A/Resources ln -s A glslang.framework/Versions/Current ln -s Versions/Current/Headers glslang.framework/Headers ln -s Versions/Current/Resources glslang.framework/Resources ln -s Versions/Current/glslang glslang.framework/glslang libtool -static \ build-arm64/install/lib/libglslang.a \ build-arm64/install/lib/libSPIRV.a \ -o build-arm64/install/lib/libglslang_combined.a cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers mkdir -p ncnn.framework/Versions/A/Resources ln -s A ncnn.framework/Versions/Current ln -s Versions/Current/Headers ncnn.framework/Headers ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn cp -a build-arm64/install/include/* 
ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - name: package if: matrix.opt.vulkan == 'OFF' run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - name: package if: matrix.opt.vulkan == 'ON' run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v6 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip openmp-ios-simulator: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ steps: - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-ios-simulator-release-18.1.2-20251004 - name: checkout if: steps.cache-openmp.outputs.cache-hit != 'true' uses: actions/checkout@v6 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget 
https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: build-x86_64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-x86_64 && cd build-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: build-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | rm -rf $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - name: upload uses: actions/upload-artifact@v6 with: name: openmp-ios-simulator path: openmp-install ios-simulator: needs: [setup, openmp-ios-simulator] strategy: matrix: opt: - { vulkan: OFF, id: ios-simulator } - { vulkan: ON, id: ios-simulator-vulkan } runs-on: macos-15-intel env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ 
-DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v6 with: submodules: true - name: download-openmp-ios-simulator uses: actions/download-artifact@v8 with: name: openmp-ios-simulator path: openmp-ios-simulator - name: install-openmp run: | sudo cp openmp-ios-simulator/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include sudo cp openmp-ios-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - name: build-x86_64 run: | mkdir build-x86_64 && cd build-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: build-arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install/strip - name: package-openmp run: | rm -rf openmp.framework mkdir -p openmp.framework/Versions/A/Headers mkdir -p openmp.framework/Versions/A/Resources ln -s A openmp.framework/Versions/Current ln -s Versions/Current/Headers openmp.framework/Headers ln -s Versions/Current/Resources openmp.framework/Resources ln -s Versions/Current/openmp openmp.framework/openmp cp openmp-ios-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers mkdir -p glslang.framework/Versions/A/Resources ln -s A glslang.framework/Versions/Current ln -s Versions/Current/Headers glslang.framework/Headers ln -s Versions/Current/Resources glslang.framework/Resources ln -s Versions/Current/glslang glslang.framework/glslang libtool -static \ build-x86_64/install/lib/libglslang.a \ build-x86_64/install/lib/libSPIRV.a \ -o build-x86_64/install/lib/libglslang_combined.a libtool -static \ build-arm64/install/lib/libglslang.a \ build-arm64/install/lib/libSPIRV.a \ -o build-arm64/install/lib/libglslang_combined.a lipo -create \ build-x86_64/install/lib/libglslang_combined.a \ build-arm64/install/lib/libglslang_combined.a \ -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers mkdir -p ncnn.framework/Versions/A/Resources ln -s A ncnn.framework/Versions/Current ln -s Versions/Current/Headers ncnn.framework/Headers 
ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create \ build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - name: package if: matrix.opt.vulkan == 'OFF' run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - name: package if: matrix.opt.vulkan == 'ON' run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v6 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip openmp-mac-catalyst: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ steps: - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-mac-catalyst-release-18.1.2-20251004 - name: checkout if: steps.cache-openmp.outputs.cache-hit != 'true' uses: actions/checkout@v6 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION 
}}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: build-x86_64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-x86_64 && cd build-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: build-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST_ARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install/strip - name: merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | rm -rf $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - name: upload uses: actions/upload-artifact@v6 with: name: openmp-mac-catalyst path: openmp-install mac-catalyst: needs: [setup, openmp-mac-catalyst] strategy: matrix: opt: - { vulkan: OFF, id: mac-catalyst } - { vulkan: ON, id: mac-catalyst-vulkan } runs-on: macos-15-intel env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v6 with: submodules: true - name: download-openmp-mac-catalyst uses: actions/download-artifact@v8 with: name: openmp-mac-catalyst path: openmp-mac-catalyst - name: install-openmp run: | sudo cp openmp-mac-catalyst/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include sudo cp openmp-mac-catalyst/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - name: build-x86_64 run: | mkdir 
build-x86_64 && cd build-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: build-arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST_ARM64 -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: package-openmp run: | rm -rf openmp.framework mkdir -p openmp.framework/Versions/A/Headers mkdir -p openmp.framework/Versions/A/Resources ln -s A openmp.framework/Versions/Current ln -s Versions/Current/Headers openmp.framework/Headers ln -s Versions/Current/Resources openmp.framework/Resources ln -s Versions/Current/openmp openmp.framework/openmp cp openmp-mac-catalyst/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers mkdir -p glslang.framework/Versions/A/Resources ln -s A glslang.framework/Versions/Current ln -s Versions/Current/Headers glslang.framework/Headers ln -s Versions/Current/Resources glslang.framework/Resources ln -s Versions/Current/glslang glslang.framework/glslang libtool -static \ build-x86_64/install/lib/libglslang.a \ build-x86_64/install/lib/libSPIRV.a \ -o build-x86_64/install/lib/libglslang_combined.a libtool -static \ build-arm64/install/lib/libglslang.a \ build-arm64/install/lib/libSPIRV.a \ -o build-arm64/install/lib/libglslang_combined.a lipo -create \ build-x86_64/install/lib/libglslang_combined.a \ build-arm64/install/lib/libglslang_combined.a \ -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e
's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers mkdir -p ncnn.framework/Versions/A/Resources ln -s A ncnn.framework/Versions/Current ln -s Versions/Current/Headers ncnn.framework/Headers ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create \ build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - name: package if: matrix.opt.vulkan == 'OFF' run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - name: package if: matrix.opt.vulkan == 'ON' run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v6 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip openmp-watchos: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ steps: - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-watchos-release-18.1.2-20251004 - name: checkout if: steps.cache-openmp.outputs.cache-hit != 'true' uses: actions/checkout@v6 - name: openmp if: 
steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: build-armv7k if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-armv7k && cd build-armv7k cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="armv7k" .. cmake --build . -j 4 cmake --build . --target install/strip - name: build-arm64_32 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64_32 && cd build-arm64_32 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="arm64_32" .. cmake --build . -j 4 cmake --build . 
--target install/strip - name: merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | rm -rf $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64_32/install/include $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-armv7k/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64_32/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - name: upload uses: actions/upload-artifact@v6 with: name: openmp-watchos path: openmp-install watchos: needs: [setup, openmp-watchos] runs-on: macos-15-intel env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-watchos NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ steps: - uses: actions/checkout@v6 - name: download-openmp-watchos uses: actions/download-artifact@v8 with: name: openmp-watchos path: openmp-watchos - name: install-openmp run: | sudo cp openmp-watchos/include/* $DEVELOPER_DIR/Platforms/WatchOS.platform/Developer/SDKs/WatchOS.sdk/usr/include sudo cp openmp-watchos/lib/libomp.a $DEVELOPER_DIR/Platforms/WatchOS.platform/Developer/SDKs/WatchOS.sdk/usr/lib - name: build-armv7k run: | mkdir build-armv7k && cd build-armv7k cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="armv7k" .. cmake --build . -j 4 cmake --build . 
--target install/strip - name: build-arm64_32 run: | mkdir build-arm64_32 && cd build-arm64_32 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="arm64_32" .. cmake --build . -j 4 cmake --build . --target install/strip - name: package-openmp run: | rm -rf openmp.framework mkdir -p openmp.framework/Versions/A/Headers mkdir -p openmp.framework/Versions/A/Resources ln -s A openmp.framework/Versions/Current ln -s Versions/Current/Headers openmp.framework/Headers ln -s Versions/Current/Resources openmp.framework/Resources ln -s Versions/Current/openmp openmp.framework/openmp cp openmp-watchos/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-watchos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers mkdir -p ncnn.framework/Versions/A/Resources ln -s A ncnn.framework/Versions/Current ln -s Versions/Current/Headers ncnn.framework/Headers ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create \ build-armv7k/install/lib/libncnn.a \ build-arm64_32/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn cp -a build-arm64_32/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v6 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip openmp-watchos-simulator: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ 
-DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ steps: - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-watchos-simulator-release-18.1.2-20251004 - name: checkout if: steps.cache-openmp.outputs.cache-hit != 'true' uses: actions/checkout@v6 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: build-x86_64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-x86_64 && cd build-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: build-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="arm64" .. 
cmake --build . -j 4 cmake --build . --target install/strip - name: merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | rm -rf $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - name: upload uses: actions/upload-artifact@v6 with: name: openmp-watchos-simulator path: openmp-install watchos-simulator: needs: [setup, openmp-watchos-simulator] runs-on: macos-15-intel env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ steps: - uses: actions/checkout@v6 - name: download-openmp-watchos-simulator uses: actions/download-artifact@v8 with: name: openmp-watchos-simulator path: openmp-watchos-simulator - name: install-openmp run: | sudo cp openmp-watchos-simulator/include/* $DEVELOPER_DIR/Platforms/WatchSimulator.platform/Developer/SDKs/WatchSimulator.sdk/usr/include sudo cp openmp-watchos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/WatchSimulator.platform/Developer/SDKs/WatchSimulator.sdk/usr/lib - name: build-x86_64 run: | mkdir build-x86_64 && cd build-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS 
}} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: build-arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . --target install/strip - name: package-openmp run: | rm -rf openmp.framework mkdir -p openmp.framework/Versions/A/Headers mkdir -p openmp.framework/Versions/A/Resources ln -s A openmp.framework/Versions/Current ln -s Versions/Current/Headers openmp.framework/Headers ln -s Versions/Current/Resources openmp.framework/Resources ln -s Versions/Current/openmp openmp.framework/openmp cp openmp-watchos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-watchos-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers mkdir -p ncnn.framework/Versions/A/Resources ln -s A ncnn.framework/Versions/Current ln -s Versions/Current/Headers ncnn.framework/Headers ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create \ build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v6 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip openmp-tvos: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | 
-DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # Restore a previously built libomp; the download/build/merge steps below
    # are all gated on a cache miss.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        # Derived from OPENMP_VERSION so that bumping the version cannot
        # silently restore a libomp built from the previous release.
        key: openmp-tvos-release-${{ env.OPENMP_VERSION }}-20251004
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Fetch the LLVM cmake + openmp source tarballs and apply two patches.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64e
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64e && cd build-arm64e
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Combine the per-architecture static libraries into one libomp.a.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64e/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs on cache hit and miss alike: the tvos job downloads this artifact.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-tvos
        path: openmp-install

  # Builds ncnn for tvOS (arm64 + arm64e), once without and once with Vulkan,
  # and packages the result as frameworks in a zip artifact.
  tvos:
    needs:
      - setup
      - openmp-tvos
    strategy:
      matrix:
        opt:
          - vulkan: OFF
            id: tvos
          - vulkan: ON
            id: tvos-vulkan
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: download-openmp-tvos
      uses: actions/download-artifact@v8
      with:
        name: openmp-tvos
        path: openmp-tvos
    # Copy the OpenMP headers and static library into the AppleTVOS SDK.
    - name: install-openmp
      run: |
        sudo cp openmp-tvos/include/* $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/include
        sudo cp openmp-tvos/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/lib
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64e
      run: |
        mkdir build-arm64e && cd build-arm64e
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Lay out openmp.framework (Versions/A + symlinks) around the fat libomp.a.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-tvos/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan builds only: merge libglslang + libSPIRV per arch, then lipo them.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        libtool -static \
            build-arm64e/install/lib/libglslang.a \
            build-arm64e/install/lib/libSPIRV.a \
            -o build-arm64e/install/lib/libglslang_combined.a
        lipo -create \
            build-arm64/install/lib/libglslang_combined.a \
            build-arm64e/install/lib/libglslang_combined.a \
            -o glslang.framework/Versions/A/glslang
        cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-arm64/install/lib/libncnn.a \
            build-arm64e/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Exactly one of the two "package" steps runs per matrix entry; the Vulkan
    # variant additionally ships glslang.framework.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds (or restores from cache) a static libomp for the tvOS simulator.
  openmp-tvos-simulator:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
path: openmp-install
        # Derived from OPENMP_VERSION so that bumping the version cannot
        # silently restore a libomp built from the previous release.
        key: openmp-tvos-simulator-release-${{ env.OPENMP_VERSION }}-20251004
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Fetch the LLVM cmake + openmp source tarballs and apply two patches.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    - name: build-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Combine the per-architecture static libraries into one libomp.a.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs on cache hit and miss alike: the tvos-simulator job downloads this artifact.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-tvos-simulator
        path: openmp-install

  # Builds ncnn for the tvOS simulator (x86_64 + arm64), once without and once
  # with Vulkan, and packages the result as frameworks in a zip artifact.
  tvos-simulator:
    needs:
      - setup
      - openmp-tvos-simulator
    strategy:
      matrix:
        opt:
          - vulkan: OFF
            id: tvos-simulator
          - vulkan: ON
            id: tvos-simulator-vulkan
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: download-openmp-tvos-simulator
      uses: actions/download-artifact@v8
      with:
        name: openmp-tvos-simulator
        path: openmp-tvos-simulator
    # Copy the OpenMP headers and static library into the AppleTVSimulator SDK.
    - name: install-openmp
      run: |
        sudo cp openmp-tvos-simulator/include/* $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/include
        sudo cp openmp-tvos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/lib
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Lay out openmp.framework (Versions/A + symlinks) around the fat libomp.a.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan builds only: merge libglslang + libSPIRV per arch, then lipo them.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-x86_64/install/lib/libglslang.a \
            build-x86_64/install/lib/libSPIRV.a \
            -o build-x86_64/install/lib/libglslang_combined.a
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        lipo -create \
            build-x86_64/install/lib/libglslang_combined.a \
            build-arm64/install/lib/libglslang_combined.a \
            -o glslang.framework/Versions/A/glslang
        cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-x86_64/install/lib/libncnn.a \
            build-arm64/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Exactly one of the two "package" steps runs per matrix entry; the Vulkan
    # variant additionally ships glslang.framework.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds (or restores from cache) a static libomp for visionOS (arm64 only).
  openmp-visionos:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        # Derived from OPENMP_VERSION so that bumping the version cannot
        # silently restore a libomp built from the previous release.
        key: openmp-visionos-release-${{ env.OPENMP_VERSION }}-20251004
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Fetch the LLVM cmake + openmp source tarballs and apply two patches.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build .
--target install/strip
    # Single architecture, so the library is copied rather than lipo-merged.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        cp openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs on cache hit and miss alike: the visionos job downloads this artifact.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-visionos
        path: openmp-install

  # Builds ncnn for visionOS (arm64), once without and once with Vulkan, and
  # packages the result as frameworks in a zip artifact.
  visionos:
    needs:
      - setup
      - openmp-visionos
    strategy:
      matrix:
        opt:
          - vulkan: OFF
            id: visionos
          - vulkan: ON
            id: visionos-vulkan
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: download-openmp-visionos
      uses: actions/download-artifact@v8
      with:
        name: openmp-visionos
        path: openmp-visionos
    # Copy the OpenMP headers and static library into the XROS SDK.
    - name: install-openmp
      run: |
        sudo cp openmp-visionos/include/* $DEVELOPER_DIR/Platforms/XROS.platform/Developer/SDKs/XROS.sdk/usr/include
        sudo cp openmp-visionos/lib/libomp.a $DEVELOPER_DIR/Platforms/XROS.platform/Developer/SDKs/XROS.sdk/usr/lib
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Lay out openmp.framework (Versions/A + symlinks) around libomp.a.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-visionos/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-visionos/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan builds only: merge libglslang + libSPIRV into one static library.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang
        cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn
        cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Exactly one of the two "package" steps runs per matrix entry; the Vulkan
    # variant additionally ships glslang.framework.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds (or restores from cache) a static libomp for the visionOS simulator.
  openmp-visionos-simulator:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        # Derived from OPENMP_VERSION so that bumping the version cannot
        # silently restore a libomp built from the previous release.
        key: openmp-visionos-simulator-release-${{ env.OPENMP_VERSION }}-20251004
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget
https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    - name: build-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Combine the per-architecture static libraries into one libomp.a.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs on cache hit and miss alike: the visionos-simulator job downloads this artifact.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-visionos-simulator
        path: openmp-install

  # Builds ncnn for the visionOS simulator (x86_64 + arm64), once without and
  # once with Vulkan, and packages the result as frameworks in a zip artifact.
  visionos-simulator:
    needs:
      - setup
      - openmp-visionos-simulator
    strategy:
      matrix:
        opt:
          - vulkan: OFF
            id: visionos-simulator
          - vulkan: ON
            id: visionos-simulator-vulkan
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: download-openmp-visionos-simulator
      uses: actions/download-artifact@v8
      with:
        name: openmp-visionos-simulator
        path: openmp-visionos-simulator
    # Copy the OpenMP headers and static library into the XRSimulator SDK.
    - name: install-openmp
      run: |
        sudo cp openmp-visionos-simulator/include/* $DEVELOPER_DIR/Platforms/XRSimulator.platform/Developer/SDKs/XRSimulator.sdk/usr/include
        sudo cp openmp-visionos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/XRSimulator.platform/Developer/SDKs/XRSimulator.sdk/usr/lib
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Lay out openmp.framework (Versions/A + symlinks) around the fat libomp.a.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-visionos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-visionos-simulator/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan builds only: merge libglslang + libSPIRV per arch, then lipo them.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-x86_64/install/lib/libglslang.a \
            build-x86_64/install/lib/libSPIRV.a \
            -o build-x86_64/install/lib/libglslang_combined.a
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        lipo -create \
            build-x86_64/install/lib/libglslang_combined.a \
            build-arm64/install/lib/libglslang_combined.a \
            -o glslang.framework/Versions/A/glslang
        cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-x86_64/install/lib/libncnn.a \
            build-arm64/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Exactly one of the two "package" steps runs per matrix entry; the Vulkan
    # variant additionally ships glslang.framework.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds ncnn with the Android NDK toolchain for every combination of
  # Vulkan on/off and static/shared library.
  android:
    needs:
      - setup
    strategy:
      matrix:
        opt:
          - vulkan: OFF
            shared-lib: OFF
            id: android
          - vulkan: OFF
            shared-lib: ON
            id: android-shared
          - vulkan: ON
            shared-lib: OFF
            id: android-vulkan
          - vulkan: ON
            shared-lib: ON
            id: android-vulkan-shared
    runs-on: ubuntu-latest
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake \
        -DANDROID_PLATFORM=android-21 \
        -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False \
        -DANDROID_SUPPORT_FLEXIBLE_PAGE_SIZES=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_INSTALL_PREFIX=install \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \
        -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} \
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Deletes the lines matching the sed pattern from the NDK legacy toolchain file.
    # NOTE(review): whitespace inside the pattern may have been collapsed in
    # the copy this was reviewed from - confirm against the checked-in file.
    - name: ndk-fix-debug
      run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake
    - name: build-armeabi-v7a
      run: |
        mkdir build-armeabi-v7a &&
cd build-armeabi-v7a
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    # The ABI flag is separated from the expanded options by a space in every
    # build step, so the command line does not rely on NCNN_CMAKE_OPTIONS
    # ending in a backslash-newline to keep the arguments apart.
    - name: build-arm64-v8a
      run: |
        mkdir build-arm64-v8a && cd build-arm64-v8a
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-x86
      run: |
        mkdir build-x86 && cd build-x86
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86" ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-riscv64
      run: |
        mkdir build-riscv64 && cd build-riscv64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    # Collect each ABI's install tree under one directory named after the ABI.
    - name: package
      run: |
        rm -rf ${{ env.PACKAGENAME }}
        mkdir -p ${{ env.PACKAGENAME }}
        cp -a build-armeabi-v7a/install ${{ env.PACKAGENAME }}/armeabi-v7a
        cp -a build-arm64-v8a/install ${{ env.PACKAGENAME }}/arm64-v8a
        cp -a build-x86/install ${{ env.PACKAGENAME }}/x86
        cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64
        cp -a build-riscv64/install ${{ env.PACKAGENAME }}/riscv64
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds four Emscripten variants of ncnn (basic, simd, threads,
  # simd-threads) and packages them into a single zip.
  webassembly:
    needs:
      - setup
    runs-on: ubuntu-latest
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly
    steps:
    - uses: actions/checkout@v6
    - name: emsdk
      run: |
        git clone https://github.com/emscripten-core/emsdk.git
        cd emsdk
        ./emsdk install $EMSCRIPTEN_VERSION
        ./emsdk activate $EMSCRIPTEN_VERSION
    # Baseline variant: threads, OpenMP and SSE2 all disabled.
    - name: build
      run: |
        source emsdk/emsdk_env.sh
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
            -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    # Same as the baseline but with NCNN_SSE2=ON.
    - name: build-simd
      run: |
        source emsdk/emsdk_env.sh
        mkdir build-simd && cd build-simd
        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
            -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    # Same as the baseline but with threads, OpenMP and simpleomp enabled.
    - name: build-threads
      run: |
        source emsdk/emsdk_env.sh
        mkdir build-threads && cd build-threads
        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
            -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    # Threads/OpenMP and SSE2 both enabled.
    - name: build-simd-threads
      run: |
        source emsdk/emsdk_env.sh
        mkdir build-simd-threads && cd build-simd-threads
        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
            -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: package
      run: |
        rm -rf ${{ env.PACKAGENAME }}
        mkdir -p ${{ env.PACKAGENAME }}
        cp -a build/install ${{ env.PACKAGENAME }}/basic
        cp -a build-simd/install ${{ env.PACKAGENAME }}/simd
        cp -a build-threads/install ${{ env.PACKAGENAME }}/threads
        cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds ncnn with four MSVC toolsets, each as a static and a shared library.
  windows:
    needs:
      - setup
    strategy:
      matrix:
        opt:
          - { shared-lib: OFF, os: windows-2022, toolset-version: v140, windows-sdk-version: 22621, id: vs2015 }
          - { shared-lib: OFF, os: windows-2022, toolset-version: v141, windows-sdk-version: 22621, id: vs2017 }
          - { shared-lib: OFF, os: windows-2022, toolset-version: v142, windows-sdk-version: 22621, id: vs2019 }
          - { shared-lib: OFF, os: windows-2022, toolset-version: v143, windows-sdk-version: 26100, id: vs2022 }
          - { shared-lib: ON, os: windows-2022, toolset-version: v140, windows-sdk-version: 22621, id: vs2015-shared }
          - { shared-lib: ON, os: windows-2022, toolset-version: v141, windows-sdk-version: 22621, id: vs2017-shared }
          - { shared-lib: ON, os: windows-2022, toolset-version: v142, windows-sdk-version: 22621, id: vs2019-shared }
          - { shared-lib: ON, os: windows-2022,
toolset-version: v143, windows-sdk-version: 26100, id: vs2022-shared }
    runs-on: ${{ matrix.opt.os }}
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-${{ matrix.opt.id }}
      UseMultiToolTask: true
      NCNN_CMAKE_OPTIONS: |
        -T ${{ matrix.opt.toolset-version }},host=x64 `
        -DCMAKE_BUILD_TYPE=Release `
        -DCMAKE_INSTALL_PREFIX=install `
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" `
        -DNCNN_BUILD_EXAMPLES=OFF `
        -DNCNN_BUILD_TOOLS=ON `
        -DNCNN_BUILD_BENCHMARK=OFF `
        -DNCNN_VULKAN=ON `
        -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} `
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    # v141 only: add the VC v141 component to the Visual Studio install that vswhere reports.
    - name: Install VS 2017 (v141) Build Tools
      if: matrix.opt.toolset-version == 'v141'
      run: |
        $vsInstallPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath
        Start-Process -FilePath "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vs_installer.exe" -ArgumentList "modify --installPath `"$vsInstallPath`" --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 --quiet --norestart --nocache" -Wait
    # v140 only: install standalone build tools, then export the PATH / INCLUDE /
    # LIB values produced by vcvars64.bat to later steps through GITHUB_ENV.
    - name: Install and Setup VS 2015 (v140) Build Tools
      if: matrix.opt.toolset-version == 'v140'
      run: |
        $vs140Path = "C:/vs140_build_tools"
        Invoke-WebRequest -Uri "https://aka.ms/vs/15/release/vs_buildtools.exe" -OutFile vs_buildtools.exe
        Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--installPath `"$vs140Path`" --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.140 --quiet --wait --norestart --nocache" -Wait
        $vcvarsPath = (Get-ChildItem -Path $vs140Path -Filter "vcvars64.bat" -Recurse | Select-Object -First 1).FullName
        $cmd = "`"$vcvarsPath`" && powershell -Command `"`$env:PATH;`$env:INCLUDE;`$env:LIB`""
        $output = cmd.exe /c $cmd
        $lines = $output -split "`r`n"
        echo "PATH=$($lines[0]);$($env:PATH)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
        echo "INCLUDE=$($lines[1])" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
        echo "LIB=$($lines[2])" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
    - uses: GuillaumeFalourd/setup-windows10-sdk-action@v2.4
      with:
        sdk-version: ${{ matrix.opt.windows-sdk-version }}
    # protobuf is cached per toolset; the build step below runs only on a miss.
    - name: cache-protobuf
      id: cache-protobuf
      uses: actions/cache@v5
      with:
        path: "protobuf-install"
        key: protobuf-${{ matrix.opt.toolset-version }}-x86-x64-install
    - name: protobuf
      if: steps.cache-protobuf.outputs.cache-hit != 'true'
      run: |
        Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip
        7z x ./protobuf-3.11.2.zip
        cd protobuf-3.11.2
        mkdir build-x86; cd build-x86;
        cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A Win32,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
        cd ..
        mkdir build-x64; cd build-x64;
        cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
    - name: build-x86
      run: |
        mkdir build-x86; cd build-x86
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A Win32,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" ..
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
    - name: build-x64
      run: |
        mkdir build-x64; cd build-x64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A x64,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" ..
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
    # arm64 is built only for the v142 and v143 toolsets, and without protobuf_DIR.
    - name: build-arm64
      if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143'
      run: |
        mkdir build-arm64; cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm64,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 ..
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
    # Two mutually exclusive "package" steps: x86 + x64 for v140/v141,
    # x86 + x64 + arm64 for v142/v143.
    - name: package
      if: matrix.opt.toolset-version == 'v140' || matrix.opt.toolset-version == 'v141'
      run: |
        mkdir ${{ env.PACKAGENAME }}
        mkdir ${{ env.PACKAGENAME }}/x86
        mkdir ${{ env.PACKAGENAME }}/x64
        Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86"
        Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64"
        7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
    - name: package
      if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143'
      run: |
        mkdir ${{ env.PACKAGENAME }}
        mkdir ${{ env.PACKAGENAME }}/x86
        mkdir ${{ env.PACKAGENAME }}/x64
        mkdir ${{ env.PACKAGENAME }}/arm64
        Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86"
        Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64"
        Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64"
        7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Runs after all the Apple platform jobs listed in `needs` and downloads
  # the workflow's artifacts.
  apple:
    needs: [setup, macos, ios, ios-simulator, mac-catalyst, watchos, watchos-simulator, tvos, tvos-simulator, visionos, visionos-simulator]
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-apple
    steps:
    - run: sudo xcode-select --switch /Applications/Xcode_16.4.0.app
    - name: download
      uses: actions/download-artifact@v8
      with:
        path: artifacts
    - name: unzip
      run: |
        mkdir -p ncnn-ios
        mkdir -p ncnn-ios-vulkan
        mkdir -p ncnn-ios-simulator
mkdir -p ncnn-ios-simulator-vulkan
        mkdir -p ncnn-mac-catalyst
        mkdir -p ncnn-mac-catalyst-vulkan
        mkdir -p ncnn-macos
        mkdir -p ncnn-macos-vulkan
        mkdir -p ncnn-tvos
        mkdir -p ncnn-tvos-vulkan
        mkdir -p ncnn-tvos-simulator
        mkdir -p ncnn-tvos-simulator-vulkan
        mkdir -p ncnn-visionos
        mkdir -p ncnn-visionos-vulkan
        mkdir -p ncnn-visionos-simulator
        mkdir -p ncnn-visionos-simulator-vulkan
        mkdir -p ncnn-watchos
        mkdir -p ncnn-watchos-simulator
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios/ncnn-${{ needs.setup.outputs.VERSION }}-ios.zip -d ncnn-ios
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan.zip -d ncnn-ios-vulkan
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator.zip -d ncnn-ios-simulator
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan.zip -d ncnn-ios-simulator-vulkan
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst.zip -d ncnn-mac-catalyst
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan.zip -d ncnn-mac-catalyst-vulkan
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos/ncnn-${{ needs.setup.outputs.VERSION }}-macos.zip -d ncnn-macos
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan.zip -d ncnn-macos-vulkan
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos/ncnn-${{ needs.setup.outputs.VERSION }}-tvos.zip -d ncnn-tvos
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan.zip -d ncnn-tvos-vulkan
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator.zip -d ncnn-tvos-simulator
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan.zip -d ncnn-tvos-simulator-vulkan
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos/ncnn-${{ needs.setup.outputs.VERSION }}-visionos.zip -d ncnn-visionos
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan.zip -d ncnn-visionos-vulkan
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator.zip -d ncnn-visionos-simulator
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan.zip -d ncnn-visionos-simulator-vulkan
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos
        unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator
    # CPU-only bundle: combine the per-platform openmp and ncnn frameworks into
    # xcframeworks and zip them as <PACKAGENAME>.zip.
    # (Step name corrected from the misspelled "create-xcframwork"; it is a display name only.)
    - name: create-xcframework
      run: |
        rm -rf openmp.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos/openmp.framework \
            -framework ncnn-ios/openmp.framework \
            -framework ncnn-ios-simulator/openmp.framework \
            -framework ncnn-mac-catalyst/openmp.framework \
            -framework ncnn-watchos/openmp.framework \
            -framework ncnn-watchos-simulator/openmp.framework \
            -framework ncnn-tvos/openmp.framework \
            -framework ncnn-tvos-simulator/openmp.framework \
            -framework ncnn-visionos/openmp.framework \
            -framework ncnn-visionos-simulator/openmp.framework \
            -output openmp.xcframework
        rm -rf ncnn.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos/ncnn.framework \
            -framework ncnn-ios/ncnn.framework \
            -framework ncnn-ios-simulator/ncnn.framework \
            -framework ncnn-mac-catalyst/ncnn.framework \
            -framework ncnn-watchos/ncnn.framework \
            -framework ncnn-watchos-simulator/ncnn.framework \
            -framework ncnn-tvos/ncnn.framework \
            -framework ncnn-tvos-simulator/ncnn.framework \
            -framework ncnn-visionos/ncnn.framework \
            -framework ncnn-visionos-simulator/ncnn.framework \
            -output ncnn.xcframework
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework ncnn.xcframework
    # Vulkan bundle: same procedure using the *-vulkan trees, plus a glslang xcframework.
    # The watchos inputs come from the non-vulkan trees (no watchos vulkan tree is unpacked
    # above), and the openmp bundle also takes visionos from the non-vulkan trees.
    # This step deletes and rebuilds openmp.xcframework / ncnn.xcframework, so it must run
    # after the CPU-only zip has been written.
    - name: create-xcframework-vulkan
      run: |
        rm -rf openmp.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos-vulkan/openmp.framework \
            -framework ncnn-ios-vulkan/openmp.framework \
            -framework ncnn-ios-simulator-vulkan/openmp.framework \
            -framework ncnn-mac-catalyst-vulkan/openmp.framework \
            -framework ncnn-watchos/openmp.framework \
            -framework ncnn-watchos-simulator/openmp.framework \
            -framework ncnn-tvos-vulkan/openmp.framework \
            -framework ncnn-tvos-simulator-vulkan/openmp.framework \
            -framework ncnn-visionos/openmp.framework \
            -framework ncnn-visionos-simulator/openmp.framework \
            -output openmp.xcframework
        rm -rf glslang.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos-vulkan/glslang.framework \
            -framework ncnn-ios-vulkan/glslang.framework \
            -framework ncnn-ios-simulator-vulkan/glslang.framework \
            -framework ncnn-mac-catalyst-vulkan/glslang.framework \
            -framework ncnn-tvos-vulkan/glslang.framework \
            -framework ncnn-tvos-simulator-vulkan/glslang.framework \
            -framework ncnn-visionos-vulkan/glslang.framework \
            -framework ncnn-visionos-simulator-vulkan/glslang.framework \
            -output glslang.xcframework
        rm -rf ncnn.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos-vulkan/ncnn.framework \
            -framework ncnn-ios-vulkan/ncnn.framework \
            -framework ncnn-ios-simulator-vulkan/ncnn.framework \
            -framework ncnn-mac-catalyst-vulkan/ncnn.framework \
            -framework ncnn-watchos/ncnn.framework \
            -framework ncnn-watchos-simulator/ncnn.framework \
            -framework ncnn-tvos-vulkan/ncnn.framework \
            -framework ncnn-tvos-simulator-vulkan/ncnn.framework \
            -framework ncnn-visionos-vulkan/ncnn.framework \
            -framework ncnn-visionos-simulator-vulkan/ncnn.framework \
            -output ncnn.xcframework
        rm -f ${{ env.PACKAGENAME }}-vulkan.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}-vulkan.zip openmp.xcframework glslang.xcframework ncnn.xcframework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip
    - name: upload-zip-vulkan
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}-vulkan
        path: ${{ env.PACKAGENAME }}-vulkan.zip

  # Final job: downloads every artifact and attaches all zips to a release
  # tagged with the version produced by the setup job.
  release:
    permissions:
      contents: write  # for softprops/action-gh-release to create a release
    needs: [setup, full-source, ubuntu, macos, ios, ios-simulator, mac-catalyst, watchos, watchos-simulator, tvos, tvos-simulator, android, webassembly, windows, apple]
    runs-on: ubuntu-latest
    steps:
    - name: download
      uses: actions/download-artifact@v8
      with:
        path: artifacts
    - name: create-release
      uses: softprops/action-gh-release@v2
      with:
        token: ${{ secrets.GITHUB_TOKEN }}
        tag_name: ${{ needs.setup.outputs.VERSION }}
        name: Release ${{ needs.setup.outputs.VERSION }}
        files: artifacts/*/*.zip

================================================
FILE: .github/workflows/sync-wiki.yml
================================================
# Pushes the docs/ directory to the ncnn.wiki repository whenever docs change on master.
name: sync-wiki
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/sync-wiki.yml'
    - 'docs/**'
concurrency:
  group: sync-wiki-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  sync-wiki:
    permissions:
      contents: write  # for Git to git push
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
    # Copies docs/ into a fresh directory and initialises a new repository there.
    - name: sync
      run: |
        cp -r docs $GITHUB_WORKSPACE/ncnn.wiki
        cd $GITHUB_WORKSPACE/ncnn.wiki
        git config --global user.name "wiki-sync-bot"
        git config --global user.email "wiki-sync-bot@qq.com"
        git init
        git add .
git commit -m "sync"
        git remote add upstream https://${{ secrets.WIKI_SYNC_BOT_TOKEN }}@github.com/Tencent/ncnn.wiki.git
        git push upstream master -f

================================================
FILE: .github/workflows/test-coverage.yml
================================================
# Builds ncnn with NCNN_COVERAGE=ON for several CPU/GPU configurations, runs ctest,
# collects line coverage with lcov and uploads it through the codecov action.
name: test-coverage
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/test-coverage.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/**'
    - 'tests/**'
    - 'toolchains/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/test-coverage.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/**'
    - 'tests/**'
    - 'toolchains/**'
    - 'glslang'
concurrency:
  group: test-coverage-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Vulkan-enabled build on the self-hosted runner labelled t4.
  linux-gcc-gpu-t4:
    runs-on: [self-hosted, linux, t4]
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: build
      env:
        CC: gcc
        CXX: g++
        LD_LIBRARY_PATH: /data/action/install/lib64
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_VULKAN=ON -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_AVXNECONVERT=OFF -DNCNN_AVX512=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 4
    - name: test
      env:
        LD_LIBRARY_PATH: /data/action/install/lib64
      run: cd build && ctest --output-on-failure -j 4
    - name: lcov-collect
      run: |
        cd build
        lcov -d ./src -c -o lcov.info
        lcov -r lcov.info '/usr/*' -o lcov.info
        lcov -r lcov.info '*/install/*' -o lcov.info
        lcov -r lcov.info '*/build/*' -o lcov.info
        lcov --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/.local/bin/codecov
        files: build/lcov.info

  # x86-64 matrix: each row switches on one more instruction-set option, from none up to AVX512VNNI.
  # Every row is built and tested twice: without OpenMP (build/) and with OpenMP (build-openmp/).
  linux-gcc-x64:
    name: x64-${{ matrix.name }}
    runs-on: [self-hosted, linux, ubuntu25]
    strategy:
      fail-fast: false
      matrix:
        include:
          - { name: 'none', SSE2: OFF, AVX: OFF, F16C: OFF, FMA: OFF, AVX2: OFF, AVX512: OFF, AVX512VNNI: OFF }
          - { name: 'sse2', SSE2: ON, AVX: OFF, F16C: OFF, FMA: OFF, AVX2: OFF, AVX512: OFF, AVX512VNNI: OFF }
          - { name: 'avx', SSE2: ON, AVX: ON, F16C: OFF, FMA: OFF, AVX2: OFF, AVX512: OFF, AVX512VNNI: OFF }
          - { name: 'avx2', SSE2: ON, AVX: ON, F16C: ON, FMA: ON, AVX2: ON, AVX512: OFF, AVX512VNNI: OFF }
          - { name: 'avx512', SSE2: ON, AVX: ON, F16C: ON, FMA: ON, AVX2: ON, AVX512: ON, AVX512VNNI: OFF }
          - { name: 'avx512vnni', SSE2: ON, AVX: ON, F16C: ON, FMA: ON, AVX2: ON, AVX512: ON, AVX512VNNI: ON }
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_SSE2=${{ matrix.SSE2 }} \
            -DNCNN_AVX=${{ matrix.AVX }} \
            -DNCNN_F16C=${{ matrix.F16C }} \
            -DNCNN_FMA=${{ matrix.FMA }} \
            -DNCNN_AVX2=${{ matrix.AVX2 }} \
            -DNCNN_AVX512=${{ matrix.AVX512 }} \
            -DNCNN_AVX512VNNI=${{ matrix.AVX512VNNI }} \
            -DNCNN_XOP=OFF \
            -DNCNN_AVXVNNI=OFF \
            -DNCNN_AVX512BF16=OFF \
            -DNCNN_AVX512FP16=OFF \
            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    - name: test
      run: |
        cd build
        ctest --output-on-failure -j 8
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: build-openmp
      run: |
        mkdir build-openmp && cd build-openmp
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_SSE2=${{ matrix.SSE2 }} \
            -DNCNN_AVX=${{ matrix.AVX }} \
            -DNCNN_F16C=${{ matrix.F16C }} \
            -DNCNN_FMA=${{ matrix.FMA }} \
            -DNCNN_AVX2=${{ matrix.AVX2 }} \
            -DNCNN_AVX512=${{ matrix.AVX512 }} \
            -DNCNN_AVX512VNNI=${{ matrix.AVX512VNNI }} \
            -DNCNN_XOP=OFF \
            -DNCNN_AVXVNNI=OFF \
            -DNCNN_AVX512BF16=OFF \
            -DNCNN_AVX512FP16=OFF \
            -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    # The two OpenMP steps were previously named "test" / "lcov-collect", identical to the
    # non-OpenMP steps above; they now use the -openmp suffix already used by linux-gcc-cross.
    # OpenMP is limited to a single thread while ctest itself runs 8 tests in parallel.
    - name: test-openmp
      run: |
        export OMP_THREAD_LIMIT=1
        export OMP_NUM_THREADS=1
        cd build-openmp
        ctest --output-on-failure -j 8
    - name: lcov-collect-openmp
      run: |
        cd build-openmp
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build-openmp/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info,build-openmp/lcov.info

  # Build with NCNN_SIMPLESTL and NCNN_SIMPLEMATH enabled, using the host-c.gcc toolchain file.
  linux-gcc-x64-simplestl-simplemath:
    name: simplestl-simplemath
    runs-on: [self-hosted, linux, ubuntu25]
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake \
            -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON \
            -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    - name: test
      run: |
        cd build
        ctest --output-on-failure -j 8
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info

  # Fixed-ISA builds whose tests are launched through the sde64 loader,
  # passing the matrix cpu id (hsw / adl / arl / spr) as the loader's first argument.
  linux-gcc-x64-sde:
    name: sde-${{ matrix.cpu }}
    runs-on: [self-hosted, linux, ubuntu25]
    env:
      SDE_PATH: /data/action/osd/sde-external-9.33.0-2024-01-07-lin
    strategy:
      fail-fast: false
      matrix:
        include:
          - { cpu: hsw, AVX2: ON, AVXVNNI: OFF, AVXVNNIINT8: OFF, AVXNECONVERT: OFF, AVX512: OFF, AVX512VNNI: OFF, AVX512BF16: OFF, AVX512FP16: OFF }
          - { cpu: adl, AVX2: ON, AVXVNNI: ON, AVXVNNIINT8: OFF, AVXNECONVERT: OFF, AVX512: OFF, AVX512VNNI: OFF, AVX512BF16: OFF, AVX512FP16: OFF }
          - { cpu: arl, AVX2: ON, AVXVNNI: ON, AVXVNNIINT8: ON, AVXNECONVERT: ON, AVX512: OFF, AVX512VNNI: OFF, AVX512BF16: OFF, AVX512FP16: OFF }
          - { cpu: spr, AVX2: ON, AVXVNNI: OFF, AVXVNNIINT8: OFF, AVXNECONVERT: OFF, AVX512: ON, AVX512VNNI: ON, AVX512BF16: ON, AVX512FP16: ON }
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_AVX=ON \
            -DNCNN_F16C=ON \
            -DNCNN_XOP=OFF \
            -DNCNN_AVX2=${{ matrix.AVX2 }} \
            -DNCNN_AVXVNNI=${{ matrix.AVXVNNI }} \
            -DNCNN_AVXVNNIINT8=${{ matrix.AVXVNNIINT8 }} \
            -DNCNN_AVXNECONVERT=${{ matrix.AVXNECONVERT }} \
            -DNCNN_AVX512=${{ matrix.AVX512 }} \
            -DNCNN_AVX512VNNI=${{ matrix.AVX512VNNI }} \
            -DNCNN_AVX512BF16=${{ matrix.AVX512BF16 }} \
            -DNCNN_AVX512FP16=${{ matrix.AVX512FP16 }} \
            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-${{ matrix.cpu }};--" ctest --output-on-failure -j 8
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info

  # One build with default ISA options (NCNN_RUNTIME_CPU is not overridden here),
  # tested repeatedly through sde64 with a different cpu id per step.
  linux-gcc-x64-sde-combined:
    name: sde-combined
    runs-on: [self-hosted, linux, ubuntu25]
    env:
      SDE_PATH: /data/action/osd/sde-external-9.33.0-2024-01-07-lin
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    - name: test-p4p
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-p4p;--" ctest --output-on-failure -j 8
    - name: test-snb
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-snb;--" ctest --output-on-failure -j 8
    - name: test-hsw
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-hsw;--" ctest --output-on-failure -j 8
    - name: test-adl
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-adl;--" ctest --output-on-failure -j 8
    - name: test-arl
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-arl;--" ctest --output-on-failure -j 8
    - name: test-skx
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-skx;--" ctest --output-on-failure -j 8
    - name: test-spr
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 8
    - name: test-gnr
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-gnr;--" ctest --output-on-failure -j 8
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info

  # RISC-V vector build (RVV + ZFH + ZVFH), with and without OpenMP, tested under
  # qemu-riscv64 at vlen=256 and vlen=128. Coverage is read with the cross gcov tool.
  linux-gcc-riscv64-rvv:
    strategy:
      matrix:
        openmp: [ON, OFF]
    runs-on: [self-hosted, linux, ubuntu]
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/riscv
        mkdir build
        cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_ZFH=ON -DNCNN_ZVFH=ON -DNCNN_OPENMP=${{ matrix.openmp }} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    - name: test-vlen256
      run: |
        export PATH=/data/action/osd/qemu-install/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8
    - name: test-vlen128
      run: |
        export PATH=/data/action/osd/qemu-install/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8
    - name: lcov-collect
      run: |
        cd build
        lcov --gcov-tool /data/action/osd/riscv/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info
        lcov -r lcov.info '/usr/*' -o lcov.info
        lcov -r lcov.info '*/install/*' -o lcov.info
        lcov -r lcov.info '*/build/*' -o lcov.info
        lcov --list lcov.info
    # NOTE(review): unlike the other jobs, no `binary:` path is passed to the codecov action here - confirm this is intended.
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        files: build/lcov.info

  # Vulkan build; the test step sets LP_NUM_THREADS=4 (presumably the llvmpipe thread count - confirm).
  linux-gpu-llvmpipe:
    runs-on: [self-hosted, linux, ubuntu25]
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVXVNNI=OFF -DNCNN_AVXNECONVERT=OFF -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=OFF -DNCNN_AVX512FP16=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    - name: test
      run: |
        export LP_NUM_THREADS=4
        cd build && ctest --output-on-failure -j 8
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info

  # Vulkan build tested against SwiftShader, selected through VK_ICD_FILENAMES.
  # SwiftShader is built from a pinned commit and cached under swiftshader-install.
  linux-gpu-swiftshader:
    runs-on: [self-hosted, linux, ubuntu25]
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: cache-swiftshader
      id: cache-swiftshader
      uses: actions/cache@v5
      with:
        path: swiftshader-install
        key: swiftshader-linux-install-20250508
    - name: checkout-swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
      with:
        repository: google/swiftshader
        path: swiftshader
        ref: 930d46d31b5d637f313fd5ef55da2bbf053c26c1
    - name: swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      run: |
        cd swiftshader
        git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive
        mkdir -p build; cd build
        cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
        cmake --build . -j 8
        mkdir $GITHUB_WORKSPACE/swiftshader-install
        cp Linux/* $GITHUB_WORKSPACE/swiftshader-install
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVXVNNI=OFF -DNCNN_AVXNECONVERT=OFF -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=OFF -DNCNN_AVX512FP16=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    # Writes a SwiftShader.ini with ThreadCount=1 next to the test binaries before running ctest.
    - name: test
      run: |
        printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini
        export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
        cd build && ctest --output-on-failure -j 8
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info

  # Cross-compiled targets, tested through the user-mode qemu binary named in each row.
  # Each row supplies the toolchain file, extra cmake flags, and the qemu loader with its
  # arguments. Every row is built twice: without OpenMP (build/) and with OpenMP (build-openmp/).
  linux-gcc-cross:
    name: ${{ matrix.arch }}
    runs-on: [self-hosted, linux, ubuntu25]
    strategy:
      fail-fast: false
      matrix:
        include:
          - arch: arm
            toolchain: arm-linux-gnueabi.toolchain.cmake
            extra-cmake-args: -DNCNN_VFPV4=ON
            qemu: qemu-arm-static
            qemu-args: "-L;/usr/arm-linux-gnueabi"
          - arch: arm-noinlineasm
            toolchain: arm-linux-gnueabi.toolchain.cmake
            extra-cmake-args: -DNCNN_GNU_INLINE_ASM=OFF -DNCNN_VFPV4=ON
            qemu: qemu-arm-static
            qemu-args: "-L;/usr/arm-linux-gnueabi"
          - arch: armhf-vfpv3-d16
            toolchain: arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake
            extra-cmake-args: -DNCNN_VFPV4=OFF
            qemu: qemu-arm-static
            qemu-args: "-L;/usr/arm-linux-gnueabihf"
          - arch: armhf-vfpv3-d16-noinlineasm
            toolchain: arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake
            extra-cmake-args: -DNCNN_GNU_INLINE_ASM=OFF -DNCNN_VFPV4=OFF
            qemu: qemu-arm-static
            qemu-args: "-L;/usr/arm-linux-gnueabihf"
          - arch: aarch64-armv8.0
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_ARM82=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"
          - arch: aarch64-armv8.2
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_ARM82DOT=OFF -DNCNN_ARM82FP16FML=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"
          - arch: aarch64-armv8.4
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_ARM84BF16=OFF -DNCNN_ARM84I8MM=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"
          - arch: aarch64-armv8.6
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_ARM86SVE=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"
          - arch: aarch64-armv8.6-noinlineasm
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_GNU_INLINE_ASM=OFF -DNCNN_ARM86SVE=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"
          - arch: mipsisa32r6el
            toolchain: mipsisa32r6el-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_MSA=OFF -DNCNN_MMI=OFF
            qemu: qemu-mipsel-static
            qemu-args: "-L;/usr/mipsisa32r6el-linux-gnu"
          - arch: mipsisa64r6el
            toolchain: mipsisa64r6el-linux-gnuabi64.toolchain.cmake
            extra-cmake-args: -DNCNN_MSA=ON -DNCNN_MMI=OFF
            qemu: qemu-mips64el-static
            qemu-args: "-L;/usr/mipsisa64r6el-linux-gnuabi64"
          - arch: powerpc
            toolchain: powerpc-linux-gnu.toolchain.cmake
            extra-cmake-args:
            qemu: qemu-ppc-static
            qemu-args: "-L;/usr/powerpc-linux-gnu"
          - arch: powerpc64le
            toolchain: powerpc64le-linux-gnu.toolchain.cmake
            extra-cmake-args:
            qemu: qemu-ppc64le-static
            qemu-args: "-L;/usr/powerpc64le-linux-gnu"
          - arch: riscv64
            toolchain: riscv64-linux-gnu.toolchain.cmake
            extra-cmake-args:
            qemu: qemu-riscv64-static
            qemu-args: "-L;/usr/riscv64-linux-gnu"
          - arch: loongarch64-la264
            toolchain: loongarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_LSX=ON -DNCNN_LASX=OFF
            qemu: qemu-loongarch64-static
            qemu-args: "-L;/usr/loongarch64-linux-gnu"
          - arch: loongarch64-la664
            toolchain: loongarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_LSX=ON -DNCNN_LASX=ON
            qemu: qemu-loongarch64-static
            qemu-args: "-L;/usr/loongarch64-linux-gnu"
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/${{ matrix.toolchain }} ${{ matrix.extra-cmake-args }} -DNCNN_OPENMP=OFF \
            -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=${{ matrix.qemu }} TESTS_EXECUTABLE_LOADER_ARGUMENTS="${{ matrix.qemu-args }}" ctest --output-on-failure -j 8
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: build-openmp
      run: |
        mkdir build-openmp && cd build-openmp
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/${{ matrix.toolchain }} ${{ matrix.extra-cmake-args }} -DNCNN_OPENMP=ON \
            -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    - name: test-openmp
      run: |
        export OMP_THREAD_LIMIT=1
        export OMP_NUM_THREADS=1
        cd build-openmp
        TESTS_EXECUTABLE_LOADER=${{ matrix.qemu }} TESTS_EXECUTABLE_LOADER_ARGUMENTS="${{ matrix.qemu-args }}" ctest --output-on-failure -j 8
    - name: lcov-collect-openmp
      run: |
        cd build-openmp
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build-openmp/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info,build-openmp/lcov.info

================================================
FILE: .github/workflows/tvos.yml
================================================
# tvOS build workflow; triggered by changes to the build system, the iOS toolchain file,
# and the core / arm / x86 / vulkan layer sources listed under paths.
name: tvos
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/tvos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/tvos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
concurrency:
  group: tvos-${{ github.ref }}
  cancel-in-progress: true
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  TVOS_DEPLOYMENT_TARGET: '11.0'
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
permissions:
  contents: read

jobs:
  build:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
-DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VULKAN=ON \ steps: - uses: actions/checkout@v6 with: submodules: true - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-tvos-install-20251004 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: openmp-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install - name: openmp-arm64e if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64e && cd build-arm64e cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-simulator-x86_64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-simulator-x86_64 && cd build-simulator-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-simulator-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-simulator-arm64 && cd build-simulator-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | mkdir -p $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/tvos mkdir -p $GITHUB_WORKSPACE/openmp-install/tvos-simulator cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install/tvos mkdir -p $GITHUB_WORKSPACE/openmp-install/tvos/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64e/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/tvos/lib/libomp.a cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/tvos-simulator mkdir -p $GITHUB_WORKSPACE/openmp-install/tvos-simulator/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-arm64/install/lib/libomp.a \ -o 
$GITHUB_WORKSPACE/openmp-install/tvos-simulator/lib/libomp.a - name: install-openmp run: | sudo cp $GITHUB_WORKSPACE/openmp-install/tvos/include/* $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/tvos/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/lib sudo cp $GITHUB_WORKSPACE/openmp-install/tvos-simulator/include/* $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/tvos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/lib - name: arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" .. cmake --build . -j 4 - name: arm64e run: | mkdir build-arm64e && cd build-arm64e cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" .. cmake --build . -j 4 - name: simulator-x86_64 run: | mkdir build-simulator-x86_64 && cd build-simulator-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" .. cmake --build . -j 4 - name: simulator-arm64 run: | mkdir build-simulator-arm64 && cd build-simulator-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64_TVOS -DARCHS="arm64" .. cmake --build . 
-j 4 ================================================ FILE: .github/workflows/visionos.yml ================================================ name: visionos on: push: branches: [master] paths: - '.github/workflows/visionos.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' pull_request: branches: [master] paths: - '.github/workflows/visionos.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' concurrency: group: visionos-${{ github.ref }} cancel-in-progress: true env: DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer VISIONOS_DEPLOYMENT_TARGET: '1.0' ENABLE_BITCODE: OFF ENABLE_ARC: OFF ENABLE_VISIBILITY: OFF permissions: contents: read jobs: build: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VULKAN=ON \ steps: - uses: actions/checkout@v6 with: submodules: true - name: cache-openmp id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-visionos-install-20251004 - name: openmp if: 
steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: openmp-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64 && cd build-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=VISIONOS -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-simulator-x86_64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-simulator-x86_64 && cd build-simulator-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-simulator-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-simulator-arm64 && cd build-simulator-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . 
--target install - name: openmp-merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | mkdir -p $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/visionos mkdir -p $GITHUB_WORKSPACE/openmp-install/visionos-simulator cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install/visionos mkdir -p $GITHUB_WORKSPACE/openmp-install/visionos/lib cp openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a $GITHUB_WORKSPACE/openmp-install/visionos/lib/libomp.a cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/visionos-simulator mkdir -p $GITHUB_WORKSPACE/openmp-install/visionos-simulator/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/visionos-simulator/lib/libomp.a - name: install-openmp run: | sudo cp $GITHUB_WORKSPACE/openmp-install/visionos/include/* $DEVELOPER_DIR/Platforms/XROS.platform/Developer/SDKs/XROS.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/visionos/lib/libomp.a $DEVELOPER_DIR/Platforms/XROS.platform/Developer/SDKs/XROS.sdk/usr/lib sudo cp $GITHUB_WORKSPACE/openmp-install/visionos-simulator/include/* $DEVELOPER_DIR/Platforms/XRSimulator.platform/Developer/SDKs/XRSimulator.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/visionos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/XRSimulator.platform/Developer/SDKs/XRSimulator.sdk/usr/lib - name: arm64 run: | mkdir build-arm64 && cd build-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=VISIONOS -DARCHS="arm64" .. cmake --build . -j 4 - name: simulator-x86_64 run: | mkdir build-simulator-x86_64 && cd build-simulator-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="x86_64" .. cmake --build . 
-j 4 - name: simulator-arm64 run: | mkdir build-simulator-arm64 && cd build-simulator-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="arm64" .. cmake --build . -j 4 ================================================ FILE: .github/workflows/watchos.yml ================================================ name: watchos on: push: branches: [master] paths: - '.github/workflows/watchos.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' pull_request: branches: [master] paths: - '.github/workflows/watchos.yml' - 'toolchains/ios.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' concurrency: group: watchos-${{ github.ref }} cancel-in-progress: true env: DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer WATCHOS_DEPLOYMENT_TARGET: '6.0' ENABLE_BITCODE: OFF ENABLE_ARC: OFF ENABLE_VISIBILITY: OFF permissions: contents: read jobs: build: runs-on: macos-15-intel env: OPENMP_VERSION: '18.1.2' OPENMP_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DLIBOMP_ENABLE_SHARED=OFF \ -DLIBOMP_OMPT_SUPPORT=OFF \ -DLIBOMP_USE_HWLOC=OFF \ NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \ -DENABLE_BITCODE=$ENABLE_BITCODE \ -DENABLE_ARC=$ENABLE_ARC \ -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ -DCMAKE_INSTALL_PREFIX=install \ -DCMAKE_BUILD_TYPE=Release \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ steps: - uses: actions/checkout@v6 - name: cache-openmp 
id: cache-openmp uses: actions/cache@v5 with: path: openmp-install key: openmp-watchos-install-20251004 - name: openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/ cd openmp-${{ env.OPENMP_VERSION }}.src wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch - name: openmp-armv7k if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-armv7k && cd build-armv7k cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="armv7k" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-arm64_32 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-arm64_32 && cd build-arm64_32 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="arm64_32" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-simulator-x86_64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-simulator-x86_64 && cd build-simulator-x86_64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="x86_64" .. cmake --build . -j 4 cmake --build . 
--target install - name: openmp-simulator-arm64 if: steps.cache-openmp.outputs.cache-hit != 'true' run: | cd openmp-${{ env.OPENMP_VERSION }}.src mkdir -p build-simulator-arm64 && cd build-simulator-arm64 cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="arm64" .. cmake --build . -j 4 cmake --build . --target install - name: openmp-merge-fat-library if: steps.cache-openmp.outputs.cache-hit != 'true' run: | mkdir -p $GITHUB_WORKSPACE/openmp-install mkdir -p $GITHUB_WORKSPACE/openmp-install/watchos mkdir -p $GITHUB_WORKSPACE/openmp-install/watchos-simulator cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64_32/install/include $GITHUB_WORKSPACE/openmp-install/watchos mkdir -p $GITHUB_WORKSPACE/openmp-install/watchos/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-armv7k/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-arm64_32/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/watchos/lib/libomp.a cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/watchos-simulator mkdir -p $GITHUB_WORKSPACE/openmp-install/watchos-simulator/lib lipo -create \ openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/lib/libomp.a \ openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-arm64/install/lib/libomp.a \ -o $GITHUB_WORKSPACE/openmp-install/watchos-simulator/lib/libomp.a - name: install-openmp run: | sudo cp $GITHUB_WORKSPACE/openmp-install/watchos/include/* $DEVELOPER_DIR/Platforms/WatchOS.platform/Developer/SDKs/WatchOS.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/watchos/lib/libomp.a $DEVELOPER_DIR/Platforms/WatchOS.platform/Developer/SDKs/WatchOS.sdk/usr/lib sudo cp $GITHUB_WORKSPACE/openmp-install/watchos-simulator/include/* $DEVELOPER_DIR/Platforms/WatchSimulator.platform/Developer/SDKs/WatchSimulator.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/watchos-simulator/lib/libomp.a 
$DEVELOPER_DIR/Platforms/WatchSimulator.platform/Developer/SDKs/WatchSimulator.sdk/usr/lib - name: armv7k run: | mkdir build-armv7k && cd build-armv7k cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="armv7k" .. cmake --build . -j 4 - name: arm64_32 run: | mkdir build-arm64_32 && cd build-arm64_32 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="arm64_32" .. cmake --build . -j 4 - name: simulator-x86_64 run: | mkdir build-simulator-x86_64 && cd build-simulator-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="x86_64" .. cmake --build . -j 4 - name: simulator-arm64 run: | mkdir build-simulator-arm64 && cd build-simulator-arm64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="arm64" .. cmake --build . -j 4 ================================================ FILE: .github/workflows/web-assembly.yml ================================================ name: web-assembly on: push: branches: [master] paths: - '.github/workflows/web-assembly.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/web-assembly.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' env: EMSCRIPTEN_VERSION: 3.1.28 concurrency: group: web-assembly-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: webassembly: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: emsdk run: | git clone https://github.com/emscripten-core/emsdk.git cd emsdk ./emsdk install $EMSCRIPTEN_VERSION ./emsdk activate $EMSCRIPTEN_VERSION - name: build-basic run: | source emsdk/emsdk_env.sh export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1" mkdir build-basic && cd build-basic cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON 
-DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-basic run: | cd build-basic TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc) - name: build-simd run: | source emsdk/emsdk_env.sh export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1" mkdir build-simd && cd build-simd cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j $(nproc) - name: test-simd run: | cd build-simd TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc) - name: build-simd-omp run: | source emsdk/emsdk_env.sh export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1" mkdir build-simd-omp && cd build-simd-omp cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j $(nproc) - name: test-simd-omp run: | cd build-simd-omp TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc) ================================================ FILE: .github/workflows/windows-arm.yml ================================================ name: windows-arm on: push: branches: [master] paths: - '.github/workflows/windows-arm.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/vulkan/**' - 'tests/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/windows-arm.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/vulkan/**' - 'tests/**' - 'glslang' concurrency: group: windows-arm-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: windows: name: ${{ matrix.vs-version }} runs-on: windows-2022 strategy: matrix: include: - vs-version: vs2019 toolset-version: v142 windows-sdk-version: 22621 - vs-version: vs2022 toolset-version: v143 windows-sdk-version: 26100 env: UseMultiToolTask: true NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_VULKAN=ON steps: - uses: actions/checkout@v6 with: submodules: true - uses: GuillaumeFalourd/setup-windows10-sdk-action@v2.4 with: sdk-version: ${{ matrix.windows-sdk-version }} - name: arm64 run: | mkdir build-arm64; cd build-arm64 cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64,version=10.0.${{ matrix.windows-sdk-version }}.0 ${{ env.NCNN_CMAKE_OPTIONS }} .. cmake --build . --config Release -j 4 - name: arm64-shared run: | mkdir build-arm64-shared; cd build-arm64-shared cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64,version=10.0.${{ matrix.windows-sdk-version }}.0 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_SHARED_LIB=ON .. cmake --build . 
--config Release -j 4 woa-linux: name: woa-linux runs-on: ubuntu-latest container: linaro/wine-arm64 steps: - uses: actions/checkout@v6 - name: msvc-wine env: WINEPREFIX: /tmp/wine-x64-prefix/ run: | apt-get update apt-get install -y wine64 python3 msitools python3-simplejson python3-six ca-certificates winbind cmake ninja-build meson ln -s /usr/bin/wine /usr/bin/wine64 xvfb-run winecfg & git clone --depth 1 https://github.com/mstorsjo/msvc-wine msvc-wine/vsdownload.py --accept-license --dest /msvc msvc-wine/install.sh /msvc - name: build env: WINEPREFIX: /tmp/wine-x64-prefix/ CC: cl CXX: cl run: | export PATH=/msvc/bin/arm64:$PATH mkdir build && cd build cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_SYSTEM_NAME=Windows -DNCNN_BUILD_TESTS=ON .. cmake --build . --config Release -j $(nproc) - name: test run: | cd build TESTS_EXECUTABLE_LOADER=wine-arm64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="" ctest --output-on-failure -j $(nproc) windows-arm: runs-on: windows-11-arm env: UseMultiToolTask: true steps: - uses: actions/checkout@v6 with: submodules: true - name: build run: | mkdir build; cd build cmake -A arm64 -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_VULKAN=OFF -DNCNN_ARM82=OFF .. cmake --build . 
--config Release -j 4 - name: test run: cd build; ctest -C Release --output-on-failure -j 4 ================================================ FILE: .github/workflows/windows-clang.yml ================================================ name: windows-clang on: push: branches: [master] paths: - '.github/workflows/windows-clang.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/windows-clang.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'glslang' concurrency: group: windows-clang-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: windows: name: ClangCL runs-on: windows-2022 env: UseMultiToolTask: true NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF steps: - uses: actions/checkout@v6 with: submodules: true - name: arm64 run: | mkdir build-arm64; cd build-arm64 cmake -T ClangCL -A arm64 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=OFF .. cmake --build . --config Release -j 4 - name: arm64-vulkan run: | mkdir build-arm64-vulkan; cd build-arm64-vulkan cmake -T ClangCL -A arm64 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. cmake --build . --config Release -j 4 - name: x86 run: | mkdir build-x86; cd build-x86 cmake -T ClangCL -A Win32 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_BUILD_TESTS=ON -DNCNN_VULKAN=OFF .. cmake --build . --config Release -j 4 - name: x86-test run: cd build-x86; ctest -C Release --output-on-failure -j 4 - name: x86-vulkan run: | mkdir build-x86-vulkan; cd build-x86-vulkan cmake -T ClangCL -A Win32 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. cmake --build . 
--config Release -j 4 - name: x64 run: | mkdir build-x64; cd build-x64 cmake -T ClangCL -A x64 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_BUILD_TESTS=ON -DNCNN_VULKAN=OFF .. cmake --build . --config Release -j 4 - name: x64-test run: cd build-x64; ctest -C Release --output-on-failure -j 4 - name: x64-vulkan run: | mkdir build-x64-vulkan; cd build-x64-vulkan cmake -T ClangCL -A x64 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. cmake --build . --config Release -j 4 ================================================ FILE: .github/workflows/windows-mingw.yml ================================================ name: windows-mingw on: push: branches: [master] paths: - '.github/workflows/windows-mingw.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/windows-mingw.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'glslang' concurrency: group: windows-mingw-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: windows: name: MinGW-w64 runs-on: windows-2022 env: UseMultiToolTask: true NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF steps: - uses: actions/checkout@v6 with: submodules: true - name: x64 run: | mkdir build-x64; cd build-x64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_BUILD_TESTS=ON -DNCNN_VULKAN=OFF -G "MinGW Makefiles" .. cmake --build . --config Release -j 4 - name: x64-test run: cd build-x64; ctest -C Release --output-on-failure -j 4 - name: x64-vulkan run: | mkdir build-x64-vulkan; cd build-x64-vulkan cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON -G "MinGW Makefiles" .. cmake --build . 
--config Release -j 4 ================================================ FILE: .github/workflows/windows-xp.yml ================================================ name: windows-xp on: push: branches: [master] paths: - '.github/workflows/windows-xp.yml' - 'toolchains/windows-xp-msvc.toolchain.cmake' - 'toolchains/windows-xp-mingw.toolchain.cmake' - 'toolchains/windows-xp-clang.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/windows-xp.yml' - 'toolchains/windows-xp-msvc.toolchain.cmake' - 'toolchains/windows-xp-mingw.toolchain.cmake' - 'toolchains/windows-xp-clang.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'tests/**' concurrency: group: windows-xp-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: MSVC: runs-on: windows-2025 env: VS_INSTALL_DIR: C:\Program Files\Microsoft Visual Studio\2022\Enterprise UseMultiToolTask: true NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON steps: - uses: actions/checkout@v6 with: submodules: true - name: config shell: cmd run: | "C:\Program Files (x86)\Microsoft Visual Studio\Installer\setup.exe" modify --installPath "${{ env.VS_INSTALL_DIR }}" --channelId VisualStudio.17.Release --add Microsoft.VisualStudio.Component.WinXP --add Microsoft.VisualStudio.Component.VC.Tools.X86.X64.Spectre --add Microsoft.VisualStudio.Component.VC.Tools.X86.X64 --add Microsoft.VisualStudio.Component.VC.Tools.X86.X64 --add Microsoft.VisualStudio.Component.VC.v141.xp --nocache --quiet call "${{ env.VS_INSTALL_DIR }}\VC\Auxiliary\Build\vcvarsall.bat" x86 - name: build run: | mkdir build; cd build cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A WIN32 -G "Visual Studio 17 2022" -T v141_xp -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_WITH_STATIC_CRT=ON -DNCNN_AVX=OFF 
-DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-msvc.toolchain.cmake" .. cmake --build . --config Release -j 4 - name: test run: cd build; ctest -C Release --output-on-failure -j 4 MinGW-w32: runs-on: windows-2025 env: UseMultiToolTask: true NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON steps: - uses: actions/checkout@v6 with: submodules: true - name: config run: | Invoke-WebRequest -Uri https://github.com/nihui/ncnn-assets/releases/download/toolchain/i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z -OutFile i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z 7z x ./i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z Add-Content -Path $env:GITHUB_ENV -Value "MINGW32_ROOT_PATH=${{ github.workspace }}\mingw32" Add-Content -Path $env:GITHUB_PATH -Value "${{ github.workspace }}\mingw32\bin" - name: build run: | mkdir build; cd build cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-mingw.toolchain.cmake" -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF .. -G "MinGW Makefiles" cmake --build . 
--config Release -j 4 - name: test run: cd build; ctest -C Release --output-on-failure -j 4 Clang: runs-on: windows-2022 env: UseMultiToolTask: true NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON steps: - uses: actions/checkout@v6 with: submodules: true - name: Set up Clang run: choco install llvm --version=6.0.0 --allow-downgrade - name: Verify Clang run: | clang --version clang++ --version - name: config run: | Invoke-WebRequest -Uri https://github.com/nihui/ncnn-assets/releases/download/toolchain/i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z -OutFile i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z 7z x ./i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z Add-Content -Path $env:GITHUB_ENV -Value "MINGW32_ROOT_PATH=${{ github.workspace }}\mingw32" echo "${{ github.workspace }}\mingw32\bin;$env:PATH" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 - name: build run: | mkdir build; cd build cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-clang.toolchain.cmake" -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF .. -G "MinGW Makefiles" cmake --build . 
--config Release -j 4 - name: test run: cd build; ctest -C Release --output-on-failure -j 4 ================================================ FILE: .github/workflows/windows.yml ================================================ name: windows on: push: branches: [master] paths: - '.github/workflows/windows.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' - 'glslang' pull_request: branches: [master] paths: - '.github/workflows/windows.yml' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - 'src/layer/x86/**' - 'src/layer/vulkan/**' - 'tests/**' - 'tools/**' - '!tools/pnnx/**' - 'examples/**' - 'glslang' concurrency: group: windows-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: msvc: name: ${{ matrix.vs-version }} runs-on: windows-2022 strategy: matrix: include: - vs-version: vs2015 toolset-version: v140 windows-sdk-version: 22621 - vs-version: vs2017 toolset-version: v141 windows-sdk-version: 22621 - vs-version: vs2019 toolset-version: v142 windows-sdk-version: 26100 - vs-version: vs2022 toolset-version: v143 windows-sdk-version: 26100 env: UseMultiToolTask: true steps: - uses: actions/checkout@v6 with: submodules: true - name: Install VS 2017 (v141) Build Tools if: matrix.vs-version == 'vs2017' run: | $vsInstallPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath Start-Process -FilePath "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vs_installer.exe" -ArgumentList "modify --installPath `"$vsInstallPath`" --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 --quiet --norestart --nocache" -Wait - name: Install and Setup VS 2015 (v140) Build Tools if: matrix.vs-version == 'vs2015' run: | $vs140Path = "C:/vs140_build_tools" Invoke-WebRequest -Uri "https://aka.ms/vs/15/release/vs_buildtools.exe" -OutFile vs_buildtools.exe Start-Process 
-FilePath "vs_buildtools.exe" -ArgumentList "--installPath `"$vs140Path`" --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.140 --quiet --wait --norestart --nocache" -Wait $vcvarsPath = (Get-ChildItem -Path $vs140Path -Filter "vcvars64.bat" -Recurse | Select-Object -First 1).FullName $cmd = "`"$vcvarsPath`" && powershell -Command `"`$env:PATH;`$env:INCLUDE;`$env:LIB`"" $output = cmd.exe /c $cmd $lines = $output -split "`r`n" echo "PATH=$($lines[0]);$($env:PATH)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append echo "INCLUDE=$($lines[1])" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append echo "LIB=$($lines[2])" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - uses: GuillaumeFalourd/setup-windows10-sdk-action@v2.4 with: sdk-version: ${{ matrix.windows-sdk-version }} - name: cache-protobuf id: cache-protobuf uses: actions/cache@v5 with: path: "protobuf-install" key: protobuf-${{ matrix.vs-version }}-x64-install-3 - name: protobuf if: steps.cache-protobuf.outputs.cache-hit != 'true' run: | Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip 7z x ./protobuf-3.11.2.zip cd protobuf-3.11.2 mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }} cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DNCNN_BUILD_TESTS=ON ../cmake cmake --build . --config Release -j 4 cmake --build . 
--config Release --target install - name: cache-swiftshader if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' id: cache-swiftshader uses: actions/cache@v5 with: path: swiftshader-install key: swiftshader-${{ matrix.vs-version }}-x64-install-20251010 - name: checkout-swiftshader if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' && steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v6 with: repository: google/swiftshader path: swiftshader ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986 - name: checkout-swiftshader-submodules if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' && steps.cache-swiftshader.outputs.cache-hit != 'true' run: | cd swiftshader git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive - name: swiftshader if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' && steps.cache-swiftshader.outputs.cache-hit != 'true' run: | cd swiftshader mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }} cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. cmake --build . --config Release -j 4 mkdir "$env:GITHUB_WORKSPACE/swiftshader-install" Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install" - name: x64 run: | mkdir build-x64; cd build-x64 cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. cmake --build . 
--config Release -j 4 - name: x64-test if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' run: | echo "[Processor]`nThreadCount=1`n" > build-x64/tests/Release/SwiftShader.ini Copy-Item -Path "$env:GITHUB_WORKSPACE\swiftshader-install\vulkan-1.dll" -Destination 'build-x64\tests' cd build-x64; ctest -C Release --output-on-failure -j 4 - name: x64-sse2 run: | mkdir build-x64-sse2; cd build-x64-sse2 cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -DNCNN_RUNTIME_CPU=OFF -DNCNN_XOP=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . --config Release -j 4 - name: x64-sse2-test run: cd build-x64-sse2; ctest -C Release --output-on-failure -j 4 - name: x64-avx run: | mkdir build-x64-avx; cd build-x64-avx cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -DNCNN_RUNTIME_CPU=OFF -DNCNN_XOP=OFF -DNCNN_AVX=ON -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . --config Release -j 4 - name: x64-avx-test run: cd build-x64-avx; ctest -C Release --output-on-failure -j 4 - name: x86 run: | mkdir build-x86; cd build-x86 cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32,version=10.0.${{ matrix.windows-sdk-version }}.0 -DNCNN_SHARED_LIB=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . --config Release -j 4 - name: x86-test run: | Copy-Item -Path "build-x86\src\Release\ncnn.dll" -Destination 'build-x86\tests' cd build-x86; ctest -C Release --output-on-failure -j 4 ================================================ FILE: .gitignore ================================================ # CMake build directory build*/ # Backup files. 
*~ # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # MACOSX .DS_Store # IDE .vs .vscode .idea cmake-build-debug cmake-build-release CMakeSettings.json # Compiled python __pycache__ *.pyc *.pyd *.egg-info/ python/setup.py # Clangd .cache/ # Xmake .xmake/ ================================================ FILE: .gitmodules ================================================ [submodule "glslang"] path = glslang url = https://github.com/nihui/glslang [submodule "python/pybind11"] path = python/pybind11 url = https://github.com/pybind/pybind11.git ================================================ FILE: CITATION.cff ================================================ cff-version: 1.2.0 title: ncnn message: >- If you use this software, please cite it using the metadata from this file. type: software authors: - family-names: "Ni" given-names: "Hui" - name: "The ncnn contributors" abstract: >- ncnn is a high-performance neural network inference computing framework optimized for mobile platforms. 
date-released: 2017-06-30 keywords: - "neural network" - "artificial intelligence" - "deep learning" - android - ios - windows - linux - macos - pnnx - simd - vulkan - riscv - x86 - arm - mips - loongarch license: BSD-3-Clause repository-code: "https://github.com/Tencent/ncnn" ================================================ FILE: CMakeLists.txt ================================================ if(CMAKE_TOOLCHAIN_FILE) set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") # get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :( get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}") endif() if(NOT DEFINED CMAKE_INSTALL_PREFIX) set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory") endif() message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}") if(NOT DEFINED NCNN_VERSION) string(TIMESTAMP NCNN_VERSION "%Y%m%d") endif() set(NCNN_VERSION_MAJOR 1) set(NCNN_VERSION_MINOR 0) set(NCNN_VERSION_PATCH ${NCNN_VERSION}) set(NCNN_VERSION_STRING ${NCNN_VERSION_MAJOR}.${NCNN_VERSION_MINOR}.${NCNN_VERSION_PATCH}) set(NCNN_VERSION_NUMBER ${NCNN_VERSION}) message(STATUS "NCNN_VERSION_STRING = ${NCNN_VERSION_STRING}") cmake_minimum_required(VERSION 2.8.12...3.10) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE release CACHE STRING "Choose the type of build" FORCE) endif() if(NOT CMAKE_VERSION VERSION_LESS "3.15") # enable CMAKE_MSVC_RUNTIME_LIBRARY cmake_policy(SET CMP0091 NEW) endif() if(POLICY CMP0025) # reference from https://cmake.org/cmake/help/latest/policy/CMP0025.html cmake_policy(SET CMP0025 NEW) endif() if(POLICY CMP0057) # reference from https://cmake.org/cmake/help/latest/policy/CMP0057.html 
cmake_policy(SET CMP0057 NEW) endif() project(ncnn) if(MSVC AND NOT CMAKE_VERSION VERSION_LESS "3.15") option(NCNN_BUILD_WITH_STATIC_CRT "Enables use of statically linked CRT for statically linked ncnn" OFF) if(NCNN_BUILD_WITH_STATIC_CRT) # cmake before version 3.15 not work set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>") endif() endif() if(CMAKE_FIND_LIBRARY_SUFFIXES_INIT) # project() overwrite CMAKE_FIND_LIBRARY_SUFFIXES in toolchain, restore it set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_INIT}) endif() option(NCNN_SHARED_LIB "shared library support" OFF) option(NCNN_ENABLE_LTO "enable link-time optimization" OFF) option(NCNN_OPENMP "openmp support" ON) option(NCNN_STDIO "load model from external file" ON) option(NCNN_STRING "plain and verbose string" ON) option(NCNN_INSTALL_SDK "install ncnn library and headers" ON) option(NCNN_SIMPLEOCV "minimal opencv structure emulation" OFF) option(NCNN_SIMPLEOMP "minimal openmp runtime emulation" OFF) option(NCNN_SIMPLESTL "minimal cpp stl structure emulation" OFF) option(NCNN_SIMPLEMATH "minimal cmath" OFF) option(NCNN_THREADS "build with threads" ON) option(NCNN_BENCHMARK "print benchmark information for every layer" OFF) option(NCNN_C_API "build with C api" ON) option(NCNN_PLATFORM_API "build with platform api candy" ON) option(NCNN_WINXP "build with windows xp compatibility" OFF) option(NCNN_PIXEL "convert and resize from/to image pixel" ON) option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" ON) option(NCNN_PIXEL_AFFINE "warp affine image pixel" ON) option(NCNN_PIXEL_DRAWING "draw basic figure and text" ON) option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF) option(NCNN_VULKAN "vulkan compute support" OFF) option(NCNN_SIMPLEVK "minimal in-house vulkan loader" ON) option(NCNN_SYSTEM_GLSLANG "use system glslang library" OFF) option(NCNN_RUNTIME_CPU "runtime dispatch cpu routines" ON) option(NCNN_DISABLE_PIC "disable position-independent code" OFF) option(NCNN_BUILD_TESTS "build
tests" OFF) option(NCNN_COVERAGE "build for coverage" OFF) option(NCNN_ASAN "build for address sanitizer" OFF) option(NCNN_BUILD_BENCHMARK "build benchmark" ON) option(NCNN_PYTHON "build python api" OFF) option(NCNN_INT8 "int8 inference" ON) option(NCNN_BF16 "bf16 inference" ON) option(NCNN_FORCE_INLINE "force inline some function" ON) if(ANDROID OR IOS OR NCNN_SIMPLESTL) option(NCNN_DISABLE_RTTI "disable rtti" ON) option(NCNN_DISABLE_EXCEPTION "disable exception" ON) else() option(NCNN_DISABLE_RTTI "disable rtti" OFF) option(NCNN_DISABLE_EXCEPTION "disable exception" OFF) endif() if(ANDROID OR IOS OR NCNN_SIMPLESTL OR CMAKE_CROSSCOMPILING) option(NCNN_BUILD_TOOLS "build tools" OFF) option(NCNN_BUILD_EXAMPLES "build examples" OFF) else() option(NCNN_BUILD_TOOLS "build tools" ON) option(NCNN_BUILD_EXAMPLES "build examples" ON) endif() if(NCNN_SHARED_LIB) if(NCNN_ENABLE_LTO) # enable global link time optimization cmake_policy(SET CMP0069 NEW) set(CMAKE_POLICY_DEFAULT_CMP0069 NEW) include(CheckIPOSupported) check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output) if(ipo_supported) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) else() message(WARNING "IPO is not supported: ${ipo_supported_output}") set(NCNN_ENABLE_LTO OFF) endif() endif() endif() if(NOT NCNN_STDIO OR NOT NCNN_STRING) if(NCNN_BUILD_TOOLS) message(WARNING "NCNN_STDIO or NCNN_STRING disabled, NCNN_BUILD_TOOLS will be turned off.") set(NCNN_BUILD_TOOLS OFF) endif() if(NCNN_BUILD_EXAMPLES) message(WARNING "NCNN_STDIO or NCNN_STRING disabled, NCNN_BUILD_EXAMPLES will be turned off.") set(NCNN_BUILD_EXAMPLES OFF) endif() if(NCNN_BUILD_BENCHMARK) message(WARNING "NCNN_STDIO or NCNN_STRING disabled, NCNN_BUILD_BENCHMARK will be turned off.") set(NCNN_BUILD_BENCHMARK OFF) endif() if(NCNN_BUILD_TESTS) message(WARNING "NCNN_STDIO or NCNN_STRING disabled, NCNN_BUILD_TESTS will be turned off.") set(NCNN_BUILD_TESTS OFF) endif() endif() ############################################## 
include(CheckCXXCompilerFlag) set(CMAKE_TRY_COMPILE_CONFIGURATION release) set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) # gnu inline assembly in clang msvc does not work actually if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))) check_cxx_source_compiles("int test(int a) { asm volatile(\"\" : \"=r\"(a) : \"0\"(a) : \"memory\"); return a; }" NCNN_COMPILER_SUPPORT_GNU_INLINE_ASM) if(NCNN_COMPILER_SUPPORT_GNU_INLINE_ASM) option(NCNN_GNU_INLINE_ASM "optimize platform with gnu style inline assembly" ON) else() message(WARNING "The compiler does not support gnu style inline assembly. NCNN_GNU_INLINE_ASM will be OFF.") endif() endif() if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)") OR (CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "(ARMV7|ARM64)") OR ((CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) AND (${CMAKE_GENERATOR_PLATFORM} MATCHES "^(arm|arm64)"))) set(NCNN_TARGET_ARCH arm) if(APPLE AND CMAKE_OSX_ARCHITECTURES STREQUAL "arm64_32") set(NCNN_TARGET_ILP32 TRUE) endif() if(CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT NCNN_TARGET_ILP32) check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float32x4_t a, float32x4_t b) { return vmlaq_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM_NEON) if(NCNN_COMPILER_SUPPORT_ARM_NEON) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) set(CMAKE_REQUIRED_FLAGS "/arch:VFPv4") check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) unset(CMAKE_REQUIRED_FLAGS)
else() set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4") check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) if(NOT NCNN_COMPILER_SUPPORT_ARM_VFPV4) set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4 -mfp16-format=ieee") check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16) endif() unset(CMAKE_REQUIRED_FLAGS) endif() endif() if(NCNN_COMPILER_SUPPORT_ARM_VFPV4 OR NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16) option(NCNN_VFPV4 "optimize armv7 platform with vfpv4" ON) else() message(WARNING "The compiler does not support arm vfpv4. NCNN_VFPV4 will be OFF.") endif() endif() if(CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARGET_ILP32) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0") check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2") check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x8_t test(float16x8_t s, float16x8_t a, float16x8_t b) { return vfmaq_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2") check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vdotq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2") check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4") check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, bfloat16x8_t a, bfloat16x8_t b) { return vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(s, a, b))); }" NCNN_COMPILER_SUPPORT_ARM84_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4") check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) {
return vmmlaq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM84_I8MM) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6") check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat16_t test(svfloat16_t s, svfloat16_t a, svfloat16_t b, svbool_t bp) { return svmla_f16_z(bp, s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6") check_cxx_source_compiles("#include <arm_sve.h>\nsvint16_t test(svint16_t s, svint8_t a, svint8_t b) { return svmlslb_s16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE2) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6") check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svbfloat16_t a, svbfloat16_t b) { return svbfmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6") check_cxx_source_compiles("#include <arm_sve.h>\nsvint32_t test(svint32_t s, svint8_t a, svint8_t b) { return svmmla_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6") check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svfloat32_t a, svfloat32_t b) { return svmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM) unset(CMAKE_REQUIRED_FLAGS) elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC") set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0") check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2 -march=armv8.2-a+fp16") check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x8_t test(float16x8_t s, float16x8_t a, float16x8_t b) { return vfmaq_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2 -march=armv8.2-a+dotprod") check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vdotq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2 -march=armv8.2-a+fp16fml")
check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4 -march=armv8.4-a+bf16") check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, bfloat16x8_t a, bfloat16x8_t b) { return vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(s, a, b))); }" NCNN_COMPILER_SUPPORT_ARM84_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4 -march=armv8.4-a+i8mm") check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vmmlaq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM84_I8MM) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve") check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat16_t test(svfloat16_t s, svfloat16_t a, svfloat16_t b, svbool_t bp) { return svmla_f16_z(bp, s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve2") check_cxx_source_compiles("#include <arm_sve.h>\nsvint16_t test(svint16_t s, svint8_t a, svint8_t b) { return svmlslb_s16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE2) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve+bf16") check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svbfloat16_t a, svbfloat16_t b) { return svbfmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve+i8mm") check_cxx_source_compiles("#include <arm_sve.h>\nsvint32_t test(svint32_t s, svint8_t a, svint8_t b) { return svmmla_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM) set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve+f32mm") check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svfloat32_t a, svfloat32_t b) { return svmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM) unset(CMAKE_REQUIRED_FLAGS) else() set(CMAKE_REQUIRED_FLAGS "-march=armv8-a") check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return
vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16") check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x8_t test(float16x8_t s, float16x8_t a, float16x8_t b) { return vfmaq_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16) set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+dotprod") check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vdotq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD) set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16fml") check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML) set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16") check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, bfloat16x8_t a, bfloat16x8_t b) { return vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(s, a, b))); }" NCNN_COMPILER_SUPPORT_ARM84_BF16) set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm") check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vmmlaq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM84_I8MM) set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve") check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat16_t test(svfloat16_t s, svfloat16_t a, svfloat16_t b, svbool_t bp) { return svmla_f16_z(bp, s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE) set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve2") check_cxx_source_compiles("#include <arm_sve.h>\nsvint16_t test(svint16_t s, svint8_t a, svint8_t b) { return svmlslb_s16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE2) set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+bf16") check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svbfloat16_t a, svbfloat16_t b) { return svbfmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16) set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+i8mm") check_cxx_source_compiles("#include <arm_sve.h>\nsvint32_t test(svint32_t s, svint8_t a, svint8_t b) { return
svmmla_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM) set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+f32mm") check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svfloat32_t a, svfloat32_t b) { return svmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM) unset(CMAKE_REQUIRED_FLAGS) endif() if(NCNN_COMPILER_SUPPORT_ARM_VFPV4) option(NCNN_VFPV4 "optimize aarch64 platform with vfpv4" ON) else() message(WARNING "The compiler does not support arm vfpv4. NCNN_VFPV4 will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_ARM82_FP16) option(NCNN_ARM82 "optimize aarch64 platform with armv8.2 fp16" ON) if(NCNN_COMPILER_SUPPORT_ARM82_DOTPROD) if(NCNN_ARM82) option(NCNN_ARM82DOT "optimize aarch64 platform with armv8.2 dotprod" ON) endif() else() message(WARNING "The compiler does not support armv8.2 dotprod. NCNN_ARM82DOT will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_ARM82_FP16FML) if(NCNN_ARM82) option(NCNN_ARM82FP16FML "optimize aarch64 platform with armv8.2 fp16fml" ON) endif() else() message(WARNING "The compiler does not support armv8.2 fp16fml. NCNN_ARM82FP16FML will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_ARM84_BF16) if(NCNN_ARM82DOT AND NCNN_ARM82FP16FML) option(NCNN_ARM84BF16 "optimize aarch64 platform with armv8.4 bf16" ON) endif() else() message(WARNING "The compiler does not support armv8.4 bf16. NCNN_ARM84BF16 will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_ARM84_I8MM) if(NCNN_ARM82DOT AND NCNN_ARM82FP16FML) option(NCNN_ARM84I8MM "optimize aarch64 platform with armv8.4 i8mm" ON) endif() else() message(WARNING "The compiler does not support armv8.4 i8mm. NCNN_ARM84I8MM will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_ARM86_SVE) if(NCNN_ARM84BF16 AND NCNN_ARM84I8MM) option(NCNN_ARM86SVE "optimize aarch64 platform with armv8.6 sve" ON) if(NCNN_COMPILER_SUPPORT_ARM86_SVE2) if(NCNN_ARM86SVE) option(NCNN_ARM86SVE2 "optimize aarch64 platform with armv8.6 sve2" ON) endif() else() message(WARNING "The compiler does not support armv8.6 sve2.
NCNN_ARM86SVE2 will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_ARM86_SVEBF16) if(NCNN_ARM86SVE) option(NCNN_ARM86SVEBF16 "optimize aarch64 platform with armv8.6 sve bf16" ON) endif() else() message(WARNING "The compiler does not support armv8.6 sve bf16. NCNN_ARM86SVEBF16 will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM) if(NCNN_ARM86SVE) option(NCNN_ARM86SVEI8MM "optimize aarch64 platform with armv8.6 sve i8mm" ON) endif() else() message(WARNING "The compiler does not support armv8.6 sve i8mm. NCNN_ARM86SVEI8MM will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM) if(NCNN_ARM86SVE) option(NCNN_ARM86SVEF32MM "optimize aarch64 platform with armv8.6 sve f32mm" ON) endif() else() message(WARNING "The compiler does not support armv8.6 sve f32mm. NCNN_ARM86SVEF32MM will be OFF.") endif() endif() else() message(WARNING "The compiler does not support armv8.6 sve. NCNN_ARM86SVE will be OFF.") endif() else() message(WARNING "The compiler does not support armv8.2 fp16. NCNN_ARM82 will be OFF.") endif() endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)") set(NCNN_TARGET_ARCH mips) check_cxx_compiler_flag("-mmsa" NCNN_COMPILER_SUPPORT_MIPS_MSA) set(CMAKE_REQUIRED_FLAGS "-mloongson-mmi -I${CMAKE_CURRENT_SOURCE_DIR}/src/layer/mips") check_cxx_source_compiles("#include \"loongson_mmi.h\"\nint32x2_t test(int16x4_t a, int16x4_t b) { return __mmi_pmaddhw(a, b); }" NCNN_COMPILER_SUPPORT_LOONGSON_MMI) unset(CMAKE_REQUIRED_FLAGS) if(NCNN_COMPILER_SUPPORT_MIPS_MSA) option(NCNN_MSA "optimize mips platform with msa extension" ON) else() message(WARNING "The compiler does not support msa extension. NCNN_MSA will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_LOONGSON_MMI) option(NCNN_MMI "optimize mips platform with loongson mmi extension" ON) else() message(WARNING "The compiler does not support loongson mmi extension. 
NCNN_MMI will be OFF.") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)") set(NCNN_TARGET_ARCH loongarch) set(CMAKE_REQUIRED_FLAGS "-mlsx") check_cxx_source_compiles("#include <lsxintrin.h>\n__m128 test(__m128 a, __m128 b, __m128 c) { return __lsx_vfmadd_s(a, b, c); }" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) set(CMAKE_REQUIRED_FLAGS "-mlasx") check_cxx_source_compiles("#include <lasxintrin.h>\n__m256 test(__m256 a, __m256 b, __m256 c) { return __lasx_xvfmadd_s(a, b, c); }" NCNN_COMPILER_SUPPORT_LOONGARCH_LASX) unset(CMAKE_REQUIRED_FLAGS) if(NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) option(NCNN_LSX "optimize loongarch platform with lsx extension" ON) if(NCNN_COMPILER_SUPPORT_LOONGARCH_LASX) option(NCNN_LASX "optimize loongarch platform with lasx extension" ON) else() message(WARNING "The compiler does not support lasx extension. NCNN_LASX will be OFF.") endif() else() message(WARNING "The compiler does not support lsx extension. NCNN_LSX will be OFF.") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") set(NCNN_TARGET_ARCH riscv) if(CMAKE_SIZEOF_VOID_P EQUAL 8) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat32m8_t test(vfloat32m8_t s, vfloat32m8_t w, float v, size_t vl) { return __riscv_vfmacc_vf_f32m8(s, v, w, vl); }\nvfloat32m1x2_t test2(vfloat32m1_t x) { return __riscv_vcreate_v_f32m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_V) set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh -D__fp16=_Float16") check_cxx_source_compiles("__fp16 test(__fp16 a) { return a * a; }" NCNN_COMPILER_SUPPORT_RISCV_ZFH) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16") check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x){ return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH) set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x){ return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) unset(CMAKE_REQUIRED_FLAGS) if(NCNN_COMPILER_SUPPORT_RISCV_V OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) option(NCNN_RVV "optimize risc-v platform with v extension" ON) else() message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON) else() message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_RISCV_ZFH) option(NCNN_ZFH "optimize risc-v platform with zfh extension" ON) if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) if(NCNN_RVV AND NCNN_ZFH) option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON) endif() else() message(WARNING "The compiler does not support zvfh extension. NCNN_ZVFH will be OFF.") endif() else() message(WARNING "The compiler does not support risc-v zfh extension.
NCNN_ZFH will be OFF.") endif() elseif(CMAKE_SIZEOF_VOID_P EQUAL 4) set(CMAKE_REQUIRED_FLAGS "-march=rv32gcv") check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat32m8_t test(vfloat32m8_t s, vfloat32m8_t w, float v, size_t vl) { return __riscv_vfmacc_vf_f32m8(s, v, w, vl); }\nvfloat32m1x2_t test2(vfloat32m1_t x) { return __riscv_vcreate_v_f32m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_V) set(CMAKE_REQUIRED_FLAGS "-march=rv32gc_zfh -D__fp16=_Float16") check_cxx_source_compiles("__fp16 test(__fp16 a) { return a * a; }" NCNN_COMPILER_SUPPORT_RISCV_ZFH) set(CMAKE_REQUIRED_FLAGS "-march=rv32gcv_zfh_zvfh -D__fp16=_Float16") check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x){ return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH) set(CMAKE_REQUIRED_FLAGS "-march=rv32gc_zfh_xtheadvector -D__fp16=_Float16") check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x){ return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) unset(CMAKE_REQUIRED_FLAGS) if(NCNN_COMPILER_SUPPORT_RISCV_V OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) option(NCNN_RVV "optimize risc-v platform with v extension" ON) else() message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON) else() message(WARNING "The compiler does not support risc-v xtheadvector extension.
NCNN_XTHEADVECTOR will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_RISCV_ZFH) option(NCNN_ZFH "optimize risc-v platform with zfh extension" ON) if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) if(NCNN_RVV AND NCNN_ZFH) option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON) endif() else() message(WARNING "The compiler does not support zvfh extension. NCNN_ZVFH will be OFF.") endif() else() message(WARNING "The compiler does not support risc-v zfh extension. NCNN_ZFH will be OFF.") endif() endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") set(NCNN_TARGET_ARCH powerpc) if(NCNN_PPC64LE_VSX) set(NCNN_TARGET_ARCH x86) set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE2__") check_cxx_source_compiles("#include <emmintrin.h>\n__m128i test(__m128i a, __m128i b) { return _mm_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE2) unset(CMAKE_REQUIRED_FLAGS) set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE4_1__") check_cxx_source_compiles("#include <smmintrin.h>\n__m128i test(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE41) unset(CMAKE_REQUIRED_FLAGS) if(NCNN_COMPILER_SUPPORT_PPC64LE_SSE2) option(NCNN_VSX_SSE2 "optimize ppc64le platform with sse2 extension" ON) else() message(WARNING "The compiler does not support sse2 extension. NCNN_VSX_SSE2 will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_PPC64LE_SSE41) option(NCNN_VSX_SSE41 "optimize ppc64le platform with sse4.1 extension" ON) else() message(WARNING "The compiler does not support sse4.1 extension.
NCNN_VSX_SSE41 will be OFF.") endif() endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)") set(NCNN_TARGET_ARCH xtensa) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x)") set(NCNN_TARGET_ARCH s390x) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(sw_64)") set(NCNN_TARGET_ARCH sw_64) #sw_64 is alpha-like platform set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mieee") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mieee") else() set(NCNN_TARGET_ARCH x86) option(NCNN_SSE2 "optimize x86 platform with sse2 extension" ON) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") set(CMAKE_REQUIRED_FLAGS "/arch:AVX") check_cxx_source_compiles("#include \n__m256 test(__m256 a, __m256 b) { return _mm256_mul_ps(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX) set(CMAKE_REQUIRED_FLAGS "/arch:AVX") check_cxx_source_compiles("#include \n__m256 test(__m256 s, __m256 a, __m256 b) { return _mm256_fmadd_ps(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_FMA) set(CMAKE_REQUIRED_FLAGS "/arch:AVX") check_cxx_source_compiles("#include \n#include \n__m128i test(__m128i s, __m128i a, __m128i b) { return _mm_maddd_epi16(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_XOP) set(CMAKE_REQUIRED_FLAGS "/arch:AVX") check_cxx_source_compiles("#include \n__m256 test(__m128i a) { return _mm256_cvtph_ps(a); }" NCNN_COMPILER_SUPPORT_X86_F16C) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2") check_cxx_source_compiles("#include \n__m256i test(__m256i a, __m256i b) { return _mm256_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX2) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \n__m512i test(__m512i a, __m512i b) { return _mm512_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2") check_cxx_source_compiles("#include \n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwssd_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2") check_cxx_source_compiles("#include \n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpbssd_epi32(s, a, 
b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2") check_cxx_source_compiles("#include \n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwsud_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2") check_cxx_source_compiles("#include \n__m128bh test(__m256 a) { return _mm256_cvtneps_avx_pbh(a); }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \n__m512i test(__m512i s, __m512i a, __m512i b) { return _mm512_dpwssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \n__m256bh test(__m256bh s, __m512bh a, __m512bh b) { return _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(s), a, b)); }\n__m512i test2(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \n__m512h test(__m512h s, __m512h a, __m512h b) { return _mm512_fmadd_ph(s, a, b); }\n__m512 test2(__m512 a) { return _mm512_cvtxph_ps(_mm512_cvtxps_ph(a)); }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) unset(CMAKE_REQUIRED_FLAGS) elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC") check_cxx_compiler_flag("-mrecip=none" NCNN_COMPILER_SUPPORT_X86_RECIP_NONE) set(CMAKE_REQUIRED_FLAGS "/arch:AVX") check_cxx_source_compiles("#include \n__m256 test(__m256 a, __m256 b) { return _mm256_mul_ps(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX) set(CMAKE_REQUIRED_FLAGS "/arch:AVX -mfma -mf16c") check_cxx_source_compiles("#include \n__m256 test(__m256 s, __m256 a, __m256 b) { return _mm256_fmadd_ps(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_FMA) set(CMAKE_REQUIRED_FLAGS "/arch:AVX -mxop") check_cxx_source_compiles("#include 
\n__m128i test(__m128i s, __m128i a, __m128i b) { return _mm_maddd_epi16(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_XOP) set(CMAKE_REQUIRED_FLAGS "/arch:AVX -mf16c") check_cxx_source_compiles("#include \n__m256 test(__m128i a) { return _mm256_cvtph_ps(a); }" NCNN_COMPILER_SUPPORT_X86_F16C) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c") check_cxx_source_compiles("#include \n__m256i test(__m256i a, __m256i b) { return _mm256_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX2) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl") check_cxx_source_compiles("#include \n__m512i test(__m512i a, __m512i b) { return _mm512_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni") check_cxx_source_compiles("#include \n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwssd_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint8") check_cxx_source_compiles("#include \n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpbssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint16") check_cxx_source_compiles("#include \n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwsud_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxneconvert") check_cxx_source_compiles("#include \n__m128bh test(__m256 a) { return _mm256_cvtneps_avx_pbh(a); }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") check_cxx_source_compiles("#include \n__m512i test(__m512i s, __m512i a, __m512i b) { return _mm512_dpwssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw 
-mavx512dq -mavx512vl -mavx512bf16") check_cxx_source_compiles("#include \n__m256bh test(__m256bh s, __m512bh a, __m512bh b) { return _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(s), a, b)); }\n__m512i test2(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") check_cxx_source_compiles("#include \n__m512h test(__m512h s, __m512h a, __m512h b) { return _mm512_fmadd_ph(s, a, b); }\n__m512 test2(__m512 a) { return _mm512_cvtxph_ps(_mm512_cvtxps_ph(a)); }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) unset(CMAKE_REQUIRED_FLAGS) else() check_cxx_compiler_flag("-mrecip=none" NCNN_COMPILER_SUPPORT_X86_RECIP_NONE) set(CMAKE_REQUIRED_FLAGS "-mavx") check_cxx_source_compiles("#include \n__m256 test(__m256 a, __m256 b) { return _mm256_mul_ps(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c") check_cxx_source_compiles("#include \n__m256 test(__m256 s, __m256 a, __m256 b) { return _mm256_fmadd_ps(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_FMA) set(CMAKE_REQUIRED_FLAGS "-mfma -mxop") check_cxx_source_compiles("#include \n__m128i test(__m128i s, __m128i a, __m128i b) { return _mm_maddd_epi16(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_XOP) set(CMAKE_REQUIRED_FLAGS "-mf16c") check_cxx_source_compiles("#include \n__m256 test(__m128i a) { return _mm256_cvtph_ps(a); }" NCNN_COMPILER_SUPPORT_X86_F16C) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2") check_cxx_source_compiles("#include \n__m256i test(__m256i a, __m256i b) { return _mm256_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX2) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl") check_cxx_source_compiles("#include \n__m512i test(__m512i a, __m512i b) { return _mm512_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512) set(CMAKE_REQUIRED_FLAGS 
"-mfma -mf16c -mavx2 -mavxvnni") check_cxx_source_compiles("#include \n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwssd_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxvnni -mavxvnniint8") check_cxx_source_compiles("#include \n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpbssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxvnni -mavxvnniint16") check_cxx_source_compiles("#include \n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwsud_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxneconvert") check_cxx_source_compiles("#include \n__m128bh test(__m256 a) { return _mm256_cvtneps_avx_pbh(a); }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") check_cxx_source_compiles("#include \n__m512i test(__m512i s, __m512i a, __m512i b) { return _mm512_dpwssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16") check_cxx_source_compiles("#include \n__m256bh test(__m256bh s, __m512bh a, __m512bh b) { return _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(s), a, b)); }\n__m512i test2(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") check_cxx_source_compiles("#include \n__m512h test(__m512h s, __m512h a, __m512h b) { return _mm512_fmadd_ph(s, a, b); }\n__m512 test2(__m512 a) { return _mm512_cvtxph_ps(_mm512_cvtxps_ph(a)); }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) unset(CMAKE_REQUIRED_FLAGS) endif() if(NOT 
CMAKE_SYSTEM_NAME MATCHES "Emscripten|WASI" AND NCNN_COMPILER_SUPPORT_X86_AVX) option(NCNN_AVX "optimize x86 platform with avx extension" ON) if(NCNN_COMPILER_SUPPORT_X86_FMA) if(NCNN_AVX) option(NCNN_FMA "optimize x86 platform with fma extension" ON) endif() else() message(WARNING "The compiler does not support fma extension. NCNN_FMA will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_X86_XOP) if(NCNN_AVX) option(NCNN_XOP "optimize x86 platform with xop extension" ON) endif() else() message(WARNING "The compiler does not support xop extension. NCNN_XOP will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_X86_F16C) if(NCNN_AVX) option(NCNN_F16C "optimize x86 platform with f16c extension" ON) endif() else() message(WARNING "The compiler does not support f16c extension. NCNN_F16C will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_X86_AVX2) if(NCNN_AVX) option(NCNN_AVX2 "optimize x86 platform with avx2 extension" ON) endif() if(NCNN_COMPILER_SUPPORT_X86_AVX_VNNI) if(NCNN_AVX2) option(NCNN_AVXVNNI "optimize x86 platform with avx vnni extension" ON) endif() if(NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8) if(NCNN_AVXVNNI) option(NCNN_AVXVNNIINT8 "optimize x86 platform with avx vnni int8 extension" ON) endif() else() message(WARNING "The compiler does not support avx vnni int8 extension. NCNN_AVXVNNIINT8 will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16) if(NCNN_AVXVNNI) option(NCNN_AVXVNNIINT16 "optimize x86 platform with avx vnni int16 extension" ON) endif() else() message(WARNING "The compiler does not support avx vnni int16 extension. NCNN_AVXVNNIINT16 will be OFF.") endif() else() message(WARNING "The compiler does not support avx vnni extension. NCNN_AVXVNNI will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT) if(NCNN_AVX2) option(NCNN_AVXNECONVERT "optimize x86 platform with avx ne convert extension" ON) endif() else() message(WARNING "The compiler does not support avx ne convert extension. 
NCNN_AVXNECONVERT will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_X86_AVX512) if(NCNN_AVX2) option(NCNN_AVX512 "optimize x86 platform with avx512 extension" ON) endif() if(NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) if(NCNN_AVX512) option(NCNN_AVX512VNNI "optimize x86 platform with avx512 vnni extension" ON) endif() else() message(WARNING "The compiler does not support avx512 vnni extension. NCNN_AVX512VNNI will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) if(NCNN_AVX512) option(NCNN_AVX512BF16 "optimize x86 platform with avx512 bf16 extension" ON) endif() else() message(WARNING "The compiler does not support avx512 bf16 extension. NCNN_AVX512BF16 will be OFF.") endif() if(NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) if(NCNN_AVX512) option(NCNN_AVX512FP16 "optimize x86 platform with avx512 fp16 extension" ON) endif() else() message(WARNING "The compiler does not support avx512 fp16 extension. NCNN_AVX512FP16 will be OFF.") endif() else() message(WARNING "The compiler does not support avx512 extension. NCNN_AVX512 will be OFF.") endif() else() message(WARNING "The compiler does not support avx2 extension. NCNN_AVX2 will be OFF.") endif() else() message(WARNING "The compiler does not support avx extension. 
NCNN_AVX will be OFF.")
    endif()
endif()

# The compiler feature probes are finished: clear the try-compile overrides
# so they do not affect anything configured after this point.
unset(CMAKE_TRY_COMPILE_CONFIGURATION)
unset(CMAKE_TRY_COMPILE_TARGET_TYPE)

# Report the selected target architecture together with its pointer width.
if(NCNN_TARGET_ILP32)
    message(STATUS "Target arch: ${NCNN_TARGET_ARCH} 64bit ilp32")
elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
    message(STATUS "Target arch: ${NCNN_TARGET_ARCH} 64bit")
else()
    message(STATUS "Target arch: ${NCNN_TARGET_ARCH} 32bit")
endif()

##############################################

# set cmake default folder name
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "cmake")

# Emscripten: append the same "-s" settings to the C flags, the C++ flags and
# the executable linker flags.
# The linker flags must go into CMAKE_EXE_LINKER_FLAGS; the previously used
# name CMAKE_EXECUTBLE_LINKER_FLAGS is not a variable CMake reads, so values
# stored there were silently ignored.
if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=256MB -s EXIT_RUNTIME=1")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=256MB -s EXIT_RUNTIME=1")
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=256MB -s EXIT_RUNTIME=1")

    if(NCNN_OPENMP AND NCNN_SIMPLEOMP)
        # TODO better flags for emscripten
        # node --experimental-wasm-threads xxx.js
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -s USE_PTHREADS=1 -s PTHREAD_POOL_SIZE=15")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s USE_PTHREADS=1 -s PTHREAD_POOL_SIZE=15")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s USE_PTHREADS=1 -s PTHREAD_POOL_SIZE=15")
    endif()
endif()

# Vulkan support needs glslang: use a system installation when
# NCNN_SYSTEM_GLSLANG is requested and usable, otherwise build the bundled
# glslang submodule.
if(NCNN_VULKAN)
    if(NCNN_SYSTEM_GLSLANG)
        find_package(Threads)
        find_package(SPIRV-Tools QUIET)
        find_package(SPIRV-Tools-opt QUIET)
        find_package(glslang QUIET)

        if(glslang_FOUND)
            # Expose the imported targets under the un-namespaced names that
            # the TARGET checks below look for.
            add_library(glslang ALIAS glslang::glslang)
            add_library(SPIRV ALIAS glslang::SPIRV)
        else()
            # No package config found: fall back to including the exported
            # target files from a user-supplied directory.
            set(GLSLANG_TARGET_DIR "GLSLANG-NOTFOUND" CACHE PATH "Absolute path to glslangTargets.cmake directory")
            # NOTE(review): this accepts ENV{GLSLANG_TARGET_DIR}, but the
            # include() calls below only expand the CMake variable - confirm
            # the environment-only case is handled as intended.
            if(NOT GLSLANG_TARGET_DIR AND NOT DEFINED ENV{GLSLANG_TARGET_DIR})
                message(WARNING "set glslang_DIR to glslang-config.cmake directory for using system glslang.")
                message(WARNING "GLSLANG_TARGET_DIR must be defined!
NCNN_SYSTEM_GLSLANG will be turned off.")
                set(NCNN_SYSTEM_GLSLANG OFF)
            else()
                include("${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake")
                include("${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake")
                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
                    # hlsl support can be optional
                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
                endif()
                include("${GLSLANG_TARGET_DIR}/glslangTargets.cmake")
                include("${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake")
            endif()
        endif()

        # Whichever route was taken, both targets must exist; otherwise give
        # up on the system glslang so the bundled copy is used below.
        if(TARGET glslang AND TARGET SPIRV)
            get_property(glslang_location TARGET glslang PROPERTY LOCATION)
            get_property(SPIRV_location TARGET SPIRV PROPERTY LOCATION)
            message(STATUS "Found glslang: ${glslang_location} (found version \"${glslang_VERSION}\")")
            message(STATUS "Found SPIRV: ${SPIRV_location} (found version \"${glslang_VERSION}\")")
        else()
            message(WARNING "glslang or SPIRV target not found! NCNN_SYSTEM_GLSLANG will be turned off.")
            set(NCNN_SYSTEM_GLSLANG OFF)
        endif()
    endif()

    if(NOT NCNN_SYSTEM_GLSLANG)
        if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/glslang/CMakeLists.txt")
            message(FATAL_ERROR "The submodules were not downloaded!
Please update submodules with \"git submodule update --init\" and try again.")
        else()
            # glslang requires c++11
            set(CMAKE_CXX_STANDARD 11)

            # Pre-declare glslang's own options so its optional components
            # default to OFF when the subdirectory is added.
            option(BUILD_EXTERNAL "" OFF)
            option(ENABLE_SPVREMAPPER "" OFF)
            option(ENABLE_GLSLANG_BINARIES "" OFF)
            option(ENABLE_HLSL "" OFF)
            option(ENABLE_RTTI "" OFF)
            option(ENABLE_EXCEPTIONS "" OFF)
            option(ENABLE_OPT "" OFF)
            option(ENABLE_PCH "" OFF)
            option(GLSLANG_TESTS "" OFF)
            # glslang's install rules are only enabled for a non-shared ncnn build.
            if(NCNN_SHARED_LIB)
                option(GLSLANG_ENABLE_INSTALL "" OFF)
            else()
                option(GLSLANG_ENABLE_INSTALL "" ON)
            endif()

            add_subdirectory(glslang)

            if(NCNN_SHARED_LIB)
                # Hide glslang symbols on GNU and non-MSVC-frontend Clang compilers.
                if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
                    target_compile_options(glslang PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
                    target_compile_options(glslang-default-resource-limits PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
                endif()
                if(NCNN_ENABLE_LTO)
                    set_target_properties(glslang PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
                    set_target_properties(glslang-default-resource-limits PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
                endif()
            endif()
        endif()
    endif()
endif()

# The library itself, followed by the optional components.
add_subdirectory(src)
if(NCNN_BUILD_BENCHMARK)
    add_subdirectory(benchmark)
endif()
if(NCNN_BUILD_EXAMPLES)
    add_subdirectory(examples)
endif()
if(NCNN_BUILD_TOOLS)
    add_subdirectory(tools)
endif()
if(NCNN_BUILD_TESTS)
    enable_testing()
    add_subdirectory(tests)
    add_subdirectory(tests/perf)
endif()
if(NCNN_PYTHON)
    add_subdirectory(python)
endif()

================================================
FILE: CONTRIBUTING.md
================================================
# Acknowledgements

- Thanks to bug1989 [https://github.com/bug1989] for contributing the initial quantized int8 inference code and a large variety of device benchmark
- Thanks to zhiliu6 [https://github.com/zhiliu6] for contributing the darknet conversion tool, operators and YOLO examples
- Thanks to Tijmen Verhulsdonck [https://github.com/Timen] for contributing the massive AVX
optimization for x86 platform ================================================ FILE: Info.plist ================================================ CFBundleName __NAME__ CFBundleIdentifier __IDENTIFIER__ CFBundleVersion __VERSION__ CFBundleShortVersionString __VERSION__ CFBundleSignature ???? CFBundlePackageType FMWK ================================================ FILE: LICENSE.txt ================================================ Tencent is pleased to support the open source community by making ncnn available. Copyright (C) 2017 Tencent. All rights reserved. If you have downloaded a copy of the ncnn binary from Tencent, please note that the ncnn binary is licensed under the BSD 3-Clause License. If you have downloaded a copy of the ncnn source code from Tencent, please note that ncnn source code is licensed under the BSD 3-Clause License, except for the third-party components listed below which are subject to different license terms. Your integration of ncnn into your own projects may require compliance with the BSD 3-Clause License, as well as the other licenses applicable to the third-party components included within ncnn. A copy of the BSD 3-Clause License is included in this file. Other dependencies and licenses: Open Source Software Licensed Under the zlib License: The below software in this distribution may have been modified by Tencent (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 Tencent. ---------------------------------------------------------------------------------------- 1. neon_mathfun.h Copyright (C) 2011 Julien Pommier 2. sse_mathfun.h Copyright (C) 2007 Julien Pommier 3. 
avx_mathfun.h Copyright (C) 2012 Giovanni Garberoglio Interdisciplinary Laboratory for Computational Science (LISC) Fondazione Bruno Kessler and University of Trento via Sommarive, 18 I-38123 Trento (Italy) Terms of the zlib License: --------------------------------------------------- Copyright (c) This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. Open Source Software Licensed Under the BSD 2-Clause License: The below software in this distribution may have been modified by Tencent (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 Tencent. ---------------------------------------------------------------------------------------- 1. squeezenet 1.1 Copyright (c) 2016 Forrest N. Iandola and Matthew W. Moskewicz and Khalid Ashraf and Song Han and William J. Dally and Kurt Keutzer All rights reserved. 2. caffe.proto master All contributions by the University of California: Copyright (c) 2014-2017 The Regents of the University of California (Regents) All rights reserved. All other contributions: Copyright (c) 2014-2017, the respective contributors All rights reserved. 
Terms of the BSD 2-Clause License: -------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Open Source Software Licensed Under the BSD 3-Clause License: The below software in this distribution may have been modified by Tencent (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 Tencent. ---------------------------------------------------------------------------------------- 1. android.toolchain.cmake master Copyright (c) 2010-2011, Ethan Rublee Copyright (c) 2011-2014, Andrey Kamaev All rights reserved. 
Terms of the BSD 3-Clause License: -------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of [copyright holder] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: MANIFEST.in ================================================ recursive-include cmake * recursive-include glslang * prune glslang/Test recursive-include src * recursive-include python * prune python/pybind11/tests include CMakeLists.txt ================================================ FILE: README.md ================================================ ![ncnn](https://raw.githubusercontent.com/Tencent/ncnn/master/images/256-ncnn.png) # ncnn [![License](https://img.shields.io/badge/license-BSD_3_Clause-blue.svg?style=for-the-badge)](LICENSE.txt) [![Download Total Count](https://img.shields.io/github/downloads/Tencent/ncnn/total.svg?style=for-the-badge)](https://github.com/Tencent/ncnn/releases) [![codecov](https://img.shields.io/codecov/c/github/Tencent/ncnn/master?style=for-the-badge)](https://codecov.io/gh/Tencent/ncnn) ncnn is a high-performance neural network inference computing framework optimized for mobile platforms. ncnn has been designed from the very beginning with deployment and use on mobile phones in mind. ncnn has no third-party dependencies. It is cross-platform and runs faster than all known open-source frameworks on mobile phone CPUs. Developers can easily deploy deep learning algorithm models to the mobile platform by using the efficient ncnn implementation, creating intelligent apps and bringing artificial intelligence to your fingertips. ncnn is currently being used in many Tencent applications, such as QQ, Qzone, WeChat, Pitu, and so on. ncnn 是一个为手机端极致优化的高性能神经网络前向计算框架。 ncnn 从设计之初深刻考虑手机端的部署和使用。 无第三方依赖,跨平台,手机端 cpu 的速度快于目前所有已知的开源框架。 基于 ncnn,开发者能够将深度学习算法轻松移植到手机端高效执行, 开发出人工智能 APP,将 AI 带到你的指尖。 ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天天 P 图等。 ---
技术交流 QQ 群
637093648 (超多大佬)
答案:卷卷卷卷卷(已满)
Telegram Group Discord Channel
Pocky QQ 群(MLIR YES!)
677104663 (超多大佬)
答案:multi-level intermediate representation
他们都不知道 pnnx 有多好用群
818998520 (新群!)
--- ## Download & Build status https://github.com/Tencent/ncnn/releases/latest
**[how to build ncnn library](https://github.com/Tencent/ncnn/wiki/how-to-build) on Linux / Windows / macOS / Raspberry Pi3, Pi4 / POWER / Android / NVIDIA Jetson / iOS / WebAssembly / AllWinner D1 / Loongson 2K1000**
Source [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-full-source.zip)
- [Build for Android](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-android) - [Build for Termux on Android](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-termux-on-android)
Android [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-android-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-android.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Aandroid)
Android shared [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-android-vulkan-shared.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-android-shared.zip)
- [Build for HarmonyOS with cross-compiling](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-harmonyos-with-cross-compiling)
HarmonyOS [](https://github.com/Tencent/ncnn/actions?query=workflow%3Aharmonyos)
HarmonyOS shared
- [Build for iOS on macOS with xcode](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-ios-on-macos-with-xcode)
iOS [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ios-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ios.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Aios)
iOS-Simulator [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ios-simulator-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ios-simulator.zip)
- [Build for macOS](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-macos)
macOS [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-macos-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-macos.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Amacos)
Mac-Catalyst [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-mac-catalyst-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-mac-catalyst.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Amac-catalyst)
watchOS [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-watchos.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Awatchos)
watchOS-Simulator [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-watchos-simulator.zip)
tvOS [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-tvos-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-tvos.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Atvos)
tvOS-Simulator [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-tvos-simulator-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-tvos-simulator.zip)
visionOS [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-visionos-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-visionos.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Avisionos)
visionOS-Simulator [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-visionos-simulator-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-visionos-simulator.zip)
Apple xcframework [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-apple-vulkan.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-apple.zip)
- [Build for Linux / NVIDIA Jetson / Raspberry Pi3, Pi4 / POWER](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux)
Ubuntu 22.04 [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ubuntu-2204.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ubuntu-2204-shared.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-x64-gpu-gcc)
Ubuntu 24.04 [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ubuntu-2404.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ubuntu-2404-shared.zip)
windows - [Build for Windows x64 using VS2017](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-windows-x64-using-visual-studio-community-2017) - [Build for Windows x64 using MinGW-w64](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-windows-x64-using-mingw-w64)
VS2015 [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2015.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2015-shared.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Awindows)
VS2017 [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2017.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2017-shared.zip)
VS2019 [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2019.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2019-shared.zip)
VS2022 [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2022.zip) [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2022-shared.zip)
- [Build for WebAssembly](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-webassembly)
WebAssembly [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-webassembly.zip) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Aweb-assembly)
- [Build for ARM Cortex-A family with cross-compiling](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-arm-cortex-a-family-with-cross-compiling) - [Build for Hisilicon platform with cross-compiling](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-hisilicon-platform-with-cross-compiling) - [Build for AllWinner D1](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-allwinner-d1) - [Build for Loongson 2K1000](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-loongson-2k1000) - [Build for QNX](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-qnx)
Linux (arm) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-arm)
Linux (aarch64) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-aarch64)
Linux (mips) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-mips)
Linux (mips64) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-mips64)
Linux (ppc64) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-ppc64)
Linux (riscv64) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-riscv64)
Linux (loongarch64) [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-loongarch64)
--- ## Support most commonly used CNN network ## 支持大部分常用的 CNN 网络 - Classical CNN: [VGG](https://github.com/BVLC/caffe/wiki/Model-Zoo#models-used-by-the-vgg-team-in-ilsvrc-2014) [AlexNet](https://github.com/BVLC/caffe/tree/9b891540183ddc834a02b2bd81b31afae71b2153/models/bvlc_alexnet) [GoogleNet](https://github.com/BVLC/caffe/tree/9b891540183ddc834a02b2bd81b31afae71b2153/models/bvlc_googlenet) Inception ... - Practical CNN: [ResNet](https://github.com/tornadomeet/ResNet) [DenseNet](https://github.com/liuzhuang13/DenseNet) [SENet](https://github.com/hujie-frank/SENet) [FPN](https://github.com/unsky/FPN) ... - Light-weight CNN: [SqueezeNet](https://github.com/forresti/SqueezeNet) [MobileNetV1](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md) [MobileNetV2/V3](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/README.md) [ShuffleNetV1](https://github.com/farmingyard/ShuffleNet) [ShuffleNetV2](https://github.com/opconty/keras-shufflenetV2) [MNasNet](https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet) ... - Face Detection: [MTCNN](https://github.com/ipazc/mtcnn) [RetinaFace](https://github.com/biubug6/Pytorch_Retinaface) [scrfd](https://github.com/nihui/ncnn-android-scrfd) ... - Detection: [VGG-SSD](https://github.com/lzx1413/CAFFE_SSD) [MobileNet-SSD](https://github.com/chuanqi305/MobileNet-SSD) [SqueezeNet-SSD](https://github.com/chuanqi305/SqueezeNet-SSD) [MobileNetV2-SSDLite](https://github.com/chuanqi305/MobileNetv2-SSDLite) [MobileNetV3-SSDLite](https://github.com/XiaoyuHuang96/MobilenetV3SSDLite-tfkeras) ... - Detection: [Faster-RCNN](https://github.com/rbgirshick/py-faster-rcnn) [R-FCN](https://github.com/daijifeng001/R-FCN) ... 
- Detection: [YOLOv2](https://github.com/longcw/yolo2-pytorch) [YOLOv3](https://github.com/ultralytics/yolov3) [MobileNet-YOLOv3](https://github.com/eric612/MobileNet-YOLO) [YOLOv4](https://github.com/Tianxiaomo/pytorch-YOLOv4) [YOLOv5](https://github.com/ultralytics/yolov5) [YOLOv7](https://github.com/WongKinYiu/yolov7) [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX) [YOLOv8](https://github.com/nihui/ncnn-android-yolov8) ... - Detection: [NanoDet](https://github.com/RangiLyu/nanodet) - Segmentation: [FCN](https://github.com/unsky/FPN) [PSPNet](https://github.com/hszhao/PSPNet) [UNet](https://github.com/zhixuhao/unet) [YOLACT](https://github.com/dbolya/yolact) ... - Pose Estimation: [SimplePose](https://github.com/dog-qiuqiu/Ultralight-SimplePose) ... --- ## HowTo **[use ncnn with alexnet](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-alexnet) with detailed steps, recommended for beginners :)** **[ncnn 组件使用指北 alexnet](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-alexnet.zh) 附带详细步骤,新人强烈推荐 :)** **[use netron for ncnn model visualization](https://netron.app)** **[use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx)** [ncnn low-level operation api](https://github.com/Tencent/ncnn/wiki/low-level-operation-api) [ncnn param and model file spec](https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure) [ncnn operation param weight table](https://github.com/Tencent/ncnn/wiki/operation-param-weight-table) [how to implement custom layer step by step](https://github.com/Tencent/ncnn/wiki/how-to-implement-custom-layer-step-by-step) --- ## FAQ **[ncnn deepwiki](https://deepwiki.com/Tencent/ncnn) LLM Answering Questions ;)** **[ncnn throw error](https://github.com/Tencent/ncnn/wiki/FAQ-ncnn-throw-error)** **[ncnn produce wrong result](https://github.com/Tencent/ncnn/wiki/FAQ-ncnn-produce-wrong-result)** **[ncnn vulkan](https://github.com/Tencent/ncnn/wiki/FAQ-ncnn-vulkan)** --- ## Features - Supports 
convolutional neural networks, supports multiple input and multi-branch structure, can calculate part of the branch - No third-party library dependencies, does not rely on BLAS / NNPACK or any other computing framework - Pure C++ implementation, cross-platform, supports Android, iOS and so on - ARM NEON assembly level of careful optimization, calculation speed is extremely high - Sophisticated memory management and data structure design, very low memory footprint - Supports multi-core parallel computing acceleration, ARM big.LITTLE CPU scheduling optimization - Supports GPU acceleration via the next-generation low-overhead Vulkan API - Extensible model design, supports 8bit [quantization](https://github.com/Tencent/ncnn/wiki/quantized-int8-inference) and half-precision floating point storage, can import caffe/pytorch/mxnet/onnx/darknet/keras/tensorflow(mlir) models - Support direct memory zero copy reference load network model - Can be registered with custom layer implementation and extended - Well, it is strong, not afraid of being stuffed with 卷 QvQ ## 功能概述 - 支持卷积神经网络,支持多输入和多分支结构,可计算部分分支 - 无任何第三方库依赖,不依赖 BLAS/NNPACK 等计算框架 - 纯 C++ 实现,跨平台,支持 Android / iOS 等 - ARM Neon 汇编级良心优化,计算速度极快 - 精细的内存管理和数据结构设计,内存占用极低 - 支持多核并行计算加速,ARM big.LITTLE CPU 调度优化 - 支持基于全新低消耗的 Vulkan API GPU 加速 - 可扩展的模型设计,支持 8bit [量化](tools/quantize) 和半精度浮点存储,可导入 caffe/pytorch/mxnet/onnx/darknet/keras/tensorflow(mlir) 模型 - 支持直接内存零拷贝引用加载网络模型 - 可注册自定义层实现并扩展 - 恩,很强就是了,不怕被塞卷 QvQ --- ## supported platform matrix - ✅ = known work and runs fast with good optimization - ✔️ = known work, but speed may not be fast enough - ❔ = shall work, not confirmed - / = not applied | | Windows | Linux | Android | macOS | iOS | | ---------- | ------- | ----- | ------- | ----- | --- | | intel-cpu | ✔️ | ✔️ | ✔️ | ✔️ | / | | intel-gpu | ✔️ | ✔️ | ✔️ | ✔️ | / | | amd-cpu | ✔️ | ✔️ | ✔️ | ✔️ | / | | amd-gpu | ✔️ | ✔️ | ✔️ | ✔️ | / | | nvidia-gpu | ✔️ | ✔️ | ✔️ | ✔️ | / | | qcom-cpu | ✅ | ✅ | ✅ | / | / | | qcom-gpu | ✔️ | ✔️ | ✔️ | 
/ | / | | arm-cpu | ✅ | ✅ | ✅ | / | / | | arm-gpu | ❔ | ✔️ | ✔️ | / | / | | apple-cpu | / | / | / | ✔️ | ✅ | | apple-gpu | / | / | / | ✔️ | ✔️ | | ibm-cpu | / | ✔️ | / | / | / | --- ## Project examples - - - - - - - 🤩 -
-
- Call ncnn from Fortran - Use ncnn for real-time speech recognition (i.e., speech-to-text); also support embedded devices and provide mobile Apps (e.g., Android App) --- ## License [BSD 3 Clause](LICENSE.txt) ================================================ FILE: benchmark/CMakeLists.txt ================================================ if(MSVC) # warning C4996: 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details. add_definitions(/wd4996) endif() # ncnn macro include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_add_param.cmake) set(benchncnn_PARAMS alexnet.param blazeface.param efficientnet_b0.param efficientnetv2_b0.param FastestDet.param googlenet_int8.param googlenet.param mnasnet.param mobilenet_int8.param mobilenet_ssd_int8.param mobilenet_ssd.param mobilenet_v2.param mobilenet_v3.param mobilenet_yolo.param mobilenet.param mobilenetv2_yolov3.param nanodet_m.param proxylessnasnet.param regnety_400m.param resnet18_int8.param resnet18.param resnet50_int8.param resnet50.param shufflenet_v2.param shufflenet.param squeezenet_int8.param squeezenet_ssd_int8.param squeezenet_ssd.param squeezenet.param vgg16_int8.param vgg16.param vision_transformer.param yolo-fastest-1.1.param yolo-fastestv2.param yolov4-tiny.param ) foreach(PARAM_FILE ${benchncnn_PARAMS}) ncnn_add_param("${CMAKE_CURRENT_SOURCE_DIR}/${PARAM_FILE}") endforeach() add_custom_target(ncnn-generate-param DEPENDS ${NCNN_PARAM_HEX_FILES}) configure_file(benchncnn_param_data.h.in ${CMAKE_CURRENT_BINARY_DIR}/benchncnn_param_data.h) add_executable(benchncnn benchncnn.cpp) target_link_libraries(benchncnn PRIVATE ncnn) target_include_directories(benchncnn PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") target_link_libraries(benchncnn PRIVATE nodefs.js) endif() add_dependencies(benchncnn ncnn-generate-param) # add benchncnn to a virtual project group set_property(TARGET 
benchncnn PROPERTY FOLDER "benchmark") ================================================ FILE: benchmark/FastestDet.param ================================================ 7767517 127 150 Input in0 0 1 in0 Convolution convrelu_0 1 1 in0 1 0=24 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=648 9=1 Pooling maxpool2d_43 1 1 1 2 0=0 1=3 11=3 12=2 13=1 2=2 3=1 5=1 Split splitncnn_0 1 2 2 3 4 ConvolutionDepthWise convdw_95 1 1 4 5 0=24 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=216 7=24 Convolution convrelu_1 1 1 3 6 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1 ConvolutionDepthWise convdw_96 1 1 6 7 0=24 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=216 7=24 Convolution convrelu_3 1 1 5 8 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1 Convolution convrelu_2 1 1 7 9 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1 Concat cat_0 2 1 8 9 10 0=0 ShuffleChannel shufflechannel_0 1 1 10 11 0=2 1=1 Slice shufflechannel_0_slice 1 2 11 12 13 -23300=2,-233,-233 1=0 Convolution convrelu_4 1 1 13 14 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1 ConvolutionDepthWise convdw_97 1 1 14 15 0=24 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=216 7=24 Convolution convrelu_5 1 1 15 16 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1 Concat cat_1 2 1 12 16 17 0=0 ShuffleChannel shufflechannel_1 1 1 17 18 0=2 1=1 Slice shufflechannel_1_slice 1 2 18 19 20 -23300=2,-233,-233 1=0 Convolution convrelu_6 1 1 20 21 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1 ConvolutionDepthWise convdw_98 1 1 21 22 0=24 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=216 7=24 Convolution convrelu_7 1 1 22 23 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1 Concat cat_2 2 1 19 23 24 0=0 ShuffleChannel shufflechannel_2 1 1 24 25 0=2 1=1 Slice shufflechannel_2_slice 1 2 25 26 27 -23300=2,-233,-233 1=0 Convolution convrelu_8 1 1 27 28 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1 ConvolutionDepthWise convdw_99 1 1 28 29 0=24 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=216 7=24 
Convolution convrelu_9 1 1 29 30 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1 Concat cat_3 2 1 26 30 31 0=0 Split splitncnn_1 1 3 31 32 33 34 ConvolutionDepthWise convdw_100 1 1 34 35 0=48 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=432 7=48 Convolution convrelu_10 1 1 33 36 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 ConvolutionDepthWise convdw_101 1 1 36 37 0=48 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=432 7=48 Convolution convrelu_12 1 1 35 38 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 Convolution convrelu_11 1 1 37 39 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 Concat cat_4 2 1 38 39 40 0=0 ShuffleChannel shufflechannel_3 1 1 40 41 0=2 1=1 Slice shufflechannel_3_slice 1 2 41 42 43 -23300=2,-233,-233 1=0 Convolution convrelu_13 1 1 43 44 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 ConvolutionDepthWise convdw_102 1 1 44 45 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48 Convolution convrelu_14 1 1 45 46 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 Concat cat_5 2 1 42 46 47 0=0 ShuffleChannel shufflechannel_4 1 1 47 48 0=2 1=1 Slice shufflechannel_4_slice 1 2 48 49 50 -23300=2,-233,-233 1=0 Convolution convrelu_15 1 1 50 51 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 ConvolutionDepthWise convdw_103 1 1 51 52 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48 Convolution convrelu_16 1 1 52 53 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 Concat cat_6 2 1 49 53 54 0=0 ShuffleChannel shufflechannel_5 1 1 54 55 0=2 1=1 Slice shufflechannel_5_slice 1 2 55 56 57 -23300=2,-233,-233 1=0 Convolution convrelu_17 1 1 57 58 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 ConvolutionDepthWise convdw_104 1 1 58 59 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48 Convolution convrelu_18 1 1 59 60 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 Concat cat_7 2 1 56 60 61 0=0 ShuffleChannel shufflechannel_6 1 1 61 62 0=2 1=1 Slice shufflechannel_6_slice 1 2 
62 63 64 -23300=2,-233,-233 1=0 Convolution convrelu_19 1 1 64 65 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 ConvolutionDepthWise convdw_105 1 1 65 66 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48 Convolution convrelu_20 1 1 66 67 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 Concat cat_8 2 1 63 67 68 0=0 ShuffleChannel shufflechannel_7 1 1 68 69 0=2 1=1 Slice shufflechannel_7_slice 1 2 69 70 71 -23300=2,-233,-233 1=0 Convolution convrelu_21 1 1 71 72 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 ConvolutionDepthWise convdw_106 1 1 72 73 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48 Convolution convrelu_22 1 1 73 74 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 Concat cat_9 2 1 70 74 75 0=0 ShuffleChannel shufflechannel_8 1 1 75 76 0=2 1=1 Slice shufflechannel_8_slice 1 2 76 77 78 -23300=2,-233,-233 1=0 Convolution convrelu_23 1 1 78 79 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 ConvolutionDepthWise convdw_107 1 1 79 80 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48 Convolution convrelu_24 1 1 80 81 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 Concat cat_10 2 1 77 81 82 0=0 ShuffleChannel shufflechannel_9 1 1 82 83 0=2 1=1 Slice shufflechannel_9_slice 1 2 83 84 85 -23300=2,-233,-233 1=0 Convolution convrelu_25 1 1 85 86 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 ConvolutionDepthWise convdw_108 1 1 86 87 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48 Convolution convrelu_26 1 1 87 88 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1 Concat cat_11 2 1 84 88 89 0=0 Split splitncnn_2 1 3 89 90 91 92 ConvolutionDepthWise convdw_109 1 1 92 93 0=96 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=864 7=96 Convolution convrelu_27 1 1 91 94 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1 ConvolutionDepthWise convdw_110 1 1 94 95 0=96 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=864 7=96 Convolution convrelu_29 1 1 93 96 0=96 1=1 11=1 12=1 13=1 14=0 2=1 
3=1 4=0 5=1 6=9216 9=1 Convolution convrelu_28 1 1 95 97 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1 Concat cat_12 2 1 96 97 98 0=0 ShuffleChannel shufflechannel_10 1 1 98 99 0=2 1=1 Slice shufflechannel_10_slice 1 2 99 100 101 -23300=2,-233,-233 1=0 Convolution convrelu_30 1 1 101 102 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1 ConvolutionDepthWise convdw_111 1 1 102 103 0=96 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=864 7=96 Convolution convrelu_31 1 1 103 104 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1 Concat cat_13 2 1 100 104 105 0=0 ShuffleChannel shufflechannel_11 1 1 105 106 0=2 1=1 Slice shufflechannel_11_slice 1 2 106 107 108 -23300=2,-233,-233 1=0 Convolution convrelu_32 1 1 108 109 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1 ConvolutionDepthWise convdw_112 1 1 109 110 0=96 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=864 7=96 Convolution convrelu_33 1 1 110 111 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1 Concat cat_14 2 1 107 111 112 0=0 ShuffleChannel shufflechannel_12 1 1 112 113 0=2 1=1 Slice shufflechannel_12_slice 1 2 113 114 115 -23300=2,-233,-233 1=0 Convolution convrelu_34 1 1 115 116 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1 ConvolutionDepthWise convdw_113 1 1 116 117 0=96 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=864 7=96 Convolution convrelu_35 1 1 117 118 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1 Concat cat_15 2 1 114 118 119 0=0 Pooling avgpool2d_0 1 1 32 120 0=1 1=3 11=3 12=2 13=1 2=2 3=1 5=1 6=1 Interp upsample_94 1 1 119 121 0=1 1=2.000000e+00 2=2.000000e+00 6=0 Concat cat_16 3 1 120 90 121 122 0=0 Convolution convrelu_36 1 1 122 123 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=32256 9=1 Split splitncnn_3 1 4 123 124 125 126 127 ConvolutionDepthWise convdwrelu_5 1 1 127 128 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1 ConvolutionDepthWise convdwrelu_0 1 1 126 129 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1 
ConvolutionDepthWise convdwrelu_4 1 1 129 130 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1 ConvolutionDepthWise convdwrelu_1 1 1 125 131 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1 ConvolutionDepthWise convdwrelu_2 1 1 131 132 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1 ConvolutionDepthWise convdwrelu_3 1 1 132 133 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1 Concat cat_17 3 1 128 130 133 134 0=0 Convolution conv_38 1 1 134 135 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=27648 BinaryOp add_0 2 1 124 135 136 0=0 ReLU relu_87 1 1 136 137 Convolution convrelu_37 1 1 137 138 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1 Split splitncnn_4 1 3 138 139 140 141 ConvolutionDepthWise convdwrelu_7 1 1 139 142 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1 Convolution conv_41 1 1 142 143 0=80 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=7680 ConvolutionDepthWise convdwrelu_8 1 1 140 144 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1 Convolution conv_42 1 1 144 145 0=4 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=384 Softmax softmax_93 1 1 143 146 0=0 1=1 ConvolutionDepthWise convdwrelu_6 1 1 141 147 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1 Convolution convsigmoid_38 1 1 147 148 0=1 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=96 9=4 Concat cat_18 3 1 148 145 146 out0 0=0 ================================================ FILE: benchmark/README.md ================================================ benchncnn can be used to test neural network inference performance Only the network definition files (ncnn param) are required. The large model binary files (ncnn bin) are not loaded but generated randomly for speed test. If no model specified, it would benchmark default built-in models. More model networks may be added later. 
--- Build ```shell # assume you have already built ncnn library successfully # uncomment the following line in [ncnn-root-dir]/CMakeLists.txt with your favorite editor # add_subdirectory(benchmark) cd [ncnn-root-dir]/[your-build-dir] make -j4 # you can find benchncnn binary in [ncnn-root-dir]/[your-build-dir]/benchmark ``` Usage ```shell # copy all param files to the current directory ./benchncnn [loop count] [num threads] [powersave] [gpu device] [cooling down] [(key=value)...] param=model.param shape=[227,227,3],.. ``` run benchncnn on android device ```shell # for running on android device, upload to /data/local/tmp/ folder adb push benchncnn /data/local/tmp/ # (optional) upload your ncnn model param to /data/local/tmp/ folder adb push model.param /data/local/tmp/ # executed in android adb shell adb shell cd /data/local/tmp/ # sample: benchmark built-in models on cpu, with 4 threads on big core, 4 loops and cooling_down ./benchncnn 4 4 2 -1 1 # sample: benchmark built-in models on gpu id 0, with 1 thread on big core, 8 loops, without cooling_down ./benchncnn 8 1 2 0 0 ./benchncnn [loop count] [num threads] [powersave] [gpu device] [cooling down] [(key=value)...] param=model.param shape=[227,227,3],..
``` Parameter |param|options|default| |---|---|---| |loop count|1~N|4| |num threads|1~N|max_cpu_count| |powersave|0=all cores, 1=little cores only, 2=big cores only|0| |gpu device|-1=cpu-only, 0=gpu0, 1=gpu1 ...|-1| |cooling down|0=disable, 1=enable|1| |param|ncnn model.param filepath|-| |shape|model input shapes, in whc format|-| Tips: Disable android UI server and set CPU and GPU to max frequency ```shell # stopping android ui server, can be restarted later via adb shell start adb root adb shell stop # executed in android adb shell # set cpu performance mode echo "performance" > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor echo "performance" > /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor echo "performance" > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor echo "performance" > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor echo "performance" > /sys/devices/system/cpu/cpu4/cpufreq/scaling_governor echo "performance" > /sys/devices/system/cpu/cpu5/cpufreq/scaling_governor # set gpu performance mode (eg. RK3399) echo "performance" > /sys/class/misc/mali0/device/devfreq/ff9a0000.gpu/governor # set gpu performance mode (eg.
Android Adreno) echo 1 > /sys/class/kgsl/kgsl-3d0/force_clk_on echo 10000000 > /sys/class/kgsl/kgsl-3d0/idle_timer echo "performance" > /sys/class/kgsl/kgsl-3d0/devfreq/governor echo > /sys/class/kgsl/kgsl-3d0/gpuclk ``` --- Typical output (executed in android adb shell) ### NVIDIA Jetson AGX Orin (Cortex-A78AE 2.2 GHz x 12 + Ampere@1.3 GHz Tensor Cores 64) ``` i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 1 0 -1 0 loop_count = 64 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 11.66 max = 11.80 avg = 11.74 squeezenet_int8 min = 12.24 max = 12.39 avg = 12.31 mobilenet min = 19.56 max = 19.73 avg = 19.65 mobilenet_int8 min = 16.06 max = 16.25 avg = 16.14 mobilenet_v2 min = 13.20 max = 13.41 avg = 13.29 mobilenet_v3 min = 11.39 max = 11.57 avg = 11.48 shufflenet min = 8.07 max = 8.18 avg = 8.11 shufflenet_v2 min = 8.41 max = 8.51 avg = 8.45 mnasnet min = 12.74 max = 12.91 avg = 12.79 proxylessnasnet min = 15.18 max = 15.32 avg = 15.25 efficientnet_b0 min = 26.86 max = 26.96 avg = 26.90 efficientnetv2_b0 min = 35.99 max = 36.15 avg = 36.07 regnety_400m min = 16.81 max = 16.98 avg = 16.87 blazeface min = 4.25 max = 4.37 avg = 4.29 googlenet min = 48.73 max = 48.98 avg = 48.87 googlenet_int8 min = 47.39 max = 47.60 avg = 47.49 resnet18 min = 30.93 max = 31.24 avg = 31.08 resnet18_int8 min = 55.44 max = 55.70 avg = 55.56 alexnet min = 44.19 max = 44.43 avg = 44.33 vgg16 min = 173.94 max = 174.97 avg = 174.46 vgg16_int8 min = 475.10 max = 479.37 avg = 477.33 resnet50 min = 89.50 max = 90.11 avg = 89.80 resnet50_int8 min = 106.77 max = 107.14 avg = 106.96 squeezenet_ssd min = 37.78 max = 38.35 avg = 37.93 squeezenet_ssd_int8 min = 50.48 max = 50.88 avg = 50.74 mobilenet_ssd min = 45.62 max = 46.12 avg = 45.74 mobilenet_ssd_int8 min = 37.77 max = 38.00 avg = 37.88 mobilenet_yolo min = 90.23 max = 90.49 avg = 90.35 mobilenetv2_yolov3 min = 47.27 max = 47.48 avg = 47.33 yolov4-tiny min = 60.41 max = 60.75 avg = 60.57 nanodet_m min = 19.26 
max = 19.43 avg = 19.35 yolo-fastest-1.1 min = 8.16 max = 8.31 avg = 8.20 yolo-fastestv2 min = 8.26 max = 8.39 avg = 8.32 i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 2 0 -1 0 loop_count = 64 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 6.83 max = 6.98 avg = 6.90 squeezenet_int8 min = 7.39 max = 7.50 avg = 7.45 mobilenet min = 10.40 max = 10.50 avg = 10.45 mobilenet_int8 min = 8.92 max = 9.09 avg = 8.99 mobilenet_v2 min = 7.67 max = 7.80 avg = 7.74 mobilenet_v3 min = 6.86 max = 7.01 avg = 6.93 shufflenet min = 6.34 max = 6.44 avg = 6.39 shufflenet_v2 min = 5.71 max = 5.83 avg = 5.76 mnasnet min = 7.47 max = 7.58 avg = 7.53 proxylessnasnet min = 8.73 max = 8.83 avg = 8.78 efficientnet_b0 min = 14.93 max = 15.13 avg = 15.03 efficientnetv2_b0 min = 20.17 max = 20.70 avg = 20.29 regnety_400m min = 12.50 max = 12.62 avg = 12.57 blazeface min = 2.95 max = 3.06 avg = 3.00 googlenet min = 26.25 max = 26.53 avg = 26.37 googlenet_int8 min = 26.54 max = 26.79 avg = 26.66 resnet18 min = 16.69 max = 16.90 avg = 16.80 resnet18_int8 min = 29.70 max = 29.93 avg = 29.81 alexnet min = 22.96 max = 23.12 avg = 23.03 vgg16 min = 88.39 max = 89.16 avg = 88.79 vgg16_int8 min = 245.86 max = 247.55 avg = 246.62 resnet50 min = 46.55 max = 46.86 avg = 46.70 resnet50_int8 min = 56.28 max = 56.63 avg = 56.43 squeezenet_ssd min = 23.65 max = 24.29 avg = 23.81 squeezenet_ssd_int8 min = 30.86 max = 31.27 avg = 30.99 mobilenet_ssd min = 25.17 max = 25.31 avg = 25.24 mobilenet_ssd_int8 min = 21.77 max = 21.97 avg = 21.84 mobilenet_yolo min = 48.03 max = 48.33 avg = 48.14 mobilenetv2_yolov3 min = 26.58 max = 26.81 avg = 26.66 yolov4-tiny min = 35.31 max = 35.53 avg = 35.41 nanodet_m min = 12.93 max = 13.08 avg = 13.01 yolo-fastest-1.1 min = 6.00 max = 6.10 avg = 6.04 yolo-fastestv2 min = 6.46 max = 6.61 avg = 6.52 i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 4 0 -1 0 loop_count = 64 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet 
min = 4.54 max = 4.84 avg = 4.61 squeezenet_int8 min = 4.96 max = 5.41 avg = 5.05 mobilenet min = 5.96 max = 6.23 avg = 6.04 mobilenet_int8 min = 5.21 max = 5.50 avg = 5.30 mobilenet_v2 min = 5.05 max = 5.26 avg = 5.15 mobilenet_v3 min = 4.83 max = 5.14 avg = 4.90 shufflenet min = 5.11 max = 5.34 avg = 5.18 shufflenet_v2 min = 4.13 max = 4.44 avg = 4.18 mnasnet min = 4.93 max = 5.27 avg = 5.01 proxylessnasnet min = 5.64 max = 5.89 avg = 5.72 efficientnet_b0 min = 9.47 max = 10.60 avg = 9.60 efficientnetv2_b0 min = 12.67 max = 13.06 avg = 12.82 regnety_400m min = 10.27 max = 10.58 avg = 10.38 blazeface min = 2.05 max = 2.27 avg = 2.10 googlenet min = 15.57 max = 15.96 avg = 15.68 googlenet_int8 min = 16.19 max = 16.65 avg = 16.32 resnet18 min = 10.20 max = 11.76 avg = 10.35 resnet18_int8 min = 16.89 max = 17.31 avg = 17.03 alexnet min = 13.13 max = 13.70 avg = 13.32 vgg16 min = 51.03 max = 52.46 avg = 51.35 vgg16_int8 min = 131.08 max = 139.44 avg = 133.78 resnet50 min = 26.74 max = 28.32 avg = 26.91 resnet50_int8 min = 32.15 max = 32.74 avg = 32.38 squeezenet_ssd min = 16.58 max = 16.99 avg = 16.70 squeezenet_ssd_int8 min = 20.22 max = 21.67 avg = 20.51 mobilenet_ssd min = 14.68 max = 16.07 avg = 14.83 mobilenet_ssd_int8 min = 12.89 max = 13.27 avg = 13.01 mobilenet_yolo min = 28.44 max = 28.85 avg = 28.58 mobilenetv2_yolov3 min = 17.21 max = 21.31 avg = 17.44 yolov4-tiny min = 23.68 max = 24.38 avg = 23.88 nanodet_m min = 8.76 max = 9.17 avg = 8.86 yolo-fastest-1.1 min = 4.83 max = 5.04 avg = 4.88 yolo-fastestv2 min = 4.93 max = 5.17 avg = 5.00 i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 8 0 -1 0 loop_count = 64 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 3.52 max = 4.28 avg = 3.65 squeezenet_int8 min = 3.85 max = 4.11 avg = 3.93 mobilenet min = 3.78 max = 4.12 avg = 3.85 mobilenet_int8 min = 3.57 max = 3.85 avg = 3.63 mobilenet_v2 min = 4.14 max = 4.44 avg = 4.22 mobilenet_v3 min = 3.89 max = 4.26 avg = 3.97 shufflenet min 
= 4.78 max = 4.95 avg = 4.84 shufflenet_v2 min = 3.49 max = 3.84 avg = 3.54 mnasnet min = 3.94 max = 4.09 avg = 3.99 proxylessnasnet min = 4.41 max = 4.68 avg = 4.47 efficientnet_b0 min = 7.01 max = 7.85 avg = 7.13 efficientnetv2_b0 min = 9.22 max = 9.46 avg = 9.32 regnety_400m min = 9.34 max = 9.66 avg = 9.44 blazeface min = 1.86 max = 1.98 avg = 1.89 googlenet min = 10.37 max = 10.76 avg = 10.48 googlenet_int8 min = 11.03 max = 11.34 avg = 11.16 resnet18 min = 6.83 max = 7.12 avg = 6.93 resnet18_int8 min = 10.25 max = 11.50 avg = 10.42 alexnet min = 8.88 max = 9.71 avg = 9.01 vgg16 min = 31.26 max = 31.97 avg = 31.44 vgg16_int8 min = 71.31 max = 74.53 avg = 72.18 resnet50 min = 16.43 max = 16.84 avg = 16.52 resnet50_int8 min = 19.07 max = 20.28 avg = 19.42 squeezenet_ssd min = 13.50 max = 13.69 avg = 13.56 squeezenet_ssd_int8 min = 15.16 max = 16.06 avg = 15.30 mobilenet_ssd min = 9.73 max = 10.85 avg = 9.90 mobilenet_ssd_int8 min = 9.27 max = 9.46 avg = 9.36 mobilenet_yolo min = 17.58 max = 17.79 avg = 17.67 mobilenetv2_yolov3 min = 12.80 max = 13.50 avg = 12.90 yolov4-tiny min = 17.98 max = 21.31 avg = 18.24 nanodet_m min = 7.01 max = 7.18 avg = 7.09 yolo-fastest-1.1 min = 4.76 max = 4.86 avg = 4.80 yolo-fastestv2 min = 4.76 max = 4.88 avg = 4.82 i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 12 0 -1 0 loop_count = 64 num_threads = 12 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 3.50 max = 5.21 avg = 3.65 squeezenet_int8 min = 3.97 max = 4.44 avg = 4.12 mobilenet min = 3.49 max = 7.73 avg = 3.78 mobilenet_int8 min = 3.40 max = 3.86 avg = 3.49 mobilenet_v2 min = 4.07 max = 4.39 avg = 4.17 mobilenet_v3 min = 3.92 max = 4.17 avg = 4.03 shufflenet min = 5.08 max = 6.63 avg = 5.18 shufflenet_v2 min = 3.64 max = 5.11 avg = 3.75 mnasnet min = 3.86 max = 4.16 avg = 3.95 proxylessnasnet min = 4.30 max = 5.39 avg = 4.38 efficientnet_b0 min = 6.42 max = 9.19 avg = 6.61 efficientnetv2_b0 min = 8.96 max = 9.43 avg = 9.12 regnety_400m min = 10.11 max = 
10.89 avg = 10.27 blazeface min = 1.93 max = 2.16 avg = 1.99 googlenet min = 9.72 max = 10.84 avg = 10.01 googlenet_int8 min = 10.91 max = 13.03 avg = 11.17 resnet18 min = 6.70 max = 7.27 avg = 6.92 resnet18_int8 min = 9.62 max = 12.93 avg = 10.14 alexnet min = 7.21 max = 7.47 avg = 7.32 vgg16 min = 29.61 max = 63.73 avg = 30.86 vgg16_int8 min = 64.91 max = 75.06 avg = 68.72 resnet50 min = 15.35 max = 16.28 avg = 15.73 resnet50_int8 min = 17.47 max = 18.98 avg = 18.09 squeezenet_ssd min = 13.40 max = 28.74 avg = 14.07 squeezenet_ssd_int8 min = 15.35 max = 16.77 avg = 15.67 mobilenet_ssd min = 9.51 max = 11.49 avg = 9.88 mobilenet_ssd_int8 min = 9.43 max = 10.08 avg = 9.58 mobilenet_yolo min = 16.88 max = 17.45 avg = 17.09 mobilenetv2_yolov3 min = 11.91 max = 31.90 avg = 12.50 yolov4-tiny min = 17.85 max = 18.87 avg = 18.36 nanodet_m min = 6.88 max = 7.64 avg = 7.06 yolo-fastest-1.1 min = 5.02 max = 5.53 avg = 5.12 yolo-fastestv2 min = 4.95 max = 5.60 avg = 5.05 i@orin:~/projects/ncnn/benchmark$ ./benchncnn 128 1 0 0 0 [0 NVIDIA Tegra Orin (nvgpu)] queueC=2[8] queueG=0[16] queueT=1[2] [0 NVIDIA Tegra Orin (nvgpu)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA Tegra Orin (nvgpu)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA Tegra Orin (nvgpu)] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 128 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 2.13 max = 3.37 avg = 2.31 squeezenet_int8 min = 12.31 max = 12.51 avg = 12.42 mobilenet min = 2.03 max = 2.73 avg = 2.23 mobilenet_int8 min = 16.86 max = 17.91 avg = 16.99 mobilenet_v2 min = 2.59 max = 3.59 avg = 2.91 mobilenet_v3 min = 3.22 max = 4.23 avg = 3.71 shufflenet min = 2.57 max = 3.27 avg = 2.80 shufflenet_v2 min = 3.20 max = 4.03 avg = 3.47 mnasnet min = 2.45 max = 3.06 avg = 2.69 proxylessnasnet min = 2.50 max = 3.14 avg = 2.72 efficientnet_b0 min = 4.23 max = 8.73 avg = 4.85 efficientnetv2_b0 min = 8.15 max = 8.60 avg = 8.41 regnety_400m min = 3.25 max = 4.17 avg = 3.54 
blazeface min = 1.29 max = 1.48 avg = 1.33 googlenet min = 4.95 max = 12.34 avg = 6.36 googlenet_int8 min = 47.49 max = 47.78 avg = 47.61 resnet18 min = 3.18 max = 9.49 avg = 4.04 resnet18_int8 min = 55.57 max = 55.88 avg = 55.73 alexnet min = 3.22 max = 14.56 avg = 4.25 vgg16 min = 6.82 max = 14.75 avg = 8.18 vgg16_int8 min = 473.55 max = 479.07 avg = 476.22 resnet50 min = 4.75 max = 15.06 avg = 6.08 resnet50_int8 min = 106.99 max = 107.48 avg = 107.22 squeezenet_ssd min = 6.87 max = 9.12 avg = 7.76 squeezenet_ssd_int8 min = 50.87 max = 51.17 avg = 51.01 mobilenet_ssd min = 4.44 max = 6.22 avg = 5.23 mobilenet_ssd_int8 min = 37.80 max = 38.03 avg = 37.92 mobilenet_yolo min = 5.41 max = 7.36 avg = 6.29 mobilenetv2_yolov3 min = 7.20 max = 9.96 avg = 7.30 yolov4-tiny min = 16.48 max = 28.81 avg = 18.40 nanodet_m min = 5.75 max = 8.54 avg = 6.85 yolo-fastest-1.1 min = 4.03 max = 4.75 avg = 4.35 yolo-fastestv2 min = 4.27 max = 5.23 avg = 4.71 ``` ### AMD Ryzen Threadripper 3970X (Zen2 3.7 GHz ~ 4.5 GHz x 32) ``` i@s:~/qtang/ncnn/benchmark$ ../build-vulkan/benchmark/benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 11.73 max = 11.88 avg = 11.78 mobilenet min = 21.63 max = 21.73 avg = 21.68 mobilenet_v2 min = 14.70 max = 14.95 avg = 14.82 mobilenet_v3 min = 12.12 max = 12.17 avg = 12.15 shufflenet min = 14.08 max = 14.16 avg = 14.12 shufflenet_v2 min = 25.99 max = 26.13 avg = 26.06 mnasnet min = 14.12 max = 14.17 avg = 14.14 proxylessnasnet min = 16.51 max = 16.71 avg = 16.61 efficientnet_b0 min = 22.88 max = 22.97 avg = 22.93 regnety_400m min = 18.50 max = 18.61 avg = 18.56 blazeface min = 6.18 max = 6.27 avg = 6.21 googlenet min = 58.42 max = 58.60 avg = 58.49 resnet18 min = 61.13 max = 61.84 avg = 61.40 alexnet min = 50.82 max = 50.98 avg = 50.92 vgg16 min = 217.19 max = 218.40 avg = 217.87 resnet50 min = 126.84 max = 137.46 avg = 128.21 squeezenet_ssd min = 114.24 max = 114.57 avg = 114.47 
mobilenet_ssd min = 51.60 max = 51.89 avg = 51.77 mobilenet_yolo min = 125.09 max = 126.33 avg = 125.83 mobilenetv2_yolov3 min = 57.51 max = 57.79 avg = 57.65 yolov4-tiny min = 85.65 max = 85.97 avg = 85.79 ``` ### NVIDIA Quadro RTX 8000 (TU102 SM x 72 + Tensor Core x 576) ``` i@s:~/qtang/ncnn/benchmark$ ../build-vulkan/benchmark/benchncnn 256 1 0 1 0 [0 Quadro RTX 8000] queueC=2[8] queueG=0[16] queueT=1[2] [0 Quadro RTX 8000] bugsbn1=0 bugcopc=0 bugihfa=0 [0 Quadro RTX 8000] fp16p=1 fp16s=1 fp16a=1 int8s=1 int8a=1 [0 Quadro RTX 8000] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 [1 Quadro RTX 8000] queueC=2[8] queueG=0[16] queueT=1[2] [1 Quadro RTX 8000] bugsbn1=0 bugcopc=0 bugihfa=0 [1 Quadro RTX 8000] fp16p=1 fp16s=1 fp16a=1 int8s=1 int8a=1 [1 Quadro RTX 8000] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 256 num_threads = 1 powersave = 0 gpu_device = 1 cooling_down = 0 squeezenet min = 0.84 max = 1.39 avg = 0.93 mobilenet min = 0.90 max = 2.30 avg = 0.91 mobilenet_v2 min = 1.35 max = 9.59 avg = 1.46 mobilenet_v3 min = 1.60 max = 77.94 avg = 2.12 shufflenet min = 0.86 max = 2.27 avg = 0.88 shufflenet_v2 min = 1.25 max = 1.47 avg = 1.27 mnasnet min = 1.42 max = 20.77 avg = 1.72 proxylessnasnet min = 1.48 max = 1.67 avg = 1.49 efficientnet_b0 min = 2.56 max = 12.86 avg = 2.77 regnety_400m min = 1.84 max = 14.98 avg = 2.42 blazeface min = 0.64 max = 0.90 avg = 0.65 googlenet min = 2.94 max = 76.82 avg = 3.45 resnet18 min = 1.27 max = 10.56 avg = 1.56 alexnet min = 1.53 max = 71.76 avg = 1.96 vgg16 min = 4.90 max = 78.12 avg = 5.80 resnet50 min = 3.00 max = 12.51 avg = 3.07 squeezenet_ssd min = 5.60 max = 97.09 avg = 6.50 mobilenet_ssd min = 2.40 max = 93.64 avg = 3.30 mobilenet_yolo min = 2.96 max = 19.15 avg = 3.25 mobilenetv2_yolov3 min = 4.52 max = 66.96 avg = 5.32 yolov4-tiny min = 9.32 max = 72.92 avg = 14.01 ``` ### NVIDIA RTX3090 (GA102 SM x 82 + Tensor Core 328) ``` (base) i@t:~/wls/ncnn/benchmark$ ../build/benchmark/benchncnn 32 1 0 0 0 [0 
GeForce RTX 3090] queueC=2[8] queueG=0[16] queueT=1[2] [0 GeForce RTX 3090] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 GeForce RTX 3090] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 GeForce RTX 3090] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 [1 GeForce RTX 3090] queueC=2[8] queueG=0[16] queueT=1[2] [1 GeForce RTX 3090] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 GeForce RTX 3090] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [1 GeForce RTX 3090] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 32 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 1.76 max = 2.74 avg = 1.80 squeezenet_int8 min = 47.10 max = 47.75 avg = 47.21 mobilenet min = 4.77 max = 5.79 avg = 5.20 mobilenet_int8 min = 64.19 max = 67.05 avg = 64.39 mobilenet_v2 min = 2.44 max = 20.89 avg = 6.98 mobilenet_v3 min = 2.75 max = 2.87 avg = 2.77 shufflenet min = 2.20 max = 2.62 avg = 2.46 shufflenet_v2 min = 5.10 max = 7.43 avg = 5.75 mnasnet min = 3.47 max = 3.50 avg = 3.48 proxylessnasnet min = 2.59 max = 9.08 avg = 7.28 efficientnet_b0 min = 3.87 max = 4.65 avg = 3.91 efficientnetv2_b0 min = 29.48 max = 41.90 avg = 30.14 regnety_400m min = 2.89 max = 2.99 avg = 2.91 blazeface min = 1.55 max = 2.14 avg = 1.60 googlenet min = 4.33 max = 17.89 avg = 6.05 googlenet_int8 min = 174.46 max = 178.19 avg = 174.74 resnet18 min = 2.14 max = 11.04 avg = 5.33 resnet18_int8 min = 193.37 max = 193.83 avg = 193.55 alexnet min = 2.37 max = 15.99 avg = 4.50 vgg16 min = 4.55 max = 16.65 avg = 5.22 vgg16_int8 min = 1538.76 max = 1544.81 avg = 1540.79 resnet50 min = 4.13 max = 25.86 avg = 5.80 resnet50_int8 min = 400.89 max = 401.72 avg = 401.29 squeezenet_ssd min = 6.95 max = 7.81 avg = 7.07 squeezenet_ssd_int8 min = 158.51 max = 159.04 avg = 158.68 mobilenet_ssd min = 4.36 max = 18.98 avg = 9.40 mobilenet_ssd_int8 min = 130.74 max = 130.92 avg = 130.83 mobilenet_yolo min = 3.96 max = 11.94 avg = 6.48 mobilenetv2_yolov3 min = 6.07 max = 6.21 avg = 6.13 yolov4-tiny min = 13.01 max = 26.78 avg = 14.87 
root@3090:~/Desktop/ncnn-20221128/build/benchmark$ ./benchncnn 100 10 2 0 0 [0 NVIDIA GeForce RTX 3090] queueC=2[8] queueG=0[16] queueT=1[2] [0 NVIDIA GeForce RTX 3090] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA GeForce RTX 3090] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA GeForce RTX 3090] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 100 num_threads = 10 powersave = 2 gpu_device = 0 cooling_down = 0 squeezenet min = 0.64 max = 0.66 avg = 0.65 squeezenet_int8 min = 4.30 max = 4.93 avg = 4.45 mobilenet min = 0.60 max = 1.85 avg = 1.32 mobilenet_int8 min = 3.08 max = 3.17 avg = 3.12 mobilenet_v2 min = 1.40 max = 1.46 avg = 1.42 mobilenet_v3 min = 1.22 max = 6.10 avg = 3.02 shufflenet min = 0.90 max = 0.97 avg = 0.92 shufflenet_v2 min = 1.06 max = 1.13 avg = 1.09 mnasnet min = 0.84 max = 0.98 avg = 0.91 proxylessnasnet min = 0.99 max = 3.01 avg = 2.45 efficientnet_b0 min = 2.11 max = 2.85 avg = 2.16 efficientnetv2_b0 min = 7.46 max = 28.58 avg = 8.55 regnety_400m min = 1.53 max = 1.75 avg = 1.59 blazeface min = 0.59 max = 0.94 avg = 0.63 googlenet min = 1.90 max = 12.22 avg = 2.63 googlenet_int8 min = 17.45 max = 18.69 avg = 17.81 resnet18 min = 0.90 max = 13.14 avg = 3.09 resnet18_int8 min = 16.25 max = 17.34 avg = 16.50 alexnet min = 0.86 max = 4.77 avg = 2.59 vgg16 min = 1.38 max = 11.20 avg = 2.91 vgg16_int8 min = 47.17 max = 49.02 avg = 47.57 resnet50 min = 1.54 max = 2.16 avg = 1.64 resnet50_int8 min = 22.90 max = 24.46 avg = 23.23 squeezenet_ssd min = 2.25 max = 10.91 avg = 4.12 squeezenet_ssd_int8 min = 11.98 max = 14.54 avg = 12.31 mobilenet_ssd min = 1.46 max = 8.98 avg = 3.38 mobilenet_ssd_int8 min = 6.13 max = 6.65 avg = 6.23 mobilenet_yolo min = 1.29 max = 1.43 avg = 1.34 mobilenetv2_yolov3 min = 3.64 max = 6.66 avg = 3.77 yolov4-tiny min = 9.04 max = 11.65 avg = 9.54 nanodet_m min = 1.43 max = 11.90 avg = 3.16 yolo-fastest-1.1 min = 1.40 max = 1.82 avg = 1.57 yolo-fastestv2 min = 1.36 max = 2.30 avg = 1.42 vision_transformer min = 
202.71 max = 244.47 avg = 218.69 FastestDet min = 1.37 max = 5.37 avg = 2.77 ``` ### AMD Ryzen Embedded V1605B (Zen 2.0 GHz ~ 3.6 GHz x 4 + Radeon Vega 8 1.1GHz 8CU) ``` C:\Users\i\Desktop\benchmark>benchncnn.exe 32 1 0 -1 0 loop_count = 32 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 22.13 max = 24.07 avg = 22.88 squeezenet_int8 min = 58.54 max = 62.21 avg = 59.55 mobilenet min = 40.99 max = 43.67 avg = 41.70 mobilenet_int8 min = 98.06 max = 111.37 avg = 101.15 mobilenet_v2 min = 26.53 max = 28.96 avg = 27.81 mobilenet_v3 min = 22.96 max = 25.25 avg = 23.30 shufflenet min = 20.17 max = 28.78 avg = 21.09 shufflenet_v2 min = 19.06 max = 19.72 avg = 19.47 mnasnet min = 25.11 max = 39.53 avg = 27.54 proxylessnasnet min = 28.84 max = 35.16 avg = 30.03 efficientnet_b0 min = 43.16 max = 46.03 avg = 43.65 efficientnetv2_b0 min = 48.64 max = 52.07 avg = 49.62 regnety_400m min = 33.43 max = 35.87 avg = 33.97 blazeface min = 5.43 max = 6.04 avg = 5.56 googlenet min = 85.80 max = 90.93 avg = 87.65 googlenet_int8 min = 214.37 max = 230.75 avg = 219.50 resnet18 min = 76.58 max = 80.38 avg = 77.34 resnet18_int8 min = 231.16 max = 255.22 avg = 236.65 alexnet min = 60.69 max = 64.06 avg = 61.34 vgg16 min = 286.45 max = 307.04 avg = 290.86 vgg16_int8 min = 1797.58 max = 2079.73 avg = 1844.78 resnet50 min = 198.27 max = 215.03 avg = 201.37 resnet50_int8 min = 493.52 max = 499.67 avg = 496.95 squeezenet_ssd min = 189.97 max = 198.53 avg = 192.10 squeezenet_ssd_int8 min = 198.81 max = 214.55 avg = 203.59 mobilenet_ssd min = 87.56 max = 92.72 avg = 89.03 mobilenet_ssd_int8 min = 196.97 max = 209.51 avg = 201.95 mobilenet_yolo min = 206.87 max = 218.48 avg = 210.84 mobilenetv2_yolov3 min = 102.72 max = 108.18 avg = 104.62 yolov4-tiny min = 117.97 max = 134.73 avg = 121.26 C:\Users\i\Desktop\benchmark>benchncnn.exe 32 2 0 -1 0 loop_count = 32 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 13.43 max = 14.35 avg = 13.62 
squeezenet_int8 min = 32.29 max = 50.76 avg = 33.56 mobilenet min = 23.42 max = 25.10 avg = 24.09 mobilenet_int8 min = 51.99 max = 55.42 avg = 53.01 mobilenet_v2 min = 15.45 max = 15.75 avg = 15.59 mobilenet_v3 min = 14.32 max = 14.75 avg = 14.39 shufflenet min = 12.64 max = 12.83 avg = 12.69 shufflenet_v2 min = 11.45 max = 12.44 avg = 11.60 mnasnet min = 14.43 max = 20.45 avg = 15.11 proxylessnasnet min = 16.18 max = 16.38 avg = 16.24 efficientnet_b0 min = 25.25 max = 28.42 avg = 26.59 efficientnetv2_b0 min = 27.57 max = 32.05 avg = 30.04 regnety_400m min = 22.74 max = 24.75 avg = 23.31 blazeface min = 3.44 max = 3.83 avg = 3.62 googlenet min = 49.39 max = 66.76 avg = 53.76 googlenet_int8 min = 113.89 max = 136.75 avg = 119.29 resnet18 min = 43.77 max = 67.24 avg = 46.14 resnet18_int8 min = 121.44 max = 148.01 avg = 126.95 alexnet min = 34.46 max = 37.38 avg = 35.50 vgg16 min = 177.16 max = 207.25 avg = 184.19 vgg16_int8 min = 951.86 max = 1155.60 avg = 990.51 resnet50 min = 112.28 max = 137.18 avg = 115.64 resnet50_int8 min = 260.69 max = 272.26 avg = 265.89 squeezenet_ssd min = 108.07 max = 121.66 avg = 110.35 squeezenet_ssd_int8 min = 109.01 max = 126.86 avg = 111.96 mobilenet_ssd min = 49.60 max = 52.62 avg = 50.46 mobilenet_ssd_int8 min = 104.22 max = 111.07 avg = 106.33 mobilenet_yolo min = 117.42 max = 136.73 avg = 122.92 mobilenetv2_yolov3 min = 61.66 max = 65.22 avg = 63.01 yolov4-tiny min = 72.64 max = 77.09 avg = 74.30 C:\Users\i\Desktop\benchmark>benchncnn.exe 32 4 0 -1 0 loop_count = 32 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 9.19 max = 14.82 avg = 11.15 squeezenet_int8 min = 19.00 max = 40.30 avg = 24.80 mobilenet min = 18.02 max = 39.84 avg = 27.38 mobilenet_int8 min = 28.04 max = 57.59 avg = 34.15 mobilenet_v2 min = 10.26 max = 17.79 avg = 13.36 mobilenet_v3 min = 8.87 max = 10.87 avg = 9.11 shufflenet min = 8.93 max = 11.96 avg = 9.34 shufflenet_v2 min = 7.37 max = 13.10 avg = 8.72 mnasnet min = 9.24 max = 
14.90 avg = 11.32 proxylessnasnet min = 10.21 max = 11.89 avg = 10.39 efficientnet_b0 min = 16.22 max = 23.71 avg = 16.59 efficientnetv2_b0 min = 17.44 max = 31.42 avg = 22.85 regnety_400m min = 18.32 max = 24.02 avg = 18.90 blazeface min = 2.22 max = 2.81 avg = 2.30 googlenet min = 31.52 max = 51.80 avg = 42.11 googlenet_int8 min = 65.47 max = 114.41 avg = 75.98 resnet18 min = 28.90 max = 64.62 avg = 37.58 resnet18_int8 min = 71.29 max = 136.67 avg = 103.03 alexnet min = 23.67 max = 34.01 avg = 29.78 vgg16 min = 142.18 max = 211.00 avg = 170.46 vgg16_int8 min = 531.36 max = 871.25 avg = 625.60 resnet50 min = 69.23 max = 108.67 avg = 73.68 resnet50_int8 min = 149.18 max = 309.88 avg = 168.68 squeezenet_ssd min = 68.83 max = 81.70 avg = 71.01 squeezenet_ssd_int8 min = 66.34 max = 118.16 avg = 74.34 mobilenet_ssd min = 29.96 max = 34.32 avg = 30.74 mobilenet_ssd_int8 min = 56.87 max = 92.24 avg = 65.57 mobilenet_yolo min = 74.26 max = 113.91 avg = 81.28 mobilenetv2_yolov3 min = 42.16 max = 63.49 avg = 45.34 yolov4-tiny min = 53.06 max = 69.84 avg = 55.81 C:\Users\i\Desktop\benchmark>benchncnn.exe 32 1 0 0 0 [0 AMD Radeon(TM) Vega 8 Graphics] queueC=1[2] queueG=0[1] queueT=2[1] [0 AMD Radeon(TM) Vega 8 Graphics] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 AMD Radeon(TM) Vega 8 Graphics] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 AMD Radeon(TM) Vega 8 Graphics] subgroup=64 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 32 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 6.78 max = 7.09 avg = 6.91 squeezenet_int8 min = 58.93 max = 62.53 avg = 60.11 mobilenet min = 8.08 max = 8.39 avg = 8.25 mobilenet_int8 min = 97.74 max = 116.77 avg = 100.17 mobilenet_v2 min = 7.95 max = 8.27 avg = 8.14 mobilenet_v3 min = 8.70 max = 9.70 avg = 9.02 shufflenet min = 6.36 max = 7.64 avg = 7.01 shufflenet_v2 min = 7.04 max = 8.12 avg = 7.50 mnasnet min = 8.07 max = 9.08 avg = 8.38 proxylessnasnet min = 8.56 max = 9.66 avg = 8.81 efficientnet_b0 min = 16.68 max = 
18.00 avg = 17.30 efficientnetv2_b0 min = 394.82 max = 404.88 avg = 401.05 regnety_400m min = 11.92 max = 12.17 avg = 12.03 blazeface min = 4.82 max = 6.50 avg = 5.42 googlenet min = 18.44 max = 19.66 avg = 19.18 googlenet_int8 min = 213.41 max = 231.79 avg = 218.31 resnet18 min = 14.27 max = 14.72 avg = 14.44 resnet18_int8 min = 228.79 max = 249.65 avg = 236.06 alexnet min = 17.31 max = 18.31 avg = 17.69 vgg16 min = 111.85 max = 123.35 avg = 112.98 vgg16_int8 min = 1789.64 max = 1838.84 avg = 1826.05 resnet50 min = 31.61 max = 32.86 avg = 32.12 resnet50_int8 min = 483.57 max = 505.72 avg = 491.76 squeezenet_ssd min = 99.66 max = 105.68 avg = 104.57 squeezenet_ssd_int8 min = 200.48 max = 208.71 avg = 203.02 mobilenet_ssd min = 33.45 max = 35.64 avg = 34.75 mobilenet_ssd_int8 min = 195.14 max = 205.35 avg = 200.18 mobilenet_yolo min = 59.20 max = 61.06 avg = 60.47 mobilenetv2_yolov3 min = 31.48 max = 33.25 avg = 32.84 yolov4-tiny min = 93.75 max = 97.45 avg = 96.00 ``` ### Qualcomm SM8150-AC Snapdragon 855+ (Kryo485 2.96 GHz + 2.42 GHz x 3 + 1.80 GHz x 4 + Adreno 640) ``` OnePlus7T:/data/local/tmp # ./benchncnn 8 4 2 -1 1 loop_count = 8 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 3.60 max = 3.70 avg = 3.64 squeezenet_int8 min = 3.67 max = 3.78 avg = 3.71 mobilenet min = 5.32 max = 5.42 avg = 5.38 mobilenet_int8 min = 4.20 max = 4.28 avg = 4.23 mobilenet_v2 min = 4.64 max = 4.73 avg = 4.68 mobilenet_v3 min = 4.13 max = 4.25 avg = 4.18 shufflenet min = 3.29 max = 3.40 avg = 3.33 shufflenet_v2 min = 2.98 max = 3.07 avg = 3.01 mnasnet min = 4.26 max = 4.37 avg = 4.31 proxylessnasnet min = 4.67 max = 4.78 avg = 4.72 efficientnet_b0 min = 7.23 max = 7.34 avg = 7.30 efficientnetv2_b0 min = 8.74 max = 8.87 avg = 8.81 regnety_400m min = 7.88 max = 7.99 avg = 7.95 blazeface min = 1.19 max = 1.30 avg = 1.22 googlenet min = 13.07 max = 13.20 avg = 13.12 googlenet_int8 min = 12.86 max = 12.98 avg = 12.93 resnet18 min = 10.33 max = 10.36 avg = 
10.35 resnet18_int8 min = 9.42 max = 9.45 avg = 9.43 alexnet min = 11.88 max = 11.95 avg = 11.91 vgg16 min = 59.34 max = 60.69 avg = 60.19 vgg16_int8 min = 68.78 max = 69.07 avg = 68.93 resnet50 min = 26.18 max = 26.28 avg = 26.24 resnet50_int8 min = 20.86 max = 20.95 avg = 20.91 squeezenet_ssd min = 12.00 max = 12.76 avg = 12.19 squeezenet_ssd_int8 min = 11.67 max = 13.13 avg = 12.03 mobilenet_ssd min = 11.88 max = 12.68 avg = 12.03 mobilenet_ssd_int8 min = 9.28 max = 9.68 avg = 9.35 mobilenet_yolo min = 27.89 max = 28.06 avg = 27.96 mobilenetv2_yolov3 min = 18.00 max = 18.13 avg = 18.06 yolov4-tiny min = 25.25 max = 25.36 avg = 25.29 nanodet_m min = 8.93 max = 9.00 avg = 8.96 yolo-fastest-1.1 min = 3.73 max = 3.83 avg = 3.77 yolo-fastestv2 min = 3.38 max = 3.47 avg = 3.41 vision_transformer min = 567.94 max = 572.31 avg = 569.66 FastestDet min = 3.28 max = 3.37 avg = 3.32 OnePlus7T:/data/local/tmp # ./benchncnn 8 1 2 -1 1 loop_count = 8 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 8.24 max = 8.34 avg = 8.31 squeezenet_int8 min = 8.23 max = 8.34 avg = 8.30 mobilenet min = 14.38 max = 14.56 avg = 14.45 mobilenet_int8 min = 11.12 max = 11.24 avg = 11.17 mobilenet_v2 min = 9.82 max = 9.88 avg = 9.84 mobilenet_v3 min = 8.15 max = 8.24 avg = 8.21 shufflenet min = 5.32 max = 5.44 avg = 5.37 shufflenet_v2 min = 5.38 max = 5.51 avg = 5.44 mnasnet min = 9.25 max = 9.36 avg = 9.31 proxylessnasnet min = 10.95 max = 11.01 avg = 10.98 efficientnet_b0 min = 17.67 max = 17.79 avg = 17.73 efficientnetv2_b0 min = 20.56 max = 20.70 avg = 20.60 regnety_400m min = 11.96 max = 12.07 avg = 12.00 blazeface min = 2.19 max = 2.87 avg = 2.47 googlenet min = 32.10 max = 32.20 avg = 32.15 googlenet_int8 min = 32.00 max = 32.15 avg = 32.07 resnet18 min = 22.02 max = 22.28 avg = 22.12 resnet18_int8 min = 26.17 max = 26.26 avg = 26.22 alexnet min = 24.83 max = 24.99 avg = 24.92 vgg16 min = 129.57 max = 129.95 avg = 129.78 vgg16_int8 min = 202.08 max = 202.34 
avg = 202.19 resnet50 min = 65.85 max = 66.01 avg = 65.93 resnet50_int8 min = 56.33 max = 56.49 avg = 56.42 squeezenet_ssd min = 22.52 max = 24.50 avg = 22.93 squeezenet_ssd_int8 min = 24.51 max = 26.83 avg = 24.98 mobilenet_ssd min = 30.55 max = 32.68 avg = 30.85 mobilenet_ssd_int8 min = 22.96 max = 23.75 avg = 23.09 mobilenet_yolo min = 68.74 max = 69.01 avg = 68.88 mobilenetv2_yolov3 min = 36.98 max = 37.16 avg = 37.06 yolov4-tiny min = 47.36 max = 47.45 avg = 47.41 nanodet_m min = 15.08 max = 15.30 avg = 15.17 yolo-fastest-1.1 min = 5.51 max = 5.61 avg = 5.55 yolo-fastestv2 min = 4.92 max = 5.02 avg = 4.97 vision_transformer min = 990.13 max = 994.45 avg = 991.95 FastestDet min = 5.06 max = 5.17 avg = 5.11 OnePlus7T:/data/local/tmp $ ./benchncnn 8 1 2 0 1 [0 Adreno (TM) 640] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 640] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=1 [0 Adreno (TM) 640] fp16-p/s/a=1/0/1 int8-p/s/a=1/0/0 [0 Adreno (TM) 640] subgroup=64 basic=1 vote=1 ballot=0 shuffle=0 loop_count = 8 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 8.59 max = 9.51 avg = 9.09 mobilenet min = 13.04 max = 13.45 avg = 13.22 mobilenet_v2 min = 10.68 max = 11.38 avg = 10.85 mobilenet_v3 min = 11.86 max = 12.37 avg = 12.08 shufflenet min = 8.21 max = 8.40 avg = 8.25 shufflenet_v2 min = 8.84 max = 9.13 avg = 8.97 mnasnet min = 11.32 max = 11.72 avg = 11.45 proxylessnasnet min = 12.27 max = 12.86 avg = 12.55 efficientnet_b0 min = 22.64 max = 22.82 avg = 22.75 efficientnetv2_b0 min = 32.32 max = 38.20 avg = 35.79 regnety_400m min = 15.35 max = 15.86 avg = 15.64 blazeface min = 2.82 max = 2.93 avg = 2.86 googlenet min = 28.22 max = 28.34 avg = 28.26 resnet18 min = 24.71 max = 24.96 avg = 24.82 alexnet min = 27.94 max = 28.10 avg = 28.01 vgg16 min = 106.08 max = 106.53 avg = 106.30 resnet50 min = 55.28 max = 56.03 avg = 55.68 squeezenet_ssd min = 29.77 max = 30.65 avg = 30.05 mobilenet_ssd min = 29.14 max = 29.39 avg = 29.25 mobilenet_yolo 
min = 49.78 max = 50.09 avg = 49.94 mobilenetv2_yolov3 min = 31.11 max = 31.97 avg = 31.60 yolov4-tiny min = 46.22 max = 46.90 avg = 46.63 nanodet_m min = 15.96 max = 16.52 avg = 16.13 yolo-fastest-1.1 min = 9.59 max = 9.66 avg = 9.61 yolo-fastestv2 min = 7.99 max = 8.23 avg = 8.13 ``` ### Qualcomm MSM6150 Snapdragon 675 (Kryo460 2.0GHz x 2 + Kryo460 1.7GHz x 6 + Adreno 612) ``` violet:/data/local/tmp/ncnn $ ./benchncnn 8 2 0 loop_count = 8 num_threads = 2 powersave = 0 gpu_device = -1 squeezenet min = 23.29 max = 24.65 avg = 23.95 squeezenet_int8 min = 23.24 max = 61.55 avg = 31.20 mobilenet min = 31.60 max = 32.10 avg = 31.80 mobilenet_int8 min = 30.35 max = 32.03 avg = 30.95 mobilenet_v2 min = 25.92 max = 26.45 avg = 26.08 shufflenet min = 11.91 max = 12.11 avg = 12.00 mnasnet min = 21.38 max = 21.71 avg = 21.51 proxylessnasnet min = 25.53 max = 25.78 avg = 25.62 googlenet min = 93.62 max = 100.67 avg = 94.86 googlenet_int8 min = 90.74 max = 91.06 avg = 90.87 resnet18 min = 85.84 max = 87.37 avg = 86.50 resnet18_int8 min = 77.88 max = 78.11 avg = 78.00 alexnet min = 196.33 max = 201.73 avg = 200.19 vgg16 min = 560.71 max = 571.75 avg = 564.84 vgg16_int8 min = 651.51 max = 652.68 avg = 652.12 resnet50 min = 178.25 max = 179.86 avg = 178.77 resnet50_int8 min = 181.07 max = 183.26 avg = 181.64 squeezenet_ssd min = 64.86 max = 68.39 avg = 66.05 squeezenet_ssd_int8 min = 69.61 max = 70.37 avg = 69.93 mobilenet_ssd min = 65.92 max = 67.03 avg = 66.41 mobilenet_ssd_int8 min = 61.54 max = 63.38 avg = 62.27 mobilenet_yolo min = 143.42 max = 146.69 avg = 144.33 mobilenet_yolov3 min = 150.45 max = 152.30 avg = 151.36 violet:/data/local/tmp/ncnn $ ./benchncnn 8 1 0 loop_count = 8 num_threads = 1 powersave = 0 gpu_device = -1 squeezenet min = 36.04 max = 37.25 avg = 36.48 squeezenet_int8 min = 37.82 max = 79.20 avg = 43.13 mobilenet min = 54.29 max = 54.73 avg = 54.41 mobilenet_int8 min = 58.90 max = 60.11 avg = 59.39 mobilenet_v2 min = 38.64 max = 40.22 avg = 38.97 
shufflenet min = 18.05 max = 18.39 avg = 18.19 mnasnet min = 34.65 max = 34.98 avg = 34.79 proxylessnasnet min = 42.61 max = 43.12 avg = 42.80 googlenet min = 164.74 max = 165.89 avg = 165.34 googlenet_int8 min = 159.93 max = 160.38 avg = 160.12 resnet18 min = 135.76 max = 137.93 avg = 136.98 resnet18_int8 min = 140.22 max = 144.06 avg = 141.92 alexnet min = 391.01 max = 396.85 avg = 392.74 vgg16 min = 1019.35 max = 1022.75 avg = 1021.26 vgg16_int8 min = 1122.25 max = 1137.99 avg = 1124.78 resnet50 min = 302.16 max = 304.22 avg = 303.05 resnet50_int8 min = 318.35 max = 319.50 avg = 318.84 squeezenet_ssd min = 91.26 max = 94.86 avg = 92.39 squeezenet_ssd_int8 min = 105.06 max = 106.17 avg = 105.56 mobilenet_ssd min = 105.01 max = 105.95 avg = 105.40 mobilenet_ssd_int8 min = 119.93 max = 120.50 avg = 120.19 mobilenet_yolo min = 229.87 max = 230.76 avg = 230.21 mobilenet_yolov3 min = 242.10 max = 242.91 avg = 242.47 ``` ### Kirin 970 (Cortex-A73 2.4GHz x 4 + Cortex-A53 1.8GHz x 4) ``` HWEML:/data/local/tmp/ncnnbench $ ./benchncnn 8 4 2 -1 1 [0 Mali-G72] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-G72] buglssc=0 bugsbn1=0 buglbia=0 bugihfa=1 [0 Mali-G72] fp16p=1 fp16s=0 fp16a=1 int8s=0 int8a=0 loop_count = 8 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 24.38 max = 28.03 avg = 25.83 squeezenet_int8 min = 21.79 max = 24.80 avg = 22.60 mobilenet min = 34.09 max = 36.88 avg = 35.93 mobilenet_int8 min = 52.62 max = 61.70 avg = 55.38 mobilenet_v2 min = 23.71 max = 25.70 avg = 24.49 mobilenet_v3 min = 20.66 max = 25.68 avg = 23.07 shufflenet min = 17.89 max = 19.91 avg = 18.53 shufflenet_v2 min = 13.73 max = 16.54 avg = 15.37 mnasnet min = 24.36 max = 27.14 avg = 25.58 proxylessnasnet min = 27.19 max = 29.70 avg = 28.59 efficientnet_b0 min = 49.31 max = 50.26 avg = 49.70 regnety_400m min = 42.54 max = 51.22 avg = 46.71 blazeface min = 5.49 max = 7.67 avg = 6.27 googlenet min = 72.67 max = 81.22 avg = 75.92 googlenet_int8 min = 67.60 max = 
74.50 avg = 71.21 resnet18 min = 69.32 max = 81.59 avg = 73.45 resnet18_int8 min = 60.92 max = 68.11 avg = 64.18 alexnet min = 60.90 max = 79.28 avg = 66.72 vgg16 min = 337.01 max = 378.89 avg = 352.37 vgg16_int8 min = 465.88 max = 505.19 avg = 489.76 resnet50 min = 207.75 max = 220.74 avg = 214.42 resnet50_int8 min = 165.67 max = 183.80 avg = 171.27 squeezenet_ssd min = 72.77 max = 84.45 avg = 79.09 squeezenet_ssd_int8 min = 75.37 max = 86.58 avg = 78.70 mobilenet_ssd min = 88.88 max = 96.43 avg = 92.02 mobilenet_ssd_int8 min = 89.04 max = 101.35 avg = 92.23 mobilenet_yolo min = 189.73 max = 206.55 avg = 193.64 mobilenetv2_yolov3 min = 99.08 max = 111.64 avg = 104.23 HWEML:/data/local/tmp/ncnnbench $ ./benchncnn 8 1 2 -1 1 [0 Mali-G72] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-G72] buglssc=0 bugsbn1=0 buglbia=0 bugihfa=1 [0 Mali-G72] fp16p=1 fp16s=0 fp16a=1 int8s=0 int8a=0 loop_count = 8 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 73.47 max = 81.39 avg = 76.06 squeezenet_int8 min = 62.63 max = 73.66 avg = 66.52 mobilenet min = 103.85 max = 112.83 avg = 108.98 mobilenet_int8 min = 152.27 max = 161.26 avg = 157.17 mobilenet_v2 min = 70.53 max = 87.26 avg = 76.67 mobilenet_v3 min = 59.87 max = 68.59 avg = 63.08 shufflenet min = 36.69 max = 41.45 avg = 39.24 shufflenet_v2 min = 33.97 max = 37.84 avg = 35.03 mnasnet min = 69.24 max = 79.73 avg = 74.20 proxylessnasnet min = 78.63 max = 88.57 avg = 81.83 efficientnet_b0 min = 147.45 max = 159.07 avg = 152.09 regnety_400m min = 90.83 max = 98.51 avg = 93.82 blazeface min = 10.05 max = 11.59 avg = 10.78 googlenet min = 240.26 max = 277.71 avg = 259.61 googlenet_int8 min = 214.64 max = 233.56 avg = 225.01 resnet18 min = 245.62 max = 268.49 avg = 260.37 resnet18_int8 min = 184.85 max = 194.91 avg = 190.60 alexnet min = 202.52 max = 241.12 avg = 211.51 vgg16 min = 1632.98 max = 1769.05 avg = 1710.89 vgg16_int8 min = 1237.01 max = 1316.40 avg = 1273.44 resnet50 min = 558.41 max = 601.59 
avg = 581.26 resnet50_int8 min = 425.26 max = 445.19 avg = 436.22 squeezenet_ssd min = 228.50 max = 255.89 avg = 244.63 squeezenet_ssd_int8 min = 166.97 max = 193.77 avg = 180.22 mobilenet_ssd min = 226.54 max = 246.62 avg = 235.75 mobilenet_ssd_int8 min = 231.35 max = 249.63 avg = 241.29 mobilenet_yolo min = 469.71 max = 508.79 avg = 497.50 mobilenetv2_yolov3 min = 242.88 max = 265.30 avg = 254.68 HWEML:/data/local/tmp/ncnnbench $ ./benchncnn 4 1 2 0 1 [0 Mali-G72] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-G72] buglssc=0 bugsbn1=0 buglbia=0 bugihfa=1 [0 Mali-G72] fp16p=1 fp16s=0 fp16a=1 int8s=0 int8a=0 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 24.54 max = 25.75 avg = 25.16 mobilenet min = 22.03 max = 29.61 avg = 27.31 mobilenet_v2 min = 20.15 max = 28.05 avg = 25.35 mobilenet_v3 min = 34.26 max = 37.49 avg = 35.51 shufflenet min = 26.29 max = 27.68 avg = 26.86 shufflenet_v2 min = 29.60 max = 32.08 avg = 31.27 mnasnet min = 25.85 max = 29.38 avg = 27.98 proxylessnasnet min = 23.64 max = 30.09 avg = 26.36 efficientnet_b0 min = 52.55 max = 58.51 avg = 55.56 regnety_400m min = 37.81 max = 43.22 avg = 40.30 blazeface min = 9.14 max = 10.93 avg = 10.08 googlenet min = 60.19 max = 62.84 avg = 61.51 resnet18 min = 50.42 max = 52.93 avg = 51.70 alexnet min = 195.34 max = 196.98 avg = 196.14 vgg16 min = 725.88 max = 751.20 avg = 739.99 resnet50 min = 124.47 max = 125.93 avg = 125.02 squeezenet_ssd min = 91.79 max = 97.04 avg = 93.56 mobilenet_ssd min = 51.81 max = 59.31 avg = 54.09 mobilenet_yolo min = 124.67 max = 127.62 avg = 126.53 mobilenetv2_yolov3 min = 53.11 max = 54.81 avg = 54.11 ``` ### Qualcomm MSM8998 Snapdragon 835 (Kryo 2.45GHz x 4 + Kryo 1.9GHz x 4 + Adreno 540) ``` taimen:/data/local/tmp/ncnnbench $ ./benchncnn 8 4 2 -1 0 [0 Adreno (TM) 540] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 540] buglssc=0 bugsbn1=1 buglbia=0 bugihfa=0 [0 Adreno (TM) 540] fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count 
= 8 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 28.46 max = 30.89 avg = 29.77 squeezenet_int8 min = 30.32 max = 32.92 avg = 31.68 mobilenet min = 36.65 max = 38.37 avg = 37.32 mobilenet_int8 min = 62.91 max = 66.71 avg = 64.49 mobilenet_v2 min = 27.85 max = 31.21 avg = 29.41 mobilenet_v3 min = 23.83 max = 26.40 avg = 24.79 shufflenet min = 15.65 max = 16.88 avg = 16.27 shufflenet_v2 min = 13.70 max = 14.49 avg = 14.08 mnasnet min = 25.04 max = 28.35 avg = 26.45 proxylessnasnet min = 27.49 max = 29.58 avg = 28.62 efficientnet_b0 min = 48.43 max = 49.41 avg = 48.85 regnety_400m min = 42.48 max = 43.78 avg = 43.18 blazeface min = 4.39 max = 4.68 avg = 4.51 googlenet min = 75.98 max = 78.40 avg = 77.37 googlenet_int8 min = 79.26 max = 83.20 avg = 80.55 resnet18 min = 73.60 max = 76.97 avg = 75.63 resnet18_int8 min = 62.93 max = 65.94 avg = 64.50 alexnet min = 64.18 max = 67.02 avg = 65.49 vgg16 min = 389.39 max = 399.13 avg = 394.09 vgg16_int8 min = 509.06 max = 524.41 avg = 514.76 resnet50 min = 188.21 max = 194.58 avg = 191.98 resnet50_int8 min = 182.84 max = 187.22 avg = 184.23 squeezenet_ssd min = 77.69 max = 81.17 avg = 79.24 squeezenet_ssd_int8 min = 81.71 max = 84.12 avg = 82.90 mobilenet_ssd min = 78.35 max = 81.50 avg = 79.82 mobilenet_ssd_int8 min = 96.84 max = 100.97 avg = 98.42 mobilenet_yolo min = 167.32 max = 170.71 avg = 168.87 mobilenetv2_yolov3 min = 97.00 max = 102.11 avg = 99.01 taimen:/data/local/tmp/ncnnbench $ ./benchncnn 8 1 2 -1 1 [0 Adreno (TM) 540] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 540] buglssc=0 bugsbn1=1 buglbia=0 bugihfa=0 [0 Adreno (TM) 540] fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count = 8 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 67.25 max = 71.39 avg = 69.35 squeezenet_int8 min = 62.12 max = 66.35 avg = 63.73 mobilenet min = 103.30 max = 110.39 avg = 107.13 mobilenet_int8 min = 155.24 max = 161.42 avg = 157.82 mobilenet_v2 min = 71.89 max = 
74.73 avg = 73.48 mobilenet_v3 min = 58.35 max = 63.43 avg = 60.68 shufflenet min = 35.96 max = 39.43 avg = 36.94 shufflenet_v2 min = 35.53 max = 39.86 avg = 37.10 mnasnet min = 66.71 max = 74.00 avg = 68.65 proxylessnasnet min = 76.50 max = 82.20 avg = 78.57 efficientnet_b0 min = 142.32 max = 152.17 avg = 146.14 regnety_400m min = 89.60 max = 98.27 avg = 92.62 blazeface min = 10.45 max = 12.81 avg = 11.07 googlenet min = 222.75 max = 233.61 avg = 228.38 googlenet_int8 min = 206.70 max = 212.20 avg = 209.24 resnet18 min = 210.86 max = 220.25 avg = 213.65 resnet18_int8 min = 176.04 max = 183.58 avg = 178.71 alexnet min = 185.97 max = 195.91 avg = 191.40 vgg16 min = 1176.82 max = 1200.64 avg = 1187.88 vgg16_int8 min = 1086.52 max = 1105.00 avg = 1095.53 resnet50 min = 517.48 max = 533.99 avg = 526.04 resnet50_int8 min = 417.30 max = 435.81 avg = 422.36 squeezenet_ssd min = 164.88 max = 171.21 avg = 167.51 squeezenet_ssd_int8 min = 164.78 max = 171.77 avg = 168.36 mobilenet_ssd min = 221.41 max = 229.13 avg = 226.18 mobilenet_ssd_int8 min = 234.15 max = 245.91 avg = 239.01 mobilenet_yolo min = 471.34 max = 484.99 avg = 477.15 mobilenetv2_yolov3 min = 249.14 max = 257.61 avg = 252.54 taimen:/data/local/tmp/ncnnbench $ ./benchncnn 8 1 2 0 1 [0 Adreno (TM) 540] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 540] buglssc=0 bugsbn1=1 buglbia=0 bugihfa=0 [0 Adreno (TM) 540] fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count = 8 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 18.74 max = 19.89 avg = 19.22 mobilenet min = 21.19 max = 25.61 avg = 22.94 mobilenet_v2 min = 24.15 max = 34.68 avg = 30.12 mobilenet_v3 min = 25.94 max = 33.15 avg = 30.09 shufflenet min = 25.05 max = 31.41 avg = 27.85 shufflenet_v2 min = 28.82 max = 32.04 avg = 30.95 mnasnet min = 21.34 max = 27.69 avg = 24.17 proxylessnasnet min = 25.51 max = 30.03 avg = 28.01 efficientnet_b0 min = 42.94 max = 47.44 avg = 45.28 regnety_400m min = 36.36 max = 55.73 avg = 41.82 
blazeface min = 11.14 max = 13.11 avg = 12.20 googlenet min = 49.72 max = 56.92 avg = 51.79 resnet18 min = 44.63 max = 47.37 avg = 45.86 alexnet min = 42.83 max = 46.34 avg = 44.63 vgg16 min = 568.82 max = 586.75 avg = 578.60 resnet50 min = 108.63 max = 115.76 avg = 110.38 squeezenet_ssd min = 85.22 max = 104.73 avg = 93.14 mobilenet_ssd min = 49.91 max = 56.86 avg = 52.33 mobilenet_yolo min = 98.76 max = 109.37 avg = 102.27 mobilenetv2_yolov3 min = 57.49 max = 61.15 avg = 58.74 ``` ### Qualcomm SDM765G Snapdragon 765G (Kryo 1.8GHz x 6 + Kryo 2.2GHz x 2 + Adreno 620) ``` 130|bramble:/data/local/tmp $ ./benchncnn 8 4 2 -1 1 loop_count = 8 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 9.84 max = 11.72 avg = 10.36 squeezenet_int8 min = 10.80 max = 11.13 avg = 10.96 mobilenet min = 14.04 max = 14.37 avg = 14.20 mobilenet_int8 min = 13.39 max = 13.75 avg = 13.59 mobilenet_v2 min = 13.04 max = 13.51 avg = 13.27 mobilenet_v3 min = 11.00 max = 13.21 avg = 12.54 shufflenet min = 11.08 max = 11.22 avg = 11.16 shufflenet_v2 min = 8.45 max = 8.50 avg = 8.47 mnasnet min = 14.15 max = 14.69 avg = 14.38 proxylessnasnet min = 14.49 max = 15.07 avg = 14.83 efficientnet_b0 min = 28.99 max = 29.53 avg = 29.24 efficientnetv2_b0 min = 38.92 max = 39.34 avg = 39.14 regnety_400m min = 33.46 max = 33.81 avg = 33.62 blazeface min = 4.22 max = 4.30 avg = 4.27 googlenet min = 35.24 max = 36.94 avg = 35.57 googlenet_int8 min = 45.26 max = 46.46 avg = 45.78 resnet18 min = 33.14 max = 33.75 avg = 33.31 resnet18_int8 min = 43.26 max = 43.50 avg = 43.35 alexnet min = 25.40 max = 26.19 avg = 25.74 vgg16 min = 121.39 max = 122.35 avg = 121.78 vgg16_int8 min = 243.47 max = 249.94 avg = 245.56 resnet50 min = 67.05 max = 70.16 avg = 68.20 resnet50_int8 min = 76.95 max = 80.23 avg = 78.18 squeezenet_ssd min = 32.02 max = 33.27 avg = 32.51 squeezenet_ssd_int8 min = 36.31 max = 38.35 avg = 37.09 mobilenet_ssd min = 32.02 max = 34.55 avg = 32.99 mobilenet_ssd_int8 min = 
32.31 max = 33.92 avg = 32.77 mobilenet_yolo min = 99.12 max = 109.81 avg = 103.00 mobilenetv2_yolov3 min = 59.74 max = 60.95 avg = 60.21 yolov4-tiny min = 57.83 max = 72.15 avg = 68.75 nanodet_m min = 22.76 max = 22.97 avg = 22.85 yolo-fastest-1.1 min = 13.58 max = 13.93 avg = 13.80 yolo-fastestv2 min = 12.06 max = 12.27 avg = 12.15 vision_transformer min = 1274.67 max = 1597.52 avg = 1363.14 FastestDet min = 9.75 max = 9.86 avg = 9.81 130|bramble:/data/local/tmp $ ./benchncnn 8 4 2 0 1 [0 Adreno (TM) 620] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 620] bugsbn1=1 bugbilz=0 bugcopc=0 bugihfa=0 [0 Adreno (TM) 620] fp16-p/s/u/a=1/1/0/1 int8-p/s/u/a=1/0/0/1 [0 Adreno (TM) 620] subgroup=64 basic/vote/ballot/shuffle=1/1/1/1 [0 Adreno (TM) 620] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 loop_count = 8 num_threads = 4 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 25.06 max = 25.80 avg = 25.53 squeezenet_int8 min = 9.75 max = 9.82 avg = 9.78 mobilenet min = 43.43 max = 44.04 avg = 43.71 mobilenet_int8 min = 11.12 max = 11.59 avg = 11.34 mobilenet_v2 min = 32.14 max = 32.58 avg = 32.40 mobilenet_v3 min = 32.75 max = 32.98 avg = 32.87 shufflenet min = 29.29 max = 29.63 avg = 29.40 shufflenet_v2 min = 32.43 max = 33.18 avg = 32.69 mnasnet min = 34.58 max = 35.24 avg = 35.00 proxylessnasnet min = 40.61 max = 41.40 avg = 40.98 efficientnet_b0 min = 49.44 max = 50.46 avg = 49.95 efficientnetv2_b0 min = 185.31 max = 187.37 avg = 186.24 regnety_400m min = 41.43 max = 42.75 avg = 41.84 blazeface min = 13.47 max = 14.07 avg = 13.72 googlenet min = 78.12 max = 79.06 avg = 78.56 googlenet_int8 min = 48.73 max = 50.13 avg = 49.20 resnet18 min = 73.61 max = 74.05 avg = 73.75 resnet18_int8 min = 21.87 max = 22.05 avg = 21.95 alexnet min = 128.58 max = 129.51 avg = 128.97 vgg16 min = 437.64 max = 439.12 avg = 438.28 vgg16_int8 min = 232.77 max = 243.06 avg = 239.54 resnet50 min = 187.36 max = 188.47 avg = 188.01 resnet50_int8 min = 75.79 max = 77.33 avg = 76.64 
squeezenet_ssd min = 80.68 max = 84.50 avg = 81.93 squeezenet_ssd_int8 min = 29.88 max = 30.77 avg = 30.30 mobilenet_ssd min = 94.77 max = 96.46 avg = 95.79 mobilenet_ssd_int8 min = 29.03 max = 30.07 avg = 29.53 mobilenet_yolo min = 185.97 max = 188.11 avg = 186.59 mobilenetv2_yolov3 min = 108.43 max = 164.75 avg = 121.55 yolov4-tiny min = 149.38 max = 158.39 avg = 153.92 nanodet_m min = 46.73 max = 48.85 avg = 47.73 yolo-fastest-1.1 min = 26.32 max = 26.77 avg = 26.54 yolo-fastestv2 min = 38.87 max = 39.31 avg = 39.13 vision_transformer min = 3392.80 max = 3397.79 avg = 3396.09 FastestDet min = 43.05 max = 43.81 avg = 43.45 ``` ### Qualcomm SDM660 Snapdragon 660 (Kryo 260 2.2GHz x 4 + Kryo 260 1.84GHz x 4 + Adreno 512) ``` lavender:/data/local/tmp/ncnnbench $ ./benchncnn 8 8 0 -1 1 [0 Adreno (TM) 512] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 512] buglssc=0 bugsbn1=1 buglbia=0 bugihfa=0 [0 Adreno (TM) 512] fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count = 8 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 29.05 max = 44.86 avg = 33.26 squeezenet_int8 min = 35.47 max = 37.10 avg = 36.09 mobilenet min = 31.59 max = 33.47 avg = 32.33 mobilenet_int8 min = 77.50 max = 91.15 avg = 82.98 mobilenet_v2 min = 33.63 max = 35.43 avg = 34.54 mobilenet_v3 min = 29.97 max = 49.80 avg = 34.81 shufflenet min = 28.52 max = 30.09 avg = 29.09 shufflenet_v2 min = 19.15 max = 21.15 avg = 19.99 mnasnet min = 29.91 max = 35.11 avg = 31.46 proxylessnasnet min = 33.28 max = 117.09 avg = 55.22 efficientnet_b0 min = 52.29 max = 57.93 avg = 55.04 regnety_400m min = 96.05 max = 116.42 avg = 102.07 blazeface min = 7.98 max = 11.83 avg = 8.89 googlenet min = 76.88 max = 103.99 avg = 84.54 googlenet_int8 min = 97.68 max = 118.56 avg = 104.92 resnet18 min = 75.93 max = 89.31 avg = 80.00 resnet18_int8 min = 73.27 max = 80.84 avg = 76.19 alexnet min = 90.94 max = 114.57 avg = 96.42 vgg16 min = 381.30 max = 615.62 avg = 555.96 vgg16_int8 min = 803.75 max = 
1126.53 avg = 886.03 resnet50 min = 257.38 max = 285.19 avg = 266.59 resnet50_int8 min = 304.81 max = 338.01 avg = 314.84 squeezenet_ssd min = 117.59 max = 145.79 avg = 123.79 squeezenet_ssd_int8 min = 132.80 max = 163.00 avg = 149.99 mobilenet_ssd min = 103.98 max = 126.90 avg = 113.10 mobilenet_ssd_int8 min = 167.86 max = 188.46 avg = 180.56 mobilenet_yolo min = 201.75 max = 263.92 avg = 240.17 mobilenetv2_yolov3 min = 143.76 max = 167.77 avg = 151.94 lavender:/data/local/tmp/ncnnbench $ ./benchncnn 4 1 2 -1 1 [0 Adreno (TM) 512] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 512] buglssc=0 bugsbn1=1 buglbia=0 bugihfa=0 [0 Adreno (TM) 512] fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 69.75 max = 71.33 avg = 70.38 squeezenet_int8 min = 67.12 max = 68.07 avg = 67.59 mobilenet min = 107.65 max = 110.48 avg = 108.82 mobilenet_int8 min = 163.13 max = 164.74 avg = 164.24 mobilenet_v2 min = 75.50 max = 77.36 avg = 76.38 mobilenet_v3 min = 59.05 max = 59.36 avg = 59.23 shufflenet min = 38.33 max = 38.74 avg = 38.57 shufflenet_v2 min = 37.43 max = 38.97 avg = 38.32 mnasnet min = 69.29 max = 73.20 avg = 70.73 proxylessnasnet min = 80.81 max = 82.66 avg = 81.52 efficientnet_b0 min = 151.20 max = 152.38 avg = 151.72 regnety_400m min = 93.53 max = 94.53 avg = 94.19 blazeface min = 12.15 max = 12.82 avg = 12.46 googlenet min = 239.63 max = 242.64 avg = 241.06 googlenet_int8 min = 214.71 max = 216.53 avg = 215.79 resnet18 min = 234.20 max = 238.74 avg = 236.90 resnet18_int8 min = 181.57 max = 183.97 avg = 182.66 alexnet min = 205.94 max = 207.44 avg = 206.63 vgg16 min = 1188.14 max = 1201.95 avg = 1196.93 vgg16_int8 min = 1081.21 max = 1087.84 avg = 1085.17 resnet50 min = 556.54 max = 566.68 avg = 561.21 resnet50_int8 min = 433.19 max = 433.93 avg = 433.48 squeezenet_ssd min = 169.02 max = 170.54 avg = 169.73 squeezenet_ssd_int8 min = 176.28 max = 177.90 avg = 176.87 mobilenet_ssd min 
= 228.15 max = 232.69 avg = 230.38 mobilenet_ssd_int8 min = 236.97 max = 239.69 avg = 238.35 mobilenet_yolo min = 493.33 max = 506.34 avg = 499.79 mobilenetv2_yolov3 min = 252.53 max = 261.58 avg = 256.30 lavender:/data/local/tmp/ncnnbench $ ./benchncnn 4 1 2 0 1 [0 Adreno (TM) 512] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 512] buglssc=0 bugsbn1=1 buglbia=0 bugihfa=0 [0 Adreno (TM) 512] fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 34.49 max = 34.65 avg = 34.55 mobilenet min = 54.45 max = 55.52 avg = 54.75 mobilenet_v2 min = 39.32 max = 39.58 avg = 39.50 mobilenet_v3 min = 36.13 max = 36.28 avg = 36.19 shufflenet min = 35.25 max = 35.42 avg = 35.31 shufflenet_v2 min = 31.38 max = 31.70 avg = 31.53 mnasnet min = 40.95 max = 41.32 avg = 41.13 proxylessnasnet min = 43.81 max = 44.05 avg = 43.90 efficientnet_b0 min = 68.34 max = 68.56 avg = 68.47 regnety_400m min = 53.89 max = 54.23 avg = 54.02 blazeface min = 19.82 max = 27.74 avg = 22.01 googlenet min = 119.46 max = 119.98 avg = 119.80 resnet18 min = 115.56 max = 120.28 avg = 116.88 alexnet min = 102.06 max = 105.56 avg = 102.97 vgg16 min = 1192.29 max = 1202.17 avg = 1197.03 resnet50 min = 294.87 max = 298.79 avg = 296.05 squeezenet_ssd min = 167.85 max = 168.42 avg = 168.09 mobilenet_ssd min = 120.30 max = 120.37 avg = 120.34 mobilenet_yolo min = 256.60 max = 260.21 avg = 257.54 mobilenetv2_yolov3 min = 121.48 max = 125.22 avg = 122.53 ``` ### Qualcomm MSM8996 Pro Snapdragon 821 (Kryo 2.35GHz x 2 + Kryo 2.19GHz x 2) ``` natrium:/data/local/tmp # ./benchncnn 8 4 0 -1 1 loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 18.46 max = 19.12 avg = 18.78 squeezenet_int8 min = 16.69 max = 17.22 avg = 16.95 mobilenet min = 27.33 max = 28.74 avg = 27.88 mobilenet_int8 min = 20.14 max = 20.71 avg = 20.46 mobilenet_v2 min = 21.94 max = 23.09 avg = 22.38 mobilenet_v3 min = 18.81 max = 
19.45 avg = 19.04 shufflenet min = 14.07 max = 14.75 avg = 14.29 shufflenet_v2 min = 11.52 max = 11.92 avg = 11.71 mnasnet min = 20.41 max = 21.75 avg = 20.74 proxylessnasnet min = 22.99 max = 23.63 avg = 23.13 efficientnet_b0 min = 34.74 max = 35.26 avg = 34.91 efficientnetv2_b0 min = 41.16 max = 41.60 avg = 41.39 regnety_400m min = 44.27 max = 45.01 avg = 44.69 blazeface min = 4.25 max = 4.71 avg = 4.43 googlenet min = 54.88 max = 55.55 avg = 55.12 googlenet_int8 min = 51.88 max = 52.72 avg = 52.25 resnet18 min = 44.33 max = 45.44 avg = 44.88 resnet18_int8 min = 51.24 max = 51.94 avg = 51.54 alexnet min = 38.62 max = 39.31 avg = 38.88 vgg16 min = 242.53 max = 244.23 avg = 243.16 vgg16_int8 min = 183.15 max = 204.96 avg = 192.16 resnet50 min = 122.14 max = 124.29 avg = 122.94 resnet50_int8 min = 116.61 max = 118.47 avg = 117.56 squeezenet_ssd min = 47.92 max = 49.01 avg = 48.45 squeezenet_ssd_int8 min = 43.21 max = 44.45 avg = 43.76 mobilenet_ssd min = 56.92 max = 58.21 avg = 57.56 mobilenet_ssd_int8 min = 42.26 max = 42.92 avg = 42.48 mobilenet_yolo min = 126.20 max = 128.50 avg = 127.10 mobilenetv2_yolov3 min = 75.49 max = 76.50 avg = 76.01 yolov4-tiny min = 94.24 max = 95.75 avg = 94.83 nanodet_m min = 31.30 max = 31.93 avg = 31.62 yolo-fastest-1.1 min = 16.89 max = 17.56 avg = 17.23 yolo-fastestv2 min = 12.97 max = 13.50 avg = 13.15 natrium:/data/local/tmp # ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 46.27 max = 46.60 avg = 46.45 squeezenet_int8 min = 41.33 max = 41.73 avg = 41.56 mobilenet min = 80.89 max = 81.16 avg = 81.00 mobilenet_int8 min = 60.33 max = 62.29 avg = 61.33 mobilenet_v2 min = 51.78 max = 52.02 avg = 51.88 mobilenet_v3 min = 43.71 max = 44.17 avg = 43.91 shufflenet min = 24.96 max = 25.08 avg = 25.02 shufflenet_v2 min = 24.09 max = 24.26 avg = 24.17 mnasnet min = 51.28 max = 51.42 avg = 51.35 proxylessnasnet min = 59.25 max = 59.66 avg = 59.48 efficientnet_b0 min = 
92.16 max = 92.34 avg = 92.22 efficientnetv2_b0 min = 112.27 max = 113.63 avg = 113.17 regnety_400m min = 68.59 max = 68.85 avg = 68.75 blazeface min = 7.36 max = 7.83 avg = 7.59 googlenet min = 151.15 max = 151.53 avg = 151.37 googlenet_int8 min = 152.01 max = 158.63 avg = 154.18 resnet18 min = 121.49 max = 121.90 avg = 121.77 resnet18_int8 min = 154.54 max = 166.73 avg = 161.30 alexnet min = 97.41 max = 97.74 avg = 97.62 vgg16 min = 674.80 max = 675.86 avg = 675.38 vgg16_int8 min = 593.42 max = 602.98 avg = 596.93 resnet50 min = 360.44 max = 364.31 avg = 362.01 resnet50_int8 min = 371.21 max = 386.24 avg = 381.53 squeezenet_ssd min = 97.72 max = 98.32 avg = 98.01 squeezenet_ssd_int8 min = 98.33 max = 99.15 avg = 98.63 mobilenet_ssd min = 161.72 max = 161.89 avg = 161.79 mobilenet_ssd_int8 min = 122.44 max = 123.38 avg = 123.00 mobilenet_yolo min = 367.34 max = 369.59 avg = 368.97 mobilenetv2_yolov3 min = 190.09 max = 190.77 avg = 190.31 yolov4-tiny min = 241.59 max = 242.29 avg = 241.81 nanodet_m min = 63.03 max = 63.22 avg = 63.12 yolo-fastest-1.1 min = 29.06 max = 29.22 avg = 29.12 yolo-fastestv2 min = 22.72 max = 22.80 avg = 22.77 ``` ### Qualcomm MSM8994 Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4) ``` angler:/data/local/tmp $ ./benchncnn 8 8 0 -1 1 loop_count = 8 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 25.83 max = 29.17 avg = 27.69 squeezenet_int8 min = 24.18 max = 26.31 avg = 25.18 mobilenet min = 33.94 max = 35.29 avg = 34.44 mobilenet_int8 min = 24.99 max = 26.12 avg = 25.46 mobilenet_v2 min = 32.63 max = 34.44 avg = 33.56 mobilenet_v3 min = 27.72 max = 30.14 avg = 29.35 shufflenet min = 23.23 max = 26.78 avg = 24.58 shufflenet_v2 min = 21.04 max = 22.25 avg = 21.68 mnasnet min = 29.51 max = 31.26 avg = 30.27 proxylessnasnet min = 34.21 max = 37.55 avg = 35.20 efficientnet_b0 min = 54.75 max = 60.45 avg = 56.38 efficientnetv2_b0 min = 63.60 max = 67.51 avg = 64.81 regnety_400m min = 60.80 max = 
72.33 avg = 68.27 blazeface min = 5.96 max = 7.22 avg = 6.41 googlenet min = 80.62 max = 94.46 avg = 86.50 googlenet_int8 min = 69.05 max = 75.75 avg = 71.47 resnet18 min = 63.90 max = 75.96 avg = 69.64 resnet18_int8 min = 46.43 max = 62.23 avg = 53.22 alexnet min = 82.67 max = 90.25 avg = 87.03 vgg16 min = 562.23 max = 636.26 avg = 594.82 vgg16_int8 min = 303.42 max = 358.03 avg = 325.60 resnet50 min = 233.47 max = 279.99 avg = 248.49 resnet50_int8 min = 170.11 max = 198.27 avg = 183.35 squeezenet_ssd min = 86.97 max = 112.21 avg = 96.84 squeezenet_ssd_int8 min = 66.09 max = 77.00 avg = 70.57 mobilenet_ssd min = 76.95 max = 101.74 avg = 87.73 mobilenet_ssd_int8 min = 53.27 max = 60.50 avg = 57.46 mobilenet_yolo min = 206.42 max = 260.06 avg = 227.84 mobilenetv2_yolov3 min = 129.32 max = 147.76 avg = 138.90 yolov4-tiny min = 184.85 max = 213.03 avg = 203.52 nanodet_m min = 47.66 max = 60.55 avg = 53.00 angler:/data/local/tmp # ./benchncnn 4 4 2 -1 1 loop_count = 4 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 41.39 max = 47.64 avg = 43.08 squeezenet_int8 min = 36.92 max = 37.59 avg = 37.24 mobilenet min = 59.04 max = 59.43 avg = 59.22 mobilenet_int8 min = 44.67 max = 46.60 avg = 45.58 mobilenet_v2 min = 43.38 max = 43.71 avg = 43.62 mobilenet_v3 min = 37.57 max = 37.82 avg = 37.65 shufflenet min = 30.67 max = 30.86 avg = 30.76 shufflenet_v2 min = 27.80 max = 28.12 avg = 27.97 mnasnet min = 42.99 max = 46.41 avg = 44.21 proxylessnasnet min = 51.26 max = 53.52 avg = 52.04 efficientnet_b0 min = 81.58 max = 82.30 avg = 82.03 efficientnetv2_b0 min = 94.01 max = 94.48 avg = 94.27 regnety_400m min = 82.38 max = 83.86 avg = 82.95 blazeface min = 10.02 max = 10.42 avg = 10.18 googlenet min = 125.47 max = 126.72 avg = 125.92 googlenet_int8 min = 109.92 max = 111.65 avg = 110.44 resnet18 min = 110.14 max = 111.95 avg = 110.76 resnet18_int8 min = 78.21 max = 79.65 avg = 79.07 alexnet min = 78.09 max = 80.34 avg = 78.87 vgg16 min = 486.69 max = 
494.97 avg = 490.35 vgg16_int8 min = 370.66 max = 377.64 avg = 373.78 resnet50 min = 272.31 max = 278.64 avg = 274.10 resnet50_int8 min = 215.57 max = 218.55 avg = 217.27 squeezenet_ssd min = 112.98 max = 114.75 avg = 113.60 squeezenet_ssd_int8 min = 91.85 max = 94.82 avg = 93.13 mobilenet_ssd min = 115.18 max = 116.56 avg = 115.95 mobilenet_ssd_int8 min = 90.95 max = 92.21 avg = 91.39 mobilenet_yolo min = 255.07 max = 259.01 avg = 256.18 mobilenetv2_yolov3 min = 155.52 max = 156.58 avg = 156.09 yolov4-tiny min = 231.89 max = 234.14 avg = 232.97 nanodet_m min = 72.74 max = 74.71 avg = 73.52 yolo-fastest-1.1 min = 35.25 max = 36.51 avg = 35.77 yolo-fastestv2 min = 29.94 max = 31.09 avg = 30.75 angler:/data/local/tmp # ./benchncnn 4 1 2 -1 1 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 70.83 max = 72.68 avg = 71.77 squeezenet_int8 min = 59.27 max = 59.60 avg = 59.51 mobilenet min = 110.70 max = 112.72 avg = 111.48 mobilenet_int8 min = 79.69 max = 80.01 avg = 79.81 mobilenet_v2 min = 77.85 max = 78.19 avg = 78.03 mobilenet_v3 min = 63.49 max = 63.92 avg = 63.73 shufflenet min = 41.43 max = 41.60 avg = 41.49 shufflenet_v2 min = 37.49 max = 38.26 avg = 37.97 mnasnet min = 73.91 max = 75.91 avg = 74.59 proxylessnasnet min = 94.13 max = 94.53 avg = 94.37 efficientnet_b0 min = 161.91 max = 162.38 avg = 162.10 efficientnetv2_b0 min = 179.33 max = 180.26 avg = 179.67 regnety_400m min = 100.35 max = 100.76 avg = 100.53 blazeface min = 12.57 max = 12.76 avg = 12.66 googlenet min = 232.77 max = 233.08 avg = 232.91 googlenet_int8 min = 203.39 max = 205.25 avg = 204.77 resnet18 min = 182.58 max = 183.17 avg = 182.91 resnet18_int8 min = 150.40 max = 152.07 avg = 151.35 alexnet min = 147.27 max = 149.00 avg = 148.06 vgg16 min = 986.93 max = 988.35 avg = 987.47 vgg16_int8 min = 816.37 max = 819.93 avg = 817.79 resnet50 min = 502.77 max = 510.88 avg = 508.53 resnet50_int8 min = 393.33 max = 398.07 avg = 395.86 squeezenet_ssd min = 
175.01 max = 175.61 avg = 175.32 squeezenet_ssd_int8 min = 145.19 max = 145.94 avg = 145.66 mobilenet_ssd min = 231.04 max = 231.25 avg = 231.13 mobilenet_ssd_int8 min = 159.81 max = 160.52 avg = 160.13 mobilenet_yolo min = 517.86 max = 523.71 avg = 521.85 mobilenetv2_yolov3 min = 275.84 max = 279.16 avg = 277.13 yolov4-tiny min = 363.71 max = 366.14 avg = 364.56 nanodet_m min = 93.90 max = 95.09 avg = 94.40 yolo-fastest-1.1 min = 45.94 max = 46.09 avg = 46.01 yolo-fastestv2 min = 38.23 max = 38.33 avg = 38.29 angler:/data/local/tmp $ ./benchncnn 4 1 2 0 1 [0 Adreno (TM) 430] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 430] buglssc=0 bugsbn1=1 buglbia=0 bugihfa=0 [0 Adreno (TM) 430] fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 39.49 max = 41.93 avg = 40.62 mobilenet min = 60.30 max = 61.81 avg = 60.88 mobilenet_v2 min = 45.38 max = 47.10 avg = 45.88 mobilenet_v3 min = 45.97 max = 47.39 avg = 46.69 shufflenet min = 29.12 max = 31.02 avg = 29.91 shufflenet_v2 min = 47.58 max = 50.06 avg = 48.26 mnasnet min = 47.84 max = 49.17 avg = 48.26 proxylessnasnet min = 49.51 max = 51.03 avg = 49.97 efficientnet_b0 min = 100.56 max = 105.60 avg = 102.45 regnety_400m min = 59.67 max = 61.24 avg = 60.56 blazeface min = 13.87 max = 13.98 avg = 13.93 googlenet min = 131.26 max = 136.33 avg = 133.40 resnet18 min = 116.38 max = 117.92 avg = 116.93 alexnet min = 72.59 max = 73.94 avg = 73.29 vgg16 min = 1090.07 max = 1101.71 avg = 1096.34 resnet50 min = 299.76 max = 300.78 avg = 300.40 squeezenet_ssd min = 181.95 max = 182.83 avg = 182.39 mobilenet_ssd min = 148.44 max = 151.07 avg = 149.75 mobilenet_yolo min = 284.46 max = 285.74 avg = 285.39 mobilenetv2_yolov3 min = 140.28 max = 148.62 avg = 144.83 ``` ### Qualcomm MSM8916 Snapdragon 410 (Cortex-A53 1.2GHz x 4) ``` HM2014812:/data/local/tmp # ./benchncnn 8 4 0 -1 1 loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down 
= 1 squeezenet min = 65.45 max = 73.59 avg = 68.10 squeezenet_int8 min = 59.39 max = 65.54 avg = 61.14 mobilenet min = 86.69 max = 94.10 avg = 90.03 mobilenet_int8 min = 62.22 max = 69.67 avg = 64.13 mobilenet_v2 min = 77.98 max = 89.53 avg = 82.00 mobilenet_v3 min = 62.17 max = 68.31 avg = 63.90 shufflenet min = 47.52 max = 53.76 avg = 49.92 shufflenet_v2 min = 39.77 max = 46.08 avg = 40.66 mnasnet min = 69.27 max = 75.73 avg = 71.73 proxylessnasnet min = 78.72 max = 85.37 avg = 81.33 efficientnet_b0 min = 126.62 max = 136.67 avg = 130.69 efficientnetv2_b0 min = 143.24 max = 150.97 avg = 146.89 regnety_400m min = 108.79 max = 116.22 avg = 112.99 blazeface min = 14.85 max = 15.02 avg = 14.94 googlenet min = 180.91 max = 190.37 avg = 186.36 googlenet_int8 min = 160.07 max = 170.86 avg = 165.05 resnet18 min = 137.91 max = 155.37 avg = 144.99 resnet18_int8 min = 104.34 max = 110.20 avg = 106.76 alexnet min = 105.30 max = 114.73 avg = 109.53 vgg16 min = 829.16 max = 942.94 avg = 853.28 vgg16_int8 min = 515.61 max = 547.32 avg = 526.50 resnet50 min = 380.46 max = 443.90 avg = 393.71 resnet50_int8 min = 318.06 max = 327.13 avg = 323.23 squeezenet_ssd min = 178.22 max = 189.02 avg = 184.51 squeezenet_ssd_int8 min = 153.75 max = 163.44 avg = 158.05 mobilenet_ssd min = 189.45 max = 195.17 avg = 193.10 mobilenet_ssd_int8 min = 132.59 max = 139.63 avg = 137.23 mobilenet_yolo min = 404.52 max = 414.20 avg = 409.97 mobilenetv2_yolov3 min = 271.33 max = 279.98 avg = 275.08 yolov4-tiny min = 349.36 max = 372.54 avg = 357.98 nanodet_m min = 103.01 max = 111.71 avg = 105.82 HM2014812:/data/local/tmp # ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 147.48 max = 149.35 avg = 148.40 squeezenet_int8 min = 143.20 max = 144.55 avg = 143.98 mobilenet min = 243.78 max = 244.33 avg = 244.08 mobilenet_int8 min = 206.23 max = 207.13 avg = 206.55 mobilenet_v2 min = 168.04 max = 170.37 avg = 169.06 mobilenet_v3 min = 147.10 
max = 147.91 avg = 147.55 shufflenet min = 88.47 max = 89.31 avg = 88.85 shufflenet_v2 min = 84.47 max = 84.80 avg = 84.60 mnasnet min = 162.81 max = 163.93 avg = 163.22 proxylessnasnet min = 208.18 max = 209.15 avg = 208.61 efficientnet_b0 min = 370.06 max = 371.14 avg = 370.64 efficientnetv2_b0 min = 418.28 max = 429.68 avg = 423.01 regnety_400m min = 216.42 max = 217.19 avg = 216.71 blazeface min = 27.63 max = 28.67 avg = 28.00 googlenet min = 525.25 max = 528.83 avg = 526.23 googlenet_int8 min = 469.78 max = 472.51 avg = 470.76 resnet18 min = 396.46 max = 399.66 avg = 397.57 resnet18_int8 min = 324.07 max = 326.64 avg = 325.34 alexnet min = 362.44 max = 363.02 avg = 362.68 vgg16 min = 2174.86 max = 2252.92 avg = 2215.62 vgg16_int8 min = 1726.07 max = 1732.69 avg = 1729.18 resnet50 min = 1136.96 max = 1142.94 avg = 1139.91 resnet50_int8 min = 977.73 max = 983.64 avg = 980.71 squeezenet_ssd min = 350.46 max = 353.35 avg = 351.37 squeezenet_ssd_int8 min = 333.91 max = 336.59 avg = 334.77 mobilenet_ssd min = 513.18 max = 519.05 avg = 516.22 mobilenet_ssd_int8 min = 424.37 max = 426.89 avg = 426.03 mobilenet_yolo min = 1143.20 max = 1145.04 avg = 1144.31 mobilenetv2_yolov3 min = 617.45 max = 619.30 avg = 618.37 yolov4-tiny min = 839.32 max = 847.57 avg = 844.61 nanodet_m min = 208.41 max = 211.31 avg = 210.03 ``` ### Qualcomm Snapdragon 888 (Cortex-X1 2.84GHz x1 + Cortex-A78 2.4GHz x3 + Cortex-A55 1.8GHz x4 + Adreno 660) ``` venus:/data/local/tmp $ ./benchncnn 8 8 2 -1 1 loop_count = 8 num_threads = 8 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 5.89 max = 6.04 avg = 5.98 squeezenet_int8 min = 6.09 max = 6.29 avg = 6.25 mobilenet min = 9.27 max = 10.22 avg = 9.64 mobilenet_int8 min = 5.90 max = 6.05 avg = 5.97 mobilenet_v2 min = 6.87 max = 8.42 avg = 7.63 mobilenet_v3 min = 8.93 max = 12.22 avg = 9.55 shufflenet min = 8.72 max = 11.44 avg = 9.20 shufflenet_v2 min = 6.05 max = 8.24 avg = 7.40 mnasnet min = 7.83 max = 9.03 avg = 8.53 proxylessnasnet 
min = 7.03 max = 9.62 avg = 7.88 efficientnet_b0 min = 12.62 max = 18.01 avg = 15.51 efficientnetv2_b0 min = 14.96 max = 23.75 avg = 19.61 regnety_400m min = 23.58 max = 23.87 avg = 23.72 blazeface min = 4.62 max = 4.87 avg = 4.73 googlenet min = 17.23 max = 25.41 avg = 19.83 googlenet_int8 min = 16.91 max = 17.05 avg = 16.99 resnet18 min = 12.05 max = 14.90 avg = 13.47 resnet18_int8 min = 15.10 max = 15.42 avg = 15.27 alexnet min = 13.85 max = 15.73 avg = 14.50 vgg16 min = 56.85 max = 57.88 avg = 57.32 vgg16_int8 min = 70.12 max = 72.99 avg = 71.53 resnet50 min = 29.45 max = 29.78 avg = 29.64 resnet50_int8 min = 24.99 max = 25.31 avg = 25.16 squeezenet_ssd min = 17.51 max = 22.63 avg = 19.25 squeezenet_ssd_int8 min = 16.81 max = 17.26 avg = 16.98 mobilenet_ssd min = 15.96 max = 16.52 avg = 16.11 mobilenet_ssd_int8 min = 13.70 max = 14.26 avg = 13.95 mobilenet_yolo min = 50.48 max = 52.88 avg = 51.76 mobilenetv2_yolov3 min = 22.63 max = 22.99 avg = 22.85 yolov4-tiny min = 29.01 max = 38.20 avg = 32.50 nanodet_m min = 12.58 max = 15.53 avg = 13.86 yolo-fastest-1.1 min = 8.57 max = 9.18 avg = 8.86 yolo-fastestv2 min = 6.85 max = 8.47 avg = 8.05 vision_transformer min = 548.48 max = 703.29 avg = 614.47 FastestDet min = 7.71 max = 9.31 avg = 8.15 venus:/data/local/tmp $ ./benchncnn 8 8 2 0 1 ./benchncnn 8 8 2 0 1 [0 Adreno (TM) 660] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 660] bugsbn1=1 bugbilz=0 bugcopc=0 bugihfa=0 [0 Adreno (TM) 660] fp16-p/s/u/a=1/1/0/1 int8-p/s/u/a=1/0/0/1 [0 Adreno (TM) 660] subgroup=64 basic/vote/ballot/shuffle=1/1/1/1 [0 Adreno (TM) 660] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 loop_count = 8 num_threads = 8 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 10.63 max = 12.41 avg = 11.80 squeezenet_int8 min = 6.93 max = 8.82 avg = 7.86 mobilenet min = 12.79 max = 14.12 avg = 13.48 mobilenet_int8 min = 9.18 max = 9.70 avg = 9.44 mobilenet_v2 min = 14.73 max = 15.62 avg = 15.13 mobilenet_v3 min = 14.68 max = 16.72 avg = 
15.70 shufflenet min = 11.28 max = 12.75 avg = 12.17 shufflenet_v2 min = 11.44 max = 14.27 avg = 12.07 mnasnet min = 14.54 max = 15.94 avg = 15.35 proxylessnasnet min = 16.33 max = 17.31 avg = 16.71 efficientnet_b0 min = 22.64 max = 25.42 avg = 24.35 efficientnetv2_b0 min = 41.16 max = 52.08 avg = 45.61 regnety_400m min = 17.56 max = 18.08 avg = 17.85 blazeface min = 2.87 max = 3.89 avg = 3.34 googlenet min = 31.64 max = 33.38 avg = 32.14 googlenet_int8 min = 18.29 max = 19.15 avg = 18.73 resnet18 min = 23.47 max = 24.60 avg = 23.85 resnet18_int8 min = 11.89 max = 17.17 avg = 14.54 alexnet min = 25.62 max = 26.23 avg = 25.98 vgg16 min = 41.81 max = 42.69 avg = 42.12 vgg16_int8 min = 79.43 max = 123.88 avg = 93.17 resnet50 min = 41.28 max = 43.27 avg = 41.79 resnet50_int8 min = 25.55 max = 26.34 avg = 25.97 squeezenet_ssd min = 30.10 max = 33.64 avg = 31.39 squeezenet_ssd_int8 min = 18.12 max = 18.58 avg = 18.30 mobilenet_ssd min = 28.29 max = 28.90 avg = 28.66 mobilenet_ssd_int8 min = 13.90 max = 14.31 avg = 14.02 mobilenet_yolo min = 43.88 max = 45.43 avg = 44.58 mobilenetv2_yolov3 min = 16.49 max = 37.05 avg = 19.32 yolov4-tiny min = 22.70 max = 50.58 avg = 34.92 nanodet_m min = 19.31 max = 19.88 avg = 19.57 yolo-fastest-1.1 min = 11.17 max = 11.33 avg = 11.26 yolo-fastestv2 min = 9.72 max = 10.04 avg = 9.85 vision_transformer min = 744.98 max = 758.15 avg = 751.62 FastestDet min = 11.95 max = 13.12 avg = 12.46 ``` ### Qualcomm Snapdragon X Elite (X1E78100), Oryon 3.4GHz x 12 + Adreno X1-85 Test on Oryon CPU ``` loop_count = 10 num_threads = 12 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 5.13 max = 5.19 avg = 5.16 squeezenet_int8 min = 4.31 max = 4.81 avg = 4.67 mobilenet min = 3.73 max = 3.85 avg = 3.78 mobilenet_int8 min = 2.51 max = 3.11 avg = 2.64 mobilenet_v2 min = 3.55 max = 3.70 avg = 3.60 mobilenet_v3 min = 3.28 max = 3.88 avg = 3.40 shufflenet min = 3.77 max = 5.07 avg = 4.02 shufflenet_v2 min = 3.24 max = 3.34 avg = 3.29 mnasnet min 
= 3.49 max = 4.09 avg = 3.58 proxylessnasnet min = 4.30 max = 4.93 avg = 4.41 efficientnet_b0 min = 4.97 max = 17.26 avg = 6.28 efficientnetv2_b0 min = 6.85 max = 10.19 avg = 7.39 regnety_400m min = 11.26 max = 11.36 avg = 11.31 blazeface min = 1.43 max = 1.48 avg = 1.44 googlenet min = 9.84 max = 9.96 avg = 9.89 googlenet_int8 min = 8.04 max = 8.33 avg = 8.13 resnet18 min = 6.63 max = 9.34 avg = 6.94 resnet18_int8 min = 5.47 max = 6.24 avg = 5.59 alexnet min = 7.52 max = 7.61 avg = 7.54 vgg16 min = 29.66 max = 32.27 avg = 30.07 vgg16_int8 min = 32.97 max = 34.43 avg = 33.32 resnet50 min = 16.54 max = 16.68 avg = 16.63 resnet50_int8 min = 11.12 max = 13.84 avg = 11.42 squeezenet_ssd min = 9.20 max = 9.77 avg = 9.39 squeezenet_ssd_int8 min = 8.50 max = 9.17 avg = 8.73 mobilenet_ssd min = 8.28 max = 8.67 avg = 8.36 mobilenet_ssd_int8 min = 5.59 max = 6.25 avg = 5.74 mobilenet_yolo min = 21.42 max = 22.77 avg = 21.65 mobilenetv2_yolov3 min = 14.03 max = 14.34 avg = 14.13 yolov4-tiny min = 23.60 max = 23.84 avg = 23.70 nanodet_m min = 6.64 max = 7.40 avg = 6.77 yolo-fastest-1.1 min = 4.14 max = 7.15 avg = 4.53 yolo-fastestv2 min = 3.63 max = 3.70 avg = 3.66 vision_transformer min = 384.74 max = 415.74 avg = 391.28 FastestDet min = 4.29 max = 4.94 avg = 4.40 ``` Test on X1-85 GPU ``` [0 Adreno X1-85] queueC=0[1] queueT=0[1] [0 Adreno X1-85] fp16-p/s/u/a=1/1/0/1 int8-p/s/u/a=1/0/0/1 bf16-p/s=1/0 [0 Adreno X1-85] subgroup=128(64~128) ops=1/1/1/1/1/1/1/1/1/1 [0 Adreno X1-85] fp16-cm=0 int8-cm=0 bf16-cm=0 fp8-cm=0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 3.23 max = 3.99 avg = 3.63 mobilenet min = 3.33 max = 5.86 avg = 5.20 mobilenet_v2 min = 4.06 max = 4.77 avg = 4.52 mobilenet_v3 min = 4.61 max = 8.12 avg = 6.60 shufflenet min = 3.16 max = 7.45 avg = 4.65 shufflenet_v2 min = 3.90 max = 6.00 avg = 5.02 mnasnet min = 4.44 max = 5.12 avg = 4.81 proxylessnasnet min = 4.91 max = 7.02 avg = 6.15 efficientnet_b0 min = 6.61 max 
= 7.25 avg = 7.04 efficientnetv2_b0 min = 21.48 max = 56.52 avg = 39.03 regnety_400m min = 7.33 max = 7.60 avg = 7.44 blazeface min = 2.83 max = 4.59 avg = 4.30 googlenet min = 11.00 max = 12.98 avg = 12.60 resnet18 min = 12.11 max = 14.59 avg = 13.27 alexnet min = 11.64 max = 12.18 avg = 11.96 vgg16 min = 40.06 max = 45.62 avg = 42.88 resnet50 min = 18.99 max = 21.93 avg = 20.88 squeezenet_ssd min = 10.95 max = 14.73 avg = 13.03 mobilenet_ssd min = 7.92 max = 9.75 avg = 9.46 mobilenet_yolo min = 9.02 max = 12.54 avg = 11.38 mobilenetv2_yolov3 min = 12.70 max = 14.70 avg = 13.95 yolov4-tiny min = 25.88 max = 30.26 avg = 28.12 nanodet_m min = 9.38 max = 33.46 avg = 20.29 yolo-fastest-1.1 min = 6.08 max = 6.75 avg = 6.43 yolo-fastestv2 min = 4.50 max = 6.47 avg = 6.04 vision_transformer min = 184.89 max = 191.78 avg = 189.07 FastestDet min = 6.01 max = 7.83 avg = 6.43 ``` ### Raspberry Pi 3 Model B+ Broadcom BCM2837B0, Cortex-A53 (ARMv8) (1.4GHz x 4) ``` pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 4 4 0 -1 1 loop_count = 4 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 84.74 max = 85.60 avg = 85.22 squeezenet_int8 min = 74.48 max = 74.80 avg = 74.68 mobilenet min = 107.84 max = 110.13 avg = 108.66 mobilenet_int8 min = 66.91 max = 67.12 avg = 67.03 mobilenet_v2 min = 110.64 max = 112.73 avg = 111.68 mobilenet_v3 min = 85.78 max = 86.74 avg = 86.44 shufflenet min = 58.38 max = 60.32 avg = 59.33 shufflenet_v2 min = 46.76 max = 47.53 avg = 47.19 mnasnet min = 95.53 max = 95.88 avg = 95.78 proxylessnasnet min = 102.24 max = 105.58 avg = 103.38 efficientnet_b0 min = 134.87 max = 136.98 avg = 135.86 efficientnetv2_b0 min = 146.62 max = 148.06 avg = 147.13 regnety_400m min = 118.60 max = 119.51 avg = 119.03 blazeface min = 15.42 max = 15.61 avg = 15.52 googlenet min = 223.78 max = 224.85 avg = 224.22 googlenet_int8 min = 188.23 max = 190.15 avg = 189.21 resnet18 min = 270.86 max = 272.66 avg = 271.93 resnet18_int8 min = 159.57 max = 
160.39 avg = 160.07 alexnet min = 157.79 max = 160.77 avg = 159.09 resnet50 min = 583.57 max = 591.41 avg = 587.42 resnet50_int8 min = 383.96 max = 401.37 avg = 391.87 squeezenet_ssd min = 247.90 max = 249.77 avg = 248.98 squeezenet_ssd_int8 min = 191.65 max = 192.81 avg = 192.17 mobilenet_ssd min = 240.11 max = 241.02 avg = 240.62 mobilenet_ssd_int8 min = 136.30 max = 137.26 avg = 136.73 mobilenet_yolo min = 523.59 max = 539.91 avg = 529.98 mobilenetv2_yolov3 min = 356.44 max = 366.85 avg = 362.06 yolov4-tiny min = 410.25 max = 422.18 avg = 417.17 nanodet_m min = 114.98 max = 115.83 avg = 115.40 yolo-fastest-1.1 min = 79.85 max = 80.83 avg = 80.28 yolo-fastestv2 min = 62.36 max = 62.91 avg = 62.60 FastestDet min = 67.11 max = 68.51 avg = 67.98 pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 125.34 max = 125.81 avg = 125.58 squeezenet_int8 min = 135.56 max = 136.34 avg = 135.98 mobilenet min = 204.62 max = 207.06 avg = 205.65 mobilenet_int8 min = 181.34 max = 182.46 avg = 181.91 mobilenet_v2 min = 158.69 max = 158.94 avg = 158.80 mobilenet_v3 min = 127.13 max = 127.31 avg = 127.23 shufflenet min = 84.64 max = 85.29 avg = 84.89 shufflenet_v2 min = 74.28 max = 74.64 avg = 74.44 mnasnet min = 148.12 max = 148.65 avg = 148.42 proxylessnasnet min = 199.56 max = 201.99 avg = 200.42 efficientnet_b0 min = 240.94 max = 241.75 avg = 241.27 efficientnetv2_b0 min = 270.71 max = 270.90 avg = 270.83 regnety_400m min = 186.89 max = 187.08 avg = 187.01 blazeface min = 22.75 max = 23.24 avg = 22.95 googlenet min = 450.64 max = 450.96 avg = 450.79 googlenet_int8 min = 424.66 max = 426.83 avg = 425.78 resnet18 min = 379.21 max = 380.01 avg = 379.57 resnet18_int8 min = 312.23 max = 313.21 avg = 312.68 alexnet min = 270.13 max = 270.88 avg = 270.55 resnet50 min = 977.51 max = 981.89 avg = 979.75 resnet50_int8 min = 890.77 max = 896.89 avg = 893.83 squeezenet_ssd min = 331.52 
max = 333.47 avg = 332.46 squeezenet_ssd_int8 min = 317.71 max = 319.64 avg = 318.62 mobilenet_ssd min = 425.42 max = 426.52 avg = 425.93 mobilenet_ssd_int8 min = 370.17 max = 370.90 avg = 370.66 mobilenet_yolo min = 930.40 max = 932.24 avg = 931.46 mobilenetv2_yolov3 min = 534.79 max = 543.56 avg = 539.20 yolov4-tiny min = 675.33 max = 676.83 avg = 676.14 nanodet_m min = 178.13 max = 178.98 avg = 178.64 yolo-fastest-1.1 min = 100.83 max = 101.96 avg = 101.49 yolo-fastestv2 min = 79.73 max = 79.94 avg = 79.84 FastestDet min = 89.09 max = 90.07 avg = 89.78 ``` ### Raspberry Pi 4 Model B Broadcom BCM2711B0, Cortex-A72 (ARMv8) (1.8GHz x 4) ``` pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 10 4 0 -1 1 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 46.28 max = 46.91 avg = 46.65 squeezenet_int8 min = 42.18 max = 44.98 avg = 42.59 mobilenet min = 60.74 max = 61.79 avg = 61.17 mobilenet_int8 min = 34.19 max = 34.55 avg = 34.37 mobilenet_v2 min = 61.63 max = 62.02 avg = 61.88 mobilenet_v3 min = 47.08 max = 48.40 avg = 47.53 shufflenet min = 32.91 max = 33.30 avg = 33.09 shufflenet_v2 min = 24.37 max = 24.73 avg = 24.56 mnasnet min = 51.80 max = 52.14 avg = 51.98 proxylessnasnet min = 53.02 max = 53.58 avg = 53.32 efficientnet_b0 min = 73.92 max = 74.44 avg = 74.19 efficientnetv2_b0 min = 79.10 max = 79.60 avg = 79.34 regnety_400m min = 65.27 max = 66.12 avg = 65.70 blazeface min = 8.62 max = 8.75 avg = 8.69 googlenet min = 113.74 max = 115.14 avg = 114.35 googlenet_int8 min = 100.87 max = 101.71 avg = 101.25 resnet18 min = 122.27 max = 125.39 avg = 123.12 resnet18_int8 min = 82.19 max = 94.12 avg = 83.92 alexnet min = 75.75 max = 78.08 avg = 76.40 vgg16 min = 541.66 max = 552.56 avg = 547.09 vgg16_int8 min = 391.44 max = 395.73 avg = 394.23 resnet50 min = 261.90 max = 263.91 avg = 262.83 resnet50_int8 min = 195.60 max = 198.08 avg = 196.65 squeezenet_ssd min = 127.01 max = 129.85 avg = 127.61 squeezenet_ssd_int8 min = 
104.98 max = 107.67 avg = 105.47 mobilenet_ssd min = 120.43 max = 123.28 avg = 121.46 mobilenet_ssd_int8 min = 70.70 max = 72.85 avg = 71.14 mobilenet_yolo min = 270.89 max = 273.42 avg = 272.33 mobilenetv2_yolov3 min = 183.85 max = 185.73 avg = 184.88 yolov4-tiny min = 205.95 max = 209.90 avg = 207.22 nanodet_m min = 68.08 max = 68.69 avg = 68.38 yolo-fastest-1.1 min = 47.97 max = 48.20 avg = 48.06 yolo-fastestv2 min = 37.17 max = 37.69 avg = 37.47 vision_transformer min = 1872.31 max = 1964.95 avg = 1909.21 FastestDet min = 38.39 max = 39.17 avg = 38.69 pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 10 1 0 -1 1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 73.35 max = 75.10 avg = 73.96 squeezenet_int8 min = 69.17 max = 69.66 avg = 69.42 mobilenet min = 123.76 max = 125.35 avg = 124.32 mobilenet_int8 min = 84.66 max = 85.24 avg = 84.82 mobilenet_v2 min = 92.98 max = 94.05 avg = 93.48 mobilenet_v3 min = 72.48 max = 73.14 avg = 72.81 shufflenet min = 47.17 max = 47.83 avg = 47.51 shufflenet_v2 min = 41.62 max = 42.60 avg = 42.12 mnasnet min = 83.60 max = 84.35 avg = 83.98 proxylessnasnet min = 98.48 max = 99.33 avg = 98.78 efficientnet_b0 min = 129.45 max = 130.02 avg = 129.73 efficientnetv2_b0 min = 155.06 max = 156.70 avg = 155.76 regnety_400m min = 105.39 max = 106.03 avg = 105.70 blazeface min = 12.54 max = 12.84 avg = 12.65 googlenet min = 235.38 max = 236.34 avg = 235.94 googlenet_int8 min = 209.63 max = 210.39 avg = 210.00 resnet18 min = 190.80 max = 191.43 avg = 191.10 resnet18_int8 min = 157.92 max = 158.97 avg = 158.50 alexnet min = 139.34 max = 139.44 avg = 139.40 vgg16 min = 1066.58 max = 1079.30 avg = 1071.85 vgg16_int8 min = 866.15 max = 873.75 avg = 869.84 resnet50 min = 533.15 max = 535.12 avg = 534.11 resnet50_int8 min = 423.72 max = 424.24 avg = 423.96 squeezenet_ssd min = 178.90 max = 179.53 avg = 179.30 squeezenet_ssd_int8 min = 157.05 max = 159.06 avg = 157.89 mobilenet_ssd min = 250.71 max 
= 251.26 avg = 251.00 mobilenet_ssd_int8 min = 170.21 max = 170.96 avg = 170.56 mobilenet_yolo min = 557.48 max = 560.08 avg = 558.80 mobilenetv2_yolov3 min = 301.60 max = 307.98 avg = 306.52 yolov4-tiny min = 370.55 max = 375.69 avg = 372.99 nanodet_m min = 103.05 max = 103.74 avg = 103.45 yolo-fastest-1.1 min = 56.58 max = 57.44 avg = 57.01 yolo-fastestv2 min = 46.69 max = 47.34 avg = 47.03 vision_transformer min = 6605.19 max = 6606.66 avg = 6605.73 FastestDet min = 52.11 max = 52.97 avg = 52.61 ``` ### Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8) (2.4GHz x 4) ``` pi@raspberrypi:~/ncnn/benchmark $ ./benchncnn 10 4 0 -1 -1 >> text.out loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 6.74 max = 8.16 avg = 7.38 squeezenet_int8 min = 6.97 max = 7.67 avg = 7.21 mobilenet min = 9.00 max = 72.98 avg = 33.88 mobilenet_int8 min = 8.68 max = 8.80 avg = 8.74 mobilenet_v2 min = 10.46 max = 10.63 avg = 10.52 mobilenet_v3 min = 7.30 max = 7.44 avg = 7.35 shufflenet min = 4.14 max = 4.18 avg = 4.16 shufflenet_v2 min = 3.37 max = 3.41 avg = 3.39 mnasnet min = 6.83 max = 8.55 avg = 7.10 proxylessnasnet min = 7.85 max = 7.97 avg = 7.88 efficientnet_b0 min = 12.28 max = 12.37 avg = 12.33 efficientnetv2_b0 min = 13.54 max = 13.84 avg = 13.69 regnety_400m min = 10.93 max = 11.07 avg = 10.99 blazeface min = 1.45 max = 1.48 avg = 1.47 googlenet min = 25.13 max = 25.47 avg = 25.35 googlenet_int8 min = 24.00 max = 24.23 avg = 24.12 resnet18 min = 19.84 max = 20.19 avg = 19.96 resnet18_int8 min = 16.68 max = 16.83 avg = 16.74 alexnet min = 21.21 max = 21.54 avg = 21.36 vgg16 min = 127.75 max = 134.00 avg = 129.24 vgg16_int8 min = 106.39 max = 110.66 avg = 107.01 resnet50 min = 45.94 max = 46.54 avg = 46.21 resnet50_int8 min = 40.16 max = 42.58 avg = 40.75 squeezenet_ssd min = 30.10 max = 30.95 avg = 30.37 squeezenet_ssd_int8 min = 27.71 max = 29.03 avg = 28.15 mobilenet_ssd min = 24.16 max = 24.89 avg = 24.52 mobilenet_ssd_int8 min = 
21.79 max = 22.37 avg = 22.05 mobilenet_yolo min = 58.06 max = 58.45 avg = 58.19 mobilenetv2_yolov3 min = 37.49 max = 37.94 avg = 37.68 yolov4-tiny min = 44.45 max = 60.58 avg = 46.29 nanodet_m min = 11.01 max = 11.28 avg = 11.18 yolo-fastest-1.1 min = 5.53 max = 5.97 avg = 5.62 yolo-fastestv2 min = 4.76 max = 4.84 avg = 4.80 vision_transformer min = 600.65 max = 622.47 avg = 611.65 FastestDet min = 4.83 max = 6.94 avg = 5.34 pi@raspberrypi:~/ncnn/benchmark $ ./benchncnn 10 1 0 -1 -1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 11.77 max = 12.18 avg = 11.87 squeezenet_int8 min = 11.67 max = 11.98 avg = 11.82 mobilenet min = 20.24 max = 20.59 avg = 20.30 mobilenet_int8 min = 14.38 max = 14.51 avg = 14.44 mobilenet_v2 min = 16.21 max = 16.49 avg = 16.38 mobilenet_v3 min = 11.64 max = 12.12 avg = 11.80 shufflenet min = 7.17 max = 7.24 avg = 7.20 shufflenet_v2 min = 7.07 max = 7.21 avg = 7.14 mnasnet min = 12.93 max = 13.03 avg = 12.99 proxylessnasnet min = 15.72 max = 15.80 avg = 15.74 efficientnet_b0 min = 24.12 max = 24.53 avg = 24.20 efficientnetv2_b0 min = 27.59 max = 28.04 avg = 27.75 regnety_400m min = 16.41 max = 16.66 avg = 16.49 blazeface min = 2.98 max = 3.04 avg = 3.02 googlenet min = 48.62 max = 48.87 avg = 48.71 googlenet_int8 min = 49.07 max = 49.26 avg = 49.15 resnet18 min = 29.54 max = 30.17 avg = 29.68 resnet18_int8 min = 36.30 max = 36.55 avg = 36.42 alexnet min = 35.24 max = 35.86 avg = 35.62 vgg16 min = 188.84 max = 190.87 avg = 189.63 vgg16_int8 min = 272.27 max = 274.15 avg = 273.10 resnet50 min = 89.04 max = 89.87 avg = 89.43 resnet50_int8 min = 80.00 max = 80.50 avg = 80.16 squeezenet_ssd min = 38.02 max = 38.69 avg = 38.29 squeezenet_ssd_int8 min = 40.58 max = 41.17 avg = 40.94 mobilenet_ssd min = 45.42 max = 47.08 avg = 45.90 mobilenet_ssd_int8 min = 36.05 max = 37.02 avg = 36.35 mobilenet_yolo min = 104.82 max = 106.56 avg = 105.69 mobilenetv2_yolov3 min = 60.11 max = 60.29 avg = 60.19 
yolov4-tiny min = 67.61 max = 69.05 avg = 68.02 nanodet_m min = 19.63 max = 19.81 avg = 19.69 yolo-fastest-1.1 min = 8.10 max = 8.14 avg = 8.12 yolo-fastestv2 min = 7.21 max = 7.26 avg = 7.24 vision_transformer min = 1249.08 max = 1253.32 avg = 1250.30 FastestDet min = 7.33 max = 7.44 avg = 7.38 ``` ### Raspberry Pi 5 Broadcom BCM2712, VideoCore VII Graphics (Vulkan 1.2) ``` fan@raspberrypi:~/ncnn/benchmark $ ../build/benchmark/benchncnn 10 $(nproc) 0 0 [0 V3D 7.1.7] queueC=0[1] queueG=0[1] queueT=0[1] [0 V3D 7.1.7] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 V3D 7.1.7] fp16-p/s/a=1/1/0 int8-p/s/a=1/1/0 [0 V3D 7.1.7] subgroup=16 basic/vote/ballot/shuffle=1/0/0/0 [0 V3D 7.1.7] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 [1 llvmpipe (LLVM 15.0.6, 128 bits)] queueC=0[1] queueG=0[1] queueT=0[1] [1 llvmpipe (LLVM 15.0.6, 128 bits)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 llvmpipe (LLVM 15.0.6, 128 bits)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [1 llvmpipe (LLVM 15.0.6, 128 bits)] subgroup=4 basic/vote/ballot/shuffle=1/1/1/1 [1 llvmpipe (LLVM 15.0.6, 128 bits)] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 120.75 max = 121.31 avg = 120.94 squeezenet_int8 min = 9.57 max = 24.49 avg = 11.23 mobilenet min = 160.32 max = 160.75 avg = 160.53 mobilenet_int8 min = 11.29 max = 11.47 avg = 11.37 mobilenet_v2 min = 121.05 max = 121.93 avg = 121.46 mobilenet_v3 min = 117.90 max = 119.20 avg = 118.48 shufflenet min = 70.82 max = 71.55 avg = 71.04 shufflenet_v2 min = 97.74 max = 98.58 avg = 98.00 mnasnet min = 118.21 max = 118.76 avg = 118.44 proxylessnasnet min = 124.28 max = 124.92 avg = 124.52 efficientnet_b0 min = 187.48 max = 188.38 avg = 187.93 efficientnetv2_b0 min = 270.11 max = 280.80 avg = 272.26 regnety_400m min = 142.14 max = 143.25 avg = 142.66 blazeface min = 31.97 max = 32.41 avg = 32.17 googlenet min = 346.30 max = 347.47 avg = 346.81 googlenet_int8 min = 30.77 max = 32.26 avg = 31.52 
resnet18 min = 346.96 max = 347.50 avg = 347.26 resnet18_int8 min = 19.95 max = 20.95 avg = 20.48 alexnet min = 181.57 max = 182.03 avg = 181.75 vgg16 min = 1776.00 max = 1776.66 avg = 1776.40 vgg16_int8 min = 134.10 max = 141.76 avg = 136.32 resnet50 min = 841.90 max = 842.50 avg = 842.16 resnet50_int8 min = 54.29 max = 55.22 avg = 54.54 squeezenet_ssd min = 461.71 max = 468.09 avg = 466.97 squeezenet_ssd_int8 min = 38.05 max = 39.00 avg = 38.58 mobilenet_ssd min = 379.50 max = 381.66 avg = 380.14 mobilenet_ssd_int8 min = 29.91 max = 30.77 avg = 30.13 mobilenet_yolo min = 753.61 max = 755.06 avg = 753.97 mobilenetv2_yolov3 min = 382.18 max = 389.90 avg = 386.97 yolov4-tiny min = 673.87 max = 674.71 avg = 674.07 nanodet_m min = 206.55 max = 210.48 avg = 209.69 yolo-fastest-1.1 min = 109.98 max = 111.18 avg = 110.45 yolo-fastestv2 min = 86.07 max = 87.16 avg = 86.51 vision_transformer min = 20594.51 max = 20601.53 avg = 20596.59 FastestDet min = 90.25 max = 91.00 avg = 90.64 ``` ### Raspberry Pi 5 Broadcom BCM2712 overclocked to 2.9GHz, VideoCore VII Graphics overclocked to 1.1GHz (Vulkan 1.2) ``` pi@raspberrypi:~/ncnn/build/benchmark $ sudo echo "arm_freq=2900" >> /boot/firmware/config.txt pi@raspberrypi:~/ncnn/build/benchmark $ sudo echo "gpu_freq=1100" >> /boot/firmware/config.txt pi@raspberrypi:~/ncnn/build/benchmark $ sudo reboot pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 10 4 0 0 [0 V3D 7.1.7] queueC=0[1] queueG=0[1] queueT=0[1] [0 V3D 7.1.7] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 V3D 7.1.7] fp16-p/s/u/a=1/1/1/0 int8-p/s/u/a=1/1/1/0 [0 V3D 7.1.7] subgroup=16 basic/vote/ballot/shuffle=1/0/0/0 [0 V3D 7.1.7] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 [1 llvmpipe (LLVM 15.0.6, 128 bits)] queueC=0[1] queueG=0[1] queueT=0[1] [1 llvmpipe (LLVM 15.0.6, 128 bits)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 llvmpipe (LLVM 15.0.6, 128 bits)] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1 [1 llvmpipe (LLVM 15.0.6, 128 bits)] subgroup=4
basic/vote/ballot/shuffle=1/1/1/1 [1 llvmpipe (LLVM 15.0.6, 128 bits)] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 106.98 max = 107.05 avg = 107.02 squeezenet_int8 min = 8.51 max = 8.83 avg = 8.65 mobilenet min = 147.66 max = 147.71 avg = 147.68 mobilenet_int8 min = 10.21 max = 10.54 avg = 10.37 mobilenet_v2 min = 110.11 max = 110.23 avg = 110.18 mobilenet_v3 min = 101.84 max = 102.03 avg = 101.92 shufflenet min = 59.77 max = 59.84 avg = 59.80 shufflenet_v2 min = 81.46 max = 81.60 avg = 81.51 mnasnet min = 105.88 max = 105.98 avg = 105.94 proxylessnasnet min = 108.82 max = 108.89 avg = 108.86 efficientnet_b0 min = 168.79 max = 168.93 avg = 168.87 efficientnetv2_b0 min = 232.52 max = 232.80 avg = 232.65 regnety_400m min = 130.33 max = 130.49 avg = 130.36 blazeface min = 22.23 max = 22.49 avg = 22.39 googlenet min = 299.25 max = 299.37 avg = 299.31 googlenet_int8 min = 29.21 max = 29.97 avg = 29.58 resnet18 min = 304.47 max = 304.64 avg = 304.58 resnet18_int8 min = 19.31 max = 20.77 avg = 20.24 alexnet min = 203.68 max = 203.79 avg = 203.76 vgg16 min = 1571.91 max = 1572.22 avg = 1572.06 vgg16_int8 min = 128.46 max = 130.89 avg = 129.96 resnet50 min = 754.16 max = 754.33 avg = 754.26 resnet50_int8 min = 52.65 max = 53.48 avg = 53.09 squeezenet_ssd min = 398.22 max = 398.36 avg = 398.28 squeezenet_ssd_int8 min = 34.26 max = 34.67 avg = 34.51 mobilenet_ssd min = 344.81 max = 344.99 avg = 344.89 mobilenet_ssd_int8 min = 27.59 max = 28.01 avg = 27.77 mobilenet_yolo min = 712.53 max = 712.63 avg = 712.59 mobilenetv2_yolov3 min = 362.81 max = 363.11 avg = 362.90 yolov4-tiny min = 589.30 max = 589.51 avg = 589.39 nanodet_m min = 178.83 max = 178.97 avg = 178.88 yolo-fastest-1.1 min = 92.36 max = 92.58 avg = 92.45 yolo-fastestv2 min = 70.68 max = 70.84 avg = 70.74 vision_transformer min = 18615.94 max = 18648.17 avg = 18633.77 FastestDet min = 74.59 max = 74.68 avg = 74.63 
pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 10 4 0 -1 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 7.61 max = 7.76 avg = 7.70 squeezenet_int8 min = 7.97 max = 8.68 avg = 8.23 mobilenet min = 9.65 max = 9.91 avg = 9.80 mobilenet_int8 min = 10.60 max = 36.93 avg = 13.29 mobilenet_v2 min = 12.25 max = 12.64 avg = 12.40 mobilenet_v3 min = 8.14 max = 8.26 avg = 8.20 shufflenet min = 3.72 max = 3.82 avg = 3.77 shufflenet_v2 min = 2.99 max = 3.10 avg = 3.05 mnasnet min = 7.27 max = 7.46 avg = 7.37 proxylessnasnet min = 8.39 max = 8.55 avg = 8.48 efficientnet_b0 min = 13.15 max = 13.59 avg = 13.39 efficientnetv2_b0 min = 14.79 max = 15.30 avg = 14.91 regnety_400m min = 9.49 max = 9.71 avg = 9.57 blazeface min = 1.41 max = 1.46 avg = 1.43 googlenet min = 28.60 max = 28.87 avg = 28.73 googlenet_int8 min = 27.09 max = 27.77 avg = 27.47 resnet18 min = 21.47 max = 21.88 avg = 21.65 resnet18_int8 min = 20.07 max = 20.30 avg = 20.24 alexnet min = 22.75 max = 23.47 avg = 23.05 vgg16 min = 154.32 max = 158.51 avg = 157.40 vgg16_int8 min = 127.78 max = 162.60 avg = 133.21 resnet50 min = 49.36 max = 49.86 avg = 49.63 resnet50_int8 min = 46.44 max = 46.89 avg = 46.74 squeezenet_ssd min = 37.31 max = 74.95 avg = 41.30 squeezenet_ssd_int8 min = 32.62 max = 33.63 avg = 33.09 mobilenet_ssd min = 27.40 max = 27.99 avg = 27.68 mobilenet_ssd_int8 min = 26.70 max = 27.71 avg = 27.23 mobilenet_yolo min = 60.25 max = 61.10 avg = 60.67 mobilenetv2_yolov3 min = 43.51 max = 44.29 avg = 43.87 yolov4-tiny min = 51.63 max = 52.64 avg = 52.24 nanodet_m min = 11.89 max = 12.06 avg = 11.97 yolo-fastest-1.1 min = 5.63 max = 5.78 avg = 5.69 yolo-fastestv2 min = 5.34 max = 5.48 avg = 5.40 vision_transformer min = 481.78 max = 506.72 avg = 493.05 FastestDet min = 4.91 max = 5.14 avg = 5.01 ``` ### Raspberry Pi Zero 2 W Broadcom BCM2710A1, Cortex-A53 (ARMv8) (1.0GHz x 4) ``` loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 
1 squeezenet min = 119.52 max = 120.29 avg = 119.93 squeezenet_int8 min = 96.32 max = 96.96 avg = 96.55 mobilenet min = 162.60 max = 165.49 avg = 163.19 mobilenet_int8 min = 90.78 max = 91.39 avg = 91.03 mobilenet_v2 min = 145.71 max = 148.83 avg = 147.39 mobilenet_v3 min = 113.89 max = 151.95 avg = 119.04 shufflenet min = 72.72 max = 73.27 avg = 72.96 shufflenet_v2 min = 63.64 max = 64.50 avg = 64.13 mnasnet min = 126.07 max = 126.93 avg = 126.53 proxylessnasnet min = 139.90 max = 140.84 avg = 140.35 efficientnet_b0 min = 201.88 max = 202.55 avg = 202.14 efficientnetv2_b0 min = 227.22 max = 228.84 avg = 228.09 regnety_400m min = 156.49 max = 157.47 avg = 156.96 blazeface min = 22.79 max = 23.28 avg = 23.10 googlenet min = 323.74 max = 324.90 avg = 324.45 googlenet_int8 min = 250.86 max = 252.82 avg = 251.63 resnet18 min = 351.37 max = 355.67 avg = 353.45 resnet18_int8 min = 194.83 max = 196.68 avg = 195.51 alexnet min = 271.18 max = 273.53 avg = 272.18 resnet50 min = 777.44 max = 797.47 avg = 782.63 resnet50_int8 min = 496.78 max = 498.86 avg = 497.57 squeezenet_ssd min = 376.10 max = 382.41 avg = 379.13 squeezenet_ssd_int8 min = 255.99 max = 257.57 avg = 256.78 mobilenet_ssd min = 338.64 max = 339.93 avg = 339.50 mobilenet_ssd_int8 min = 190.24 max = 190.68 avg = 190.48 mobilenet_yolo min = 746.83 max = 748.14 avg = 747.53 mobilenetv2_yolov3 min = 487.99 max = 491.18 avg = 489.37 yolov4-tiny min = 644.73 max = 652.24 avg = 646.64 nanodet_m min = 165.27 max = 167.12 avg = 166.27 yolo-fastest-1.1 min = 98.74 max = 100.02 avg = 99.17 yolo-fastestv2 min = 80.52 max = 81.86 avg = 81.29 loop_count = 8 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 240.53 max = 241.07 avg = 240.77 squeezenet_int8 min = 212.63 max = 213.23 avg = 212.94 mobilenet min = 393.79 max = 394.04 avg = 393.94 mobilenet_int8 min = 286.58 max = 286.95 avg = 286.75 mobilenet_v2 min = 273.97 max = 274.51 avg = 274.23 mobilenet_v3 min = 233.77 max = 234.59 avg = 234.20 
shufflenet min = 133.05 max = 133.36 avg = 133.23 shufflenet_v2 min = 128.86 max = 129.47 avg = 129.18 mnasnet min = 265.70 max = 266.17 avg = 265.93 proxylessnasnet min = 329.78 max = 330.54 avg = 330.13 efficientnet_b0 min = 518.42 max = 519.38 avg = 519.00 efficientnetv2_b0 min = 594.37 max = 595.17 avg = 594.74 regnety_400m min = 329.53 max = 330.44 avg = 329.87 blazeface min = 42.24 max = 45.56 avg = 43.96 googlenet min = 780.05 max = 780.63 avg = 780.39 googlenet_int8 min = 663.83 max = 664.43 avg = 664.15 resnet18 min = 653.62 max = 657.59 avg = 654.69 resnet18_int8 min = 479.03 max = 479.72 avg = 479.40 alexnet min = 687.99 max = 690.34 avg = 689.15 resnet50 min = 1800.97 max = 1806.11 avg = 1802.79 resnet50_int8 min = 1311.68 max = 1314.56 avg = 1313.15 squeezenet_ssd min = 563.63 max = 565.57 avg = 564.44 squeezenet_ssd_int8 min = 481.24 max = 483.97 avg = 482.20 mobilenet_ssd min = 799.21 max = 829.10 avg = 803.56 mobilenet_ssd_int8 min = 568.11 max = 568.88 avg = 568.42 mobilenet_yolo min = 1815.60 max = 1816.44 avg = 1815.93 mobilenetv2_yolov3 min = 951.34 max = 952.15 avg = 951.72 yolov4-tiny min = 1258.21 max = 1259.49 avg = 1258.66 nanodet_m min = 301.04 max = 304.09 avg = 301.70 yolo-fastest-1.1 min = 155.04 max = 155.98 avg = 155.53 yolo-fastestv2 min = 126.77 max = 127.40 avg = 127.05 ``` ### Banana Pi M2 Zero 2 AllWinner H2+, Cortex-A7 (ARMv7-A) (1.2GHz x 4) ``` loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 230.97 max = 232.18 avg = 231.49 squeezenet_int8 min = 171.12 max = 172.87 avg = 171.68 mobilenet min = 327.65 max = 340.92 avg = 329.88 mobilenet_int8 min = 166.58 max = 169.55 avg = 167.47 mobilenet_v2 min = 276.81 max = 278.67 avg = 277.55 mobilenet_v3 min = 220.74 max = 225.14 avg = 222.08 shufflenet min = 147.97 max = 157.68 avg = 149.40 shufflenet_v2 min = 146.56 max = 154.90 avg = 148.25 mnasnet min = 243.06 max = 244.47 avg = 243.80 proxylessnasnet min = 260.38 max = 261.47 avg = 260.66 
efficientnet_b0 min = 368.98 max = 371.03 avg = 369.96 efficientnetv2_b0 min = 433.96 max = 459.25 avg = 437.52 regnety_400m min = 307.53 max = 312.29 avg = 308.68 blazeface min = 46.54 max = 47.35 avg = 46.98 googlenet min = 647.86 max = 669.20 avg = 651.19 googlenet_int8 min = 439.90 max = 442.35 avg = 441.38 resnet18 min = 642.53 max = 856.58 avg = 698.28 resnet18_int8 min = 352.10 max = 354.51 avg = 353.44 alexnet min = 593.16 max = 624.20 avg = 598.66 resnet50 min = 1556.12 max = 1782.22 avg = 1606.86 resnet50_int8 min = 911.63 max = 999.42 avg = 924.37 squeezenet_ssd min = 653.85 max = 658.07 avg = 655.19 squeezenet_ssd_int8 min = 456.26 max = 467.76 avg = 459.87 mobilenet_ssd min = 671.93 max = 682.64 avg = 674.88 mobilenet_ssd_int8 min = 347.18 max = 349.07 avg = 347.81 mobilenet_yolo min = 1471.16 max = 1492.65 avg = 1479.30 mobilenetv2_yolov3 min = 895.90 max = 906.60 avg = 899.74 yolov4-tiny min = 1178.53 max = 1205.79 avg = 1183.98 nanodet_m min = 358.89 max = 366.07 avg = 362.20 yolo-fastest-1.1 min = 189.93 max = 192.18 avg = 190.91 yolo-fastestv2 min = 158.60 max = 161.33 avg = 159.43 loop_count = 8 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 602.97 max = 604.97 avg = 603.46 squeezenet_int8 min = 431.18 max = 432.42 avg = 431.77 mobilenet min = 971.52 max = 986.64 avg = 974.04 mobilenet_int8 min = 556.74 max = 556.98 avg = 556.84 mobilenet_v2 min = 682.85 max = 684.17 avg = 683.34 mobilenet_v3 min = 585.10 max = 585.76 avg = 585.57 shufflenet min = 340.64 max = 342.63 avg = 341.26 shufflenet_v2 min = 322.41 max = 324.13 avg = 323.35 mnasnet min = 644.30 max = 645.93 avg = 644.71 proxylessnasnet min = 732.50 max = 733.30 avg = 732.96 efficientnet_b0 min = 1084.70 max = 1094.98 avg = 1086.52 efficientnetv2_b0 min = 1282.27 max = 1283.67 avg = 1282.60 regnety_400m min = 764.60 max = 768.54 avg = 765.30 blazeface min = 100.48 max = 106.28 avg = 103.33 googlenet min = 1878.69 max = 1883.96 avg = 1880.76 googlenet_int8 
min = 1274.31 max = 1296.02 avg = 1279.59 resnet18 min = 1837.91 max = 1843.95 avg = 1839.17 resnet18_int8 min = 1011.98 max = 1014.43 avg = 1013.01 alexnet min = 1997.59 max = 2001.81 avg = 1999.42 resnet50 min = 4844.31 max = 4857.05 avg = 4847.80 resnet50_int8 min = 2792.59 max = 2810.08 avg = 2797.30 squeezenet_ssd min = 1438.96 max = 1443.31 avg = 1441.09 squeezenet_ssd_int8 min = 1046.76 max = 1053.00 avg = 1049.22 mobilenet_ssd min = 2018.66 max = 2023.70 avg = 2019.67 mobilenet_ssd_int8 min = 1129.16 max = 1130.62 avg = 1129.82 mobilenet_yolo min = 4724.90 max = 4728.57 avg = 4726.41 mobilenetv2_yolov3 min = 2410.67 max = 2427.95 avg = 2413.89 yolov4-tiny min = 3177.27 max = 3185.52 avg = 3179.71 nanodet_m min = 761.38 max = 768.79 avg = 766.53 yolo-fastest-1.1 min = 391.82 max = 393.32 avg = 392.39 yolo-fastestv2 min = 316.93 max = 319.86 avg = 318.33 ``` ### Radxa Orion O6 (Big Cortex-A720 2.6GHz x 4 + Medium Cortex-A720 x 4 + Little Cortex-A520 x 4 + Arm Immortalis-G720 MC10 GPU 1.1GHz) ``` radxa@orion-o6:~/ncnn/build/benchmark$ ./benchncnn 4 1 2 -1 1 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 8.52 max = 8.53 avg = 8.53 squeezenet_int8 min = 6.49 max = 6.50 avg = 6.50 mobilenet min = 15.56 max = 15.61 avg = 15.58 mobilenet_int8 min = 8.68 max = 8.70 avg = 8.69 mobilenet_v2 min = 9.67 max = 9.68 avg = 9.67 mobilenet_v3 min = 8.05 max = 8.07 avg = 8.06 shufflenet min = 5.30 max = 5.32 avg = 5.31 shufflenet_v2 min = 5.55 max = 5.57 avg = 5.56 mnasnet min = 9.23 max = 9.26 avg = 9.25 proxylessnasnet min = 11.58 max = 11.58 avg = 11.58 efficientnet_b0 min = 18.67 max = 18.68 avg = 18.67 efficientnetv2_b0 min = 21.55 max = 21.59 avg = 21.57 regnety_400m min = 13.02 max = 13.07 avg = 13.05 blazeface min = 2.04 max = 2.06 avg = 2.05 googlenet min = 35.36 max = 35.49 avg = 35.40 googlenet_int8 min = 27.86 max = 27.97 avg = 27.91 resnet18 min = 21.68 max = 21.74 avg = 21.70 resnet18_int8 min = 19.07 max = 19.12 avg
= 19.09 alexnet min = 23.94 max = 24.06 avg = 24.02 vgg16 min = 123.48 max = 124.36 avg = 123.87 vgg16_int8 min = 139.53 max = 139.72 avg = 139.64 resnet50 min = 68.07 max = 68.09 avg = 68.08 resnet50_int8 min = 39.99 max = 40.07 avg = 40.03 squeezenet_ssd min = 20.35 max = 20.43 avg = 20.38 squeezenet_ssd_int8 min = 18.62 max = 18.69 avg = 18.67 mobilenet_ssd min = 31.40 max = 31.56 avg = 31.48 mobilenet_ssd_int8 min = 17.44 max = 17.54 avg = 17.49 mobilenet_yolo min = 70.84 max = 70.94 avg = 70.88 mobilenetv2_yolov3 min = 35.24 max = 35.30 avg = 35.28 yolov4-tiny min = 42.96 max = 43.02 avg = 42.99 nanodet_m min = 13.05 max = 13.11 avg = 13.08 yolo-fastest-1.1 min = 5.21 max = 5.22 avg = 5.22 yolo-fastestv2 min = 4.48 max = 4.50 avg = 4.49 vision_transformer min = 1001.70 max = 1002.06 avg = 1001.90 FastestDet min = 4.65 max = 4.67 avg = 4.66 radxa@orion-o6:~/ncnn/build/benchmark$ ./benchncnn 4 12 2 -1 1 loop_count = 4 num_threads = 12 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 38.01 max = 40.45 avg = 39.00 squeezenet_int8 min = 45.53 max = 45.73 avg = 45.60 mobilenet min = 33.35 max = 37.73 avg = 35.96 mobilenet_int8 min = 33.87 max = 34.05 avg = 33.93 mobilenet_v2 min = 57.97 max = 61.42 avg = 59.74 mobilenet_v3 min = 65.47 max = 65.76 avg = 65.65 shufflenet min = 110.95 max = 111.29 avg = 111.12 shufflenet_v2 min = 63.97 max = 64.20 avg = 64.08 mnasnet min = 56.06 max = 56.44 avg = 56.23 proxylessnasnet min = 63.84 max = 64.36 avg = 64.10 efficientnet_b0 min = 94.52 max = 94.79 avg = 94.65 efficientnetv2_b0 min = 154.39 max = 158.08 avg = 156.57 regnety_400m min = 454.18 max = 457.25 avg = 455.08 blazeface min = 44.79 max = 45.03 avg = 44.92 googlenet min = 91.22 max = 93.72 avg = 92.01 googlenet_int8 min = 115.45 max = 118.36 avg = 116.69 resnet18 min = 42.81 max = 50.61 avg = 45.62 resnet18_int8 min = 45.26 max = 47.70 avg = 46.52 alexnet min = 25.74 max = 28.83 avg = 26.66 vgg16 min = 61.15 max = 64.72 avg = 63.09 vgg16_int8 min = 67.75 
max = 73.18 avg = 69.38 resnet50 min = 90.29 max = 100.58 avg = 96.62 resnet50_int8 min = 92.35 max = 97.42 avg = 94.64 squeezenet_ssd min = 105.26 max = 111.83 avg = 107.89 squeezenet_ssd_int8 min = 117.49 max = 121.57 avg = 118.91 mobilenet_ssd min = 89.79 max = 95.18 avg = 92.15 mobilenet_ssd_int8 min = 97.02 max = 103.84 avg = 99.86 mobilenet_yolo min = 603.04 max = 606.87 avg = 605.03 mobilenetv2_yolov3 min = 75.32 max = 80.43 avg = 76.83 yolov4-tiny min = 51.46 max = 60.43 avg = 56.32 nanodet_m min = 104.05 max = 109.94 avg = 107.06 yolo-fastest-1.1 min = 90.31 max = 90.50 avg = 90.41 yolo-fastestv2 min = 94.72 max = 96.62 avg = 95.52 vision_transformer min = 323.38 max = 333.42 avg = 329.50 FastestDet min = 80.86 max = 83.37 avg = 81.84 radxa@orion-o6:~/ncnn/build/benchmark$ ./benchncnn 4 1 2 0 1 [0 Mali-G720-Immortalis] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-G720-Immortalis] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Mali-G720-Immortalis] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1 [0 Mali-G720-Immortalis] subgroup=16 basic/vote/ballot/shuffle=1/1/1/1 [0 Mali-G720-Immortalis] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 16.33 max = 16.59 avg = 16.45 squeezenet_int8 min = 6.36 max = 10.08 avg = 7.32 mobilenet min = 3.45 max = 27.79 avg = 14.90 mobilenet_int8 min = 8.71 max = 8.76 avg = 8.74 mobilenet_v2 min = 4.31 max = 4.43 avg = 4.40 mobilenet_v3 min = 19.81 max = 19.86 avg = 19.83 shufflenet min = 14.76 max = 14.83 avg = 14.79 shufflenet_v2 min = 15.24 max = 15.33 avg = 15.28 mnasnet min = 3.71 max = 10.64 avg = 5.55 proxylessnasnet min = 4.82 max = 4.95 avg = 4.90 efficientnet_b0 min = 6.58 max = 6.62 avg = 6.60 efficientnetv2_b0 min = 56.26 max = 57.46 avg = 56.82 regnety_400m min = 5.30 max = 30.08 avg = 17.72 blazeface min = 4.36 max = 4.52 avg = 4.46 googlenet min = 9.03 max = 9.07 avg = 9.05 googlenet_int8 min = 27.90 max = 27.94 avg = 27.92 resnet18 min 
= 6.47 max = 28.26 avg = 11.93 resnet18_int8 min = 19.79 max = 19.83 avg = 19.81 alexnet min = 7.76 max = 7.81 avg = 7.77 vgg16 min = 27.58 max = 27.90 avg = 27.77 vgg16_int8 min = 143.28 max = 144.19 avg = 143.68 resnet50 min = 14.06 max = 14.22 avg = 14.15 resnet50_int8 min = 41.37 max = 41.48 avg = 41.43 squeezenet_ssd min = 11.11 max = 60.31 avg = 47.93 squeezenet_ssd_int8 min = 19.29 max = 19.39 avg = 19.35 mobilenet_ssd min = 8.78 max = 8.88 avg = 8.82 mobilenet_ssd_int8 min = 17.60 max = 17.66 avg = 17.62 mobilenet_yolo min = 13.64 max = 13.91 avg = 13.76 mobilenetv2_yolov3 min = 11.97 max = 15.79 avg = 14.01 yolov4-tiny min = 26.72 max = 32.41 avg = 28.27 nanodet_m min = 9.84 max = 13.42 avg = 10.76 yolo-fastest-1.1 min = 15.38 max = 15.62 avg = 15.56 yolo-fastestv2 min = 13.56 max = 13.67 avg = 13.61 vision_transformer min = 831.86 max = 835.66 avg = 833.83 FastestDet min = 13.85 max = 13.92 avg = 13.88 ``` ### Radxa Zero 3W, Cortex-A55 (ARMv8.2) (1.416 GHz x 4) ``` loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 34.51 max = 106.19 avg = 79.43 squeezenet_int8 min = 31.48 max = 49.87 avg = 34.65 mobilenet min = 42.23 max = 45.36 avg = 42.89 mobilenet_int8 min = 35.97 max = 53.84 avg = 38.77 mobilenet_v2 min = 39.61 max = 40.35 avg = 40.00 mobilenet_v3 min = 31.19 max = 31.85 avg = 31.50 shufflenet min = 24.75 max = 27.74 avg = 25.55 shufflenet_v2 min = 22.00 max = 22.70 avg = 22.31 mnasnet min = 34.95 max = 53.55 avg = 37.39 proxylessnasnet min = 39.96 max = 44.32 avg = 40.81 efficientnet_b0 min = 49.76 max = 67.77 avg = 52.61 efficientnetv2_b0 min = 64.00 max = 85.78 avg = 67.06 regnety_400m min = 55.23 max = 73.22 avg = 57.87 blazeface min = 7.80 max = 10.39 avg = 8.27 googlenet min = 98.24 max = 118.27 avg = 101.78 googlenet_int8 min = 98.81 max = 115.66 avg = 101.52 resnet18 min = 75.33 max = 88.59 avg = 78.19 resnet18_int8 min = 76.31 max = 95.17 avg = 79.03 alexnet min = 65.07 max = 73.80 avg = 67.18
vgg16 min = 423.20 max = 455.15 avg = 436.32 vgg16_int8 min = 591.82 max = 620.22 avg = 607.55 resnet50 min = 185.53 max = 207.10 avg = 193.03 resnet50_int8 min = 176.84 max = 194.73 avg = 181.81 squeezenet_ssd min = 96.64 max = 118.46 avg = 100.86 squeezenet_ssd_int8 min = 96.61 max = 123.48 avg = 104.64 mobilenet_ssd min = 95.38 max = 110.52 avg = 98.61 mobilenet_ssd_int8 min = 76.21 max = 95.41 avg = 79.10 mobilenet_yolo min = 210.73 max = 235.47 avg = 221.72 mobilenetv2_yolov3 min = 134.59 max = 154.33 avg = 139.54 yolov4-tiny min = 167.79 max = 191.60 avg = 171.25 nanodet_m min = 63.22 max = 80.73 avg = 66.25 yolo-fastest-1.1 min = 32.87 max = 88.05 avg = 47.36 yolo-fastestv2 min = 26.03 max = 27.01 avg = 26.54 vision_transformer min = 3682.51 max = 3882.79 avg = 3809.42 FastestDet min = 30.69 max = 50.65 avg = 33.65 ``` ### Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4) ``` ./benchncnn 4 4 2 -1 1 loop_count = 4 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 14.15 max = 14.21 avg = 14.17 squeezenet_int8 min = 21.05 max = 21.12 avg = 21.09 mobilenet min = 19.22 max = 19.30 avg = 19.25 mobilenet_int8 min = 18.65 max = 19.52 avg = 19.07 mobilenet_v2 min = 20.23 max = 21.01 avg = 20.63 mobilenet_v3 min = 15.34 max = 15.48 avg = 15.41 shufflenet min = 10.30 max = 10.37 avg = 10.33 shufflenet_v2 min = 9.18 max = 9.34 avg = 9.23 mnasnet min = 15.58 max = 15.62 avg = 15.60 proxylessnasnet min = 19.64 max = 19.73 avg = 19.67 efficientnet_b0 min = 25.62 max = 25.81 avg = 25.69 efficientnetv2_b0 min = 36.95 max = 37.46 avg = 37.17 regnety_400m min = 23.75 max = 24.13 avg = 23.90 blazeface min = 3.37 max = 3.42 avg = 3.40 googlenet min = 57.36 max = 58.32 avg = 57.88 googlenet_int8 min = 60.80 max = 62.30 avg = 61.50 resnet18 min = 39.99 max = 40.34 avg = 40.17 resnet18_int8 min = 54.18 max = 56.08 avg = 55.16 alexnet min = 41.87 max = 42.21 avg = 42.08 vgg16 min = 260.14 max = 260.94 avg = 260.51 
vgg16_int8 min = 347.42 max = 348.90 avg = 348.30 resnet50 min = 90.91 max = 91.26 avg = 91.07 resnet50_int8 min = 121.94 max = 122.56 avg = 122.28 squeezenet_ssd min = 57.11 max = 57.57 avg = 57.37 squeezenet_ssd_int8 min = 74.70 max = 75.18 avg = 74.91 mobilenet_ssd min = 49.60 max = 49.96 avg = 49.71 mobilenet_ssd_int8 min = 49.45 max = 49.93 avg = 49.63 mobilenet_yolo min = 114.98 max = 115.37 avg = 115.18 mobilenetv2_yolov3 min = 75.74 max = 75.97 avg = 75.87 yolov4-tiny min = 99.09 max = 99.43 avg = 99.25 nanodet_m min = 29.40 max = 29.77 avg = 29.60 yolo-fastest-1.1 min = 13.78 max = 13.85 avg = 13.82 yolo-fastestv2 min = 12.91 max = 13.10 avg = 12.98 vision_transformer min = 1641.78 max = 1648.71 avg = 1646.65 FastestDet min = 12.24 max = 12.61 avg = 12.42 ``` ### Khadas VIM3, Amlogic A311D (Cortex-A73 2.2GHz x 4 + Cortex-A53 1.8GHz x 2) ``` vim3:/data/local/tmp # ./benchncnn 8 4 2 -1 1 loop_count = 8 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 30.98 max = 31.26 avg = 31.09 squeezenet_int8 min = 24.70 max = 24.84 avg = 24.78 mobilenet min = 42.57 max = 43.37 avg = 42.96 mobilenet_int8 min = 22.33 max = 22.52 avg = 22.44 mobilenet_v2 min = 39.36 max = 39.77 avg = 39.56 mobilenet_v3 min = 30.13 max = 30.45 avg = 30.28 shufflenet min = 21.62 max = 21.94 avg = 21.80 shufflenet_v2 min = 18.83 max = 19.24 avg = 19.05 mnasnet min = 33.54 max = 34.08 avg = 33.80 proxylessnasnet min = 35.81 max = 36.05 avg = 35.95 efficientnet_b0 min = 53.82 max = 54.44 avg = 54.21 efficientnetv2_b0 min = 62.20 max = 62.60 avg = 62.43 regnety_400m min = 48.82 max = 49.27 avg = 49.05 blazeface min = 6.34 max = 6.51 avg = 6.43 googlenet min = 81.96 max = 82.53 avg = 82.23 googlenet_int8 min = 64.42 max = 65.00 avg = 64.77 resnet18 min = 77.00 max = 77.83 avg = 77.46 resnet18_int8 min = 48.91 max = 49.14 avg = 49.05 alexnet min = 60.43 max = 60.93 avg = 60.69 vgg16 min = 414.89 max = 423.00 avg = 418.75 vgg16_int8 min = 245.58 max = 246.37 avg = 
245.94 resnet50 min = 185.53 max = 187.35 avg = 186.18 resnet50_int8 min = 123.36 max = 124.75 avg = 124.17 squeezenet_ssd min = 85.87 max = 86.42 avg = 86.23 squeezenet_ssd_int8 min = 64.90 max = 65.24 avg = 65.08 mobilenet_ssd min = 88.32 max = 90.02 avg = 89.10 mobilenet_ssd_int8 min = 46.85 max = 47.18 avg = 46.98 mobilenet_yolo min = 192.33 max = 195.38 avg = 194.10 mobilenetv2_yolov3 min = 127.33 max = 128.58 avg = 127.96 yolov4-tiny min = 150.44 max = 152.02 avg = 151.20 nanodet_m min = 54.22 max = 54.61 avg = 54.37 yolo-fastest-1.1 min = 28.13 max = 28.76 avg = 28.40 yolo-fastestv2 min = 22.10 max = 22.26 avg = 22.19 vim3:/data/local/tmp # ./benchncnn 4 1 2 -1 1 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 68.25 max = 68.85 avg = 68.67 squeezenet_int8 min = 51.92 max = 52.08 avg = 52.01 mobilenet min = 112.69 max = 113.72 avg = 113.33 mobilenet_int8 min = 66.43 max = 66.89 avg = 66.68 mobilenet_v2 min = 81.36 max = 81.77 avg = 81.62 mobilenet_v3 min = 62.33 max = 63.39 avg = 62.94 shufflenet min = 37.84 max = 38.03 avg = 37.93 shufflenet_v2 min = 37.33 max = 38.08 avg = 37.68 mnasnet min = 73.83 max = 74.32 avg = 74.03 proxylessnasnet min = 85.19 max = 86.43 avg = 85.84 efficientnet_b0 min = 138.68 max = 139.69 avg = 139.19 efficientnetv2_b0 min = 167.53 max = 167.99 avg = 167.75 regnety_400m min = 94.78 max = 95.81 avg = 95.21 blazeface min = 11.22 max = 11.43 avg = 11.28 googlenet min = 229.35 max = 230.91 avg = 229.89 googlenet_int8 min = 173.04 max = 173.48 avg = 173.24 resnet18 min = 191.54 max = 193.78 avg = 192.49 resnet18_int8 min = 132.97 max = 133.51 avg = 133.25 alexnet min = 140.31 max = 141.95 avg = 141.18 vgg16 min = 1093.71 max = 1100.95 avg = 1097.64 vgg16_int8 min = 734.44 max = 736.16 avg = 735.05 resnet50 min = 530.38 max = 533.93 avg = 531.87 resnet50_int8 min = 332.88 max = 334.22 avg = 333.71 squeezenet_ssd min = 159.08 max = 160.98 avg = 160.16 squeezenet_ssd_int8 min = 126.97 max = 
127.96 avg = 127.43 mobilenet_ssd min = 238.92 max = 241.14 avg = 239.70 mobilenet_ssd_int8 min = 135.57 max = 136.02 avg = 135.78 mobilenet_yolo min = 539.59 max = 543.88 avg = 541.90 mobilenetv2_yolov3 min = 281.32 max = 285.05 avg = 283.24 yolov4-tiny min = 381.99 max = 384.93 avg = 383.53 nanodet_m min = 98.32 max = 98.85 avg = 98.60 yolo-fastest-1.1 min = 44.59 max = 44.95 avg = 44.80 yolo-fastestv2 min = 36.88 max = 37.11 avg = 36.98 vim3:/data/local/tmp $ ./benchncnn 8 6 2 0 1 [0 Mali-G52] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-G52] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=1 [0 Mali-G52] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/0/0/0 [0 Mali-G52] subgroup=8 basic/vote/ballot/shuffle=1/0/0/0 [0 Mali-G52] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 loop_count = 8 num_threads = 6 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 21.29 max = 21.81 avg = 21.56 squeezenet_int8 min = 37.59 max = 37.85 avg = 37.70 mobilenet min = 32.08 max = 32.61 avg = 32.42 mobilenet_int8 min = 40.12 max = 40.46 avg = 40.28 mobilenet_v2 min = 24.55 max = 24.67 avg = 24.62 mobilenet_v3 min = 25.35 max = 25.60 avg = 25.47 shufflenet min = 18.78 max = 89.48 avg = 35.41 shufflenet_v2 min = 21.15 max = 21.33 avg = 21.22 mnasnet min = 25.08 max = 25.31 avg = 25.21 proxylessnasnet min = 26.97 max = 27.18 avg = 27.05 efficientnet_b0 min = 40.70 max = 40.91 avg = 40.81 efficientnetv2_b0 min = 189.26 max = 192.84 avg = 191.33 regnety_400m min = 30.88 max = 31.17 avg = 31.03 blazeface min = 24.34 max = 24.52 avg = 24.45 googlenet min = 67.14 max = 67.43 avg = 67.30 googlenet_int8 min = 98.06 max = 98.57 avg = 98.35 resnet18 min = 61.13 max = 61.63 avg = 61.44 resnet18_int8 min = 72.63 max = 73.48 avg = 73.01 alexnet min = 68.88 max = 70.34 avg = 69.71 vgg16 min = 347.48 max = 348.48 avg = 347.94 vgg16_int8 min = 342.50 max = 357.78 avg = 353.13 resnet50 min = 158.90 max = 160.10 avg = 159.76 resnet50_int8 min = 211.35 max = 212.68 avg = 212.11 squeezenet_ssd min = 81.61 max = 82.17 
avg = 81.91 squeezenet_ssd_int8 min = 85.52 max = 85.98 avg = 85.79 mobilenet_ssd min = 73.38 max = 74.41 avg = 74.02 mobilenet_ssd_int8 min = 85.13 max = 91.47 avg = 86.13 mobilenet_yolo min = 154.47 max = 155.23 avg = 154.74 mobilenetv2_yolov3 min = 100.75 max = 101.96 avg = 101.27 yolov4-tiny min = 140.52 max = 161.68 avg = 153.85 nanodet_m min = 85.27 max = 110.53 avg = 94.81 yolo-fastest-1.1 min = 23.56 max = 42.04 avg = 33.10 yolo-fastestv2 min = 19.54 max = 21.66 avg = 21.01 vision_transformer min = 6395.34 max = 6418.70 avg = 6410.43 FastestDet min = 21.53 max = 23.21 avg = 22.98 ``` ### Rockchip RK3588 (Cortex-A76 2.4GHz x 4 + Cortex-A55 1.8GHz x 4) ``` rk3588_s:/data/local/tmp # ./benchncnn 8 4 2 -1 1 loop_count = 8 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 7.57 max = 7.68 avg = 7.60 squeezenet_int8 min = 8.43 max = 8.52 avg = 8.46 mobilenet min = 11.01 max = 11.08 avg = 11.05 mobilenet_int8 min = 8.89 max = 8.96 avg = 8.91 mobilenet_v2 min = 8.73 max = 8.78 avg = 8.76 mobilenet_v3 min = 7.90 max = 7.95 avg = 7.92 shufflenet min = 7.95 max = 8.02 avg = 7.99 shufflenet_v2 min = 6.09 max = 6.13 avg = 6.11 mnasnet min = 8.30 max = 8.35 avg = 8.33 proxylessnasnet min = 9.67 max = 9.72 avg = 9.69 efficientnet_b0 min = 17.51 max = 17.60 avg = 17.56 efficientnetv2_b0 min = 28.10 max = 28.17 avg = 28.14 regnety_400m min = 16.33 max = 16.39 avg = 16.35 blazeface min = 2.81 max = 2.89 avg = 2.83 googlenet min = 33.33 max = 33.41 avg = 33.37 googlenet_int8 min = 33.62 max = 33.87 avg = 33.77 resnet18 min = 18.83 max = 18.90 avg = 18.86 resnet18_int8 min = 33.92 max = 34.10 avg = 34.00 alexnet min = 29.07 max = 29.11 avg = 29.09 vgg16 min = 106.86 max = 107.40 avg = 107.06 vgg16_int8 min = 283.66 max = 284.16 avg = 283.94 resnet50 min = 53.70 max = 54.21 avg = 53.83 resnet50_int8 min = 66.11 max = 66.24 avg = 66.15 squeezenet_ssd min = 34.88 max = 35.04 avg = 34.99 squeezenet_ssd_int8 min = 43.25 max = 43.62 avg = 43.37 
mobilenet_ssd min = 31.32 max = 31.42 avg = 31.37 mobilenet_ssd_int8 min = 26.11 max = 26.18 avg = 26.13 mobilenet_yolo min = 58.89 max = 59.02 avg = 58.95 mobilenetv2_yolov3 min = 37.53 max = 37.64 avg = 37.58 yolov4-tiny min = 52.95 max = 53.31 avg = 53.03 nanodet_m min = 16.06 max = 16.14 avg = 16.10 yolo-fastest-1.1 min = 8.42 max = 8.47 avg = 8.45 yolo-fastestv2 min = 7.81 max = 7.88 avg = 7.84 rk3588_s:/data/local/tmp # ./benchncnn 8 1 2 -1 1 loop_count = 8 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 25.04 max = 25.14 avg = 25.07 squeezenet_int8 min = 26.29 max = 26.38 avg = 26.33 mobilenet min = 41.17 max = 41.23 avg = 41.19 mobilenet_int8 min = 32.51 max = 32.57 avg = 32.54 mobilenet_v2 min = 27.27 max = 27.31 avg = 27.29 mobilenet_v3 min = 22.49 max = 22.54 avg = 22.51 shufflenet min = 18.15 max = 18.22 avg = 18.18 shufflenet_v2 min = 15.82 max = 15.86 avg = 15.85 mnasnet min = 26.45 max = 26.50 avg = 26.47 proxylessnasnet min = 31.60 max = 31.66 avg = 31.62 efficientnet_b0 min = 55.53 max = 55.68 avg = 55.62 efficientnetv2_b0 min = 96.84 max = 96.92 avg = 96.89 regnety_400m min = 33.66 max = 33.70 avg = 33.68 blazeface min = 8.80 max = 8.84 avg = 8.83 googlenet min = 116.89 max = 117.06 avg = 116.97 googlenet_int8 min = 107.92 max = 108.03 avg = 107.98 resnet18 min = 60.97 max = 61.18 avg = 61.05 resnet18_int8 min = 118.95 max = 119.04 avg = 119.00 alexnet min = 93.49 max = 93.59 avg = 93.55 vgg16 min = 333.81 max = 334.52 avg = 334.07 vgg16_int8 min = 947.19 max = 947.55 avg = 947.35 resnet50 min = 186.95 max = 187.42 avg = 187.15 resnet50_int8 min = 225.72 max = 225.86 avg = 225.75 squeezenet_ssd min = 93.29 max = 93.66 avg = 93.47 squeezenet_ssd_int8 min = 120.22 max = 120.95 avg = 120.49 mobilenet_ssd min = 105.84 max = 105.90 avg = 105.87 mobilenet_ssd_int8 min = 85.95 max = 86.04 avg = 86.01 mobilenet_yolo min = 194.22 max = 194.64 avg = 194.41 mobilenetv2_yolov3 min = 103.63 max = 103.72 avg = 103.69 yolov4-tiny 
min = 136.59 max = 137.14 avg = 136.91 nanodet_m min = 41.40 max = 41.49 avg = 41.43 yolo-fastest-1.1 min = 18.73 max = 18.80 avg = 18.77 yolo-fastestv2 min = 18.25 max = 18.31 avg = 18.28 rk3588_s:/data/local/tmp # ./benchncnn 8 4 1 -1 1 loop_count = 8 num_threads = 4 powersave = 1 gpu_device = -1 cooling_down = 1 squeezenet min = 25.54 max = 25.99 avg = 25.71 squeezenet_int8 min = 30.88 max = 31.16 avg = 31.01 mobilenet min = 36.24 max = 62.95 avg = 39.89 mobilenet_int8 min = 31.90 max = 32.37 avg = 32.06 mobilenet_v2 min = 27.49 max = 27.82 avg = 27.64 mobilenet_v3 min = 26.30 max = 26.69 avg = 26.45 shufflenet min = 25.49 max = 25.72 avg = 25.60 shufflenet_v2 min = 21.59 max = 22.67 avg = 21.78 mnasnet min = 27.92 max = 28.10 avg = 28.00 proxylessnasnet min = 34.18 max = 34.42 avg = 34.28 efficientnet_b0 min = 57.37 max = 57.60 avg = 57.45 efficientnetv2_b0 min = 83.50 max = 84.03 avg = 83.66 regnety_400m min = 50.83 max = 51.27 avg = 50.98 blazeface min = 14.07 max = 14.29 avg = 14.17 googlenet min = 100.60 max = 101.00 avg = 100.87 googlenet_int8 min = 106.58 max = 107.14 avg = 106.71 resnet18 min = 58.60 max = 59.62 avg = 59.00 resnet18_int8 min = 84.90 max = 85.15 avg = 84.99 alexnet min = 86.06 max = 86.58 avg = 86.22 vgg16 min = 308.42 max = 309.18 avg = 308.81 vgg16_int8 min = 543.61 max = 545.09 avg = 544.40 resnet50 min = 163.45 max = 164.44 avg = 163.92 resnet50_int8 min = 179.51 max = 180.16 avg = 179.83 squeezenet_ssd min = 96.32 max = 97.24 avg = 96.71 squeezenet_ssd_int8 min = 116.48 max = 117.65 avg = 116.85 mobilenet_ssd min = 92.12 max = 93.09 avg = 92.55 mobilenet_ssd_int8 min = 81.78 max = 82.42 avg = 81.95 mobilenet_yolo min = 174.95 max = 175.40 avg = 175.15 mobilenetv2_yolov3 min = 110.63 max = 111.05 avg = 110.81 yolov4-tiny min = 163.37 max = 164.24 avg = 163.63 nanodet_m min = 52.96 max = 53.59 avg = 53.12 yolo-fastest-1.1 min = 28.98 max = 29.33 avg = 29.20 yolo-fastestv2 min = 23.52 max = 24.16 avg = 23.76 rk3588_s:/data/local/tmp # 
./benchncnn 8 1 1 -1 1 loop_count = 8 num_threads = 1 powersave = 1 gpu_device = -1 cooling_down = 1 squeezenet min = 83.46 max = 83.63 avg = 83.53 squeezenet_int8 min = 101.39 max = 102.29 avg = 101.77 mobilenet min = 131.78 max = 132.25 avg = 131.87 mobilenet_int8 min = 111.66 max = 112.60 avg = 111.94 mobilenet_v2 min = 92.92 max = 227.19 avg = 132.44 mobilenet_v3 min = 78.38 max = 78.64 avg = 78.49 shufflenet min = 62.98 max = 63.17 avg = 63.09 shufflenet_v2 min = 56.85 max = 57.23 avg = 57.00 mnasnet min = 87.53 max = 87.71 avg = 87.60 proxylessnasnet min = 113.25 max = 114.10 avg = 113.58 efficientnet_b0 min = 180.95 max = 181.16 avg = 181.07 efficientnetv2_b0 min = 285.34 max = 285.62 avg = 285.51 regnety_400m min = 109.24 max = 109.36 avg = 109.31 blazeface min = 41.12 max = 41.53 avg = 41.23 googlenet min = 358.94 max = 359.55 avg = 359.24 googlenet_int8 min = 371.32 max = 371.84 avg = 371.51 resnet18 min = 209.97 max = 210.42 avg = 210.22 resnet18_int8 min = 302.93 max = 303.51 avg = 303.26 alexnet min = 318.95 max = 321.70 avg = 319.40 vgg16 min = 1126.11 max = 1127.83 avg = 1126.98 vgg16_int8 min = 2026.90 max = 2034.04 avg = 2029.35 resnet50 min = 602.90 max = 603.70 avg = 603.30 resnet50_int8 min = 647.33 max = 649.41 avg = 648.65 squeezenet_ssd min = 280.60 max = 281.50 avg = 281.02 squeezenet_ssd_int8 min = 359.41 max = 362.07 avg = 360.66 mobilenet_ssd min = 319.11 max = 319.29 avg = 319.21 mobilenet_ssd_int8 min = 272.16 max = 273.36 avg = 272.83 mobilenet_yolo min = 607.07 max = 607.38 avg = 607.21 mobilenetv2_yolov3 min = 326.66 max = 326.95 avg = 326.80 yolov4-tiny min = 449.56 max = 450.45 avg = 450.04 nanodet_m min = 142.09 max = 142.54 avg = 142.32 yolo-fastest-1.1 min = 63.74 max = 63.80 avg = 63.78 yolo-fastestv2 min = 57.56 max = 58.17 avg = 57.97 rk3588_s:/data/local/tmp # ./benchncnn 8 1 2 0 0 [0 Mali-G610] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-G610] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Mali-G610] fp16-p/s/a=1/1/1 
int8-p/s/a=1/1/1 [0 Mali-G610] subgroup=16 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 8 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 0 squeezenet min = 7.09 max = 7.20 avg = 7.13 mobilenet min = 9.16 max = 9.32 avg = 9.22 mobilenet_v2 min = 10.18 max = 10.32 avg = 10.25 mobilenet_v3 min = 8.01 max = 8.09 avg = 8.04 shufflenet min = 5.88 max = 5.93 avg = 5.89 shufflenet_v2 min = 6.30 max = 6.33 avg = 6.31 mnasnet min = 7.91 max = 8.00 avg = 7.95 proxylessnasnet min = 11.20 max = 11.42 avg = 11.30 regnety_400m min = 11.65 max = 11.84 avg = 11.74 blazeface min = 2.50 max = 2.59 avg = 2.53 googlenet min = 17.69 max = 17.78 avg = 17.74 resnet18 min = 16.04 max = 16.39 avg = 16.25 alexnet min = 15.47 max = 15.66 avg = 15.56 vgg16 min = 64.74 max = 65.42 avg = 65.04 resnet50 min = 37.83 max = 38.31 avg = 38.12 squeezenet_ssd min = 23.14 max = 23.44 avg = 23.26 mobilenet_ssd min = 22.48 max = 23.01 avg = 22.74 mobilenet_yolo min = 40.08 max = 40.72 avg = 40.32 mobilenetv2_yolov3 min = 31.88 max = 32.57 avg = 32.12 yolov4-tiny min = 49.64 max = 50.73 avg = 50.13 nanodet_m min = 10.60 max = 10.70 avg = 10.64 yolo-fastest-1.1 min = 7.63 max = 7.66 avg = 7.64 yolo-fastestv2 min = 6.99 max = 7.02 avg = 7.00 ``` ### Station-M3/ROC-RK3588S-PC, Rockchip RK3588S (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz + Mali-G610) StationOS (Android) ``` roc_rk3588s_pc:/data/local/tmp # ./benchncnn 10 1 0 0 0 ./benchncnn 10 1 0 0 0 [0 Mali-G610] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-G610] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Mali-G610] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Mali-G610] subgroup=16 basic/vote/ballot/shuffle=1/1/1/1 [0 Mali-G610] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 7.83 max = 14.17 avg = 9.76 squeezenet_int8 min = 13.41 max = 13.52 avg = 13.45 mobilenet min = 8.73 max = 9.68 avg = 9.07 mobilenet_int8 min = 17.70 max = 17.89 avg = 17.80 mobilenet_v2 min 
= 10.73 max = 21.20 avg = 18.93 mobilenet_v3 min = 9.00 max = 13.36 avg = 10.64 shufflenet min = 7.79 max = 7.93 avg = 7.85 shufflenet_v2 min = 8.01 max = 8.06 avg = 8.03 mnasnet min = 7.43 max = 8.71 avg = 8.28 proxylessnasnet min = 10.56 max = 12.07 avg = 11.70 efficientnet_b0 min = 2.15 max = 2.19 avg = 2.17 efficientnetv2_b0 min = 0.56 max = 0.62 avg = 0.57 regnety_400m min = 1.65 max = 1.69 avg = 1.67 blazeface min = 0.76 max = 0.79 avg = 0.78 googlenet min = 1.53 max = 1.60 avg = 1.56 googlenet_int8 min = 60.85 max = 61.01 avg = 60.93 resnet18 min = 0.63 max = 0.82 avg = 0.65 resnet18_int8 min = 64.60 max = 65.13 avg = 64.78 alexnet min = 0.35 max = 0.40 avg = 0.37 vgg16 min = 0.54 max = 0.60 avg = 0.56 vgg16_int8 min = 445.21 max = 562.09 avg = 537.10 resnet50 min = 0.95 max = 0.97 avg = 0.96 resnet50_int8 min = 113.02 max = 113.38 avg = 113.17 squeezenet_ssd min = 1.94 max = 2.00 avg = 1.96 squeezenet_ssd_int8 min = 52.09 max = 56.93 avg = 56.35 mobilenet_ssd min = 1.19 max = 1.26 avg = 1.21 mobilenet_ssd_int8 min = 44.33 max = 44.87 avg = 44.66 mobilenet_yolo min = 1.05 max = 1.24 avg = 1.13 mobilenetv2_yolov3 min = 1.18 max = 1.25 avg = 1.21 yolov4-tiny min = 0.78 max = 0.80 avg = 0.78 nanodet_m min = 3.43 max = 3.80 avg = 3.57 yolo-fastest-1.1 min = 1.43 max = 1.50 avg = 1.47 yolo-fastestv2 min = 2.03 max = 2.10 avg = 2.05 vision_transformer min = 0.32 max = 0.36 avg = 0.35 FastestDet min = 1.90 max = 1.95 avg = 1.93 roc_rk3588s_pc:/data/local/tmp # ./benchncnn 10 1 0 -1 0 ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 13.36 max = 13.50 avg = 13.40 squeezenet_int8 min = 16.22 max = 16.34 avg = 16.30 mobilenet min = 22.41 max = 22.49 avg = 22.44 mobilenet_int8 min = 17.76 max = 17.94 avg = 17.84 mobilenet_v2 min = 17.60 max = 17.80 avg = 17.70 mobilenet_v3 min = 13.55 max = 13.70 avg = 13.61 shufflenet min = 7.91 max = 7.95 avg = 7.93 shufflenet_v2 min = 8.36 max = 8.40 avg = 8.38 
mnasnet min = 14.50 max = 14.60 avg = 14.56 proxylessnasnet min = 16.99 max = 17.12 avg = 17.06 efficientnet_b0 min = 26.55 max = 26.78 avg = 26.62 efficientnetv2_b0 min = 46.96 max = 47.44 avg = 47.30 regnety_400m min = 18.53 max = 18.63 avg = 18.58 blazeface min = 2.98 max = 3.02 avg = 3.00 googlenet min = 62.69 max = 63.14 avg = 62.90 googlenet_int8 min = 60.86 max = 61.54 avg = 61.05 resnet18 min = 30.34 max = 31.39 avg = 31.22 resnet18_int8 min = 57.42 max = 57.67 avg = 57.56 alexnet min = 40.81 max = 40.87 avg = 40.84 vgg16 min = 192.71 max = 195.20 avg = 194.26 vgg16_int8 min = 450.95 max = 534.38 avg = 482.27 resnet50 min = 105.11 max = 105.64 avg = 105.30 resnet50_int8 min = 105.94 max = 132.01 avg = 116.48 squeezenet_ssd min = 51.36 max = 51.59 avg = 51.51 squeezenet_ssd_int8 min = 69.01 max = 69.83 avg = 69.37 mobilenet_ssd min = 53.19 max = 55.24 avg = 53.50 mobilenet_ssd_int8 min = 44.49 max = 44.98 avg = 44.74 mobilenet_yolo min = 112.65 max = 113.28 avg = 112.94 mobilenetv2_yolov3 min = 63.38 max = 63.83 avg = 63.55 yolov4-tiny min = 77.57 max = 78.20 avg = 77.90 nanodet_m min = 25.21 max = 25.81 avg = 25.58 yolo-fastest-1.1 min = 8.76 max = 8.84 avg = 8.80 yolo-fastestv2 min = 8.46 max = 8.53 avg = 8.50 vision_transformer min = 1499.53 max = 1501.32 avg = 1500.50 FastestDet min = 7.04 max = 7.08 avg = 7.06 ``` ### Station P2, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) ``` ./benchncnn 4 4 0 -1 1 loop_count = 4 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 26.02 max = 27.15 avg = 26.74 squeezenet_int8 min = 44.69 max = 45.70 avg = 45.24 mobilenet min = 32.63 max = 33.49 avg = 33.10 mobilenet_int8 min = 44.23 max = 45.86 avg = 44.99 mobilenet_v2 min = 31.59 max = 32.02 avg = 31.86 mobilenet_v3 min = 25.71 max = 26.44 avg = 26.10 shufflenet min = 22.12 max = 23.17 avg = 22.52 shufflenet_v2 min = 17.84 max = 18.21 avg = 17.96 mnasnet min = 28.26 max = 28.70 avg = 28.45 proxylessnasnet min = 31.96 max = 32.25 avg = 32.13 
efficientnet_b0 min = 53.17 max = 54.48 avg = 53.60 efficientnetv2_b0 min = 70.08 max = 70.69 avg = 70.30 regnety_400m min = 40.80 max = 41.79 avg = 41.10 blazeface min = 10.79 max = 11.57 avg = 11.11 googlenet min = 83.66 max = 92.22 avg = 86.23 googlenet_int8 min = 116.44 max = 118.34 avg = 117.08 resnet18 min = 61.38 max = 62.52 avg = 61.94 resnet18_int8 min = 95.58 max = 96.93 avg = 96.28 alexnet min = 69.90 max = 70.59 avg = 70.19 vgg16 min = 334.24 max = 343.89 avg = 337.24 vgg16_int8 min = 464.88 max = 474.71 avg = 468.29 resnet50 min = 141.65 max = 146.23 avg = 143.78 resnet50_int8 min = 230.36 max = 254.75 avg = 241.24 squeezenet_ssd min = 98.38 max = 104.60 avg = 100.50 squeezenet_ssd_int8 min = 134.73 max = 137.88 avg = 136.12 mobilenet_ssd min = 77.48 max = 79.92 avg = 78.64 mobilenet_ssd_int8 min = 101.44 max = 102.61 avg = 102.06 mobilenet_yolo min = 149.12 max = 150.14 avg = 149.76 mobilenetv2_yolov3 min = 103.71 max = 107.81 avg = 105.69 yolov4-tiny min = 145.75 max = 149.35 avg = 147.09 nanodet_m min = 52.91 max = 54.06 avg = 53.53 ./benchncnn 4 2 0 -1 1 loop_count = 4 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 33.78 max = 34.38 avg = 34.16 squeezenet_int8 min = 61.66 max = 62.11 avg = 61.85 mobilenet min = 46.53 max = 46.74 avg = 46.62 mobilenet_int8 min = 71.06 max = 71.76 avg = 71.38 mobilenet_v2 min = 39.05 max = 39.38 avg = 39.19 mobilenet_v3 min = 32.20 max = 32.47 avg = 32.29 shufflenet min = 27.13 max = 27.40 avg = 27.27 shufflenet_v2 min = 23.38 max = 23.92 avg = 23.62 mnasnet min = 35.51 max = 35.73 avg = 35.62 proxylessnasnet min = 42.98 max = 43.16 avg = 43.06 efficientnet_b0 min = 75.34 max = 75.79 avg = 75.61 efficientnetv2_b0 min = 107.34 max = 107.83 avg = 107.60 regnety_400m min = 47.91 max = 48.20 avg = 48.02 blazeface min = 16.38 max = 16.63 avg = 16.49 googlenet min = 124.27 max = 125.24 avg = 124.65 googlenet_int8 min = 177.78 max = 178.39 avg = 178.06 resnet18 min = 82.02 max = 82.70 avg = 
82.38 resnet18_int8 min = 148.06 max = 149.03 avg = 148.39 alexnet min = 105.20 max = 105.91 avg = 105.54 vgg16 min = 459.65 max = 464.94 avg = 462.02 vgg16_int8 min = 737.54 max = 750.64 avg = 742.90 resnet50 min = 204.44 max = 205.20 avg = 204.84 resnet50_int8 min = 364.47 max = 366.04 avg = 365.53 squeezenet_ssd min = 124.42 max = 128.01 avg = 125.80 squeezenet_ssd_int8 min = 179.29 max = 183.83 avg = 181.43 mobilenet_ssd min = 113.85 max = 115.50 avg = 114.41 mobilenet_ssd_int8 min = 161.35 max = 162.38 avg = 161.71 mobilenet_yolo min = 214.95 max = 216.62 avg = 215.72 mobilenetv2_yolov3 min = 134.23 max = 136.26 avg = 135.07 yolov4-tiny min = 194.72 max = 195.49 avg = 195.18 nanodet_m min = 67.67 max = 68.09 avg = 67.90 ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 54.31 max = 55.65 avg = 55.00 squeezenet_int8 min = 103.96 max = 106.28 avg = 104.92 mobilenet min = 79.02 max = 79.46 avg = 79.25 mobilenet_int8 min = 130.06 max = 130.61 avg = 130.36 mobilenet_v2 min = 60.15 max = 60.66 avg = 60.31 mobilenet_v3 min = 49.40 max = 49.57 avg = 49.49 shufflenet min = 39.39 max = 39.78 avg = 39.60 shufflenet_v2 min = 35.48 max = 35.70 avg = 35.62 mnasnet min = 55.38 max = 56.10 avg = 55.71 proxylessnasnet min = 70.29 max = 70.48 avg = 70.35 efficientnet_b0 min = 128.56 max = 129.96 avg = 129.26 efficientnetv2_b0 min = 181.00 max = 181.56 avg = 181.24 regnety_400m min = 67.15 max = 69.62 avg = 67.95 blazeface min = 26.07 max = 26.58 avg = 26.33 googlenet min = 219.19 max = 221.32 avg = 220.01 googlenet_int8 min = 317.62 max = 319.40 avg = 318.37 resnet18 min = 135.33 max = 136.94 avg = 135.88 resnet18_int8 min = 264.69 max = 265.51 avg = 265.16 alexnet min = 190.54 max = 193.50 avg = 191.88 vgg16 min = 790.99 max = 809.24 avg = 795.85 vgg16_int8 min = 1354.48 max = 1358.89 avg = 1357.40 resnet50 min = 358.08 max = 362.96 avg = 360.29 resnet50_int8 min = 667.92 max = 670.40 avg = 668.78 
squeezenet_ssd min = 193.15 max = 194.02 avg = 193.49 squeezenet_ssd_int8 min = 291.42 max = 294.70 avg = 293.16 mobilenet_ssd min = 189.54 max = 190.28 avg = 189.97 mobilenet_ssd_int8 min = 289.94 max = 290.40 avg = 290.28 mobilenet_yolo min = 370.37 max = 384.69 avg = 375.11 mobilenetv2_yolov3 min = 210.93 max = 211.70 avg = 211.40 yolov4-tiny min = 309.11 max = 310.74 avg = 309.89 nanodet_m min = 100.42 max = 112.25 avg = 103.66 ``` ### Rock3A, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) Ubuntu 20.04 ``` rock@rock3a:~/ncnn/build/benchmark$ ./benchncnn 8 4 0 -1 1 loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 29.52 max = 30.30 avg = 29.76 squeezenet_int8 min = 35.40 max = 36.19 avg = 35.88 mobilenet min = 34.47 max = 35.44 avg = 34.84 mobilenet_int8 min = 34.19 max = 34.53 avg = 34.40 mobilenet_v2 min = 35.75 max = 36.09 avg = 35.88 mobilenet_v3 min = 28.12 max = 28.82 avg = 28.49 shufflenet min = 23.62 max = 24.08 avg = 23.84 shufflenet_v2 min = 19.37 max = 19.64 avg = 19.52 mnasnet min = 30.84 max = 31.45 avg = 31.02 proxylessnasnet min = 35.73 max = 36.07 avg = 35.90 efficientnet_b0 min = 48.16 max = 49.29 avg = 48.64 efficientnetv2_b0 min = 66.62 max = 67.11 avg = 66.85 regnety_400m min = 41.11 max = 41.64 avg = 41.34 blazeface min = 12.38 max = 12.64 avg = 12.56 googlenet min = 86.73 max = 87.79 avg = 87.11 googlenet_int8 min = 101.42 max = 103.87 avg = 102.55 resnet18 min = 64.85 max = 65.84 avg = 65.23 resnet18_int8 min = 93.55 max = 94.54 avg = 94.03 alexnet min = 70.89 max = 73.58 avg = 71.57 vgg16 min = 356.13 max = 358.52 avg = 357.15 vgg16_int8 min = 521.92 max = 524.13 avg = 523.11 resnet50 min = 147.65 max = 150.33 avg = 148.52 resnet50_int8 min = 191.94 max = 192.73 avg = 192.30 squeezenet_ssd min = 104.32 max = 105.75 avg = 105.00 squeezenet_ssd_int8 min = 125.97 max = 127.53 avg = 126.70 mobilenet_ssd min = 82.29 max = 82.65 avg = 82.47 mobilenet_ssd_int8 min = 79.26 max = 80.93 avg = 79.72 
mobilenet_yolo min = 165.51 max = 165.86 avg = 165.72 mobilenetv2_yolov3 min = 116.11 max = 116.83 avg = 116.43 yolov4-tiny min = 152.09 max = 153.39 avg = 152.60 nanodet_m min = 53.63 max = 54.14 avg = 53.92 rock@rock3a:~/ncnn/build/benchmark$ ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 62.47 max = 63.04 avg = 62.84 squeezenet_int8 min = 67.23 max = 68.48 avg = 67.93 mobilenet min = 85.27 max = 85.69 avg = 85.49 mobilenet_int8 min = 75.00 max = 75.48 avg = 75.26 mobilenet_v2 min = 68.41 max = 69.09 avg = 68.76 mobilenet_v3 min = 54.19 max = 54.52 avg = 54.34 shufflenet min = 45.90 max = 46.30 avg = 46.09 shufflenet_v2 min = 39.64 max = 40.07 avg = 39.91 mnasnet min = 62.16 max = 62.41 avg = 62.30 proxylessnasnet min = 80.79 max = 81.41 avg = 81.12 efficientnet_b0 min = 113.47 max = 113.68 avg = 113.57 efficientnetv2_b0 min = 167.30 max = 167.58 avg = 167.44 regnety_400m min = 72.12 max = 72.24 avg = 72.17 blazeface min = 31.89 max = 32.04 avg = 31.95 googlenet min = 224.27 max = 224.86 avg = 224.55 googlenet_int8 min = 240.02 max = 240.93 avg = 240.45 resnet18 min = 150.25 max = 150.69 avg = 150.47 resnet18_int8 min = 226.70 max = 228.19 avg = 227.56 alexnet min = 197.44 max = 199.16 avg = 198.17 vgg16 min = 859.80 max = 860.79 avg = 860.35 vgg16_int8 min = 1409.66 max = 1411.92 avg = 1411.07 resnet50 min = 381.04 max = 382.73 avg = 381.86 resnet50_int8 min = 441.78 max = 445.00 avg = 443.29 squeezenet_ssd min = 208.14 max = 208.67 avg = 208.41 squeezenet_ssd_int8 min = 248.82 max = 250.80 avg = 249.89 mobilenet_ssd min = 200.95 max = 201.21 avg = 201.06 mobilenet_ssd_int8 min = 173.81 max = 174.54 avg = 174.28 mobilenet_yolo min = 394.65 max = 395.00 avg = 394.78 mobilenetv2_yolov3 min = 231.80 max = 232.27 avg = 232.08 yolov4-tiny min = 321.31 max = 322.43 avg = 321.79 nanodet_m min = 103.81 max = 104.61 avg = 104.25 ``` ### Station-M2/ROC-RK3566-PC, Rockchip RK3566 (Cortex-A55 1.8GHz x 4 
+ Mali-G52) StationOS (Android) ``` rk3566_roc_pc:/data/local/tmp # ./benchncnn 10 1 0 0 0 ./benchncnn 10 1 0 0 0 [0 Mali-G52] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-G52] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=1 [0 Mali-G52] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Mali-G52] subgroup=8 basic/vote/ballot/shuffle=1/1/1/1 [0 Mali-G52] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 43.67 max = 44.15 avg = 43.82 squeezenet_int8 min = 62.72 max = 63.99 avg = 63.49 mobilenet min = 74.32 max = 74.82 avg = 74.58 mobilenet_int8 min = 64.42 max = 65.43 avg = 64.89 mobilenet_v2 min = 52.96 max = 53.23 avg = 53.09 mobilenet_v3 min = 51.55 max = 53.12 avg = 51.96 shufflenet min = 40.73 max = 41.28 avg = 40.98 shufflenet_v2 min = 41.56 max = 43.62 avg = 42.22 mnasnet min = 54.37 max = 54.63 avg = 54.52 proxylessnasnet min = 57.91 max = 59.38 avg = 58.36 efficientnet_b0 min = 38.40 max = 40.29 avg = 39.06 efficientnetv2_b0 min = 36.91 max = 38.45 avg = 37.72 regnety_400m min = 69.07 max = 69.98 avg = 69.40 blazeface min = 12.26 max = 13.08 avg = 12.57 googlenet min = 147.08 max = 147.80 avg = 147.48 googlenet_int8 min = 221.94 max = 225.99 avg = 223.12 resnet18 min = 137.90 max = 138.50 avg = 138.19 resnet18_int8 min = 187.84 max = 190.88 avg = 188.81 alexnet min = 167.56 max = 168.92 avg = 168.17 vgg16 min = 713.42 max = 715.20 avg = 714.51 vgg16_int8 min = 1279.97 max = 1302.95 avg = 1294.59 resnet50 min = 369.74 max = 375.95 avg = 372.60 resnet50_int8 min = 391.86 max = 397.49 avg = 395.17 squeezenet_ssd min = 155.18 max = 156.09 avg = 155.62 squeezenet_ssd_int8 min = 218.83 max = 222.64 avg = 221.11 mobilenet_ssd min = 161.62 max = 163.22 avg = 162.27 mobilenet_ssd_int8 min = 147.33 max = 149.16 avg = 148.23 mobilenet_yolo min = 344.09 max = 349.15 avg = 346.73 mobilenetv2_yolov3 min = 168.72 max = 169.64 avg = 169.22 yolov4-tiny min = 239.44 max = 241.11 avg = 240.00 nanodet_m min = 
88.06 max = 89.89 avg = 88.87 yolo-fastest-1.1 min = 36.05 max = 37.86 avg = 36.47 yolo-fastestv2 min = 34.80 max = 36.58 avg = 35.37 vision_transformer min = 356.42 max = 359.37 avg = 358.03 FastestDet min = 38.03 max = 38.52 avg = 38.24 rk3566_roc_pc:/data/local/tmp # ./benchncnn 10 1 0 -1 0 ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 47.01 max = 48.12 avg = 47.62 squeezenet_int8 min = 63.30 max = 64.10 avg = 63.74 mobilenet min = 70.24 max = 71.52 avg = 70.63 mobilenet_int8 min = 63.90 max = 65.25 avg = 64.41 mobilenet_v2 min = 55.75 max = 56.26 avg = 56.02 mobilenet_v3 min = 45.56 max = 46.47 avg = 46.17 shufflenet min = 34.16 max = 35.16 avg = 34.64 shufflenet_v2 min = 32.58 max = 33.86 avg = 33.25 mnasnet min = 52.43 max = 53.15 avg = 52.80 proxylessnasnet min = 65.55 max = 67.04 avg = 66.36 efficientnet_b0 min = 82.52 max = 82.97 avg = 82.64 efficientnetv2_b0 min = 148.90 max = 150.47 avg = 149.64 regnety_400m min = 63.33 max = 64.29 avg = 63.70 blazeface min = 11.55 max = 12.35 avg = 11.77 googlenet min = 205.85 max = 208.74 avg = 207.17 googlenet_int8 min = 222.72 max = 225.84 avg = 223.98 resnet18 min = 134.19 max = 136.81 avg = 135.39 resnet18_int8 min = 187.26 max = 189.45 avg = 188.36 alexnet min = 143.01 max = 144.97 avg = 143.42 vgg16 min = 829.44 max = 839.46 avg = 835.37 vgg16_int8 min = 1299.25 max = 1306.89 avg = 1301.71 resnet50 min = 326.54 max = 330.21 avg = 328.27 resnet50_int8 min = 391.67 max = 395.59 avg = 393.27 squeezenet_ssd min = 166.12 max = 168.33 avg = 167.08 squeezenet_ssd_int8 min = 221.82 max = 223.85 avg = 222.69 mobilenet_ssd min = 163.17 max = 166.55 avg = 164.11 mobilenet_ssd_int8 min = 146.16 max = 148.20 avg = 147.41 mobilenet_yolo min = 335.15 max = 338.32 avg = 336.66 mobilenetv2_yolov3 min = 193.18 max = 195.51 avg = 194.33 yolov4-tiny min = 288.82 max = 292.16 avg = 290.36 nanodet_m min = 98.31 max = 100.30 avg = 99.20 yolo-fastest-1.1 min = 
37.73 max = 38.97 avg = 38.40 yolo-fastestv2 min = 36.21 max = 37.90 avg = 37.13 vision_transformer min = 7385.59 max = 7410.59 avg = 7402.20 FastestDet min = 34.55 max = 35.42 avg = 35.06 ``` ### Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) ``` nanopc-t4:/data/local/tmp # ./benchncnn 8 2 2 -1 1 loop_count = 8 num_threads = 2 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 43.73 max = 44.30 avg = 43.97 squeezenet_int8 min = 37.92 max = 38.39 avg = 38.09 mobilenet min = 64.28 max = 66.66 avg = 65.14 mobilenet_int8 min = 43.17 max = 43.73 avg = 43.38 mobilenet_v2 min = 51.30 max = 52.18 avg = 51.75 mobilenet_v3 min = 41.51 max = 43.25 avg = 42.10 shufflenet min = 27.43 max = 28.27 avg = 27.75 shufflenet_v2 min = 24.96 max = 25.79 avg = 25.55 mnasnet min = 45.44 max = 46.95 avg = 46.16 proxylessnasnet min = 51.98 max = 53.52 avg = 52.48 efficientnet_b0 min = 83.79 max = 84.68 avg = 84.27 efficientnetv2_b0 min = 97.89 max = 99.27 avg = 98.55 regnety_400m min = 65.15 max = 65.89 avg = 65.41 blazeface min = 8.74 max = 8.89 avg = 8.80 googlenet min = 131.46 max = 140.16 avg = 133.24 googlenet_int8 min = 115.72 max = 118.34 avg = 116.60 resnet18 min = 111.77 max = 113.18 avg = 112.37 resnet18_int8 min = 84.27 max = 84.90 avg = 84.49 alexnet min = 105.74 max = 109.87 avg = 107.15 vgg16 min = 619.88 max = 634.59 avg = 629.15 vgg16_int8 min = 447.14 max = 451.09 avg = 448.53 resnet50 min = 291.51 max = 296.55 avg = 293.08 resnet50_int8 min = 224.09 max = 227.03 avg = 225.02 squeezenet_ssd min = 109.72 max = 112.09 avg = 110.78 squeezenet_ssd_int8 min = 93.41 max = 94.83 avg = 93.97 mobilenet_ssd min = 131.30 max = 132.82 avg = 131.94 mobilenet_ssd_int8 min = 87.52 max = 88.89 avg = 88.35 mobilenet_yolo min = 288.02 max = 289.84 avg = 288.61 mobilenetv2_yolov3 min = 168.45 max = 170.94 avg = 169.79 yolov4-tiny min = 217.45 max = 226.39 avg = 219.76 nanodet_m min = 65.74 max = 66.84 avg = 66.49 yolo-fastest-1.1 min = 32.91 max = 33.74 avg = 
33.37 yolo-fastestv2 min = 28.90 max = 37.31 avg = 30.27 nanopc-t4:/data/local/tmp # ./benchncnn 8 1 2 -1 1 loop_count = 8 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 71.35 max = 73.02 avg = 71.83 squeezenet_int8 min = 60.39 max = 60.96 avg = 60.69 mobilenet min = 111.12 max = 113.02 avg = 111.99 mobilenet_int8 min = 80.14 max = 81.59 avg = 81.00 mobilenet_v2 min = 78.18 max = 80.89 avg = 79.18 mobilenet_v3 min = 63.49 max = 64.26 avg = 63.90 shufflenet min = 38.90 max = 40.28 avg = 39.26 shufflenet_v2 min = 37.72 max = 38.45 avg = 38.02 mnasnet min = 72.34 max = 73.59 avg = 72.87 proxylessnasnet min = 87.33 max = 89.70 avg = 88.45 efficientnet_b0 min = 145.14 max = 146.77 avg = 145.93 efficientnetv2_b0 min = 169.33 max = 171.16 avg = 170.16 regnety_400m min = 99.08 max = 99.80 avg = 99.47 blazeface min = 12.28 max = 12.69 avg = 12.48 googlenet min = 228.18 max = 229.36 avg = 228.64 googlenet_int8 min = 201.62 max = 203.71 avg = 202.25 resnet18 min = 175.71 max = 180.53 avg = 176.85 resnet18_int8 min = 151.42 max = 152.45 avg = 151.83 alexnet min = 160.81 max = 186.24 avg = 165.30 vgg16 min = 1044.34 max = 1080.88 avg = 1062.34 vgg16_int8 min = 844.53 max = 851.71 avg = 848.65 resnet50 min = 503.25 max = 505.20 avg = 504.18 resnet50_int8 min = 397.71 max = 400.19 avg = 398.63 squeezenet_ssd min = 162.98 max = 165.97 avg = 164.34 squeezenet_ssd_int8 min = 145.93 max = 148.59 avg = 146.94 mobilenet_ssd min = 226.54 max = 229.80 avg = 227.80 mobilenet_ssd_int8 min = 159.97 max = 163.18 avg = 161.06 mobilenet_yolo min = 512.90 max = 517.47 avg = 515.06 mobilenetv2_yolov3 min = 274.88 max = 280.24 avg = 276.36 yolov4-tiny min = 351.97 max = 358.70 avg = 355.60 nanodet_m min = 95.32 max = 97.83 avg = 96.28 yolo-fastest-1.1 min = 43.47 max = 46.52 avg = 44.55 yolo-fastestv2 min = 37.22 max = 37.63 avg = 37.45 nanopc-t4:/data/local/tmp # ./benchncnn 8 4 1 -1 1 loop_count = 8 num_threads = 4 powersave = 1 gpu_device = -1 cooling_down = 1 
squeezenet min = 48.11 max = 48.51 avg = 48.24 squeezenet_int8 min = 43.19 max = 44.17 avg = 43.40 mobilenet min = 65.47 max = 66.40 avg = 65.68 mobilenet_int8 min = 49.15 max = 51.65 avg = 49.76 mobilenet_v2 min = 53.60 max = 54.19 avg = 53.87 mobilenet_v3 min = 52.83 max = 92.92 avg = 66.25 shufflenet min = 35.71 max = 36.03 avg = 35.83 shufflenet_v2 min = 31.88 max = 32.38 avg = 32.16 mnasnet min = 51.59 max = 54.01 avg = 52.30 proxylessnasnet min = 60.11 max = 60.40 avg = 60.24 efficientnet_b0 min = 98.22 max = 99.40 avg = 98.56 efficientnetv2_b0 min = 114.19 max = 123.90 avg = 115.89 regnety_400m min = 85.89 max = 86.20 avg = 86.03 blazeface min = 11.23 max = 11.37 avg = 11.31 googlenet min = 142.25 max = 160.88 avg = 145.26 googlenet_int8 min = 125.45 max = 128.50 avg = 125.96 resnet18 min = 116.68 max = 118.26 avg = 117.00 resnet18_int8 min = 88.43 max = 90.95 avg = 89.08 alexnet min = 150.91 max = 160.01 avg = 152.51 vgg16 min = 674.91 max = 684.83 avg = 679.08 vgg16_int8 min = 417.60 max = 422.52 avg = 419.60 resnet50 min = 297.23 max = 299.37 avg = 298.03 resnet50_int8 min = 243.99 max = 251.39 avg = 245.99 squeezenet_ssd min = 127.92 max = 128.53 avg = 128.17 squeezenet_ssd_int8 min = 112.54 max = 114.63 avg = 113.19 mobilenet_ssd min = 136.43 max = 140.14 avg = 137.33 mobilenet_ssd_int8 min = 102.14 max = 105.00 avg = 102.77 mobilenet_yolo min = 291.45 max = 294.04 avg = 292.63 mobilenetv2_yolov3 min = 183.13 max = 187.00 avg = 184.05 yolov4-tiny min = 257.46 max = 268.76 avg = 260.49 nanodet_m min = 83.16 max = 91.03 avg = 84.77 yolo-fastest-1.1 min = 43.53 max = 43.87 avg = 43.74 yolo-fastestv2 min = 35.04 max = 35.54 avg = 35.17 nanopc-t4:/data/local/tmp # ./benchncnn 8 1 1 -1 1 loop_count = 8 num_threads = 1 powersave = 1 gpu_device = -1 cooling_down = 1 squeezenet min = 129.63 max = 130.58 avg = 129.85 squeezenet_int8 min = 124.10 max = 126.34 avg = 124.81 mobilenet min = 207.92 max = 208.72 avg = 208.41 mobilenet_int8 min = 175.55 max = 176.11 avg 
= 175.84 mobilenet_v2 min = 143.02 max = 143.56 avg = 143.25 mobilenet_v3 min = 133.11 max = 134.05 avg = 133.33 shufflenet min = 77.97 max = 78.54 avg = 78.19 shufflenet_v2 min = 75.59 max = 76.05 avg = 75.82 mnasnet min = 139.86 max = 141.77 avg = 140.19 proxylessnasnet min = 178.57 max = 179.57 avg = 179.03 efficientnet_b0 min = 316.10 max = 317.82 avg = 316.86 efficientnetv2_b0 min = 359.26 max = 362.03 avg = 360.31 regnety_400m min = 182.64 max = 183.03 avg = 182.82 blazeface min = 25.81 max = 26.53 avg = 26.20 googlenet min = 448.45 max = 450.80 avg = 449.35 googlenet_int8 min = 406.07 max = 410.65 avg = 408.04 resnet18 min = 351.64 max = 362.12 avg = 354.19 resnet18_int8 min = 298.10 max = 300.45 avg = 299.26 alexnet min = 586.92 max = 588.73 avg = 587.80 vgg16 min = 2170.12 max = 2202.80 avg = 2183.32 vgg16_int8 min = 1533.65 max = 1542.01 avg = 1537.33 resnet50 min = 975.40 max = 977.79 avg = 976.61 resnet50_int8 min = 851.59 max = 855.22 avg = 853.75 squeezenet_ssd min = 306.35 max = 307.54 avg = 306.96 squeezenet_ssd_int8 min = 291.32 max = 292.87 avg = 292.18 mobilenet_ssd min = 423.70 max = 424.63 avg = 424.11 mobilenet_ssd_int8 min = 358.62 max = 359.42 avg = 359.04 mobilenet_yolo min = 928.06 max = 929.25 avg = 928.55 mobilenetv2_yolov3 min = 496.96 max = 499.29 avg = 497.73 yolov4-tiny min = 712.80 max = 714.15 avg = 713.55 nanodet_m min = 179.42 max = 180.60 avg = 179.75 yolo-fastest-1.1 min = 88.06 max = 88.85 avg = 88.35 yolo-fastestv2 min = 68.68 max = 69.83 avg = 69.08 nanopc-t4:/data/local/tmp # ./benchncnn 4 1 2 0 0 [0 Mali-T860] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-T860] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=1 [0 Mali-T860] fp16-p/s/a=1/0/1 int8-p/s/a=1/0/0 [0 Mali-T860] subgroup=0 basic=0 vote=0 ballot=0 shuffle=0 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 0 squeezenet min = 24.57 max = 24.71 avg = 24.64 mobilenet min = 35.86 max = 36.14 avg = 36.04 mobilenet_v2 min = 30.18 max = 30.19 avg = 30.19 
mobilenet_v3 min = 30.88 max = 31.12 avg = 31.01 shufflenet min = 33.90 max = 33.98 avg = 33.93 shufflenet_v2 min = 29.10 max = 29.14 avg = 29.12 mnasnet min = 30.49 max = 30.59 avg = 30.53 proxylessnasnet min = 33.56 max = 33.61 avg = 33.59 efficientnet_b0 min = 51.15 max = 51.54 avg = 51.38 efficientnetv2_b0 min = 86.26 max = 87.36 avg = 86.91 regnety_400m min = 38.44 max = 38.54 avg = 38.49 blazeface min = 9.66 max = 9.74 avg = 9.70 googlenet min = 80.62 max = 80.96 avg = 80.81 resnet18 min = 74.07 max = 74.36 avg = 74.23 alexnet min = 76.84 max = 77.26 avg = 77.08 vgg16 min = 300.71 max = 300.89 avg = 300.80 resnet50 min = 175.96 max = 176.72 avg = 176.23 squeezenet_ssd min = 71.20 max = 71.38 avg = 71.32 mobilenet_ssd min = 76.99 max = 77.47 avg = 77.19 mobilenet_yolo min = 160.41 max = 160.84 avg = 160.62 mobilenetv2_yolov3 min = 91.31 max = 91.37 avg = 91.35 yolov4-tiny min = 130.78 max = 131.54 avg = 131.16 nanodet_m min = 55.90 max = 56.03 avg = 55.96 yolo-fastest-1.1 min = 25.50 max = 25.66 avg = 25.59 yolo-fastestv2 min = 24.94 max = 25.07 avg = 25.01 ``` ### MYIR RemiPi, Renesas RZG2L (Cortex-A55 1.5GHz x 2) ``` root@myir-remi-1g:~/ncnn# time ./benchncnn 10 4 0 -1 1 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 85.38 max = 87.72 avg = 86.78 squeezenet_int8 min = 84.23 max = 86.46 avg = 85.59 mobilenet min = 121.01 max = 122.55 avg = 121.76 mobilenet_int8 min = 95.64 max = 97.27 avg = 96.25 mobilenet_v2 min = 101.35 max = 102.24 avg = 101.72 mobilenet_v3 min = 84.09 max = 86.66 avg = 84.86 shufflenet min = 63.32 max = 65.16 avg = 64.53 shufflenet_v2 min = 60.33 max = 62.35 avg = 61.04 mnasnet min = 95.51 max = 96.70 avg = 95.95 proxylessnasnet min = 124.46 max = 125.82 avg = 125.14 efficientnet_b0 min = 144.94 max = 146.46 avg = 145.56 efficientnetv2_b0 min = 182.87 max = 185.63 avg = 184.56 regnety_400m min = 105.31 max = 106.42 avg = 105.72 blazeface min = 21.34 max = 21.90 avg = 21.50 googlenet min =
313.01 max = 318.42 avg = 314.25 googlenet_int8 min = 301.87 max = 304.93 avg = 303.66 resnet18 min = 248.02 max = 253.93 avg = 250.12 resnet18_int8 min = 244.65 max = 246.62 avg = 245.66 alexnet min = 204.00 max = 206.39 avg = 205.21 resnet50 min = 583.13 max = 584.82 avg = 584.11 resnet50_int8 min = 517.42 max = 520.97 avg = 519.07 squeezenet_ssd min = 266.63 max = 273.34 avg = 268.60 squeezenet_ssd_int8 min = 255.42 max = 260.98 avg = 257.15 mobilenet_ssd min = 267.16 max = 270.41 avg = 268.20 mobilenet_ssd_int8 min = 205.03 max = 206.43 avg = 205.53 mobilenet_yolo min = 571.08 max = 576.15 avg = 574.18 mobilenetv2_yolov3 min = 342.52 max = 344.84 avg = 343.38 yolov4-tiny min = 499.74 max = 503.13 avg = 501.45 nanodet_m min = 161.87 max = 163.90 avg = 162.93 yolo-fastest-1.1 min = 72.84 max = 74.81 avg = 73.35 yolo-fastestv2 min = 68.24 max = 70.49 avg = 68.74 vision_transformer min = 12464.09 max = 12491.57 avg = 12475.63 FastestDet min = 67.92 max = 69.90 avg = 68.94 ``` ### OrangePi Zero 2, Allwinner H616 (Cortex-A53 1.5GHz x 4) ``` orangepi@zero2:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 1 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 76.25 max = 90.20 avg = 78.99 squeezenet_int8 min = 59.92 max = 60.44 avg = 60.10 mobilenet min = 106.91 max = 132.22 avg = 109.99 mobilenet_int8 min = 57.96 max = 59.06 avg = 58.19 mobilenet_v2 min = 97.93 max = 124.48 avg = 100.91 mobilenet_v3 min = 82.27 max = 83.93 avg = 83.00 shufflenet min = 55.27 max = 82.06 avg = 58.40 shufflenet_v2 min = 44.94 max = 71.99 avg = 48.10 mnasnet min = 90.66 max = 91.41 avg = 90.92 proxylessnasnet min = 91.55 max = 118.74 avg = 94.71 efficientnet_b0 min = 127.95 max = 155.13 avg = 131.25 efficientnetv2_b0 min = 145.96 max = 173.67 avg = 149.36 regnety_400m min = 102.83 max = 103.52 avg = 103.08 blazeface min = 14.46 max = 14.95 avg = 14.77 googlenet min = 217.71 max = 244.16 avg = 221.38 googlenet_int8 min = 163.04 max = 187.69 avg = 166.20 
resnet18 min = 251.45 max = 277.52 avg = 255.00 resnet18_int8 min = 136.54 max = 161.95 avg = 141.60 alexnet min = 212.07 max = 233.27 avg = 215.34 vgg16 min = 1206.92 max = 1981.79 avg = 1673.28 vgg16_int8 min = 622.93 max = 702.12 avg = 661.83 resnet50 min = 555.84 max = 643.69 avg = 576.17 resnet50_int8 min = 348.11 max = 374.25 avg = 354.17 squeezenet_ssd min = 224.68 max = 251.32 avg = 230.59 squeezenet_ssd_int8 min = 154.87 max = 182.66 avg = 159.08 mobilenet_ssd min = 238.49 max = 426.65 avg = 263.18 mobilenet_ssd_int8 min = 118.36 max = 138.39 avg = 120.78 mobilenet_yolo min = 500.28 max = 615.83 avg = 553.59 mobilenetv2_yolov3 min = 340.27 max = 369.13 avg = 347.17 yolov4-tiny min = 365.04 max = 408.48 avg = 383.93 nanodet_m min = 112.88 max = 141.85 avg = 116.13 yolo-fastest-1.1 min = 72.05 max = 73.46 avg = 72.68 yolo-fastestv2 min = 54.94 max = 55.35 avg = 55.15 vision_transformer min = 6842.19 max = 9125.07 avg = 7343.64 FastestDet min = 59.09 max = 59.87 avg = 59.35 ``` ### OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) Test Ubuntu 22.04 Gnome Desktop ``` orangepi@orangepi4-lts:~/ncnn/benchmark$ ./benchncnn 10 6 0 -1 0 loop_count = 10 num_threads = 6 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 40.89 max = 50.29 avg = 45.15 squeezenet_int8 min = 40.36 max = 48.57 avg = 43.56 mobilenet min = 55.81 max = 67.35 avg = 59.81 mobilenet_int8 min = 39.96 max = 45.10 avg = 42.09 mobilenet_v2 min = 53.29 max = 64.12 avg = 57.40 mobilenet_v3 min = 38.94 max = 51.11 avg = 43.06 shufflenet min = 27.32 max = 38.53 avg = 31.85 shufflenet_v2 min = 24.38 max = 31.17 avg = 28.32 mnasnet min = 47.02 max = 50.68 avg = 48.86 proxylessnasnet min = 52.31 max = 61.31 avg = 56.66 efficientnet_b0 min = 68.14 max = 76.07 avg = 72.62 efficientnetv2_b0 min = 77.23 max = 96.07 avg = 84.83 regnety_400m min = 60.81 max = 81.72 avg = 72.37 blazeface min = 7.24 max = 8.19 avg = 7.68 googlenet min = 122.99 max = 132.67 avg = 128.90 
googlenet_int8 min = 108.45 max = 121.17 avg = 115.37 resnet18 min = 100.67 max = 115.30 avg = 107.65 resnet18_int8 min = 80.17 max = 87.56 avg = 84.01 alexnet min = 71.00 max = 83.09 avg = 76.21 vgg16 min = 557.67 max = 606.30 avg = 581.12 vgg16_int8 min = 369.93 max = 393.20 avg = 384.86 resnet50 min = 254.25 max = 272.90 avg = 265.18 resnet50_int8 min = 220.70 max = 231.50 avg = 225.03 squeezenet_ssd min = 118.91 max = 131.52 avg = 123.91 squeezenet_ssd_int8 min = 98.25 max = 116.42 avg = 110.13 mobilenet_ssd min = 126.62 max = 134.13 avg = 129.56 mobilenet_ssd_int8 min = 83.83 max = 91.61 avg = 86.75 mobilenet_yolo min = 281.19 max = 299.79 avg = 290.05 mobilenetv2_yolov3 min = 180.37 max = 194.10 avg = 185.61 yolov4-tiny min = 215.28 max = 227.29 avg = 221.61 nanodet_m min = 64.63 max = 75.86 avg = 70.46 yolo-fastest-1.1 min = 39.54 max = 48.30 avg = 44.76 yolo-fastestv2 min = 29.91 max = 53.15 avg = 37.32 vision_transformer min = 2520.25 max = 2595.28 avg = 2557.05 FastestDet min = 32.45 max = 47.38 avg = 40.55 orangepi@orangepi4-lts:~/ncnn/benchmark$ ./benchncnn 10 4 1 -1 0 loop_count = 10 num_threads = 4 powersave = 1 gpu_device = -1 cooling_down = 0 squeezenet min = 48.90 max = 56.65 avg = 53.09 squeezenet_int8 min = 48.09 max = 54.69 avg = 51.26 mobilenet min = 66.06 max = 79.73 avg = 73.96 mobilenet_int8 min = 51.33 max = 58.30 avg = 54.71 mobilenet_v2 min = 61.06 max = 88.93 avg = 71.48 mobilenet_v3 min = 50.41 max = 65.40 avg = 56.51 shufflenet min = 38.11 max = 63.95 avg = 44.03 shufflenet_v2 min = 33.27 max = 36.43 avg = 34.89 mnasnet min = 60.02 max = 72.71 avg = 64.57 proxylessnasnet min = 66.61 max = 73.25 avg = 70.65 efficientnet_b0 min = 87.27 max = 94.97 avg = 91.00 efficientnetv2_b0 min = 99.89 max = 112.09 avg = 106.13 regnety_400m min = 84.65 max = 92.78 avg = 89.51 blazeface min = 9.73 max = 11.45 avg = 10.85 googlenet min = 154.74 max = 164.25 avg = 159.33 googlenet_int8 min = 140.29 max = 148.08 avg = 144.18 resnet18 min = 131.51 max = 
244.02 avg = 150.56 resnet18_int8 min = 102.11 max = 114.40 avg = 108.32 alexnet min = 81.13 max = 92.35 avg = 86.86 vgg16 min = 649.91 max = 668.62 avg = 660.25 vgg16_int8 min = 513.75 max = 523.77 avg = 518.17 resnet50 min = 330.89 max = 378.23 avg = 344.07 resnet50_int8 min = 280.38 max = 286.93 avg = 284.43 squeezenet_ssd min = 134.35 max = 146.97 avg = 141.17 squeezenet_ssd_int8 min = 126.31 max = 137.29 avg = 130.73 mobilenet_ssd min = 146.83 max = 161.70 avg = 155.08 mobilenet_ssd_int8 min = 105.74 max = 117.05 avg = 111.62 mobilenet_yolo min = 339.30 max = 352.16 avg = 345.22 mobilenetv2_yolov3 min = 223.12 max = 234.18 avg = 229.81 yolov4-tiny min = 267.30 max = 272.95 avg = 270.47 nanodet_m min = 78.72 max = 86.18 avg = 81.81 yolo-fastest-1.1 min = 47.96 max = 55.08 avg = 51.81 yolo-fastestv2 min = 38.01 max = 44.32 avg = 42.29 vision_transformer min = 3499.34 max = 3526.15 avg = 3514.43 FastestDet min = 40.14 max = 44.37 avg = 42.30 orangepi@orangepi4-lts:~/ncnn/benchmark$ ./benchncnn 10 2 2 -1 0 loop_count = 10 num_threads = 2 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 45.65 max = 46.72 avg = 46.15 squeezenet_int8 min = 42.60 max = 43.01 avg = 42.76 mobilenet min = 69.35 max = 70.59 avg = 69.92 mobilenet_int8 min = 46.08 max = 46.35 avg = 46.20 mobilenet_v2 min = 57.47 max = 58.90 avg = 58.08 mobilenet_v3 min = 44.72 max = 45.47 avg = 45.05 shufflenet min = 31.74 max = 32.16 avg = 31.97 shufflenet_v2 min = 26.74 max = 26.98 avg = 26.86 mnasnet min = 50.47 max = 51.20 avg = 50.82 proxylessnasnet min = 57.31 max = 58.24 avg = 57.68 efficientnet_b0 min = 79.61 max = 80.79 avg = 80.02 efficientnetv2_b0 min = 92.67 max = 93.37 avg = 93.08 regnety_400m min = 67.08 max = 68.07 avg = 67.59 blazeface min = 8.56 max = 8.81 avg = 8.70 googlenet min = 136.82 max = 138.26 avg = 137.44 googlenet_int8 min = 121.96 max = 122.64 avg = 122.36 resnet18 min = 118.04 max = 119.24 avg = 118.49 resnet18_int8 min = 89.55 max = 92.11 avg = 90.38 alexnet min 
= 80.75 max = 82.34 avg = 81.24 vgg16 min = 602.11 max = 628.12 avg = 612.26 vgg16_int8 min = 481.31 max = 484.49 avg = 482.84 resnet50 min = 307.31 max = 310.10 avg = 308.88 resnet50_int8 min = 240.45 max = 243.43 avg = 241.76 squeezenet_ssd min = 119.65 max = 122.93 avg = 121.34 squeezenet_ssd_int8 min = 102.71 max = 103.45 avg = 103.20 mobilenet_ssd min = 142.16 max = 143.58 avg = 142.54 mobilenet_ssd_int8 min = 93.20 max = 93.81 avg = 93.41 mobilenet_yolo min = 315.42 max = 318.06 avg = 317.00 mobilenetv2_yolov3 min = 190.59 max = 191.74 avg = 190.96 yolov4-tiny min = 228.77 max = 230.49 avg = 229.78 nanodet_m min = 66.82 max = 67.23 avg = 67.02 yolo-fastest-1.1 min = 38.20 max = 40.89 avg = 38.85 yolo-fastestv2 min = 32.53 max = 33.48 avg = 33.03 vision_transformer min = 3372.17 max = 3516.54 avg = 3461.89 FastestDet min = 32.92 max = 35.55 avg = 33.62 ``` ### OrangePi CM4, Rockchip RK3566 (Cortex-A55 1.8GHz x 4) ``` orangepi@orangepicm4:~/code/ncnn-test$ ./benchncnn 10 4 0 -1 1 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 23.91 max = 91.49 avg = 31.03 squeezenet_int8 min = 24.44 max = 25.39 avg = 24.75 mobilenet min = 30.67 max = 31.75 avg = 30.98 mobilenet_int8 min = 27.87 max = 28.48 avg = 28.05 mobilenet_v2 min = 31.82 max = 32.56 avg = 32.07 mobilenet_v3 min = 24.63 max = 24.91 avg = 24.81 shufflenet min = 19.77 max = 20.19 avg = 20.01 shufflenet_v2 min = 16.67 max = 40.81 avg = 28.79 mnasnet min = 27.48 max = 28.36 avg = 27.75 proxylessnasnet min = 33.04 max = 37.30 avg = 33.70 efficientnet_b0 min = 39.21 max = 175.34 avg = 53.26 efficientnetv2_b0 min = 48.94 max = 78.68 avg = 52.44 regnety_400m min = 39.81 max = 40.15 avg = 39.96 blazeface min = 6.22 max = 6.36 avg = 6.30 googlenet min = 75.48 max = 120.58 avg = 82.05 googlenet_int8 min = 74.42 max = 78.70 avg = 75.29 resnet18 min = 58.21 max = 99.04 avg = 66.07 resnet18_int8 min = 54.18 max = 79.91 avg = 57.31 alexnet min = 49.18 max = 161.71 avg =
63.03 vgg16 min = 323.82 max = 452.63 avg = 360.92 vgg16_int8 min = 379.18 max = 527.82 avg = 432.99 resnet50 min = 135.84 max = 200.71 avg = 142.54 resnet50_int8 min = 126.06 max = 169.65 avg = 136.29 squeezenet_ssd min = 77.62 max = 137.89 avg = 86.87 squeezenet_ssd_int8 min = 74.17 max = 76.22 avg = 74.91 mobilenet_ssd min = 68.60 max = 132.81 avg = 75.30 mobilenet_ssd_int8 min = 58.01 max = 59.24 avg = 58.81 mobilenet_yolo min = 151.61 max = 247.03 avg = 168.31 mobilenetv2_yolov3 min = 106.00 max = 163.45 avg = 111.92 yolov4-tiny min = 132.99 max = 193.53 avg = 139.88 nanodet_m min = 51.43 max = 87.10 avg = 58.17 yolo-fastest-1.1 min = 26.10 max = 66.68 avg = 30.33 yolo-fastestv2 min = 21.87 max = 69.79 avg = 35.55 vision_transformer min = 2301.36 max = 2513.89 avg = 2426.14 FastestDet min = 21.33 max = 21.59 avg = 21.47 orangepi@orangepicm4:~/code/ncnn-test$ ./benchncnn 10 1 0 -1 1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 47.26 max = 48.21 avg = 47.68 squeezenet_int8 min = 50.80 max = 54.79 avg = 51.64 mobilenet min = 68.18 max = 71.72 avg = 68.78 mobilenet_int8 min = 58.34 max = 58.73 avg = 58.56 mobilenet_v2 min = 56.56 max = 57.38 avg = 57.04 mobilenet_v3 min = 45.52 max = 53.46 avg = 47.98 shufflenet min = 34.88 max = 75.06 avg = 46.15 shufflenet_v2 min = 33.43 max = 49.65 avg = 36.86 mnasnet min = 53.87 max = 54.08 avg = 53.98 proxylessnasnet min = 70.99 max = 71.40 avg = 71.14 efficientnet_b0 min = 83.79 max = 89.78 avg = 84.96 efficientnetv2_b0 min = 103.89 max = 117.47 avg = 105.81 regnety_400m min = 63.68 max = 81.25 avg = 66.66 blazeface min = 12.18 max = 39.24 avg = 21.79 googlenet min = 179.41 max = 202.18 avg = 185.39 googlenet_int8 min = 187.88 max = 198.49 avg = 191.01 resnet18 min = 132.67 max = 148.94 avg = 136.09 resnet18_int8 min = 150.37 max = 158.14 avg = 153.17 alexnet min = 115.00 max = 120.17 avg = 116.26 vgg16 min = 809.99 max = 851.07 avg = 827.73 vgg16_int8 min = 1149.74 max = 
1161.37 avg = 1154.22 resnet50 min = 327.19 max = 350.42 avg = 332.12 resnet50_int8 min = 325.08 max = 332.46 avg = 327.17 squeezenet_ssd min = 150.33 max = 163.00 avg = 153.12 squeezenet_ssd_int8 min = 152.21 max = 157.94 avg = 155.36 mobilenet_ssd min = 149.30 max = 150.23 avg = 149.72 mobilenet_ssd_int8 min = 121.93 max = 127.07 avg = 123.03 mobilenet_yolo min = 330.91 max = 345.64 avg = 336.21 mobilenetv2_yolov3 min = 193.25 max = 214.92 avg = 198.82 yolov4-tiny min = 284.38 max = 332.54 avg = 293.43 nanodet_m min = 90.69 max = 100.74 avg = 92.56 yolo-fastest-1.1 min = 38.93 max = 51.96 avg = 42.11 yolo-fastestv2 min = 35.74 max = 48.11 avg = 38.63 vision_transformer min = 7280.18 max = 7301.27 avg = 7292.38 FastestDet min = 36.54 max = 42.31 avg = 38.41 ``` ### OrangePi5, Rockchip RK3588s (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) ``` orangepi@orangepi5:~/ncnn-master/benchmark$ ./benchncnn 10 8 0 -1 0 loop_count = 10 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 6.22 max = 6.69 avg = 6.37 squeezenet_int8 min = 7.93 max = 8.32 avg = 8.07 mobilenet min = 9.08 max = 14.02 avg = 9.81 mobilenet_int8 min = 7.89 max = 9.02 avg = 8.47 mobilenet_v2 min = 7.77 max = 8.09 avg = 7.92 mobilenet_v3 min = 6.87 max = 8.19 avg = 7.46 shufflenet min = 5.98 max = 10.21 avg = 7.23 shufflenet_v2 min = 4.82 max = 5.04 avg = 4.93 mnasnet min = 6.15 max = 6.36 avg = 6.24 proxylessnasnet min = 9.50 max = 10.50 avg = 9.93 efficientnet_b0 min = 11.46 max = 11.79 avg = 11.60 efficientnetv2_b0 min = 18.61 max = 19.48 avg = 18.88 regnety_400m min = 10.54 max = 12.44 avg = 10.86 blazeface min = 1.96 max = 5.35 avg = 2.58 googlenet min = 26.62 max = 32.59 avg = 29.96 googlenet_int8 min = 28.27 max = 32.80 avg = 30.01 resnet18 min = 15.52 max = 18.29 avg = 16.37 resnet18_int8 min = 23.33 max = 26.89 avg = 24.99 alexnet min = 19.92 max = 22.75 avg = 21.06 vgg16 min = 101.18 max = 122.44 avg = 107.45 vgg16_int8 min = 164.69 max = 227.98 avg = 189.73 resnet50 
min = 42.96 max = 59.26 avg = 50.83 resnet50_int8 min = 54.46 max = 66.72 avg = 61.37 squeezenet_ssd min = 24.39 max = 31.19 avg = 27.69 squeezenet_ssd_int8 min = 27.15 max = 41.55 avg = 33.68 mobilenet_ssd min = 22.26 max = 26.89 avg = 23.95 mobilenet_ssd_int8 min = 21.18 max = 24.21 avg = 23.05 mobilenet_yolo min = 52.65 max = 65.53 avg = 58.47 mobilenetv2_yolov3 min = 31.34 max = 45.15 avg = 34.63 yolov4-tiny min = 40.55 max = 49.32 avg = 43.85 nanodet_m min = 16.08 max = 19.51 avg = 17.58 yolo-fastest-1.1 min = 6.48 max = 7.33 avg = 6.98 yolo-fastestv2 min = 4.96 max = 11.66 avg = 7.30 vision_transformer min = 678.22 max = 815.73 avg = 729.16 FastestDet min = 4.95 max = 10.65 avg = 6.88 orangepi@orangepi5:~/ncnn-master/benchmark$ ./benchncnn 10 4 1 -1 0 loop_count = 10 num_threads = 4 powersave = 1 gpu_device = -1 cooling_down = 0 squeezenet min = 10.91 max = 11.14 avg = 11.03 squeezenet_int8 min = 14.26 max = 14.55 avg = 14.30 mobilenet min = 15.92 max = 16.26 avg = 16.11 mobilenet_int8 min = 14.71 max = 15.22 avg = 14.91 mobilenet_v2 min = 12.28 max = 12.49 avg = 12.37 mobilenet_v3 min = 11.31 max = 11.72 avg = 11.46 shufflenet min = 10.10 max = 10.33 avg = 10.24 shufflenet_v2 min = 9.38 max = 9.70 avg = 9.55 mnasnet min = 12.28 max = 12.80 avg = 12.44 proxylessnasnet min = 16.54 max = 16.66 avg = 16.60 efficientnet_b0 min = 19.56 max = 20.66 avg = 19.86 efficientnetv2_b0 min = 34.06 max = 34.65 avg = 34.41 regnety_400m min = 23.97 max = 24.69 avg = 24.20 blazeface min = 3.39 max = 3.56 avg = 3.48 googlenet min = 46.96 max = 47.90 avg = 47.56 googlenet_int8 min = 49.56 max = 50.23 avg = 49.79 resnet18 min = 28.44 max = 29.54 avg = 28.77 resnet18_int8 min = 41.32 max = 42.44 avg = 41.67 alexnet min = 31.83 max = 32.77 avg = 32.32 vgg16 min = 170.32 max = 178.30 avg = 173.22 vgg16_int8 min = 282.55 max = 299.32 avg = 287.78 resnet50 min = 78.00 max = 81.57 avg = 78.79 resnet50_int8 min = 89.12 max = 92.31 avg = 90.92 squeezenet_ssd min = 38.07 max = 39.07 avg = 
38.59 squeezenet_ssd_int8 min = 50.98 max = 52.56 avg = 51.68 mobilenet_ssd min = 38.79 max = 39.67 avg = 39.34 mobilenet_ssd_int8 min = 33.53 max = 35.26 avg = 34.66 mobilenet_yolo min = 90.50 max = 92.32 avg = 90.99 mobilenetv2_yolov3 min = 51.38 max = 51.93 avg = 51.56 yolov4-tiny min = 75.65 max = 76.80 avg = 76.17 nanodet_m min = 21.33 max = 21.68 avg = 21.50 yolo-fastest-1.1 min = 11.18 max = 12.06 avg = 11.36 yolo-fastestv2 min = 9.87 max = 10.33 avg = 10.15 vision_transformer min = 1475.77 max = 1477.97 avg = 1476.77 FastestDet min = 9.39 max = 9.73 avg = 9.53 orangepi@orangepi5:~/ncnn-master/benchmark$ ./benchncnn 10 4 2 -1 0 loop_count = 10 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 3.59 max = 3.70 avg = 3.66 squeezenet_int8 min = 4.32 max = 4.42 avg = 4.36 mobilenet min = 5.50 max = 5.55 avg = 5.53 mobilenet_int8 min = 4.52 max = 4.60 avg = 4.56 mobilenet_v2 min = 4.50 max = 4.60 avg = 4.54 mobilenet_v3 min = 4.09 max = 4.28 avg = 4.15 shufflenet min = 3.49 max = 3.58 avg = 3.51 shufflenet_v2 min = 2.91 max = 3.07 avg = 2.97 mnasnet min = 4.18 max = 4.25 avg = 4.21 proxylessnasnet min = 4.94 max = 5.00 avg = 4.97 efficientnet_b0 min = 7.50 max = 7.54 avg = 7.52 efficientnetv2_b0 min = 11.32 max = 11.41 avg = 11.37 regnety_400m min = 7.92 max = 8.01 avg = 7.95 blazeface min = 1.21 max = 1.31 avg = 1.24 googlenet min = 15.03 max = 15.17 avg = 15.10 googlenet_int8 min = 15.48 max = 15.61 avg = 15.55 resnet18 min = 9.91 max = 9.97 avg = 9.93 resnet18_int8 min = 15.80 max = 16.00 avg = 15.89 alexnet min = 12.35 max = 12.64 avg = 12.48 vgg16 min = 61.92 max = 65.62 avg = 62.93 vgg16_int8 min = 129.94 max = 131.65 avg = 130.65 resnet50 min = 27.41 max = 27.62 avg = 27.52 resnet50_int8 min = 33.01 max = 33.23 avg = 33.08 squeezenet_ssd min = 13.92 max = 14.27 avg = 14.02 squeezenet_ssd_int8 min = 18.04 max = 18.40 avg = 18.15 mobilenet_ssd min = 13.69 max = 13.80 avg = 13.74 mobilenet_ssd_int8 min = 10.95 max = 11.10 avg = 
11.02 mobilenet_yolo min = 32.06 max = 32.30 avg = 32.17 mobilenetv2_yolov3 min = 19.27 max = 20.68 avg = 19.97 yolov4-tiny min = 25.41 max = 29.51 avg = 27.76 nanodet_m min = 6.68 max = 6.73 avg = 6.70 yolo-fastest-1.1 min = 3.77 max = 4.02 avg = 3.83 yolo-fastestv2 min = 3.41 max = 3.65 avg = 3.48 vision_transformer min = 548.32 max = 654.71 avg = 579.48 FastestDet min = 3.38 max = 3.46 avg = 3.42 ``` ### OrangePi5 Plus, Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) ``` orangepi@orangepi5plus:~/ncnn$ ./benchncnn 8 4 2 -1 1 loop_count = 8 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 5.55 max = 5.67 avg = 5.61 squeezenet_int8 min = 5.39 max = 5.76 avg = 5.60 mobilenet min = 7.43 max = 7.50 avg = 7.47 mobilenet_int8 min = 6.91 max = 7.00 avg = 6.96 mobilenet_v2 min = 8.24 max = 8.47 avg = 8.33 mobilenet_v3 min = 6.63 max = 7.32 avg = 6.84 shufflenet min = 4.10 max = 4.23 avg = 4.14 shufflenet_v2 min = 3.51 max = 3.61 avg = 3.56 mnasnet min = 5.76 max = 7.79 avg = 6.53 proxylessnasnet min = 6.66 max = 7.19 avg = 6.79 efficientnet_b0 min = 10.32 max = 10.73 avg = 10.40 efficientnetv2_b0 min = 11.48 max = 11.78 avg = 11.61 regnety_400m min = 9.73 max = 9.85 avg = 9.79 blazeface min = 1.39 max = 1.62 avg = 1.46 googlenet min = 21.48 max = 23.08 avg = 22.79 googlenet_int8 min = 20.82 max = 21.78 avg = 21.01 resnet18 min = 9.37 max = 10.05 avg = 9.50 resnet18_int8 min = 14.88 max = 19.64 avg = 15.90 alexnet min = 24.74 max = 24.93 avg = 24.81 vgg16 min = 58.75 max = 62.44 avg = 59.52 vgg16_int8 min = 73.68 max = 75.89 avg = 74.14 resnet50 min = 44.88 max = 45.10 avg = 44.98 resnet50_int8 min = 35.54 max = 36.02 avg = 35.71 squeezenet_ssd min = 12.07 max = 26.66 avg = 19.03 squeezenet_ssd_int8 min = 21.95 max = 25.51 avg = 23.21 mobilenet_ssd min = 12.62 max = 12.73 avg = 12.67 mobilenet_ssd_int8 min = 17.21 max = 17.68 avg = 17.44 mobilenet_yolo min = 32.82 max = 32.98 avg = 32.91 mobilenetv2_yolov3 min = 18.67 max = 
20.52 avg = 19.57 yolov4-tiny min = 38.82 max = 40.84 avg = 39.82 nanodet_m min = 9.05 max = 9.22 avg = 9.13 yolo-fastest-1.1 min = 4.67 max = 5.04 avg = 4.74 yolo-fastestv2 min = 4.27 max = 4.32 avg = 4.29 vision_transformer min = 429.32 max = 431.02 avg = 430.20 FastestDet min = 4.28 max = 4.72 avg = 4.36 ``` ### RDK X3 Module (Cortex-A53 1.5GHz x 4) aarch64 ``` root@ubuntu:/home/sunrise/ncnn-master/benchmark# ../build-aarch64-linux-gnu/benchmark/benchncnn 10 4 0 -1 1 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 49.83 max = 50.57 avg = 50.08 squeezenet_int8 min = 48.43 max = 49.18 avg = 48.67 mobilenet min = 68.37 max = 69.09 avg = 68.63 mobilenet_int8 min = 58.19 max = 58.72 avg = 58.37 mobilenet_v2 min = 58.76 max = 60.62 avg = 59.20 mobilenet_v3 min = 49.75 max = 50.60 avg = 50.06 shufflenet min = 37.17 max = 37.96 avg = 37.50 shufflenet_v2 min = 32.08 max = 32.42 avg = 32.22 mnasnet min = 55.51 max = 57.02 avg = 55.90 proxylessnasnet min = 68.15 max = 69.53 avg = 68.78 efficientnet_b0 min = 88.64 max = 90.16 avg = 89.43 efficientnetv2_b0 min = 102.45 max = 103.42 avg = 102.92 regnety_400m min = 88.22 max = 89.09 avg = 88.62 blazeface min = 9.78 max = 10.15 avg = 9.93 googlenet min = 152.20 max = 153.92 avg = 153.28 googlenet_int8 min = 141.80 max = 143.30 avg = 142.48 resnet18 min = 116.70 max = 117.59 avg = 117.03 resnet18_int8 min = 104.42 max = 105.85 avg = 104.94 alexnet min = 82.55 max = 83.23 avg = 82.82 vgg16 min = 590.22 max = 598.18 avg = 594.35 vgg16_int8 min = 504.56 max = 507.21 avg = 505.73 resnet50 min = 307.36 max = 308.68 avg = 308.03 resnet50_int8 min = 281.35 max = 283.87 avg = 282.30 squeezenet_ssd min = 124.93 max = 126.51 avg = 125.51 squeezenet_ssd_int8 min = 118.07 max = 118.89 avg = 118.29 mobilenet_ssd min = 142.27 max = 142.57 avg = 142.44 mobilenet_ssd_int8 min = 116.51 max = 117.60 avg = 117.04 mobilenet_yolo min = 314.64 max = 317.09 avg = 315.93 mobilenetv2_yolov3 min = 204.55 
max = 205.30 avg = 204.93 yolov4-tiny min = 246.69 max = 249.64 avg = 247.95 nanodet_m min = 77.73 max = 78.30 avg = 77.99 yolo-fastest-1.1 min = 46.29 max = 47.52 avg = 46.93 yolo-fastestv2 min = 36.55 max = 36.95 avg = 36.73 vision_transformer min = 3372.85 max = 3409.14 avg = 3377.75 FastestDet min = 38.23 max = 38.77 avg = 38.49 ``` ### NanoPi R2S, Rockchip RK3328 (Cortex-A53 1.3GHz x 4) Armbian focal (21.05.1) aarch64 ``` root@nanopi-r2s:~/ncnn/build/benchmark# ./benchncnn 8 4 0 loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 62.20 max = 62.81 avg = 62.49 squeezenet_int8 min = 57.92 max = 71.46 avg = 59.76 mobilenet min = 82.88 max = 89.36 avg = 84.52 mobilenet_int8 min = 57.16 max = 96.22 avg = 62.29 mobilenet_v2 min = 73.68 max = 75.92 avg = 74.17 mobilenet_v3 min = 59.57 max = 60.14 avg = 59.84 shufflenet min = 52.34 max = 52.70 avg = 52.53 shufflenet_v2 min = 45.51 max = 45.92 avg = 45.73 mnasnet min = 67.75 max = 83.15 avg = 69.82 proxylessnasnet min = 81.70 max = 83.66 avg = 82.31 efficientnet_b0 min = 121.10 max = 123.22 avg = 121.55 efficientnetv2_b0 min = 138.93 max = 192.15 avg = 154.94 regnety_400m min = 99.62 max = 116.29 avg = 101.97 blazeface min = 18.80 max = 19.15 avg = 19.01 googlenet min = 176.36 max = 202.84 avg = 181.86 googlenet_int8 min = 155.50 max = 190.50 avg = 161.20 resnet18 min = 165.79 max = 201.57 avg = 172.56 resnet18_int8 min = 122.24 max = 160.53 avg = 134.24 alexnet min = 227.07 max = 238.09 avg = 232.19 vgg16_int8 min = 522.14 max = 551.75 avg = 531.68 resnet50 min = 378.30 max = 440.21 avg = 388.56 resnet50_int8 min = 315.76 max = 373.97 avg = 329.88 squeezenet_ssd min = 175.37 max = 200.86 avg = 179.01 squeezenet_ssd_int8 min = 134.71 max = 147.57 avg = 136.57 mobilenet_ssd min = 174.43 max = 212.11 avg = 180.61 mobilenet_ssd_int8 min = 119.41 max = 153.75 avg = 124.21 mobilenet_yolo min = 366.27 max = 422.67 avg = 383.65 mobilenetv2_yolov3 min = 238.56 max = 281.97 avg = 
247.56 yolov4-tiny min = 311.45 max = 333.32 avg = 316.79 nanodet_m min = 114.15 max = 122.39 avg = 115.44 root@nanopi-r2s:~/ncnn/build/benchmark# ./benchncnn 8 2 0 loop_count = 8 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 89.02 max = 90.52 avg = 89.35 squeezenet_int8 min = 81.19 max = 81.90 avg = 81.42 mobilenet min = 131.47 max = 134.39 avg = 132.34 mobilenet_int8 min = 102.20 max = 103.03 avg = 102.66 mobilenet_v2 min = 102.40 max = 108.12 avg = 103.91 mobilenet_v3 min = 89.17 max = 90.10 avg = 89.53 shufflenet min = 65.74 max = 68.86 avg = 66.50 shufflenet_v2 min = 62.83 max = 64.41 avg = 63.25 mnasnet min = 98.01 max = 98.24 avg = 98.14 proxylessnasnet min = 121.10 max = 123.55 avg = 121.80 efficientnet_b0 min = 187.79 max = 188.41 avg = 188.08 efficientnetv2_b0 min = 211.96 max = 213.99 avg = 212.74 regnety_400m min = 124.98 max = 125.49 avg = 125.28 blazeface min = 24.91 max = 25.14 avg = 25.00 googlenet min = 278.47 max = 283.24 avg = 280.79 googlenet_int8 min = 243.81 max = 247.82 avg = 245.30 resnet18 min = 257.46 max = 259.29 avg = 258.29 resnet18_int8 min = 187.18 max = 188.74 avg = 187.70 alexnet min = 384.52 max = 387.07 avg = 385.84 vgg16_int8 min = 897.26 max = 901.68 avg = 899.19 resnet50 min = 618.85 max = 623.92 avg = 620.85 resnet50_int8 min = 512.33 max = 514.93 avg = 513.64 squeezenet_ssd min = 211.21 max = 218.71 avg = 213.02 squeezenet_ssd_int8 min = 193.32 max = 193.97 avg = 193.70 mobilenet_ssd min = 271.11 max = 275.58 avg = 272.06 mobilenet_ssd_int8 min = 208.80 max = 209.59 avg = 209.05 mobilenet_yolo min = 570.55 max = 575.98 avg = 572.73 mobilenetv2_yolov3 min = 329.04 max = 353.84 avg = 340.42 yolov4-tiny min = 435.16 max = 463.68 avg = 457.69 nanodet_m min = 155.70 max = 159.13 avg = 156.50 ``` ### EAIDK 310, Rockchip RK3228H (Cortex-A53 1.3GHz x 4) fedora-28 aarch64 ``` [openailab@MiWiFi-R1D-srv benchmark]$ ./benchncnn 8 4 0 -1 1 loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 
cooling_down = 1 squeezenet min = 68.97 max = 71.42 avg = 69.65 squeezenet_int8 min = 58.47 max = 59.58 avg = 58.77 mobilenet min = 90.87 max = 100.18 avg = 92.48 mobilenet_int8 min = 59.46 max = 63.02 avg = 60.01 mobilenet_v2 min = 82.92 max = 112.01 avg = 88.10 mobilenet_v3 min = 66.65 max = 69.57 avg = 67.27 shufflenet min = 48.22 max = 48.49 avg = 48.34 shufflenet_v2 min = 48.52 max = 52.88 avg = 49.17 mnasnet min = 75.63 max = 79.83 avg = 76.43 proxylessnasnet min = 84.73 max = 86.69 avg = 85.16 efficientnet_b0 min = 125.69 max = 129.00 avg = 126.38 efficientnetv2_b0 min = 144.44 max = 149.01 avg = 145.33 regnety_400m min = 99.69 max = 101.23 avg = 100.38 blazeface min = 15.84 max = 16.24 avg = 16.03 googlenet min = 194.64 max = 199.29 avg = 196.07 googlenet_int8 min = 158.54 max = 165.64 avg = 160.25 resnet18 min = 200.65 max = 221.60 avg = 204.30 resnet18_int8 min = 122.69 max = 126.57 avg = 123.54 alexnet min = 175.54 max = 200.91 avg = 181.38 resnet50 min = 428.75 max = 466.51 avg = 439.67 resnet50_int8 min = 324.95 max = 347.47 avg = 329.74 squeezenet_ssd min = 199.86 max = 207.51 avg = 201.99 squeezenet_ssd_int8 min = 150.35 max = 176.92 avg = 154.60 mobilenet_ssd min = 186.50 max = 189.92 avg = 188.09 mobilenet_ssd_int8 min = 123.55 max = 127.17 avg = 124.63 mobilenet_yolo min = 393.83 max = 414.09 avg = 398.57 mobilenetv2_yolov3 min = 263.49 max = 273.11 avg = 266.11 yolov4-tiny min = 342.33 max = 363.69 avg = 346.34 nanodet_m min = 119.66 max = 127.29 avg = 121.26 yolo-fastest-1.1 min = 61.87 max = 90.26 avg = 65.77 yolo-fastestv2 min = 48.48 max = 50.82 avg = 48.93 [openailab@MiWiFi-R1D-srv benchmark]$ ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 152.15 max = 152.67 avg = 152.43 squeezenet_int8 min = 143.22 max = 144.24 avg = 143.61 mobilenet min = 237.77 max = 239.69 avg = 238.47 mobilenet_int8 min = 199.91 max = 201.35 avg = 200.50 mobilenet_v2 min = 169.67 max = 170.18 avg = 
169.93 mobilenet_v3 min = 150.06 max = 151.17 avg = 150.78 shufflenet min = 91.78 max = 92.38 avg = 92.06 shufflenet_v2 min = 100.86 max = 101.75 avg = 101.50 mnasnet min = 165.10 max = 166.74 avg = 166.24 proxylessnasnet min = 218.42 max = 220.55 avg = 219.12 efficientnet_b0 min = 348.00 max = 349.03 avg = 348.49 efficientnetv2_b0 min = 404.06 max = 406.16 avg = 405.00 regnety_400m min = 209.48 max = 211.36 avg = 210.44 blazeface min = 31.31 max = 32.61 avg = 32.00 googlenet min = 510.38 max = 512.43 avg = 511.25 googlenet_int8 min = 454.38 max = 456.19 avg = 455.02 resnet18 min = 407.78 max = 409.45 avg = 408.34 resnet18_int8 min = 357.01 max = 360.72 avg = 358.74 alexnet min = 504.12 max = 506.74 avg = 505.08 resnet50 min = 1115.42 max = 1121.91 avg = 1118.67 resnet50_int8 min = 973.38 max = 976.26 avg = 975.21 squeezenet_ssd min = 361.52 max = 363.69 avg = 362.38 squeezenet_ssd_int8 min = 333.81 max = 337.16 avg = 335.24 mobilenet_ssd min = 477.43 max = 478.36 avg = 477.82 mobilenet_ssd_int8 min = 409.33 max = 409.67 avg = 409.52 mobilenet_yolo min = 1048.79 max = 1057.72 avg = 1053.80 mobilenetv2_yolov3 min = 567.04 max = 571.44 avg = 569.04 yolov4-tiny min = 788.40 max = 790.74 avg = 789.12 nanodet_m min = 253.68 max = 254.59 avg = 254.16 yolo-fastest-1.1 min = 102.44 max = 103.11 avg = 102.67 yolo-fastestv2 min = 82.19 max = 82.43 avg = 82.35 ``` ### NVIDIA Jetson Orin Nano ``` orin@nano:~/ncnn/benchmark$ ./benchncnn 8 6 0 0 1 [0 NVIDIA Tegra Orin (nvgpu)] queueC=2[8] queueG=0[16] queueT=1[2] [0 NVIDIA Tegra Orin (nvgpu)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA Tegra Orin (nvgpu)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA Tegra Orin (nvgpu)] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [0 NVIDIA Tegra Orin (nvgpu)] fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1 loop_count = 8 num_threads = 6 powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 5.31 max = 5.95 avg = 5.44 squeezenet_int8 min = 5.13 max = 6.24 avg = 5.57 mobilenet min = 2.98 
max = 5.52 avg = 3.66 mobilenet_int8 min = 5.97 max = 7.76 avg = 6.98 mobilenet_v2 min = 6.73 max = 6.98 avg = 6.91 mobilenet_v3 min = 8.58 max = 8.77 avg = 8.71 shufflenet min = 7.33 max = 7.43 avg = 7.39 shufflenet_v2 min = 7.59 max = 8.46 avg = 8.27 mnasnet min = 4.78 max = 6.81 avg = 5.41 proxylessnasnet min = 7.39 max = 7.65 avg = 7.52 efficientnet_b0 min = 10.81 max = 15.28 avg = 12.27 efficientnetv2_b0 min = 46.58 max = 48.56 avg = 47.70 regnety_400m min = 9.86 max = 10.46 avg = 10.04 blazeface min = 3.98 max = 4.66 avg = 4.31 googlenet min = 10.01 max = 14.44 avg = 11.48 googlenet_int8 min = 18.07 max = 19.55 avg = 18.65 resnet18 min = 6.52 max = 9.73 avg = 8.26 resnet18_int8 min = 13.28 max = 20.58 avg = 14.96 alexnet min = 8.71 max = 9.05 avg = 8.84 vgg16 min = 19.28 max = 19.49 avg = 19.35 vgg16_int8 min = 98.14 max = 100.92 avg = 99.76 resnet50 min = 9.25 max = 9.37 avg = 9.31 resnet50_int8 min = 31.16 max = 34.44 avg = 32.59 squeezenet_ssd min = 13.60 max = 18.96 avg = 16.68 squeezenet_ssd_int8 min = 17.81 max = 19.83 avg = 18.75 mobilenet_ssd min = 11.88 max = 13.86 avg = 13.27 mobilenet_ssd_int8 min = 14.05 max = 21.16 avg = 15.64 mobilenet_yolo min = 14.18 max = 14.41 avg = 14.26 mobilenetv2_yolov3 min = 16.65 max = 18.78 avg = 18.06 yolov4-tiny min = 25.60 max = 26.56 avg = 25.92 nanodet_m min = 15.71 max = 19.89 avg = 19.03 yolo-fastest-1.1 min = 8.72 max = 9.18 avg = 8.96 yolo-fastestv2 min = 7.97 max = 8.10 avg = 8.04 vision_transformer min = 821.34 max = 825.91 avg = 823.26 FastestDet min = 7.72 max = 8.15 avg = 7.81 orin@nano:~/ncnn/benchmark$ ./benchncnn 8 1 0 0 1 [0 NVIDIA Tegra Orin (nvgpu)] queueC=2[8] queueG=0[16] queueT=1[2] [0 NVIDIA Tegra Orin (nvgpu)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA Tegra Orin (nvgpu)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA Tegra Orin (nvgpu)] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [0 NVIDIA Tegra Orin (nvgpu)] fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1 loop_count = 8 num_threads = 1 
powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 5.05 max = 5.23 avg = 5.09 squeezenet_int8 min = 15.93 max = 16.09 avg = 16.00 mobilenet min = 2.97 max = 5.49 avg = 3.84 mobilenet_int8 min = 23.27 max = 23.38 avg = 23.33 mobilenet_v2 min = 3.61 max = 4.01 avg = 3.83 mobilenet_v3 min = 6.12 max = 8.36 avg = 6.67 shufflenet min = 4.07 max = 7.25 avg = 6.22 shufflenet_v2 min = 8.49 max = 8.82 avg = 8.67 mnasnet min = 3.70 max = 8.23 avg = 5.37 proxylessnasnet min = 6.36 max = 9.16 avg = 7.52 efficientnet_b0 min = 10.55 max = 10.81 avg = 10.65 efficientnetv2_b0 min = 28.22 max = 28.62 avg = 28.54 regnety_400m min = 7.22 max = 10.04 avg = 8.50 blazeface min = 3.70 max = 3.86 avg = 3.76 googlenet min = 7.18 max = 9.76 avg = 8.21 googlenet_int8 min = 63.19 max = 63.54 avg = 63.32 resnet18 min = 4.67 max = 4.73 avg = 4.70 resnet18_int8 min = 50.51 max = 50.81 avg = 50.65 alexnet min = 8.56 max = 10.64 avg = 9.02 vgg16 min = 19.24 max = 19.50 avg = 19.31 vgg16_int8 min = 411.02 max = 412.40 avg = 411.60 resnet50 min = 9.14 max = 9.52 avg = 9.41 resnet50_int8 min = 112.04 max = 112.43 avg = 112.25 squeezenet_ssd min = 13.23 max = 13.79 avg = 13.52 squeezenet_ssd_int8 min = 46.52 max = 46.98 avg = 46.77 mobilenet_ssd min = 8.89 max = 12.51 avg = 9.95 mobilenet_ssd_int8 min = 47.66 max = 48.73 avg = 48.13 mobilenet_yolo min = 9.68 max = 9.75 avg = 9.70 mobilenetv2_yolov3 min = 15.84 max = 17.54 avg = 16.83 yolov4-tiny min = 23.32 max = 25.49 avg = 24.56 nanodet_m min = 13.59 max = 19.53 avg = 15.85 yolo-fastest-1.1 min = 7.68 max = 11.32 avg = 8.20 yolo-fastestv2 min = 7.75 max = 7.84 avg = 7.78 vision_transformer min = 822.27 max = 829.73 avg = 825.74 FastestDet min = 7.51 max = 8.05 avg = 7.68 orin@nano:~/ncnn/benchmark$ ./benchncnn 8 6 0 -1 1 loop_count = 8 num_threads = 6 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 5.07 max = 6.99 avg = 5.69 squeezenet_int8 min = 5.08 max = 5.79 avg = 5.42 mobilenet min = 6.96 max = 8.20 avg = 7.45 
mobilenet_int8 min = 5.91 max = 7.33 avg = 6.37 mobilenet_v2 min = 5.86 max = 7.55 avg = 6.51 mobilenet_v3 min = 5.60 max = 7.22 avg = 6.14 shufflenet min = 5.20 max = 5.79 avg = 5.44 shufflenet_v2 min = 4.56 max = 5.90 avg = 4.86 mnasnet min = 5.43 max = 6.44 avg = 5.83 proxylessnasnet min = 5.92 max = 8.70 avg = 6.83 efficientnet_b0 min = 10.09 max = 11.57 avg = 10.65 efficientnetv2_b0 min = 12.79 max = 15.96 avg = 14.12 regnety_400m min = 14.04 max = 21.23 avg = 15.88 blazeface min = 1.76 max = 1.90 avg = 1.81 googlenet min = 19.45 max = 25.43 avg = 21.21 googlenet_int8 min = 17.67 max = 18.59 avg = 18.20 resnet18 min = 12.26 max = 19.47 avg = 15.13 resnet18_int8 min = 13.02 max = 14.78 avg = 13.86 alexnet min = 12.27 max = 19.18 avg = 15.02 vgg16 min = 59.43 max = 89.43 avg = 65.11 vgg16_int8 min = 97.71 max = 141.28 avg = 108.00 resnet50 min = 38.69 max = 40.67 avg = 39.26 resnet50_int8 min = 28.67 max = 31.63 avg = 29.93 squeezenet_ssd min = 14.52 max = 26.92 avg = 17.89 squeezenet_ssd_int8 min = 16.61 max = 19.27 avg = 17.82 mobilenet_ssd min = 16.61 max = 22.65 avg = 17.89 mobilenet_ssd_int8 min = 13.22 max = 14.83 avg = 14.04 mobilenet_yolo min = 40.10 max = 44.28 avg = 41.48 mobilenetv2_yolov3 min = 21.48 max = 22.83 avg = 22.01 yolov4-tiny min = 33.30 max = 37.31 avg = 34.59 nanodet_m min = 10.80 max = 12.62 avg = 11.54 yolo-fastest-1.1 min = 5.51 max = 6.03 avg = 5.75 yolo-fastestv2 min = 4.98 max = 6.35 avg = 5.44 vision_transformer min = 610.40 max = 681.89 avg = 628.84 FastestDet min = 4.82 max = 6.19 avg = 5.32 orin@nano:~/ncnn/benchmark$ ./benchncnn 8 1 0 -1 1 loop_count = 8 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 15.94 max = 16.23 avg = 16.04 squeezenet_int8 min = 15.91 max = 16.09 avg = 15.98 mobilenet min = 28.77 max = 28.91 avg = 28.83 mobilenet_int8 min = 23.29 max = 23.63 avg = 23.46 mobilenet_v2 min = 19.32 max = 19.43 avg = 19.37 mobilenet_v3 min = 16.57 max = 16.65 avg = 16.61 shufflenet min = 10.39 
max = 10.48 avg = 10.44 shufflenet_v2 min = 10.61 max = 10.69 avg = 10.65 mnasnet min = 18.61 max = 18.69 avg = 18.65 proxylessnasnet min = 21.97 max = 22.17 avg = 22.05 efficientnet_b0 min = 36.73 max = 36.89 avg = 36.83 efficientnetv2_b0 min = 41.72 max = 41.97 avg = 41.83 regnety_400m min = 25.71 max = 26.03 avg = 25.85 blazeface min = 3.59 max = 3.63 avg = 3.60 googlenet min = 66.85 max = 67.38 avg = 67.12 googlenet_int8 min = 63.65 max = 63.85 avg = 63.74 resnet18 min = 48.49 max = 49.21 avg = 48.83 resnet18_int8 min = 50.82 max = 51.16 avg = 50.92 alexnet min = 57.67 max = 58.24 avg = 58.03 vgg16 min = 280.03 max = 281.34 avg = 280.77 vgg16_int8 min = 413.51 max = 414.67 avg = 414.08 resnet50 min = 138.19 max = 138.94 avg = 138.48 resnet50_int8 min = 112.53 max = 112.86 avg = 112.68 squeezenet_ssd min = 46.26 max = 46.46 avg = 46.37 squeezenet_ssd_int8 min = 47.56 max = 48.33 avg = 47.85 mobilenet_ssd min = 60.51 max = 60.81 avg = 60.68 mobilenet_ssd_int8 min = 47.47 max = 47.76 avg = 47.58 mobilenet_yolo min = 136.20 max = 136.54 avg = 136.37 mobilenetv2_yolov3 min = 69.80 max = 70.04 avg = 69.93 yolov4-tiny min = 87.71 max = 88.63 avg = 88.12 nanodet_m min = 25.73 max = 26.06 avg = 25.85 yolo-fastest-1.1 min = 10.25 max = 10.35 avg = 10.29 yolo-fastestv2 min = 9.25 max = 9.38 avg = 9.33 vision_transformer min = 2282.07 max = 2690.34 avg = 2481.94 FastestDet min = 9.80 max = 9.88 avg = 9.84 ``` ### NVIDIA Jetson Nano ``` [0 NVIDIA Tegra X1 (nvgpu)] queueC=0[16] queueG=0[16] queueT=0[16] [0 NVIDIA Tegra X1 (nvgpu)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA Tegra X1 (nvgpu)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA Tegra X1 (nvgpu)] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 8 num_threads = 4 powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 12.15 max = 26.48 avg = 18.11 squeezenet_int8 min = 27.60 max = 42.50 avg = 29.89 mobilenet min = 16.07 max = 16.10 avg = 16.09 mobilenet_int8 min = 30.65 max = 32.15 avg = 31.07 
mobilenet_v2 min = 12.87 max = 13.15 avg = 12.99 mobilenet_v3 min = 13.32 max = 16.65 avg = 14.57 shufflenet min = 14.21 max = 14.34 avg = 14.29 shufflenet_v2 min = 13.03 max = 21.97 avg = 19.02 mnasnet min = 13.33 max = 13.64 avg = 13.49 proxylessnasnet min = 14.65 max = 14.91 avg = 14.76 efficientnet_b0 min = 21.26 max = 21.41 avg = 21.35 efficientnetv2_b0 min = 54.66 max = 60.81 avg = 57.16 regnety_400m min = 17.91 max = 18.08 avg = 18.01 blazeface min = 6.87 max = 7.03 avg = 6.94 googlenet min = 43.30 max = 43.54 avg = 43.43 googlenet_int8 min = 80.07 max = 84.28 avg = 81.10 resnet18 min = 43.89 max = 44.06 avg = 43.98 resnet18_int8 min = 60.70 max = 63.43 avg = 61.60 alexnet min = 74.21 max = 75.20 avg = 74.45 vgg16 min = 310.39 max = 310.65 avg = 310.52 vgg16_int8 min = 293.15 max = 297.28 avg = 294.93 resnet50 min = 93.03 max = 93.22 avg = 93.12 resnet50_int8 min = 158.54 max = 161.25 avg = 159.56 squeezenet_ssd min = 55.88 max = 57.43 avg = 56.46 squeezenet_ssd_int8 min = 72.42 max = 73.25 avg = 72.73 mobilenet_ssd min = 35.38 max = 37.57 avg = 36.63 mobilenet_ssd_int8 min = 62.92 max = 64.97 avg = 63.63 mobilenet_yolo min = 76.56 max = 80.44 avg = 78.05 mobilenetv2_yolov3 min = 46.35 max = 48.14 avg = 47.26 yolov4-tiny min = 95.38 max = 97.55 avg = 96.45 nanodet_m min = 22.82 max = 26.01 avg = 24.48 yolo-fastest-1.1 min = 20.23 max = 25.51 avg = 21.52 yolo-fastestv2 min = 20.67 max = 20.82 avg = 20.75 [0 NVIDIA Tegra X1 (nvgpu)] queueC=0[16] queueG=0[16] queueT=0[16] [0 NVIDIA Tegra X1 (nvgpu)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA Tegra X1 (nvgpu)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA Tegra X1 (nvgpu)] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 8 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 12.00 max = 15.41 avg = 13.55 squeezenet_int8 min = 78.76 max = 79.14 avg = 78.91 mobilenet min = 16.03 max = 16.25 avg = 16.15 mobilenet_int8 min = 107.58 max = 107.68 avg = 107.61 mobilenet_v2 min = 
12.84 max = 13.13 avg = 12.99 mobilenet_v3 min = 13.29 max = 16.64 avg = 14.38 shufflenet min = 14.23 max = 14.54 avg = 14.34 shufflenet_v2 min = 12.94 max = 13.21 avg = 13.02 mnasnet min = 13.42 max = 13.66 avg = 13.53 proxylessnasnet min = 14.64 max = 14.94 avg = 14.76 efficientnet_b0 min = 21.28 max = 21.51 avg = 21.36 efficientnetv2_b0 min = 74.32 max = 78.50 avg = 77.79 regnety_400m min = 17.94 max = 18.26 avg = 18.07 blazeface min = 6.83 max = 6.94 avg = 6.89 googlenet min = 43.45 max = 43.63 avg = 43.52 googlenet_int8 min = 255.68 max = 256.33 avg = 255.92 resnet18 min = 43.96 max = 44.06 avg = 44.01 resnet18_int8 min = 192.01 max = 192.64 avg = 192.33 alexnet min = 74.04 max = 74.23 avg = 74.14 vgg16 min = 310.32 max = 310.64 avg = 310.44 vgg16_int8 min = 1003.05 max = 1004.27 avg = 1003.66 resnet50 min = 93.05 max = 93.34 avg = 93.21 resnet50_int8 min = 516.27 max = 517.12 avg = 516.69 squeezenet_ssd min = 56.67 max = 56.86 avg = 56.73 squeezenet_ssd_int8 min = 182.96 max = 184.26 avg = 183.71 mobilenet_ssd min = 35.61 max = 35.70 avg = 35.65 mobilenet_ssd_int8 min = 217.02 max = 217.50 avg = 217.23 mobilenet_yolo min = 78.10 max = 78.36 avg = 78.20 mobilenetv2_yolov3 min = 49.86 max = 57.83 avg = 53.18 yolov4-tiny min = 96.76 max = 96.86 avg = 96.82 nanodet_m min = 25.26 max = 25.36 avg = 25.31 yolo-fastest-1.1 min = 21.55 max = 24.22 avg = 23.78 yolo-fastestv2 min = 20.80 max = 21.01 avg = 20.90 loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 30.03 max = 31.41 avg = 30.59 squeezenet_int8 min = 27.32 max = 27.76 avg = 27.50 mobilenet min = 41.74 max = 42.57 avg = 42.05 mobilenet_int8 min = 30.48 max = 31.57 avg = 30.85 mobilenet_v2 min = 33.49 max = 34.18 avg = 33.83 mobilenet_v3 min = 30.59 max = 30.96 avg = 30.79 shufflenet min = 21.07 max = 31.68 avg = 22.53 shufflenet_v2 min = 19.55 max = 20.01 avg = 19.71 mnasnet min = 31.70 max = 32.26 avg = 31.93 proxylessnasnet min = 36.90 max = 38.55 avg = 37.27 
efficientnet_b0 min = 68.42 max = 77.60 avg = 70.60 efficientnetv2_b0 min = 73.72 max = 81.05 avg = 75.31 regnety_400m min = 56.67 max = 66.82 avg = 58.24 blazeface min = 6.55 max = 6.96 avg = 6.74 googlenet min = 92.74 max = 94.22 avg = 93.12 googlenet_int8 min = 80.86 max = 87.28 avg = 82.41 resnet18 min = 83.10 max = 84.30 avg = 83.44 resnet18_int8 min = 59.40 max = 65.86 avg = 60.70 alexnet min = 89.21 max = 92.45 avg = 89.98 vgg16 min = 445.72 max = 451.09 avg = 447.39 vgg16_int8 min = 292.81 max = 295.55 avg = 294.34 resnet50 min = 203.42 max = 204.45 avg = 204.08 resnet50_int8 min = 157.87 max = 160.30 avg = 158.67 squeezenet_ssd min = 85.60 max = 87.24 avg = 86.18 squeezenet_ssd_int8 min = 73.10 max = 85.64 avg = 74.94 mobilenet_ssd min = 86.75 max = 96.51 avg = 88.49 mobilenet_ssd_int8 min = 63.40 max = 71.57 avg = 64.97 mobilenet_yolo min = 193.84 max = 195.24 avg = 194.62 mobilenetv2_yolov3 min = 115.80 max = 117.27 avg = 116.27 yolov4-tiny min = 156.30 max = 158.26 avg = 156.81 nanodet_m min = 46.64 max = 47.97 avg = 47.12 yolo-fastest-1.1 min = 25.78 max = 27.86 avg = 26.29 yolo-fastestv2 min = 20.54 max = 30.73 avg = 22.18 loop_count = 8 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 85.91 max = 86.86 avg = 86.14 squeezenet_int8 min = 77.57 max = 78.10 avg = 77.69 mobilenet min = 137.43 max = 138.03 avg = 137.63 mobilenet_int8 min = 108.06 max = 108.21 avg = 108.13 mobilenet_v2 min = 93.81 max = 94.70 avg = 93.99 mobilenet_v3 min = 81.77 max = 82.49 avg = 81.99 shufflenet min = 47.84 max = 48.46 avg = 48.17 shufflenet_v2 min = 47.93 max = 48.23 avg = 48.09 mnasnet min = 91.73 max = 92.55 avg = 91.98 proxylessnasnet min = 115.41 max = 115.75 avg = 115.56 efficientnet_b0 min = 225.64 max = 226.21 avg = 225.94 efficientnetv2_b0 min = 239.71 max = 240.20 avg = 239.89 regnety_400m min = 118.46 max = 118.84 avg = 118.61 blazeface min = 15.58 max = 17.14 avg = 16.21 googlenet min = 286.85 max = 287.51 avg = 287.11 
googlenet_int8 min = 256.44 max = 256.74 avg = 256.53 resnet18 min = 221.27 max = 221.93 avg = 221.60 resnet18_int8 min = 189.95 max = 191.34 avg = 190.74 alexnet min = 284.30 max = 285.40 avg = 284.87 vgg16 min = 1241.51 max = 1244.53 avg = 1242.90 vgg16_int8 min = 1003.92 max = 1004.47 avg = 1004.29 resnet50 min = 624.43 max = 625.34 avg = 624.84 resnet50_int8 min = 516.64 max = 517.26 avg = 516.99 squeezenet_ssd min = 190.21 max = 191.35 avg = 190.71 squeezenet_ssd_int8 min = 182.97 max = 184.19 avg = 183.38 mobilenet_ssd min = 275.60 max = 276.17 avg = 275.90 mobilenet_ssd_int8 min = 216.67 max = 217.58 avg = 216.94 mobilenet_yolo min = 616.16 max = 617.45 avg = 616.71 mobilenetv2_yolov3 min = 324.88 max = 325.73 avg = 325.19 yolov4-tiny min = 421.01 max = 423.52 avg = 422.14 nanodet_m min = 117.39 max = 117.75 avg = 117.54 yolo-fastest-1.1 min = 54.55 max = 55.61 avg = 54.87 yolo-fastestv2 min = 44.40 max = 44.78 avg = 44.57 ``` ### NVIDIA Jetson TX2 NX (NV-Denver2 2.0GHz x 2 + Cortex-A57 2.0GHz x 4 + 256-core NVIDIA Pascal iGPU) ``` fan@ubuntu:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 $(nproc) 0 0 [0 NVIDIA Tegra X2 (nvgpu)] queueC=0[16] queueG=0[16] queueT=0[16] [0 NVIDIA Tegra X2 (nvgpu)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA Tegra X2 (nvgpu)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA Tegra X2 (nvgpu)] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [0 NVIDIA Tegra X2 (nvgpu)] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 10 num_threads = 6 powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 4.84 max = 6.12 avg = 5.33 squeezenet_int8 min = 23.14 max = 148.62 avg = 52.65 mobilenet min = 7.23 max = 7.57 avg = 7.40 mobilenet_int8 min = 19.69 max = 101.50 avg = 44.15 mobilenet_v2 min = 6.65 max = 6.86 avg = 6.76 mobilenet_v3 min = 7.22 max = 8.34 avg = 8.01 shufflenet min = 6.14 max = 6.73 avg = 6.51 shufflenet_v2 min = 5.33 max = 5.43 avg = 5.39 mnasnet min = 6.98 max = 7.47 avg = 7.16 proxylessnasnet min = 6.90 max = 
7.52 avg = 7.09 efficientnet_b0 min = 11.42 max = 11.89 avg = 11.67 efficientnetv2_b0 min = 26.48 max = 51.57 avg = 36.25 regnety_400m min = 8.94 max = 9.45 avg = 9.13 blazeface min = 2.08 max = 3.21 avg = 2.42 googlenet min = 15.33 max = 15.78 avg = 15.53 googlenet_int8 min = 64.02 max = 158.22 avg = 79.32 resnet18 min = 12.25 max = 13.28 avg = 12.78 resnet18_int8 min = 41.89 max = 156.59 avg = 57.07 alexnet min = 20.15 max = 20.51 avg = 20.32 vgg16 min = 62.45 max = 64.63 avg = 63.06 vgg16_int8 min = 198.24 max = 271.71 avg = 217.63 resnet50 min = 30.05 max = 31.11 avg = 30.39 resnet50_int8 min = 129.03 max = 205.33 avg = 154.72 squeezenet_ssd min = 18.48 max = 22.90 avg = 20.26 squeezenet_ssd_int8 min = 48.18 max = 71.20 avg = 60.89 mobilenet_ssd min = 15.56 max = 15.76 avg = 15.67 mobilenet_ssd_int8 min = 55.10 max = 114.34 avg = 67.41 mobilenet_yolo min = 28.75 max = 32.54 avg = 30.30 mobilenetv2_yolov3 min = 26.15 max = 32.36 avg = 29.57 yolov4-tiny min = 23.08 max = 37.19 avg = 25.43 nanodet_m min = 15.81 max = 19.99 avg = 18.10 yolo-fastest-1.1 min = 7.35 max = 11.26 avg = 8.69 yolo-fastestv2 min = 6.16 max = 6.61 avg = 6.31 vision_transformer min = 1301.45 max = 1356.58 avg = 1321.51 FastestDet min = 5.64 max = 6.60 avg = 5.90 fan@ubuntu:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 1 0 0 [0 NVIDIA Tegra X2 (nvgpu)] queueC=0[16] queueG=0[16] queueT=0[16] [0 NVIDIA Tegra X2 (nvgpu)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA Tegra X2 (nvgpu)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA Tegra X2 (nvgpu)] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [0 NVIDIA Tegra X2 (nvgpu)] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 5.10 max = 6.33 avg = 5.51 squeezenet_int8 min = 56.36 max = 59.23 avg = 57.79 mobilenet min = 6.61 max = 9.93 avg = 7.27 mobilenet_int8 min = 95.73 max = 107.69 avg = 99.35 mobilenet_v2 min = 6.66 max = 9.87 avg = 7.22 mobilenet_v3 min = 
7.20 max = 8.77 avg = 7.61 shufflenet min = 5.87 max = 6.13 avg = 5.97 shufflenet_v2 min = 5.63 max = 8.24 avg = 6.10 mnasnet min = 6.55 max = 9.05 avg = 7.10 proxylessnasnet min = 7.29 max = 7.86 avg = 7.50 efficientnet_b0 min = 11.22 max = 12.13 avg = 11.49 efficientnetv2_b0 min = 20.21 max = 24.55 avg = 21.42 regnety_400m min = 8.94 max = 10.77 avg = 9.37 blazeface min = 2.30 max = 2.45 avg = 2.35 googlenet min = 15.48 max = 17.88 avg = 16.32 googlenet_int8 min = 197.08 max = 205.18 avg = 200.93 resnet18 min = 12.69 max = 13.38 avg = 13.01 resnet18_int8 min = 147.42 max = 154.63 avg = 149.94 alexnet min = 20.49 max = 20.83 avg = 20.62 vgg16 min = 62.43 max = 63.41 avg = 62.81 vgg16_int8 min = 802.28 max = 810.33 avg = 805.66 resnet50 min = 29.96 max = 30.56 avg = 30.26 resnet50_int8 min = 488.38 max = 494.67 avg = 491.09 squeezenet_ssd min = 18.35 max = 18.84 avg = 18.59 squeezenet_ssd_int8 min = 121.27 max = 124.52 avg = 122.21 mobilenet_ssd min = 15.13 max = 15.60 avg = 15.30 mobilenet_ssd_int8 min = 206.22 max = 225.98 avg = 222.55 mobilenet_yolo min = 30.12 max = 31.28 avg = 30.41 mobilenetv2_yolov3 min = 26.65 max = 27.08 avg = 26.87 yolov4-tiny min = 22.91 max = 23.32 avg = 23.04 nanodet_m min = 11.57 max = 11.99 avg = 11.75 yolo-fastest-1.1 min = 7.06 max = 7.49 avg = 7.25 yolo-fastestv2 min = 6.17 max = 6.65 avg = 6.34 vision_transformer min = 1185.13 max = 1193.94 avg = 1189.50 FastestDet min = 5.78 max = 6.87 avg = 6.11 fan@ubuntu:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 $(nproc) 0 -1 loop_count = 10 num_threads = 6 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 19.92 max = 22.96 avg = 21.43 squeezenet_int8 min = 20.33 max = 25.17 avg = 22.63 mobilenet min = 27.25 max = 80.19 avg = 36.64 mobilenet_int8 min = 21.22 max = 31.14 avg = 27.05 mobilenet_v2 min = 21.95 max = 25.77 avg = 24.10 mobilenet_v3 min = 20.10 max = 34.13 avg = 25.30 shufflenet min = 14.96 max = 108.36 avg = 28.88 shufflenet_v2 min = 13.25 max = 29.33 avg = 
16.43 mnasnet min = 19.41 max = 111.63 avg = 30.57 proxylessnasnet min = 22.58 max = 27.29 avg = 24.43 efficientnet_b0 min = 32.95 max = 35.53 avg = 34.46 efficientnetv2_b0 min = 36.91 max = 52.12 avg = 41.72 regnety_400m min = 43.87 max = 152.33 avg = 56.15 blazeface min = 4.51 max = 16.71 avg = 6.79 googlenet min = 59.37 max = 93.96 avg = 70.88 googlenet_int8 min = 57.95 max = 124.06 avg = 71.47 resnet18 min = 51.99 max = 134.81 avg = 68.50 resnet18_int8 min = 40.54 max = 130.18 avg = 54.10 alexnet min = 41.42 max = 67.03 avg = 52.66 vgg16 min = 253.75 max = 295.39 avg = 265.01 vgg16_int8 min = 183.96 max = 334.83 avg = 206.81 resnet50 min = 305.79 max = 330.68 avg = 316.55 resnet50_int8 min = 120.10 max = 133.19 avg = 125.92 squeezenet_ssd min = 51.06 max = 125.69 avg = 67.34 squeezenet_ssd_int8 min = 44.56 max = 156.68 avg = 61.47 mobilenet_ssd min = 52.27 max = 123.50 avg = 64.86 mobilenet_ssd_int8 min = 48.18 max = 183.44 avg = 63.25 mobilenet_yolo min = 120.27 max = 160.73 avg = 130.75 mobilenetv2_yolov3 min = 74.39 max = 167.08 avg = 86.50 yolov4-tiny min = 108.39 max = 123.62 avg = 112.81 nanodet_m min = 32.38 max = 91.62 avg = 42.01 yolo-fastest-1.1 min = 17.97 max = 157.78 avg = 34.93 yolo-fastestv2 min = 16.12 max = 19.55 avg = 18.03 vision_transformer min = 2317.30 max = 2437.95 avg = 2375.98 FastestDet min = 15.52 max = 127.95 avg = 27.40 fan@ubuntu:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 1 0 -1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 48.72 max = 50.66 avg = 49.98 squeezenet_int8 min = 56.50 max = 61.58 avg = 58.64 mobilenet min = 88.10 max = 89.76 avg = 88.92 mobilenet_int8 min = 95.08 max = 96.92 avg = 95.82 mobilenet_v2 min = 58.72 max = 61.48 avg = 59.54 mobilenet_v3 min = 48.58 max = 49.95 avg = 49.24 shufflenet min = 30.42 max = 32.03 avg = 31.17 shufflenet_v2 min = 28.27 max = 29.37 avg = 28.65 mnasnet min = 56.85 max = 58.22 avg = 57.37 proxylessnasnet min = 68.67 max = 71.23 
avg = 69.64 efficientnet_b0 min = 89.27 max = 92.67 avg = 90.33 efficientnetv2_b0 min = 107.72 max = 109.86 avg = 108.53 regnety_400m min = 85.19 max = 91.74 avg = 86.95 blazeface min = 8.60 max = 8.80 avg = 8.71 googlenet min = 161.58 max = 166.70 avg = 163.60 googlenet_int8 min = 183.79 max = 189.43 avg = 186.17 resnet18 min = 123.43 max = 126.29 avg = 124.86 resnet18_int8 min = 140.80 max = 144.92 avg = 142.60 alexnet min = 93.16 max = 100.47 avg = 96.44 vgg16 min = 664.14 max = 671.67 avg = 667.90 vgg16_int8 min = 799.67 max = 813.66 avg = 803.50 resnet50 min = 384.10 max = 388.46 avg = 386.49 resnet50_int8 min = 448.11 max = 473.27 avg = 465.12 squeezenet_ssd min = 106.58 max = 109.62 avg = 107.39 squeezenet_ssd_int8 min = 118.39 max = 122.62 avg = 120.43 mobilenet_ssd min = 178.89 max = 183.37 avg = 180.47 mobilenet_ssd_int8 min = 201.46 max = 207.18 avg = 203.00 mobilenet_yolo min = 407.54 max = 411.12 avg = 409.33 mobilenetv2_yolov3 min = 211.83 max = 214.46 avg = 213.20 yolov4-tiny min = 249.11 max = 254.22 avg = 251.38 nanodet_m min = 69.41 max = 71.26 avg = 70.28 yolo-fastest-1.1 min = 30.99 max = 33.29 avg = 32.03 yolo-fastestv2 min = 27.70 max = 28.90 avg = 27.93 vision_transformer min = 3203.45 max = 3402.10 avg = 3286.58 FastestDet min = 29.05 max = 32.57 avg = 30.53 ``` ### Rockchip RK3288-CG.W (Cortex-A17 1.8GHz x 4) ``` WW_Tinker_Board:/data/local/tmp # ./benchncnn 8 4 0 -1 1 loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 56.61 max = 56.80 avg = 56.69 squeezenet_int8 min = 40.63 max = 41.05 avg = 40.89 mobilenet min = 83.91 max = 84.59 avg = 84.23 mobilenet_int8 min = 36.15 max = 36.44 avg = 36.25 mobilenet_v2 min = 71.12 max = 71.73 avg = 71.54 mobilenet_v3 min = 56.08 max = 56.56 avg = 56.28 shufflenet min = 37.39 max = 37.75 avg = 37.55 shufflenet_v2 min = 35.19 max = 35.52 avg = 35.34 mnasnet min = 62.08 max = 62.36 avg = 62.24 proxylessnasnet min = 66.98 max = 67.38 avg = 67.16 efficientnet_b0 
min = 109.95 max = 110.71 avg = 110.15 efficientnetv2_b0 min = 122.56 max = 123.31 avg = 122.94 regnety_400m min = 88.84 max = 89.19 avg = 88.99 blazeface min = 11.79 max = 11.92 avg = 11.85 googlenet min = 162.56 max = 165.39 avg = 163.19 googlenet_int8 min = 110.35 max = 110.91 avg = 110.60 resnet18 min = 172.39 max = 173.99 avg = 173.24 resnet18_int8 min = 84.00 max = 84.40 avg = 84.19 alexnet min = 156.71 max = 158.23 avg = 157.59 vgg16 min = 956.95 max = 964.32 avg = 960.60 vgg16_int8 min = 388.10 max = 389.52 avg = 388.68 resnet50 min = 403.05 max = 404.80 avg = 404.01 resnet50_int8 min = 205.12 max = 207.42 avg = 206.19 squeezenet_ssd min = 163.61 max = 165.79 avg = 164.93 squeezenet_ssd_int8 min = 125.88 max = 126.35 avg = 126.12 mobilenet_ssd min = 175.97 max = 176.86 avg = 176.39 mobilenet_ssd_int8 min = 76.90 max = 77.74 avg = 77.35 mobilenet_yolo min = 385.59 max = 387.19 avg = 386.60 mobilenetv2_yolov3 min = 234.88 max = 236.22 avg = 235.66 yolov4-tiny min = 307.44 max = 310.64 avg = 308.54 nanodet_m min = 92.54 max = 93.15 avg = 92.82 yolo-fastest-1.1 min = 46.69 max = 47.02 avg = 46.83 yolo-fastestv2 min = 38.37 max = 38.68 avg = 38.54 WW_Tinker_Board:/data/local/tmp # ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 138.27 max = 138.57 avg = 138.41 squeezenet_int8 min = 85.97 max = 86.23 avg = 86.05 mobilenet min = 234.90 max = 235.08 avg = 235.00 mobilenet_int8 min = 99.92 max = 100.45 avg = 100.12 mobilenet_v2 min = 157.76 max = 157.99 avg = 157.86 mobilenet_v3 min = 130.05 max = 130.23 avg = 130.17 shufflenet min = 74.48 max = 74.62 avg = 74.55 shufflenet_v2 min = 74.05 max = 74.25 avg = 74.13 mnasnet min = 150.74 max = 151.03 avg = 150.87 proxylessnasnet min = 171.09 max = 171.23 avg = 171.16 efficientnet_b0 min = 306.85 max = 307.02 avg = 306.97 efficientnetv2_b0 min = 347.40 max = 347.87 avg = 347.64 regnety_400m min = 190.26 max = 190.33 avg = 190.29 blazeface min = 25.25 
max = 25.68 avg = 25.47 googlenet min = 432.09 max = 432.48 avg = 432.32 googlenet_int8 min = 275.55 max = 276.07 avg = 275.88 resnet18 min = 355.11 max = 358.56 avg = 356.90 resnet18_int8 min = 205.80 max = 206.68 avg = 206.26 alexnet min = 330.09 max = 330.29 avg = 330.15 vgg16 min = 2122.95 max = 2124.45 avg = 2123.68 vgg16_int8 min = 1048.53 max = 1049.29 avg = 1048.86 resnet50 min = 1047.27 max = 1048.33 avg = 1047.63 resnet50_int8 min = 517.75 max = 519.28 avg = 518.81 squeezenet_ssd min = 304.69 max = 305.75 avg = 305.16 squeezenet_ssd_int8 min = 219.16 max = 219.94 avg = 219.45 mobilenet_ssd min = 483.73 max = 484.12 avg = 484.01 mobilenet_ssd_int8 min = 208.89 max = 209.19 avg = 209.09 mobilenet_yolo min = 1092.75 max = 1093.70 avg = 1093.13 mobilenetv2_yolov3 min = 560.66 max = 560.92 avg = 560.77 yolov4-tiny min = 704.69 max = 705.38 avg = 705.12 nanodet_m min = 187.13 max = 187.57 avg = 187.39 yolo-fastest-1.1 min = 83.05 max = 83.11 avg = 83.08 yolo-fastestv2 min = 72.19 max = 72.23 avg = 72.21 WW_Tinker_Board:/data/local/tmp # ./benchncnn 4 1 0 0 0 [0 Mali-T760] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-T760] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=1 [0 Mali-T760] fp16-p/s/a=1/0/1 int8-p/s/a=1/0/0 [0 Mali-T760] subgroup=0 basic=0 vote=0 ballot=0 shuffle=0 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 41.78 max = 41.82 avg = 41.79 mobilenet min = 62.67 max = 62.80 avg = 62.74 mobilenet_v2 min = 51.08 max = 51.26 avg = 51.17 mobilenet_v3 min = 51.43 max = 51.70 avg = 51.51 shufflenet min = 56.83 max = 56.94 avg = 56.87 shufflenet_v2 min = 48.46 max = 48.63 avg = 48.53 mnasnet min = 52.31 max = 52.63 avg = 52.42 proxylessnasnet min = 57.33 max = 57.46 avg = 57.41 efficientnet_b0 min = 87.52 max = 87.80 avg = 87.62 efficientnetv2_b0 min = 123.83 max = 124.67 avg = 124.34 regnety_400m min = 65.52 max = 65.81 avg = 65.64 blazeface min = 14.56 max = 14.73 avg = 14.62 googlenet min = 138.52 max = 139.39 avg = 
138.89 resnet18 min = 124.45 max = 124.81 avg = 124.58 alexnet min = 130.46 max = 130.68 avg = 130.54 ``` ### HiSilicon Hi3519V101 (Cortex-A17 1.2GHz x 1) ``` root@Hi3519:/ncnn-benchmark # taskset 2 ./benchncnn 8 1 0 loop_count = 8 num_threads = 1 powersave = 0 squeezenet min = 272.97 max = 275.84 avg = 274.85 squeezenet-int8 min = 200.87 max = 202.47 avg = 201.74 mobilenet min = 480.90 max = 482.16 avg = 481.64 mobilenet_v2 min = 350.01 max = 352.39 avg = 350.81 shufflenet min = 152.40 max = 153.17 avg = 152.80 googlenet min = 1096.65 max = 1101.35 avg = 1099.21 resnet18 min = 983.92 max = 987.00 avg = 985.25 alexnet min = 1140.30 max = 1141.55 avg = 1140.92 squeezenet-ssd min = 574.62 max = 580.12 avg = 577.23 mobilenet-ssd min = 960.26 max = 969.13 avg = 965.93 mobilenet-yolo min = 1867.78 max = 1880.08 avg = 1873.89 ``` ### iPhone 5S (Apple A7 1.3GHz x 2) ``` iPhone:~ root# ./benchncnn 8 2 0 -1 [0 Apple A7 GPU] queueC=0[8] queueT=0[8] memU=1 memDL=1 memHV=1 [0 Apple A7 GPU] fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count = 8 num_threads = 2 powersave = 0 gpu_device = -1 squeezenet min = 49.21 max = 50.40 avg = 49.74 squeezenet_int8 min = 54.73 max = 57.39 avg = 56.70 mobilenet min = 79.03 max = 80.00 avg = 79.44 mobilenet_int8 min = 109.95 max = 112.69 avg = 111.38 mobilenet_v2 min = 57.34 max = 57.88 avg = 57.47 mobilenet_v3 min = 52.66 max = 53.73 avg = 53.12 shufflenet min = 32.78 max = 36.12 avg = 35.12 shufflenet_v2 min = 31.25 max = 32.10 avg = 31.61 mnasnet min = 54.58 max = 56.12 avg = 55.44 proxylessnasnet min = 69.52 max = 72.42 avg = 70.40 googlenet min = 192.82 max = 194.20 avg = 193.35 googlenet_int8 min = 235.43 max = 244.71 avg = 239.64 resnet18 min = 164.33 max = 167.27 avg = 165.51 resnet18_int8 min = 176.16 max = 179.73 avg = 178.60 alexnet min = 224.50 max = 228.21 avg = 226.51 vgg16 min = 4262.28 max = 4400.29 avg = 4300.34 vgg16_int8 min = 2835.84 max = 2955.22 avg = 2890.26 resnet50 min = 542.66 max = 1344.49 avg = 737.05 resnet50_int8 
min = 426.08 max = 435.34 avg = 431.87 squeezenet_ssd min = 129.03 max = 131.44 avg = 129.99 squeezenet_ssd_int8 min = 155.52 max = 161.42 avg = 158.51 mobilenet_ssd min = 168.18 max = 170.17 avg = 169.42 mobilenet_ssd_int8 min = 205.78 max = 212.07 avg = 209.66 mobilenet_yolo min = 347.32 max = 363.15 avg = 355.72 mobilenetv2_yolov3 min = 193.11 max = 196.64 avg = 194.31 iPhone:~ root# ./benchncnn 4 1 0 -1 [0 Apple A7 GPU] queueC=0[8] queueT=0[8] memU=1 memDL=1 memHV=1 [0 Apple A7 GPU] fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 squeezenet min = 86.36 max = 86.81 avg = 86.57 squeezenet_int8 min = 99.62 max = 100.07 avg = 99.83 mobilenet min = 143.11 max = 146.50 avg = 145.38 mobilenet_int8 min = 202.25 max = 203.32 avg = 203.02 mobilenet_v2 min = 97.56 max = 98.55 avg = 98.09 mobilenet_v3 min = 87.45 max = 87.68 avg = 87.52 shufflenet min = 54.01 max = 54.13 avg = 54.08 shufflenet_v2 min = 48.11 max = 48.65 avg = 48.36 mnasnet min = 95.02 max = 95.77 avg = 95.25 proxylessnasnet min = 123.91 max = 124.61 avg = 124.18 googlenet min = 344.23 max = 348.95 avg = 345.97 googlenet_int8 min = 420.30 max = 420.99 avg = 420.65 resnet18 min = 300.44 max = 301.36 avg = 300.99 resnet18_int8 min = 308.60 max = 310.52 avg = 309.70 alexnet min = 423.92 max = 429.84 avg = 427.24 vgg16 min = 4787.59 max = 5015.23 avg = 4900.43 vgg16_int8 min = 3560.59 max = 3722.75 avg = 3639.88 resnet50 min = 797.88 max = 1294.57 avg = 985.63 resnet50_int8 min = 751.15 max = 760.25 avg = 757.89 squeezenet_ssd min = 193.75 max = 196.13 avg = 195.29 squeezenet_ssd_int8 min = 243.78 max = 245.19 avg = 244.74 mobilenet_ssd min = 299.69 max = 307.22 avg = 305.12 mobilenet_ssd_int8 min = 385.91 max = 389.82 avg = 388.48 mobilenet_yolo min = 657.00 max = 659.31 avg = 658.08 mobilenetv2_yolov3 min = 335.59 max = 342.22 avg = 339.37 iPhone:~ root# ./benchncnn 4 1 0 0 [0 Apple A7 GPU] queueC=0[8] queueT=0[8] memU=1 memDL=1 memHV=1 [0 Apple A7 GPU] 
fp16p=1 fp16s=0 fp16a=0 int8s=0 int8a=0 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = 0 squeezenet min = 260.18 max = 262.55 avg = 261.09 mobilenet min = 288.73 max = 291.83 avg = 289.67 mobilenet_v2 min = 265.72 max = 267.05 avg = 266.14 mobilenet_v3 min = 255.86 max = 257.35 avg = 256.43 shufflenet min = 236.66 max = 239.49 avg = 237.98 shufflenet_v2 min = 244.92 max = 247.75 avg = 246.22 mnasnet min = 254.75 max = 256.48 avg = 255.85 proxylessnasnet min = 281.42 max = 282.62 avg = 282.11 googlenet min = 745.36 max = 764.91 avg = 754.16 resnet18 min = 721.26 max = 741.98 avg = 734.78 alexnet min = 521.43 max = 530.95 avg = 527.01 resnet50 min = 1494.86 max = 1505.79 avg = 1501.49 squeezenet_ssd min = 1096.45 max = 1102.84 avg = 1098.55 mobilenet_ssd min = 639.50 max = 641.81 avg = 640.83 mobilenet_yolo min = 1445.16 max = 1450.94 avg = 1447.42 mobilenetv2_yolov3 min = 1047.24 max = 1060.97 avg = 1052.86 ``` ### Freescale i.MX7 Dual (Cortex-A7 1.0GHz x 2) ``` imx7d_pico:/data/local/tmp $ ./benchncnn 8 2 0 -1 1 loop_count = 8 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 220.10 max = 226.46 avg = 222.89 squeezenet_int8 min = 159.26 max = 165.25 avg = 161.71 mobilenet min = 366.92 max = 373.78 avg = 371.55 mobilenet_int8 min = 223.14 max = 229.66 avg = 225.66 mobilenet_v2 min = 252.32 max = 259.41 avg = 255.54 mobilenet_v3 min = 214.05 max = 222.24 avg = 217.53 shufflenet min = 137.02 max = 144.79 avg = 138.85 shufflenet_v2 min = 134.89 max = 140.75 avg = 137.18 mnasnet min = 250.64 max = 256.75 avg = 253.33 proxylessnasnet min = 285.35 max = 291.43 avg = 288.37 efficientnet_b0 min = 430.47 max = 436.63 avg = 434.75 regnety_400m min = 317.69 max = 325.77 avg = 321.24 blazeface min = 42.93 max = 43.30 avg = 43.14 googlenet min = 721.84 max = 728.40 avg = 724.23 googlenet_int8 min = 504.07 max = 511.06 avg = 507.39 resnet18 min = 645.61 max = 653.08 avg = 648.51 resnet18_int8 min = 370.84 max = 514.38 avg = 392.80 alexnet 
min = 783.64 max = 794.83 avg = 786.95 squeezenet_ssd min = 508.71 max = 513.70 avg = 511.29 squeezenet_ssd_int8 min = 402.85 max = 409.32 avg = 406.45 mobilenet_ssd min = 763.70 max = 771.52 avg = 767.61 mobilenet_ssd_int8 min = 457.99 max = 460.85 avg = 459.76 mobilenet_yolo min = 1730.90 max = 1746.52 avg = 1741.26 mobilenetv2_yolov3 min = 884.00 max = 892.97 avg = 889.38 yolov4-tiny min = 1181.20 max = 1218.20 avg = 1202.28 nanodet_m min = 331.53 max = 339.89 avg = 334.62 imx7d_pico:/data/local/tmp $ ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 408.39 max = 410.27 avg = 408.95 squeezenet_int8 min = 290.25 max = 290.95 avg = 290.61 mobilenet min = 707.10 max = 711.64 avg = 708.47 mobilenet_int8 min = 434.95 max = 436.16 avg = 435.66 mobilenet_v2 min = 466.52 max = 467.41 avg = 466.96 mobilenet_v3 min = 407.03 max = 408.29 avg = 407.56 shufflenet min = 240.65 max = 241.07 avg = 240.85 shufflenet_v2 min = 229.27 max = 235.66 avg = 231.51 mnasnet min = 471.21 max = 471.48 avg = 471.35 proxylessnasnet min = 544.74 max = 547.62 avg = 546.20 efficientnet_b0 min = 824.09 max = 824.44 avg = 824.20 regnety_400m min = 570.20 max = 571.73 avg = 570.82 blazeface min = 76.46 max = 77.05 avg = 76.81 googlenet min = 1368.82 max = 1369.99 avg = 1369.33 googlenet_int8 min = 945.51 max = 946.61 avg = 945.91 resnet18 min = 1237.79 max = 1257.12 avg = 1246.80 resnet18_int8 min = 705.09 max = 706.72 avg = 705.63 alexnet min = 1516.35 max = 1522.82 avg = 1519.52 squeezenet_ssd min = 906.97 max = 908.48 avg = 907.68 squeezenet_ssd_int8 min = 727.15 max = 728.16 avg = 727.77 mobilenet_ssd min = 1475.19 max = 1478.52 avg = 1476.81 mobilenet_ssd_int8 min = 883.88 max = 890.68 avg = 885.90 mobilenet_yolo min = 3408.43 max = 3418.63 avg = 3412.52 mobilenetv2_yolov3 min = 1685.18 max = 1695.89 avg = 1689.23 yolov4-tiny min = 2168.24 max = 2183.24 avg = 2175.93 nanodet_m min = 561.56 max = 562.05 avg = 561.72 ``` ### 
Z7-Lite 7020 XC7Z020CLG400-2 (Cortex-A9 766MHz x 2) ``` root@petalinux_hdmi:~# LD_LIBRARY_PATH=. ./benchncnn 8 2 0 -1 1 loop_count = 8 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 389.18 max = 390.13 avg = 389.60 squeezenet_int8 min = 254.33 max = 255.24 avg = 254.85 mobilenet min = 623.71 max = 625.01 avg = 624.46 mobilenet_int8 min = 240.40 max = 241.03 avg = 240.87 mobilenet_v2 min = 450.00 max = 450.89 avg = 450.40 mobilenet_v3 min = 362.99 max = 363.66 avg = 363.28 shufflenet min = 212.20 max = 213.28 avg = 212.84 shufflenet_v2 min = 210.26 max = 212.64 avg = 211.53 mnasnet min = 408.67 max = 409.64 avg = 409.17 proxylessnasnet min = 449.86 max = 450.94 avg = 450.45 efficientnet_b0 min = 737.40 max = 739.58 avg = 738.32 efficientnetv2_b0 min = 848.58 max = 849.74 avg = 849.24 regnety_400m min = 501.32 max = 503.02 avg = 501.87 blazeface min = 70.89 max = 72.22 avg = 71.61 squeezenet_ssd min = 978.55 max = 979.86 avg = 979.22 squeezenet_ssd_int8 min = 691.90 max = 694.18 avg = 692.73 mobilenet_ssd min = 1353.12 max = 1354.13 avg = 1353.53 mobilenet_ssd_int8 min = 496.26 max = 497.29 avg = 496.61 nanodet_m min = 542.04 max = 546.29 avg = 544.73 yolo-fastest-1.1 min = 282.75 max = 286.11 avg = 284.24 yolo-fastestv2 min = 230.91 max = 232.74 avg = 231.56 root@petalinux_hdmi:~# LD_LIBRARY_PATH=. 
./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 637.19 max = 639.33 avg = 637.82 squeezenet_int8 min = 390.31 max = 391.63 avg = 390.94 mobilenet min = 1085.54 max = 1085.96 avg = 1085.71 mobilenet_int8 min = 437.28 max = 437.65 avg = 437.44 mobilenet_v2 min = 716.03 max = 716.75 avg = 716.35 mobilenet_v3 min = 587.83 max = 588.55 avg = 588.21 shufflenet min = 331.28 max = 331.97 avg = 331.63 shufflenet_v2 min = 331.03 max = 333.19 avg = 331.76 mnasnet min = 682.68 max = 683.11 avg = 682.82 proxylessnasnet min = 763.89 max = 764.80 avg = 764.35 efficientnet_b0 min = 1288.61 max = 1289.10 avg = 1288.81 efficientnetv2_b0 min = 1499.12 max = 1500.11 avg = 1499.65 regnety_400m min = 852.03 max = 853.16 avg = 852.68 blazeface min = 109.40 max = 111.51 avg = 110.41 squeezenet_ssd min = 1493.25 max = 1497.00 avg = 1494.87 squeezenet_ssd_int8 min = 1016.77 max = 1019.31 avg = 1017.99 mobilenet_ssd min = 2379.20 max = 2379.83 avg = 2379.64 mobilenet_ssd_int8 min = 881.70 max = 881.89 avg = 881.83 nanodet_m min = 831.13 max = 832.58 avg = 831.87 yolo-fastest-1.1 min = 466.80 max = 469.90 avg = 468.79 yolo-fastestv2 min = 352.07 max = 355.20 avg = 353.36 ``` ### Loongson 2K1000 (GS264 1.0GHz x 2) ``` root@ls2k:~/ncnn/build/benchmark# ./benchncnn 10 2 0 -1 1 loop_count = 10 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 184.33 max = 184.94 avg = 184.65 squeezenet_int8 min = 201.42 max = 201.99 avg = 201.72 mobilenet min = 277.17 max = 278.04 avg = 277.66 mobilenet_int8 min = 234.61 max = 235.17 avg = 234.81 mobilenet_v2 min = 223.10 max = 274.92 avg = 228.71 mobilenet_v3 min = 185.79 max = 201.76 avg = 187.60 shufflenet min = 129.78 max = 131.09 avg = 130.28 shufflenet_v2 min = 115.86 max = 116.77 avg = 116.42 mnasnet min = 213.92 max = 214.72 avg = 214.26 proxylessnasnet min = 240.05 max = 242.02 avg = 240.86 efficientnet_b0 min = 347.52 max = 348.53 avg = 348.13 
efficientnetv2_b0 min = 382.78 max = 479.58 avg = 398.18 regnety_400m min = 270.00 max = 312.84 avg = 274.66 blazeface min = 37.60 max = 38.02 avg = 37.79 googlenet min = 659.55 max = 693.17 avg = 666.17 googlenet_int8 min = 678.26 max = 718.39 avg = 682.79 resnet18 min = 499.75 max = 766.88 avg = 532.49 resnet18_int8 min = 500.38 max = 533.97 avg = 504.56 alexnet min = 508.49 max = 542.94 avg = 516.13 vgg16 min = 2654.06 max = 3082.44 avg = 2762.51 vgg16_int8 min = 2628.96 max = 2665.35 avg = 2647.12 resnet50 min = 1256.97 max = 1417.45 avg = 1283.04 resnet50_int8 min = 1232.55 max = 1276.94 avg = 1244.59 squeezenet_ssd min = 538.83 max = 588.03 avg = 553.44 squeezenet_ssd_int8 min = 501.67 max = 532.61 avg = 505.72 mobilenet_ssd min = 571.14 max = 600.93 avg = 578.22 mobilenet_ssd_int8 min = 478.67 max = 515.39 avg = 483.06 mobilenet_yolo min = 1644.48 max = 1729.17 avg = 1669.18 mobilenetv2_yolov3 min = 752.22 max = 792.40 avg = 760.10 yolov4-tiny min = 994.48 max = 1096.10 avg = 1016.49 nanodet_m min = 299.12 max = 343.99 avg = 303.98 yolo-fastest-1.1 min = 141.56 max = 142.93 avg = 142.04 yolo-fastestv2 min = 125.66 max = 168.88 avg = 130.28 root@ls2k:~/ncnn/build/benchmark# ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 295.48 max = 296.42 avg = 295.98 squeezenet_int8 min = 334.05 max = 336.31 avg = 335.35 mobilenet min = 476.33 max = 479.00 avg = 477.41 mobilenet_int8 min = 446.03 max = 448.21 avg = 446.73 mobilenet_v2 min = 343.26 max = 343.97 avg = 343.69 mobilenet_v3 min = 296.84 max = 297.31 avg = 297.11 shufflenet min = 202.31 max = 203.96 avg = 202.79 shufflenet_v2 min = 181.69 max = 182.42 avg = 182.08 mnasnet min = 353.73 max = 354.12 avg = 353.99 proxylessnasnet min = 404.49 max = 405.00 avg = 404.75 efficientnet_b0 min = 592.54 max = 593.81 avg = 593.14 efficientnetv2_b0 min = 649.91 max = 651.49 avg = 650.54 regnety_400m min = 425.96 max = 426.33 avg = 426.12 blazeface min = 
59.74 max = 60.19 avg = 59.90 googlenet min = 1120.13 max = 1217.54 avg = 1146.27 googlenet_int8 min = 1205.17 max = 1213.43 avg = 1208.13 resnet18 min = 803.07 max = 997.37 avg = 856.09 resnet18_int8 min = 911.74 max = 916.16 avg = 913.31 alexnet min = 883.47 max = 903.08 avg = 889.06 vgg16 min = 4425.52 max = 4587.36 avg = 4467.61 vgg16_int8 min = 4896.90 max = 4993.15 avg = 4924.44 resnet50 min = 2163.22 max = 2169.90 avg = 2167.49 resnet50_int8 min = 2202.87 max = 2218.00 avg = 2210.51 squeezenet_ssd min = 831.06 max = 926.94 avg = 856.24 squeezenet_ssd_int8 min = 800.52 max = 803.28 avg = 801.72 mobilenet_ssd min = 979.74 max = 980.82 avg = 980.22 mobilenet_ssd_int8 min = 893.79 max = 895.41 avg = 894.51 mobilenet_yolo min = 2578.17 max = 2586.30 avg = 2582.55 mobilenetv2_yolov3 min = 1190.77 max = 1207.67 avg = 1196.06 yolov4-tiny min = 1558.29 max = 1570.18 avg = 1561.52 nanodet_m min = 442.90 max = 444.27 avg = 443.72 yolo-fastest-1.1 min = 203.60 max = 208.43 avg = 205.20 yolo-fastestv2 min = 184.61 max = 185.05 avg = 184.75 ``` ### Loongson 2K1000LA (LA264 1.0GHz * 2) ``` root@ls2kla:~/ncnn/build/benchmark# ./benchncnn 10 2 0 -1 1 loop_count = 10 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 151.11 max = 162.36 avg = 153.30 squeezenet_int8 min = 195.32 max = 198.63 avg = 196.12 mobilenet min = 279.27 max = 283.42 avg = 280.40 mobilenet_int8 min = 264.78 max = 268.41 avg = 265.76 mobilenet_v2 min = 204.39 max = 207.69 avg = 205.77 mobilenet_v3 min = 171.32 max = 187.07 avg = 173.15 shufflenet min = 147.43 max = 150.72 avg = 147.89 shufflenet_v2 min = 169.42 max = 172.58 avg = 170.35 mnasnet min = 204.87 max = 208.01 avg = 205.63 proxylessnasnet min = 226.79 max = 237.74 avg = 229.02 efficientnet_b0 min = 302.30 max = 310.91 avg = 303.87 efficientnetv2_b0 min = 327.65 max = 361.15 avg = 334.45 regnety_400m min = 264.08 max = 278.49 avg = 266.35 blazeface min = 31.80 max = 39.18 avg = 32.88 googlenet min = 562.95 max = 
578.42 avg = 566.28 googlenet_int8 min = 598.16 max = 613.56 avg = 601.68 resnet18 min = 466.73 max = 472.08 avg = 469.58 resnet18_int8 min = 489.69 max = 493.74 avg = 491.63 alexnet min = 381.35 max = 388.12 avg = 384.78 vgg16 min = 2321.29 max = 2345.89 avg = 2330.29 vgg16_int8 min = 2562.86 max = 2568.06 avg = 2565.68 resnet50 min = 1219.09 max = 1225.67 avg = 1221.36 resnet50_int8 min = 1263.44 max = 1266.74 avg = 1265.09 squeezenet_ssd min = 433.23 max = 441.06 avg = 437.07 squeezenet_ssd_int8 min = 438.69 max = 443.17 avg = 440.81 mobilenet_ssd min = 587.37 max = 598.57 avg = 589.99 mobilenet_ssd_int8 min = 539.62 max = 552.57 avg = 542.87 mobilenet_yolo min = 1485.30 max = 1491.17 avg = 1487.81 mobilenetv2_yolov3 min = 711.57 max = 722.91 avg = 715.07 yolov4-tiny min = 954.76 max = 961.66 avg = 957.28 nanodet_m min = 364.22 max = 369.32 avg = 365.94 yolo-fastest-1.1 min = 154.81 max = 160.45 avg = 156.23 yolo-fastestv2 min = 157.39 max = 168.82 avg = 159.51 vision_transformer min = 18926.46 max = 18980.43 avg = 18951.29 FastestDet min = 168.81 max = 176.77 avg = 170.26 root@ls2kla:~/ncnn/build/benchmark# ./benchncnn 4 1 0 -1 1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 272.76 max = 280.89 avg = 275.29 squeezenet_int8 min = 352.02 max = 353.25 avg = 352.40 mobilenet min = 519.09 max = 519.68 avg = 519.34 mobilenet_int8 min = 509.85 max = 510.23 avg = 510.04 mobilenet_v2 min = 352.06 max = 352.74 avg = 352.37 mobilenet_v3 min = 295.13 max = 295.70 avg = 295.39 shufflenet min = 241.58 max = 241.94 avg = 241.73 shufflenet_v2 min = 282.88 max = 283.39 avg = 283.18 mnasnet min = 357.74 max = 358.21 avg = 357.98 proxylessnasnet min = 403.26 max = 411.69 avg = 406.02 efficientnet_b0 min = 546.11 max = 546.88 avg = 546.53 efficientnetv2_b0 min = 596.83 max = 597.05 avg = 596.93 regnety_400m min = 441.94 max = 442.02 avg = 441.98 blazeface min = 54.08 max = 54.59 avg = 54.38 googlenet min = 1042.19 max = 1048.03 avg 
= 1044.40 googlenet_int8 min = 1118.22 max = 1121.18 avg = 1119.79 resnet18 min = 838.79 max = 839.81 avg = 839.43 resnet18_int8 min = 939.62 max = 940.72 avg = 940.23 alexnet min = 729.36 max = 740.65 avg = 734.19 vgg16 min = 4326.68 max = 4335.10 avg = 4330.97 vgg16_int8 min = 4896.71 max = 4909.63 avg = 4905.14 resnet50 min = 2277.36 max = 2280.34 avg = 2279.14 resnet50_int8 min = 2399.07 max = 2402.21 avg = 2400.78 squeezenet_ssd min = 751.49 max = 753.79 avg = 752.20 squeezenet_ssd_int8 min = 771.01 max = 774.08 avg = 771.91 mobilenet_ssd min = 1063.41 max = 1065.65 avg = 1064.16 mobilenet_ssd_int8 min = 1031.59 max = 1033.03 avg = 1032.09 mobilenet_yolo min = 2585.33 max = 2586.65 avg = 2586.11 mobilenetv2_yolov3 min = 1246.35 max = 1248.43 avg = 1247.32 yolov4-tiny min = 1639.13 max = 1642.47 avg = 1640.87 nanodet_m min = 606.40 max = 607.14 avg = 606.86 yolo-fastest-1.1 min = 242.15 max = 244.64 avg = 243.43 yolo-fastestv2 min = 246.92 max = 247.84 avg = 247.27 vision_transformer min = 36607.51 max = 36870.44 avg = 36724.88 FastestDet min = 266.96 max = 268.86 avg = 267.94 ``` ### Loongson 2K2000 (LA364 1.5GHz * 2 with lsx) ``` loongson@loongson-pc:~/ncnn/build/benchmark$ ./benchncnn 4 2 0 -1 1 loop_count = 4 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 58.54 max = 61.57 avg = 60.37 squeezenet_int8 min = 66.79 max = 72.05 avg = 70.49 mobilenet min = 110.46 max = 112.72 avg = 111.84 mobilenet_int8 min = 117.83 max = 126.51 avg = 123.42 mobilenet_v2 min = 65.19 max = 70.78 avg = 67.73 mobilenet_v3 min = 51.30 max = 56.61 avg = 54.52 shufflenet min = 32.78 max = 35.11 avg = 33.99 shufflenet_v2 min = 31.58 max = 32.59 avg = 32.15 mnasnet min = 64.18 max = 78.53 avg = 68.72 proxylessnasnet min = 73.49 max = 85.30 avg = 77.35 efficientnet_b0 min = 101.83 max = 106.26 avg = 104.91 efficientnetv2_b0 min = 126.55 max = 131.95 avg = 127.91 regnety_400m min = 88.19 max = 92.58 avg = 89.60 blazeface min = 8.57 max = 8.68 avg = 8.63 
googlenet min = 207.97 max = 214.47 avg = 211.07 googlenet_int8 min = 237.92 max = 241.06 avg = 239.76 resnet18 min = 153.42 max = 161.54 avg = 158.21 resnet18_int8 min = 177.77 max = 183.83 avg = 181.90 alexnet min = 145.71 max = 149.41 avg = 147.97 vgg16 min = 937.03 max = 961.65 avg = 945.20 vgg16_int8 min = 850.20 max = 869.47 avg = 859.99 resnet50 min = 497.95 max = 524.29 avg = 511.85 resnet50_int8 min = 541.22 max = 549.09 avg = 544.30 squeezenet_ssd min = 155.11 max = 163.01 avg = 159.72 squeezenet_ssd_int8 min = 136.11 max = 138.38 avg = 137.36 mobilenet_ssd min = 226.97 max = 231.33 avg = 229.20 mobilenet_ssd_int8 min = 248.61 max = 253.10 avg = 250.83 mobilenet_yolo min = 613.25 max = 626.75 avg = 619.83 mobilenetv2_yolov3 min = 249.50 max = 258.17 avg = 255.75 yolov4-tiny min = 312.41 max = 349.24 avg = 328.38 nanodet_m min = 81.50 max = 84.20 avg = 83.14 yolo-fastest-1.1 min = 30.46 max = 30.91 avg = 30.71 yolo-fastestv2 min = 26.78 max = 28.80 avg = 28.10 vision_transformer min = 4483.37 max = 4519.06 avg = 4507.04 FastestDet min = 31.15 max = 32.37 avg = 32.06 ``` ### Loongson 3A3000 (GS464E 1.45GHz * 4) ``` root@3A3K:~/Desktop/ncnn-20221128/build/benchmark$ ./benchncnn 5 4 2 -1 0 loop_count = 5 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 88.82 max = 116.74 avg = 94.92 squeezenet_int8 min = 140.62 max = 162.48 avg = 146.32 mobilenet min = 144.80 max = 244.58 avg = 172.14 mobilenet_int8 min = 265.21 max = 293.89 avg = 281.80 mobilenet_v2 min = 109.80 max = 156.74 avg = 120.48 mobilenet_v3 min = 90.18 max = 93.25 avg = 91.50 shufflenet min = 56.64 max = 216.12 avg = 100.68 shufflenet_v2 min = 45.70 max = 142.00 avg = 65.20 mnasnet min = 106.99 max = 229.11 avg = 134.22 proxylessnasnet min = 123.68 max = 261.01 avg = 155.97 efficientnet_b0 min = 160.98 max = 191.14 avg = 171.55 efficientnetv2_b0 min = 162.75 max = 187.67 avg = 176.19 regnety_400m min = 135.06 max = 174.12 avg = 151.30 blazeface min = 15.26 max = 43.81 
avg = 23.91 googlenet min = 327.16 max = 386.02 avg = 350.25 googlenet_int8 min = 500.45 max = 637.39 avg = 540.62 resnet18 min = 254.45 max = 421.56 avg = 304.48 resnet18_int8 min = 385.14 max = 559.01 avg = 439.74 alexnet min = 179.19 max = 220.91 avg = 190.63 vgg16 min = 1563.99 max = 1645.01 avg = 1619.63 vgg16_int8 min = 1436.00 max = 1530.45 avg = 1473.00 resnet50 min = 702.35 max = 833.23 avg = 764.14 resnet50_int8 min = 1099.40 max = 1208.84 avg = 1154.51 squeezenet_ssd min = 191.40 max = 270.10 avg = 218.75 squeezenet_ssd_int8 min = 304.51 max = 387.51 avg = 344.98 mobilenet_ssd min = 315.77 max = 417.37 avg = 344.40 mobilenet_ssd_int8 min = 554.28 max = 656.07 avg = 580.72 mobilenet_yolo min = 806.48 max = 851.22 avg = 825.50 mobilenetv2_yolov3 min = 382.38 max = 503.38 avg = 421.03 yolov4-tiny min = 502.87 max = 620.30 avg = 550.08 nanodet_m min = 126.00 max = 314.03 avg = 184.93 yolo-fastest-1.1 min = 64.68 max = 189.47 avg = 110.89 yolo-fastestv2 min = 69.03 max = 116.31 avg = 82.36 vision_transformer min = 14737.56 max = 15012.35 avg = 14890.56 FastestDet min = 84.30 max = 139.87 avg = 102.23 ``` ### Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) ``` root@3A4K:~/Desktop/ncnn-20221128/build/benchmark$ ./benchncnn 10 4 2 -1 0 loop_count = 10 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 17.04 max = 39.86 avg = 20.39 squeezenet_int8 min = 21.77 max = 25.93 avg = 23.02 mobilenet min = 26.34 max = 97.11 avg = 38.24 mobilenet_int8 min = 32.93 max = 33.31 avg = 33.07 mobilenet_v2 min = 19.40 max = 19.91 avg = 19.63 mobilenet_v3 min = 16.48 max = 45.31 avg = 19.68 shufflenet min = 12.23 max = 116.79 avg = 22.86 shufflenet_v2 min = 11.14 max = 11.59 avg = 11.37 mnasnet min = 18.33 max = 51.66 avg = 24.52 proxylessnasnet min = 22.03 max = 22.46 avg = 22.19 efficientnet_b0 min = 34.94 max = 129.52 avg = 45.76 efficientnetv2_b0 min = 38.58 max = 67.86 avg = 41.84 regnety_400m min = 35.53 max = 38.59 avg = 36.14 blazeface min = 
4.08 max = 4.34 avg = 4.17 googlenet min = 72.60 max = 100.31 avg = 76.25 googlenet_int8 min = 82.09 max = 107.09 avg = 86.78 resnet18 min = 53.99 max = 100.21 avg = 63.52 resnet18_int8 min = 57.20 max = 77.00 avg = 60.47 alexnet min = 61.95 max = 80.86 avg = 65.01 vgg16 min = 329.58 max = 438.99 avg = 360.40 vgg16_int8 min = 293.27 max = 366.16 avg = 311.23 resnet50 min = 138.06 max = 260.50 avg = 169.27 resnet50_int8 min = 154.06 max = 244.31 avg = 173.37 squeezenet_ssd min = 60.44 max = 97.92 avg = 65.41 squeezenet_ssd_int8 min = 55.34 max = 136.72 avg = 68.15 mobilenet_ssd min = 57.97 max = 139.16 avg = 69.27 mobilenet_ssd_int8 min = 66.66 max = 89.91 avg = 71.00 mobilenet_yolo min = 169.38 max = 711.10 avg = 242.62 mobilenetv2_yolov3 min = 75.61 max = 97.83 avg = 80.23 yolov4-tiny min = 110.52 max = 143.67 avg = 118.53 nanodet_m min = 24.04 max = 92.81 avg = 32.45 yolo-fastest-1.1 min = 10.97 max = 32.77 avg = 15.05 yolo-fastestv2 min = 11.54 max = 12.09 avg = 11.84 vision_transformer min = 4193.41 max = 4274.03 avg = 4213.64 FastestDet min = 12.54 max = 13.01 avg = 12.78 ``` ### Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) Test on UOS V20 E1050 ``` uos@uos-PC:~/ncnn/benchmark$ ./benchncnn 10 4 2 -1 0 loop_count = 10 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 25.28 max = 38.19 avg = 27.81 squeezenet_int8 min = 21.61 max = 22.13 avg = 21.85 mobilenet min = 44.77 max = 69.54 avg = 55.37 mobilenet_int8 min = 32.96 max = 44.00 avg = 36.08 mobilenet_v2 min = 29.21 max = 52.70 avg = 35.47 mobilenet_v3 min = 24.62 max = 27.32 avg = 25.18 shufflenet min = 18.90 max = 49.70 avg = 22.95 shufflenet_v2 min = 15.87 max = 22.38 avg = 17.67 mnasnet min = 29.08 max = 69.37 avg = 35.53 proxylessnasnet min = 33.30 max = 94.15 avg = 42.81 efficientnet_b0 min = 49.34 max = 61.22 avg = 52.01 efficientnetv2_b0 min = 57.89 max = 72.55 avg = 60.72 regnety_400m min = 50.65 max = 74.16 avg = 57.56 blazeface min = 4.97 max = 5.33 avg = 5.11 
googlenet min = 101.45 max = 119.73 avg = 106.85 googlenet_int8 min = 83.94 max = 99.75 avg = 87.36 resnet18 min = 81.65 max = 99.76 avg = 85.96 resnet18_int8 min = 58.60 max = 75.88 avg = 60.62 alexnet min = 77.05 max = 208.05 avg = 120.39 vgg16 min = 427.51 max = 676.57 avg = 531.53 vgg16_int8 min = 326.59 max = 487.96 avg = 417.74 resnet50 min = 221.51 max = 580.11 avg = 305.64 resnet50_int8 min = 158.00 max = 190.71 avg = 167.50 squeezenet_ssd min = 98.87 max = 135.55 avg = 115.54 squeezenet_ssd_int8 min = 66.33 max = 361.40 avg = 148.19 mobilenet_ssd min = 94.12 max = 340.16 avg = 184.85 mobilenet_ssd_int8 min = 88.26 max = 150.47 avg = 112.35 mobilenet_yolo min = 252.07 max = 510.61 avg = 327.21 mobilenetv2_yolov3 min = 115.31 max = 183.63 avg = 147.28 yolov4-tiny min = 153.92 max = 259.18 avg = 196.70 nanodet_m min = 34.95 max = 66.15 avg = 46.41 yolo-fastest-1.1 min = 15.34 max = 15.94 avg = 15.62 yolo-fastestv2 min = 15.53 max = 16.06 avg = 15.80 vision_transformer min = 4200.48 max = 5853.43 avg = 4555.42 FastestDet min = 16.73 max = 18.72 avg = 17.08 uos@uos-PC:~/ncnn/benchmark$ ./benchncnn 10 4 1 -1 0 loop_count = 10 num_threads = 4 powersave = 1 gpu_device = -1 cooling_down = 0 squeezenet min = 25.93 max = 47.61 avg = 28.45 squeezenet_int8 min = 21.84 max = 27.09 avg = 22.84 mobilenet min = 44.61 max = 83.44 avg = 52.52 mobilenet_int8 min = 32.91 max = 45.99 avg = 34.52 mobilenet_v2 min = 29.44 max = 37.14 avg = 30.43 mobilenet_v3 min = 24.54 max = 42.68 avg = 27.25 shufflenet min = 17.16 max = 42.10 avg = 20.08 shufflenet_v2 min = 15.99 max = 16.43 avg = 16.29 mnasnet min = 29.14 max = 43.37 avg = 30.79 proxylessnasnet min = 33.15 max = 34.12 avg = 33.52 efficientnet_b0 min = 49.35 max = 87.75 avg = 54.03 efficientnetv2_b0 min = 57.69 max = 84.67 avg = 64.12 regnety_400m min = 50.55 max = 75.35 avg = 55.31 blazeface min = 5.01 max = 5.16 avg = 5.05 googlenet min = 101.51 max = 116.33 avg = 105.38 googlenet_int8 min = 84.34 max = 102.58 avg = 89.89 
resnet18 min = 80.58 max = 94.47 avg = 86.27 resnet18_int8 min = 59.00 max = 76.66 avg = 62.15 alexnet min = 91.72 max = 117.98 avg = 102.20 vgg16 min = 435.57 max = 453.90 avg = 441.39 vgg16_int8 min = 308.39 max = 332.69 avg = 321.09 resnet50 min = 219.93 max = 249.30 avg = 231.93 resnet50_int8 min = 156.78 max = 179.34 avg = 163.43 squeezenet_ssd min = 109.48 max = 153.84 avg = 123.75 squeezenet_ssd_int8 min = 74.33 max = 117.03 avg = 93.81 mobilenet_ssd min = 94.91 max = 161.38 avg = 127.78 mobilenet_ssd_int8 min = 82.35 max = 112.79 avg = 91.86 mobilenet_yolo min = 252.05 max = 285.16 avg = 266.33 mobilenetv2_yolov3 min = 113.98 max = 173.83 avg = 139.60 yolov4-tiny min = 150.06 max = 210.96 avg = 164.94 nanodet_m min = 34.62 max = 67.81 avg = 48.43 yolo-fastest-1.1 min = 15.78 max = 16.09 avg = 15.93 yolo-fastestv2 min = 15.54 max = 32.82 avg = 17.62 vision_transformer min = 4202.89 max = 5573.15 avg = 4426.38 FastestDet min = 16.39 max = 17.06 avg = 16.75 uos@uos-PC:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 0 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 25.98 max = 36.75 avg = 28.86 squeezenet_int8 min = 22.04 max = 30.86 avg = 23.28 mobilenet min = 44.82 max = 60.73 avg = 46.72 mobilenet_int8 min = 33.00 max = 48.45 avg = 34.70 mobilenet_v2 min = 29.53 max = 56.78 avg = 33.98 mobilenet_v3 min = 24.69 max = 45.60 avg = 28.13 shufflenet min = 17.25 max = 24.72 avg = 18.18 shufflenet_v2 min = 16.00 max = 31.27 avg = 17.62 mnasnet min = 28.95 max = 44.73 avg = 32.58 proxylessnasnet min = 32.99 max = 45.42 avg = 34.66 efficientnet_b0 min = 49.71 max = 53.47 avg = 50.25 efficientnetv2_b0 min = 57.51 max = 78.56 avg = 61.47 regnety_400m min = 50.18 max = 71.85 avg = 54.77 blazeface min = 4.98 max = 9.36 avg = 5.48 googlenet min = 101.25 max = 121.71 avg = 105.71 googlenet_int8 min = 82.97 max = 111.81 avg = 89.49 resnet18 min = 75.66 max = 87.19 avg = 78.72 resnet18_int8 min = 58.92 max = 108.67 avg = 76.70 alexnet 
min = 79.12 max = 144.22 avg = 101.91 vgg16 min = 430.14 max = 460.46 avg = 444.56 vgg16_int8 min = 308.08 max = 350.15 avg = 324.86 resnet50 min = 219.60 max = 258.59 avg = 237.46 resnet50_int8 min = 156.54 max = 180.28 avg = 163.11 squeezenet_ssd min = 77.71 max = 137.36 avg = 119.68 squeezenet_ssd_int8 min = 78.88 max = 113.64 avg = 95.83 mobilenet_ssd min = 94.82 max = 156.99 avg = 119.67 mobilenet_ssd_int8 min = 77.17 max = 98.29 avg = 86.90 mobilenet_yolo min = 252.29 max = 295.62 avg = 265.58 mobilenetv2_yolov3 min = 114.28 max = 159.82 avg = 140.03 yolov4-tiny min = 150.99 max = 203.07 avg = 165.18 nanodet_m min = 34.48 max = 71.56 avg = 49.84 yolo-fastest-1.1 min = 15.36 max = 30.00 avg = 17.11 yolo-fastestv2 min = 15.42 max = 26.96 avg = 16.78 vision_transformer min = 4187.60 max = 4319.84 avg = 4220.05 FastestDet min = 16.30 max = 24.88 avg = 17.49 ``` ### Loongson 3A5000 (LA464 2.5GHz * 4) ``` root@3A5K:~/Desktop/ncnn-20230223/build/benchmark$ ./benchncnn 10 4 2 -1 0 loop_count = 10 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 11.97 max = 19.38 avg = 13.61 squeezenet_int8 min = 14.96 max = 15.36 avg = 15.12 mobilenet min = 20.14 max = 27.50 avg = 21.12 mobilenet_int8 min = 25.28 max = 35.06 avg = 27.37 mobilenet_v2 min = 12.82 max = 13.20 avg = 12.98 mobilenet_v3 min = 11.39 max = 25.03 avg = 12.86 shufflenet min = 7.35 max = 7.50 avg = 7.40 shufflenet_v2 min = 7.12 max = 7.23 avg = 7.18 mnasnet min = 12.85 max = 21.69 avg = 13.83 proxylessnasnet min = 15.35 max = 15.79 avg = 15.43 efficientnet_b0 min = 24.20 max = 24.46 avg = 24.30 efficientnetv2_b0 min = 26.80 max = 42.43 avg = 29.25 regnety_400m min = 22.85 max = 38.30 avg = 24.51 blazeface min = 2.57 max = 2.67 avg = 2.60 googlenet min = 49.09 max = 85.91 avg = 67.57 googlenet_int8 min = 64.89 max = 95.28 avg = 76.41 resnet18 min = 42.43 max = 62.39 avg = 52.38 resnet18_int8 min = 47.96 max = 68.69 avg = 56.75 alexnet min = 46.01 max = 59.26 avg = 49.20 vgg16 min = 
246.82 max = 261.80 avg = 252.81 vgg16_int8 min = 247.13 max = 256.81 avg = 252.37 resnet50 min = 102.17 max = 138.16 avg = 117.65 resnet50_int8 min = 115.09 max = 151.30 avg = 129.13 squeezenet_ssd min = 43.62 max = 70.64 avg = 53.89 squeezenet_ssd_int8 min = 38.66 max = 60.12 avg = 47.66 mobilenet_ssd min = 42.67 max = 68.78 avg = 53.95 mobilenet_ssd_int8 min = 56.29 max = 68.31 avg = 59.86 mobilenet_yolo min = 129.04 max = 188.26 avg = 149.64 mobilenetv2_yolov3 min = 61.80 max = 71.41 avg = 66.43 yolov4-tiny min = 88.64 max = 108.17 avg = 95.48 nanodet_m min = 16.24 max = 16.57 avg = 16.34 yolo-fastest-1.1 min = 6.98 max = 7.16 avg = 7.05 yolo-fastestv2 min = 6.95 max = 7.29 avg = 7.08 vision_transformer min = 2910.63 max = 3109.29 avg = 2949.04 FastestDet min = 7.66 max = 7.90 avg = 7.80 ``` ### Loongson 3A6000 (LA664 2.5GHz * 4+4) ``` ~/ncnn/build/benchmark$ ./benchncnn 10 8 2 -1 0 loop_count = 10 num_threads = 8 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 7.12 max = 7.20 avg = 7.16 squeezenet_int8 min = 8.93 max = 9.20 avg = 8.98 mobilenet min = 11.81 max = 11.88 avg = 11.84 mobilenet_int8 min = 14.25 max = 14.33 avg = 14.28 mobilenet_v2 min = 8.06 max = 8.16 avg = 8.08 mobilenet_v3 min = 6.84 max = 6.90 avg = 6.87 shufflenet min = 5.38 max = 5.44 avg = 5.39 shufflenet_v2 min = 5.20 max = 5.22 avg = 5.20 mnasnet min = 8.06 max = 8.10 avg = 8.07 proxylessnasnet min = 8.94 max = 9.09 avg = 8.99 efficientnet_b0 min = 13.43 max = 13.65 avg = 13.48 efficientnetv2_b0 min = 16.06 max = 16.18 avg = 16.11 regnety_400m min = 18.11 max = 18.18 avg = 18.14 blazeface min = 1.59 max = 1.61 avg = 1.60 googlenet min = 26.08 max = 26.24 avg = 26.17 googlenet_int8 min = 31.25 max = 31.42 avg = 31.34 resnet18 min = 19.65 max = 19.73 avg = 19.69 resnet18_int8 min = 25.55 max = 25.66 avg = 25.60 alexnet min = 19.56 max = 19.81 avg = 19.67 vgg16 min = 115.32 max = 116.38 avg = 115.99 vgg16_int8 min = 135.94 max = 136.73 avg = 136.34 resnet50 min = 56.46 max = 
56.96 avg = 56.81 resnet50_int8 min = 66.13 max = 66.40 avg = 66.27 squeezenet_ssd min = 22.84 max = 22.99 avg = 22.89 squeezenet_ssd_int8 min = 22.34 max = 22.76 avg = 22.54 mobilenet_ssd min = 24.67 max = 24.75 avg = 24.71 mobilenet_ssd_int8 min = 29.32 max = 29.37 avg = 29.34 mobilenet_yolo min = 82.82 max = 84.02 avg = 83.40 mobilenetv2_yolov3 min = 30.31 max = 30.45 avg = 30.38 yolov4-tiny min = 42.49 max = 42.74 avg = 42.62 nanodet_m min = 11.00 max = 11.08 avg = 11.02 yolo-fastest-1.1 min = 5.28 max = 5.40 avg = 5.31 yolo-fastestv2 min = 5.09 max = 5.10 avg = 5.10 vision_transformer min = 869.40 max = 898.18 avg = 874.07 FastestDet min = 5.28 max = 5.37 avg = 5.31 ``` ### Phytium FT-2000/4 (FTC663 armv8 2.2GHz x 4) Test on Kylin OS V10 ``` mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 40.92 max = 43.43 avg = 41.34 squeezenet_int8 min = 35.48 max = 36.07 avg = 35.75 mobilenet min = 72.23 max = 72.53 avg = 72.39 mobilenet_int8 min = 48.10 max = 48.59 avg = 48.31 mobilenet_v2 min = 47.94 max = 48.45 avg = 48.13 mobilenet_v3 min = 37.95 max = 39.59 avg = 38.41 shufflenet min = 21.51 max = 21.84 avg = 21.64 shufflenet_v2 min = 21.10 max = 21.45 avg = 21.26 mnasnet min = 44.53 max = 45.15 avg = 44.74 proxylessnasnet min = 53.02 max = 53.62 avg = 53.21 efficientnet_b0 min = 79.81 max = 80.51 avg = 80.15 efficientnetv2_b0 min = 92.55 max = 103.10 avg = 97.53 regnety_400m min = 58.52 max = 70.04 avg = 64.20 blazeface min = 6.06 max = 9.85 avg = 6.88 googlenet min = 146.49 max = 162.69 avg = 152.98 googlenet_int8 min = 127.38 max = 132.11 avg = 128.51 resnet18 min = 107.79 max = 108.83 avg = 108.37 resnet18_int8 min = 97.28 max = 99.03 avg = 97.73 alexnet min = 89.95 max = 91.63 avg = 90.28 vgg16 min = 642.27 max = 647.16 avg = 644.09 vgg16_int8 min = 567.03 max = 574.11 avg = 568.74 resnet50 min = 329.12 max = 331.79 avg = 330.10 resnet50_int8 min = 
252.48 max = 253.65 avg = 252.93 squeezenet_ssd min = 96.46 max = 96.95 avg = 96.69 squeezenet_ssd_int8 min = 92.35 max = 93.24 avg = 92.72 mobilenet_ssd min = 149.14 max = 150.56 avg = 149.40 mobilenet_ssd_int8 min = 97.56 max = 98.03 avg = 97.82 mobilenet_yolo min = 339.71 max = 340.60 avg = 339.89 mobilenetv2_yolov3 min = 174.53 max = 175.80 avg = 175.01 yolov4-tiny min = 213.72 max = 214.94 avg = 214.08 nanodet_m min = 49.95 max = 50.47 avg = 50.19 yolo-fastest-1.1 min = 23.80 max = 24.42 avg = 23.91 yolo-fastestv2 min = 19.78 max = 19.95 avg = 19.84 vision_transformer min = 3927.51 max = 4025.76 avg = 3947.06 FastestDet min = 21.78 max = 22.17 avg = 21.88 mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 1 -1 0 loop_count = 10 num_threads = 4 powersave = 1 gpu_device = -1 cooling_down = 0 squeezenet min = 70.80 max = 76.55 avg = 72.49 squeezenet_int8 min = 110.36 max = 133.06 avg = 114.23 mobilenet min = 77.97 max = 85.73 avg = 79.98 mobilenet_int8 min = 80.05 max = 84.09 avg = 81.76 mobilenet_v2 min = 101.07 max = 192.92 avg = 139.32 mobilenet_v3 min = 108.60 max = 129.37 avg = 113.80 shufflenet min = 160.96 max = 188.96 avg = 168.62 shufflenet_v2 min = 96.20 max = 190.31 avg = 119.77 mnasnet min = 97.34 max = 104.00 avg = 99.85 proxylessnasnet min = 112.58 max = 276.49 avg = 145.74 efficientnet_b0 min = 171.01 max = 238.15 avg = 195.53 efficientnetv2_b0 min = 235.31 max = 299.00 avg = 254.12 regnety_400m min = 1059.87 max = 1173.49 avg = 1084.13 blazeface min = 58.69 max = 64.83 avg = 60.83 googlenet min = 190.47 max = 257.76 avg = 207.71 googlenet_int8 min = 285.67 max = 327.20 avg = 300.87 resnet18 min = 111.87 max = 118.36 avg = 114.48 resnet18_int8 min = 143.08 max = 147.98 avg = 144.93 alexnet min = 72.83 max = 76.52 avg = 74.01 vgg16 min = 390.35 max = 406.58 avg = 397.19 vgg16_int8 min = 358.54 max = 369.89 avg = 364.31 resnet50 min = 275.57 max = 300.14 avg = 283.21 resnet50_int8 min = 315.18 max = 371.22 avg = 328.43 squeezenet_ssd min = 
170.14 max = 200.18 avg = 175.23 squeezenet_ssd_int8 min = 259.01 max = 271.23 avg = 263.35 mobilenet_ssd min = 166.85 max = 170.64 avg = 168.74 mobilenet_ssd_int8 min = 191.71 max = 195.91 avg = 193.44 mobilenet_yolo min = 960.70 max = 1080.81 avg = 983.68 mobilenetv2_yolov3 min = 187.72 max = 207.92 avg = 192.60 yolov4-tiny min = 172.72 max = 177.62 avg = 174.63 nanodet_m min = 128.79 max = 137.31 avg = 131.04 yolo-fastest-1.1 min = 132.39 max = 148.06 avg = 137.90 yolo-fastestv2 min = 130.97 max = 137.73 avg = 133.53 vision_transformer min = 2229.10 max = 2392.59 avg = 2304.21 FastestDet min = 119.98 max = 126.26 avg = 122.40 mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 2 -1 0 loop_count = 10 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 70.93 max = 75.55 avg = 72.93 squeezenet_int8 min = 109.65 max = 153.48 avg = 124.20 mobilenet min = 78.02 max = 85.80 avg = 81.97 mobilenet_int8 min = 80.34 max = 89.31 avg = 83.20 mobilenet_v2 min = 99.51 max = 110.36 avg = 102.54 mobilenet_v3 min = 109.04 max = 116.28 avg = 111.75 shufflenet min = 160.04 max = 166.21 avg = 163.59 shufflenet_v2 min = 88.90 max = 91.82 avg = 90.24 mnasnet min = 97.02 max = 103.09 avg = 98.70 proxylessnasnet min = 111.21 max = 117.47 avg = 113.97 efficientnet_b0 min = 167.99 max = 175.35 avg = 171.26 efficientnetv2_b0 min = 228.59 max = 245.97 avg = 232.79 regnety_400m min = 1049.34 max = 1085.18 avg = 1064.68 blazeface min = 59.35 max = 64.91 avg = 60.35 googlenet min = 187.87 max = 195.29 avg = 190.56 googlenet_int8 min = 283.22 max = 301.69 avg = 287.66 resnet18 min = 111.48 max = 116.76 avg = 112.88 resnet18_int8 min = 142.41 max = 148.79 avg = 145.14 alexnet min = 72.59 max = 75.37 avg = 73.62 vgg16 min = 389.61 max = 452.95 avg = 424.36 vgg16_int8 min = 365.57 max = 465.13 avg = 422.84 resnet50 min = 283.07 max = 411.14 avg = 332.88 resnet50_int8 min = 323.21 max = 381.13 avg = 340.59 squeezenet_ssd min = 178.21 max = 252.82 avg = 211.62 
squeezenet_ssd_int8 min = 263.82 max = 372.38 avg = 284.38 mobilenet_ssd min = 166.29 max = 281.36 avg = 195.16 mobilenet_ssd_int8 min = 194.00 max = 220.95 avg = 204.07 mobilenet_yolo min = 964.99 max = 1027.13 avg = 989.45 mobilenetv2_yolov3 min = 218.58 max = 512.86 avg = 265.12 yolov4-tiny min = 172.20 max = 177.27 avg = 174.14 nanodet_m min = 128.78 max = 222.66 avg = 150.88 yolo-fastest-1.1 min = 132.52 max = 196.41 avg = 149.03 yolo-fastestv2 min = 131.39 max = 138.72 avg = 134.96 vision_transformer min = 2243.31 max = 2659.56 avg = 2395.76 FastestDet min = 119.44 max = 126.07 avg = 122.27 ``` ### Phytium FT-2000+/64 (FTC662 armv8 2.4GHz x 8) ``` [root@bogon benchmark]# ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 57.60 max = 59.78 avg = 58.51 squeezenet_int8 min = 47.05 max = 47.89 avg = 47.40 mobilenet min = 91.08 max = 95.16 avg = 91.89 mobilenet_int8 min = 60.27 max = 61.17 avg = 60.74 mobilenet_v2 min = 63.38 max = 68.12 avg = 66.96 mobilenet_v3 min = 53.34 max = 54.71 avg = 54.01 shufflenet min = 37.87 max = 41.78 avg = 39.37 shufflenet_v2 min = 35.89 max = 37.30 avg = 36.40 mnasnet min = 59.57 max = 63.23 avg = 60.25 proxylessnasnet min = 71.24 max = 71.93 avg = 71.51 efficientnet_b0 min = 134.34 max = 141.14 avg = 137.74 efficientnetv2_b0 min = 143.82 max = 145.63 avg = 144.36 regnety_400m min = 76.96 max = 77.66 avg = 77.27 blazeface min = 11.57 max = 11.90 avg = 11.70 googlenet min = 188.10 max = 191.27 avg = 189.02 googlenet_int8 min = 167.54 max = 169.63 avg = 168.38 resnet18 min = 144.76 max = 163.39 avg = 154.95 resnet18_int8 min = 124.14 max = 129.84 avg = 127.83 alexnet min = 198.22 max = 208.86 avg = 205.35 vgg16 min = 848.10 max = 891.00 avg = 859.94 vgg16_int8 min = 686.54 max = 742.77 avg = 704.74 resnet50 min = 413.45 max = 428.84 avg = 417.81 resnet50_int8 min = 306.32 max = 324.27 avg = 316.47 squeezenet_ssd min = 147.62 max = 149.58 avg = 148.48 
squeezenet_ssd_int8 min = 116.18 max = 134.86 avg = 126.93 mobilenet_ssd min = 188.49 max = 191.97 avg = 189.48 mobilenet_ssd_int8 min = 120.28 max = 121.36 avg = 120.83 mobilenet_yolo min = 421.79 max = 425.68 avg = 423.51 mobilenetv2_yolov3 min = 222.86 max = 225.58 avg = 224.01 yolov4-tiny min = 303.77 max = 310.70 avg = 307.45 nanodet_m min = 80.87 max = 82.11 avg = 81.35 [root@bogon benchmark]# ./benchncnn 10 8 0 -1 0 loop_count = 10 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 14.53 max = 14.92 avg = 14.68 squeezenet_int8 min = 11.67 max = 11.89 avg = 11.82 mobilenet min = 17.60 max = 20.05 avg = 18.34 mobilenet_int8 min = 9.94 max = 10.22 avg = 10.08 mobilenet_v2 min = 18.46 max = 19.18 avg = 18.81 mobilenet_v3 min = 16.30 max = 16.71 avg = 16.45 shufflenet min = 14.65 max = 14.93 avg = 14.78 shufflenet_v2 min = 11.23 max = 11.56 avg = 11.35 mnasnet min = 15.65 max = 16.08 avg = 15.92 proxylessnasnet min = 18.78 max = 21.72 avg = 19.68 efficientnet_b0 min = 29.16 max = 29.62 avg = 29.37 efficientnetv2_b0 min = 33.28 max = 35.48 avg = 34.23 regnety_400m min = 44.90 max = 47.36 avg = 46.32 blazeface min = 4.23 max = 4.43 avg = 4.30 googlenet min = 42.11 max = 42.98 avg = 42.38 googlenet_int8 min = 33.24 max = 38.21 avg = 34.10 resnet18 min = 33.27 max = 34.00 avg = 33.57 resnet18_int8 min = 23.66 max = 24.78 avg = 24.24 alexnet min = 35.78 max = 37.68 avg = 36.46 vgg16 min = 219.60 max = 235.79 avg = 222.11 vgg16_int8 min = 128.64 max = 135.19 avg = 130.73 resnet50 min = 84.15 max = 85.48 avg = 84.66 resnet50_int8 min = 58.87 max = 61.98 avg = 59.85 squeezenet_ssd min = 47.60 max = 50.24 avg = 48.54 squeezenet_ssd_int8 min = 36.42 max = 37.89 avg = 36.99 mobilenet_ssd min = 39.37 max = 42.63 avg = 41.06 mobilenet_ssd_int8 min = 21.59 max = 22.05 avg = 21.83 mobilenet_yolo min = 83.16 max = 88.75 avg = 85.29 mobilenetv2_yolov3 min = 58.13 max = 59.50 avg = 58.62 yolov4-tiny min = 74.18 max = 76.56 avg = 75.13 nanodet_m min = 
25.16 max = 31.45 avg = 26.71 root@FT2K:~/Desktop/ncnn-20221128/build/benchmark$ ./benchncnn 10 4 2 -1 0 loop_count = 10 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 14.19 max = 21.46 avg = 15.16 squeezenet_int8 min = 11.63 max = 12.08 avg = 11.91 mobilenet min = 20.52 max = 37.00 avg = 23.66 mobilenet_int8 min = 13.38 max = 25.95 avg = 15.01 mobilenet_v2 min = 15.80 max = 16.59 avg = 16.12 mobilenet_v3 min = 13.38 max = 17.62 avg = 14.21 shufflenet min = 10.62 max = 11.10 avg = 10.85 shufflenet_v2 min = 9.09 max = 12.30 avg = 9.66 mnasnet min = 14.85 max = 15.67 avg = 15.14 proxylessnasnet min = 16.83 max = 17.10 avg = 16.98 efficientnet_b0 min = 24.59 max = 26.40 avg = 25.06 efficientnetv2_b0 min = 30.25 max = 34.46 avg = 31.42 regnety_400m min = 32.37 max = 41.10 avg = 35.17 blazeface min = 3.00 max = 3.56 avg = 3.18 googlenet min = 49.52 max = 64.98 avg = 56.29 googlenet_int8 min = 38.65 max = 52.51 avg = 43.90 resnet18 min = 42.81 max = 53.94 avg = 45.38 resnet18_int8 min = 32.53 max = 53.62 avg = 37.26 alexnet min = 33.92 max = 47.88 avg = 37.12 vgg16 min = 214.19 max = 228.96 avg = 220.16 vgg16_int8 min = 164.22 max = 224.51 avg = 180.15 resnet50 min = 106.90 max = 189.61 avg = 133.34 resnet50_int8 min = 79.62 max = 94.41 avg = 83.56 squeezenet_ssd min = 48.00 max = 49.11 avg = 48.43 squeezenet_ssd_int8 min = 33.59 max = 47.60 avg = 37.57 mobilenet_ssd min = 43.97 max = 58.84 avg = 49.64 mobilenet_ssd_int8 min = 27.94 max = 32.89 avg = 29.56 mobilenet_yolo min = 107.29 max = 118.80 avg = 114.24 mobilenetv2_yolov3 min = 63.44 max = 106.75 avg = 70.69 yolov4-tiny min = 89.93 max = 155.39 avg = 101.90 nanodet_m min = 20.34 max = 28.67 avg = 21.44 yolo-fastest-1.1 min = 11.74 max = 12.24 avg = 11.96 yolo-fastestv2 min = 9.81 max = 9.98 avg = 9.91 vision_transformer min = 1617.60 max = 1634.13 avg = 1625.87 FastestDet min = 10.19 max = 10.55 avg = 10.36 ``` ### HUAWEI KunPeng 920 2251K (x8 cores) test on UOS 1050 ``` 
mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 12.11 max = 12.40 avg = 12.25 squeezenet_int8 min = 14.24 max = 14.50 avg = 14.36 mobilenet min = 20.52 max = 21.11 avg = 20.63 mobilenet_int8 min = 18.29 max = 18.63 avg = 18.45 mobilenet_v2 min = 13.73 max = 13.90 avg = 13.79 mobilenet_v3 min = 11.37 max = 11.49 avg = 11.41 shufflenet min = 7.90 max = 7.96 avg = 7.92 shufflenet_v2 min = 8.09 max = 8.13 avg = 8.11 mnasnet min = 13.26 max = 13.44 avg = 13.30 proxylessnasnet min = 16.19 max = 16.39 avg = 16.26 efficientnet_b0 min = 34.92 max = 35.22 avg = 35.04 efficientnetv2_b0 min = 43.82 max = 44.39 avg = 43.94 regnety_400m min = 17.55 max = 18.02 avg = 17.65 blazeface min = 3.05 max = 3.08 avg = 3.07 googlenet min = 58.65 max = 59.26 avg = 58.89 googlenet_int8 min = 60.55 max = 63.00 avg = 61.96 resnet18 min = 34.27 max = 35.43 avg = 34.84 resnet18_int8 min = 60.79 max = 62.15 avg = 61.47 alexnet min = 42.01 max = 44.43 avg = 43.36 vgg16 min = 174.46 max = 177.33 avg = 175.57 vgg16_int8 min = 453.93 max = 457.03 avg = 454.79 resnet50 min = 95.36 max = 96.27 avg = 95.55 resnet50_int8 min = 119.77 max = 121.26 avg = 120.46 squeezenet_ssd min = 39.05 max = 39.69 avg = 39.20 squeezenet_ssd_int8 min = 55.06 max = 56.23 avg = 55.72 mobilenet_ssd min = 45.20 max = 45.96 avg = 45.49 mobilenet_ssd_int8 min = 39.40 max = 40.13 avg = 39.76 mobilenet_yolo min = 98.86 max = 99.85 avg = 99.34 mobilenetv2_yolov3 min = 51.17 max = 52.89 avg = 51.89 yolov4-tiny min = 66.43 max = 67.23 avg = 66.70 nanodet_m min = 20.59 max = 20.79 avg = 20.71 yolo-fastest-1.1 min = 7.90 max = 7.99 avg = 7.93 yolo-fastestv2 min = 7.45 max = 7.49 avg = 7.47 vision_transformer min = 1586.33 max = 1595.34 avg = 1589.76 FastestDet min = 7.45 max = 7.52 avg = 7.47 mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 8 0 -1 0 loop_count = 10 num_threads = 8 powersave = 0 gpu_device = -1 
cooling_down = 0 squeezenet min = 2.93 max = 3.10 avg = 3.00 squeezenet_int8 min = 3.47 max = 3.56 avg = 3.52 mobilenet min = 3.89 max = 4.04 avg = 3.94 mobilenet_int8 min = 3.29 max = 3.39 avg = 3.33 mobilenet_v2 min = 3.95 max = 4.08 avg = 3.98 mobilenet_v3 min = 3.45 max = 3.59 avg = 3.49 shufflenet min = 3.42 max = 4.66 avg = 3.62 shufflenet_v2 min = 2.60 max = 2.94 avg = 2.68 mnasnet min = 3.46 max = 3.57 avg = 3.52 proxylessnasnet min = 3.94 max = 12.34 avg = 4.88 efficientnet_b0 min = 7.31 max = 7.60 avg = 7.38 efficientnetv2_b0 min = 9.01 max = 9.22 avg = 9.08 regnety_400m min = 8.56 max = 9.36 avg = 8.70 blazeface min = 1.36 max = 3.52 avg = 1.60 googlenet min = 11.80 max = 12.02 avg = 11.93 googlenet_int8 min = 11.87 max = 23.09 avg = 13.16 resnet18 min = 7.27 max = 7.64 avg = 7.38 resnet18_int8 min = 11.02 max = 11.73 avg = 11.20 alexnet min = 9.05 max = 9.35 avg = 9.17 vgg16 min = 44.13 max = 50.84 avg = 46.89 vgg16_int8 min = 75.15 max = 80.73 avg = 77.52 resnet50 min = 18.72 max = 27.49 avg = 19.96 resnet50_int8 min = 22.72 max = 36.80 avg = 26.78 squeezenet_ssd min = 13.96 max = 27.42 avg = 15.62 squeezenet_ssd_int8 min = 15.01 max = 29.53 avg = 19.51 mobilenet_ssd min = 9.37 max = 13.34 avg = 10.44 mobilenet_ssd_int8 min = 8.07 max = 24.28 avg = 9.83 mobilenet_yolo min = 22.06 max = 24.89 avg = 22.91 mobilenetv2_yolov3 min = 14.41 max = 15.97 avg = 14.78 yolov4-tiny min = 20.71 max = 23.96 avg = 21.42 nanodet_m min = 6.37 max = 6.59 avg = 6.45 yolo-fastest-1.1 min = 4.27 max = 4.52 avg = 4.34 yolo-fastestv2 min = 3.53 max = 3.63 avg = 3.58 vision_transformer min = 435.60 max = 523.43 avg = 479.70 FastestDet min = 3.54 max = 7.95 avg = 5.24 mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 2 -1 0 loop_count = 10 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 4.04 max = 4.22 avg = 4.09 squeezenet_int8 min = 4.64 max = 4.76 avg = 4.69 mobilenet min = 6.04 max = 6.06 avg = 6.05 mobilenet_int8 min = 5.23 max = 
5.32 avg = 5.25 mobilenet_v2 min = 5.00 max = 5.03 avg = 5.01 mobilenet_v3 min = 4.49 max = 4.69 avg = 4.52 shufflenet min = 3.90 max = 3.94 avg = 3.91 shufflenet_v2 min = 3.27 max = 3.48 avg = 3.33 mnasnet min = 4.80 max = 4.83 avg = 4.82 proxylessnasnet min = 5.20 max = 5.28 avg = 5.23 efficientnet_b0 min = 10.53 max = 11.06 avg = 10.68 efficientnetv2_b0 min = 13.18 max = 13.37 avg = 13.25 regnety_400m min = 9.20 max = 9.25 avg = 9.22 blazeface min = 1.43 max = 1.45 avg = 1.44 googlenet min = 17.63 max = 17.78 avg = 17.71 googlenet_int8 min = 17.63 max = 18.03 avg = 17.85 resnet18 min = 10.34 max = 10.59 avg = 10.40 resnet18_int8 min = 17.93 max = 18.84 avg = 18.25 alexnet min = 13.28 max = 13.37 avg = 13.31 vgg16 min = 55.41 max = 56.60 avg = 55.70 vgg16_int8 min = 123.71 max = 125.34 avg = 124.48 resnet50 min = 27.82 max = 28.22 avg = 27.95 resnet50_int8 min = 34.50 max = 34.89 avg = 34.70 squeezenet_ssd min = 14.67 max = 15.19 avg = 14.85 squeezenet_ssd_int8 min = 19.76 max = 20.32 avg = 19.87 mobilenet_ssd min = 13.15 max = 13.38 avg = 13.21 mobilenet_ssd_int8 min = 11.52 max = 11.70 avg = 11.60 mobilenet_yolo min = 30.95 max = 31.28 avg = 31.05 mobilenetv2_yolov3 min = 20.04 max = 20.36 avg = 20.16 yolov4-tiny min = 25.61 max = 26.73 avg = 25.80 nanodet_m min = 7.93 max = 7.97 avg = 7.95 yolo-fastest-1.1 min = 4.52 max = 4.59 avg = 4.53 yolo-fastestv2 min = 3.74 max = 3.88 avg = 3.77 vision_transformer min = 546.94 max = 726.81 avg = 698.27 FastestDet min = 3.59 max = 3.61 avg = 3.60 ``` ### HUAWEI KunPeng 920 3211K (x24 cores) test on Ubuntu 22.04 ``` (base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 12.11 max = 12.20 avg = 12.14 squeezenet_int8 min = 14.34 max = 14.46 avg = 14.41 mobilenet min = 20.27 max = 20.36 avg = 20.31 mobilenet_int8 min = 17.45 max = 17.74 avg = 17.58 mobilenet_v2 min = 13.72 max = 13.87 avg = 13.78 mobilenet_v3 
min = 11.51 max = 11.69 avg = 11.61 shufflenet min = 8.07 max = 8.36 avg = 8.20 shufflenet_v2 min = 8.13 max = 8.17 avg = 8.14 mnasnet min = 13.34 max = 13.45 avg = 13.41 proxylessnasnet min = 16.22 max = 16.35 avg = 16.29 efficientnet_b0 min = 34.69 max = 35.14 avg = 34.82 efficientnetv2_b0 min = 44.54 max = 44.68 avg = 44.61 regnety_400m min = 18.06 max = 18.15 avg = 18.10 blazeface min = 3.06 max = 3.22 avg = 3.12 googlenet min = 56.80 max = 57.60 avg = 57.08 googlenet_int8 min = 58.64 max = 59.98 avg = 59.42 resnet18 min = 35.02 max = 35.35 avg = 35.10 resnet18_int8 min = 61.13 max = 61.68 avg = 61.33 alexnet min = 42.56 max = 43.05 avg = 42.69 vgg16 min = 186.32 max = 188.73 avg = 187.20 vgg16_int8 min = 459.01 max = 461.48 avg = 460.29 resnet50 min = 97.59 max = 98.32 avg = 97.83 resnet50_int8 min = 118.67 max = 120.45 avg = 119.78 squeezenet_ssd min = 39.62 max = 39.95 avg = 39.81 squeezenet_ssd_int8 min = 56.72 max = 57.63 avg = 57.00 mobilenet_ssd min = 45.44 max = 45.82 avg = 45.63 mobilenet_ssd_int8 min = 38.99 max = 40.08 avg = 39.39 mobilenet_yolo min = 98.71 max = 99.27 avg = 98.94 mobilenetv2_yolov3 min = 51.50 max = 52.41 avg = 51.87 yolov4-tiny min = 68.02 max = 68.43 avg = 68.24 nanodet_m min = 20.49 max = 20.64 avg = 20.59 yolo-fastest-1.1 min = 8.17 max = 8.45 avg = 8.23 yolo-fastestv2 min = 7.73 max = 8.06 avg = 7.87 vision_transformer min = 1620.65 max = 1630.45 avg = 1625.64 FastestDet min = 7.65 max = 7.77 avg = 7.69 (base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 2 0 -1 0 loop_count = 10 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 6.77 max = 6.85 avg = 6.81 squeezenet_int8 min = 7.98 max = 8.07 avg = 8.03 mobilenet min = 10.70 max = 10.78 avg = 10.73 mobilenet_int8 min = 9.21 max = 9.36 avg = 9.28 mobilenet_v2 min = 7.91 max = 7.99 avg = 7.94 mobilenet_v3 min = 6.72 max = 6.92 avg = 6.78 shufflenet min = 5.34 max = 5.55 avg = 5.38 shufflenet_v2 min = 5.12 max = 5.15 avg = 5.14 mnasnet min 
= 7.74 max = 7.86 avg = 7.80 proxylessnasnet min = 9.00 max = 9.03 avg = 9.02 efficientnet_b0 min = 18.51 max = 18.58 avg = 18.54 efficientnetv2_b0 min = 23.68 max = 23.83 avg = 23.74 regnety_400m min = 12.65 max = 12.68 avg = 12.66 blazeface min = 1.99 max = 2.14 avg = 2.03 googlenet min = 30.83 max = 31.29 avg = 30.91 googlenet_int8 min = 31.97 max = 33.12 avg = 32.45 resnet18 min = 18.81 max = 18.87 avg = 18.84 resnet18_int8 min = 32.80 max = 32.99 avg = 32.90 alexnet min = 22.88 max = 23.16 avg = 22.94 vgg16 min = 100.58 max = 101.12 avg = 100.90 vgg16_int8 min = 235.81 max = 237.97 avg = 236.20 resnet50 min = 51.12 max = 51.43 avg = 51.28 resnet50_int8 min = 62.46 max = 63.02 avg = 62.72 squeezenet_ssd min = 23.26 max = 23.73 avg = 23.38 squeezenet_ssd_int8 min = 31.91 max = 32.30 avg = 32.13 mobilenet_ssd min = 24.73 max = 24.95 avg = 24.84 mobilenet_ssd_int8 min = 20.99 max = 21.52 avg = 21.21 mobilenet_yolo min = 54.91 max = 55.70 avg = 55.15 mobilenetv2_yolov3 min = 30.18 max = 30.52 avg = 30.31 yolov4-tiny min = 40.46 max = 40.61 avg = 40.55 nanodet_m min = 12.56 max = 12.72 avg = 12.62 yolo-fastest-1.1 min = 6.00 max = 6.15 avg = 6.04 yolo-fastestv2 min = 5.32 max = 5.59 avg = 5.43 vision_transformer min = 894.51 max = 896.28 avg = 895.57 FastestDet min = 5.33 max = 5.42 avg = 5.36 (base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 0 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 4.18 max = 4.35 avg = 4.22 squeezenet_int8 min = 4.85 max = 4.98 avg = 4.89 mobilenet min = 5.80 max = 5.95 avg = 5.89 mobilenet_int8 min = 4.86 max = 4.94 avg = 4.89 mobilenet_v2 min = 4.66 max = 4.73 avg = 4.69 mobilenet_v3 min = 4.46 max = 4.50 avg = 4.48 shufflenet min = 4.01 max = 4.17 avg = 4.04 shufflenet_v2 min = 3.39 max = 3.41 avg = 3.39 mnasnet min = 4.81 max = 4.93 avg = 4.85 proxylessnasnet min = 5.47 max = 5.54 avg = 5.49 efficientnet_b0 min = 10.49 max = 10.55 avg = 10.52 efficientnetv2_b0 min = 
13.67 max = 13.77 avg = 13.72 regnety_400m min = 10.20 max = 10.24 avg = 10.21 blazeface min = 1.52 max = 1.58 avg = 1.54 googlenet min = 17.65 max = 17.69 avg = 17.68 googlenet_int8 min = 18.14 max = 18.27 avg = 18.19 resnet18 min = 10.52 max = 10.63 avg = 10.57 resnet18_int8 min = 17.42 max = 17.53 avg = 17.49 alexnet min = 13.12 max = 13.20 avg = 13.16 vgg16 min = 55.24 max = 55.45 avg = 55.35 vgg16_int8 min = 123.46 max = 124.23 avg = 123.75 resnet50 min = 28.31 max = 28.57 avg = 28.39 resnet50_int8 min = 34.10 max = 34.39 avg = 34.23 squeezenet_ssd min = 14.85 max = 14.96 avg = 14.91 squeezenet_ssd_int8 min = 19.71 max = 19.88 avg = 19.82 mobilenet_ssd min = 13.49 max = 13.58 avg = 13.52 mobilenet_ssd_int8 min = 11.60 max = 11.70 avg = 11.66 mobilenet_yolo min = 31.74 max = 31.96 avg = 31.81 mobilenetv2_yolov3 min = 17.87 max = 18.03 avg = 17.93 yolov4-tiny min = 25.63 max = 25.78 avg = 25.72 nanodet_m min = 8.16 max = 8.22 avg = 8.20 yolo-fastest-1.1 min = 4.72 max = 4.86 avg = 4.75 yolo-fastestv2 min = 3.98 max = 4.15 avg = 4.00 vision_transformer min = 501.18 max = 503.51 avg = 502.12 FastestDet min = 3.74 max = 3.76 avg = 3.75 (base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 8 0 -1 0 loop_count = 10 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 2.91 max = 3.10 avg = 2.97 squeezenet_int8 min = 3.42 max = 3.74 avg = 3.51 mobilenet min = 3.57 max = 3.70 avg = 3.61 mobilenet_int8 min = 3.06 max = 3.14 avg = 3.10 mobilenet_v2 min = 3.73 max = 3.75 avg = 3.75 mobilenet_v3 min = 3.50 max = 3.66 avg = 3.56 shufflenet min = 3.63 max = 3.65 avg = 3.64 shufflenet_v2 min = 2.85 max = 3.02 avg = 2.95 mnasnet min = 3.60 max = 3.67 avg = 3.62 proxylessnasnet min = 4.00 max = 4.08 avg = 4.03 efficientnet_b0 min = 7.31 max = 7.34 avg = 7.33 efficientnetv2_b0 min = 9.44 max = 9.51 avg = 9.47 regnety_400m min = 9.76 max = 10.07 avg = 9.90 blazeface min = 1.56 max = 1.75 avg = 1.61 googlenet min = 11.22 max = 11.28 avg = 11.25 
googlenet_int8 min = 11.40 max = 12.82 avg = 11.76 resnet18 min = 6.83 max = 6.96 avg = 6.90 resnet18_int8 min = 10.28 max = 10.38 avg = 10.33 alexnet min = 8.75 max = 8.88 avg = 8.80 vgg16 min = 36.00 max = 36.72 avg = 36.29 vgg16_int8 min = 67.38 max = 67.72 avg = 67.54 resnet50 min = 17.63 max = 17.82 avg = 17.68 resnet50_int8 min = 20.05 max = 20.21 avg = 20.15 squeezenet_ssd min = 11.18 max = 11.45 avg = 11.26 squeezenet_ssd_int8 min = 14.09 max = 14.23 avg = 14.18 mobilenet_ssd min = 8.60 max = 8.69 avg = 8.64 mobilenet_ssd_int8 min = 7.75 max = 7.87 avg = 7.81 mobilenet_yolo min = 21.97 max = 22.25 avg = 22.09 mobilenetv2_yolov3 min = 14.04 max = 14.18 avg = 14.12 yolov4-tiny min = 19.66 max = 19.93 avg = 19.81 nanodet_m min = 6.52 max = 6.67 avg = 6.57 yolo-fastest-1.1 min = 4.61 max = 4.76 avg = 4.66 yolo-fastestv2 min = 3.78 max = 3.91 avg = 3.82 vision_transformer min = 323.01 max = 327.38 avg = 323.75 FastestDet min = 3.50 max = 3.54 avg = 3.51 (base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 16 0 -1 0 loop_count = 10 num_threads = 16 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 3.00 max = 3.25 avg = 3.08 squeezenet_int8 min = 4.13 max = 4.47 avg = 4.21 mobilenet min = 3.27 max = 3.42 avg = 3.34 mobilenet_int8 min = 3.49 max = 3.58 avg = 3.56 mobilenet_v2 min = 3.86 max = 4.10 avg = 3.97 mobilenet_v3 min = 3.72 max = 3.80 avg = 3.76 shufflenet min = 4.67 max = 4.78 avg = 4.72 shufflenet_v2 min = 3.16 max = 3.24 avg = 3.20 mnasnet min = 3.51 max = 3.65 avg = 3.57 proxylessnasnet min = 4.08 max = 4.35 avg = 4.15 efficientnet_b0 min = 7.51 max = 7.80 avg = 7.63 efficientnetv2_b0 min = 8.92 max = 9.39 avg = 9.05 regnety_400m min = 14.80 max = 15.05 avg = 14.89 blazeface min = 2.14 max = 2.28 avg = 2.20 googlenet min = 9.91 max = 10.00 avg = 9.96 googlenet_int8 min = 11.51 max = 11.65 avg = 11.60 resnet18 min = 6.39 max = 6.56 avg = 6.46 resnet18_int8 min = 9.76 max = 9.91 avg = 9.84 alexnet min = 6.99 max = 7.10 avg = 7.04 
vgg16 min = 27.52 max = 28.64 avg = 27.88 vgg16_int8 min = 45.64 max = 45.93 avg = 45.78 resnet50 min = 13.96 max = 14.17 avg = 14.07 resnet50_int8 min = 16.82 max = 16.93 avg = 16.89 squeezenet_ssd min = 11.11 max = 11.54 avg = 11.23 squeezenet_ssd_int8 min = 13.77 max = 14.00 avg = 13.88 mobilenet_ssd min = 8.21 max = 8.46 avg = 8.35 mobilenet_ssd_int8 min = 8.87 max = 9.03 avg = 8.94 mobilenet_yolo min = 30.77 max = 31.35 avg = 31.08 mobilenetv2_yolov3 min = 12.11 max = 13.10 avg = 12.43 yolov4-tiny min = 18.25 max = 18.68 avg = 18.41 nanodet_m min = 6.55 max = 6.68 avg = 6.59 yolo-fastest-1.1 min = 6.00 max = 6.22 avg = 6.09 yolo-fastestv2 min = 4.86 max = 5.01 avg = 4.94 vision_transformer min = 218.18 max = 220.49 avg = 218.79 FastestDet min = 5.01 max = 5.14 avg = 5.07 (base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 24 0 -1 0 loop_count = 10 num_threads = 24 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 3.52 max = 3.96 avg = 3.70 squeezenet_int8 min = 5.49 max = 5.83 avg = 5.65 mobilenet min = 3.42 max = 3.83 avg = 3.55 mobilenet_int8 min = 3.69 max = 45.17 avg = 11.59 mobilenet_v2 min = 4.63 max = 5.44 avg = 4.84 mobilenet_v3 min = 4.51 max = 4.89 avg = 4.68 shufflenet min = 6.21 max = 6.52 avg = 6.36 shufflenet_v2 min = 3.98 max = 17.54 avg = 5.45 mnasnet min = 4.28 max = 4.56 avg = 4.39 proxylessnasnet min = 4.76 max = 5.13 avg = 4.92 efficientnet_b0 min = 7.45 max = 111.76 avg = 22.59 efficientnetv2_b0 min = 10.87 max = 33.13 avg = 13.51 regnety_400m min = 20.97 max = 21.73 avg = 21.46 blazeface min = 2.56 max = 2.82 avg = 2.67 googlenet min = 10.54 max = 105.87 avg = 21.85 googlenet_int8 min = 14.21 max = 77.02 avg = 22.23 resnet18 min = 7.08 max = 7.51 avg = 7.31 resnet18_int8 min = 11.25 max = 50.66 avg = 19.14 alexnet min = 7.13 max = 8.67 avg = 7.44 vgg16 min = 27.59 max = 35.35 avg = 29.12 vgg16_int8 min = 44.43 max = 51.76 avg = 46.90 resnet50 min = 15.16 max = 105.98 avg = 24.91 resnet50_int8 min = 19.82 max = 
20.50 avg = 20.16 squeezenet_ssd min = 13.03 max = 13.69 avg = 13.40 squeezenet_ssd_int8 min = 17.62 max = 187.55 avg = 39.92 mobilenet_ssd min = 8.83 max = 71.97 avg = 15.37 mobilenet_ssd_int8 min = 10.22 max = 49.61 avg = 15.26 mobilenet_yolo min = 35.19 max = 46.43 avg = 36.93 mobilenetv2_yolov3 min = 12.96 max = 15.57 avg = 13.41 yolov4-tiny min = 19.22 max = 21.43 avg = 19.89 nanodet_m min = 7.71 max = 8.74 avg = 8.09 yolo-fastest-1.1 min = 6.71 max = 78.72 avg = 14.16 yolo-fastestv2 min = 5.72 max = 6.08 avg = 5.88 vision_transformer min = 192.16 max = 221.86 avg = 202.73 FastestDet min = 5.13 max = 5.47 avg = 5.30 ``` ### HUAWEI Kunpeng 920 7260 (x64 cores) test on Ubuntu 20.04 (gcc 9.4.0) ``` root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 1 0 -1 0 loop_count = 300 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 11.64 max = 12.11 avg = 11.71 squeezenet_int8 min = 12.22 max = 13.22 avg = 12.37 mobilenet min = 20.00 max = 20.79 avg = 20.08 mobilenet_int8 min = 17.44 max = 19.09 avg = 17.64 mobilenet_v2 min = 13.29 max = 14.25 avg = 13.39 mobilenet_v3 min = 11.06 max = 11.84 avg = 11.11 shufflenet min = 7.56 max = 7.74 avg = 7.59 shufflenet_v2 min = 7.84 max = 8.37 avg = 7.88 mnasnet min = 13.07 max = 13.78 avg = 13.14 proxylessnasnet min = 15.71 max = 16.31 avg = 15.77 efficientnet_b0 min = 34.79 max = 35.98 avg = 34.92 efficientnetv2_b0 min = 35.28 max = 36.36 avg = 35.41 regnety_400m min = 17.06 max = 17.74 avg = 17.16 blazeface min = 2.99 max = 3.04 avg = 3.01 googlenet min = 50.76 max = 51.74 avg = 51.00 googlenet_int8 min = 50.31 max = 52.27 avg = 50.65 resnet18 min = 34.97 max = 37.17 avg = 35.82 resnet18_int8 min = 40.47 max = 42.03 avg = 40.78 alexnet min = 39.19 max = 39.80 avg = 39.32 vgg16 min = 176.62 max = 181.29 avg = 177.07 vgg16_int8 min = 352.35 max = 358.38 avg = 355.15 resnet50 min = 96.76 max = 98.63 avg = 97.09 resnet50_int8 min = 90.00 max = 92.74 avg = 90.81 
squeezenet_ssd min = 33.23 max = 33.99 avg = 33.39 squeezenet_ssd_int8 min = 38.50 max = 41.53 avg = 39.28 mobilenet_ssd min = 42.49 max = 44.78 avg = 42.72 mobilenet_ssd_int8 min = 37.06 max = 39.97 avg = 37.57 mobilenet_yolo min = 96.34 max = 98.91 avg = 96.73 mobilenetv2_yolov3 min = 50.88 max = 52.97 avg = 51.15 yolov4-tiny min = 65.56 max = 67.13 avg = 65.80 nanodet_m min = 19.94 max = 20.82 avg = 20.04 yolo-fastest-1.1 min = 7.66 max = 7.81 avg = 7.71 yolo-fastestv2 min = 6.82 max = 7.23 avg = 6.87 vision_transformer min = 1535.03 max = 1552.84 avg = 1543.73 FastestDet min = 7.17 max = 7.50 avg = 7.21 root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 2 0 -1 0 loop_count = 300 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 6.35 max = 9.15 avg = 7.33 squeezenet_int8 min = 8.06 max = 8.60 avg = 8.14 mobilenet min = 10.30 max = 11.86 avg = 11.48 mobilenet_int8 min = 8.93 max = 11.87 avg = 10.47 mobilenet_v2 min = 9.05 max = 11.50 avg = 9.19 mobilenet_v3 min = 6.32 max = 6.42 avg = 6.36 shufflenet min = 6.73 max = 8.55 avg = 6.81 shufflenet_v2 min = 4.94 max = 6.65 avg = 6.32 mnasnet min = 7.38 max = 10.77 avg = 8.82 proxylessnasnet min = 8.57 max = 9.72 avg = 8.63 efficientnet_b0 min = 18.61 max = 22.53 avg = 20.42 efficientnetv2_b0 min = 18.75 max = 21.93 avg = 20.79 regnety_400m min = 11.86 max = 15.09 avg = 14.60 blazeface min = 1.95 max = 3.37 avg = 2.06 googlenet min = 28.66 max = 32.24 avg = 28.94 googlenet_int8 min = 27.64 max = 32.15 avg = 30.84 resnet18 min = 20.33 max = 20.77 avg = 20.47 resnet18_int8 min = 22.63 max = 23.72 avg = 22.88 alexnet min = 20.41 max = 29.37 avg = 27.22 vgg16 min = 101.72 max = 140.33 avg = 103.29 vgg16_int8 min = 187.56 max = 211.44 avg = 189.92 resnet50 min = 51.07 max = 59.25 avg = 58.35 resnet50_int8 min = 46.50 max = 52.55 avg = 48.93 squeezenet_ssd min = 22.48 max = 28.59 avg = 22.98 squeezenet_ssd_int8 min = 25.56 max = 26.82 avg = 25.99 mobilenet_ssd 
min = 22.81 max = 26.21 avg = 24.88 mobilenet_ssd_int8 min = 19.31 max = 25.53 avg = 21.74 mobilenet_yolo min = 59.58 max = 62.04 avg = 59.99 mobilenetv2_yolov3 min = 33.26 max = 35.74 avg = 33.51 yolov4-tiny min = 41.14 max = 45.34 avg = 42.46 nanodet_m min = 12.10 max = 16.69 avg = 15.02 yolo-fastest-1.1 min = 5.44 max = 7.78 avg = 7.24 yolo-fastestv2 min = 5.03 max = 8.08 avg = 6.75 vision_transformer min = 994.46 max = 1090.68 avg = 1045.50 FastestDet min = 6.76 max = 6.91 avg = 6.83 root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 4 0 -1 0 loop_count = 300 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 3.79 max = 6.99 avg = 4.55 squeezenet_int8 min = 5.13 max = 5.68 avg = 5.20 mobilenet min = 6.25 max = 6.55 avg = 6.30 mobilenet_int8 min = 5.96 max = 6.10 avg = 6.03 mobilenet_v2 min = 5.34 max = 7.15 avg = 5.62 mobilenet_v3 min = 4.05 max = 5.74 avg = 5.01 shufflenet min = 3.69 max = 5.81 avg = 5.15 shufflenet_v2 min = 4.31 max = 6.02 avg = 4.56 mnasnet min = 4.48 max = 6.05 avg = 5.54 proxylessnasnet min = 5.05 max = 8.08 avg = 6.03 efficientnet_b0 min = 10.17 max = 12.21 avg = 11.58 efficientnetv2_b0 min = 10.86 max = 15.78 avg = 12.70 regnety_400m min = 9.24 max = 14.13 avg = 11.98 blazeface min = 1.89 max = 1.97 avg = 1.93 googlenet min = 15.19 max = 20.31 avg = 16.90 googlenet_int8 min = 17.97 max = 19.40 avg = 18.11 resnet18 min = 11.18 max = 11.48 avg = 11.29 resnet18_int8 min = 12.26 max = 12.78 avg = 12.44 alexnet min = 14.43 max = 16.94 avg = 14.68 vgg16 min = 62.40 max = 78.42 avg = 64.96 vgg16_int8 min = 101.52 max = 109.42 avg = 104.46 resnet50 min = 29.19 max = 39.69 avg = 32.99 resnet50_int8 min = 26.94 max = 28.82 avg = 27.16 squeezenet_ssd min = 12.90 max = 16.52 avg = 15.20 squeezenet_ssd_int8 min = 15.58 max = 18.40 avg = 16.28 mobilenet_ssd min = 13.68 max = 14.45 avg = 13.87 mobilenet_ssd_int8 min = 12.20 max = 14.58 avg = 12.84 mobilenet_yolo min = 34.85 max = 36.54 avg 
= 35.05 mobilenetv2_yolov3 min = 18.61 max = 20.93 avg = 19.92 yolov4-tiny min = 26.09 max = 32.32 avg = 28.03 nanodet_m min = 7.85 max = 12.48 avg = 11.00 yolo-fastest-1.1 min = 6.19 max = 6.49 avg = 6.31 yolo-fastestv2 min = 3.66 max = 6.83 avg = 5.11 vision_transformer min = 605.95 max = 624.99 avg = 609.79 FastestDet min = 4.32 max = 5.41 avg = 5.17 root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 8 0 -1 0 loop_count = 300 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 2.72 max = 3.74 avg = 3.05 squeezenet_int8 min = 3.80 max = 4.71 avg = 4.03 mobilenet min = 3.94 max = 5.15 avg = 4.00 mobilenet_int8 min = 3.73 max = 3.87 avg = 3.80 mobilenet_v2 min = 4.51 max = 6.57 avg = 4.68 mobilenet_v3 min = 4.12 max = 4.38 avg = 4.28 shufflenet min = 4.60 max = 6.27 avg = 4.88 shufflenet_v2 min = 4.07 max = 4.20 avg = 4.11 mnasnet min = 4.26 max = 4.51 avg = 4.36 proxylessnasnet min = 4.71 max = 7.40 avg = 4.80 efficientnet_b0 min = 8.49 max = 8.74 avg = 8.56 efficientnetv2_b0 min = 9.34 max = 9.68 avg = 9.41 regnety_400m min = 8.00 max = 12.85 avg = 10.64 blazeface min = 1.76 max = 1.84 avg = 1.80 googlenet min = 10.89 max = 11.33 avg = 10.98 googlenet_int8 min = 11.66 max = 14.07 avg = 11.83 resnet18 min = 6.48 max = 6.61 avg = 6.54 resnet18_int8 min = 7.30 max = 7.79 avg = 7.51 alexnet min = 8.33 max = 8.95 avg = 8.62 vgg16 min = 29.94 max = 47.54 avg = 31.95 vgg16_int8 min = 54.67 max = 60.76 avg = 56.03 resnet50 min = 16.13 max = 20.79 avg = 20.03 resnet50_int8 min = 15.64 max = 20.13 avg = 16.11 squeezenet_ssd min = 11.58 max = 12.02 avg = 11.77 squeezenet_ssd_int8 min = 11.14 max = 13.72 avg = 12.10 mobilenet_ssd min = 8.27 max = 10.77 avg = 8.76 mobilenet_ssd_int8 min = 8.13 max = 9.09 avg = 8.29 mobilenet_yolo min = 23.90 max = 24.69 avg = 24.17 mobilenetv2_yolov3 min = 14.83 max = 15.72 avg = 15.19 yolov4-tiny min = 19.78 max = 23.66 avg = 20.05 nanodet_m min = 8.92 max = 10.76 avg = 9.09 
yolo-fastest-1.1 min = 5.49 max = 5.77 avg = 5.63 yolo-fastestv2 min = 5.04 max = 5.21 avg = 5.10 vision_transformer min = 318.42 max = 379.40 avg = 363.66 FastestDet min = 4.18 max = 4.54 avg = 4.38 root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 16 0 -1 0 loop_count = 300 num_threads = 16 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 2.70 max = 3.14 avg = 2.81 squeezenet_int8 min = 3.21 max = 4.22 avg = 3.39 mobilenet min = 3.13 max = 3.26 avg = 3.20 mobilenet_int8 min = 3.17 max = 5.05 avg = 3.30 mobilenet_v2 min = 4.31 max = 6.24 avg = 4.62 mobilenet_v3 min = 3.57 max = 3.77 avg = 3.68 shufflenet min = 4.70 max = 6.45 avg = 4.80 shufflenet_v2 min = 3.73 max = 4.27 avg = 3.87 mnasnet min = 3.67 max = 3.87 avg = 3.75 proxylessnasnet min = 4.28 max = 4.81 avg = 4.35 efficientnet_b0 min = 7.31 max = 7.77 avg = 7.53 efficientnetv2_b0 min = 9.87 max = 12.33 avg = 10.07 regnety_400m min = 17.95 max = 18.53 avg = 18.26 blazeface min = 2.26 max = 2.40 avg = 2.33 googlenet min = 9.51 max = 9.99 avg = 9.68 googlenet_int8 min = 10.98 max = 11.36 avg = 11.18 resnet18 min = 5.59 max = 6.08 avg = 5.71 resnet18_int8 min = 6.55 max = 7.28 avg = 6.77 alexnet min = 6.26 max = 6.50 avg = 6.36 vgg16 min = 23.98 max = 27.37 avg = 24.89 vgg16_int8 min = 38.07 max = 39.66 avg = 39.02 resnet50 min = 12.81 max = 14.19 avg = 13.76 resnet50_int8 min = 12.42 max = 12.84 avg = 12.55 squeezenet_ssd min = 10.80 max = 11.49 avg = 11.12 squeezenet_ssd_int8 min = 11.57 max = 12.21 avg = 11.74 mobilenet_ssd min = 7.46 max = 8.08 avg = 7.84 mobilenet_ssd_int8 min = 7.47 max = 8.07 avg = 7.63 mobilenet_yolo min = 21.70 max = 23.43 avg = 21.92 mobilenetv2_yolov3 min = 12.55 max = 14.56 avg = 12.90 yolov4-tiny min = 17.68 max = 19.85 avg = 18.18 nanodet_m min = 8.35 max = 8.70 avg = 8.45 yolo-fastest-1.1 min = 5.70 max = 7.11 avg = 6.05 yolo-fastestv2 min = 4.85 max = 5.70 avg = 5.37 vision_transformer min = 214.36 max = 259.56 avg = 245.47 
FastestDet min = 5.01 max = 5.42 avg = 5.17 root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 32 0 -1 0 loop_count = 300 num_threads = 32 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 2.30 max = 2.94 avg = 2.46 squeezenet_int8 min = 3.08 max = 4.88 avg = 4.03 mobilenet min = 2.49 max = 2.76 avg = 2.53 mobilenet_int8 min = 2.86 max = 3.73 avg = 2.95 mobilenet_v2 min = 4.51 max = 5.20 avg = 4.74 mobilenet_v3 min = 5.11 max = 6.91 avg = 6.10 shufflenet min = 5.57 max = 6.51 avg = 5.78 shufflenet_v2 min = 4.37 max = 4.66 avg = 4.48 mnasnet min = 3.72 max = 4.08 avg = 3.90 proxylessnasnet min = 4.19 max = 6.18 avg = 4.79 efficientnet_b0 min = 6.80 max = 7.22 avg = 6.89 efficientnetv2_b0 min = 13.98 max = 17.55 avg = 15.06 regnety_400m min = 16.10 max = 16.72 avg = 16.26 blazeface min = 2.12 max = 2.53 avg = 2.17 googlenet min = 8.63 max = 9.89 avg = 8.77 googlenet_int8 min = 9.90 max = 11.09 avg = 10.08 resnet18 min = 6.54 max = 6.99 avg = 6.73 resnet18_int8 min = 8.34 max = 9.00 avg = 8.67 alexnet min = 6.64 max = 7.15 avg = 6.93 vgg16 min = 22.79 max = 23.91 avg = 23.50 vgg16_int8 min = 32.37 max = 37.51 avg = 33.13 resnet50 min = 11.19 max = 16.40 avg = 11.47 resnet50_int8 min = 11.92 max = 12.55 avg = 12.13 squeezenet_ssd min = 10.75 max = 12.28 avg = 11.12 squeezenet_ssd_int8 min = 11.31 max = 12.29 avg = 11.57 mobilenet_ssd min = 10.25 max = 11.26 avg = 10.79 mobilenet_ssd_int8 min = 11.39 max = 16.99 avg = 11.98 mobilenet_yolo min = 52.11 max = 60.46 avg = 53.84 mobilenetv2_yolov3 min = 12.07 max = 12.47 avg = 12.20 yolov4-tiny min = 17.48 max = 17.79 avg = 17.58 nanodet_m min = 13.06 max = 14.71 avg = 13.64 yolo-fastest-1.1 min = 5.70 max = 5.89 avg = 5.79 yolo-fastestv2 min = 8.89 max = 9.99 avg = 9.21 vision_transformer min = 158.92 max = 187.40 avg = 168.21 FastestDet min = 8.70 max = 9.43 avg = 9.00 root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 64 0 -1 0 loop_count 
= 300 num_threads = 64 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 6.85 max = 78.56 avg = 7.81 squeezenet_int8 min = 8.06 max = 88.91 avg = 9.23 mobilenet min = 3.02 max = 86.86 avg = 5.89 mobilenet_int8 min = 3.58 max = 4.55 avg = 3.68 mobilenet_v2 min = 5.05 max = 150.06 avg = 13.04 mobilenet_v3 min = 4.85 max = 125.22 avg = 8.34 shufflenet min = 17.80 max = 220.55 avg = 21.01 shufflenet_v2 min = 11.23 max = 381.95 avg = 13.71 mnasnet min = 9.83 max = 128.42 avg = 11.10 proxylessnasnet min = 10.53 max = 68.52 avg = 12.03 efficientnet_b0 min = 16.78 max = 968.87 avg = 23.94 efficientnetv2_b0 min = 26.23 max = 551.18 avg = 31.34 regnety_400m min = 70.14 max = 407.92 avg = 78.30 blazeface min = 7.27 max = 191.44 avg = 9.37 googlenet min = 16.69 max = 820.58 avg = 25.06 googlenet_int8 min = 20.58 max = 849.09 avg = 29.87 resnet18 min = 8.67 max = 349.00 avg = 11.33 resnet18_int8 min = 10.40 max = 128.98 avg = 11.45 alexnet min = 6.15 max = 196.01 avg = 10.24 vgg16 min = 21.11 max = 288.66 avg = 29.37 vgg16_int8 min = 30.72 max = 251.95 avg = 37.68 resnet50 min = 19.10 max = 114.08 avg = 22.00 resnet50_int8 min = 18.99 max = 436.89 avg = 24.36 squeezenet_ssd min = 22.22 max = 510.52 avg = 28.76 squeezenet_ssd_int8 min = 23.42 max = 614.70 avg = 30.82 mobilenet_ssd min = 7.62 max = 202.66 avg = 14.59 mobilenet_ssd_int8 min = 7.89 max = 109.82 avg = 8.80 mobilenet_yolo min = 31.43 max = 742.10 avg = 45.52 mobilenetv2_yolov3 min = 18.31 max = 273.05 avg = 20.78 yolov4-tiny min = 21.03 max = 400.05 avg = 33.64 nanodet_m min = 19.94 max = 114.18 avg = 21.89 yolo-fastest-1.1 min = 7.20 max = 174.60 avg = 9.13 yolo-fastestv2 min = 7.50 max = 170.55 avg = 9.01 vision_transformer min = 126.90 max = 335.71 avg = 157.38 FastestDet min = 6.59 max = 19.77 avg = 6.77 ``` ### Intel Atom x5-Z8350 ``` nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 20 4 0 -1 1 loop_count = 20 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 
50.22 max = 50.53 avg = 50.32 squeezenet_int8 min = 77.92 max = 78.37 avg = 78.07 mobilenet min = 80.12 max = 81.53 avg = 80.35 mobilenet_int8 min = 120.54 max = 124.10 avg = 120.84 mobilenet_v2 min = 56.62 max = 60.12 avg = 58.37 mobilenet_v3 min = 50.19 max = 50.41 avg = 50.27 shufflenet min = 37.96 max = 38.28 avg = 38.10 shufflenet_v2 min = 35.28 max = 35.59 avg = 35.45 mnasnet min = 54.91 max = 55.10 avg = 55.01 proxylessnasnet min = 62.25 max = 62.59 avg = 62.40 efficientnet_b0 min = 101.92 max = 105.73 avg = 102.27 efficientnetv2_b0 min = 115.48 max = 117.25 avg = 115.89 regnety_400m min = 79.66 max = 81.70 avg = 79.95 blazeface min = 10.43 max = 10.60 avg = 10.49 googlenet min = 170.41 max = 173.44 avg = 170.68 googlenet_int8 min = 253.06 max = 257.34 avg = 253.57 resnet18 min = 127.19 max = 130.69 avg = 127.65 resnet18_int8 min = 200.54 max = 204.25 avg = 200.88 alexnet min = 104.89 max = 110.89 avg = 105.56 vgg16 min = 653.78 max = 661.34 avg = 655.44 vgg16_int8 min = 974.72 max = 1006.48 avg = 978.76 resnet50 min = 367.63 max = 371.74 avg = 368.27 resnet50_int8 min = 574.94 max = 584.08 avg = 576.18 squeezenet_ssd min = 115.35 max = 116.47 avg = 115.62 squeezenet_ssd_int8 min = 169.95 max = 170.75 avg = 170.26 mobilenet_ssd min = 167.00 max = 172.02 avg = 168.95 mobilenet_ssd_int8 min = 244.91 max = 248.30 avg = 245.27 mobilenet_yolo min = 382.80 max = 393.23 avg = 385.79 mobilenetv2_yolov3 min = 208.23 max = 211.54 avg = 209.64 yolov4-tiny min = 251.10 max = 263.77 avg = 256.37 nanodet_m min = 84.48 max = 84.95 avg = 84.70 yolo-fastest-1.1 min = 44.11 max = 45.15 avg = 44.26 yolo-fastestv2 min = 37.95 max = 38.52 avg = 38.34 nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 10 1 0 -1 1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 130.52 max = 131.08 avg = 130.64 squeezenet_int8 min = 231.03 max = 231.38 avg = 231.19 mobilenet min = 231.40 max = 231.74 avg = 231.61 mobilenet_int8 min = 409.74 max 
= 410.02 avg = 409.85 mobilenet_v2 min = 150.23 max = 150.72 avg = 150.47 mobilenet_v3 min = 119.08 max = 119.34 avg = 119.20 shufflenet min = 72.62 max = 72.81 avg = 72.73 shufflenet_v2 min = 73.63 max = 73.71 avg = 73.68 mnasnet min = 140.87 max = 141.09 avg = 140.98 proxylessnasnet min = 166.39 max = 166.75 avg = 166.54 efficientnet_b0 min = 280.55 max = 281.30 avg = 280.77 efficientnetv2_b0 min = 321.05 max = 321.24 avg = 321.16 regnety_400m min = 183.78 max = 184.64 avg = 183.91 blazeface min = 18.94 max = 19.08 avg = 19.01 googlenet min = 453.56 max = 454.71 avg = 454.15 googlenet_int8 min = 791.40 max = 791.93 avg = 791.61 resnet18 min = 365.87 max = 366.40 avg = 366.15 resnet18_int8 min = 652.86 max = 653.39 avg = 653.09 alexnet min = 289.15 max = 290.25 avg = 289.65 vgg16 min = 1887.16 max = 1887.73 avg = 1887.41 vgg16_int8 min = 3211.44 max = 3213.39 avg = 3212.55 resnet50 min = 1060.37 max = 1061.40 avg = 1060.80 resnet50_int8 min = 1869.41 max = 1870.59 avg = 1870.17 squeezenet_ssd min = 277.23 max = 277.83 avg = 277.50 squeezenet_ssd_int8 min = 455.54 max = 458.06 avg = 456.28 mobilenet_ssd min = 478.03 max = 478.83 avg = 478.32 mobilenet_ssd_int8 min = 822.61 max = 822.96 avg = 822.79 mobilenet_yolo min = 1136.89 max = 1138.51 avg = 1137.74 mobilenetv2_yolov3 min = 551.81 max = 552.53 avg = 552.14 yolov4-tiny min = 685.49 max = 686.15 avg = 685.79 nanodet_m min = 181.21 max = 181.52 avg = 181.32 yolo-fastest-1.1 min = 82.21 max = 82.68 avg = 82.30 yolo-fastestv2 min = 67.62 max = 68.36 avg = 68.10 root@nihui-ROCK-Pi-X:/home/nihui/osd/ncnn/build/benchmark# ./benchncnn 10 1 0 0 0 [0 Intel(R) HD Graphics (CHV)] queueC=0[1] queueG=0[1] queueT=0[1] [0 Intel(R) HD Graphics (CHV)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Intel(R) HD Graphics (CHV)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Intel(R) HD Graphics (CHV)] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 29.14 
max = 29.76 avg = 29.45 mobilenet min = 36.19 max = 37.03 avg = 36.52 mobilenet_v2 min = 30.39 max = 31.62 avg = 30.76 mobilenet_v3 min = 31.60 max = 32.25 avg = 31.92 shufflenet min = 22.47 max = 23.19 avg = 22.70 shufflenet_v2 min = 22.30 max = 24.16 avg = 23.12 mnasnet min = 29.40 max = 30.23 avg = 29.84 proxylessnasnet min = 31.00 max = 31.91 avg = 31.41 efficientnet_b0 min = 58.03 max = 58.74 avg = 58.42 efficientnetv2_b0 min = 131.17 max = 191.61 avg = 161.37 regnety_400m min = 40.30 max = 42.27 avg = 41.04 blazeface min = 15.06 max = 15.96 avg = 15.48 googlenet min = 85.37 max = 86.49 avg = 85.84 resnet18 min = 93.87 max = 95.00 avg = 94.53 alexnet min = 110.96 max = 120.83 avg = 115.14 vgg16 min = 798.75 max = 812.60 avg = 804.93 resnet50 min = 213.12 max = 214.81 avg = 213.79 squeezenet_ssd min = 124.48 max = 125.18 avg = 124.87 mobilenet_ssd min = 84.04 max = 84.70 avg = 84.49 mobilenet_yolo min = 186.52 max = 189.61 avg = 188.53 mobilenetv2_yolov3 min = 102.07 max = 102.97 avg = 102.39 yolov4-tiny min = 212.49 max = 214.75 avg = 213.77 nanodet_m min = 42.97 max = 45.58 avg = 44.05 yolo-fastest-1.1 min = 27.14 max = 32.53 avg = 28.76 yolo-fastestv2 min = 20.73 max = 25.90 avg = 22.97 ``` ### Intel Celeron N5105 ``` loop_count = 8 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 18.06 max = 18.21 avg = 18.12 squeezenet_int8 min = 24.55 max = 25.16 avg = 24.69 mobilenet min = 32.22 max = 32.70 avg = 32.40 mobilenet_int8 min = 40.52 max = 40.59 avg = 40.54 mobilenet_v2 min = 22.54 max = 22.71 avg = 22.65 mobilenet_v3 min = 17.86 max = 19.02 avg = 18.09 shufflenet min = 11.23 max = 11.30 avg = 11.28 shufflenet_v2 min = 11.04 max = 11.19 avg = 11.13 mnasnet min = 19.93 max = 20.09 avg = 20.01 proxylessnasnet min = 21.91 max = 22.00 avg = 21.95 efficientnet_b0 min = 33.29 max = 33.66 avg = 33.50 efficientnetv2_b0 min = 40.16 max = 40.63 avg = 40.34 regnety_400m min = 27.38 max = 27.59 avg = 27.50 blazeface min = 3.01 max = 3.11 
avg = 3.04 googlenet min = 64.78 max = 65.16 avg = 65.01 googlenet_int8 min = 80.11 max = 80.79 avg = 80.46 resnet18 min = 53.91 max = 54.28 avg = 54.07 resnet18_int8 min = 63.95 max = 64.20 avg = 64.06 alexnet min = 51.84 max = 52.17 avg = 52.00 vgg16 min = 322.01 max = 324.34 avg = 322.72 vgg16_int8 min = 323.83 max = 324.17 avg = 324.02 resnet50 min = 152.66 max = 153.33 avg = 153.03 resnet50_int8 min = 193.40 max = 194.55 avg = 194.03 squeezenet_ssd min = 44.07 max = 44.51 avg = 44.37 squeezenet_ssd_int8 min = 51.08 max = 52.26 avg = 51.60 mobilenet_ssd min = 67.73 max = 68.21 avg = 67.98 mobilenet_ssd_int8 min = 82.41 max = 82.70 avg = 82.55 mobilenet_yolo min = 157.38 max = 159.44 avg = 158.23 mobilenetv2_yolov3 min = 83.35 max = 83.68 avg = 83.55 yolov4-tiny min = 107.25 max = 107.72 avg = 107.50 nanodet_m min = 26.93 max = 27.24 avg = 27.09 yolo-fastest-1.1 min = 12.47 max = 12.71 avg = 12.61 yolo-fastestv2 min = 10.65 max = 10.95 avg = 10.81 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 54.43 max = 54.48 avg = 54.46 squeezenet_int8 min = 79.32 max = 79.64 avg = 79.43 mobilenet min = 105.92 max = 106.12 avg = 106.03 mobilenet_int8 min = 152.24 max = 152.28 avg = 152.26 mobilenet_v2 min = 62.44 max = 62.83 avg = 62.57 mobilenet_v3 min = 49.47 max = 49.55 avg = 49.50 shufflenet min = 27.32 max = 27.37 avg = 27.34 shufflenet_v2 min = 29.85 max = 30.00 avg = 29.93 mnasnet min = 59.83 max = 60.09 avg = 59.98 proxylessnasnet min = 66.66 max = 66.84 avg = 66.76 efficientnet_b0 min = 104.00 max = 104.19 avg = 104.08 efficientnetv2_b0 min = 128.05 max = 128.39 avg = 128.21 regnety_400m min = 77.95 max = 78.03 avg = 78.00 blazeface min = 6.66 max = 6.77 avg = 6.70 googlenet min = 195.32 max = 195.75 avg = 195.52 googlenet_int8 min = 275.81 max = 276.25 avg = 275.98 resnet18 min = 160.94 max = 161.17 avg = 161.03 resnet18_int8 min = 223.88 max = 224.12 avg = 224.03 alexnet min = 120.96 max = 121.16 avg = 121.05 vgg16 
min = 852.50 max = 853.66 avg = 853.04 vgg16_int8 min = 1081.07 max = 1083.31 avg = 1082.18 resnet50 min = 497.54 max = 497.85 avg = 497.67 resnet50_int8 min = 681.79 max = 682.60 avg = 682.29 squeezenet_ssd min = 101.81 max = 102.49 avg = 102.13 squeezenet_ssd_int8 min = 147.77 max = 148.52 avg = 148.04 mobilenet_ssd min = 215.63 max = 216.07 avg = 215.91 mobilenet_ssd_int8 min = 305.65 max = 305.97 avg = 305.78 mobilenet_yolo min = 494.99 max = 495.41 avg = 495.16 mobilenetv2_yolov3 min = 233.51 max = 234.26 avg = 233.84 yolov4-tiny min = 287.26 max = 287.89 avg = 287.50 nanodet_m min = 70.48 max = 70.73 avg = 70.61 yolo-fastest-1.1 min = 27.32 max = 27.36 avg = 27.34 yolo-fastestv2 min = 23.51 max = 23.85 avg = 23.76 [0 Intel(R) UHD Graphics (JSL)] queueC=0[1] queueG=0[1] queueT=0[1] [0 Intel(R) UHD Graphics (JSL)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Intel(R) UHD Graphics (JSL)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Intel(R) UHD Graphics (JSL)] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 14.71 max = 15.37 avg = 14.90 mobilenet min = 15.38 max = 16.34 avg = 16.07 mobilenet_v2 min = 13.58 max = 14.52 avg = 14.23 mobilenet_v3 min = 14.95 max = 15.81 avg = 15.20 shufflenet min = 11.93 max = 12.73 avg = 12.31 shufflenet_v2 min = 14.47 max = 14.74 avg = 14.60 mnasnet min = 15.32 max = 17.13 avg = 15.95 proxylessnasnet min = 15.34 max = 16.25 avg = 15.66 efficientnet_b0 min = 26.02 max = 26.19 avg = 26.11 efficientnetv2_b0 min = 75.92 max = 76.18 avg = 76.07 regnety_400m min = 17.79 max = 18.00 avg = 17.91 blazeface min = 5.03 max = 5.96 avg = 5.65 googlenet min = 35.20 max = 35.40 avg = 35.32 resnet18 min = 35.49 max = 35.61 avg = 35.56 alexnet min = 40.93 max = 41.25 avg = 41.11 vgg16 min = 220.66 max = 222.18 avg = 221.42 resnet50 min = 78.10 max = 78.48 avg = 78.28 squeezenet_ssd min = 46.90 max = 47.46 avg = 47.26 mobilenet_ssd min = 33.33 max = 33.54 avg = 33.44 
mobilenet_yolo min = 67.54 max = 67.77 avg = 67.64 mobilenetv2_yolov3 min = 38.98 max = 39.69 avg = 39.37 yolov4-tiny min = 68.01 max = 69.74 avg = 68.86 nanodet_m min = 17.41 max = 18.13 avg = 17.78 yolo-fastest-1.1 min = 13.91 max = 14.18 avg = 14.03 yolo-fastestv2 min = 15.94 max = 16.02 avg = 15.97 ``` ### NVIDIA RTX2060 of Notebook ``` C:\Users\ai\AppData\Local\Temp\benchmark>benchncnn.exe 64 1 0 0 0 [0 GeForce RTX 2060] queueC=2[8] queueG=0[16] queueT=1[2] [0 GeForce RTX 2060] buglssc=0 bugihfa=0 [0 GeForce RTX 2060] fp16p=1 fp16s=1 fp16a=1 int8s=1 int8a=1 loop_count = 64 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 2.14 max = 2.93 avg = 2.26 mobilenet min = 2.08 max = 2.53 avg = 2.22 mobilenet_v2 min = 2.81 max = 4.03 avg = 3.05 mobilenet_v3 min = 2.90 max = 3.53 avg = 3.08 shufflenet min = 1.94 max = 4.27 avg = 2.55 shufflenet_v2 min = 2.34 max = 2.97 avg = 2.49 mnasnet min = 2.11 max = 2.86 avg = 2.37 proxylessnasnet min = 2.27 max = 3.25 avg = 2.49 googlenet min = 4.34 max = 6.79 avg = 5.25 resnet18 min = 2.60 max = 4.36 avg = 2.90 alexnet min = 2.79 max = 4.70 avg = 3.04 vgg16 min = 11.40 max = 14.32 avg = 12.42 resnet50 min = 5.26 max = 5.86 avg = 5.51 squeezenet_ssd min = 5.58 max = 7.94 avg = 6.56 mobilenet_ssd min = 3.47 max = 5.29 avg = 3.77 mobilenet_yolo min = 5.49 max = 6.19 avg = 5.70 mobilenetv2_yolov3 min = 3.69 max = 5.14 avg = 3.91 ``` ### NVIDIA RTX A3000 of Notebook (6GB) ``` cx@HP-ZBook-Fury-15-6-inch-G8-Mobile-Workstation-PC:~/ncnn/build/benchmark$ ./benchncnn 10 1 0 1 [0 Intel(R) UHD Graphics (TGL GT1)] queueC=0[1] queueG=0[1] queueT=0[1] [0 Intel(R) UHD Graphics (TGL GT1)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Intel(R) UHD Graphics (TGL GT1)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Intel(R) UHD Graphics (TGL GT1)] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [0 Intel(R) UHD Graphics (TGL GT1)] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 [1 NVIDIA RTX A3000 Laptop GPU] queueC=2[8] queueG=0[16] 
queueT=1[2] [1 NVIDIA RTX A3000 Laptop GPU] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 NVIDIA RTX A3000 Laptop GPU] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [1 NVIDIA RTX A3000 Laptop GPU] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [1 NVIDIA RTX A3000 Laptop GPU] fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 1 cooling_down = 1 squeezenet min = 1.49 max = 1.94 avg = 1.74 squeezenet_int8 min = 6.13 max = 6.20 avg = 6.16 mobilenet min = 4.05 max = 4.82 avg = 4.65 mobilenet_int8 min = 10.24 max = 10.29 avg = 10.26 mobilenet_v2 min = 0.98 max = 1.14 avg = 1.03 mobilenet_v3 min = 1.74 max = 1.82 avg = 1.77 shufflenet min = 1.43 max = 30.51 avg = 9.51 shufflenet_v2 min = 3.43 max = 3.89 avg = 3.77 mnasnet min = 6.50 max = 6.75 avg = 6.62 proxylessnasnet min = 6.46 max = 7.28 avg = 7.00 efficientnet_b0 min = 3.14 max = 15.11 avg = 7.29 efficientnetv2_b0 min = 18.50 max = 20.13 avg = 19.17 regnety_400m min = 2.16 max = 3.57 avg = 2.70 blazeface min = 2.52 max = 2.76 avg = 2.65 googlenet min = 2.67 max = 14.67 avg = 9.85 googlenet_int8 min = 19.08 max = 19.40 avg = 19.19 resnet18 min = 5.19 max = 9.44 avg = 8.48 resnet18_int8 min = 16.57 max = 17.69 avg = 16.96 alexnet min = 1.98 max = 3.24 avg = 2.23 vgg16 min = 3.59 max = 12.34 avg = 10.99 vgg16_int8 min = 110.63 max = 124.31 avg = 118.16 resnet50 min = 3.01 max = 4.93 avg = 3.77 resnet50_int8 min = 41.58 max = 44.80 avg = 43.24 squeezenet_ssd min = 4.08 max = 4.70 avg = 4.32 squeezenet_ssd_int8 min = 17.32 max = 17.92 avg = 17.46 mobilenet_ssd min = 2.26 max = 8.23 avg = 5.57 mobilenet_ssd_int8 min = 20.35 max = 21.89 avg = 20.76 mobilenet_yolo min = 2.14 max = 16.94 avg = 6.44 mobilenetv2_yolov3 min = 3.64 max = 5.09 avg = 4.02 yolov4-tiny min = 10.94 max = 17.46 avg = 13.58 nanodet_m min = 6.57 max = 13.91 avg = 9.82 yolo-fastest-1.1 min = 5.40 max = 14.22 avg = 10.78 yolo-fastestv2 min = 7.49 max = 9.43 avg = 7.99 vision_transformer min = 76.04 max = 76.96 avg = 76.43 
FastestDet min = 6.31 max = 6.60 avg = 6.43 ``` ### NVIDIA RTX2080 of Desktop ``` E:\projects\framework\ncnn\benchmark>benchncnn.exe 4096 1 0 0 0 [0 GeForce RTX 2080] queueC=2[8] queueG=0[16] queueT=1[2] [0 GeForce RTX 2080] buglssc=0 bugihfa=0 [0 GeForce RTX 2080] fp16p=1 fp16s=1 fp16a=1 int8s=1 int8a=1 loop_count = 4096 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 1.39 max = 16.70 avg = 1.49 mobilenet min = 1.32 max = 2.55 avg = 1.42 mobilenet_v2 min = 1.88 max = 5.02 avg = 2.00 mobilenet_v3 min = 2.31 max = 3.58 avg = 2.45 shufflenet min = 1.45 max = 2.65 avg = 1.55 shufflenet_v2 min = 1.90 max = 3.21 avg = 2.03 mnasnet min = 1.95 max = 3.17 avg = 2.09 proxylessnasnet min = 2.02 max = 2.95 avg = 2.16 googlenet min = 3.81 max = 5.91 avg = 4.05 resnet18 min = 2.10 max = 3.28 avg = 2.24 alexnet min = 2.15 max = 3.35 avg = 2.30 vgg16 min = 7.33 max = 11.12 avg = 7.80 resnet50 min = 4.21 max = 6.70 avg = 4.49 squeezenet_ssd min = 4.58 max = 6.86 avg = 4.88 mobilenet_ssd min = 2.90 max = 4.52 avg = 3.09 mobilenet_yolo min = 4.15 max = 6.09 avg = 4.40 mobilenetv2_yolov3 min = 3.04 max = 9.13 avg = 3.28 ``` ### NVIDIA Jetson AGX Xavier (Carmel 2.2 GHz x 8 + Volta Tensor Cores 64) ``` i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 32 1 0 -1 0 loop_count = 32 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 22.31 max = 23.29 avg = 22.68 squeezenet_int8 min = 47.64 max = 52.88 avg = 49.72 mobilenet min = 37.50 max = 38.45 avg = 37.85 mobilenet_int8 min = 89.14 max = 92.38 avg = 90.95 mobilenet_v2 min = 24.31 max = 25.53 avg = 24.68 mobilenet_v3 min = 20.20 max = 21.21 avg = 20.56 shufflenet min = 14.85 max = 15.64 avg = 15.15 shufflenet_v2 min = 14.34 max = 16.11 avg = 14.86 mnasnet min = 23.42 max = 23.86 avg = 23.56 proxylessnasnet min = 27.44 max = 28.83 avg = 27.83 efficientnet_b0 min = 34.57 max = 37.84 avg = 35.13 efficientnetv2_b0 min = 65.16 max = 68.67 avg = 66.76 regnety_400m min = 33.86 max = 
34.49 avg = 34.17 blazeface min = 11.86 max = 14.15 avg = 12.52 googlenet min = 83.19 max = 89.84 avg = 85.14 googlenet_int8 min = 146.74 max = 155.25 avg = 151.14 resnet18 min = 50.46 max = 57.80 avg = 53.40 resnet18_int8 min = 108.43 max = 116.14 avg = 110.78 alexnet min = 56.59 max = 64.93 avg = 59.51 vgg16 min = 266.78 max = 272.16 avg = 269.14 vgg16_int8 min = 538.71 max = 551.55 avg = 544.78 resnet50 min = 169.11 max = 172.26 avg = 170.51 resnet50_int8 min = 370.55 max = 384.36 avg = 377.75 squeezenet_ssd min = 58.51 max = 67.88 avg = 62.78 squeezenet_ssd_int8 min = 95.34 max = 106.49 avg = 97.99 mobilenet_ssd min = 83.52 max = 86.84 avg = 84.86 mobilenet_ssd_int8 min = 172.70 max = 181.84 avg = 176.25 mobilenet_yolo min = 165.26 max = 167.74 avg = 166.51 mobilenetv2_yolov3 min = 88.11 max = 90.29 avg = 89.19 yolov4-tiny min = 105.44 max = 109.24 avg = 107.07 nanodet_m min = 33.60 max = 37.02 avg = 34.39 yolo-fastest-1.1 min = 13.56 max = 14.22 avg = 13.75 yolo-fastestv2 min = 13.76 max = 14.59 avg = 14.02 i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 32 2 0 -1 0 loop_count = 32 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 13.05 max = 13.76 avg = 13.36 squeezenet_int8 min = 26.08 max = 28.09 avg = 26.69 mobilenet min = 20.61 max = 21.21 avg = 20.81 mobilenet_int8 min = 44.72 max = 47.33 avg = 45.76 mobilenet_v2 min = 14.67 max = 15.23 avg = 14.86 mobilenet_v3 min = 12.59 max = 15.50 avg = 13.36 shufflenet min = 12.74 max = 14.14 avg = 13.31 shufflenet_v2 min = 10.05 max = 10.89 avg = 10.40 mnasnet min = 14.02 max = 14.75 avg = 14.19 proxylessnasnet min = 16.05 max = 16.94 avg = 16.31 efficientnet_b0 min = 20.47 max = 23.05 avg = 20.81 efficientnetv2_b0 min = 37.51 max = 41.53 avg = 39.19 regnety_400m min = 25.21 max = 25.73 avg = 25.39 blazeface min = 7.30 max = 8.44 avg = 7.43 googlenet min = 42.52 max = 47.38 avg = 44.39 googlenet_int8 min = 76.38 max = 81.63 avg = 77.93 resnet18 min = 26.76 max = 28.72 avg = 27.22 
resnet18_int8 min = 55.97 max = 61.57 avg = 57.26 alexnet min = 29.29 max = 33.20 avg = 31.03 vgg16 min = 134.32 max = 138.65 avg = 136.05 vgg16_int8 min = 267.70 max = 281.71 avg = 272.79 resnet50 min = 87.22 max = 88.75 avg = 87.65 resnet50_int8 min = 183.80 max = 192.17 avg = 187.25 squeezenet_ssd min = 35.80 max = 39.00 avg = 37.32 squeezenet_ssd_int8 min = 53.56 max = 60.43 avg = 55.58 mobilenet_ssd min = 44.17 max = 48.30 avg = 44.70 mobilenet_ssd_int8 min = 90.32 max = 94.09 avg = 92.27 mobilenet_yolo min = 87.50 max = 89.63 avg = 88.33 mobilenetv2_yolov3 min = 49.76 max = 51.58 avg = 50.44 yolov4-tiny min = 61.17 max = 64.41 avg = 62.15 nanodet_m min = 21.43 max = 22.47 avg = 21.82 yolo-fastest-1.1 min = 10.90 max = 12.63 avg = 11.12 yolo-fastestv2 min = 10.61 max = 11.11 avg = 10.82 i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 32 4 0 -1 0 loop_count = 32 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 8.06 max = 8.79 avg = 8.39 squeezenet_int8 min = 14.96 max = 16.64 avg = 15.37 mobilenet min = 11.24 max = 11.91 avg = 11.48 mobilenet_int8 min = 23.63 max = 24.75 avg = 23.81 mobilenet_v2 min = 9.27 max = 9.97 avg = 9.44 mobilenet_v3 min = 8.81 max = 10.06 avg = 9.07 shufflenet min = 11.22 max = 11.53 avg = 11.37 shufflenet_v2 min = 7.81 max = 8.17 avg = 7.97 mnasnet min = 9.40 max = 10.49 avg = 10.06 proxylessnasnet min = 10.53 max = 10.73 avg = 10.62 efficientnet_b0 min = 13.55 max = 15.14 avg = 13.80 efficientnetv2_b0 min = 19.83 max = 21.95 avg = 21.09 regnety_400m min = 21.80 max = 22.91 avg = 22.13 blazeface min = 5.17 max = 6.27 avg = 5.31 googlenet min = 22.67 max = 25.35 avg = 23.10 googlenet_int8 min = 43.19 max = 45.68 avg = 43.72 resnet18 min = 15.19 max = 16.14 avg = 15.42 resnet18_int8 min = 31.22 max = 34.76 avg = 31.81 alexnet min = 15.20 max = 17.65 avg = 15.56 vgg16 min = 70.76 max = 73.21 avg = 71.70 vgg16_int8 min = 137.94 max = 143.50 avg = 139.54 resnet50 min = 47.15 max = 47.91 avg = 47.40 
resnet50_int8 min = 99.80 max = 102.94 avg = 100.29 squeezenet_ssd min = 22.10 max = 24.11 avg = 22.46 squeezenet_ssd_int8 min = 33.21 max = 35.98 avg = 33.98 mobilenet_ssd min = 25.09 max = 26.81 avg = 25.50 mobilenet_ssd_int8 min = 48.15 max = 50.96 avg = 49.49 mobilenet_yolo min = 48.63 max = 49.02 avg = 48.84 mobilenetv2_yolov3 min = 30.93 max = 31.41 avg = 31.13 yolov4-tiny min = 38.43 max = 41.20 avg = 39.28 nanodet_m min = 14.95 max = 15.74 avg = 15.35 yolo-fastest-1.1 min = 8.89 max = 9.18 avg = 9.01 yolo-fastestv2 min = 8.36 max = 9.28 avg = 8.50 i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 32 8 0 -1 0 loop_count = 32 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 6.52 max = 74.10 avg = 12.94 squeezenet_int8 min = 10.44 max = 18.81 avg = 12.15 mobilenet min = 7.49 max = 14.63 avg = 8.67 mobilenet_int8 min = 13.80 max = 15.89 avg = 14.53 mobilenet_v2 min = 8.15 max = 11.42 avg = 8.78 mobilenet_v3 min = 7.60 max = 10.92 avg = 8.38 shufflenet min = 11.51 max = 19.48 avg = 12.97 shufflenet_v2 min = 7.06 max = 15.58 avg = 9.48 mnasnet min = 7.77 max = 15.12 avg = 8.68 proxylessnasnet min = 8.54 max = 42.73 avg = 10.00 efficientnet_b0 min = 11.11 max = 12.86 avg = 11.89 efficientnetv2_b0 min = 17.17 max = 29.03 avg = 20.48 regnety_400m min = 22.41 max = 36.72 avg = 25.49 blazeface min = 4.93 max = 11.62 avg = 6.13 googlenet min = 17.02 max = 31.61 avg = 19.92 googlenet_int8 min = 27.70 max = 35.49 avg = 29.18 resnet18 min = 9.74 max = 18.78 avg = 11.40 resnet18_int8 min = 18.52 max = 24.70 avg = 19.32 alexnet min = 10.70 max = 15.41 avg = 11.39 vgg16 min = 40.80 max = 54.47 avg = 42.72 vgg16_int8 min = 74.71 max = 79.66 avg = 76.37 resnet50 min = 28.21 max = 36.62 avg = 29.41 resnet50_int8 min = 54.53 max = 76.02 avg = 56.81 squeezenet_ssd min = 19.01 max = 30.68 avg = 24.89 squeezenet_ssd_int8 min = 27.61 max = 35.87 avg = 29.22 mobilenet_ssd min = 17.35 max = 22.87 avg = 18.55 mobilenet_ssd_int8 min = 29.92 max = 36.35 avg = 
31.15 mobilenet_yolo min = 31.63 max = 55.61 avg = 34.31 mobilenetv2_yolov3 min = 23.75 max = 35.45 avg = 25.68 yolov4-tiny min = 29.23 max = 70.12 avg = 31.94 nanodet_m min = 13.00 max = 21.72 avg = 15.39 yolo-fastest-1.1 min = 9.72 max = 17.94 avg = 11.45 yolo-fastestv2 min = 9.16 max = 16.35 avg = 11.08 i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 128 1 0 0 0 [0 NVIDIA Tegra Xavier (nvgpu)] queueC=2[8] queueG=0[16] queueT=1[1] [0 NVIDIA Tegra Xavier (nvgpu)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA Tegra Xavier (nvgpu)] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA Tegra Xavier (nvgpu)] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 128 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 4.85 max = 19.65 avg = 6.83 squeezenet_int8 min = 46.38 max = 49.70 avg = 47.22 mobilenet min = 5.62 max = 6.61 avg = 6.33 mobilenet_int8 min = 87.42 max = 92.95 avg = 90.52 mobilenet_v2 min = 5.96 max = 7.53 avg = 6.50 mobilenet_v3 min = 6.77 max = 7.83 avg = 7.01 shufflenet min = 10.58 max = 18.46 avg = 13.68 shufflenet_v2 min = 20.06 max = 21.09 avg = 20.37 mnasnet min = 6.49 max = 26.49 avg = 8.26 proxylessnasnet min = 6.75 max = 27.37 avg = 7.88 efficientnet_b0 min = 12.11 max = 48.35 avg = 14.63 efficientnetv2_b0 min = 24.61 max = 69.68 avg = 34.33 regnety_400m min = 9.02 max = 34.40 avg = 10.84 blazeface min = 7.55 max = 8.10 avg = 7.78 googlenet min = 12.57 max = 65.14 avg = 18.91 googlenet_int8 min = 145.74 max = 155.87 avg = 151.06 resnet18 min = 8.88 max = 30.48 avg = 9.34 resnet18_int8 min = 109.19 max = 116.78 avg = 111.52 alexnet min = 9.06 max = 54.53 avg = 19.04 vgg16 min = 18.12 max = 37.31 avg = 19.65 vgg16_int8 min = 530.60 max = 551.58 avg = 542.33 resnet50 min = 11.62 max = 20.64 avg = 12.17 resnet50_int8 min = 374.83 max = 384.79 avg = 379.50 squeezenet_ssd min = 14.01 max = 55.88 avg = 23.64 squeezenet_ssd_int8 min = 89.86 max = 95.80 avg = 92.18 mobilenet_ssd min = 13.20 max = 13.61 avg = 13.37 
mobilenet_ssd_int8 min = 170.17 max = 181.48 avg = 174.93 mobilenet_yolo min = 11.78 max = 20.42 avg = 13.34 mobilenetv2_yolov3 min = 18.08 max = 62.94 avg = 26.70 yolov4-tiny min = 26.44 max = 34.83 avg = 31.83 nanodet_m min = 7.93 max = 9.91 avg = 9.01 yolo-fastest-1.1 min = 6.03 max = 20.85 avg = 8.42 yolo-fastestv2 min = 9.01 max = 20.60 avg = 12.51 ``` ### MacBook Pro (13-inch, M1, 2020) ``` MacBook-Pro benchmark % ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 4.80 max = 5.05 avg = 4.86 squeezenet_int8 min = 4.02 max = 4.13 avg = 4.04 mobilenet min = 9.09 max = 9.41 avg = 9.22 mobilenet_int8 min = 4.65 max = 4.76 avg = 4.70 mobilenet_v2 min = 5.64 max = 5.83 avg = 5.73 mobilenet_v3 min = 4.64 max = 4.85 avg = 4.76 shufflenet min = 3.48 max = 3.63 avg = 3.56 shufflenet_v2 min = 3.69 max = 3.81 avg = 3.73 mnasnet min = 5.67 max = 5.94 avg = 5.77 proxylessnasnet min = 7.03 max = 7.28 avg = 7.20 efficientnet_b0 min = 9.13 max = 9.53 avg = 9.28 efficientnetv2_b0 min = 17.37 max = 18.47 avg = 17.63 regnety_400m min = 7.64 max = 8.08 avg = 7.72 blazeface min = 1.80 max = 1.89 avg = 1.83 googlenet min = 25.71 max = 25.90 avg = 25.81 googlenet_int8 min = 16.89 max = 17.10 avg = 16.97 resnet18 min = 17.16 max = 17.28 avg = 17.20 resnet18_int8 min = 15.55 max = 15.75 avg = 15.64 alexnet min = 30.60 max = 31.11 avg = 30.69 vgg16 min = 73.41 max = 75.37 avg = 73.91 vgg16_int8 min = 103.81 max = 105.15 avg = 104.19 resnet50 min = 43.47 max = 44.24 avg = 43.68 resnet50_int8 min = 30.37 max = 35.25 avg = 31.61 squeezenet_ssd min = 20.97 max = 21.21 avg = 21.12 squeezenet_ssd_int8 min = 19.34 max = 19.54 avg = 19.42 mobilenet_ssd min = 22.18 max = 22.58 avg = 22.28 mobilenet_ssd_int8 min = 13.27 max = 15.31 avg = 14.05 mobilenet_yolo min = 40.78 max = 41.04 avg = 40.89 mobilenetv2_yolov3 min = 20.87 max = 21.92 avg = 21.02 yolov4-tiny min = 30.73 max = 32.37 avg = 31.29 nanodet_m min = 8.54 max = 8.86 
avg = 8.65 MacBook-Pro benchmark % ./benchncnn 10 8 0 0 0 [0 Apple M1] queueC=0[1] queueG=0[1] queueT=0[1] [0 Apple M1] bugsbn1=0 bugbilz=151 bugcopc=0 bugihfa=0 [0 Apple M1] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Apple M1] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 10 num_threads = 8 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 1.86 max = 2.22 avg = 2.01 squeezenet_int8 min = 2.38 max = 8.40 avg = 5.13 mobilenet min = 2.50 max = 2.91 avg = 2.64 mobilenet_int8 min = 2.29 max = 5.26 avg = 3.54 mobilenet_v2 min = 2.93 max = 3.12 avg = 2.98 mobilenet_v3 min = 3.36 max = 3.61 avg = 3.48 shufflenet min = 1.99 max = 2.54 avg = 2.18 shufflenet_v2 min = 2.35 max = 2.84 avg = 2.52 mnasnet min = 2.81 max = 3.33 avg = 2.92 proxylessnasnet min = 3.21 max = 3.62 avg = 3.36 efficientnet_b0 min = 4.74 max = 5.73 avg = 5.07 efficientnetv2_b0 min = 12.04 max = 13.04 avg = 12.61 regnety_400m min = 3.86 max = 4.04 avg = 3.98 blazeface min = 0.98 max = 1.11 avg = 1.03 googlenet min = 4.86 max = 5.38 avg = 5.02 googlenet_int8 min = 9.43 max = 15.72 avg = 10.44 resnet18 min = 3.92 max = 4.59 avg = 4.24 resnet18_int8 min = 6.83 max = 7.57 avg = 7.35 alexnet min = 7.49 max = 7.87 avg = 7.65 vgg16 min = 34.10 max = 35.29 avg = 34.60 vgg16_int8 min = 40.09 max = 44.66 avg = 41.95 resnet50 min = 7.22 max = 7.83 avg = 7.42 resnet50_int8 min = 14.52 max = 20.56 avg = 15.78 squeezenet_ssd min = 8.52 max = 13.79 avg = 9.98 squeezenet_ssd_int8 min = 12.38 max = 15.44 avg = 13.37 mobilenet_ssd min = 4.83 max = 6.00 avg = 5.31 mobilenet_ssd_int8 min = 7.26 max = 13.12 avg = 9.01 mobilenet_yolo min = 7.22 max = 8.66 avg = 7.99 mobilenetv2_yolov3 min = 7.46 max = 8.06 avg = 7.80 yolov4-tiny min = 12.17 max = 13.95 avg = 12.82 nanodet_m min = 3.54 max = 4.78 avg = 3.86 ``` ### MacBook Air (13-inch, M3, 2024) ``` MacBook-Air benchmark % ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 3.59 max = 4.20 
avg = 3.80 squeezenet_int8 min = 2.61 max = 2.82 avg = 2.74 mobilenet min = 6.67 max = 6.92 avg = 6.85 mobilenet_int8 min = 3.61 max = 3.66 avg = 3.62 mobilenet_v2 min = 4.08 max = 4.15 avg = 4.10 mobilenet_v3 min = 3.32 max = 3.44 avg = 3.34 shufflenet min = 2.08 max = 2.13 avg = 2.10 shufflenet_v2 min = 2.35 max = 2.44 avg = 2.37 mnasnet min = 4.14 max = 4.23 avg = 4.18 proxylessnasnet min = 5.09 max = 5.15 avg = 5.11 efficientnet_b0 min = 6.67 max = 6.75 avg = 6.70 efficientnetv2_b0 min = 8.79 max = 8.83 avg = 8.81 regnety_400m min = 5.68 max = 5.73 avg = 5.69 blazeface min = 0.75 max = 0.77 avg = 0.76 googlenet min = 15.94 max = 15.97 avg = 15.96 googlenet_int8 min = 10.88 max = 10.92 avg = 10.89 resnet18 min = 12.60 max = 12.63 avg = 12.61 resnet18_int8 min = 9.88 max = 9.95 avg = 9.90 alexnet min = 12.72 max = 12.82 avg = 12.77 vgg16 min = 57.85 max = 61.44 avg = 58.40 vgg16_int8 min = 78.53 max = 79.85 avg = 78.83 resnet50 min = 34.79 max = 34.85 avg = 34.81 resnet50_int8 min = 20.56 max = 20.62 avg = 20.58 squeezenet_ssd min = 9.64 max = 9.82 avg = 9.69 squeezenet_ssd_int8 min = 8.21 max = 8.34 avg = 8.25 mobilenet_ssd min = 14.21 max = 14.34 avg = 14.25 mobilenet_ssd_int8 min = 7.35 max = 7.41 avg = 7.37 mobilenet_yolo min = 31.61 max = 31.74 avg = 31.64 mobilenetv2_yolov3 min = 15.79 max = 15.87 avg = 15.83 yolov4-tiny min = 22.93 max = 22.99 avg = 22.96 nanodet_m min = 5.58 max = 5.62 avg = 5.59 yolo-fastest-1.1 min = 2.00 max = 2.05 avg = 2.01 yolo-fastestv2 min = 1.75 max = 1.77 avg = 1.76 vision_transformer min = 1020.57 max = 1046.02 avg = 1028.75 FastestDet min = 1.88 max = 1.93 avg = 1.89 MacBook-Air benchmark % ./benchncnn 10 8 0 0 0 [0 Apple M3] queueC=0[1] queueG=0[1] queueT=0[1] [0 Apple M3] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Apple M3] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1 [0 Apple M3] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [0 Apple M3] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 loop_count = 10 num_threads = 8 powersave = 0 
gpu_device = 0 cooling_down = 0 squeezenet min = 1.79 max = 2.48 avg = 2.16 squeezenet_int8 min = 2.78 max = 2.93 avg = 2.80 mobilenet min = 1.40 max = 1.85 avg = 1.68 mobilenet_int8 min = 3.60 max = 3.67 avg = 3.61 mobilenet_v2 min = 1.68 max = 2.28 avg = 1.97 mobilenet_v3 min = 1.71 max = 2.29 avg = 2.00 shufflenet min = 1.18 max = 2.49 avg = 1.78 shufflenet_v2 min = 1.45 max = 2.09 avg = 1.70 mnasnet min = 1.74 max = 2.25 avg = 2.05 proxylessnasnet min = 1.75 max = 2.18 avg = 2.02 efficientnet_b0 min = 2.71 max = 3.19 avg = 2.99 efficientnetv2_b0 min = 6.77 max = 7.04 avg = 6.88 regnety_400m min = 1.94 max = 2.40 avg = 2.10 blazeface min = 1.05 max = 1.43 avg = 1.24 googlenet min = 3.99 max = 4.42 avg = 4.27 googlenet_int8 min = 10.83 max = 10.86 avg = 10.85 resnet18 min = 2.50 max = 2.77 avg = 2.70 resnet18_int8 min = 9.86 max = 9.91 avg = 9.88 alexnet min = 2.99 max = 3.28 avg = 3.11 vgg16 min = 12.41 max = 13.13 avg = 12.54 vgg16_int8 min = 78.52 max = 78.67 avg = 78.61 resnet50 min = 5.46 max = 5.52 avg = 5.49 resnet50_int8 min = 20.57 max = 20.59 avg = 20.58 squeezenet_ssd min = 3.86 max = 4.53 avg = 4.17 squeezenet_ssd_int8 min = 8.20 max = 8.35 avg = 8.25 mobilenet_ssd min = 3.19 max = 3.75 avg = 3.52 mobilenet_ssd_int8 min = 7.35 max = 7.41 avg = 7.37 mobilenet_yolo min = 4.77 max = 4.88 avg = 4.81 mobilenetv2_yolov3 min = 4.28 max = 4.88 avg = 4.62 yolov4-tiny min = 6.76 max = 7.38 avg = 7.21 nanodet_m min = 2.92 max = 4.71 avg = 3.46 yolo-fastest-1.1 min = 1.48 max = 2.04 avg = 1.87 yolo-fastestv2 min = 1.41 max = 1.97 avg = 1.74 vision_transformer min = 80.34 max = 80.66 avg = 80.44 FastestDet min = 1.43 max = 2.04 avg = 1.73 ``` ### Ingenic T40XP XBurst2 Core X2 1.4GHz (without MSA) ``` loop_count = 8 num_threads = 2 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 921.23 max = 944.03 avg = 930.71 squeezenet_int8 min = 3280.89 max = 3404.83 avg = 3359.68 mobilenet min = 1277.61 max = 1298.51 avg = 1284.38 mobilenet_int8 min = 4342.67 
max = 4350.21 avg = 4345.85 mobilenet_v2 min = 780.92 max = 783.93 avg = 782.79 mobilenet_v3 min = 650.59 max = 655.08 avg = 652.06 shufflenet min = 352.75 max = 353.69 avg = 353.24 shufflenet_v2 min = 362.82 max = 364.08 avg = 363.38 mnasnet min = 790.45 max = 791.89 avg = 790.99 proxylessnasnet min = 868.71 max = 870.47 avg = 869.17 efficientnet_b0 min = 1491.44 max = 1492.36 avg = 1491.95 efficientnetv2_b0 min = 2135.04 max = 2148.02 avg = 2139.99 regnety_400m min = 1000.53 max = 1005.29 avg = 1001.81 blazeface min = 102.72 max = 104.18 avg = 103.51 googlenet min = 3652.89 max = 3705.40 avg = 3675.43 googlenet_int8 min = 8067.30 max = 8070.22 avg = 8069.21 ``` ### MacBook Pro (15-inch, 2019) - 2.6GHz six cores Intel Core i7 && Radeon Pro 555X 4GB && Intel UHD Graphics 630 1536MB ``` ➜ benchmark git:(master) ✗ ./benchncnn 10 1 0 -1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 14.68 max = 17.06 avg = 15.55 squeezenet_int8 min = 51.64 max = 57.85 avg = 54.01 mobilenet min = 20.74 max = 25.38 avg = 22.77 mobilenet_int8 min = 66.84 max = 91.01 avg = 75.69 mobilenet_v2 min = 14.04 max = 20.06 avg = 16.36 mobilenet_v3 min = 11.89 max = 16.22 avg = 13.58 shufflenet min = 13.74 max = 17.10 avg = 15.02 shufflenet_v2 min = 12.73 max = 14.36 avg = 13.53 mnasnet min = 11.05 max = 17.79 avg = 13.82 proxylessnasnet min = 12.60 max = 27.38 avg = 17.55 efficientnet_b0 min = 23.73 max = 26.82 avg = 25.45 efficientnetv2_b0 min = 27.03 max = 33.89 avg = 30.78 regnety_400m min = 13.81 max = 21.50 avg = 15.40 blazeface min = 3.72 max = 4.98 avg = 4.43 googlenet min = 65.88 max = 76.62 avg = 69.40 googlenet_int8 min = 192.07 max = 227.85 avg = 203.81 resnet18 min = 79.45 max = 90.41 avg = 85.32 resnet18_int8 min = 201.71 max = 222.31 avg = 207.39 alexnet min = 70.67 max = 80.13 avg = 74.43 vgg16 min = 233.74 max = 261.62 avg = 250.99 vgg16_int8 min = 1722.78 max = 1997.14 avg = 1772.71 resnet50 min = 130.39 max = 135.31 avg = 133.27 
resnet50_int8 min = 439.69 max = 483.78 avg = 461.33 squeezenet_ssd min = 108.54 max = 122.15 avg = 115.02 squeezenet_ssd_int8 min = 175.58 max = 185.09 avg = 181.33 mobilenet_ssd min = 51.89 max = 59.32 avg = 54.30 mobilenet_ssd_int8 min = 140.15 max = 192.10 avg = 164.47 mobilenet_yolo min = 117.37 max = 131.89 avg = 126.34 mobilenetv2_yolov3 min = 57.57 max = 72.29 avg = 64.92 yolov4-tiny min = 114.45 max = 123.15 avg = 116.91 nanodet_m min = 25.65 max = 33.27 avg = 28.75 ➜ benchmark git:(master) ✗ ./benchncnn 10 1 0 0 [0 AMD Radeon Pro 555X] queueC=0[1] queueG=0[1] queueT=0[1] [0 AMD Radeon Pro 555X] bugsbn1=0 bugbilz=196 bugcopc=0 bugihfa=0 [0 AMD Radeon Pro 555X] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 AMD Radeon Pro 555X] subgroup=64 basic=0 vote=0 ballot=0 shuffle=0 [1 Intel(R) UHD Graphics 630] queueC=0[1] queueG=0[1] queueT=0[1] [1 Intel(R) UHD Graphics 630] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 Intel(R) UHD Graphics 630] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [1 Intel(R) UHD Graphics 630] subgroup=32 basic=0 vote=0 ballot=0 shuffle=0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 6.66 max = 7.30 avg = 6.91 squeezenet_int8 min = 49.97 max = 60.92 avg = 53.86 mobilenet min = 6.99 max = 7.48 avg = 7.17 mobilenet_int8 min = 70.46 max = 83.20 avg = 79.33 mobilenet_v2 min = 9.56 max = 10.87 avg = 10.34 mobilenet_v3 min = 11.48 max = 12.20 avg = 11.94 shufflenet min = 4.52 max = 5.25 avg = 4.96 shufflenet_v2 min = 7.29 max = 9.65 avg = 7.99 mnasnet min = 9.82 max = 11.88 avg = 10.62 proxylessnasnet min = 7.85 max = 8.41 avg = 8.07 efficientnet_b0 min = 17.34 max = 17.85 avg = 17.56 efficientnetv2_b0 min = 21.95 max = 24.10 avg = 23.15 regnety_400m min = 13.54 max = 14.83 avg = 14.11 blazeface min = 3.26 max = 6.59 avg = 5.50 googlenet min = 17.62 max = 19.47 avg = 18.27 googlenet_int8 min = 198.88 max = 247.97 avg = 223.31 resnet18 min = 11.10 max = 12.01 avg = 11.59 resnet18_int8 min = 225.56 max = 259.39 avg = 
238.97 alexnet min = 17.66 max = 19.19 avg = 18.24 vgg16 min = 53.20 max = 54.88 avg = 53.73 vgg16_int8 min = 1747.52 max = 2130.08 avg = 1880.42 resnet50 min = 27.38 max = 28.84 avg = 28.34 resnet50_int8 min = 461.86 max = 579.83 avg = 528.15 squeezenet_ssd min = 19.99 max = 20.98 avg = 20.50 squeezenet_ssd_int8 min = 185.20 max = 209.66 avg = 196.81 mobilenet_ssd min = 12.81 max = 14.21 avg = 13.48 mobilenet_ssd_int8 min = 139.29 max = 168.38 avg = 148.20 mobilenet_yolo min = 19.50 max = 20.51 avg = 19.97 mobilenetv2_yolov3 min = 15.95 max = 19.28 avg = 16.85 yolov4-tiny min = 21.43 max = 23.42 avg = 22.28 nanodet_m min = 7.95 max = 9.23 avg = 8.48 ➜ benchmark git:(master) ✗ ./benchncnn 10 1 0 1 [0 AMD Radeon Pro 555X] queueC=0[1] queueG=0[1] queueT=0[1] [0 AMD Radeon Pro 555X] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 AMD Radeon Pro 555X] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 AMD Radeon Pro 555X] subgroup=64 basic=0 vote=0 ballot=0 shuffle=0 [1 Intel(R) UHD Graphics 630] queueC=0[1] queueG=0[1] queueT=0[1] [1 Intel(R) UHD Graphics 630] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 Intel(R) UHD Graphics 630] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [1 Intel(R) UHD Graphics 630] subgroup=32 basic=0 vote=0 ballot=0 shuffle=0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 1 cooling_down = 1 squeezenet min = 11.06 max = 13.22 avg = 12.09 squeezenet_int8 min = 54.87 max = 64.55 avg = 59.84 mobilenet min = 13.65 max = 16.70 avg = 14.81 mobilenet_int8 min = 72.36 max = 93.58 avg = 86.40 mobilenet_v2 min = 11.88 max = 15.90 avg = 13.47 mobilenet_v3 min = 12.68 max = 16.16 avg = 14.56 shufflenet min = 13.87 max = 16.68 avg = 14.93 shufflenet_v2 min = 11.73 max = 13.65 avg = 12.87 mnasnet min = 12.71 max = 15.56 avg = 14.22 proxylessnasnet min = 14.03 max = 17.28 avg = 15.37 efficientnet_b0 min = 17.50 max = 21.46 avg = 19.30 efficientnetv2_b0 min = 35.47 max = 38.58 avg = 36.89 regnety_400m min = 16.00 max = 19.45 avg = 17.48 blazeface min = 6.08 max = 7.18 avg = 6.39 
googlenet min = 23.35 max = 29.68 avg = 25.77 googlenet_int8 min = 198.49 max = 254.38 avg = 222.77 resnet18 min = 21.85 max = 28.10 avg = 24.70 resnet18_int8 min = 211.21 max = 279.55 avg = 222.64 alexnet min = 24.45 max = 30.47 avg = 26.87 vgg16 min = 115.20 max = 117.76 avg = 116.48 vgg16_int8 min = 1715.92 max = 1960.02 avg = 1800.21 resnet50 min = 45.65 max = 46.25 avg = 46.05 resnet50_int8 min = 448.13 max = 555.53 avg = 485.47 squeezenet_ssd min = 28.43 max = 33.26 avg = 29.85 squeezenet_ssd_int8 min = 180.91 max = 202.51 avg = 190.84 mobilenet_ssd min = 21.03 max = 26.93 avg = 23.48 mobilenet_ssd_int8 min = 154.41 max = 184.64 avg = 165.04 mobilenet_yolo min = 37.04 max = 38.64 avg = 37.52 mobilenetv2_yolov3 min = 24.98 max = 30.03 avg = 27.70 yolov4-tiny min = 39.29 max = 50.25 avg = 44.18 nanodet_m min = 15.97 max = 20.27 avg = 17.93 ``` ### Sunway SW421 (sw_64 1.7GHz * 4) ``` root@SW421:~/Desktop/ncnn-20220420/ncnn-20220420/build/benchmark$ ./benchncnn loop_count = 4 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 943.61 max = 966.98 avg = 955.24 squeezenet_int8 min = 654.75 max = 731.28 avg = 674.87 mobilenet min = 1584.87 max = 1612.88 avg = 1597.47 mobilenet_int8 min = 1198.21 max = 1204.82 avg = 1201.61 mobilenet_v2 min = 733.94 max = 754.79 avg = 744.48 mobilenet_v3 min = 665.26 max = 683.81 avg = 675.18 shufflenet min = 401.53 max = 435.21 avg = 420.32 shufflenet_v2 min = 294.65 max = 316.50 avg = 309.08 mnasnet min = 671.22 max = 808.46 avg = 713.01 proxylessnasnet min = 686.12 max = 698.13 avg = 692.29 efficientnet_b0 min = 1151.75 max = 1184.86 avg = 1161.33 efficientnetv2_b0 min = 1372.05 max = 1395.22 avg = 1379.47 regnety_400m min = 933.93 max = 949.42 avg = 942.43 blazeface min = 104.72 max = 136.77 avg = 112.86 googlenet min = 2574.02 max = 4330.81 avg = 3015.56 googlenet_int8 min = 2136.42 max = 2183.61 avg = 2166.45 resnet18 min = 2511.12 max = 2537.42 avg = 2526.08 resnet18_int8 min = 2003.84 max = 2027.50 
avg = 2012.48 alexnet min = 668.28 max = 686.35 avg = 673.95 vgg16 min = 24863.92 max = 24967.94 avg = 24907.39 vgg16_int8 min = 18735.54 max = 18926.83 avg = 18859.32 resnet50 min = 9896.47 max = 9981.13 avg = 9929.77 resnet50_int8 min = 6971.01 max = 7085.29 avg = 7017.88 squeezenet_ssd min = 1798.23 max = 1814.25 avg = 1806.57 squeezenet_ssd_int8 min = 1586.11 max = 1606.83 avg = 1596.75 mobilenet_ssd min = 3995.54 max = 4018.27 avg = 4002.78 mobilenet_ssd_int8 min = 2753.65 max = 2766.06 avg = 2760.04 mobilenet_yolo min = 10892.22 max = 10978.84 avg = 10921.00 mobilenetv2_yolov3 min = 3600.80 max = 3607.72 avg = 3603.18 yolov4-tiny min = 5565.82 max = 5582.22 avg = 5571.78 nanodet_m min = 1182.97 max = 1220.47 avg = 1199.30 yolo-fastest-1.1 min = 340.63 max = 360.95 avg = 349.15 yolo-fastestv2 min = 255.47 max = 281.79 avg = 268.82 ``` ### Sunway SW831 (sw_64 2.5GHz * 8) ``` root@SW831:~/Desktop/ncnn_20221128/build/benchmark$ ./benchncnn 5 8 2 -1 0 loop_count = 5 num_threads = 8 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 343.27 max = 420.86 avg = 364.97 squeezenet_int8 min = 237.91 max = 251.71 avg = 243.84 mobilenet min = 607.80 max = 696.04 avg = 646.61 mobilenet_int8 min = 428.37 max = 499.32 avg = 460.21 mobilenet_v2 min = 291.29 max = 381.93 avg = 311.76 mobilenet_v3 min = 262.01 max = 287.93 avg = 277.29 shufflenet min = 144.89 max = 169.10 avg = 150.84 shufflenet_v2 min = 121.44 max = 139.62 avg = 126.96 mnasnet min = 265.59 max = 353.84 avg = 288.79 proxylessnasnet min = 272.08 max = 293.19 avg = 284.61 efficientnet_b0 min = 445.40 max = 508.36 avg = 467.84 efficientnetv2_b0 min = 550.57 max = 619.16 avg = 581.85 regnety_400m min = 374.02 max = 460.64 avg = 394.49 blazeface min = 39.93 max = 59.19 avg = 44.14 googlenet min = 941.35 max = 1014.23 avg = 976.37 googlenet_int8 min = 770.66 max = 827.44 avg = 797.93 resnet18 min = 815.02 max = 895.13 avg = 843.57 resnet18_int8 min = 701.10 max = 776.40 avg = 729.49 alexnet min = 216.74 
max = 273.39 avg = 228.99 vgg16 min = 8645.55 max = 8699.60 avg = 8681.61 vgg16_int8 min = 6786.91 max = 6930.90 avg = 6854.29 resnet50 min = 3624.02 max = 3698.91 avg = 3652.31 resnet50_int8 min = 2537.92 max = 2618.10 avg = 2567.88 squeezenet_ssd min = 635.25 max = 693.23 avg = 663.56 squeezenet_ssd_int8 min = 577.37 max = 641.12 avg = 603.34 mobilenet_ssd min = 1529.35 max = 1711.54 avg = 1582.10 mobilenet_ssd_int8 min = 982.65 max = 1042.82 avg = 1016.62 mobilenet_yolo min = 4053.62 max = 4124.84 avg = 4094.38 mobilenetv2_yolov3 min = 1367.81 max = 1527.79 avg = 1433.04 yolov4-tiny min = 1943.20 max = 2028.02 avg = 1978.31 nanodet_m min = 433.66 max = 498.83 avg = 457.77 yolo-fastest-1.1 min = 140.07 max = 284.35 avg = 192.46 yolo-fastestv2 min = 123.91 max = 225.70 avg = 152.54 vision_transformer min = 2470.70 max = 2509.73 avg = 2486.40 FastestDet min = 145.30 max = 163.43 avg = 154.35 ``` ### AXERA AX620A (Cortex-A7 1.0GHz * 4) ``` /root/axera # ./benchncnn 4 1 0 -1 0 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 530.57 max = 533.11 avg = 532.22 squeezenet_int8 min = 359.74 max = 360.02 avg = 359.86 mobilenet min = 920.12 max = 921.04 avg = 920.52 mobilenet_int8 min = 532.60 max = 533.08 avg = 532.81 mobilenet_v2 min = 608.81 max = 609.49 avg = 609.18 mobilenet_v3 min = 531.43 max = 532.34 avg = 531.90 shufflenet min = 297.91 max = 300.08 avg = 299.06 shufflenet_v2 min = 288.44 max = 289.30 avg = 288.79 mnasnet min = 590.29 max = 590.99 avg = 590.63 proxylessnasnet min = 678.22 max = 679.22 avg = 678.63 efficientnet_b0 min = 1041.41 max = 1043.79 avg = 1042.61 efficientnetv2_b0 min = 1222.41 max = 1223.63 avg = 1222.91 regnety_400m min = 723.83 max = 725.37 avg = 724.64 blazeface min = 86.77 max = 87.21 avg = 86.92 googlenet min = 1740.32 max = 1741.44 avg = 1740.81 googlenet_int8 min = 1167.95 max = 1169.18 avg = 1168.54 resnet18 min = 1584.41 max = 1585.36 avg = 1584.97 resnet18_int8 min = 915.78 max = 918.77 
avg = 917.16 alexnet min = 1811.30 max = 1812.86 avg = 1812.07 resnet50 min = 4516.48 max = 4523.48 avg = 4519.03 resnet50_int8 min = 2573.18 max = 2574.29 avg = 2573.69 squeezenet_ssd min = 1191.79 max = 1193.71 avg = 1193.02 squeezenet_ssd_int8 min = 862.36 max = 863.69 avg = 862.83 mobilenet_ssd min = 1950.48 max = 1950.98 avg = 1950.65 mobilenet_ssd_int8 min = 1081.70 max = 1082.64 avg = 1082.20 mobilenet_yolo min = 4629.22 max = 4630.23 avg = 4629.69 mobilenetv2_yolov3 min = 2233.05 max = 2234.14 avg = 2233.42 yolov4-tiny min = 2942.58 max = 2946.55 avg = 2944.81 nanodet_m min = 692.19 max = 693.36 avg = 692.79 yolo-fastest-1.1 min = 333.62 max = 334.43 avg = 334.00 yolo-fastestv2 min = 256.41 max = 257.32 avg = 256.83 /root/axera # ./benchncnn 4 4 0 -1 0 loop_count = 4 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 150.38 max = 179.83 avg = 157.90 squeezenet_int8 min = 106.97 max = 107.43 avg = 107.22 mobilenet min = 248.92 max = 273.98 avg = 255.72 mobilenet_int8 min = 139.49 max = 139.65 avg = 139.60 mobilenet_v2 min = 174.67 max = 204.35 avg = 182.30 mobilenet_v3 min = 152.17 max = 152.54 avg = 152.30 shufflenet min = 98.74 max = 125.99 avg = 105.74 shufflenet_v2 min = 103.44 max = 103.88 avg = 103.65 mnasnet min = 167.63 max = 197.54 avg = 175.28 proxylessnasnet min = 186.02 max = 186.32 avg = 186.15 efficientnet_b0 min = 284.35 max = 318.17 avg = 292.90 efficientnetv2_b0 min = 329.56 max = 359.71 avg = 337.22 regnety_400m min = 246.91 max = 277.08 avg = 254.71 blazeface min = 30.95 max = 31.31 avg = 31.16 googlenet min = 474.87 max = 504.38 avg = 489.43 googlenet_int8 min = 322.06 max = 331.97 avg = 324.57 resnet18 min = 440.03 max = 475.28 avg = 456.70 resnet18_int8 min = 252.01 max = 280.64 avg = 259.22 alexnet min = 453.16 max = 478.80 avg = 465.88 resnet50 min = 1214.70 max = 1252.42 avg = 1229.22 resnet50_int8 min = 684.53 max = 715.65 avg = 706.14 squeezenet_ssd min = 358.84 max = 393.45 avg = 367.77 
squeezenet_ssd_int8 min = 281.56 max = 312.86 avg = 289.85 mobilenet_ssd min = 519.11 max = 559.14 avg = 538.41 mobilenet_ssd_int8 min = 284.58 max = 310.02 avg = 291.02 mobilenet_yolo min = 1238.87 max = 1284.74 avg = 1260.51 mobilenetv2_yolov3 min = 624.42 max = 665.81 avg = 642.15 yolov4-tiny min = 826.46 max = 852.97 avg = 844.88 nanodet_m min = 246.76 max = 279.09 avg = 255.04 yolo-fastest-1.1 min = 116.12 max = 116.95 avg = 116.50 yolo-fastestv2 min = 91.08 max = 102.93 avg = 94.41 ``` ### AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) test in wsl2 with ubuntu 20.04 ``` $ ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 6.53 max = 7.05 avg = 6.77 squeezenet_int8 min = 17.72 max = 17.86 avg = 17.79 mobilenet min = 11.43 max = 11.98 avg = 11.64 mobilenet_int8 min = 22.91 max = 24.48 avg = 23.26 mobilenet_v2 min = 8.28 max = 9.29 avg = 8.66 mobilenet_v3 min = 6.86 max = 6.98 avg = 6.94 shufflenet min = 3.75 max = 4.64 avg = 3.91 shufflenet_v2 min = 5.08 max = 5.80 avg = 5.22 mnasnet min = 7.54 max = 8.60 avg = 7.81 proxylessnasnet min = 9.18 max = 10.33 avg = 9.41 efficientnet_b0 min = 22.57 max = 23.67 avg = 22.93 efficientnetv2_b0 min = 21.23 max = 22.08 avg = 21.45 regnety_400m min = 10.56 max = 10.80 avg = 10.63 blazeface min = 1.08 max = 1.17 avg = 1.11 googlenet min = 27.91 max = 29.51 avg = 28.28 googlenet_int8 min = 71.00 max = 86.86 avg = 72.74 resnet18 min = 20.11 max = 20.56 avg = 20.26 resnet18_int8 min = 63.80 max = 65.13 avg = 64.19 alexnet min = 20.64 max = 24.25 avg = 21.65 vgg16 min = 119.99 max = 125.45 avg = 121.59 vgg16_int8 min = 268.11 max = 270.41 avg = 269.15 resnet50 min = 55.42 max = 56.29 avg = 55.70 resnet50_int8 min = 126.73 max = 132.37 avg = 128.72 squeezenet_ssd min = 28.41 max = 30.30 avg = 29.20 squeezenet_ssd_int8 min = 41.12 max = 42.53 avg = 41.52 mobilenet_ssd min = 24.15 max = 24.91 avg = 24.33 mobilenet_ssd_int8 min = 46.06 max = 59.19 avg = 49.87 
mobilenet_yolo min = 67.58 max = 73.19 avg = 68.99 mobilenetv2_yolov3 min = 29.44 max = 30.46 avg = 29.78 yolov4-tiny min = 41.89 max = 43.47 avg = 42.37 nanodet_m min = 11.23 max = 11.47 avg = 11.36 yolo-fastest-1.1 min = 3.86 max = 4.64 avg = 4.04 yolo-fastestv2 min = 3.43 max = 3.99 avg = 3.56 vision_transformer min = 1590.86 max = 1593.97 avg = 1591.91 $ ./benchncnn 10 16 0 -1 0 loop_count = 10 num_threads = 16 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 2.94 max = 4.66 avg = 3.31 squeezenet_int8 min = 3.53 max = 5.26 avg = 3.92 mobilenet min = 3.96 max = 5.30 avg = 4.21 mobilenet_int8 min = 4.27 max = 4.56 avg = 4.35 mobilenet_v2 min = 3.63 max = 4.20 avg = 3.82 mobilenet_v3 min = 3.25 max = 4.79 avg = 3.58 shufflenet min = 2.98 max = 3.59 avg = 3.12 shufflenet_v2 min = 2.62 max = 5.93 avg = 3.04 mnasnet min = 3.09 max = 3.49 avg = 3.28 proxylessnasnet min = 3.57 max = 4.18 avg = 3.76 efficientnet_b0 min = 5.98 max = 6.48 avg = 6.18 efficientnetv2_b0 min = 6.96 max = 7.48 avg = 7.13 regnety_400m min = 8.71 max = 11.89 avg = 9.61 blazeface min = 0.86 max = 0.96 avg = 0.89 googlenet min = 10.75 max = 11.33 avg = 11.00 googlenet_int8 min = 12.75 max = 15.47 avg = 13.50 resnet18 min = 8.92 max = 16.08 avg = 10.08 resnet18_int8 min = 10.55 max = 10.99 avg = 10.69 alexnet min = 9.95 max = 10.45 avg = 10.17 vgg16 min = 52.28 max = 53.69 avg = 52.89 vgg16_int8 min = 44.90 max = 47.90 avg = 45.61 resnet50 min = 17.80 max = 21.43 avg = 18.66 resnet50_int8 min = 21.80 max = 25.42 avg = 22.75 squeezenet_ssd min = 14.49 max = 16.36 avg = 14.90 squeezenet_ssd_int8 min = 10.02 max = 10.49 avg = 10.28 mobilenet_ssd min = 7.20 max = 7.86 avg = 7.51 mobilenet_ssd_int8 min = 8.51 max = 10.90 avg = 9.09 mobilenet_yolo min = 35.67 max = 44.84 avg = 37.33 mobilenetv2_yolov3 min = 12.72 max = 17.16 avg = 13.67 yolov4-tiny min = 20.81 max = 22.11 avg = 21.33 nanodet_m min = 5.13 max = 42.12 avg = 9.07 yolo-fastest-1.1 min = 3.05 max = 4.72 avg = 3.39 
yolo-fastestv2 min = 3.33 max = 3.73 avg = 3.44 vision_transformer min = 214.91 max = 229.91 avg = 220.82 ``` ### Intel Celeron M 420 (Yonah 1.60 GHz x 1) Tested on `Debian GNU/Linux 11 (bullseye) i686` with `cmake -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF-DNCNN_BUILD_TESTS=ON ..`. ``` mouri@Mouri-Laptop-2:~/ncnn/benchmark$ ./../build/benchmark/benchncnn loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 289.23 max = 301.83 avg = 292.90 squeezenet_int8 min = 442.82 max = 457.21 avg = 446.89 mobilenet min = 549.62 max = 561.20 avg = 554.78 mobilenet_int8 min = 823.92 max = 837.70 avg = 830.52 mobilenet_v2 min = 341.72 max = 353.77 avg = 345.34 mobilenet_v3 min = 267.68 max = 282.08 avg = 273.10 shufflenet min = 151.66 max = 153.02 avg = 152.24 shufflenet_v2 min = 161.54 max = 163.38 avg = 162.13 mnasnet min = 322.66 max = 336.91 avg = 326.86 proxylessnasnet min = 356.63 max = 368.79 avg = 360.66 efficientnet_b0 min = 489.92 max = 505.11 avg = 497.32 efficientnetv2_b0 min = 618.16 max = 632.02 avg = 622.82 regnety_400m min = 414.83 max = 428.42 avg = 419.28 blazeface min = 38.56 max = 40.05 avg = 39.05 googlenet min = 1022.54 max = 1037.53 avg = 1029.48 googlenet_int8 min = 1493.35 max = 1495.46 avg = 1494.31 resnet18 min = 803.32 max = 818.27 avg = 812.49 resnet18_int8 min = 1188.26 max = 1200.88 avg = 1192.56 alexnet min = 613.78 max = 623.88 avg = 619.99 vgg16 min = 4465.44 max = 4478.12 avg = 4474.16 vgg16_int8 min = 6042.40 max = 6114.37 avg = 6077.07 resnet50 min = 2517.75 max = 2528.42 avg = 2522.83 resnet50_int8 min = 3746.28 max = 3771.09 avg = 3756.88 squeezenet_ssd min = 585.56 max = 636.01 avg = 602.62 squeezenet_ssd_int8 min = 822.43 max = 968.77 avg = 862.33 mobilenet_ssd min = 1116.98 max = 1139.17 avg = 1127.65 mobilenet_ssd_int8 min = 1665.03 max = 1670.55 avg = 1668.37 mobilenet_yolo min = 2638.61 max = 2666.54 avg = 2652.26 mobilenetv2_yolov3 min = 1248.56 max = 1255.98 avg = 
1251.22 yolov4-tiny min = 1507.31 max = 1525.56 avg = 1514.66 nanodet_m min = 386.41 max = 400.63 avg = 391.21 yolo-fastest-1.1 min = 159.97 max = 164.53 avg = 161.41 yolo-fastestv2 min = 134.29 max = 135.47 avg = 134.70 vision_transformer min = 22201.32 max = 22510.75 avg = 22315.09 FastestDet min = 146.94 max = 148.50 avg = 147.44 ``` ### VisionFive2 , JH7110 (SiFive-U74(RV64GC) 1.5GHz x 4) riscv64 with PowerVR B-Series BXE-4-32 Test on Debian 11 with g++ 12.2.0 and vulkan 1.3.231 ``` user@starfive:~/Downloads/ncnn-master/benchmark$ ./benchncnn 10 4 0 -1 0 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 149.06 max = 149.33 avg = 149.17 squeezenet_int8 min = 1318.66 max = 1349.04 avg = 1328.87 mobilenet min = 255.13 max = 255.71 avg = 255.39 mobilenet_int8 min = 2025.40 max = 2036.00 avg = 2031.67 mobilenet_v2 min = 173.92 max = 174.60 avg = 174.31 mobilenet_v3 min = 166.58 max = 167.30 avg = 167.02 shufflenet min = 91.36 max = 91.72 avg = 91.57 shufflenet_v2 min = 83.50 max = 83.95 avg = 83.76 mnasnet min = 190.42 max = 191.15 avg = 190.66 proxylessnasnet min = 226.35 max = 226.81 avg = 226.52 efficientnet_b0 min = 342.74 max = 343.62 avg = 343.15 efficientnetv2_b0 min = 343.31 max = 344.23 avg = 343.80 regnety_400m min = 227.04 max = 227.75 avg = 227.43 blazeface min = 26.18 max = 26.43 avg = 26.28 googlenet min = 506.76 max = 508.58 avg = 507.84 googlenet_int8 min = 3827.36 max = 3856.05 avg = 3835.67 resnet18 min = 401.12 max = 402.27 avg = 401.61 resnet18_int8 min = 4053.06 max = 4069.98 avg = 4061.63 alexnet min = 297.81 max = 320.09 avg = 301.39 vgg16 min = 2338.76 max = 2351.23 avg = 2346.19 vgg16_int8 min = 36846.41 max = 36929.56 avg = 36886.26 resnet50 min = 1189.88 max = 1211.10 avg = 1193.34 resnet50_int8 min = 11819.59 max = 11884.94 avg = 11845.22 squeezenet_ssd min = 351.71 max = 352.73 avg = 352.30 squeezenet_ssd_int8 min = 2872.00 max = 2903.35 avg = 2891.01 mobilenet_ssd min = 530.92 max = 531.73 
avg = 531.28 mobilenet_ssd_int8 min = 4511.56 max = 4553.41 avg = 4523.51 mobilenet_yolo min = 1357.14 max = 1359.82 avg = 1358.83 mobilenetv2_yolov3 min = 621.15 max = 622.29 avg = 621.66 yolov4-tiny min = 803.06 max = 809.19 avg = 805.79 nanodet_m min = 220.82 max = 221.18 avg = 221.06 yolo-fastest-1.1 min = 102.59 max = 103.98 avg = 102.93 yolo-fastestv2 min = 89.61 max = 90.03 avg = 89.76 vision_transformer min = 15862.96 max = 15897.17 avg = 15878.22 FastestDet min = 108.69 max = 109.00 avg = 108.84 user@starfive:~/Downloads/ncnn-master/benchmark$ ./benchncnn 10 4 1 -1 0 loop_count = 10 num_threads = 4 powersave = 1 gpu_device = -1 cooling_down = 0 squeezenet min = 148.62 max = 148.95 avg = 148.82 squeezenet_int8 min = 1324.10 max = 1339.58 avg = 1332.57 mobilenet min = 255.67 max = 256.20 avg = 255.93 mobilenet_int8 min = 2024.72 max = 2028.23 avg = 2026.29 mobilenet_v2 min = 173.76 max = 174.73 avg = 174.31 mobilenet_v3 min = 166.66 max = 167.28 avg = 166.99 shufflenet min = 91.18 max = 91.68 avg = 91.46 shufflenet_v2 min = 83.88 max = 84.84 avg = 84.26 mnasnet min = 190.23 max = 190.84 avg = 190.45 proxylessnasnet min = 226.02 max = 226.82 avg = 226.38 efficientnet_b0 min = 342.95 max = 343.52 avg = 343.25 efficientnetv2_b0 min = 343.07 max = 343.80 avg = 343.39 regnety_400m min = 226.96 max = 227.62 avg = 227.24 blazeface min = 26.08 max = 26.32 avg = 26.18 googlenet min = 508.30 max = 510.34 avg = 509.27 googlenet_int8 min = 3825.65 max = 3858.90 avg = 3833.79 resnet18 min = 400.69 max = 403.18 avg = 401.74 resnet18_int8 min = 4055.41 max = 4123.79 avg = 4067.55 alexnet min = 296.35 max = 300.46 avg = 299.11 vgg16 min = 2337.68 max = 2349.78 avg = 2344.77 vgg16_int8 min = 36760.47 max = 36985.40 avg = 36918.31 resnet50 min = 1190.13 max = 1221.98 avg = 1196.77 resnet50_int8 min = 11816.03 max = 11869.41 avg = 11843.72 squeezenet_ssd min = 351.24 max = 352.20 avg = 351.89 squeezenet_ssd_int8 min = 2873.40 max = 2902.55 avg = 2891.58 mobilenet_ssd min = 
530.45 max = 531.85 avg = 530.91 mobilenet_ssd_int8 min = 4504.87 max = 4564.64 avg = 4528.56 mobilenet_yolo min = 1357.83 max = 1360.48 avg = 1358.75 mobilenetv2_yolov3 min = 621.00 max = 621.76 avg = 621.35 yolov4-tiny min = 803.54 max = 808.00 avg = 806.16 nanodet_m min = 221.08 max = 222.57 avg = 221.72 yolo-fastest-1.1 min = 102.79 max = 103.15 avg = 102.95 yolo-fastestv2 min = 89.56 max = 89.79 avg = 89.70 vision_transformer min = 15874.12 max = 15907.97 avg = 15883.26 FastestDet min = 108.22 max = 108.64 avg = 108.36 user@starfive:~/Downloads/ncnn-master/benchmark$ ./benchncnn 10 1 1 0 0 [0 PowerVR B-Series BXE-4-32] queueC=0[2] queueG=0[2] queueT=0[2] [0 PowerVR B-Series BXE-4-32] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 PowerVR B-Series BXE-4-32] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 PowerVR B-Series BXE-4-32] subgroup=1 basic/vote/ballot/shuffle=1/1/1/1 [0 PowerVR B-Series BXE-4-32] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 10 num_threads = 1 powersave = 1 gpu_device = 0 cooling_down = 0 squeezenet min = 355.26 max = 356.42 avg = 355.75 squeezenet_int8 min = 5171.49 max = 5187.42 avg = 5178.45 mobilenet min = 757.04 max = 762.74 avg = 759.77 mobilenet_int8 min = 7695.03 max = 7715.39 avg = 7705.16 mobilenet_v2 min = 476.20 max = 477.19 avg = 476.94 mobilenet_v3 min = 403.12 max = 405.44 avg = 405.09 shufflenet min = 181.02 max = 182.32 avg = 181.96 shufflenet_v2 min = 257.29 max = 259.06 avg = 258.57 mnasnet min = 495.78 max = 497.44 avg = 496.89 proxylessnasnet min = 562.60 max = 563.02 avg = 562.83 efficientnet_b0 min = 660.29 max = 664.73 avg = 662.97 efficientnetv2_b0 min = 856.88 max = 864.96 avg = 861.30 regnety_400m min = 492.79 max = 495.44 avg = 494.51 blazeface min = 65.95 max = 68.72 avg = 68.19 googlenet min = 1132.70 max = 1134.65 avg = 1133.50 googlenet_int8 min = 14978.60 max = 15000.89 avg = 14988.56 resnet18 min = 1155.15 max = 1172.06 avg = 1160.64 resnet18_int8 min = 15776.36 max = 15790.48 avg = 15782.76 alexnet min = 
601.09 max = 606.63 avg = 603.81 vgg16 min = 5558.47 max = 5613.23 avg = 5586.98 vgg16_int8 min = 143936.04 max = 144068.45 avg = 143991.58 resnet50 min = 3425.81 max = 3440.51 avg = 3434.73 resnet50_int8 min = 44780.92 max = 45144.97 avg = 45038.46 squeezenet_ssd min = 967.46 max = 978.39 avg = 972.76 squeezenet_ssd_int8 min = 10842.39 max = 10999.00 avg = 10940.15 mobilenet_ssd min = 1565.15 max = 1570.11 avg = 1568.87 mobilenet_ssd_int8 min = 17317.40 max = 17386.46 avg = 17361.80 mobilenet_yolo min = 3559.36 max = 3570.38 avg = 3568.84 mobilenetv2_yolov3 min = 1731.98 max = 1739.52 avg = 1735.33 yolov4-tiny min = 1984.22 max = 2001.65 avg = 1993.20 nanodet_m min = 603.06 max = 609.65 avg = 607.79 yolo-fastest-1.1 min = 306.30 max = 312.33 avg = 310.63 yolo-fastestv2 min = 201.45 max = 207.44 avg = 205.93 vision_transformer min = 27310.74 max = 27358.54 avg = 27327.23 FastestDet min = 245.07 max = 248.81 avg = 248.14 ``` ### T-Head TH1520 (C910V, 1.848 GHz x 4 + BXM-4-64 PowerVR) Tested on `Linux anolis-riscv 5.10.112-00579-g8e3db308d5a5 #23 SMP PREEMPT Fri Aug 12 10:17:32 CST 2022 riscv64 riscv64 riscv64 GNU/Linux` ``` [root@anolis-riscv benchmark]# ./benchncnn syscall error -1 loop_count = 4 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 187.88 max = 188.82 avg = 188.13 squeezenet_int8 min = 2388.26 max = 2446.92 avg = 2411.46 mobilenet min = 321.46 max = 323.34 avg = 322.19 mobilenet_int8 min = 2318.93 max = 2458.55 avg = 2400.99 mobilenet_v2 min = 214.01 max = 216.00 avg = 215.35 mobilenet_v3 min = 247.71 max = 248.18 avg = 247.96 shufflenet min = 155.58 max = 155.85 avg = 155.67 shufflenet_v2 min = 99.50 max = 99.75 avg = 99.63 mnasnet min = 261.46 max = 263.83 avg = 262.53 proxylessnasnet min = 315.40 max = 316.89 avg = 316.28 efficientnet_b0 min = 484.97 max = 486.16 avg = 485.55 efficientnetv2_b0 min = 453.03 max = 453.40 avg = 453.21 regnety_400m min = 314.09 max = 315.33 avg = 314.77 blazeface min = 46.14 max = 46.69 
avg = 46.39 googlenet min = 650.99 max = 653.60 avg = 651.69 googlenet_int8 min = 5435.11 max = 6391.98 avg = 6012.81 resnet18 min = 505.48 max = 506.70 avg = 506.06 resnet18_int8 min = 5053.33 max = 6599.94 avg = 6001.86 alexnet min = 403.68 max = 404.60 avg = 404.23 vgg16 min = 2731.55 max = 2746.48 avg = 2738.82 ``` test on `Beaglev-ahead(Linux ahead 5.10.113-ahead #2023.08.02.13.12+2c2096a98 SMP PREEMPT Wed Aug 2 13:13:02 UTC 2 riscv64 GNU/Linux)` ``` debian@ahead:~/ncnn/build/benchmark$ sudo ./benchncnn 10 1 0 0 0 [0 PowerVR B-Series BXM-4-64] queueC=0[2] queueG=0[2] queueT=0[2] [0 PowerVR B-Series BXM-4-64] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 PowerVR B-Series BXM-4-64] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 PowerVR B-Series BXM-4-64] subgroup=1 basic/vote/ballot/shuffle=1/1/1/1 [0 PowerVR B-Series BXM-4-64] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 287.88 max = 296.84 avg = 295.68 squeezenet_int8 min = 2289.46 max = 2320.97 avg = 2306.60 mobilenet min = 584.32 max = 588.48 avg = 587.41 mobilenet_int8 min = 2487.91 max = 2492.12 avg = 2489.64 mobilenet_v2 min = 380.02 max = 386.67 avg = 385.75 mobilenet_v3 min = 314.73 max = 328.84 avg = 325.76 shufflenet min = 146.96 max = 158.29 avg = 156.38 shufflenet_v2 min = 203.94 max = 211.77 avg = 210.82 mnasnet min = 395.80 max = 404.95 avg = 403.80 proxylessnasnet min = 447.74 max = 456.89 avg = 454.87 efficientnet_b0 min = 532.23 max = 543.05 avg = 538.53 efficientnetv2_b0 min = 659.43 max = 681.64 avg = 669.13 regnety_400m min = 393.16 max = 407.27 avg = 403.81 blazeface min = 50.41 max = 61.83 avg = 56.92 googlenet min = 890.79 max = 898.09 avg = 896.25 googlenet_int8 min = 4713.76 max = 5296.61 avg = 5044.39 resnet18 min = 814.16 max = 824.53 avg = 820.35 resnet18_int8 min = 4800.73 max = 6015.34 avg = 5765.47 alexnet min = 453.80 max = 465.51 avg = 462.11 vgg16 min = 4016.26 max = 4027.30 avg = 4021.94 vgg16_int8 
min = 55069.69 max = 64814.86 avg = 59096.20 resnet50 min = 2494.42 max = 2502.38 avg = 2500.28 resnet50_int8 min = 15366.90 max = 17179.36 avg = 16701.20 squeezenet_ssd min = 724.36 max = 738.28 avg = 730.44 squeezenet_ssd_int8 min = 4550.62 max = 5235.87 avg = 4684.19 mobilenet_ssd min = 1207.04 max = 1218.80 avg = 1212.86 mobilenet_ssd_int8 min = 6019.61 max = 6349.35 avg = 6184.49 mobilenet_yolo min = 2736.28 max = 2747.06 avg = 2743.21 mobilenetv2_yolov3 min = 1339.16 max = 1349.46 avg = 1344.81 yolov4-tiny min = 1457.05 max = 1459.04 avg = 1457.81 nanodet_m min = 443.40 max = 444.58 avg = 444.00 yolo-fastest-1.1 min = 240.39 max = 248.05 avg = 247.04 yolo-fastestv2 min = 162.71 max = 173.30 avg = 169.39 vision_transformer min = 17148.14 max = 17250.66 avg = 17202.60 FastestDet min = 199.71 max = 200.38 avg = 199.90 ``` ### CVITEK SG2000 (C906, 1 GHz x 1 + 700MHz x 1) ``` [root@milkv-duo]~/ncnn# ./benchncnn 4 1 2 -1 0 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 221.53 max = 229.14 avg = 225.53 squeezenet_int8 min = 8153.49 max = 8163.26 avg = 8160.17 mobilenet min = 329.60 max = 338.58 avg = 335.00 mobilenet_int8 min = 12725.12 max = 12733.70 avg = 12728.52 mobilenet_v2 min = 253.83 max = 260.60 avg = 257.20 mobilenet_v3 min = 205.51 max = 212.72 avg = 209.26 shufflenet min = 358.73 max = 367.05 avg = 364.52 shufflenet_v2 min = 238.44 max = 246.05 avg = 242.09 mnasnet min = 254.39 max = 258.26 avg = 255.63 proxylessnasnet min = 294.99 max = 302.80 avg = 300.65 regnety_400m min = 407.72 max = 409.69 avg = 409.03 blazeface min = 117.08 max = 124.26 avg = 119.00 googlenet min = 817.28 max = 824.70 avg = 820.70 googlenet_int8 min = 18246.97 max = 18276.23 avg = 18261.11 resnet18 min = 610.81 max = 618.87 avg = 613.91 resnet18_int8 min = 18772.96 max = 18808.53 avg = 18786.88 alexnet min = 568.11 max = 577.02 avg = 570.66 squeezenet_ssd min = 890.76 max = 896.30 avg = 893.57 squeezenet_ssd_int8 min = 31680.48 max = 
31938.09 avg = 31810.68 mobilenet_ssd min = 746.38 max = 762.07 avg = 752.19 mobilenet_ssd_int8 min = 41140.62 max = 41540.85 avg = 41356.70 mobilenet_yolo min = 1744.59 max = 1755.90 avg = 1750.05 mobilenetv2_yolov3 min = 890.20 max = 897.86 avg = 895.14 yolov4-tiny min = 1056.03 max = 1059.44 avg = 1058.21 nanodet_m min = 547.85 max = 554.80 avg = 549.81 yolo-fastest-1.1 min = 290.89 max = 298.31 avg = 296.24 yolo-fastestv2 min = 188.59 max = 196.79 avg = 190.96 FastestDet min = 196.19 max = 205.96 avg = 200.99 ``` ### Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) test in ROCK5 MODEL B ``` rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 15.22 max = 16.03 avg = 15.70 squeezenet_int8 min = 16.77 max = 16.96 avg = 16.86 mobilenet min = 23.07 max = 23.58 avg = 23.36 mobilenet_int8 min = 18.58 max = 18.90 avg = 18.72 mobilenet_v2 min = 18.74 max = 19.10 avg = 18.96 mobilenet_v3 min = 14.40 max = 14.65 avg = 14.50 shufflenet min = 9.74 max = 9.88 avg = 9.84 shufflenet_v2 min = 9.44 max = 9.55 avg = 9.50 mnasnet min = 14.73 max = 15.03 avg = 14.87 proxylessnasnet min = 18.37 max = 18.59 avg = 18.46 efficientnet_b0 min = 29.11 max = 30.18 avg = 29.63 efficientnetv2_b0 min = 46.40 max = 46.95 avg = 46.76 regnety_400m min = 19.18 max = 19.39 avg = 19.28 blazeface min = 5.16 max = 5.23 avg = 5.20 googlenet min = 64.64 max = 65.33 avg = 65.00 googlenet_int8 min = 61.86 max = 63.41 avg = 62.42 resnet18 min = 42.00 max = 43.34 avg = 42.48 resnet18_int8 min = 67.22 max = 67.80 avg = 67.45 alexnet min = 57.65 max = 58.21 avg = 58.01 vgg16 min = 192.35 max = 193.36 avg = 192.84 vgg16_int8 min = 570.86 max = 578.81 avg = 574.50 resnet50 min = 107.86 max = 109.52 avg = 108.70 resnet50_int8 min = 134.41 max = 135.86 avg = 135.18 squeezenet_ssd min = 40.85 max = 41.24 avg = 41.02 squeezenet_ssd_int8 min = 52.23 max = 53.70 avg = 52.54 mobilenet_ssd min = 45.11 
max = 45.50 avg = 45.32 mobilenet_ssd_int8 min = 36.53 max = 36.63 avg = 36.59 mobilenet_yolo min = 95.18 max = 96.79 avg = 95.90 mobilenetv2_yolov3 min = 65.50 max = 65.88 avg = 65.72 yolov4-tiny min = 86.13 max = 88.84 avg = 87.29 nanodet_m min = 22.57 max = 22.87 avg = 22.74 yolo-fastest-1.1 min = 9.23 max = 9.35 avg = 9.29 yolo-fastestv2 min = 8.62 max = 8.83 avg = 8.73 vision_transformer min = 3077.54 max = 3396.13 avg = 3339.58 FastestDet min = 9.11 max = 9.30 avg = 9.20 rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 8 0 -1 0 loop_count = 10 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 10.02 max = 11.01 avg = 10.43 squeezenet_int8 min = 11.78 max = 13.77 avg = 12.55 mobilenet min = 12.75 max = 13.58 avg = 13.12 mobilenet_int8 min = 12.23 max = 14.29 avg = 13.54 mobilenet_v2 min = 12.76 max = 14.27 avg = 13.40 mobilenet_v3 min = 9.51 max = 9.81 avg = 9.71 shufflenet min = 7.06 max = 7.23 avg = 7.13 shufflenet_v2 min = 6.21 max = 7.32 avg = 6.38 mnasnet min = 9.32 max = 12.49 avg = 10.75 proxylessnasnet min = 13.79 max = 15.51 avg = 14.70 efficientnet_b0 min = 16.59 max = 17.99 avg = 17.08 efficientnetv2_b0 min = 28.26 max = 32.26 avg = 30.52 regnety_400m min = 13.43 max = 15.00 avg = 13.72 blazeface min = 3.87 max = 7.38 avg = 5.65 googlenet min = 29.18 max = 44.00 avg = 36.31 googlenet_int8 min = 31.14 max = 37.48 avg = 34.58 resnet18 min = 21.47 max = 24.40 avg = 22.35 resnet18_int8 min = 26.68 max = 29.89 avg = 28.45 alexnet min = 29.35 max = 38.09 avg = 31.65 vgg16 min = 112.37 max = 122.94 avg = 117.05 vgg16_int8 min = 161.08 max = 215.29 avg = 176.89 resnet50 min = 54.54 max = 57.50 avg = 55.71 resnet50_int8 min = 54.76 max = 65.05 avg = 60.59 squeezenet_ssd min = 26.21 max = 35.05 avg = 30.76 squeezenet_ssd_int8 min = 33.34 max = 40.88 avg = 36.19 mobilenet_ssd min = 26.71 max = 28.85 avg = 27.88 mobilenet_ssd_int8 min = 22.03 max = 25.31 avg = 24.21 mobilenet_yolo min = 60.51 max = 74.65 avg = 65.45 
mobilenetv2_yolov3 min = 37.27 max = 44.13 avg = 41.20 yolov4-tiny min = 49.84 max = 58.12 avg = 53.93 nanodet_m min = 16.54 max = 22.41 avg = 20.60 yolo-fastest-1.1 min = 8.49 max = 13.50 avg = 9.91 yolo-fastestv2 min = 6.28 max = 11.22 avg = 8.00 vision_transformer min = 968.62 max = 1063.47 avg = 1019.12 FastestDet min = 6.14 max = 11.92 avg = 7.85 rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 4 2 -1 0 loop_count = 10 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 0 squeezenet min = 6.78 max = 7.27 avg = 7.07 squeezenet_int8 min = 4.58 max = 4.73 avg = 4.63 mobilenet min = 5.67 max = 5.78 avg = 5.72 mobilenet_int8 min = 5.01 max = 5.20 avg = 5.15 mobilenet_v2 min = 5.44 max = 5.76 avg = 5.50 mobilenet_v3 min = 4.67 max = 5.03 avg = 4.74 shufflenet min = 4.22 max = 4.30 avg = 4.27 shufflenet_v2 min = 3.48 max = 3.60 avg = 3.53 mnasnet min = 4.52 max = 4.83 avg = 4.61 proxylessnasnet min = 5.44 max = 6.01 avg = 5.56 efficientnet_b0 min = 8.33 max = 8.52 avg = 8.41 efficientnetv2_b0 min = 12.95 max = 13.08 avg = 13.02 regnety_400m min = 8.60 max = 8.73 avg = 8.66 blazeface min = 1.86 max = 1.95 avg = 1.90 googlenet min = 16.58 max = 16.85 avg = 16.65 googlenet_int8 min = 16.99 max = 17.13 avg = 17.06 resnet18 min = 14.98 max = 15.30 avg = 15.08 resnet18_int8 min = 20.10 max = 20.22 avg = 20.15 alexnet min = 19.78 max = 20.21 avg = 19.87 vgg16 min = 66.35 max = 94.16 avg = 75.24 vgg16_int8 min = 131.02 max = 131.98 avg = 131.51 resnet50 min = 28.07 max = 28.78 avg = 28.28 resnet50_int8 min = 33.56 max = 35.53 avg = 33.84 squeezenet_ssd min = 16.40 max = 16.80 avg = 16.49 squeezenet_ssd_int8 min = 18.64 max = 19.00 avg = 18.76 mobilenet_ssd min = 13.66 max = 13.78 avg = 13.72 mobilenet_ssd_int8 min = 11.23 max = 11.42 avg = 11.33 mobilenet_yolo min = 30.76 max = 31.03 avg = 30.86 mobilenetv2_yolov3 min = 19.28 max = 21.07 avg = 20.30 yolov4-tiny min = 33.44 max = 37.68 avg = 34.70 nanodet_m min = 8.28 max = 8.55 avg = 8.38 yolo-fastest-1.1 min = 
4.30 max = 4.40 avg = 4.34 yolo-fastestv2 min = 4.07 max = 4.18 avg = 4.13 vision_transformer min = 815.67 max = 819.27 avg = 817.49 FastestDet min = 4.34 max = 7.47 avg = 5.18 ``` ### AWS c5.4xlarge Instance - OS: Ubuntu 20.04.6 LTS x86_64 - CPU: Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz - Compiler: gcc version 9.4.0 (Ubuntu 9.4.0-1ubuntu1~20.04.2) - ncnn tag: 20240102 ``` loop_count = 4 num_threads = 8 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 3.31 max = 3.33 avg = 3.32 squeezenet_int8 min = 3.87 max = 4.34 avg = 4.07 mobilenet min = 3.12 max = 3.20 avg = 3.17 mobilenet_int8 min = 3.32 max = 3.45 avg = 3.38 mobilenet_v2 min = 4.23 max = 4.43 avg = 4.33 mobilenet_v3 min = 3.82 max = 3.92 avg = 3.87 shufflenet min = 3.67 max = 3.72 avg = 3.69 shufflenet_v2 min = 4.08 max = 4.22 avg = 4.15 mnasnet min = 3.62 max = 3.69 avg = 3.64 proxylessnasnet min = 4.29 max = 4.59 avg = 4.37 efficientnet_b0 min = 5.32 max = 5.64 avg = 5.50 efficientnetv2_b0 min = 6.81 max = 6.88 avg = 6.85 regnety_400m min = 9.71 max = 9.77 avg = 9.74 blazeface min = 1.71 max = 2.57 avg = 2.10 googlenet min = 10.00 max = 10.09 avg = 10.05 googlenet_int8 min = 8.76 max = 8.79 avg = 8.77 resnet18 min = 6.55 max = 6.91 avg = 6.70 resnet18_int8 min = 5.63 max = 5.95 avg = 5.81 alexnet min = 4.88 max = 4.91 avg = 4.89 vgg16 min = 36.99 max = 37.04 avg = 37.01 vgg16_int8 min = 28.13 max = 28.57 avg = 28.31 resnet50 min = 13.99 max = 14.13 avg = 14.06 resnet50_int8 min = 12.49 max = 12.56 avg = 12.53 squeezenet_ssd min = 9.93 max = 10.04 avg = 9.98 squeezenet_ssd_int8 min = 9.51 max = 9.70 avg = 9.59 mobilenet_ssd min = 6.60 max = 6.63 avg = 6.61 mobilenet_ssd_int8 min = 6.95 max = 7.10 avg = 7.02 mobilenet_yolo min = 18.28 max = 18.44 avg = 18.35 mobilenetv2_yolov3 min = 13.26 max = 13.39 avg = 13.32 yolov4-tiny min = 25.14 max = 25.58 avg = 25.37 nanodet_m min = 7.71 max = 7.77 avg = 7.75 yolo-fastest-1.1 min = 4.69 max = 4.96 avg = 4.81 yolo-fastestv2 min = 4.84 max = 5.17 
avg = 5.01 vision_transformer min = 139.34 max = 140.38 avg = 139.96 FastestDet min = 4.95 max = 5.12 avg = 5.06 ``` ### Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU) - Host OS: Microsoft Windows 11 Enterprise (10.0.22621.1635) - Guest OS: openSUSE Tumbleweed x86_64 20230507 - Mesa 3D source tree: https://gitlab.freedesktop.org/mesa/mesa/-/tree/ce6430067613e3e64cabf79918a3d96122b0c4c4 - Mesa 3D configuration command > meson --prefix="${PWD}/build/install" -D gallium-drivers=swrast,d3d12 -D vulkan-drivers=swrast,microsoft-experimental build/ - ncnn configuration command > cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. ``` mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 0 0 WARNING: dzn is not a conformant Vulkan implementation, testing use only. WARNING: dzn is not a conformant Vulkan implementation, testing use only. 
[0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)] queueC=1[8] queueG=0[4] queueT=2[1] [0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)] fp16-p/s/a=1/1/1 int8-p/s/a=1/0/0 [0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 [1 Microsoft Direct3D12 (Intel(R) UHD Graphics)] queueC=1[8] queueG=0[4] queueT=2[1] [1 Microsoft Direct3D12 (Intel(R) UHD Graphics)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 Microsoft Direct3D12 (Intel(R) UHD Graphics)] fp16-p/s/a=1/1/1 int8-p/s/a=1/0/0 [1 Microsoft Direct3D12 (Intel(R) UHD Graphics)] subgroup=16 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 52.30 max = 65.51 avg = 56.65 squeezenet_int8 min = 14.53 max = 15.55 avg = 14.88 mobilenet min = 37.42 max = 52.07 avg = 42.48 mobilenet_int8 min = 19.01 max = 19.82 avg = 19.46 mobilenet_v2 min = 55.34 max = 73.39 avg = 63.94 mobilenet_v3 min = 97.02 max = 123.14 avg = 109.90 shufflenet min = 72.75 max = 100.26 avg = 88.26 shufflenet_v2 min = 93.34 max = 119.64 avg = 105.76 mnasnet min = 63.49 max = 74.11 avg = 69.05 proxylessnasnet min = 65.87 max = 83.87 avg = 76.33 efficientnet_b0 min = 162.86 max = 210.51 avg = 184.03 efficientnetv2_b0 min = 200.88 max = 220.40 avg = 210.85 regnety_400m min = 106.92 max = 134.68 avg = 123.04 blazeface min = 58.64 max = 66.50 avg = 60.54 googlenet min = 117.34 max = 145.28 avg = 134.84 googlenet_int8 min = 62.50 max = 65.07 avg = 63.44 resnet18 min = 67.30 max = 92.40 avg = 80.23 resnet18_int8 min = 56.09 max = 58.40 avg = 56.97 alexnet min = 29.94 max = 47.51 avg = 38.83 vgg16 min = 59.72 max = 73.08 avg = 65.46 vgg16_int8 min = 136.35 max = 148.39 avg = 143.96 resnet50 min = 115.92 max = 152.34 avg = 129.64 resnet50_int8 min = 93.86 max = 101.51 avg = 97.96 squeezenet_ssd min = 139.82 max = 
149.15 avg = 144.78 squeezenet_ssd_int8 min = 32.09 max = 35.96 avg = 33.41 mobilenet_ssd min = 88.14 max = 102.62 avg = 97.79 mobilenet_ssd_int8 min = 33.93 max = 36.42 avg = 34.41 mobilenet_yolo min = 52.22 max = 65.25 avg = 58.81 mobilenetv2_yolov3 min = 75.09 max = 94.12 avg = 85.23 yolov4-tiny min = 73.27 max = 88.69 avg = 81.44 nanodet_m min = 110.98 max = 150.70 avg = 127.60 yolo-fastest-1.1 min = 104.72 max = 135.40 avg = 116.92 yolo-fastestv2 min = 113.84 max = 142.19 avg = 128.24 vision_transformer min = 412.19 max = 474.25 avg = 444.15 FastestDet min = 96.31 max = 131.51 avg = 117.27 mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 1 0 WARNING: dzn is not a conformant Vulkan implementation, testing use only. WARNING: dzn is not a conformant Vulkan implementation, testing use only. [0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)] queueC=1[8] queueG=0[4] queueT=2[1] [0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)] fp16-p/s/a=1/1/1 int8-p/s/a=1/0/0 [0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 [1 Microsoft Direct3D12 (Intel(R) UHD Graphics)] queueC=1[8] queueG=0[4] queueT=2[1] [1 Microsoft Direct3D12 (Intel(R) UHD Graphics)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 Microsoft Direct3D12 (Intel(R) UHD Graphics)] fp16-p/s/a=1/1/1 int8-p/s/a=1/0/0 [1 Microsoft Direct3D12 (Intel(R) UHD Graphics)] subgroup=16 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 1 cooling_down = 0 squeezenet min = 36.86 max = 62.04 avg = 44.48 squeezenet_int8 min = 15.31 max = 16.14 avg = 15.63 mobilenet min = 30.79 max = 34.67 avg = 32.95 mobilenet_int8 min = 19.23 max = 19.72 avg = 19.42 mobilenet_v2 min = 36.56 max 
= 40.53 avg = 38.20 mobilenet_v3 min = 52.11 max = 61.72 avg = 56.58 shufflenet min = 41.50 max = 74.61 avg = 49.24 shufflenet_v2 min = 44.49 max = 52.30 avg = 49.04 mnasnet min = 35.66 max = 43.45 avg = 37.98 proxylessnasnet min = 41.27 max = 47.63 avg = 43.63 efficientnet_b0 min = 67.66 max = 80.88 avg = 73.64 efficientnetv2_b0 min = 111.10 max = 156.52 avg = 126.70 regnety_400m min = 62.66 max = 89.16 avg = 68.99 blazeface min = 24.86 max = 33.52 avg = 26.91 googlenet min = 70.55 max = 84.22 avg = 75.19 googlenet_int8 min = 58.78 max = 64.81 avg = 62.99 resnet18 min = 44.17 max = 49.37 avg = 46.73 resnet18_int8 min = 59.99 max = 66.91 avg = 62.35 alexnet min = 41.54 max = 57.16 avg = 44.30 vgg16 min = 138.74 max = 165.03 avg = 146.90 vgg16_int8 min = 135.36 max = 165.89 avg = 142.61 resnet50 min = 97.46 max = 107.18 avg = 100.89 resnet50_int8 min = 92.90 max = 100.45 avg = 95.91 squeezenet_ssd min = 72.27 max = 90.71 avg = 76.09 squeezenet_ssd_int8 min = 34.66 max = 40.46 avg = 36.58 mobilenet_ssd min = 59.90 max = 68.74 avg = 62.40 mobilenet_ssd_int8 min = 37.02 max = 38.59 avg = 37.82 mobilenet_yolo min = 73.19 max = 80.40 avg = 76.42 mobilenetv2_yolov3 min = 58.56 max = 66.71 avg = 62.02 yolov4-tiny min = 63.75 max = 84.29 avg = 69.54 nanodet_m min = 54.66 max = 67.89 avg = 60.82 yolo-fastest-1.1 min = 40.89 max = 51.03 avg = 43.15 yolo-fastestv2 min = 50.43 max = 77.46 avg = 60.66 vision_transformer min = 1330.82 max = 1388.73 avg = 1354.10 FastestDet min = 85.75 max = 112.67 avg = 98.62 mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 6.30 max = 10.16 avg = 8.21 squeezenet_int8 min = 14.53 max = 14.94 avg = 14.67 mobilenet min = 10.71 max = 11.26 avg = 10.91 mobilenet_int8 min = 17.66 max = 18.46 avg = 17.91 mobilenet_v2 
min = 7.74 max = 8.05 avg = 7.89 mobilenet_v3 min = 6.25 max = 6.70 avg = 6.38 shufflenet min = 3.78 max = 7.87 avg = 5.37 shufflenet_v2 min = 4.19 max = 7.83 avg = 5.25 mnasnet min = 7.29 max = 7.61 avg = 7.44 proxylessnasnet min = 8.10 max = 8.43 avg = 8.24 efficientnet_b0 min = 11.77 max = 12.66 avg = 12.06 efficientnetv2_b0 min = 13.80 max = 15.02 avg = 14.11 regnety_400m min = 10.09 max = 10.26 avg = 10.17 blazeface min = 1.24 max = 4.02 avg = 2.45 googlenet min = 24.05 max = 25.78 avg = 24.64 googlenet_int8 min = 58.75 max = 62.45 avg = 59.54 resnet18 min = 20.31 max = 21.48 avg = 20.74 resnet18_int8 min = 53.82 max = 55.27 avg = 54.43 alexnet min = 17.37 max = 18.69 avg = 17.66 vgg16 min = 114.49 max = 117.62 avg = 115.96 vgg16_int8 min = 133.82 max = 144.40 avg = 137.07 resnet50 min = 54.40 max = 58.74 avg = 55.54 resnet50_int8 min = 92.95 max = 104.71 avg = 99.18 squeezenet_ssd min = 17.30 max = 18.65 avg = 17.71 squeezenet_ssd_int8 min = 32.27 max = 33.88 avg = 32.82 mobilenet_ssd min = 24.01 max = 25.94 avg = 25.02 mobilenet_ssd_int8 min = 34.68 max = 36.09 avg = 35.43 mobilenet_yolo min = 53.32 max = 63.48 avg = 56.58 mobilenetv2_yolov3 min = 30.06 max = 34.24 avg = 31.46 yolov4-tiny min = 41.49 max = 43.55 avg = 42.50 nanodet_m min = 10.24 max = 11.08 avg = 10.43 yolo-fastest-1.1 min = 3.85 max = 8.34 avg = 5.40 yolo-fastestv2 min = 4.33 max = 7.61 avg = 6.01 vision_transformer min = 556.38 max = 599.49 avg = 567.98 FastestDet min = 4.20 max = 11.37 avg = 6.51 mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> ``` ### Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-7700K, NVIDIA GeForce GTX 1050 Ti) - Host OS: Microsoft Windows 10 Enterprise LTSC 2021 (10.0.19044.2846) - Guest OS: openSUSE Tumbleweed x86_64 20230507 - Mesa 3D source tree: https://gitlab.freedesktop.org/mesa/mesa/-/tree/ce6430067613e3e64cabf79918a3d96122b0c4c4 - Mesa 3D configuration command > meson --prefix="${PWD}/build/install" -D gallium-drivers=swrast,d3d12 -D 
vulkan-drivers=swrast,microsoft-experimental build/ - ncnn configuration command > cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. ``` mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 0 0 WARNING: dzn is not a conformant Vulkan implementation, testing use only. [0 Microsoft Direct3D12 (NVIDIA GeForce GTX 1050 Ti)] queueC=1[8] queueG=0[4] queueT=2[1] [0 Microsoft Direct3D12 (NVIDIA GeForce GTX 1050 Ti)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Microsoft Direct3D12 (NVIDIA GeForce GTX 1050 Ti)] fp16-p/s/a=1/0/0 int8-p/s/a=1/0/0 [0 Microsoft Direct3D12 (NVIDIA GeForce GTX 1050 Ti)] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 53.80 max = 64.22 avg = 59.91 squeezenet_int8 min = 23.21 max = 25.98 avg = 24.44 mobilenet min = 47.63 max = 55.22 avg = 49.79 mobilenet_int8 min = 23.27 max = 25.05 avg = 23.77 mobilenet_v2 min = 58.17 max = 83.14 avg = 68.48 mobilenet_v3 min = 92.14 max = 114.74 avg = 101.66 shufflenet min = 75.96 max = 106.54 avg = 89.64 shufflenet_v2 min = 90.66 max = 114.69 avg = 103.25 mnasnet min = 58.40 max = 85.74 avg = 67.75 proxylessnasnet min = 66.73 max = 84.82 avg = 77.73 efficientnet_b0 min = 134.28 max = 164.39 avg = 155.40 efficientnetv2_b0 min = 171.97 max = 220.43 avg = 198.26 regnety_400m min = 124.15 max = 145.61 avg = 135.99 blazeface min = 53.18 max = 72.10 avg = 60.21 googlenet min = 119.34 max = 159.93 avg = 134.71 googlenet_int8 min = 96.71 max = 102.44 avg = 98.57 resnet18 min = 68.14 max = 89.99 avg = 80.76 resnet18_int8 min = 88.07 max = 108.62 avg = 91.09 alexnet min = 44.12 max = 51.57 avg = 48.09 vgg16 min = 88.49 max = 99.87 avg = 93.42 vgg16_int8 min = 196.17 max = 211.99 avg = 201.27 resnet50 min = 115.36 max = 138.65 avg = 125.57 resnet50_int8 min = 138.15 max = 148.55 avg = 141.08 squeezenet_ssd 
min = 138.42 max = 168.49 avg = 155.66 squeezenet_ssd_int8 min = 46.01 max = 47.83 avg = 46.85 mobilenet_ssd min = 82.39 max = 134.74 avg = 101.22 mobilenet_ssd_int8 min = 45.53 max = 46.67 avg = 45.96 mobilenet_yolo min = 70.39 max = 87.83 avg = 80.01 mobilenetv2_yolov3 min = 75.71 max = 90.59 avg = 84.04 yolov4-tiny min = 72.16 max = 87.76 avg = 76.81 nanodet_m min = 98.27 max = 129.60 avg = 112.34 yolo-fastest-1.1 min = 101.01 max = 118.45 avg = 106.47 yolo-fastestv2 min = 109.89 max = 137.23 avg = 123.97 vision_transformer min = 688.60 max = 750.54 avg = 723.30 FastestDet min = 104.16 max = 139.23 avg = 123.75 mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 -1 0 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 8.90 max = 9.48 avg = 9.15 squeezenet_int8 min = 22.54 max = 24.13 avg = 22.85 mobilenet min = 14.85 max = 16.15 avg = 15.18 mobilenet_int8 min = 23.56 max = 23.98 avg = 23.74 mobilenet_v2 min = 11.03 max = 11.73 avg = 11.22 mobilenet_v3 min = 8.61 max = 9.29 avg = 8.79 shufflenet min = 5.26 max = 5.96 avg = 5.42 shufflenet_v2 min = 5.56 max = 7.06 avg = 5.82 mnasnet min = 10.46 max = 11.04 avg = 10.68 proxylessnasnet min = 12.18 max = 12.55 avg = 12.33 efficientnet_b0 min = 22.46 max = 23.15 avg = 22.86 efficientnetv2_b0 min = 23.33 max = 23.80 avg = 23.55 regnety_400m min = 13.03 max = 14.25 avg = 13.28 blazeface min = 1.49 max = 1.95 avg = 1.61 googlenet min = 35.26 max = 46.31 avg = 39.63 googlenet_int8 min = 96.25 max = 98.15 avg = 96.93 resnet18 min = 29.34 max = 31.00 avg = 29.92 resnet18_int8 min = 87.84 max = 89.85 avg = 88.73 alexnet min = 22.91 max = 23.87 avg = 23.18 vgg16 min = 151.26 max = 174.79 avg = 155.94 vgg16_int8 min = 193.66 max = 210.63 avg = 199.14 resnet50 min = 74.89 max = 77.27 avg = 75.91 resnet50_int8 min = 136.59 max = 162.13 avg = 141.22 
squeezenet_ssd min = 24.48 max = 34.00 avg = 26.19 squeezenet_ssd_int8 min = 46.31 max = 48.87 avg = 47.09 mobilenet_ssd min = 31.56 max = 34.45 avg = 32.50 mobilenet_ssd_int8 min = 45.15 max = 46.53 avg = 45.93 mobilenet_yolo min = 72.09 max = 78.05 avg = 74.31 mobilenetv2_yolov3 min = 40.44 max = 41.54 avg = 40.86 yolov4-tiny min = 56.73 max = 60.59 avg = 57.93 nanodet_m min = 13.22 max = 19.28 avg = 14.65 yolo-fastest-1.1 min = 5.47 max = 5.70 avg = 5.58 yolo-fastestv2 min = 5.68 max = 7.20 avg = 5.88 vision_transformer min = 600.83 max = 666.35 avg = 617.33 FastestDet min = 6.05 max = 6.72 avg = 6.23 ``` ### AMD Ryzen 9 5950X 16-Core of Desktop[2023-10-12] ``` E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 -1 0 loop_count = 100 num_threads = 16 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 2.68 max = 3.10 avg = 2.77 squeezenet_int8 min = 3.57 max = 4.72 avg = 4.04 mobilenet min = 3.09 max = 5.44 avg = 3.38 mobilenet_int8 min = 2.36 max = 3.40 avg = 2.74 mobilenet_v2 min = 4.24 max = 4.81 avg = 4.40 mobilenet_v3 min = 3.46 max = 3.93 avg = 3.58 shufflenet min = 3.21 max = 4.54 avg = 4.01 shufflenet_v2 min = 2.99 max = 4.49 avg = 3.34 mnasnet min = 3.62 max = 4.31 avg = 3.83 proxylessnasnet min = 4.06 max = 5.70 avg = 4.23 efficientnet_b0 min = 5.60 max = 6.55 avg = 5.81 efficientnetv2_b0 min = 6.83 max = 8.82 avg = 7.12 regnety_400m min = 8.02 max = 9.75 avg = 8.34 blazeface min = 1.34 max = 1.77 avg = 1.46 googlenet min = 11.62 max = 15.95 avg = 12.70 googlenet_int8 min = 7.43 max = 10.06 avg = 7.92 resnet18 min = 8.39 max = 10.39 avg = 9.04 resnet18_int8 min = 6.23 max = 8.64 avg = 6.75 alexnet min = 7.78 max = 12.51 avg = 8.51 vgg16 min = 53.85 max = 63.39 avg = 56.36 vgg16_int8 min = 35.61 max = 46.94 avg = 38.08 resnet50 min = 18.55 max = 24.46 avg = 19.81 resnet50_int8 min = 11.95 max = 23.21 avg = 13.51 squeezenet_ssd min = 10.01 max = 13.16 avg = 10.69 squeezenet_ssd_int8 min = 9.29 max = 14.02 avg = 10.47 
mobilenet_ssd min = 6.38 max = 10.26 avg = 7.15 mobilenet_ssd_int8 min = 4.69 max = 6.98 avg = 5.42 mobilenet_yolo min = 17.63 max = 22.59 avg = 19.45 mobilenetv2_yolov3 min = 11.79 max = 15.67 avg = 12.76 yolov4-tiny min = 21.53 max = 25.79 avg = 22.46 nanodet_m min = 7.16 max = 9.99 avg = 8.01 yolo-fastest-1.1 min = 3.66 max = 5.00 avg = 4.38 yolo-fastestv2 min = 3.52 max = 5.20 avg = 4.60 vision_transformer min = 67.01 max = 93.71 avg = 78.48 FastestDet min = 4.44 max = 8.62 avg = 4.69 ``` ### AMD Radeon RX 6900 XT of Desktop[2023-10-12] ``` E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 0 0 [0 AMD Radeon RX 6900 XT] queueC=1[2] queueG=0[1] queueT=2[2] [0 AMD Radeon RX 6900 XT] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 AMD Radeon RX 6900 XT] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 AMD Radeon RX 6900 XT] subgroup=64 basic/vote/ballot/shuffle=1/1/1/1 [0 AMD Radeon RX 6900 XT] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 100 num_threads = 16 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 2.19 max = 2.70 avg = 2.47 squeezenet_int8 min = 3.94 max = 4.51 avg = 4.18 mobilenet min = 2.03 max = 2.63 avg = 2.28 mobilenet_int8 min = 2.56 max = 3.34 avg = 2.69 mobilenet_v2 min = 2.29 max = 2.98 avg = 2.62 mobilenet_v3 min = 2.31 max = 3.10 avg = 2.75 shufflenet min = 1.89 max = 2.61 avg = 2.30 shufflenet_v2 min = 2.17 max = 3.04 avg = 2.59 mnasnet min = 2.19 max = 2.98 avg = 2.69 proxylessnasnet min = 2.12 max = 4.08 avg = 2.62 efficientnet_b0 min = 3.62 max = 5.27 avg = 4.21 efficientnetv2_b0 min = 6.09 max = 7.15 avg = 6.49 regnety_400m min = 2.55 max = 3.82 avg = 3.00 blazeface min = 1.93 max = 2.56 avg = 2.28 googlenet min = 3.35 max = 4.46 avg = 3.75 googlenet_int8 min = 8.02 max = 12.84 avg = 9.15 resnet18 min = 2.46 max = 3.14 avg = 2.84 resnet18_int8 min = 6.37 max = 9.15 avg = 7.30 alexnet min = 2.31 max = 2.91 avg = 2.69 vgg16 min = 4.76 max = 5.79 avg = 5.24 vgg16_int8 min = 35.94 max = 46.27 avg = 39.05 resnet50 
min = 3.25 max = 4.09 avg = 3.75 resnet50_int8 min = 12.04 max = 20.53 avg = 14.61 squeezenet_ssd min = 3.03 max = 5.31 avg = 3.66 squeezenet_ssd_int8 min = 9.74 max = 13.46 avg = 10.42 mobilenet_ssd min = 2.82 max = 4.75 avg = 3.39 mobilenet_ssd_int8 min = 4.67 max = 6.76 avg = 5.30 mobilenet_yolo min = 3.01 max = 3.67 avg = 3.34 mobilenetv2_yolov3 min = 4.04 max = 6.46 avg = 4.55 yolov4-tiny min = 5.75 max = 8.05 avg = 6.52 nanodet_m min = 10.16 max = 14.97 avg = 13.11 yolo-fastest-1.1 min = 2.36 max = 3.80 avg = 2.88 yolo-fastestv2 min = 2.24 max = 3.19 avg = 2.80 vision_transformer min = 20.43 max = 25.06 avg = 21.07 FastestDet min = 2.49 max = 3.18 avg = 2.93 ``` ### NVIDIA GeForce RTX 3060 Ti of Desktop[2023-10-12] ``` E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 0 0 [0 NVIDIA GeForce RTX 3060 Ti] queueC=2[8] queueG=0[16] queueT=1[2] [0 NVIDIA GeForce RTX 3060 Ti] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA GeForce RTX 3060 Ti] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA GeForce RTX 3060 Ti] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [0 NVIDIA GeForce RTX 3060 Ti] fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1 [1 Intel(R) UHD Graphics 770] queueC=0[1] queueG=0[1] queueT=0[1] [1 Intel(R) UHD Graphics 770] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 Intel(R) UHD Graphics 770] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [1 Intel(R) UHD Graphics 770] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [1 Intel(R) UHD Graphics 770] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 100 num_threads = 16 powersave = 0 gpu_device = 0 cooling_down = 0 squeezenet min = 0.80 max = 2.51 avg = 0.89 squeezenet_int8 min = 2.81 max = 3.51 avg = 2.96 mobilenet min = 0.70 max = 0.79 avg = 0.71 mobilenet_int8 min = 2.95 max = 3.44 avg = 3.03 mobilenet_v2 min = 1.09 max = 1.25 avg = 1.12 mobilenet_v3 min = 1.33 max = 2.04 avg = 1.56 shufflenet min = 1.20 max = 1.39 avg = 1.27 shufflenet_v2 min = 1.50 max = 1.66 avg = 1.57 mnasnet min = 1.11 max = 1.22 avg = 
1.15 proxylessnasnet min = 1.20 max = 1.63 avg = 1.24 efficientnet_b0 min = 2.38 max = 3.21 avg = 2.61 efficientnetv2_b0 min = 9.16 max = 11.35 avg = 9.63 regnety_400m min = 1.86 max = 2.03 avg = 1.94 blazeface min = 0.70 max = 1.10 avg = 0.76 googlenet min = 2.11 max = 2.40 avg = 2.30 googlenet_int8 min = 6.91 max = 7.88 avg = 7.17 resnet18 min = 1.14 max = 1.47 avg = 1.19 resnet18_int8 min = 4.96 max = 6.82 avg = 5.40 alexnet min = 1.10 max = 1.85 avg = 1.19 vgg16 min = 2.27 max = 3.97 avg = 2.46 vgg16_int8 min = 19.02 max = 22.20 avg = 20.28 resnet50 min = 2.00 max = 2.99 avg = 2.10 resnet50_int8 min = 10.66 max = 13.30 avg = 11.29 squeezenet_ssd min = 2.74 max = 3.44 avg = 2.90 squeezenet_ssd_int8 min = 6.93 max = 7.95 avg = 7.19 mobilenet_ssd min = 1.86 max = 2.07 avg = 1.96 mobilenet_ssd_int8 min = 5.92 max = 6.48 avg = 6.09 mobilenet_yolo min = 1.65 max = 2.58 avg = 1.78 mobilenetv2_yolov3 min = 3.85 max = 4.11 avg = 3.96 yolov4-tiny min = 6.54 max = 7.05 avg = 6.69 nanodet_m min = 2.38 max = 3.28 avg = 2.72 yolo-fastest-1.1 min = 1.73 max = 2.07 avg = 1.83 yolo-fastestv2 min = 1.72 max = 1.92 avg = 1.80 vision_transformer min = 53.91 max = 56.59 avg = 55.27 FastestDet min = 1.48 max = 1.83 avg = 1.69 ``` ### Intel(R) UHD Graphics 770 of Desktop[2023-10-12] ``` E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 1 0 [0 NVIDIA GeForce RTX 3060 Ti] queueC=2[8] queueG=0[16] queueT=1[2] [0 NVIDIA GeForce RTX 3060 Ti] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 NVIDIA GeForce RTX 3060 Ti] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 NVIDIA GeForce RTX 3060 Ti] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [0 NVIDIA GeForce RTX 3060 Ti] fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1 [1 Intel(R) UHD Graphics 770] queueC=0[1] queueG=0[1] queueT=0[1] [1 Intel(R) UHD Graphics 770] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 Intel(R) UHD Graphics 770] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [1 Intel(R) UHD Graphics 770] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [1 
Intel(R) UHD Graphics 770] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 100 num_threads = 16 powersave = 0 gpu_device = 1 cooling_down = 0 squeezenet min = 3.11 max = 4.47 avg = 3.45 squeezenet_int8 min = 1.89 max = 2.84 avg = 2.23 mobilenet min = 4.98 max = 5.67 avg = 5.18 mobilenet_int8 min = 2.54 max = 3.17 avg = 2.98 mobilenet_v2 min = 4.03 max = 4.89 avg = 4.37 mobilenet_v3 min = 4.45 max = 5.68 avg = 4.86 shufflenet min = 3.42 max = 4.42 avg = 3.79 shufflenet_v2 min = 3.00 max = 4.01 avg = 3.30 mnasnet min = 4.21 max = 5.12 avg = 4.51 proxylessnasnet min = 4.62 max = 5.64 avg = 4.90 efficientnet_b0 min = 7.82 max = 8.63 avg = 8.10 efficientnetv2_b0 min = 34.52 max = 36.34 avg = 35.29 regnety_400m min = 6.07 max = 7.31 avg = 6.44 blazeface min = 1.54 max = 1.67 avg = 1.59 googlenet min = 11.53 max = 12.64 avg = 11.89 googlenet_int8 min = 13.71 max = 15.52 avg = 14.38 resnet18 min = 10.75 max = 12.94 avg = 11.07 resnet18_int8 min = 9.04 max = 11.05 avg = 9.53 alexnet min = 13.64 max = 14.37 avg = 13.98 vgg16 min = 38.53 max = 40.16 avg = 39.22 vgg16_int8 min = 16.04 max = 21.16 avg = 19.35 resnet50 min = 25.61 max = 28.22 avg = 26.62 resnet50_int8 min = 7.72 max = 12.83 avg = 10.29 squeezenet_ssd min = 10.34 max = 15.88 avg = 14.75 squeezenet_ssd_int8 min = 4.63 max = 7.13 avg = 5.66 mobilenet_ssd min = 11.35 max = 13.06 avg = 12.44 mobilenet_ssd_int8 min = 4.21 max = 6.31 avg = 5.32 mobilenet_yolo min = 20.14 max = 22.92 avg = 21.94 mobilenetv2_yolov3 min = 12.58 max = 14.88 avg = 14.21 yolov4-tiny min = 20.62 max = 25.58 avg = 24.39 nanodet_m min = 7.75 max = 12.49 avg = 11.42 yolo-fastest-1.1 min = 3.68 max = 6.49 avg = 5.54 yolo-fastestv2 min = 4.32 max = 5.39 avg = 4.51 vision_transformer min = 796.51 max = 805.29 avg = 802.39 FastestDet min = 2.89 max = 4.83 avg = 3.95 ``` ### Intel® Core™ i7-13700K of Desktop[2023-10-12] ``` E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 -1 0 loop_count = 100 num_threads = 16 
powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 1.69 max = 2.63 avg = 2.12 squeezenet_int8 min = 1.83 max = 3.03 avg = 2.26 mobilenet min = 1.69 max = 2.64 avg = 2.24 mobilenet_int8 min = 2.47 max = 3.06 avg = 2.84 mobilenet_v2 min = 1.94 max = 3.47 avg = 2.47 mobilenet_v3 min = 1.49 max = 2.74 avg = 1.87 shufflenet min = 1.57 max = 3.00 avg = 1.82 shufflenet_v2 min = 1.41 max = 1.72 avg = 1.51 mnasnet min = 1.73 max = 2.94 avg = 2.13 proxylessnasnet min = 2.08 max = 3.31 avg = 2.69 efficientnet_b0 min = 3.20 max = 4.99 avg = 3.78 efficientnetv2_b0 min = 3.51 max = 5.16 avg = 4.08 regnety_400m min = 4.51 max = 10.29 avg = 6.18 blazeface min = 0.52 max = 0.92 avg = 0.59 googlenet min = 5.49 max = 7.48 avg = 6.26 googlenet_int8 min = 4.83 max = 7.54 avg = 5.90 resnet18 min = 4.05 max = 6.61 avg = 4.83 resnet18_int8 min = 3.77 max = 5.70 avg = 4.57 alexnet min = 3.60 max = 5.09 avg = 4.26 vgg16 min = 25.19 max = 28.79 avg = 26.81 vgg16_int8 min = 17.52 max = 21.79 avg = 19.80 resnet50 min = 9.23 max = 13.15 avg = 11.34 resnet50_int8 min = 7.77 max = 12.00 avg = 10.18 squeezenet_ssd min = 4.33 max = 6.73 avg = 4.96 squeezenet_ssd_int8 min = 4.77 max = 7.62 avg = 5.71 mobilenet_ssd min = 3.70 max = 6.43 avg = 4.53 mobilenet_ssd_int8 min = 4.16 max = 6.53 avg = 5.38 mobilenet_yolo min = 11.27 max = 14.93 avg = 12.90 mobilenetv2_yolov3 min = 7.41 max = 11.52 avg = 9.11 yolov4-tiny min = 12.05 max = 18.96 avg = 14.15 nanodet_m min = 3.39 max = 5.77 avg = 4.07 yolo-fastest-1.1 min = 1.95 max = 3.85 avg = 2.30 yolo-fastestv2 min = 1.91 max = 3.52 avg = 2.27 vision_transformer min = 79.50 max = 99.93 avg = 88.91 FastestDet min = 1.92 max = 2.72 avg = 2.19 ``` ### Amlogic S805 (Cortex-A5, 4 × 1.536GHz) - Platform: Xunlei OneCloud (玩客云) - OS: Armbian buster (20.12) armv7l - Compiler: gcc version 8.3.0 (Debian 8.3.0-6) - ncnn tag: 20240102 ``` mizu-bai@aml-s812:~/ncnn-20240102/benchmark$ ../build/benchmark/benchncnn loop_count = 4 num_threads = 4 powersave = 2 
gpu_device = -1 cooling_down = 1 squeezenet min = 376.45 max = 445.48 avg = 408.08 squeezenet_int8 min = 247.06 max = 340.34 avg = 281.40 mobilenet min = 696.71 max = 745.63 avg = 718.49 mobilenet_int8 min = 355.78 max = 472.06 avg = 401.17 mobilenet_v2 min = 428.86 max = 491.25 avg = 458.45 mobilenet_v3 min = 361.78 max = 425.90 avg = 396.94 shufflenet min = 245.90 max = 333.41 avg = 293.46 shufflenet_v2 min = 210.69 max = 329.51 avg = 260.73 mnasnet min = 418.49 max = 493.40 avg = 448.95 proxylessnasnet min = 542.20 max = 566.65 avg = 554.75 efficientnet_b0 min = 727.72 max = 785.47 avg = 750.72 efficientnetv2_b0 min = 805.70 max = 874.57 avg = 843.87 regnety_400m min = 627.74 max = 686.57 avg = 660.60 blazeface min = 62.14 max = 121.32 avg = 82.10 googlenet min = 1295.31 max = 1411.88 avg = 1342.26 googlenet_int8 min = 796.39 max = 860.28 avg = 823.76 resnet18 min = 1076.93 max = 1125.12 avg = 1099.37 resnet18_int8 min = 587.12 max = 634.97 avg = 605.29 alexnet min = 701.70 max = 729.68 avg = 718.99 vgg16 min = 5584.13 max = 5748.84 avg = 5660.70 vgg16_int8 min = 3107.89 max = 3138.78 avg = 3121.28 resnet50 min = 3378.84 max = 3461.61 avg = 3425.38 resnet50_int8 min = 2044.93 max = 2067.70 avg = 2061.38 squeezenet_ssd min = 908.77 max = 972.68 avg = 939.98 squeezenet_ssd_int8 min = 609.58 max = 703.88 avg = 662.43 mobilenet_ssd min = 1524.69 max = 1589.79 avg = 1552.12 mobilenet_ssd_int8 min = 817.70 max = 885.45 avg = 840.30 mobilenet_yolo min = 3497.13 max = 3605.83 avg = 3543.72 mobilenetv2_yolov3 min = 1734.10 max = 1824.98 avg = 1795.42 yolov4-tiny min = 2093.70 max = 2163.44 avg = 2128.30 nanodet_m min = 593.75 max = 647.03 avg = 608.03 yolo-fastest-1.1 min = 228.68 max = 318.40 avg = 265.74 yolo-fastestv2 min = 194.29 max = 258.78 avg = 219.82 vision_transformer min = 14836.43 max = 15238.27 avg = 15125.26 FastestDet min = 215.60 max = 264.69 avg = 239.85 ``` ### Qualcomm SM8550-AB Snapdragon 8 Gen 2 (Kryo 3.20 GHz + 2.8 GHz x 2 + 2.80 GHz x 2 + 2.00 GHz
* 3 + Adreno 740) ``` ./benchncnn 4 1 2 -1 1 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 8.44 max = 8.51 avg = 8.47 squeezenet_int8 min = 6.91 max = 7.13 avg = 7.00 mobilenet min = 15.45 max = 15.53 avg = 15.49 mobilenet_int8 min = 8.76 max = 9.03 avg = 8.88 mobilenet_v2 min = 9.52 max = 10.71 avg = 10.02 mobilenet_v3 min = 7.89 max = 8.02 avg = 7.93 shufflenet min = 5.07 max = 5.61 avg = 5.25 shufflenet_v2 min = 5.28 max = 5.41 avg = 5.37 mnasnet min = 9.52 max = 9.58 avg = 9.54 proxylessnasnet min = 11.26 max = 11.41 avg = 11.36 efficientnet_b0 min = 18.84 max = 18.91 avg = 18.88 efficientnetv2_b0 min = 28.60 max = 28.73 avg = 28.66 regnety_400m min = 12.35 max = 12.39 avg = 12.37 blazeface min = 1.83 max = 2.23 avg = 1.94 googlenet min = 32.07 max = 37.37 avg = 35.59 googlenet_int8 min = 28.50 max = 28.57 avg = 28.53 resnet18 min = 21.88 max = 22.05 avg = 21.94 resnet18_int8 min = 24.43 max = 40.52 avg = 32.04 alexnet min = 23.69 max = 24.22 avg = 23.98 vgg16 min = 91.85 max = 100.71 avg = 94.80 vgg16_int8 min = 206.66 max = 325.74 avg = 258.40 resnet50 min = 53.59 max = 54.20 avg = 53.96 resnet50_int8 min = 44.39 max = 45.11 avg = 44.74 squeezenet_ssd min = 23.80 max = 24.12 avg = 23.94 squeezenet_ssd_int8 min = 30.17 max = 30.42 avg = 30.31 mobilenet_ssd min = 33.49 max = 33.69 avg = 33.59 mobilenet_ssd_int8 min = 19.37 max = 19.76 avg = 19.56 mobilenet_yolo min = 72.63 max = 73.00 avg = 72.77 mobilenetv2_yolov3 min = 36.86 max = 37.40 avg = 37.08 yolov4-tiny min = 44.94 max = 45.46 avg = 45.22 nanodet_m min = 13.65 max = 13.99 avg = 13.82 yolo-fastest-1.1 min = 3.84 max = 3.93 avg = 3.89 yolo-fastestv2 min = 4.78 max = 4.93 avg = 4.84 vision_transformer min = 1042.50 max = 1043.06 avg = 1042.80 FastestDet min = 4.67 max = 4.75 avg = 4.70 ./benchncnn 4 4 2 -1 1 loop_count = 4 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 2.60 max = 2.66 avg = 2.64 squeezenet_int8 min = 2.38 
max = 2.43 avg = 2.40 mobilenet min = 4.17 max = 4.25 avg = 4.21 mobilenet_int8 min = 2.59 max = 2.60 avg = 2.60 mobilenet_v2 min = 3.13 max = 3.44 avg = 3.23 mobilenet_v3 min = 2.90 max = 5.07 avg = 3.46 shufflenet min = 2.34 max = 2.44 avg = 2.38 shufflenet_v2 min = 2.06 max = 2.15 avg = 2.11 mnasnet min = 3.19 max = 3.20 avg = 3.20 proxylessnasnet min = 3.53 max = 3.61 avg = 3.57 efficientnet_b0 min = 5.72 max = 5.75 avg = 5.74 efficientnetv2_b0 min = 8.61 max = 8.67 avg = 8.64 regnety_400m min = 6.22 max = 6.27 avg = 6.25 blazeface min = 0.82 max = 0.92 avg = 0.86 googlenet min = 10.62 max = 14.39 avg = 11.59 googlenet_int8 min = 8.84 max = 8.99 avg = 8.92 resnet18 min = 6.61 max = 6.66 avg = 6.63 resnet18_int8 min = 21.41 max = 23.48 avg = 22.57 alexnet min = 8.18 max = 8.24 avg = 8.21 vgg16 min = 36.99 max = 39.65 avg = 37.75 vgg16_int8 min = 86.21 max = 89.00 avg = 86.95 resnet50 min = 18.90 max = 18.98 avg = 18.94 resnet50_int8 min = 19.18 max = 19.28 avg = 19.22 squeezenet_ssd min = 8.26 max = 8.42 avg = 8.32 squeezenet_ssd_int8 min = 21.02 max = 21.15 avg = 21.09 mobilenet_ssd min = 9.29 max = 9.42 avg = 9.34 mobilenet_ssd_int8 min = 5.85 max = 5.91 avg = 5.87 mobilenet_yolo min = 21.64 max = 21.71 avg = 21.69 mobilenetv2_yolov3 min = 11.50 max = 11.62 avg = 11.57 yolov4-tiny min = 14.91 max = 14.99 avg = 14.95 nanodet_m min = 4.93 max = 5.02 avg = 4.98 yolo-fastest-1.1 min = 2.19 max = 2.26 avg = 2.21 yolo-fastestv2 min = 2.29 max = 2.44 avg = 2.39 vision_transformer min = 242.50 max = 301.91 avg = 271.32 FastestDet min = 2.01 max = 2.12 avg = 2.05 ./benchncnn 4 8 0 -1 1 loop_count = 4 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 4.53 max = 6.34 avg = 5.48 squeezenet_int8 min = 5.48 max = 7.02 avg = 6.14 mobilenet min = 6.89 max = 8.44 avg = 7.61 mobilenet_int8 min = 4.89 max = 6.39 avg = 5.43 mobilenet_v2 min = 6.01 max = 7.28 avg = 6.53 mobilenet_v3 min = 4.85 max = 12.13 avg = 7.16 shufflenet min = 4.41 max = 6.20 
avg = 5.25 shufflenet_v2 min = 3.50 max = 4.34 avg = 3.74 mnasnet min = 5.52 max = 7.03 avg = 6.18 proxylessnasnet min = 6.21 max = 7.76 avg = 6.94 efficientnet_b0 min = 9.49 max = 10.57 avg = 9.94 efficientnetv2_b0 min = 15.26 max = 19.50 avg = 17.42 regnety_400m min = 9.89 max = 14.30 avg = 12.02 blazeface min = 2.25 max = 3.44 avg = 2.66 googlenet min = 18.98 max = 23.38 avg = 21.07 googlenet_int8 min = 17.99 max = 20.47 avg = 19.45 resnet18 min = 34.98 max = 84.52 avg = 69.50 resnet18_int8 min = 14.58 max = 15.43 avg = 15.04 alexnet min = 13.56 max = 15.05 avg = 14.29 vgg16 min = 63.32 max = 73.69 avg = 67.01 vgg16_int8 min = 91.17 max = 99.80 avg = 94.81 resnet50 min = 32.01 max = 42.22 avg = 36.06 resnet50_int8 min = 30.16 max = 32.25 avg = 30.72 squeezenet_ssd min = 14.72 max = 21.45 avg = 17.51 squeezenet_ssd_int8 min = 18.21 max = 23.93 avg = 21.45 mobilenet_ssd min = 16.38 max = 17.92 avg = 16.97 mobilenet_ssd_int8 min = 10.15 max = 15.88 avg = 12.92 mobilenet_yolo min = 35.88 max = 37.10 avg = 36.26 mobilenetv2_yolov3 min = 21.92 max = 27.60 avg = 24.12 yolov4-tiny min = 32.03 max = 34.45 avg = 33.51 nanodet_m min = 9.49 max = 14.35 avg = 11.20 yolo-fastest-1.1 min = 3.97 max = 5.16 avg = 4.40 yolo-fastestv2 min = 5.13 max = 7.84 avg = 6.18 vision_transformer min = 364.37 max = 391.13 avg = 374.55 FastestDet min = 3.01 max = 7.36 avg = 4.76 ./benchncnn 4 1 2 0 0 [0 Adreno (TM) 740] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 740] bugsbn1=1 bugbilz=0 bugcopc=0 bugihfa=0 [0 Adreno (TM) 740] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Adreno (TM) 740] subgroup=64 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 4 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 0 squeezenet min = 9.73 max = 11.72 avg = 10.55 squeezenet_int8 min = 7.21 max = 7.34 avg = 7.27 mobilenet min = 10.87 max = 13.09 avg = 12.01 mobilenet_int8 min = 8.82 max = 9.23 avg = 9.11 mobilenet_v2 min = 15.77 max = 16.21 avg = 15.96 mobilenet_v3 min = 18.04 max = 18.68 avg = 18.40 
shufflenet min = 9.82 max = 11.92 avg = 10.79 shufflenet_v2 min = 14.41 max = 15.41 avg = 14.96 mnasnet min = 16.01 max = 16.43 avg = 16.27 proxylessnasnet min = 14.18 max = 16.28 avg = 15.51 efficientnet_b0 min = 36.38 max = 37.06 avg = 36.83 efficientnetv2_b0 min = 55.98 max = 66.59 avg = 59.54 regnety_400m min = 21.94 max = 22.46 avg = 22.30 blazeface min = 3.92 max = 4.47 avg = 4.08 googlenet min = 31.79 max = 35.63 avg = 33.04 googlenet_int8 min = 23.21 max = 29.38 avg = 26.60 resnet18 min = 22.61 max = 24.05 avg = 23.09 resnet18_int8 min = 24.56 max = 24.78 avg = 24.62 alexnet min = 25.98 max = 27.05 avg = 26.49 vgg16 min = 39.00 max = 39.82 avg = 39.29 vgg16_int8 min = 207.47 max = 208.56 avg = 207.90 resnet50 min = 44.07 max = 44.43 avg = 44.29 resnet50_int8 min = 44.77 max = 47.04 avg = 45.44 squeezenet_ssd min = 33.71 max = 34.27 avg = 34.09 squeezenet_ssd_int8 min = 22.53 max = 30.33 avg = 25.07 mobilenet_ssd min = 26.91 max = 28.35 avg = 27.42 mobilenet_ssd_int8 min = 19.43 max = 19.82 avg = 19.69 mobilenet_yolo min = 28.03 max = 29.19 avg = 28.65 mobilenetv2_yolov3 min = 33.54 max = 34.65 avg = 34.31 yolov4-tiny min = 49.77 max = 51.21 avg = 50.55 nanodet_m min = 17.35 max = 18.83 avg = 18.06 yolo-fastest-1.1 min = 9.45 max = 9.59 avg = 9.51 yolo-fastestv2 min = 13.13 max = 13.63 avg = 13.36 vision_transformer min = 671.13 max = 679.90 avg = 675.27 FastestDet min = 8.62 max = 9.01 avg = 8.86 ./benchncnn 64 1 2 0 0 [0 Adreno (TM) 740] queueC=0[3] queueG=0[3] queueT=0[3] [0 Adreno (TM) 740] bugsbn1=1 bugbilz=0 bugcopc=0 bugihfa=0 [0 Adreno (TM) 740] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Adreno (TM) 740] subgroup=64 basic=1 vote=1 ballot=1 shuffle=1 loop_count = 64 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 0 squeezenet min = 9.56 max = 12.14 avg = 11.48 squeezenet_int8 min = 6.78 max = 8.47 avg = 7.04 mobilenet min = 11.59 max = 12.90 avg = 12.44 mobilenet_int8 min = 8.69 max = 9.42 avg = 8.90 mobilenet_v2 min = 14.00 max = 16.08 avg = 
15.12 mobilenet_v3 min = 16.66 max = 19.62 avg = 18.51 shufflenet min = 8.72 max = 13.02 avg = 11.86 shufflenet_v2 min = 12.82 max = 14.66 avg = 14.03 mnasnet min = 15.06 max = 17.55 avg = 16.12 proxylessnasnet min = 15.42 max = 17.28 avg = 16.59 efficientnet_b0 min = 35.96 max = 41.24 avg = 37.89 efficientnetv2_b0 min = 46.11 max = 65.75 avg = 58.52 regnety_400m min = 22.07 max = 26.40 avg = 24.43 blazeface min = 3.61 max = 6.26 avg = 4.53 googlenet min = 32.60 max = 37.05 avg = 34.55 googlenet_int8 min = 21.79 max = 30.65 avg = 24.84 resnet18 min = 19.46 max = 24.26 avg = 22.76 resnet18_int8 min = 38.09 max = 40.42 avg = 38.44 alexnet min = 20.80 max = 28.44 avg = 26.86 vgg16 min = 36.00 max = 44.01 avg = 39.18 vgg16_int8 min = 201.54 max = 209.87 avg = 207.06 resnet50 min = 42.50 max = 46.82 avg = 44.26 resnet50_int8 min = 44.63 max = 47.47 avg = 45.15 squeezenet_ssd min = 33.19 max = 36.74 avg = 34.62 squeezenet_ssd_int8 min = 22.40 max = 31.99 avg = 25.65 mobilenet_ssd min = 26.35 max = 29.79 avg = 28.09 mobilenet_ssd_int8 min = 19.15 max = 20.86 avg = 19.48 mobilenet_yolo min = 28.42 max = 31.16 avg = 29.06 mobilenetv2_yolov3 min = 33.86 max = 36.54 avg = 35.36 yolov4-tiny min = 46.51 max = 49.29 avg = 48.01 nanodet_m min = 17.14 max = 19.79 avg = 18.49 yolo-fastest-1.1 min = 9.49 max = 15.00 avg = 13.59 yolo-fastestv2 min = 11.65 max = 15.61 avg = 14.36 vision_transformer min = 650.85 max = 696.67 avg = 671.13 FastestDet min = 8.63 max = 13.12 avg = 11.39 ``` ### MediaTek Dimensity 9300 (MT6989) (Cortex-X4 3.25 GHz + 2.85 GHz x 3 + Cortex-A720 2.0 GHz x 4 + Mali-G720-Immortalis MC12) ``` k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 8 0 -1 1 loop_count = 8 num_threads = 8 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 1.87 max = 2.18 avg = 2.01 squeezenet_int8 min = 1.52 max = 1.98 avg = 1.77 mobilenet min = 3.02 max = 3.34 avg = 3.15 mobilenet_int8 min = 1.90 max = 2.27 avg = 2.04 mobilenet_v2 min = 2.72 max 
= 3.13 avg = 2.89 mobilenet_v3 min = 2.20 max = 3.82 avg = 2.78 shufflenet min = 1.97 max = 2.56 avg = 2.20 shufflenet_v2 min = 1.77 max = 2.29 avg = 1.96 mnasnet min = 2.61 max = 3.48 avg = 2.90 proxylessnasnet min = 2.72 max = 3.06 avg = 2.89 efficientnet_b0 min = 4.57 max = 5.17 avg = 4.89 efficientnetv2_b0 min = 5.24 max = 6.72 avg = 5.81 regnety_400m min = 4.94 max = 6.78 avg = 5.70 blazeface min = 0.80 max = 1.02 avg = 0.91 googlenet min = 7.76 max = 8.53 avg = 8.12 googlenet_int8 min = 5.68 max = 6.62 avg = 6.19 resnet18 min = 5.35 max = 6.06 avg = 5.61 resnet18_int8 min = 4.20 max = 4.40 avg = 4.29 alexnet min = 5.96 max = 7.30 avg = 6.77 vgg16 min = 29.27 max = 30.58 avg = 29.93 vgg16_int8 min = 26.72 max = 28.12 avg = 27.27 resnet50 min = 15.21 max = 19.16 avg = 16.09 resnet50_int8 min = 8.57 max = 9.16 avg = 8.91 squeezenet_ssd min = 6.29 max = 7.56 avg = 6.82 squeezenet_ssd_int8 min = 5.57 max = 6.96 avg = 6.12 mobilenet_ssd min = 6.90 max = 8.90 avg = 7.55 mobilenet_ssd_int8 min = 4.53 max = 5.22 avg = 4.86 mobilenet_yolo min = 16.88 max = 19.71 avg = 17.88 mobilenetv2_yolov3 min = 10.51 max = 14.19 avg = 11.95 yolov4-tiny min = 12.81 max = 16.23 avg = 14.22 nanodet_m min = 4.38 max = 5.96 avg = 5.19 yolo-fastest-1.1 min = 2.22 max = 3.08 avg = 2.73 yolo-fastestv2 min = 2.09 max = 2.73 avg = 2.41 vision_transformer min = 193.39 max = 203.13 avg = 198.32 FastestDet min = 1.98 max = 2.35 avg = 2.16 k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 4 2 -1 1 loop_count = 8 num_threads = 4 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 2.23 max = 2.31 avg = 2.27 squeezenet_int8 min = 1.68 max = 1.73 avg = 1.70 mobilenet min = 3.76 max = 3.86 avg = 3.81 mobilenet_int8 min = 2.07 max = 2.16 avg = 2.11 mobilenet_v2 min = 2.72 max = 2.95 avg = 2.80 mobilenet_v3 min = 2.43 max = 2.51 avg = 2.47 shufflenet min = 1.78 max = 1.87 avg = 1.81 shufflenet_v2 min = 1.61 max = 1.66 avg = 1.63 mnasnet min = 2.69 max = 2.82 avg 
= 2.76 proxylessnasnet min = 2.95 max = 3.13 avg = 3.05 efficientnet_b0 min = 4.99 max = 5.29 avg = 5.08 efficientnetv2_b0 min = 5.73 max = 5.86 avg = 5.79 regnety_400m min = 4.97 max = 5.04 avg = 5.00 blazeface min = 1.07 max = 1.17 avg = 1.10 googlenet min = 8.51 max = 9.43 avg = 8.75 googlenet_int8 min = 6.01 max = 6.13 avg = 6.07 resnet18 min = 6.72 max = 7.04 avg = 6.95 resnet18_int8 min = 4.31 max = 4.40 avg = 4.34 alexnet min = 7.41 max = 7.71 avg = 7.57 vgg16 min = 33.77 max = 34.68 avg = 34.08 vgg16_int8 min = 32.61 max = 33.83 avg = 33.12 resnet50 min = 18.76 max = 19.53 avg = 19.05 resnet50_int8 min = 9.56 max = 9.70 avg = 9.61 squeezenet_ssd min = 6.86 max = 7.26 avg = 7.01 squeezenet_ssd_int8 min = 5.42 max = 6.17 avg = 5.64 mobilenet_ssd min = 8.38 max = 9.14 avg = 8.62 mobilenet_ssd_int8 min = 4.60 max = 4.90 avg = 4.69 mobilenet_yolo min = 19.59 max = 20.06 avg = 19.78 mobilenetv2_yolov3 min = 10.46 max = 11.01 avg = 10.70 yolov4-tiny min = 13.46 max = 14.18 avg = 13.86 nanodet_m min = 4.52 max = 4.59 avg = 4.55 yolo-fastest-1.1 min = 1.88 max = 1.94 avg = 1.91 yolo-fastestv2 min = 1.73 max = 1.79 avg = 1.76 vision_transformer min = 220.32 max = 229.49 avg = 223.92 FastestDet min = 1.67 max = 1.73 avg = 1.70 k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 4 1 -1 1 loop_count = 8 num_threads = 4 powersave = 1 gpu_device = -1 cooling_down = 1 squeezenet min = 3.42 max = 4.25 avg = 3.62 squeezenet_int8 min = 2.63 max = 2.78 avg = 2.73 mobilenet min = 5.66 max = 6.25 avg = 5.82 mobilenet_int8 min = 3.13 max = 5.66 avg = 3.58 mobilenet_v2 min = 4.40 max = 4.46 avg = 4.42 mobilenet_v3 min = 3.74 max = 4.07 avg = 3.94 shufflenet min = 2.77 max = 2.86 avg = 2.82 shufflenet_v2 min = 2.52 max = 2.62 avg = 2.57 mnasnet min = 4.24 max = 4.37 avg = 4.28 proxylessnasnet min = 4.65 max = 4.91 avg = 4.74 efficientnet_b0 min = 7.71 max = 10.00 avg = 8.08 efficientnetv2_b0 min = 9.24 max = 10.34 avg = 9.87 regnety_400m min = 7.87 max = 
8.35 avg = 8.02 blazeface min = 2.38 max = 2.46 avg = 2.40 googlenet min = 13.21 max = 13.78 avg = 13.40 googlenet_int8 min = 10.23 max = 10.65 avg = 10.36 resnet18 min = 9.25 max = 9.68 avg = 9.49 resnet18_int8 min = 6.86 max = 6.97 avg = 6.91 alexnet min = 9.73 max = 10.53 avg = 9.97 vgg16 min = 47.43 max = 48.12 avg = 47.78 vgg16_int8 min = 47.08 max = 48.18 avg = 47.46 resnet50 min = 26.82 max = 27.14 avg = 26.99 resnet50_int8 min = 15.01 max = 15.57 avg = 15.20 squeezenet_ssd min = 9.96 max = 12.66 avg = 10.83 squeezenet_ssd_int8 min = 8.47 max = 9.26 avg = 8.88 mobilenet_ssd min = 12.54 max = 13.25 avg = 12.82 mobilenet_ssd_int8 min = 7.03 max = 10.91 avg = 7.94 mobilenet_yolo min = 29.73 max = 30.45 avg = 30.23 mobilenetv2_yolov3 min = 16.64 max = 17.71 avg = 17.13 yolov4-tiny min = 22.25 max = 22.65 avg = 22.45 nanodet_m min = 7.56 max = 7.86 avg = 7.69 yolo-fastest-1.1 min = 3.32 max = 3.45 avg = 3.39 yolo-fastestv2 min = 2.76 max = 2.96 avg = 2.84 vision_transformer min = 328.11 max = 337.26 avg = 332.12 FastestDet min = 2.66 max = 2.77 avg = 2.71 k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 1 2 -1 1 loop_count = 8 num_threads = 1 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 5.27 max = 5.35 avg = 5.32 squeezenet_int8 min = 3.06 max = 3.22 avg = 3.16 mobilenet min = 9.59 max = 9.85 avg = 9.74 mobilenet_int8 min = 4.29 max = 4.45 avg = 4.37 mobilenet_v2 min = 5.14 max = 5.33 avg = 5.20 mobilenet_v3 min = 4.28 max = 4.54 avg = 4.42 shufflenet min = 3.18 max = 3.34 avg = 3.27 shufflenet_v2 min = 2.78 max = 3.23 avg = 3.05 mnasnet min = 5.01 max = 5.38 avg = 5.19 proxylessnasnet min = 6.11 max = 6.30 avg = 6.21 efficientnet_b0 min = 11.53 max = 11.78 avg = 11.66 efficientnetv2_b0 min = 13.88 max = 14.28 avg = 14.13 regnety_400m min = 8.11 max = 8.18 avg = 8.16 blazeface min = 0.99 max = 1.08 avg = 1.01 googlenet min = 19.68 max = 20.71 avg = 20.25 googlenet_int8 min = 13.42 max = 13.86 avg = 13.60 resnet18 
min = 18.10 max = 18.84 avg = 18.53 resnet18_int8 min = 9.67 max = 10.17 avg = 9.99 alexnet min = 15.76 max = 16.35 avg = 16.03 vgg16 min = 70.22 max = 72.85 avg = 71.58 vgg16_int8 min = 76.83 max = 79.70 avg = 78.45 resnet50 min = 39.73 max = 41.24 avg = 40.30 resnet50_int8 min = 20.76 max = 21.54 avg = 21.27 squeezenet_ssd min = 12.63 max = 18.67 avg = 15.20 squeezenet_ssd_int8 min = 10.29 max = 16.13 avg = 14.13 mobilenet_ssd min = 17.21 max = 18.43 avg = 17.68 mobilenet_ssd_int8 min = 8.92 max = 9.49 avg = 9.07 mobilenet_yolo min = 37.45 max = 38.29 avg = 37.88 mobilenetv2_yolov3 min = 19.18 max = 19.83 avg = 19.58 yolov4-tiny min = 27.06 max = 27.86 avg = 27.45 nanodet_m min = 9.33 max = 9.50 avg = 9.42 yolo-fastest-1.1 min = 3.48 max = 3.59 avg = 3.54 yolo-fastestv2 min = 2.29 max = 2.37 avg = 2.33 vision_transformer min = 730.38 max = 739.99 avg = 735.77 FastestDet min = 2.40 max = 2.48 avg = 2.43 k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 64 1 2 0 0 [0 Mali-G720-Immortalis MC12] queueC=0[2] queueG=0[2] queueT=0[2] [0 Mali-G720-Immortalis MC12] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Mali-G720-Immortalis MC12] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Mali-G720-Immortalis MC12] subgroup=16 basic/vote/ballot/shuffle=1/1/1/1 [0 Mali-G720-Immortalis MC12] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 loop_count = 64 num_threads = 1 powersave = 2 gpu_device = 0 cooling_down = 0 squeezenet min = 11.26 max = 13.58 avg = 12.32 squeezenet_int8 min = 3.08 max = 3.29 avg = 3.17 mobilenet min = 11.96 max = 14.52 avg = 13.48 mobilenet_int8 min = 4.20 max = 4.58 avg = 4.34 mobilenet_v2 min = 13.62 max = 16.46 avg = 14.62 mobilenet_v3 min = 13.98 max = 17.16 avg = 15.25 shufflenet min = 10.22 max = 11.82 avg = 11.07 shufflenet_v2 min = 12.42 max = 15.39 avg = 14.35 mnasnet min = 12.94 max = 16.30 avg = 14.91 proxylessnasnet min = 13.18 max = 16.55 avg = 15.05 efficientnet_b0 min = 16.70 max = 20.35 avg = 18.27 efficientnetv2_b0 min = 54.09 max = 
70.05 avg = 58.68 regnety_400m min = 16.20 max = 18.42 avg = 17.27 blazeface min = 6.50 max = 7.86 avg = 6.93 googlenet min = 15.29 max = 17.54 avg = 16.19 googlenet_int8 min = 20.38 max = 22.08 avg = 20.98 resnet18 min = 12.22 max = 15.63 avg = 14.27 resnet18_int8 min = 9.50 max = 10.46 avg = 9.75 alexnet min = 12.00 max = 16.09 avg = 13.65 vgg16 min = 31.06 max = 32.77 avg = 31.85 vgg16_int8 min = 115.72 max = 123.71 avg = 118.23 resnet50 min = 15.74 max = 16.53 avg = 16.10 resnet50_int8 min = 32.43 max = 33.78 avg = 33.07 squeezenet_ssd min = 17.24 max = 21.80 avg = 20.68 squeezenet_ssd_int8 min = 9.69 max = 10.52 avg = 9.97 mobilenet_ssd min = 15.32 max = 17.63 avg = 16.62 mobilenet_ssd_int8 min = 8.84 max = 9.54 avg = 9.05 mobilenet_yolo min = 16.67 max = 18.21 avg = 17.25 mobilenetv2_yolov3 min = 20.08 max = 25.40 avg = 23.12 yolov4-tiny min = 21.98 max = 29.67 avg = 24.75 nanodet_m min = 23.19 max = 29.95 avg = 25.69 yolo-fastest-1.1 min = 15.07 max = 17.78 avg = 16.49 yolo-fastestv2 min = 14.67 max = 16.07 avg = 15.44 vision_transformer min = 768.04 max = 801.48 avg = 786.79 FastestDet min = 8.33 max = 16.07 avg = 14.38 ``` ### Xeon Phi 3120A (1.10 GHz 57-core 228-thread) - Host: CentOS 7.9 - Compiler: icc & icpc (ICC) 17.0.2 20170213 - ncnn tag: 20240102 Build command ```bash $ CC=icc CXX=icpc CFLAGS="-mmic" CXXFLAGS="-mmic" cmake .. -DCMAKE_BUILD_TYPE=Release -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF ``` Copy the whole `ncnn` directory and libraries in `/opt/intel/compilers_and_libraries_2017/linux/lib/mic/lib` to `mic0`, then set the `LD_LIBRARY_PATH` environment variable. Some tools cannot be built, but `benchncnn` should work. The built `benchncnn` is for Intel Xeon Phi coprocessor (k1om). 
```bash [mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ file benchncnn benchncnn: ELF 64-bit LSB executable, Intel Xeon Phi coprocessor (k1om), version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.32, not stripped ``` The benchmark is run in native mode: ssh into the Xeon Phi with `ssh user@mic0`, then run `benchncnn` as you would on a regular Linux system. ``` [mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 56 0 -1 1 loop_count = 4 num_threads = 56 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 43.42 max = 44.20 avg = 43.64 squeezenet_int8 min = 161.92 max = 162.41 avg = 162.15 mobilenet min = 44.49 max = 46.90 avg = 45.68 mobilenet_int8 min = 230.47 max = 232.40 avg = 231.77 mobilenet_v2 min = 57.22 max = 62.03 avg = 59.42 mobilenet_v3 min = 301.16 max = 306.62 avg = 303.90 shufflenet min = 65.80 max = 70.18 avg = 67.70 shufflenet_v2 min = 49.54 max = 53.17 avg = 51.22 mnasnet min = 521.87 max = 527.76 avg = 524.63 proxylessnasnet min = 745.79 max = 748.55 avg = 746.92 efficientnet_b0 min = 582.21 max = 584.64 avg = 583.34 efficientnetv2_b0 min = 84.13 max = 86.13 avg = 85.19 regnety_400m min = 209.67 max = 214.84 avg = 212.39 blazeface min = 26.33 max = 27.39 avg = 26.74 googlenet min = 124.14 max = 125.72 avg = 124.83 googlenet_int8 min = 498.36 max = 502.37 avg = 500.29 resnet18 min = 87.86 max = 88.83 avg = 88.35 resnet18_int8 min = 359.50 max = 360.71 avg = 360.11 alexnet min = 49.87 max = 51.25 avg = 50.76 vgg16 min = 341.87 max = 343.92 avg = 342.42 vgg16_int8 min = 1649.34 max = 1655.37 avg = 1652.98 resnet50 min = 198.91 max = 202.32 avg = 200.58 resnet50_int8 min = 983.48 max = 988.73 avg = 986.22 squeezenet_ssd min = 108.33 max = 111.45 avg = 110.18 squeezenet_ssd_int8 min = 368.96 max = 370.30 avg = 369.54 mobilenet_ssd min = 98.29 max = 101.49 avg = 99.99 mobilenet_ssd_int8 min = 462.18 max = 466.20 avg = 464.85 mobilenet_yolo min = 262.42 max = 266.84 avg = 263.91
mobilenetv2_yolov3 min = 159.20 max = 161.58 avg = 160.66 yolov4-tiny min = 229.22 max = 230.48 avg = 229.87 nanodet_m min = 115.10 max = 116.78 avg = 115.86 yolo-fastest-1.1 min = 154.48 max = 155.33 avg = 154.79 yolo-fastestv2 min = 161.10 max = 163.98 avg = 161.88 vision_transformer min = 848.51 max = 863.03 avg = 854.92 FastestDet min = 251.64 max = 253.22 avg = 252.38 [mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 112 0 -1 1 loop_count = 4 num_threads = 112 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 41.07 max = 41.19 avg = 41.12 squeezenet_int8 min = 161.73 max = 163.90 avg = 162.74 mobilenet min = 36.82 max = 37.53 avg = 37.11 mobilenet_int8 min = 231.50 max = 233.81 avg = 232.65 mobilenet_v2 min = 53.12 max = 55.87 avg = 54.44 mobilenet_v3 min = 277.82 max = 280.61 avg = 279.66 shufflenet min = 64.11 max = 64.92 avg = 64.63 shufflenet_v2 min = 48.23 max = 50.00 avg = 49.19 mnasnet min = 532.09 max = 534.73 avg = 533.34 proxylessnasnet min = 760.43 max = 763.94 avg = 762.34 efficientnet_b0 min = 534.29 max = 547.51 avg = 541.29 efficientnetv2_b0 min = 75.94 max = 76.88 avg = 76.39 regnety_400m min = 226.37 max = 227.81 avg = 227.23 blazeface min = 26.03 max = 26.93 avg = 26.51 googlenet min = 106.53 max = 107.54 avg = 107.06 googlenet_int8 min = 503.01 max = 505.16 avg = 504.13 resnet18 min = 73.63 max = 76.61 avg = 75.11 resnet18_int8 min = 358.18 max = 359.50 avg = 358.99 alexnet min = 37.40 max = 38.17 avg = 37.83 vgg16 min = 244.95 max = 250.05 avg = 247.24 vgg16_int8 min = 1511.89 max = 1512.66 avg = 1512.35 resnet50 min = 151.99 max = 154.66 avg = 153.37 resnet50_int8 min = 954.16 max = 957.63 avg = 956.55 squeezenet_ssd min = 91.46 max = 97.18 avg = 94.00 squeezenet_ssd_int8 min = 368.03 max = 375.96 avg = 370.99 mobilenet_ssd min = 79.61 max = 81.38 avg = 80.33 mobilenet_ssd_int8 min = 458.93 max = 463.41 avg = 461.63 mobilenet_yolo min = 234.59 max = 236.91 avg = 235.43 
mobilenetv2_yolov3 min = 145.82 max = 146.92 avg = 146.23 yolov4-tiny min = 219.22 max = 220.51 avg = 219.83 nanodet_m min = 109.43 max = 113.94 avg = 112.20 yolo-fastest-1.1 min = 158.13 max = 160.59 avg = 159.20 yolo-fastestv2 min = 162.05 max = 162.80 avg = 162.47 vision_transformer min = 615.14 max = 625.35 avg = 618.47 FastestDet min = 279.98 max = 282.49 avg = 281.14 [mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 224 0 -1 1 loop_count = 4 num_threads = 224 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 45.54 max = 46.81 avg = 46.13 squeezenet_int8 min = 186.81 max = 187.14 avg = 186.97 mobilenet min = 38.33 max = 39.11 avg = 38.64 mobilenet_int8 min = 251.06 max = 251.91 avg = 251.40 mobilenet_v2 min = 56.57 max = 57.15 avg = 56.88 mobilenet_v3 min = 365.04 max = 366.87 avg = 365.94 shufflenet min = 71.16 max = 72.02 avg = 71.68 shufflenet_v2 min = 52.14 max = 53.60 avg = 52.92 mnasnet min = 596.37 max = 603.62 avg = 600.50 proxylessnasnet min = 911.84 max = 912.23 avg = 912.04 efficientnet_b0 min = 611.77 max = 614.32 avg = 612.69 efficientnetv2_b0 min = 82.16 max = 83.05 avg = 82.62 regnety_400m min = 253.43 max = 255.79 avg = 254.66 blazeface min = 30.54 max = 30.91 avg = 30.70 googlenet min = 111.68 max = 112.65 avg = 112.11 googlenet_int8 min = 594.07 max = 597.09 avg = 596.03 resnet18 min = 78.14 max = 79.12 avg = 78.75 resnet18_int8 min = 412.69 max = 413.92 avg = 413.46 alexnet min = 40.93 max = 41.43 avg = 41.17 vgg16 min = 242.45 max = 244.46 avg = 243.47 vgg16_int8 min = 1545.61 max = 1548.72 avg = 1547.47 resnet50 min = 147.73 max = 148.56 avg = 148.07 resnet50_int8 min = 1034.47 max = 1042.31 avg = 1038.41 squeezenet_ssd min = 107.82 max = 110.53 avg = 108.98 squeezenet_ssd_int8 min = 423.30 max = 426.91 avg = 425.67 mobilenet_ssd min = 74.54 max = 77.13 avg = 75.97 mobilenet_ssd_int8 min = 510.95 max = 513.33 avg = 512.40 mobilenet_yolo min = 238.83 max = 239.64 avg = 239.27 
mobilenetv2_yolov3 min = 159.80 max = 160.31 avg = 160.04 yolov4-tiny min = 233.89 max = 237.41 avg = 236.22 nanodet_m min = 122.39 max = 123.42 avg = 122.89 yolo-fastest-1.1 min = 194.49 max = 195.25 avg = 194.94 yolo-fastestv2 min = 193.06 max = 195.03 avg = 194.05 vision_transformer min = 547.36 max = 554.17 avg = 549.99 FastestDet min = 317.76 max = 321.38 avg = 320.18 ``` ### PhytiumPi, Phytium E2000 (FTC664@1.8GHz x2 + FTC310@1.5GHz x2) ``` loop_count = 4 num_threads = 2 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 43.84 max = 43.95 avg = 43.88 squeezenet_int8 min = 35.48 max = 35.77 avg = 35.66 mobilenet min = 69.31 max = 70.03 avg = 69.66 mobilenet_int8 min = 42.30 max = 42.40 avg = 42.35 mobilenet_v2 min = 59.07 max = 59.35 avg = 59.19 mobilenet_v3 min = 46.02 max = 46.37 avg = 46.19 shufflenet min = 31.52 max = 31.61 avg = 31.56 shufflenet_v2 min = 23.99 max = 24.07 avg = 24.04 mnasnet min = 49.40 max = 50.45 avg = 49.92 proxylessnasnet min = 53.24 max = 53.85 avg = 53.53 efficientnet_b0 min = 77.49 max = 77.84 avg = 77.62 efficientnetv2_b0 min = 88.51 max = 88.92 avg = 88.69 regnety_400m min = 66.99 max = 67.05 avg = 67.03 blazeface min = 7.74 max = 8.14 avg = 7.98 googlenet min = 126.62 max = 127.23 avg = 126.91 googlenet_int8 min = 102.87 max = 103.16 avg = 103.01 resnet18 min = 102.28 max = 102.63 avg = 102.48 resnet18_int8 min = 72.01 max = 72.45 avg = 72.29 alexnet min = 76.00 max = 124.61 avg = 88.24 vgg16 min = 597.75 max = 601.99 avg = 599.44 vgg16_int8 min = 421.40 max = 423.83 avg = 423.01 resnet50 min = 278.16 max = 280.64 avg = 279.37 resnet50_int8 min = 207.26 max = 207.47 avg = 207.36 squeezenet_ssd min = 108.69 max = 109.26 avg = 108.99 squeezenet_ssd_int8 min = 84.05 max = 84.60 avg = 84.28 mobilenet_ssd min = 141.65 max = 142.46 avg = 142.14 mobilenet_ssd_int8 min = 84.43 max = 84.99 avg = 84.73 mobilenet_yolo min = 322.53 max = 325.15 avg = 323.51 mobilenetv2_yolov3 min = 194.84 max = 196.98 avg = 196.07 yolov4-tiny 
min = 208.29 max = 213.26 avg = 210.77 nanodet_m min = 64.78 max = 65.38 avg = 65.08 yolo-fastest-1.1 min = 37.89 max = 38.23 avg = 38.07 yolo-fastestv2 min = 29.75 max = 30.33 avg = 30.09 vision_transformer min = 4257.71 max = 4263.73 avg = 4260.60 FastestDet min = 30.86 max = 44.67 avg = 34.41 ``` ### AMD EPYC 7742 (2.25GHz) ubuntu 22.04 AOCC_4.2.0-Build#89 single core ``` # nice -20 ../build-host-aocc-linux/benchmark/benchncnn 100 1 0 -1 0 loop_count = 100 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 9.26 max = 10.05 avg = 9.45 squeezenet_int8 min = 9.54 max = 13.35 avg = 9.67 mobilenet min = 16.20 max = 16.83 avg = 16.35 mobilenet_int8 min = 16.79 max = 17.28 avg = 16.89 mobilenet_v2 min = 10.69 max = 11.13 avg = 10.78 mobilenet_v3 min = 8.87 max = 14.09 avg = 9.03 shufflenet min = 4.99 max = 5.29 avg = 5.06 shufflenet_v2 min = 5.61 max = 7.14 avg = 5.66 mnasnet min = 11.94 max = 12.39 avg = 12.05 proxylessnasnet min = 13.48 max = 16.57 avg = 13.62 efficientnet_b0 min = 19.58 max = 20.34 avg = 19.73 efficientnetv2_b0 min = 22.66 max = 23.63 avg = 22.89 regnety_400m min = 14.89 max = 18.76 avg = 15.11 blazeface min = 1.45 max = 1.59 avg = 1.51 googlenet min = 35.38 max = 36.94 avg = 35.79 googlenet_int8 min = 30.55 max = 42.18 avg = 30.88 resnet18 min = 34.73 max = 48.15 avg = 35.43 resnet18_int8 min = 27.39 max = 28.22 avg = 27.61 alexnet min = 31.42 max = 32.26 avg = 31.64 vgg16 min = 160.38 max = 172.02 avg = 162.52 vgg16_int8 min = 134.03 max = 153.69 avg = 135.12 resnet50 min = 85.47 max = 87.90 avg = 86.21 resnet50_int8 min = 71.18 max = 80.37 avg = 71.70 squeezenet_ssd min = 24.66 max = 25.71 avg = 24.84 squeezenet_ssd_int8 min = 23.61 max = 24.28 avg = 23.78 mobilenet_ssd min = 34.48 max = 35.69 avg = 34.64 mobilenet_ssd_int8 min = 33.26 max = 34.32 avg = 33.45 mobilenet_yolo min = 77.25 max = 86.54 avg = 77.73 mobilenetv2_yolov3 min = 41.72 max = 42.92 avg = 42.02 yolov4-tiny min = 57.61 max = 59.49 avg = 58.46 
nanodet_m min = 12.92 max = 13.39 avg = 13.03 yolo-fastest-1.1 min = 5.02 max = 5.26 avg = 5.11 yolo-fastestv2 min = 5.06 max = 5.20 avg = 5.09 vision_transformer min = 637.63 max = 670.46 avg = 640.60 FastestDet min = 5.59 max = 5.82 avg = 5.66 ``` 64 cores ``` # nice -20 ../build-host-aocc-linux/benchmark/benchncnn 300 64 0 -1 0 loop_count = 300 num_threads = 64 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 4.19 max = 13.94 avg = 5.06 squeezenet_int8 min = 4.93 max = 13.59 avg = 5.14 mobilenet min = 3.29 max = 5.28 avg = 3.39 mobilenet_int8 min = 2.32 max = 3.32 avg = 2.40 mobilenet_v2 min = 4.58 max = 8.64 avg = 4.76 mobilenet_v3 min = 4.11 max = 6.89 avg = 4.88 shufflenet min = 5.67 max = 8.60 avg = 5.92 shufflenet_v2 min = 4.83 max = 6.29 avg = 5.02 mnasnet min = 4.08 max = 12.75 avg = 4.29 proxylessnasnet min = 4.46 max = 7.28 avg = 4.68 efficientnet_b0 min = 5.51 max = 11.67 avg = 6.33 efficientnetv2_b0 min = 7.50 max = 11.30 avg = 9.34 regnety_400m min = 12.50 max = 20.88 avg = 12.76 blazeface min = 1.67 max = 3.37 avg = 1.76 googlenet min = 10.64 max = 11.59 avg = 10.87 googlenet_int8 min = 8.49 max = 17.88 avg = 9.90 resnet18 min = 6.36 max = 6.88 avg = 6.48 resnet18_int8 min = 4.65 max = 13.03 avg = 4.77 alexnet min = 3.88 max = 4.62 avg = 3.97 vgg16 min = 26.00 max = 36.86 avg = 27.25 vgg16_int8 min = 17.75 max = 19.63 avg = 18.42 resnet50 min = 13.94 max = 23.10 avg = 14.17 resnet50_int8 min = 8.73 max = 18.32 avg = 8.92 squeezenet_ssd min = 10.39 max = 12.10 avg = 10.77 squeezenet_ssd_int8 min = 11.53 max = 20.24 avg = 12.01 mobilenet_ssd min = 6.80 max = 8.16 avg = 6.96 mobilenet_ssd_int8 min = 4.98 max = 5.21 avg = 5.07 mobilenet_yolo min = 17.75 max = 30.34 avg = 18.29 mobilenetv2_yolov3 min = 13.74 max = 15.69 avg = 14.18 yolov4-tiny min = 21.27 max = 29.53 avg = 22.81 nanodet_m min = 10.22 max = 12.25 avg = 10.89 yolo-fastest-1.1 min = 5.56 max = 6.03 avg = 5.66 yolo-fastestv2 min = 5.61 max = 5.78 avg = 5.67 vision_transformer 
min = 69.07 max = 508.15 avg = 71.73 FastestDet min = 5.74 max = 6.83 avg = 5.81 ``` ### NVIDIA Tesla V100-PCIE-32GB (GV100 SM x 80 + Tensor Core x 640) ``` # ../build-host-gcc-vk-linux/benchmark/benchncnn 300 1 0 0 0 [0 Tesla V100-PCIE-32GB] queueC=2[8] queueG=0[16] queueT=1[2] [0 Tesla V100-PCIE-32GB] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 Tesla V100-PCIE-32GB] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1 [0 Tesla V100-PCIE-32GB] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [0 Tesla V100-PCIE-32GB] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 [1 llvmpipe (LLVM 15.0.7, 256 bits)] queueC=0[1] queueG=0[1] queueT=0[1] [1 llvmpipe (LLVM 15.0.7, 256 bits)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [1 llvmpipe (LLVM 15.0.7, 256 bits)] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1 [1 llvmpipe (LLVM 15.0.7, 256 bits)] subgroup=8 basic/vote/ballot/shuffle=1/1/1/1 [1 llvmpipe (LLVM 15.0.7, 256 bits)] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 [2 Tesla V100-PCIE-32GB] queueC=2[8] queueG=0[16] queueT=1[2] [2 Tesla V100-PCIE-32GB] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [2 Tesla V100-PCIE-32GB] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1 [2 Tesla V100-PCIE-32GB] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [2 Tesla V100-PCIE-32GB] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 [3 Tesla V100-PCIE-32GB] queueC=2[8] queueG=0[16] queueT=1[2] [3 Tesla V100-PCIE-32GB] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [3 Tesla V100-PCIE-32GB] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1 [3 Tesla V100-PCIE-32GB] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [3 Tesla V100-PCIE-32GB] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 [4 Tesla V100-PCIE-32GB] queueC=2[8] queueG=0[16] queueT=1[2] [4 Tesla V100-PCIE-32GB] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [4 Tesla V100-PCIE-32GB] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1 [4 Tesla V100-PCIE-32GB] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 [4 Tesla V100-PCIE-32GB] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 loop_count = 300 num_threads = 1 powersave = 0 
gpu_device = 0 cooling_down = 0 squeezenet min = 1.16 max = 16.79 avg = 1.64 squeezenet_int8 min = 9.03 max = 10.06 avg = 9.15 mobilenet min = 1.05 max = 2.60 avg = 1.25 mobilenet_int8 min = 16.78 max = 19.89 avg = 16.93 mobilenet_v2 min = 1.60 max = 3.29 avg = 1.76 mobilenet_v3 min = 1.84 max = 8.43 avg = 2.04 shufflenet min = 1.35 max = 3.73 avg = 1.54 shufflenet_v2 min = 1.66 max = 8.02 avg = 1.93 mnasnet min = 1.69 max = 3.31 avg = 1.82 proxylessnasnet min = 1.74 max = 3.70 avg = 1.89 efficientnet_b0 min = 2.86 max = 5.21 avg = 3.02 efficientnetv2_b0 min = 60.41 max = 80.28 avg = 69.51 regnety_400m min = 2.38 max = 6.84 avg = 2.57 blazeface min = 0.85 max = 3.50 avg = 0.96 googlenet min = 3.69 max = 16.66 avg = 4.10 googlenet_int8 min = 33.66 max = 47.27 avg = 34.32 resnet18 min = 1.76 max = 7.58 avg = 1.95 resnet18_int8 min = 27.12 max = 36.43 avg = 27.62 alexnet min = 1.33 max = 2.97 avg = 1.49 vgg16 min = 2.98 max = 4.60 avg = 3.17 vgg16_int8 min = 133.97 max = 154.41 avg = 136.22 resnet50 min = 3.42 max = 17.05 avg = 3.72 resnet50_int8 min = 70.53 max = 93.57 avg = 71.96 squeezenet_ssd min = 16.88 max = 22.55 avg = 18.49 squeezenet_ssd_int8 min = 23.12 max = 30.45 avg = 23.50 mobilenet_ssd min = 5.44 max = 7.09 avg = 5.93 mobilenet_ssd_int8 min = 33.28 max = 38.92 avg = 33.62 mobilenet_yolo min = 5.67 max = 7.66 avg = 6.26 mobilenetv2_yolov3 min = 6.33 max = 7.89 avg = 6.67 yolov4-tiny min = 14.66 max = 17.29 avg = 15.57 nanodet_m min = 5.36 max = 16.11 avg = 5.95 yolo-fastest-1.1 min = 5.60 max = 7.45 avg = 6.13 yolo-fastestv2 min = 3.48 max = 5.29 avg = 3.96 vision_transformer min = 153.75 max = 198.81 avg = 165.58 FastestDet min = 3.01 max = 5.01 avg = 3.29 ``` ### AXERA AX630C (Cortex-A53 1.2GHz * 2) ``` # ~/ncnn/build-aarch64-linux-gnu/benchmark # ./benchncnn 4 1 0 -1 0 loop_count = 4 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 129.78 max = 130.30 avg = 130.09 squeezenet_int8 min = 123.08 max = 123.48 avg = 123.22 
mobilenet min = 211.46 max = 221.68 avg = 214.14 mobilenet_int8 min = 196.00 max = 212.73 avg = 200.23 mobilenet_v2 min = 149.15 max = 149.21 avg = 149.17 mobilenet_v3 min = 124.70 max = 125.54 avg = 125.08 shufflenet min = 80.75 max = 80.88 avg = 80.81 shufflenet_v2 min = 74.30 max = 74.50 avg = 74.37 mnasnet min = 148.87 max = 165.85 avg = 153.26 proxylessnasnet min = 203.05 max = 213.50 avg = 205.82 efficientnet_b0 min = 270.39 max = 280.59 avg = 273.13 efficientnetv2_b0 min = 302.93 max = 318.07 avg = 307.30 regnety_400m min = 187.47 max = 187.90 avg = 187.60 blazeface min = 22.64 max = 22.78 avg = 22.72 googlenet min = 487.36 max = 503.50 avg = 493.93 googlenet_int8 min = 418.16 max = 434.44 avg = 426.09 resnet18_int8 min = 290.39 max = 301.90 avg = 293.70 resnet50_int8 min = 888.81 max = 898.34 avg = 895.92 squeezenet_ssd min = 320.78 max = 330.33 avg = 323.54 squeezenet_ssd_int8 min = 281.52 max = 299.11 avg = 286.89 mobilenet_ssd min = 435.79 max = 452.66 avg = 444.19 mobilenet_ssd_int8 min = 394.38 max = 411.09 avg = 398.65 mobilenet_yolo min = 955.48 max = 972.38 avg = 967.52 mobilenetv2_yolov3 min = 519.47 max = 536.58 avg = 524.25 yolo-fastestv2 min = 73.94 max = 74.15 avg = 74.05 FastestDet min = 81.89 max = 82.07 avg = 81.98 # ~/ncnn/build-aarch64-linux-gnu/benchmark # ./benchncnn 4 2 0 -1 0 loop_count = 4 num_threads = 2 powersave = 0 gpu_device = -1 cooling_down = 0 squeezenet min = 75.14 max = 88.89 avg = 79.06 squeezenet_int8 min = 70.11 max = 85.48 avg = 74.32 mobilenet min = 112.72 max = 124.85 avg = 115.87 mobilenet_int8 min = 100.35 max = 100.58 avg = 100.49 mobilenet_v2 min = 85.92 max = 86.20 avg = 86.03 mobilenet_v3 min = 73.94 max = 74.34 avg = 74.20 shufflenet min = 53.99 max = 66.11 avg = 57.63 shufflenet_v2 min = 47.47 max = 47.72 avg = 47.59 mnasnet min = 85.96 max = 86.27 avg = 86.13 proxylessnasnet min = 111.15 max = 121.84 avg = 113.92 efficientnet_b0 min = 149.72 max = 150.00 avg = 149.85 efficientnetv2_b0 min = 168.84 max = 170.57 
avg = 169.35 regnety_400m min = 120.42 max = 135.50 avg = 124.26 blazeface min = 14.27 max = 14.48 avg = 14.39 googlenet min = 263.82 max = 274.74 avg = 266.84 googlenet_int8 min = 226.91 max = 227.36 avg = 227.23 resnet18_int8 min = 157.66 max = 168.11 avg = 160.57 resnet50_int8 min = 469.84 max = 484.00 avg = 476.59 squeezenet_ssd min = 190.23 max = 204.41 avg = 193.99 squeezenet_ssd_int8 min = 162.73 max = 174.30 avg = 165.79 mobilenet_ssd min = 236.26 max = 251.16 avg = 240.34 mobilenet_ssd_int8 min = 203.22 max = 212.01 avg = 206.00 mobilenet_yolo min = 522.45 max = 537.99 avg = 529.95 mobilenetv2_yolov3 min = 300.33 max = 316.59 avg = 304.89 yolo-fastestv2 min = 50.27 max = 50.62 avg = 50.43 FastestDet min = 53.34 max = 53.64 avg = 53.51 ``` ### Spacemit MUSE Pi Pro Spacemit M1 (Spacemit X60 *8 + PowerVR B-Series BXE-2-32 MC1) ``` root@spacemit-k1-x-MUSE-Pi-Pro-board:/home/yingxi/ncnn/build/benchmark# ./benchncnn 4 8 2 -1 1 loop_count = 4 num_threads = 8 powersave = 2 gpu_device = -1 cooling_down = 1 squeezenet min = 192.55 max = 203.73 avg = 195.61 squeezenet_int8 min = 863.38 max = 875.44 avg = 867.96 mobilenet min = 260.32 max = 274.70 avg = 266.42 mobilenet_int8 min = 1287.80 max = 1606.98 avg = 1461.52 mobilenet_v2 min = 168.08 max = 173.99 avg = 169.97 mobilenet_v3 min = 141.06 max = 166.83 avg = 147.74 shufflenet min = 82.91 max = 92.83 avg = 85.57 shufflenet_v2 min = 83.11 max = 83.35 avg = 83.26 mnasnet min = 168.99 max = 180.35 avg = 171.95 proxylessnasnet min = 186.14 max = 194.56 avg = 188.91 efficientnet_b0 min = 257.93 max = 263.18 avg = 259.94 efficientnetv2_b0 min = 385.35 max = 394.09 avg = 388.57 regnety_400m min = 228.02 max = 229.55 avg = 228.88 blazeface min = 26.78 max = 27.43 avg = 26.97 googlenet min = 781.12 max = 796.37 avg = 788.60 googlenet_int8 min = 2422.82 max = 2441.75 avg = 2432.78 resnet18 min = 864.67 max = 874.15 avg = 869.32 resnet18_int8 min = 2409.34 max = 2728.57 avg = 2530.44 alexnet min = 389.93 max = 393.67 avg = 
391.77 vgg16 min = 8213.96 max = 8957.49 avg = 8405.27 vgg16_int8 min = 34268.94 max = 36044.89 avg = 35244.72 resnet50 min = 1798.75 max = 1859.80 avg = 1825.00 resnet50_int8 min = 7364.21 max = 7500.24 avg = 7428.21 squeezenet_ssd min = 693.59 max = 701.68 avg = 697.60 squeezenet_ssd_int8 min = 1447.64 max = 1461.21 avg = 1455.02 mobilenet_ssd min = 530.90 max = 542.81 avg = 534.42 mobilenet_ssd_int8 min = 4347.45 max = 4391.44 avg = 4377.68 mobilenet_yolo min = 1285.07 max = 1369.59 avg = 1312.64 mobilenetv2_yolov3 min = 605.19 max = 628.05 avg = 616.37 yolov4-tiny min = 1743.00 max = 1751.39 avg = 1748.09 nanodet_m min = 201.46 max = 202.80 avg = 202.03 yolo-fastest-1.1 min = 97.02 max = 98.29 avg = 97.71 yolo-fastestv2 min = 75.53 max = 76.62 avg = 76.20 vision_transformer min = 11328.10 max = 11334.80 avg = 11332.34 FastestDet min = 85.01 max = 86.04 avg = 85.45 root@spacemit-k1-x-MUSE-Pi-Pro-board:/home/yingxi/ncnn/build/benchmark# ./benchncnn 4 8 2 0 1 [0 PowerVR B-Series BXE-2-32 MC1] queueC=0[2] queueG=0[2] queueT=0[2] [0 PowerVR B-Series BXE-2-32 MC1] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 [0 PowerVR B-Series BXE-2-32 MC1] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1 [0 PowerVR B-Series BXE-2-32 MC1] subgroup=1(1~1) ops=1/1/1/1/1/1/0/0/1/1 [0 PowerVR B-Series BXE-2-32 MC1] fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0 loop_count = 4 num_threads = 8 powersave = 2 gpu_device = 0 cooling_down = 1 squeezenet min = 381.51 max = 382.05 avg = 381.73 squeezenet_int8 min = 862.26 max = 890.38 avg = 879.94 mobilenet min = 795.29 max = 796.41 avg = 795.80 mobilenet_int8 min = 1284.16 max = 1298.86 avg = 1290.31 mobilenet_v2 min = 512.00 max = 512.59 avg = 512.19 mobilenet_v3 min = 428.55 max = 428.95 avg = 428.76 shufflenet min = 198.17 max = 198.83 avg = 198.39 shufflenet_v2 min = 272.36 max = 272.73 avg = 272.55 mnasnet min = 526.92 max = 527.44 avg = 527.12 proxylessnasnet min = 601.43 max = 602.65 avg = 602.05 efficientnet_b0 min = 704.94 max = 705.23 avg = 705.13 
efficientnetv2_b0 min = 854.83 max = 866.51 avg = 859.85 regnety_400m min = 526.46 max = 527.04 avg = 526.65 blazeface min = 69.74 max = 69.84 avg = 69.80 googlenet min = 1230.07 max = 1231.04 avg = 1230.53 googlenet_int8 min = 2409.25 max = 2423.38 avg = 2416.76 resnet18 min = 1134.72 max = 1136.35 avg = 1135.44 resnet18_int8 min = 2431.48 max = 2552.62 avg = 2473.90 alexnet min = 692.35 max = 697.08 avg = 695.61 vgg16 min = 5790.33 max = 5805.37 avg = 5796.20 vgg16_int8 min = 34057.43 max = 35714.99 avg = 35080.62 resnet50 min = 3426.54 max = 3429.97 avg = 3427.94 resnet50_int8 min = 7370.03 max = 7409.63 avg = 7390.83 squeezenet_ssd min = 1057.50 max = 1061.42 avg = 1059.26 squeezenet_ssd_int8 min = 1454.99 max = 1469.47 avg = 1462.61 mobilenet_ssd min = 1670.02 max = 1673.22 avg = 1671.34 mobilenet_ssd_int8 min = 4372.23 max = 4424.18 avg = 4400.11 mobilenet_yolo min = 3794.02 max = 3796.52 avg = 3795.21 mobilenetv2_yolov3 min = 1841.86 max = 1844.70 avg = 1843.49 yolov4-tiny min = 2099.86 max = 2104.18 avg = 2102.34 nanodet_m min = 646.19 max = 647.41 avg = 646.69 yolo-fastest-1.1 min = 322.08 max = 323.71 avg = 323.22 yolo-fastestv2 min = 209.42 max = 209.72 avg = 209.56 vision_transformer min = 26499.86 max = 26548.73 avg = 26528.54 FastestDet min = 251.68 max = 252.52 avg = 252.14 ``` ### Arduino UNO Q - QRB2210 (ARM Cortex-A53 @ 2.0GHz x 4) ``` arduino@noivis-uno-q:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 4 0 -1 -1 loop_count = 10 num_threads = 4 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 35.57 max = 111.57 avg = 43.99 squeezenet_int8 min = 31.61 max = 32.34 avg = 31.91 mobilenet min = 47.82 max = 133.12 avg = 56.77 mobilenet_int8 min = 33.96 max = 102.49 avg = 44.91 mobilenet_v2 min = 42.62 max = 119.38 avg = 51.88 mobilenet_v3 min = 34.53 max = 35.91 avg = 35.27 shufflenet min = 26.18 max = 26.47 avg = 26.32 shufflenet_v2 min = 22.02 max = 88.82 avg = 30.98 mnasnet min = 38.96 max = 92.30 avg = 50.98 proxylessnasnet min = 
47.04 max = 137.34 avg = 56.91 efficientnet_b0 min = 58.75 max = 141.67 avg = 76.36 efficientnetv2_b0 min = 79.72 max = 175.06 avg = 99.54 regnety_400m min = 65.97 max = 184.19 avg = 96.94 blazeface min = 6.43 max = 7.84 avg = 6.76 googlenet min = 105.37 max = 197.46 avg = 130.49 googlenet_int8 min = 89.68 max = 179.01 avg = 107.28 resnet18 min = 86.52 max = 166.67 avg = 102.49 resnet18_int8 min = 57.96 max = 107.52 avg = 66.63 alexnet min = 56.77 max = 127.20 avg = 67.50 vgg16 min = 463.45 max = 557.00 avg = 511.24 vgg16_int8 min = 323.15 max = 415.10 avg = 367.00 resnet50 min = 219.89 max = 298.83 avg = 250.55 resnet50_int8 min = 177.14 max = 261.74 avg = 208.69 squeezenet_ssd min = 96.95 max = 195.33 avg = 123.10 squeezenet_ssd_int8 min = 79.66 max = 179.98 avg = 97.71 mobilenet_ssd min = 100.40 max = 191.42 avg = 119.07 mobilenet_ssd_int8 min = 71.88 max = 173.69 avg = 92.27 mobilenet_yolo min = 216.49 max = 301.24 avg = 248.78 mobilenetv2_yolov3 min = 154.69 max = 245.76 avg = 179.31 yolov4-tiny min = 191.17 max = 261.76 avg = 218.64 nanodet_m min = 57.66 max = 113.14 avg = 67.66 yolo-fastest-1.1 min = 34.72 max = 131.85 avg = 49.81 yolo-fastestv2 min = 26.91 max = 28.23 avg = 27.46 vision_transformer min = 2529.77 max = 2703.20 avg = 2601.17 FastestDet min = 28.09 max = 29.11 avg = 28.48 arduino@noivis-uno-q:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 1 0 -1 -1 loop_count = 10 num_threads = 1 powersave = 0 gpu_device = -1 cooling_down = 1 squeezenet min = 94.15 max = 111.95 avg = 99.15 squeezenet_int8 min = 78.23 max = 86.76 avg = 80.23 mobilenet min = 146.45 max = 165.20 avg = 153.61 mobilenet_int8 min = 123.70 max = 133.75 avg = 126.28 mobilenet_v2 min = 99.85 max = 108.01 avg = 103.90 mobilenet_v3 min = 93.31 max = 102.90 avg = 96.41 shufflenet min = 61.80 max = 79.39 avg = 65.28 shufflenet_v2 min = 47.57 max = 56.28 avg = 49.89 mnasnet min = 106.41 max = 119.18 avg = 109.83 proxylessnasnet min = 143.93 max = 164.33 avg = 151.37 efficientnet_b0 min = 
164.14 max = 173.38 avg = 167.91 efficientnetv2_b0 min = 206.05 max = 225.26 avg = 211.93 regnety_400m min = 133.84 max = 144.94 avg = 137.26 blazeface min = 13.90 max = 14.97 avg = 14.25 googlenet min = 337.11 max = 364.05 avg = 347.30 googlenet_int8 min = 281.64 max = 293.46 avg = 288.34 resnet18 min = 276.23 max = 304.36 avg = 289.94 resnet18_int8 min = 190.11 max = 217.07 avg = 199.87 alexnet min = 196.14 max = 203.26 avg = 198.63 vgg16 min = 1391.13 max = 1626.54 avg = 1502.86 vgg16_int8 min = 1128.65 max = 1290.60 avg = 1200.60 resnet50 min = 739.44 max = 774.68 avg = 750.76 resnet50_int8 min = 591.32 max = 612.44 avg = 603.38 squeezenet_ssd min = 245.57 max = 280.32 avg = 262.18 squeezenet_ssd_int8 min = 182.86 max = 228.61 avg = 199.68 mobilenet_ssd min = 308.26 max = 320.81 avg = 314.58 mobilenet_ssd_int8 min = 246.33 max = 265.22 avg = 253.05 mobilenet_yolo min = 682.76 max = 703.99 avg = 696.30 mobilenetv2_yolov3 min = 346.53 max = 365.76 avg = 355.41 yolov4-tiny min = 527.86 max = 558.38 avg = 542.25 nanodet_m min = 135.87 max = 153.99 avg = 145.11 yolo-fastest-1.1 min = 58.92 max = 76.24 avg = 65.08 yolo-fastestv2 min = 48.54 max = 59.97 avg = 53.21 vision_transformer min = 9218.64 max = 10723.27 avg = 10253.49 FastestDet min = 51.52 max = 62.65 avg = 55.04 ``` ================================================ FILE: benchmark/RankCards/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.10) project(RankCards CXX) set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(EXECUTABLE_OUTPUT_PATH "../") add_executable(RankCards main.cpp) ================================================ FILE: benchmark/RankCards/README.md ================================================ ### Rank the boards. The table below is generated by RankCards, using the timings found in the /ncnn/benchmark/README.md file.
First, for each board, the set of timings with the lowest total average time is selected as its best set.
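The set is then compared to a reference set: for every model timed on both boards, the ratio of the board's average time to the reference board's average time is calculated, and these ratios are combined into a weighted average (each ratio weighted by the logarithm of the reference time), so a ratio below 1 means faster than the reference.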
Finally, the boards are ranked from fast to slow.
| | Board | Ratio | | :--: | :---- | :--- | | 1 | NVIDIA Quadro RTX 8000 (TU102 SM x 72 + Tensor Core x 576) | 0.147 | | 2 | nVIDIA RTX2080 of Desktop | 0.15 | | 3 | NVIDIA GeForce RTX 3060 Ti of Desktop[2023-10-12] | 0.18 | | 4 | nVIDIA RTX2060 of Notebook | 0.198 | | 5 | Intel® Core™ i7-13700K of Desktop[2023-10-12] | 0.255 | | 6 | AMD Radeon RX 6900 XT of Desktop[2023-10-12] | 0.275 | | 7 | NVIDIA RTX3090 (GA102 SM x 82 + Tensor Core 328) | 0.277 | | 8 | MediaTek Dimensity 9300 (MT6989) (Cortex-X4 3.25 GHz + 2.85 GHz x 3 + Cortex-A720 2.0 GHz x 4 + Mali-G720-Immortalis MC12) | 0.309 | | 9 | MacBook Pro (13-inch, M1, 2020) | 0.346 | | 10 | AWS c5.4xlarge Instance | 0.418 | | 11 | AMD Ryzen 9 5950X 16-Core of Desktop[2023-10-12] | 0.427 | | 12 | Qualcomm SM8550-AB Snapdragon 8 Gen 2 (Kyro 3.20 GHz + 2.8 GHz x 2 + 2.80 GHz x 2 + 2.00 GHz * 3 + Adreno 740) | 0.45 | | 13 | AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) | 0.478 | | 14 | HUAWEI KunPeng 920 3211K (x24 cores) | 0.482 | | 15 | NVIDIA Jetson AGX Orin (Cortex-A78AE 2.2 GHz x 12 + Ampere@1.3 GHz Tensor Cores 64) | 0.485 | | 16 | HUAWEI KunPeng 920 2251K (x8 cores) | 0.54 | | 17 | nVIDIA RTX A3000 of Notebook (6GB) | 0.577 | | 18 | Intel(R) UHD Graphics 770 of Desktop[2023-10-12] | 0.593 | | 19 | OrangePi5, Rockchip RK3588s (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.642 | | 20 | Qualcomm SM8150-AC Snapdragon 855+ (Kyro485 2.96 GHz + 2.42 GHz x 3 + 1.80 GHz x 4 + Adreno 640) | 0.665 | | 21 | Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.753 | | 22 | NVIDIA Jetson Orin Nano | 0.819 | | 23 | Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8) (2.4GHz x 4) | 1 | | 24 | Station-M3/ROC-RK3588S-PC, Rockchip RK3588S (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz + Mali-G610) StationOS (Android) | 1 | | 25 | NVIDIA Jetson AGX Xavier (Carmel 2.2 GHz x 8 + Volta Tensor Cores 64) | 1.05 | | 26 | Loongson 3A6000 (LA664 2.5GHz * 4+4) | 1.11 | | 27 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core 
i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU) | 1.19 | | 28 | Rockchip RK3588 (Cortex-A76 2.4GHz x 4 + Cortex-A55 1.8GHz x 4) | 1.35 | | 29 | NVIDIA Jetson TX2 NX(NV-Denver2 2.0Ghz x 2 + Cortex-A57 2.0Ghz x 4 + 256-core NVIDIA Pascal iGPU) | 1.59 | | 30 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-7700K, NVIDIA GeForce GTX 1050 Ti) | 1.66 | | 31 | Phytium FT-2000+/64 (FTC662 armv8 2.4GHz x 8) | 1.75 | | 32 | AMD Ryzen Threadripper 3970X (Zen2 3.7 GHz ~ 4.5 GHz x 32) | 2.19 | | 33 | AMD Ryzen Embedded V1605B (Zen 2.0 GHz ~ 3.6 GHz x 4 + Radeon Vega 8 1.1GHz 8CU) | 2.23 | | 34 | Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4) | 2.28 | | 35 | Loongson 3A5000 (LA464 2.5GHz * 4) | 2.31 | | 36 | Qualcomm MSM8996 Pro Snapdragon 821 (Kyro 2.35GHz x 2 + Kyro 2.19GHz x 2) | 2.37 | | 37 | NVIDIA Jetson Nano | 2.44 | | 38 | Intel Celeron N5105 | 2.8 | | 39 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 3.24 | | 40 | Khadas VIM3, Amlogic A311D (Cortex-A73 2.2GHz x 4 + Cortex-A53 1.8GHz x 2) | 3.48 | | 41 | Kirin 970 (Cortex-A73 2.4GHz x 4 + Cortex-A53 1.8GHz x 4) | 3.58 | | 42 | Qualcomm MSM8998 Snapdragon 835 (Kyro 2.45GHz x 4 + Kyro 1.9GHz x 4 + Adreno 540) | 3.63 | | 43 | MacBook Pro (15-inch, 2019) - 2.6GHz six cores Intel Core i7 && Radeon Pro 555X 4GB && Intel UHD Graphics 630 1536MB | 3.75 | | 44 | Qualcomm MSM6150 Snapdragon 675 (Kyro460 2.0GHz x 2 + Kyro460 1.7GHz x 6 + Adreno 612) | 3.75 | | 45 | Qualcomm MSM8994 Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4) | 3.82 | | 46 | Station P2, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) | 3.85 | | 47 | Rock3A, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) ubuntu 20.04 | 3.86 | | 48 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 4.08 | | 49 | Radxa Zero 3W, Cortex-A55 (ARMv82) (1.416 GHz x 4) | 4.5 | | 50 | Raspberry Pi 4 Model B Broadcom BCM2711B0, Cortex-A72 (ARMv8) (1.8GHz x 4) | 4.95 | | 51 | OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 
1.5GHz x 4) | 5.11 | | 52 | Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 5.16 | | 53 | PhytiumPi, Phytium E2000 (FTC664@1.8GHz x2 + FTC310@1.5GHz x2) | 5.16 | | 54 | Qualcomm SDM660 Snapdragon 660 (Kyro260 2.2GHz x 4 + Kyro260 1.84GHz x 4 + Adreno 512) | 5.26 | | 55 | Phytium FT-2000/4 (FTC663 armv8 2.2GHz x 4) | 5.27 | | 56 | RDK X3 Module (Cortex-A53 1.5GHz x 4) aarch64 | 5.88 | | 57 | Station-M2/ROC-RK3566-PC, Rockchip RK3566 (Cortex-A55 1.8GHz x 4 + Mali-G52) StationOS (Android) | 6.51 | | 58 | Rockchip RK3288-CG.W (Cortex-A17 1.8GHz x 4) | 6.66 | | 59 | Qualcomm MSM8916 Snapdragon 410 (Cortex-A53 1.2GHz x 4) | 7.63 | | 60 | NanoPi R2S, Rockchip RK3328 (Cortex-A53 1.3GHz x 4) Armbian focal (21.05.1) aarch64 | 7.66 | | 61 | Intel Atom x5-Z8350 | 7.74 | | 62 | Loongson 2K2000 (LA364 1.5GHz * 2 with lsx) | 8.23 | | 63 | EAIDK 310, Rockchip RK3228H (Cortex-A53 1.3GHz x 4) fedora-28 aarch64 | 8.34 | | 64 | OrangePi Zero 2, Allwinner H616 (Cortex-A53 1.5GHz x 4) | 9.51 | | 65 | Raspberry Pi 3 Model B+ Broadcom BCM2837B0, Cortex-A53 (ARMv8) (1.4GHz x 4) | 9.87 | | 66 | iPhone 5S (Apple A7 1.3GHz x 2) | 11 | | 67 | MYIR RemiPi,Renesas RZG2L(Cortex-A55 1.5GHz x 2) | 11.9 | | 68 | Raspberry Pi 5 Broadcom BCM2712, VideoCore VII Graphics (Vulkan 1.2) | 12.5 | | 69 | Raspberry Pi Zero 2 W Broadcom BCM2710A1, Cortex-A53 (ARMv8) (1.0GHz x 4) | 13.7 | | 70 | Xeon Phi 3120A (1.10 GHz 57-core 228-thread) | 15.1 | | 71 | Loongson 3A3000 (GS464E 1.45GHz * 4) | 16.3 | | 72 | AXERA AX620A (Cortex-A7 1.0GHz * 4) | 18.8 | | 73 | Loongson 2K1000LA (LA264 1.0GHz * 2) | 24.4 | | 74 | Loongson 2K1000 (GS264 1.0GHz x 2) | 24.8 | | 75 | Freescale i.MX7 Dual (Cortex A7 1.0GHz x 2) | 26.7 | | 76 | Banana Pi M2 Zero 2 AllWinner H2+, Cortex-A7 (ARMv7-A) (1.2GHz x 4) | 26.8 | | 77 | HiSilicon Hi3519V101 (Cortex-A17 1.2GHz x 1) | 36.2 | | 78 | Sunway SW831 (sw_64 2.5GHz * 8) | 40.7 | | 79 | Z7-Lite 7020 XC7Z020CLG400-2 (Cortex-A9 766MHz x 2) | 43.2 | | 80 | Intel Celeron M 
420 (Yonah 1.60 GHz x 1) | 43.9 | | 81 | Amlogic S805 (Cortex-A5, 4 × 1.536GHz) | 45.9 | | 82 | VisionFive2 , JH7110 (SiFive-U74(RV64GC) 1.5GHz x 4) riscv64 with PowerVR B-Series BXE-4-32 | 72.4 | | 83 | T-Head TH1520 (C910V, 1.848 GHz x 4 + BXM-4-64 PowerVR) | 83.3 | | 84 | Sunway SW421 (sw_64 1.7GHz * 4) | 116 | | 85 | Ingenic T40XP Xburst2 Core X2 1.4Ghz (without MSA) | 165 | ================================================ FILE: benchmark/RankCards/Rcards.h ================================================ // Copyright 2017 Tencent // SPDX-License-Identifier: BSD-3-Clause #ifndef RCARDS_H #define RCARDS_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include //--------------------------------------------------------------------------- // Global hardcoded parameters //--------------------------------------------------------------------------- // LERP(a,b,c) = linear interpolation macro, is 'a' when c == 0.0 and 'b' when c == 1.0 */ #define MIN(a, b) ((a) > (b) ? (b) : (a)) #define MAX(a, b) ((a) < (b) ? (b) : (a)) #define LIM(a, b, c) (((a) > (c)) ? (c) : ((a) < (b)) ? 
// LIM(a,b,c) = 'a' limited to the range [b, c].
// Restated in full after #undef so that the complete expansion is the one in effect.
#undef LIM
#define LIM(a, b, c) (((a) > (c)) ? (c) : ((a) < (b)) ? (b) : (a))
// LERP(a,b,c) = linear interpolation, is 'a' when c == 0.0 and 'b' when c == 1.0
#define LERP(a, b, c) (((b) - (a)) * (c) + (a))
// ROUND(a) = (a) + 0.5 converted to int
#define ROUND(a) (static_cast<int>((a) + 0.5))
// EUCLIDEAN(x1,y1,x2,y2) = distance between the points (x1,y1) and (x2,y2)
#define EUCLIDEAN(x1, y1, x2, y2) sqrt(((x1) - (x2)) * ((x1) - (x2)) + ((y1) - (y2)) * ((y1) - (y2)))
//---------------------------------------------------------------------------
// One benchmarked model: its name and the average time parsed from a benchmark line.
struct TModel
{
    std::string Name;
    float AvrTime{0.0};
};
//---------------------------------------------------------------------------
// A complete set of timings, one slot per known model name.
// A slot whose AvrTime is 0 means "no timing available".
struct TModelSet
{
    std::vector<TModel> Mset;

    // Creates one empty slot (AvrTime == 0) for every known model name.
    //use push_back to prevent issues with CMake
    inline TModelSet(void)
    {
        static const char* const names[] = {
            "squeezenet",
            "squeezenet_int8",
            "mobilenet",
            "mobilenet_int8",
            "mobilenet_v2",
            "mobilenet_v3",
            "shufflenet",
            "shufflenet_v2",
            "mnasnet",
            "proxylessnasnet",
            "efficientnet_b0",
            "efficientnetv2_b0",
            "regnety_400m",
            "blazeface",
            "googlenet",
            "googlenet_int8",
            "resnet18",
            "resnet18_int8",
            "alexnet",
            "vgg16",
            "vgg16_int8",
            "resnet50",
            "resnet50_int8",
            "squeezenet_ssd",
            "squeezenet_ssd_int8",
            "mobilenet_ssd",
            "mobilenet_ssd_int8",
            "mobilenet_yolo",
            "mobilenetv2_yolov3",
            "yolov4-tiny",
            "nanodet_m",
            "yolo-fastest-1.1",
            "yolo-fastestv2",
            "vision_transformer",
            "FastestDet",
        };

        TModel model;
        for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++)
        {
            model.Name = names[i];
            Mset.push_back(model);
        }
    }

    // Copies model.AvrTime into the slot with the same name.
    // Unknown names are ignored.
    void Store(const TModel& model)
    {
        for (size_t i = 0; i < Mset.size(); i++)
        {
            if (Mset[i].Name == model.Name)
            {
                Mset[i].AvrTime = model.AvrTime;
                break;
            }
        }
    }

    // Returns the sum of all average times in the set (empty slots add 0).
    float Sum(void) const
    {
        float t = 0;
        for (size_t i = 0; i < Mset.size(); i++) t += Mset[i].AvrTime;
        return t;
    }

    // Returns the weighted average of (own time / reference time) over every
    // model that has a timing in both this set and Rset. Each ratio is
    // weighted by log(reference time). When the accumulated weight is not
    // positive the un-normalised sum is returned (0 when nothing matched).
    float Ratio(const TModelSet& Rset) const
    {
        float s = 0; // weighted sum of ratios
        float t = 0; // sum of weights

        for (size_t r = 0; r < Rset.Mset.size(); r++)
        {
            const TModel& ref = Rset.Mset[r];
            if (!(ref.AvrTime > 0.0)) continue;

            for (size_t i = 0; i < Mset.size(); i++)
            {
                const TModel& own = Mset[i];
                if (own.AvrTime > 0.0 && own.Name == ref.Name)
                {
                    const float w = log(ref.AvrTime);
                    s += w * (own.AvrTime / ref.AvrTime);
                    t += w;
                }
            }
        }

        if (t > 0) s /= t;
        return s;
    }
};
//---------------------------------------------------------------------------
// One board section of the benchmark README.
// All scalar members are value-initialised so a freshly constructed TBoard
// never carries indeterminate values.
struct TBoard
{
    std::string Name;
    size_t StartLine{0};
    size_t EndLine{0};
    std::vector<TModelSet> BenchSet; // every benchmark run found for the board
    int BestSet{0};                  // index into BenchSet
    float Ratio{0.0f};
};
//---------------------------------------------------------------------------
// Returns true when stat() succeeds on the given path.
inline bool FileExists(const std::string& name)
{
    struct stat buffer;
    return (stat(name.c_str(), &buffer) == 0);
}
//---------------------------------------------------------------------------
// Copies the bytes of file Src into file Dst (both opened in binary mode).
inline void FileCopy(const std::string& Src, const std::string& Dst)
{
    std::ifstream src(Src, std::ios::binary);
    std::ofstream dst(Dst, std::ios::binary);
    dst << src.rdbuf();
}
//---------------------------------------------------------------------------
// to lower case
static inline void lcase(std::string& s)
{
    std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); });
}
//---------------------------------------------------------------------------
// to lower case (copying)
static inline std::string lcase_copy(std::string s)
{
    lcase(s);
    return s;
}
//---------------------------------------------------------------------------
// trim from start (in place)
// The predicate takes unsigned char: passing a negative value (a non-ASCII
// byte on platforms where char is signed) to std::isspace is undefined.
static inline void ltrim(std::string& s)
{
    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { return !std::isspace(ch); }));
}
//---------------------------------------------------------------------------
// trim from end (in place)
static inline void rtrim(std::string& s)
{
    s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base(), s.end());
}
//---------------------------------------------------------------------------
// trim from both ends (in place)
static inline void trim(std::string& s)
{
    ltrim(s);
    rtrim(s);
}
//---------------------------------------------------------------------------
// trim from start (copying)
static inline std::string ltrim_copy(std::string s)
{
    ltrim(s);
    return s;
}
//---------------------------------------------------------------------------
// trim from end (copying)
static inline std::string rtrim_copy(std::string s)
{
    rtrim(s);
    return s;
}
//---------------------------------------------------------------------------
// trim from both ends (copying)
static inline std::string trim_copy(std::string s)
{
    trim(s);
    return s;
}
//---------------------------------------------------------------------------
// Parses one benchmark output line into model.
//   model.Name    = trimmed text before "min =", or "" when the line has no "min ="
//   model.AvrTime = number after "avg =", or 0 when it is missing or not a number
// Both fields are always overwritten, so a TModel that is reused across lines
// never keeps the timing of a previous line.
static inline void GetNameAver(std::string line, TModel& model)
{
    // line example: squeezenet min = 46.28 max = 46.91 avg = 46.65
    model.Name = "";
    model.AvrTime = 0.0;

    const size_t minPos = line.find("min =");
    if (minPos == std::string::npos) return;
    model.Name = trim_copy(line.substr(0, minPos));

    const size_t avgPos = line.find("avg =");
    if (avgPos == std::string::npos) return;

    try
    {
        // 5 == length of "avg ="
        model.AvrTime = std::stof(trim_copy(line.substr(avgPos + 5)));
    }
    catch (...)
    {
        // not a number (or out of range): report "no timing" instead of a stale value
        model.AvrTime = 0.0;
    }
}
//---------------------------------------------------------------------------
for (i = 0; i < Lines.size(); i++) { TBoard Brd; if (Lines[i].find("###") != string::npos) { Brd.Name = Lines[i].substr(4, Lines[i].length() - 4); Brd.StartLine = i + 1; Boards.push_back(Brd); } } // Get the boards end Line. for (t = 0; t < Boards.size() - 1; t++) { Boards[t].EndLine = Boards[t + 1].StartLine; } Boards[t].EndLine = Lines.size(); // Get the bench sets (must always start with squeezenet) for (t = 0; t < Boards.size(); t++) { TModelSet MdSet; bool FirstSet = true; for (n = Boards[t].StartLine; n < Boards[t].EndLine; n++) { GetNameAver(Lines[n], Model); MdSet.Store(Model); if (Model.Name == "squeezenet") { //start of new set, check if it is the first set if (FirstSet) FirstSet = false; else Boards[t].BenchSet.push_back(MdSet); } } Boards[t].BenchSet.push_back(MdSet); } // Get the total AvrTime of the bench sets and set the lowest as best set for (t = 0; t < Boards.size(); t++) { x = FLT_MAX; for (n = 0; n < Boards[t].BenchSet.size(); n++) { f = Boards[t].BenchSet[n].Sum(); if (f < x) { x = f; Boards[t].BestSet = n; } } } // Get the reference set RefBoard = -1; for (t = 0; t < Boards.size(); t++) { if (Boards[t].Name.find(REF_BOARD) != string::npos) { RefBoard = static_cast(t); } } if (RefBoard == -1) { cerr << "Error finding reference board :" << endl; cerr << REF_BOARD << endl; return 1; // Return an error code } // Get the ratios between the best bench sets and reference r = Boards[RefBoard].BestSet; for (t = 0; t < Boards.size(); t++) { n = Boards[t].BestSet; Boards[t].Ratio = Boards[t].BenchSet[n].Ratio(Boards[RefBoard].BenchSet[r]); } // Sort the vector using the custom comparator std::sort(Boards.begin(), Boards.end(), compareByRatio); // Open an output README.md file std::ofstream outputFile("README.md"); // Check if the file is successfully opened if (outputFile.is_open()) { outputFile << "### Rank the boards." << endl; outputFile << "The table below is generated by RankCards, using the timings found in the /ncnn/benchmark/README.md file.
" << endl; outputFile << "First, the best set of timings is selected from each board.
" << endl; outputFile << "The set is then compared to a reference set by calculating the ratio of each model one by one and averaging all results.
" << endl; outputFile << "Finally, the boards are ranked from fast to slow.
" << endl; outputFile << "| | Board | Ratio | " << endl; outputFile << "| :--: | :---- | :--- | " << endl; // Write the sorted vector to the file for (t = 0; t < Boards.size(); t++) { outputFile << "| " << t + 1 << " | " << Boards[t].Name << " | " << setprecision(3) << Boards[t].Ratio << " | " << endl; } // Close the file stream outputFile.close(); cout << "Sorted data has been written to README.md" << endl; } else { cerr << "Error opening the file." << endl; return 1; // Return an error code } return 0; // Return success } //--------------------------------------------------------------------------- ================================================ FILE: benchmark/alexnet.param ================================================ 7767517 15 15 Input data 0 1 data -23330=4,3,227,227,3 0=227 1=227 2=3 Convolution conv1 1 1 data conv1_relu1 -23330=4,3,55,55,96 0=96 1=11 3=4 5=1 6=34848 9=1 LRN norm1 1 1 conv1_relu1 norm1 -23330=4,3,55,55,96 2=1.000000e-04 Pooling pool1 1 1 norm1 pool1 -23330=4,3,27,27,96 1=3 2=2 ConvolutionDepthWise conv2 1 1 pool1 conv2_relu2 -23330=4,3,27,27,256 0=256 1=5 4=2 5=1 6=307200 7=2 9=1 LRN norm2 1 1 conv2_relu2 norm2 -23330=4,3,27,27,256 2=1.000000e-04 Pooling pool2 1 1 norm2 pool2 -23330=4,3,13,13,256 1=3 2=2 Convolution conv3 1 1 pool2 conv3_relu3 -23330=4,3,13,13,384 0=384 1=3 4=1 5=1 6=884736 9=1 ConvolutionDepthWise conv4 1 1 conv3_relu3 conv4_relu4 -23330=4,3,13,13,384 0=384 1=3 4=1 5=1 6=663552 7=2 9=1 ConvolutionDepthWise conv5 1 1 conv4_relu4 conv5_relu5 -23330=4,3,13,13,256 0=256 1=3 4=1 5=1 6=442368 7=2 9=1 Pooling pool5 1 1 conv5_relu5 pool5 -23330=4,3,6,6,256 1=3 2=2 InnerProduct fc6 1 1 pool5 fc6_drop6 -23330=4,1,4096,1,1 0=4096 1=1 2=37748736 9=1 InnerProduct fc7 1 1 fc6_drop6 fc7_drop7 -23330=4,1,4096,1,1 0=4096 1=1 2=16777216 9=1 InnerProduct fc8 1 1 fc7_drop7 fc8 -23330=4,1,1000,1,1 0=1000 1=1 2=4096000 Softmax prob 1 1 fc8 output -23330=4,1,1000,1,1 ================================================ FILE: 
benchmark/benchncnn.cpp ================================================ // Copyright 2018 Tencent // SPDX-License-Identifier: BSD-3-Clause #include #include #include #ifdef __EMSCRIPTEN__ #include #endif #include "benchmark.h" #include "cpu.h" #include "datareader.h" #include "net.h" #include "gpu.h" #include "benchncnn_param_data.h" #ifndef NCNN_SIMPLESTL #include #endif class DataReaderFromEmpty : public ncnn::DataReader { public: virtual int scan(const char* format, void* p) const { return 0; } virtual size_t read(void* buf, size_t size) const { memset(buf, 0, size); return size; } }; static int g_warmup_loop_count = 8; static int g_loop_count = 4; static bool g_enable_cooling_down = true; static ncnn::UnlockedPoolAllocator g_blob_pool_allocator; static ncnn::PoolAllocator g_workspace_pool_allocator; #if NCNN_VULKAN static ncnn::VulkanDevice* g_vkdev = 0; static ncnn::VkAllocator* g_blob_vkallocator = 0; static ncnn::VkAllocator* g_staging_vkallocator = 0; #endif // NCNN_VULKAN void benchmark(const char* comment, const std::vector& _in, const ncnn::Option& opt, const char* model_param_data = NULL) { // Skip if int8 model name and using GPU if (opt.use_vulkan_compute && strstr(comment, "int8") != NULL) { if (!model_param_data) fprintf(stderr, "%20s skipped (int8+GPU not supported)\n", comment); return; } g_blob_pool_allocator.clear(); g_workspace_pool_allocator.clear(); #if NCNN_VULKAN if (opt.use_vulkan_compute) { g_blob_vkallocator->clear(); g_staging_vkallocator->clear(); } #endif // NCNN_VULKAN ncnn::Net net; net.opt = opt; #if NCNN_VULKAN if (net.opt.use_vulkan_compute) { net.set_vulkan_device(g_vkdev); } #endif // NCNN_VULKAN if (model_param_data) { net.load_param_mem(model_param_data); } else { net.load_param(comment); } DataReaderFromEmpty dr; net.load_model(dr); const std::vector& input_names = net.input_names(); const std::vector& output_names = net.output_names(); if (g_enable_cooling_down) { // sleep 10 seconds for cooling down SOC :( ncnn::sleep(10 
* 1000); } if (input_names.size() > _in.size()) { fprintf(stderr, "input %zu tensors while model has %zu inputs\n", _in.size(), input_names.size()); return; } // initialize input for (size_t j = 0; j < input_names.size(); ++j) { ncnn::Mat in = _in[j]; in.fill(0.01f); } // warm up for (int i = 0; i < g_warmup_loop_count; i++) { ncnn::Extractor ex = net.create_extractor(); for (size_t j = 0; j < input_names.size(); ++j) { ncnn::Mat in = _in[j]; ex.input(input_names[j], in); } for (size_t j = 0; j < output_names.size(); ++j) { ncnn::Mat out; ex.extract(output_names[j], out); } } double time_min = DBL_MAX; double time_max = -DBL_MAX; double time_avg = 0; for (int i = 0; i < g_loop_count; i++) { double start = ncnn::get_current_time(); { ncnn::Extractor ex = net.create_extractor(); for (size_t j = 0; j < input_names.size(); ++j) { ncnn::Mat in = _in[j]; ex.input(input_names[j], in); } for (size_t j = 0; j < output_names.size(); ++j) { ncnn::Mat out; ex.extract(output_names[j], out); } } double end = ncnn::get_current_time(); double time = end - start; time_min = std::min(time_min, time); time_max = std::max(time_max, time); time_avg += time; } time_avg /= g_loop_count; fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", comment, time_min, time_max, time_avg); } void benchmark(const char* comment, const ncnn::Mat& _in, const ncnn::Option& opt, const char* model_param_data = NULL) { std::vector inputs; inputs.push_back(_in); return benchmark(comment, inputs, opt, model_param_data); } void show_usage() { fprintf(stderr, "Usage: benchncnn [loop count] [num threads] [powersave] [gpu device] [cooling down] [(key=value)...]\n"); fprintf(stderr, " param=model.param\n"); fprintf(stderr, " shape=[227,227,3],...\n"); } static std::vector parse_shape_list(char* s) { std::vector > shapes; std::vector mats; char* pch = strtok(s, "[]"); while (pch != NULL) { // parse a,b,c int v; int nconsumed = 0; int nscan = sscanf(pch, "%d%n", &v, &nconsumed); if (nscan == 1) { // ok we 
get shape pch += nconsumed; std::vector s; s.push_back(v); nscan = sscanf(pch, ",%d%n", &v, &nconsumed); while (nscan == 1) { pch += nconsumed; s.push_back(v); nscan = sscanf(pch, ",%d%n", &v, &nconsumed); } // shape end shapes.push_back(s); } pch = strtok(NULL, "[]"); } for (size_t i = 0; i < shapes.size(); ++i) { const std::vector& shape = shapes[i]; switch (shape.size()) { case 4: mats.push_back(ncnn::Mat(shape[0], shape[1], shape[2], shape[3])); break; case 3: mats.push_back(ncnn::Mat(shape[0], shape[1], shape[2])); break; case 2: mats.push_back(ncnn::Mat(shape[0], shape[1])); break; case 1: mats.push_back(ncnn::Mat(shape[0])); break; default: fprintf(stderr, "unsupported input shape size %zu\n", shape.size()); break; } } return mats; } int main(int argc, char** argv) { int loop_count = 4; int num_threads = ncnn::get_physical_big_cpu_count(); int powersave = 2; int gpu_device = -1; int cooling_down = 1; char* model = 0; std::vector inputs; for (int i = 1; i < argc; i++) { if (argv[i][0] == '-' && argv[i][1] == 'h') { show_usage(); return -1; } if (strcmp(argv[i], "--help") == 0) { show_usage(); return -1; } } if (argc >= 2) { loop_count = atoi(argv[1]); } if (argc >= 3) { num_threads = atoi(argv[2]); } if (argc >= 4) { powersave = atoi(argv[3]); } if (argc >= 5) { gpu_device = atoi(argv[4]); } if (argc >= 6) { cooling_down = atoi(argv[5]); } for (int i = 6; i < argc; i++) { // key=value char* kv = argv[i]; char* eqs = strchr(kv, '='); if (eqs == NULL) { fprintf(stderr, "unrecognized arg %s\n", kv); continue; } // split k v eqs[0] = '\0'; const char* key = kv; char* value = eqs + 1; if (strcmp(key, "param") == 0) model = value; if (strcmp(key, "shape") == 0) inputs = parse_shape_list(value); } if (model && inputs.empty()) { fprintf(stderr, "input tensor shape empty!\n"); return -1; } #ifdef __EMSCRIPTEN__ EM_ASM( FS.mkdir('/working'); FS.mount(NODEFS, {root: '.'}, '/working');); #endif // __EMSCRIPTEN__ bool use_vulkan_compute = gpu_device != -1; 
g_enable_cooling_down = cooling_down != 0; g_loop_count = loop_count; g_blob_pool_allocator.set_size_compare_ratio(0.f); g_workspace_pool_allocator.set_size_compare_ratio(0.f); #if NCNN_VULKAN if (use_vulkan_compute) { g_warmup_loop_count = 10; g_vkdev = ncnn::get_gpu_device(gpu_device); g_blob_vkallocator = new ncnn::VkBlobAllocator(g_vkdev); g_staging_vkallocator = new ncnn::VkStagingAllocator(g_vkdev); } #endif // NCNN_VULKAN ncnn::set_cpu_powersave(powersave); ncnn::set_omp_dynamic(0); ncnn::set_omp_num_threads(num_threads); // default option ncnn::Option opt; opt.lightmode = true; opt.num_threads = num_threads; opt.blob_allocator = &g_blob_pool_allocator; opt.workspace_allocator = &g_workspace_pool_allocator; #if NCNN_VULKAN opt.blob_vkallocator = g_blob_vkallocator; opt.workspace_vkallocator = g_blob_vkallocator; opt.staging_vkallocator = g_staging_vkallocator; #endif // NCNN_VULKAN opt.use_winograd_convolution = true; opt.use_sgemm_convolution = true; opt.use_int8_inference = true; opt.use_vulkan_compute = use_vulkan_compute; opt.use_fp16_packed = true; opt.use_fp16_storage = true; opt.use_fp16_arithmetic = true; opt.use_int8_storage = true; opt.use_int8_arithmetic = true; opt.use_packing_layout = true; fprintf(stderr, "loop_count = %d\n", g_loop_count); fprintf(stderr, "num_threads = %d\n", num_threads); fprintf(stderr, "powersave = %d\n", ncnn::get_cpu_powersave()); fprintf(stderr, "gpu_device = %d\n", gpu_device); fprintf(stderr, "cooling_down = %d\n", (int)g_enable_cooling_down); if (model != 0) { // run user defined benchmark benchmark(model, inputs, opt); } else { // run default cases benchmark("squeezenet", ncnn::Mat(227, 227, 3), opt, squeezenet_param_data); benchmark("squeezenet_int8", ncnn::Mat(227, 227, 3), opt, squeezenet_int8_param_data); benchmark("mobilenet", ncnn::Mat(224, 224, 3), opt, mobilenet_param_data); benchmark("mobilenet_int8", ncnn::Mat(224, 224, 3), opt, mobilenet_int8_param_data); benchmark("mobilenet_v2", ncnn::Mat(224, 224, 3), 
opt, mobilenet_v2_param_data); // benchmark("mobilenet_v2_int8", ncnn::Mat(224, 224, 3), opt, mobilenet_v2_int8_param_data); benchmark("mobilenet_v3", ncnn::Mat(224, 224, 3), opt, mobilenet_v3_param_data); benchmark("shufflenet", ncnn::Mat(224, 224, 3), opt, shufflenet_param_data); benchmark("shufflenet_v2", ncnn::Mat(224, 224, 3), opt, shufflenet_v2_param_data); benchmark("mnasnet", ncnn::Mat(224, 224, 3), opt, mnasnet_param_data); benchmark("proxylessnasnet", ncnn::Mat(224, 224, 3), opt, proxylessnasnet_param_data); benchmark("efficientnet_b0", ncnn::Mat(224, 224, 3), opt, efficientnet_b0_param_data); benchmark("efficientnetv2_b0", ncnn::Mat(224, 224, 3), opt, efficientnetv2_b0_param_data); benchmark("regnety_400m", ncnn::Mat(224, 224, 3), opt, regnety_400m_param_data); benchmark("blazeface", ncnn::Mat(128, 128, 3), opt, blazeface_param_data); benchmark("googlenet", ncnn::Mat(224, 224, 3), opt, googlenet_param_data); benchmark("googlenet_int8", ncnn::Mat(224, 224, 3), opt, googlenet_int8_param_data); benchmark("resnet18", ncnn::Mat(224, 224, 3), opt, resnet18_param_data); benchmark("resnet18_int8", ncnn::Mat(224, 224, 3), opt, resnet18_int8_param_data); benchmark("alexnet", ncnn::Mat(227, 227, 3), opt, alexnet_param_data); benchmark("vgg16", ncnn::Mat(224, 224, 3), opt, vgg16_param_data); benchmark("vgg16_int8", ncnn::Mat(224, 224, 3), opt, vgg16_int8_param_data); benchmark("resnet50", ncnn::Mat(224, 224, 3), opt, resnet50_param_data); benchmark("resnet50_int8", ncnn::Mat(224, 224, 3), opt, resnet50_int8_param_data); benchmark("squeezenet_ssd", ncnn::Mat(300, 300, 3), opt, squeezenet_ssd_param_data); benchmark("squeezenet_ssd_int8", ncnn::Mat(300, 300, 3), opt, squeezenet_ssd_int8_param_data); benchmark("mobilenet_ssd", ncnn::Mat(300, 300, 3), opt, mobilenet_ssd_param_data); benchmark("mobilenet_ssd_int8", ncnn::Mat(300, 300, 3), opt, mobilenet_ssd_int8_param_data); benchmark("mobilenet_yolo", ncnn::Mat(416, 416, 3), opt, mobilenet_yolo_param_data); 
benchmark("mobilenetv2_yolov3", ncnn::Mat(352, 352, 3), opt, mobilenetv2_yolov3_param_data); benchmark("yolov4-tiny", ncnn::Mat(416, 416, 3), opt, yolov4_tiny_param_data); benchmark("nanodet_m", ncnn::Mat(320, 320, 3), opt, nanodet_m_param_data); benchmark("yolo-fastest-1.1", ncnn::Mat(320, 320, 3), opt, yolo_fastest_1_1_param_data); benchmark("yolo-fastestv2", ncnn::Mat(352, 352, 3), opt, yolo_fastestv2_param_data); benchmark("vision_transformer", ncnn::Mat(384, 384, 3), opt, vision_transformer_param_data); benchmark("FastestDet", ncnn::Mat(352, 352, 3), opt, FastestDet_param_data); } #if NCNN_VULKAN delete g_blob_vkallocator; delete g_staging_vkallocator; #endif // NCNN_VULKAN return 0; } ================================================ FILE: benchmark/benchncnn_param_data.h.in ================================================ // Benchncnn Param Data header // // This file is auto-generated by cmake, don't edit it. @param_header_data@ ================================================ FILE: benchmark/blazeface.param ================================================ 7767517 101 117 Input data 0 1 data 0=128 1=128 2=3 Padding 75 1 1 data 75 0=1 1=2 2=1 3=2 4=0 5=0.000000e+00 7=0 8=0 Convolution 76 1 1 75 76 0=24 1=5 11=5 2=1 12=1 3=2 13=2 4=0 14=0 15=0 16=0 5=1 6=1800 ReLU 77 1 1 76 77 Split splitncnn_0 1 2 77 77_splitncnn_0 77_splitncnn_1 ConvolutionDepthWise 78 1 1 77_splitncnn_1 78 0=24 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=216 7=24 Convolution 79 1 1 78 79 0=24 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=576 BinaryOp 80 2 1 79 77_splitncnn_0 80 0=0 ReLU 81 1 1 80 81 Split splitncnn_1 1 2 81 81_splitncnn_0 81_splitncnn_1 Padding 82 1 1 81_splitncnn_1 82 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=4 ConvolutionDepthWise 83 1 1 81_splitncnn_0 83 0=24 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=216 7=24 Convolution 84 1 1 83 84 0=28 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=672 BinaryOp 85 2 1 84 82 85 0=0 ReLU 86 1 1 85 86 Split 
splitncnn_2 1 2 86 86_splitncnn_0 86_splitncnn_1 Padding 87 1 1 86_splitncnn_1 87 0=0 1=2 2=0 3=2 4=0 5=0.000000e+00 7=0 8=0 Pooling 88 1 1 86_splitncnn_0 88 0=0 1=2 11=2 2=2 12=2 3=0 13=0 14=0 15=0 5=1 Padding 89 1 1 88 89 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=4 ConvolutionDepthWise 90 1 1 87 90 0=28 1=3 11=3 2=1 12=1 3=2 13=2 4=0 14=0 15=0 16=0 5=1 6=252 7=28 Convolution 91 1 1 90 91 0=32 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=896 BinaryOp 92 2 1 91 89 92 0=0 ReLU 93 1 1 92 93 Split splitncnn_3 1 2 93 93_splitncnn_0 93_splitncnn_1 Padding 94 1 1 93_splitncnn_1 94 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=4 ConvolutionDepthWise 95 1 1 93_splitncnn_0 95 0=32 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=288 7=32 Convolution 96 1 1 95 96 0=36 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=1152 BinaryOp 97 2 1 96 94 97 0=0 ReLU 98 1 1 97 98 Split splitncnn_4 1 2 98 98_splitncnn_0 98_splitncnn_1 Padding 99 1 1 98_splitncnn_1 99 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=6 ConvolutionDepthWise 100 1 1 98_splitncnn_0 100 0=36 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=324 7=36 Convolution 101 1 1 100 101 0=42 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=1512 BinaryOp 102 2 1 101 99 102 0=0 ReLU 103 1 1 102 103 Split splitncnn_5 1 2 103 103_splitncnn_0 103_splitncnn_1 Padding 104 1 1 103_splitncnn_1 104 0=0 1=2 2=0 3=2 4=0 5=0.000000e+00 7=0 8=0 Pooling 105 1 1 103_splitncnn_0 105 0=0 1=2 11=2 2=2 12=2 3=0 13=0 14=0 15=0 5=1 Padding 106 1 1 105 106 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=6 ConvolutionDepthWise 107 1 1 104 107 0=42 1=3 11=3 2=1 12=1 3=2 13=2 4=0 14=0 15=0 16=0 5=1 6=378 7=42 Convolution 108 1 1 107 108 0=48 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=2016 BinaryOp 109 2 1 108 106 109 0=0 ReLU 110 1 1 109 110 Split splitncnn_6 1 2 110 110_splitncnn_0 110_splitncnn_1 Padding 111 1 1 110_splitncnn_1 111 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8 ConvolutionDepthWise 112 1 1 110_splitncnn_0 112 0=48 1=3 11=3 
2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=432 7=48 Convolution 113 1 1 112 113 0=56 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=2688 BinaryOp 114 2 1 113 111 114 0=0 ReLU 115 1 1 114 115 Split splitncnn_7 1 2 115 115_splitncnn_0 115_splitncnn_1 Padding 116 1 1 115_splitncnn_1 116 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8 ConvolutionDepthWise 117 1 1 115_splitncnn_0 117 0=56 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=504 7=56 Convolution 118 1 1 117 118 0=64 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=3584 BinaryOp 119 2 1 118 116 119 0=0 ReLU 120 1 1 119 120 Split splitncnn_8 1 2 120 120_splitncnn_0 120_splitncnn_1 Padding 121 1 1 120_splitncnn_1 121 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8 ConvolutionDepthWise 122 1 1 120_splitncnn_0 122 0=64 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=576 7=64 Convolution 123 1 1 122 123 0=72 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=4608 BinaryOp 124 2 1 123 121 124 0=0 ReLU 125 1 1 124 125 Split splitncnn_9 1 2 125 125_splitncnn_0 125_splitncnn_1 Padding 126 1 1 125_splitncnn_1 126 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8 ConvolutionDepthWise 127 1 1 125_splitncnn_0 127 0=72 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=648 7=72 Convolution 128 1 1 127 128 0=80 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=5760 BinaryOp 129 2 1 128 126 129 0=0 ReLU 130 1 1 129 130 Split splitncnn_10 1 2 130 130_splitncnn_0 130_splitncnn_1 Padding 131 1 1 130_splitncnn_1 131 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8 ConvolutionDepthWise 132 1 1 130_splitncnn_0 132 0=80 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=720 7=80 Convolution 133 1 1 132 133 0=88 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=7040 BinaryOp 134 2 1 133 131 134 0=0 ReLU 135 1 1 134 135 Split splitncnn_11 1 2 135 135_splitncnn_0 135_splitncnn_1 Padding 136 1 1 135_splitncnn_1 136 0=0 1=2 2=0 3=2 4=0 5=0.000000e+00 7=0 8=0 Pooling 137 1 1 135_splitncnn_0 137 0=0 1=2 11=2 2=2 12=2 3=0 13=0 14=0 15=0 5=1 
Padding 138 1 1 137 138 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8 ConvolutionDepthWise 139 1 1 136 139 0=88 1=3 11=3 2=1 12=1 3=2 13=2 4=0 14=0 15=0 16=0 5=1 6=792 7=88 Convolution 140 1 1 139 140 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=8448 BinaryOp 141 2 1 140 138 141 0=0 ReLU 142 1 1 141 142 Split splitncnn_12 1 2 142 142_splitncnn_0 142_splitncnn_1 ConvolutionDepthWise 143 1 1 142_splitncnn_1 143 0=96 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=864 7=96 Convolution 144 1 1 143 144 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=9216 BinaryOp 145 2 1 144 142_splitncnn_0 145 0=0 ReLU 146 1 1 145 146 Split splitncnn_13 1 2 146 146_splitncnn_0 146_splitncnn_1 ConvolutionDepthWise 147 1 1 146_splitncnn_1 147 0=96 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=864 7=96 Convolution 148 1 1 147 148 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=9216 BinaryOp 149 2 1 148 146_splitncnn_0 149 0=0 ReLU 150 1 1 149 150 Split splitncnn_14 1 2 150 150_splitncnn_0 150_splitncnn_1 ConvolutionDepthWise 151 1 1 150_splitncnn_1 151 0=96 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=864 7=96 Convolution 152 1 1 151 152 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=9216 BinaryOp 153 2 1 152 150_splitncnn_0 153 0=0 ReLU 154 1 1 153 154 Split splitncnn_15 1 2 154 154_splitncnn_0 154_splitncnn_1 ConvolutionDepthWise 155 1 1 154_splitncnn_1 155 0=96 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=864 7=96 Convolution 156 1 1 155 156 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=9216 BinaryOp 157 2 1 156 154_splitncnn_0 157 0=0 ReLU output 1 1 157 output ================================================ FILE: benchmark/efficientnet_b0.param ================================================ 7767517 200 225 Input input.1 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution Conv_0 1 1 data 362 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 Swish Mul_3 1 1 362 364 -23330=4,3,112,112,32 ConvolutionDepthWise 
Conv_4 1 1 364 366 -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 Swish Mul_7 1 1 366 368 -23330=4,3,112,112,32 Split splitncnn_0 1 2 368 368_splitncnn_0 368_splitncnn_1 -23330=8,3,112,112,32,3,112,112,32 Pooling GlobalAveragePool_8 1 1 368_splitncnn_1 369 -23330=4,1,32,1,1 0=1 4=1 InnerProduct Conv_9 1 1 369 370 -23330=4,1,8,1,1 0=8 1=1 2=256 Swish Mul_11 1 1 370 372 -23330=4,1,8,1,1 Convolution Conv_12 1 1 372 374 -23330=4,1,32,1,1 0=32 1=1 5=1 6=256 9=4 BinaryOp Mul_14 2 1 368_splitncnn_0 374 375 -23330=4,3,112,112,32 0=2 Convolution Conv_15 1 1 375 377 -23330=4,3,112,112,16 0=16 1=1 5=1 6=512 Convolution Conv_17 1 1 377 379 -23330=4,3,112,112,96 0=96 1=1 5=1 6=1536 Swish Mul_20 1 1 379 381 -23330=4,3,112,112,96 ConvolutionDepthWise Conv_21 1 1 381 383 -23330=4,3,56,56,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96 Swish Mul_24 1 1 383 385 -23330=4,3,56,56,96 Split splitncnn_1 1 2 385 385_splitncnn_0 385_splitncnn_1 -23330=8,3,56,56,96,3,56,56,96 Pooling GlobalAveragePool_25 1 1 385_splitncnn_1 386 -23330=4,1,96,1,1 0=1 4=1 InnerProduct Conv_26 1 1 386 387 -23330=4,1,4,1,1 0=4 1=1 2=384 Swish Mul_28 1 1 387 389 -23330=4,1,4,1,1 Convolution Conv_29 1 1 389 391 -23330=4,1,96,1,1 0=96 1=1 5=1 6=384 9=4 BinaryOp Mul_31 2 1 385_splitncnn_0 391 392 -23330=4,3,56,56,96 0=2 Convolution Conv_32 1 1 392 394 -23330=4,3,56,56,24 0=24 1=1 5=1 6=2304 Split splitncnn_2 1 2 394 394_splitncnn_0 394_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 Convolution Conv_34 1 1 394_splitncnn_1 396 -23330=4,3,56,56,144 0=144 1=1 5=1 6=3456 Swish Mul_37 1 1 396 398 -23330=4,3,56,56,144 ConvolutionDepthWise Conv_38 1 1 398 400 -23330=4,3,56,56,144 0=144 1=3 4=1 5=1 6=1296 7=144 Swish Mul_41 1 1 400 402 -23330=4,3,56,56,144 Split splitncnn_3 1 2 402 402_splitncnn_0 402_splitncnn_1 -23330=8,3,56,56,144,3,56,56,144 Pooling GlobalAveragePool_42 1 1 402_splitncnn_1 403 -23330=4,1,144,1,1 0=1 4=1 InnerProduct Conv_43 1 1 403 404 -23330=4,1,6,1,1 0=6 1=1 2=864 Swish Mul_45 1 1 404 406 -23330=4,1,6,1,1 
Convolution Conv_46 1 1 406 408 -23330=4,1,144,1,1 0=144 1=1 5=1 6=864 9=4 BinaryOp Mul_48 2 1 402_splitncnn_0 408 409 -23330=4,3,56,56,144 0=2 Convolution Conv_49 1 1 409 411 -23330=4,3,56,56,24 0=24 1=1 5=1 6=3456 BinaryOp Add_51 2 1 394_splitncnn_0 411 412 -23330=4,3,56,56,24 Convolution Conv_52 1 1 412 414 -23330=4,3,56,56,144 0=144 1=1 5=1 6=3456 Swish Mul_55 1 1 414 416 -23330=4,3,56,56,144 ConvolutionDepthWise Conv_56 1 1 416 418 -23330=4,3,28,28,144 0=144 1=5 3=2 4=2 5=1 6=3600 7=144 Swish Mul_59 1 1 418 420 -23330=4,3,28,28,144 Split splitncnn_4 1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,144,3,28,28,144 Pooling GlobalAveragePool_60 1 1 420_splitncnn_1 421 -23330=4,1,144,1,1 0=1 4=1 InnerProduct Conv_61 1 1 421 422 -23330=4,1,6,1,1 0=6 1=1 2=864 Swish Mul_63 1 1 422 424 -23330=4,1,6,1,1 Convolution Conv_64 1 1 424 426 -23330=4,1,144,1,1 0=144 1=1 5=1 6=864 9=4 BinaryOp Mul_66 2 1 420_splitncnn_0 426 427 -23330=4,3,28,28,144 0=2 Convolution Conv_67 1 1 427 429 -23330=4,3,28,28,40 0=40 1=1 5=1 6=5760 Split splitncnn_5 1 2 429 429_splitncnn_0 429_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 Convolution Conv_69 1 1 429_splitncnn_1 431 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600 Swish Mul_72 1 1 431 433 -23330=4,3,28,28,240 ConvolutionDepthWise Conv_73 1 1 433 435 -23330=4,3,28,28,240 0=240 1=5 4=2 5=1 6=6000 7=240 Swish Mul_76 1 1 435 437 -23330=4,3,28,28,240 Split splitncnn_6 1 2 437 437_splitncnn_0 437_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240 Pooling GlobalAveragePool_77 1 1 437_splitncnn_1 438 -23330=4,1,240,1,1 0=1 4=1 InnerProduct Conv_78 1 1 438 439 -23330=4,1,10,1,1 0=10 1=1 2=2400 Swish Mul_80 1 1 439 441 -23330=4,1,10,1,1 Convolution Conv_81 1 1 441 443 -23330=4,1,240,1,1 0=240 1=1 5=1 6=2400 9=4 BinaryOp Mul_83 2 1 437_splitncnn_0 443 444 -23330=4,3,28,28,240 0=2 Convolution Conv_84 1 1 444 446 -23330=4,3,28,28,40 0=40 1=1 5=1 6=9600 BinaryOp Add_86 2 1 429_splitncnn_0 446 447 -23330=4,3,28,28,40 Convolution Conv_87 1 1 447 449 
-23330=4,3,28,28,240 0=240 1=1 5=1 6=9600 Swish Mul_90 1 1 449 451 -23330=4,3,28,28,240 ConvolutionDepthWise Conv_91 1 1 451 453 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240 Swish Mul_94 1 1 453 455 -23330=4,3,14,14,240 Split splitncnn_7 1 2 455 455_splitncnn_0 455_splitncnn_1 -23330=8,3,14,14,240,3,14,14,240 Pooling GlobalAveragePool_95 1 1 455_splitncnn_1 456 -23330=4,1,240,1,1 0=1 4=1 InnerProduct Conv_96 1 1 456 457 -23330=4,1,10,1,1 0=10 1=1 2=2400 Swish Mul_98 1 1 457 459 -23330=4,1,10,1,1 Convolution Conv_99 1 1 459 461 -23330=4,1,240,1,1 0=240 1=1 5=1 6=2400 9=4 BinaryOp Mul_101 2 1 455_splitncnn_0 461 462 -23330=4,3,14,14,240 0=2 Convolution Conv_102 1 1 462 464 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 Split splitncnn_8 1 2 464 464_splitncnn_0 464_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution Conv_104 1 1 464_splitncnn_1 466 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 Swish Mul_107 1 1 466 468 -23330=4,3,14,14,480 ConvolutionDepthWise Conv_108 1 1 468 470 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480 Swish Mul_111 1 1 470 472 -23330=4,3,14,14,480 Split splitncnn_9 1 2 472 472_splitncnn_0 472_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 Pooling GlobalAveragePool_112 1 1 472_splitncnn_1 473 -23330=4,1,480,1,1 0=1 4=1 InnerProduct Conv_113 1 1 473 474 -23330=4,1,20,1,1 0=20 1=1 2=9600 Swish Mul_115 1 1 474 476 -23330=4,1,20,1,1 Convolution Conv_116 1 1 476 478 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4 BinaryOp Mul_118 2 1 472_splitncnn_0 478 479 -23330=4,3,14,14,480 0=2 Convolution Conv_119 1 1 479 481 -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400 BinaryOp Add_121 2 1 464_splitncnn_0 481 482 -23330=4,3,14,14,80 Split splitncnn_10 1 2 482 482_splitncnn_0 482_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution Conv_122 1 1 482_splitncnn_1 484 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 Swish Mul_125 1 1 484 486 -23330=4,3,14,14,480 ConvolutionDepthWise Conv_126 1 1 486 488 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480 Swish Mul_129 
1 1 488 490 -23330=4,3,14,14,480 Split splitncnn_11 1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 Pooling GlobalAveragePool_130 1 1 490_splitncnn_1 491 -23330=4,1,480,1,1 0=1 4=1 InnerProduct Conv_131 1 1 491 492 -23330=4,1,20,1,1 0=20 1=1 2=9600 Swish Mul_133 1 1 492 494 -23330=4,1,20,1,1 Convolution Conv_134 1 1 494 496 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4 BinaryOp Mul_136 2 1 490_splitncnn_0 496 497 -23330=4,3,14,14,480 0=2 Convolution Conv_137 1 1 497 499 -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400 BinaryOp Add_139 2 1 482_splitncnn_0 499 500 -23330=4,3,14,14,80 Convolution Conv_140 1 1 500 502 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 Swish Mul_143 1 1 502 504 -23330=4,3,14,14,480 ConvolutionDepthWise Conv_144 1 1 504 506 -23330=4,3,14,14,480 0=480 1=5 4=2 5=1 6=12000 7=480 Swish Mul_147 1 1 506 508 -23330=4,3,14,14,480 Split splitncnn_12 1 2 508 508_splitncnn_0 508_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 Pooling GlobalAveragePool_148 1 1 508_splitncnn_1 509 -23330=4,1,480,1,1 0=1 4=1 InnerProduct Conv_149 1 1 509 510 -23330=4,1,20,1,1 0=20 1=1 2=9600 Swish Mul_151 1 1 510 512 -23330=4,1,20,1,1 Convolution Conv_152 1 1 512 514 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4 BinaryOp Mul_154 2 1 508_splitncnn_0 514 515 -23330=4,3,14,14,480 0=2 Convolution Conv_155 1 1 515 517 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760 Split splitncnn_13 1 2 517 517_splitncnn_0 517_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 Convolution Conv_157 1 1 517_splitncnn_1 519 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264 Swish Mul_160 1 1 519 521 -23330=4,3,14,14,672 ConvolutionDepthWise Conv_161 1 1 521 523 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672 Swish Mul_164 1 1 523 525 -23330=4,3,14,14,672 Split splitncnn_14 1 2 525 525_splitncnn_0 525_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 Pooling GlobalAveragePool_165 1 1 525_splitncnn_1 526 -23330=4,1,672,1,1 0=1 4=1 InnerProduct Conv_166 1 1 526 527 -23330=4,1,28,1,1 0=28 1=1 2=18816 Swish 
Mul_168 1 1 527 529 -23330=4,1,28,1,1 Convolution Conv_169 1 1 529 531 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4 BinaryOp Mul_171 2 1 525_splitncnn_0 531 532 -23330=4,3,14,14,672 0=2 Convolution Conv_172 1 1 532 534 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264 BinaryOp Add_174 2 1 517_splitncnn_0 534 535 -23330=4,3,14,14,112 Split splitncnn_15 1 2 535 535_splitncnn_0 535_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 Convolution Conv_175 1 1 535_splitncnn_1 537 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264 Swish Mul_178 1 1 537 539 -23330=4,3,14,14,672 ConvolutionDepthWise Conv_179 1 1 539 541 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672 Swish Mul_182 1 1 541 543 -23330=4,3,14,14,672 Split splitncnn_16 1 2 543 543_splitncnn_0 543_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 Pooling GlobalAveragePool_183 1 1 543_splitncnn_1 544 -23330=4,1,672,1,1 0=1 4=1 InnerProduct Conv_184 1 1 544 545 -23330=4,1,28,1,1 0=28 1=1 2=18816 Swish Mul_186 1 1 545 547 -23330=4,1,28,1,1 Convolution Conv_187 1 1 547 549 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4 BinaryOp Mul_189 2 1 543_splitncnn_0 549 550 -23330=4,3,14,14,672 0=2 Convolution Conv_190 1 1 550 552 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264 BinaryOp Add_192 2 1 535_splitncnn_0 552 553 -23330=4,3,14,14,112 Convolution Conv_193 1 1 553 555 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264 Swish Mul_196 1 1 555 557 -23330=4,3,14,14,672 ConvolutionDepthWise Conv_197 1 1 557 559 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672 Swish Mul_200 1 1 559 561 -23330=4,3,7,7,672 Split splitncnn_17 1 2 561 561_splitncnn_0 561_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672 Pooling GlobalAveragePool_201 1 1 561_splitncnn_1 562 -23330=4,1,672,1,1 0=1 4=1 InnerProduct Conv_202 1 1 562 563 -23330=4,1,28,1,1 0=28 1=1 2=18816 Swish Mul_204 1 1 563 565 -23330=4,1,28,1,1 Convolution Conv_205 1 1 565 567 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4 BinaryOp Mul_207 2 1 561_splitncnn_0 567 568 -23330=4,3,7,7,672 0=2 Convolution Conv_208 1 1 568 570 
-23330=4,3,7,7,192 0=192 1=1 5=1 6=129024 Split splitncnn_18 1 2 570 570_splitncnn_0 570_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution Conv_210 1 1 570_splitncnn_1 572 -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 Swish Mul_213 1 1 572 574 -23330=4,3,7,7,1152 ConvolutionDepthWise Conv_214 1 1 574 576 -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152 Swish Mul_217 1 1 576 578 -23330=4,3,7,7,1152 Split splitncnn_19 1 2 578 578_splitncnn_0 578_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Pooling GlobalAveragePool_218 1 1 578_splitncnn_1 579 -23330=4,1,1152,1,1 0=1 4=1 InnerProduct Conv_219 1 1 579 580 -23330=4,1,48,1,1 0=48 1=1 2=55296 Swish Mul_221 1 1 580 582 -23330=4,1,48,1,1 Convolution Conv_222 1 1 582 584 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4 BinaryOp Mul_224 2 1 578_splitncnn_0 584 585 -23330=4,3,7,7,1152 0=2 Convolution Conv_225 1 1 585 587 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp Add_227 2 1 570_splitncnn_0 587 588 -23330=4,3,7,7,192 Split splitncnn_20 1 2 588 588_splitncnn_0 588_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution Conv_228 1 1 588_splitncnn_1 590 -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 Swish Mul_231 1 1 590 592 -23330=4,3,7,7,1152 ConvolutionDepthWise Conv_232 1 1 592 594 -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152 Swish Mul_235 1 1 594 596 -23330=4,3,7,7,1152 Split splitncnn_21 1 2 596 596_splitncnn_0 596_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Pooling GlobalAveragePool_236 1 1 596_splitncnn_1 597 -23330=4,1,1152,1,1 0=1 4=1 InnerProduct Conv_237 1 1 597 598 -23330=4,1,48,1,1 0=48 1=1 2=55296 Swish Mul_239 1 1 598 600 -23330=4,1,48,1,1 Convolution Conv_240 1 1 600 602 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4 BinaryOp Mul_242 2 1 596_splitncnn_0 602 603 -23330=4,3,7,7,1152 0=2 Convolution Conv_243 1 1 603 605 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp Add_245 2 1 588_splitncnn_0 605 606 -23330=4,3,7,7,192 Split splitncnn_22 1 2 606 606_splitncnn_0 606_splitncnn_1 
-23330=8,3,7,7,192,3,7,7,192 Convolution Conv_246 1 1 606_splitncnn_1 608 -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 Swish Mul_249 1 1 608 610 -23330=4,3,7,7,1152 ConvolutionDepthWise Conv_250 1 1 610 612 -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152 Swish Mul_253 1 1 612 614 -23330=4,3,7,7,1152 Split splitncnn_23 1 2 614 614_splitncnn_0 614_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Pooling GlobalAveragePool_254 1 1 614_splitncnn_1 615 -23330=4,1,1152,1,1 0=1 4=1 InnerProduct Conv_255 1 1 615 616 -23330=4,1,48,1,1 0=48 1=1 2=55296 Swish Mul_257 1 1 616 618 -23330=4,1,48,1,1 Convolution Conv_258 1 1 618 620 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4 BinaryOp Mul_260 2 1 614_splitncnn_0 620 621 -23330=4,3,7,7,1152 0=2 Convolution Conv_261 1 1 621 623 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp Add_263 2 1 606_splitncnn_0 623 624 -23330=4,3,7,7,192 Convolution Conv_264 1 1 624 626 -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 Swish Mul_267 1 1 626 628 -23330=4,3,7,7,1152 ConvolutionDepthWise Conv_268 1 1 628 630 -23330=4,3,7,7,1152 0=1152 1=3 4=1 5=1 6=10368 7=1152 Swish Mul_271 1 1 630 632 -23330=4,3,7,7,1152 Split splitncnn_24 1 2 632 632_splitncnn_0 632_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Pooling GlobalAveragePool_272 1 1 632_splitncnn_1 633 -23330=4,1,1152,1,1 0=1 4=1 InnerProduct Conv_273 1 1 633 634 -23330=4,1,48,1,1 0=48 1=1 2=55296 Swish Mul_275 1 1 634 636 -23330=4,1,48,1,1 Convolution Conv_276 1 1 636 638 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4 BinaryOp Mul_278 2 1 632_splitncnn_0 638 639 -23330=4,3,7,7,1152 0=2 Convolution Conv_279 1 1 639 641 -23330=4,3,7,7,320 0=320 1=1 5=1 6=368640 Convolution Conv_281 1 1 641 643 -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600 Swish Mul_284 1 1 643 645 -23330=4,3,7,7,1280 Pooling GlobalAveragePool_285 1 1 645 654 -23330=4,1,1280,1,1 0=1 4=1 InnerProduct Gemm_292 1 1 654 655 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000 Softmax prob 1 1 655 output -23330=4,1,1000,1,1 
================================================ FILE: benchmark/efficientnetv2_b0.param ================================================ 7767517 257 288 MemoryData 110:12 0 1 110:12 -23330=4,1,112,1,1 0=112 MemoryData 133:12 0 1 133:12 -23330=4,1,192,1,1 0=192 MemoryData 144:12 0 1 144:12 -23330=4,1,192,1,1 0=192 MemoryData 14:11 0 1 14:11 -23330=4,1,32,1,1 0=32 MemoryData 155:12 0 1 155:12 -23330=4,1,192,1,1 0=192 MemoryData 166:12 0 1 166:12 -23330=4,1,192,1,1 0=192 MemoryData 177:12 0 1 177:12 -23330=4,1,192,1,1 0=192 MemoryData 188:12 0 1 188:12 -23330=4,1,192,1,1 0=192 MemoryData 199:12 0 1 199:12 -23330=4,1,192,1,1 0=192 MemoryData 22:11 0 1 22:11 -23330=4,1,48,1,1 0=48 MemoryData 33:11 0 1 33:11 -23330=4,1,112,1,1 0=112 MemoryData 44:11 0 1 44:11 -23330=4,1,112,1,1 0=112 MemoryData 55:11 0 1 55:11 -23330=4,1,112,1,1 0=112 MemoryData 77:11 0 1 77:11 -23330=4,1,96,1,1 0=96 MemoryData 88:11 0 1 88:11 -23330=4,1,96,1,1 0=96 Input op_201 0 1 204:12 -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution op_202 1 1 204:12 206:12 -23330=4,3,112,112,32 0=32 1=3 3=2 4=-233 5=1 6=864 Swish op_203 1 1 206:12 208:12 -23330=4,3,112,112,32 Convolution op_204 1 1 208:12 210:12 -23330=4,3,112,112,16 0=16 1=3 4=-233 5=1 6=4608 Swish op_205 1 1 210:12 212:12_splitncnn_0 -23330=4,3,112,112,16 Convolution op_207 1 1 212:12_splitncnn_0 215:12 -23330=4,3,56,56,64 0=64 1=3 3=2 4=-233 5=1 6=9216 Swish op_208 1 1 215:12 217:12 -23330=4,3,56,56,64 Convolution op_209 1 1 217:12 219:12 -23330=4,3,56,56,32 0=32 1=1 4=-233 5=1 6=2048 Split splitncnn_1 1 2 219:12 219:12_splitncnn_0 219:12_splitncnn_1 -23330=8,3,56,56,32,3,56,56,32 Convolution op_210 1 1 219:12_splitncnn_1 221:12 -23330=4,3,56,56,128 0=128 1=3 4=-233 5=1 6=36864 Swish op_211 1 1 221:12 223:12 -23330=4,3,56,56,128 Convolution op_212 1 1 223:12 224:12 -23330=4,3,56,56,32 0=32 1=1 4=-233 6=4096 Eltwise op_213 2 1 219:12_splitncnn_0 224:12 225:12 -23330=4,3,56,56,32 0=1 BinaryOp op_214 2 1 225:12 14:11 226:12_splitncnn_0 
-23330=4,3,56,56,32 Convolution op_216 1 1 226:12_splitncnn_0 229:12 -23330=4,3,28,28,128 0=128 1=3 3=2 4=-233 5=1 6=36864 Swish op_217 1 1 229:12 231:12 -23330=4,3,28,28,128 Convolution op_218 1 1 231:12 233:12 -23330=4,3,28,28,48 0=48 1=1 4=-233 5=1 6=6144 Split splitncnn_3 1 2 233:12 233:12_splitncnn_0 233:12_splitncnn_1 -23330=8,3,28,28,48,3,28,28,48 Convolution op_219 1 1 233:12_splitncnn_1 235:12 -23330=4,3,28,28,192 0=192 1=3 4=-233 5=1 6=82944 Swish op_220 1 1 235:12 237:12 -23330=4,3,28,28,192 Convolution op_221 1 1 237:12 238:12 -23330=4,3,28,28,48 0=48 1=1 4=-233 6=9216 Eltwise op_222 2 1 233:12_splitncnn_0 238:12 239:12 -23330=4,3,28,28,48 0=1 BinaryOp op_223 2 1 239:12 22:11 240:12_splitncnn_0 -23330=4,3,28,28,48 Convolution op_225 1 1 240:12_splitncnn_0 243:12 -23330=4,3,28,28,192 0=192 1=1 4=-233 5=1 6=9216 Swish op_226 1 1 243:12 245:12 -23330=4,3,28,28,192 ConvolutionDepthWise op_227 1 1 245:12 248:12 -23330=4,3,14,14,192 0=192 1=3 3=2 4=-233 5=1 6=1728 7=192 Swish op_229 1 1 248:12 250:12 -23330=4,3,14,14,192 Split splitncnn_5 1 2 250:12 250:12_splitncnn_0 250:12_splitncnn_1 -23330=8,3,14,14,192,3,14,14,192 Reduction op_230 1 1 250:12_splitncnn_1 251:12 -23330=4,3,1,1,192 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_231 1 1 251:12 253:12 -23330=4,3,1,1,12 0=12 1=1 4=-233 5=1 6=2304 Swish op_232 1 1 253:12 255:12 -23330=4,3,1,1,12 Convolution op_233 1 1 255:12 258:12 -23330=4,3,1,1,192 0=192 1=1 4=-233 5=1 6=2304 9=4 BinaryOp op_235 2 1 250:12_splitncnn_0 258:12 259:12 -23330=4,3,14,14,192 0=2 Convolution op_236 1 1 259:12 261:12 -23330=4,3,14,14,96 0=96 1=1 4=-233 5=1 6=18432 Split splitncnn_6 1 2 261:12 261:12_splitncnn_0 261:12_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96 Convolution op_237 1 1 261:12_splitncnn_1 263:12 -23330=4,3,14,14,384 0=384 1=1 4=-233 5=1 6=36864 Swish op_238 1 1 263:12 265:12 -23330=4,3,14,14,384 ConvolutionDepthWise op_239 1 1 265:12 268:12 -23330=4,3,14,14,384 0=384 1=3 4=-233 5=1 6=3456 7=384 Swish op_241 1 1 268:12 
270:12 -23330=4,3,14,14,384 Split splitncnn_7 1 2 270:12 270:12_splitncnn_0 270:12_splitncnn_1 -23330=8,3,14,14,384,3,14,14,384 Reduction op_242 1 1 270:12_splitncnn_1 271:12 -23330=4,3,1,1,384 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_243 1 1 271:12 273:12 -23330=4,3,1,1,24 0=24 1=1 4=-233 5=1 6=9216 Swish op_244 1 1 273:12 275:12 -23330=4,3,1,1,24 Convolution op_245 1 1 275:12 278:12 -23330=4,3,1,1,384 0=384 1=1 4=-233 5=1 6=9216 9=4 BinaryOp op_247 2 1 270:12_splitncnn_0 278:12 279:12 -23330=4,3,14,14,384 0=2 Convolution op_248 1 1 279:12 280:12 -23330=4,3,14,14,96 0=96 1=1 4=-233 6=36864 Eltwise op_249 2 1 261:12_splitncnn_0 280:12 281:12 -23330=4,3,14,14,96 0=1 BinaryOp op_250 2 1 281:12 77:11 282:12 -23330=4,3,14,14,96 Split splitncnn_8 1 2 282:12 282:12_splitncnn_0 282:12_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96 Convolution op_251 1 1 282:12_splitncnn_1 284:12 -23330=4,3,14,14,384 0=384 1=1 4=-233 5=1 6=36864 Swish op_252 1 1 284:12 286:12 -23330=4,3,14,14,384 ConvolutionDepthWise op_253 1 1 286:12 289:12 -23330=4,3,14,14,384 0=384 1=3 4=-233 5=1 6=3456 7=384 Swish op_255 1 1 289:12 291:12 -23330=4,3,14,14,384 Split splitncnn_9 1 2 291:12 291:12_splitncnn_0 291:12_splitncnn_1 -23330=8,3,14,14,384,3,14,14,384 Reduction op_256 1 1 291:12_splitncnn_1 292:12 -23330=4,3,1,1,384 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_257 1 1 292:12 294:12 -23330=4,3,1,1,24 0=24 1=1 4=-233 5=1 6=9216 Swish op_258 1 1 294:12 296:12 -23330=4,3,1,1,24 Convolution op_259 1 1 296:12 299:12 -23330=4,3,1,1,384 0=384 1=1 4=-233 5=1 6=9216 9=4 BinaryOp op_261 2 1 291:12_splitncnn_0 299:12 300:12 -23330=4,3,14,14,384 0=2 Convolution op_262 1 1 300:12 301:12 -23330=4,3,14,14,96 0=96 1=1 4=-233 6=36864 Eltwise op_263 2 1 282:12_splitncnn_0 301:12 302:12 -23330=4,3,14,14,96 0=1 BinaryOp op_264 2 1 302:12 88:11 303:12 -23330=4,3,14,14,96 Convolution op_265 1 1 303:12 305:12 -23330=4,3,14,14,576 0=576 1=1 4=-233 5=1 6=55296 Swish op_266 1 1 305:12 307:12 -23330=4,3,14,14,576 
ConvolutionDepthWise op_267 1 1 307:12 310:12 -23330=4,3,14,14,576 0=576 1=3 4=-233 5=1 6=5184 7=576 Swish op_269 1 1 310:12 312:12 -23330=4,3,14,14,576 Split splitncnn_10 1 2 312:12 312:12_splitncnn_0 312:12_splitncnn_1 -23330=8,3,14,14,576,3,14,14,576 Reduction op_270 1 1 312:12_splitncnn_1 313:12 -23330=4,3,1,1,576 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_271 1 1 313:12 315:12 -23330=4,3,1,1,24 0=24 1=1 4=-233 5=1 6=13824 Swish op_272 1 1 315:12 317:12 -23330=4,3,1,1,24 Convolution op_273 1 1 317:12 320:12 -23330=4,3,1,1,576 0=576 1=1 4=-233 5=1 6=13824 9=4 BinaryOp op_275 2 1 312:12_splitncnn_0 320:12 321:12 -23330=4,3,14,14,576 0=2 Convolution op_276 1 1 321:12 323:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 5=1 6=64512 Split splitncnn_11 1 2 323:12 323:12_splitncnn_0 323:12_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 Convolution op_277 1 1 323:12_splitncnn_1 325:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264 Swish op_278 1 1 325:12 327:12 -23330=4,3,14,14,672 ConvolutionDepthWise op_279 1 1 327:12 330:12 -23330=4,3,14,14,672 0=672 1=3 4=-233 5=1 6=6048 7=672 Swish op_281 1 1 330:12 332:12 -23330=4,3,14,14,672 Split splitncnn_12 1 2 332:12 332:12_splitncnn_0 332:12_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 Reduction op_282 1 1 332:12_splitncnn_1 333:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_283 1 1 333:12 335:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816 Swish op_284 1 1 335:12 337:12 -23330=4,3,1,1,28 Convolution op_285 1 1 337:12 340:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4 BinaryOp op_287 2 1 332:12_splitncnn_0 340:12 341:12 -23330=4,3,14,14,672 0=2 Convolution op_288 1 1 341:12 342:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 6=75264 Eltwise op_289 2 1 323:12_splitncnn_0 342:12 343:12 -23330=4,3,14,14,112 0=1 BinaryOp op_290 2 1 343:12 110:12 344:12 -23330=4,3,14,14,112 Split splitncnn_13 1 2 344:12 344:12_splitncnn_0 344:12_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 Convolution op_291 1 1 344:12_splitncnn_1 
346:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264 Swish op_292 1 1 346:12 348:12 -23330=4,3,14,14,672 ConvolutionDepthWise op_293 1 1 348:12 351:12 -23330=4,3,14,14,672 0=672 1=3 4=-233 5=1 6=6048 7=672 Swish op_295 1 1 351:12 353:12 -23330=4,3,14,14,672 Split splitncnn_14 1 2 353:12 353:12_splitncnn_0 353:12_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 Reduction op_296 1 1 353:12_splitncnn_1 354:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_297 1 1 354:12 356:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816 Swish op_298 1 1 356:12 358:12 -23330=4,3,1,1,28 Convolution op_299 1 1 358:12 361:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4 BinaryOp op_301 2 1 353:12_splitncnn_0 361:12 362:12 -23330=4,3,14,14,672 0=2 Convolution op_302 1 1 362:12 363:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 6=75264 Eltwise op_303 2 1 363:12 344:12_splitncnn_0 364:12 -23330=4,3,14,14,112 0=1 BinaryOp op_304 2 1 364:12 33:11 365:12 -23330=4,3,14,14,112 Split splitncnn_15 1 2 365:12 365:12_splitncnn_0 365:12_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 Convolution op_305 1 1 365:12_splitncnn_1 367:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264 Swish op_306 1 1 367:12 369:12 -23330=4,3,14,14,672 ConvolutionDepthWise op_307 1 1 369:12 372:12 -23330=4,3,14,14,672 0=672 1=3 4=-233 5=1 6=6048 7=672 Swish op_309 1 1 372:12 374:12 -23330=4,3,14,14,672 Split splitncnn_16 1 2 374:12 374:12_splitncnn_0 374:12_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 Reduction op_310 1 1 374:12_splitncnn_1 375:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_311 1 1 375:12 377:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816 Swish op_312 1 1 377:12 379:12 -23330=4,3,1,1,28 Convolution op_313 1 1 379:12 382:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4 BinaryOp op_315 2 1 374:12_splitncnn_0 382:12 383:12 -23330=4,3,14,14,672 0=2 Convolution op_316 1 1 383:12 384:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 6=75264 Eltwise op_317 2 1 365:12_splitncnn_0 
384:12 385:12 -23330=4,3,14,14,112 0=1 BinaryOp op_318 2 1 385:12 44:11 386:12 -23330=4,3,14,14,112 Split splitncnn_17 1 2 386:12 386:12_splitncnn_0 386:12_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 Convolution op_319 1 1 386:12_splitncnn_1 388:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264 Swish op_320 1 1 388:12 390:12 -23330=4,3,14,14,672 ConvolutionDepthWise op_321 1 1 390:12 393:12 -23330=4,3,14,14,672 0=672 1=3 4=-233 5=1 6=6048 7=672 Swish op_323 1 1 393:12 395:12 -23330=4,3,14,14,672 Split splitncnn_18 1 2 395:12 395:12_splitncnn_0 395:12_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 Reduction op_324 1 1 395:12_splitncnn_1 396:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_325 1 1 396:12 398:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816 Swish op_326 1 1 398:12 400:12 -23330=4,3,1,1,28 Convolution op_327 1 1 400:12 403:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4 BinaryOp op_329 2 1 395:12_splitncnn_0 403:12 404:12 -23330=4,3,14,14,672 0=2 Convolution op_330 1 1 404:12 405:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 6=75264 Eltwise op_331 2 1 386:12_splitncnn_0 405:12 406:12 -23330=4,3,14,14,112 0=1 BinaryOp op_332 2 1 406:12 55:11 407:12_splitncnn_0 -23330=4,3,14,14,112 Convolution op_334 1 1 407:12_splitncnn_0 410:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264 Swish op_335 1 1 410:12 412:12 -23330=4,3,14,14,672 ConvolutionDepthWise op_336 1 1 412:12 415:12 -23330=4,3,7,7,672 0=672 1=3 3=2 4=-233 5=1 6=6048 7=672 Swish op_338 1 1 415:12 417:12 -23330=4,3,7,7,672 Split splitncnn_20 1 2 417:12 417:12_splitncnn_0 417:12_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672 Reduction op_339 1 1 417:12_splitncnn_1 418:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_340 1 1 418:12 420:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816 Swish op_341 1 1 420:12 422:12 -23330=4,3,1,1,28 Convolution op_342 1 1 422:12 425:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4 BinaryOp op_344 2 1 417:12_splitncnn_0 425:12 
426:12 -23330=4,3,7,7,672 0=2 Convolution op_345 1 1 426:12 428:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 5=1 6=129024 Split splitncnn_21 1 2 428:12 428:12_splitncnn_0 428:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution op_346 1 1 428:12_splitncnn_1 430:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184 Swish op_347 1 1 430:12 432:12 -23330=4,3,7,7,1152 ConvolutionDepthWise op_348 1 1 432:12 435:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152 Swish op_350 1 1 435:12 437:12 -23330=4,3,7,7,1152 Split splitncnn_22 1 2 437:12 437:12_splitncnn_0 437:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Reduction op_351 1 1 437:12_splitncnn_1 438:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_352 1 1 438:12 440:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296 Swish op_353 1 1 440:12 442:12 -23330=4,3,1,1,48 Convolution op_354 1 1 442:12 445:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4 BinaryOp op_356 2 1 437:12_splitncnn_0 445:12 446:12 -23330=4,3,7,7,1152 0=2 Convolution op_357 1 1 446:12 447:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184 Eltwise op_358 2 1 428:12_splitncnn_0 447:12 448:12 -23330=4,3,7,7,192 0=1 BinaryOp op_359 2 1 448:12 133:12 449:12 -23330=4,3,7,7,192 Split splitncnn_23 1 2 449:12 449:12_splitncnn_0 449:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution op_360 1 1 449:12_splitncnn_1 451:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184 Swish op_361 1 1 451:12 453:12 -23330=4,3,7,7,1152 ConvolutionDepthWise op_362 1 1 453:12 456:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152 Swish op_364 1 1 456:12 458:12 -23330=4,3,7,7,1152 Split splitncnn_24 1 2 458:12 458:12_splitncnn_0 458:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Reduction op_365 1 1 458:12_splitncnn_1 459:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_366 1 1 459:12 461:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296 Swish op_367 1 1 461:12 463:12 -23330=4,3,1,1,48 Convolution op_368 1 1 463:12 466:12 
-23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4 BinaryOp op_370 2 1 458:12_splitncnn_0 466:12 467:12 -23330=4,3,7,7,1152 0=2 Convolution op_371 1 1 467:12 468:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184 Eltwise op_372 2 1 449:12_splitncnn_0 468:12 469:12 -23330=4,3,7,7,192 0=1 BinaryOp op_373 2 1 469:12 144:12 470:12 -23330=4,3,7,7,192 Split splitncnn_25 1 2 470:12 470:12_splitncnn_0 470:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution op_374 1 1 470:12_splitncnn_1 472:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184 Swish op_375 1 1 472:12 474:12 -23330=4,3,7,7,1152 ConvolutionDepthWise op_376 1 1 474:12 477:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152 Swish op_378 1 1 477:12 479:12 -23330=4,3,7,7,1152 Split splitncnn_26 1 2 479:12 479:12_splitncnn_0 479:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Reduction op_379 1 1 479:12_splitncnn_1 480:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_380 1 1 480:12 482:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296 Swish op_381 1 1 482:12 484:12 -23330=4,3,1,1,48 Convolution op_382 1 1 484:12 487:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4 BinaryOp op_384 2 1 479:12_splitncnn_0 487:12 488:12 -23330=4,3,7,7,1152 0=2 Convolution op_385 1 1 488:12 489:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184 Eltwise op_386 2 1 470:12_splitncnn_0 489:12 490:12 -23330=4,3,7,7,192 0=1 BinaryOp op_387 2 1 490:12 155:12 491:12 -23330=4,3,7,7,192 Split splitncnn_27 1 2 491:12 491:12_splitncnn_0 491:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution op_388 1 1 491:12_splitncnn_1 493:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184 Swish op_389 1 1 493:12 495:12 -23330=4,3,7,7,1152 ConvolutionDepthWise op_390 1 1 495:12 498:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152 Swish op_392 1 1 498:12 500:12 -23330=4,3,7,7,1152 Split splitncnn_28 1 2 500:12 500:12_splitncnn_0 500:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Reduction op_393 1 1 
500:12_splitncnn_1 501:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_394 1 1 501:12 503:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296 Swish op_395 1 1 503:12 505:12 -23330=4,3,1,1,48 Convolution op_396 1 1 505:12 508:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4 BinaryOp op_398 2 1 500:12_splitncnn_0 508:12 509:12 -23330=4,3,7,7,1152 0=2 Convolution op_399 1 1 509:12 510:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184 Eltwise op_400 2 1 491:12_splitncnn_0 510:12 511:12 -23330=4,3,7,7,192 0=1 BinaryOp op_401 2 1 511:12 166:12 512:12 -23330=4,3,7,7,192 Split splitncnn_29 1 2 512:12 512:12_splitncnn_0 512:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution op_402 1 1 512:12_splitncnn_1 514:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184 Swish op_403 1 1 514:12 516:12 -23330=4,3,7,7,1152 ConvolutionDepthWise op_404 1 1 516:12 519:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152 Swish op_406 1 1 519:12 521:12 -23330=4,3,7,7,1152 Split splitncnn_30 1 2 521:12 521:12_splitncnn_0 521:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Reduction op_407 1 1 521:12_splitncnn_1 522:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_408 1 1 522:12 524:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296 Swish op_409 1 1 524:12 526:12 -23330=4,3,1,1,48 Convolution op_410 1 1 526:12 529:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4 BinaryOp op_412 2 1 521:12_splitncnn_0 529:12 530:12 -23330=4,3,7,7,1152 0=2 Convolution op_413 1 1 530:12 531:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184 Eltwise op_414 2 1 512:12_splitncnn_0 531:12 532:12 -23330=4,3,7,7,192 0=1 BinaryOp op_415 2 1 532:12 177:12 533:12 -23330=4,3,7,7,192 Split splitncnn_31 1 2 533:12 533:12_splitncnn_0 533:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution op_416 1 1 533:12_splitncnn_1 535:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184 Swish op_417 1 1 535:12 537:12 -23330=4,3,7,7,1152 ConvolutionDepthWise op_418 1 1 537:12 540:12 
-23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152 Swish op_420 1 1 540:12 542:12 -23330=4,3,7,7,1152 Split splitncnn_32 1 2 542:12 542:12_splitncnn_0 542:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Reduction op_421 1 1 542:12_splitncnn_1 543:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_422 1 1 543:12 545:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296 Swish op_423 1 1 545:12 547:12 -23330=4,3,1,1,48 Convolution op_424 1 1 547:12 550:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4 BinaryOp op_426 2 1 542:12_splitncnn_0 550:12 551:12 -23330=4,3,7,7,1152 0=2 Convolution op_427 1 1 551:12 552:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184 Eltwise op_428 2 1 533:12_splitncnn_0 552:12 553:12 -23330=4,3,7,7,192 0=1 BinaryOp op_429 2 1 553:12 188:12 554:12 -23330=4,3,7,7,192 Split splitncnn_33 1 2 554:12 554:12_splitncnn_0 554:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution op_430 1 1 554:12_splitncnn_1 556:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184 Swish op_431 1 1 556:12 558:12 -23330=4,3,7,7,1152 ConvolutionDepthWise op_432 1 1 558:12 561:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152 Swish op_434 1 1 561:12 563:12 -23330=4,3,7,7,1152 Split splitncnn_34 1 2 563:12 563:12_splitncnn_0 563:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152 Reduction op_435 1 1 563:12_splitncnn_1 564:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1 Convolution op_436 1 1 564:12 566:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296 Swish op_437 1 1 566:12 568:12 -23330=4,3,1,1,48 Convolution op_438 1 1 568:12 571:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4 BinaryOp op_440 2 1 563:12_splitncnn_0 571:12 572:12 -23330=4,3,7,7,1152 0=2 Convolution op_441 1 1 572:12 573:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184 Eltwise op_442 2 1 554:12_splitncnn_0 573:12 574:12 -23330=4,3,7,7,192 0=1 BinaryOp op_443 2 1 574:12 199:12 575:12_splitncnn_0 -23330=4,3,7,7,192 Convolution op_445 1 1 575:12_splitncnn_0 578:12 
-23330=4,3,7,7,1280 0=1280 1=1 4=-233 5=1 6=245760 Swish op_446 1 1 578:12 580:12 -23330=4,3,7,7,1280 Pooling op_447 1 1 580:12 581:12 -23330=4,1,1280,1,1 0=1 4=1 InnerProduct op_448 1 1 581:12 584:12 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000 ================================================ FILE: benchmark/googlenet.param ================================================ 7767517 94 121 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution conv1/7x7_s2 1 1 data conv1/7x7_s2_conv1/relu_7x7 -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 9=1 Pooling pool1/3x3_s2 1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 -23330=4,3,56,56,64 1=3 2=2 LRN pool1/norm1 1 1 pool1/3x3_s2 pool1/norm1 -23330=4,3,56,56,64 2=1.000000e-04 Convolution conv2/3x3_reduce 1 1 pool1/norm1 conv2/3x3_reduce_conv2/relu_3x3_reduce -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 9=1 Convolution conv2/3x3 1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3_conv2/relu_3x3 -23330=4,3,56,56,192 0=192 1=3 4=1 5=1 6=110592 9=1 LRN conv2/norm2 1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 -23330=4,3,56,56,192 2=1.000000e-04 Pooling pool2/3x3_s2 1 1 conv2/norm2 pool2/3x3_s2 -23330=4,3,28,28,192 1=3 2=2 Split splitncnn_0 1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3 -23330=16,3,28,28,192,3,28,28,192,3,28,28,192,3,28,28,192 Convolution inception_3a/1x1 1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1_inception_3a/relu_1x1 -23330=4,3,28,28,64 0=64 1=1 5=1 6=12288 9=1 Convolution inception_3a/3x3_reduce 1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce -23330=4,3,28,28,96 0=96 1=1 5=1 6=18432 9=1 Convolution inception_3a/3x3 1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3_inception_3a/relu_3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=110592 9=1 Convolution inception_3a/5x5_reduce 1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce -23330=4,3,28,28,16 0=16 1=1 
5=1 6=3072 9=1 Convolution inception_3a/5x5 1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5_inception_3a/relu_5x5 -23330=4,3,28,28,32 0=32 1=5 4=2 5=1 6=12800 9=1 Pooling inception_3a/pool 1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool -23330=4,3,28,28,192 1=3 3=1 Convolution inception_3a/pool_proj 1 1 inception_3a/pool inception_3a/pool_proj_inception_3a/relu_pool_proj -23330=4,3,28,28,32 0=32 1=1 5=1 6=6144 9=1 Concat inception_3a/output 4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output -23330=4,3,28,28,256 Split splitncnn_1 1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3 -23330=16,3,28,28,256,3,28,28,256,3,28,28,256,3,28,28,256 Convolution inception_3b/1x1 1 1 inception_3a/output_splitncnn_3 inception_3b/1x1_inception_3b/relu_1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=32768 9=1 Convolution inception_3b/3x3_reduce 1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce -23330=4,3,28,28,128 0=128 1=1 5=1 6=32768 9=1 Convolution inception_3b/3x3 1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3_inception_3b/relu_3x3 -23330=4,3,28,28,192 0=192 1=3 4=1 5=1 6=221184 9=1 Convolution inception_3b/5x5_reduce 1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce -23330=4,3,28,28,32 0=32 1=1 5=1 6=8192 9=1 Convolution inception_3b/5x5 1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5_inception_3b/relu_5x5 -23330=4,3,28,28,96 0=96 1=5 4=2 5=1 6=76800 9=1 Pooling inception_3b/pool 1 1 inception_3a/output_splitncnn_0 inception_3b/pool -23330=4,3,28,28,256 1=3 3=1 Convolution inception_3b/pool_proj 1 1 inception_3b/pool inception_3b/pool_proj_inception_3b/relu_pool_proj -23330=4,3,28,28,64 0=64 1=1 
5=1 6=16384 9=1 Concat inception_3b/output 4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output -23330=4,3,28,28,480 Pooling pool3/3x3_s2 1 1 inception_3b/output pool3/3x3_s2 -23330=4,3,14,14,480 1=3 2=2 Split splitncnn_2 1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3 -23330=16,3,14,14,480,3,14,14,480,3,14,14,480,3,14,14,480 Convolution inception_4a/1x1 1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1_inception_4a/relu_1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=92160 9=1 Convolution inception_4a/3x3_reduce 1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce -23330=4,3,14,14,96 0=96 1=1 5=1 6=46080 9=1 Convolution inception_4a/3x3 1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3_inception_4a/relu_3x3 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=179712 9=1 Convolution inception_4a/5x5_reduce 1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce -23330=4,3,14,14,16 0=16 1=1 5=1 6=7680 9=1 Convolution inception_4a/5x5 1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5_inception_4a/relu_5x5 -23330=4,3,14,14,48 0=48 1=5 4=2 5=1 6=19200 9=1 Pooling inception_4a/pool 1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool -23330=4,3,14,14,480 1=3 3=1 Convolution inception_4a/pool_proj 1 1 inception_4a/pool inception_4a/pool_proj_inception_4a/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=30720 9=1 Concat inception_4a/output 4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output -23330=4,3,14,14,512 Split splitncnn_3 1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 
inception_4a/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512 Convolution inception_4b/1x1 1 1 inception_4a/output_splitncnn_3 inception_4b/1x1_inception_4b/relu_1x1 -23330=4,3,14,14,160 0=160 1=1 5=1 6=81920 9=1 Convolution inception_4b/3x3_reduce 1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce -23330=4,3,14,14,112 0=112 1=1 5=1 6=57344 9=1 Convolution inception_4b/3x3 1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3_inception_4b/relu_3x3 -23330=4,3,14,14,224 0=224 1=3 4=1 5=1 6=225792 9=1 Convolution inception_4b/5x5_reduce 1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce -23330=4,3,14,14,24 0=24 1=1 5=1 6=12288 9=1 Convolution inception_4b/5x5 1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5_inception_4b/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=38400 9=1 Pooling inception_4b/pool 1 1 inception_4a/output_splitncnn_0 inception_4b/pool -23330=4,3,14,14,512 1=3 3=1 Convolution inception_4b/pool_proj 1 1 inception_4b/pool inception_4b/pool_proj_inception_4b/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 9=1 Concat inception_4b/output 4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output -23330=4,3,14,14,512 Split splitncnn_4 1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512 Convolution inception_4c/1x1 1 1 inception_4b/output_splitncnn_3 inception_4c/1x1_inception_4c/relu_1x1 -23330=4,3,14,14,128 0=128 1=1 5=1 6=65536 9=1 Convolution inception_4c/3x3_reduce 1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce -23330=4,3,14,14,128 0=128 1=1 5=1 6=65536 9=1 Convolution 
inception_4c/3x3 1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3_inception_4c/relu_3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=294912 9=1 Convolution inception_4c/5x5_reduce 1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce -23330=4,3,14,14,24 0=24 1=1 5=1 6=12288 9=1 Convolution inception_4c/5x5 1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5_inception_4c/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=38400 9=1 Pooling inception_4c/pool 1 1 inception_4b/output_splitncnn_0 inception_4c/pool -23330=4,3,14,14,512 1=3 3=1 Convolution inception_4c/pool_proj 1 1 inception_4c/pool inception_4c/pool_proj_inception_4c/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 9=1 Concat inception_4c/output 4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output -23330=4,3,14,14,512 Split splitncnn_5 1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512 Convolution inception_4d/1x1 1 1 inception_4c/output_splitncnn_3 inception_4d/1x1_inception_4d/relu_1x1 -23330=4,3,14,14,112 0=112 1=1 5=1 6=57344 9=1 Convolution inception_4d/3x3_reduce 1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce -23330=4,3,14,14,144 0=144 1=1 5=1 6=73728 9=1 Convolution inception_4d/3x3 1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3_inception_4d/relu_3x3 -23330=4,3,14,14,288 0=288 1=3 4=1 5=1 6=373248 9=1 Convolution inception_4d/5x5_reduce 1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce -23330=4,3,14,14,32 0=32 1=1 5=1 6=16384 9=1 Convolution inception_4d/5x5 1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce 
inception_4d/5x5_inception_4d/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=51200 9=1 Pooling inception_4d/pool 1 1 inception_4c/output_splitncnn_0 inception_4d/pool -23330=4,3,14,14,512 1=3 3=1 Convolution inception_4d/pool_proj 1 1 inception_4d/pool inception_4d/pool_proj_inception_4d/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 9=1 Concat inception_4d/output 4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output -23330=4,3,14,14,528 Split splitncnn_6 1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3 -23330=16,3,14,14,528,3,14,14,528,3,14,14,528,3,14,14,528 Convolution inception_4e/1x1 1 1 inception_4d/output_splitncnn_3 inception_4e/1x1_inception_4e/relu_1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=135168 9=1 Convolution inception_4e/3x3_reduce 1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce -23330=4,3,14,14,160 0=160 1=1 5=1 6=84480 9=1 Convolution inception_4e/3x3 1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3_inception_4e/relu_3x3 -23330=4,3,14,14,320 0=320 1=3 4=1 5=1 6=460800 9=1 Convolution inception_4e/5x5_reduce 1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce -23330=4,3,14,14,32 0=32 1=1 5=1 6=16896 9=1 Convolution inception_4e/5x5 1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5_inception_4e/relu_5x5 -23330=4,3,14,14,128 0=128 1=5 4=2 5=1 6=102400 9=1 Pooling inception_4e/pool 1 1 inception_4d/output_splitncnn_0 inception_4e/pool -23330=4,3,14,14,528 1=3 3=1 Convolution inception_4e/pool_proj 1 1 inception_4e/pool inception_4e/pool_proj_inception_4e/relu_pool_proj -23330=4,3,14,14,128 0=128 1=1 5=1 6=67584 9=1 Concat inception_4e/output 4 1 inception_4e/1x1_inception_4e/relu_1x1 
inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output -23330=4,3,14,14,832 Pooling pool4/3x3_s2 1 1 inception_4e/output pool4/3x3_s2 -23330=4,3,7,7,832 1=3 2=2 Split splitncnn_7 1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3 -23330=16,3,7,7,832,3,7,7,832,3,7,7,832,3,7,7,832 Convolution inception_5a/1x1 1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1_inception_5a/relu_1x1 -23330=4,3,7,7,256 0=256 1=1 5=1 6=212992 9=1 Convolution inception_5a/3x3_reduce 1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce -23330=4,3,7,7,160 0=160 1=1 5=1 6=133120 9=1 Convolution inception_5a/3x3 1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3_inception_5a/relu_3x3 -23330=4,3,7,7,320 0=320 1=3 4=1 5=1 6=460800 9=1 Convolution inception_5a/5x5_reduce 1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce -23330=4,3,7,7,32 0=32 1=1 5=1 6=26624 9=1 Convolution inception_5a/5x5 1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5_inception_5a/relu_5x5 -23330=4,3,7,7,128 0=128 1=5 4=2 5=1 6=102400 9=1 Pooling inception_5a/pool 1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool -23330=4,3,7,7,832 1=3 3=1 Convolution inception_5a/pool_proj 1 1 inception_5a/pool inception_5a/pool_proj_inception_5a/relu_pool_proj -23330=4,3,7,7,128 0=128 1=1 5=1 6=106496 9=1 Concat inception_5a/output 4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output -23330=4,3,7,7,832 Split splitncnn_8 1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3 -23330=16,3,7,7,832,3,7,7,832,3,7,7,832,3,7,7,832 Convolution inception_5b/1x1 
1 1 inception_5a/output_splitncnn_3 inception_5b/1x1_inception_5b/relu_1x1 -23330=4,3,7,7,384 0=384 1=1 5=1 6=319488 9=1 Convolution inception_5b/3x3_reduce 1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce -23330=4,3,7,7,192 0=192 1=1 5=1 6=159744 9=1 Convolution inception_5b/3x3 1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3_inception_5b/relu_3x3 -23330=4,3,7,7,384 0=384 1=3 4=1 5=1 6=663552 9=1 Convolution inception_5b/5x5_reduce 1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce -23330=4,3,7,7,48 0=48 1=1 5=1 6=39936 9=1 Convolution inception_5b/5x5 1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5_inception_5b/relu_5x5 -23330=4,3,7,7,128 0=128 1=5 4=2 5=1 6=153600 9=1 Pooling inception_5b/pool 1 1 inception_5a/output_splitncnn_0 inception_5b/pool -23330=4,3,7,7,832 1=3 3=1 Convolution inception_5b/pool_proj 1 1 inception_5b/pool inception_5b/pool_proj_inception_5b/relu_pool_proj -23330=4,3,7,7,128 0=128 1=1 5=1 6=106496 9=1 Concat inception_5b/output 4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output -23330=4,3,7,7,1024 Pooling pool5/7x7_s1 1 1 inception_5b/output pool5/7x7_s1_pool5/drop_7x7_s1 -23330=4,3,1,1,1024 0=1 1=7 InnerProduct loss3/classifier 1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier -23330=4,1,1000,1,1 0=1000 1=1 2=1024000 Softmax prob 1 1 loss3/classifier output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/googlenet_int8.param ================================================ 7767517 94 121 Input data 0 1 data 0=224 1=224 2=3 Convolution conv1/7x7_s2 1 1 data conv1/7x7_s2_conv1/relu_7x7 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1 Pooling pool1/3x3_s2 1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 1=3 2=2 LRN pool1/norm1 1 1 pool1/3x3_s2 
pool1/norm1 2=0.000100 Convolution conv2/3x3_reduce 1 1 pool1/norm1 conv2/3x3_reduce_conv2/relu_3x3_reduce 0=64 1=1 5=1 6=4096 8=102 9=1 Convolution conv2/3x3 1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3_conv2/relu_3x3 0=192 1=3 4=1 5=1 6=110592 8=2 9=1 LRN conv2/norm2 1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 2=0.000100 Pooling pool2/3x3_s2 1 1 conv2/norm2 pool2/3x3_s2 1=3 2=2 Split splitncnn_0 1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3 Convolution inception_3a/1x1 1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1_inception_3a/relu_1x1 0=64 1=1 5=1 6=12288 8=2 9=1 Convolution inception_3a/3x3_reduce 1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce 0=96 1=1 5=1 6=18432 8=102 9=1 Convolution inception_3a/3x3 1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3_inception_3a/relu_3x3 0=128 1=3 4=1 5=1 6=110592 8=2 9=1 Convolution inception_3a/5x5_reduce 1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce 0=16 1=1 5=1 6=3072 8=102 9=1 Convolution inception_3a/5x5 1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5_inception_3a/relu_5x5 0=32 1=5 4=2 5=1 6=12800 8=2 9=1 Pooling inception_3a/pool 1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool 1=3 3=1 Convolution inception_3a/pool_proj 1 1 inception_3a/pool inception_3a/pool_proj_inception_3a/relu_pool_proj 0=32 1=1 5=1 6=6144 8=2 9=1 Concat inception_3a/output 4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output Split splitncnn_1 1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3 Convolution inception_3b/1x1 1 1 inception_3a/output_splitncnn_3 inception_3b/1x1_inception_3b/relu_1x1 0=128 1=1 5=1 6=32768 8=2 
9=1 Convolution inception_3b/3x3_reduce 1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce 0=128 1=1 5=1 6=32768 8=102 9=1 Convolution inception_3b/3x3 1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3_inception_3b/relu_3x3 0=192 1=3 4=1 5=1 6=221184 8=2 9=1 Convolution inception_3b/5x5_reduce 1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce 0=32 1=1 5=1 6=8192 8=102 9=1 Convolution inception_3b/5x5 1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5_inception_3b/relu_5x5 0=96 1=5 4=2 5=1 6=76800 8=2 9=1 Pooling inception_3b/pool 1 1 inception_3a/output_splitncnn_0 inception_3b/pool 1=3 3=1 Convolution inception_3b/pool_proj 1 1 inception_3b/pool inception_3b/pool_proj_inception_3b/relu_pool_proj 0=64 1=1 5=1 6=16384 8=2 9=1 Concat inception_3b/output 4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output Pooling pool3/3x3_s2 1 1 inception_3b/output pool3/3x3_s2 1=3 2=2 Split splitncnn_2 1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3 Convolution inception_4a/1x1 1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1_inception_4a/relu_1x1 0=192 1=1 5=1 6=92160 8=2 9=1 Convolution inception_4a/3x3_reduce 1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce 0=96 1=1 5=1 6=46080 8=102 9=1 Convolution inception_4a/3x3 1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3_inception_4a/relu_3x3 0=208 1=3 4=1 5=1 6=179712 8=2 9=1 Convolution inception_4a/5x5_reduce 1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce 0=16 1=1 5=1 6=7680 8=102 9=1 Convolution inception_4a/5x5 1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5_inception_4a/relu_5x5 0=48 1=5 
4=2 5=1 6=19200 8=2 9=1 Pooling inception_4a/pool 1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool 1=3 3=1 Convolution inception_4a/pool_proj 1 1 inception_4a/pool inception_4a/pool_proj_inception_4a/relu_pool_proj 0=64 1=1 5=1 6=30720 8=2 9=1 Concat inception_4a/output 4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output Split splitncnn_3 1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3 Convolution inception_4b/1x1 1 1 inception_4a/output_splitncnn_3 inception_4b/1x1_inception_4b/relu_1x1 0=160 1=1 5=1 6=81920 8=2 9=1 Convolution inception_4b/3x3_reduce 1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce 0=112 1=1 5=1 6=57344 8=102 9=1 Convolution inception_4b/3x3 1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3_inception_4b/relu_3x3 0=224 1=3 4=1 5=1 6=225792 8=2 9=1 Convolution inception_4b/5x5_reduce 1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce 0=24 1=1 5=1 6=12288 8=102 9=1 Convolution inception_4b/5x5 1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5_inception_4b/relu_5x5 0=64 1=5 4=2 5=1 6=38400 8=2 9=1 Pooling inception_4b/pool 1 1 inception_4a/output_splitncnn_0 inception_4b/pool 1=3 3=1 Convolution inception_4b/pool_proj 1 1 inception_4b/pool inception_4b/pool_proj_inception_4b/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1 Concat inception_4b/output 4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output Split splitncnn_4 1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 
inception_4b/output_splitncnn_3 Convolution inception_4c/1x1 1 1 inception_4b/output_splitncnn_3 inception_4c/1x1_inception_4c/relu_1x1 0=128 1=1 5=1 6=65536 8=2 9=1 Convolution inception_4c/3x3_reduce 1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce 0=128 1=1 5=1 6=65536 8=102 9=1 Convolution inception_4c/3x3 1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3_inception_4c/relu_3x3 0=256 1=3 4=1 5=1 6=294912 8=2 9=1 Convolution inception_4c/5x5_reduce 1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce 0=24 1=1 5=1 6=12288 8=102 9=1 Convolution inception_4c/5x5 1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5_inception_4c/relu_5x5 0=64 1=5 4=2 5=1 6=38400 8=2 9=1 Pooling inception_4c/pool 1 1 inception_4b/output_splitncnn_0 inception_4c/pool 1=3 3=1 Convolution inception_4c/pool_proj 1 1 inception_4c/pool inception_4c/pool_proj_inception_4c/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1 Concat inception_4c/output 4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output Split splitncnn_5 1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3 Convolution inception_4d/1x1 1 1 inception_4c/output_splitncnn_3 inception_4d/1x1_inception_4d/relu_1x1 0=112 1=1 5=1 6=57344 8=2 9=1 Convolution inception_4d/3x3_reduce 1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce 0=144 1=1 5=1 6=73728 8=102 9=1 Convolution inception_4d/3x3 1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3_inception_4d/relu_3x3 0=288 1=3 4=1 5=1 6=373248 8=2 9=1 Convolution inception_4d/5x5_reduce 1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce 0=32 1=1 5=1 
6=16384 8=102 9=1 Convolution inception_4d/5x5 1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5_inception_4d/relu_5x5 0=64 1=5 4=2 5=1 6=51200 8=2 9=1 Pooling inception_4d/pool 1 1 inception_4c/output_splitncnn_0 inception_4d/pool 1=3 3=1 Convolution inception_4d/pool_proj 1 1 inception_4d/pool inception_4d/pool_proj_inception_4d/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1 Concat inception_4d/output 4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output Split splitncnn_6 1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3 Convolution inception_4e/1x1 1 1 inception_4d/output_splitncnn_3 inception_4e/1x1_inception_4e/relu_1x1 0=256 1=1 5=1 6=135168 8=2 9=1 Convolution inception_4e/3x3_reduce 1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce 0=160 1=1 5=1 6=84480 8=102 9=1 Convolution inception_4e/3x3 1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3_inception_4e/relu_3x3 0=320 1=3 4=1 5=1 6=460800 8=2 9=1 Convolution inception_4e/5x5_reduce 1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce 0=32 1=1 5=1 6=16896 8=102 9=1 Convolution inception_4e/5x5 1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5_inception_4e/relu_5x5 0=128 1=5 4=2 5=1 6=102400 8=2 9=1 Pooling inception_4e/pool 1 1 inception_4d/output_splitncnn_0 inception_4e/pool 1=3 3=1 Convolution inception_4e/pool_proj 1 1 inception_4e/pool inception_4e/pool_proj_inception_4e/relu_pool_proj 0=128 1=1 5=1 6=67584 8=2 9=1 Concat inception_4e/output 4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj 
inception_4e/output Pooling pool4/3x3_s2 1 1 inception_4e/output pool4/3x3_s2 1=3 2=2 Split splitncnn_7 1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3 Convolution inception_5a/1x1 1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1_inception_5a/relu_1x1 0=256 1=1 5=1 6=212992 8=2 9=1 Convolution inception_5a/3x3_reduce 1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce 0=160 1=1 5=1 6=133120 8=102 9=1 Convolution inception_5a/3x3 1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3_inception_5a/relu_3x3 0=320 1=3 4=1 5=1 6=460800 8=2 9=1 Convolution inception_5a/5x5_reduce 1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce 0=32 1=1 5=1 6=26624 8=102 9=1 Convolution inception_5a/5x5 1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5_inception_5a/relu_5x5 0=128 1=5 4=2 5=1 6=102400 8=2 9=1 Pooling inception_5a/pool 1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 1=3 3=1 Convolution inception_5a/pool_proj 1 1 inception_5a/pool inception_5a/pool_proj_inception_5a/relu_pool_proj 0=128 1=1 5=1 6=106496 8=2 9=1 Concat inception_5a/output 4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output Split splitncnn_8 1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3 Convolution inception_5b/1x1 1 1 inception_5a/output_splitncnn_3 inception_5b/1x1_inception_5b/relu_1x1 0=384 1=1 5=1 6=319488 8=2 9=1 Convolution inception_5b/3x3_reduce 1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce 0=192 1=1 5=1 6=159744 8=102 9=1 Convolution inception_5b/3x3 1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3_inception_5b/relu_3x3 
0=384 1=3 4=1 5=1 6=663552 8=2 9=1 Convolution inception_5b/5x5_reduce 1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce 0=48 1=1 5=1 6=39936 8=102 9=1 Convolution inception_5b/5x5 1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5_inception_5b/relu_5x5 0=128 1=5 4=2 5=1 6=153600 8=2 9=1 Pooling inception_5b/pool 1 1 inception_5a/output_splitncnn_0 inception_5b/pool 1=3 3=1 Convolution inception_5b/pool_proj 1 1 inception_5b/pool inception_5b/pool_proj_inception_5b/relu_pool_proj 0=128 1=1 5=1 6=106496 8=2 9=1 Concat inception_5b/output 4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output Pooling pool5/7x7_s1 1 1 inception_5b/output pool5/7x7_s1_pool5/drop_7x7_s1 0=1 1=7 InnerProduct loss3/classifier 1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier 0=1000 1=1 2=1024000 Softmax prob 1 1 loss3/classifier output ================================================ FILE: benchmark/mnasnet.param ================================================ 7767517 76 86 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution first-3x3-conv 1 1 data first-3x3-conv_relu -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1 ConvolutionDepthWise A0_dw 1 1 first-3x3-conv_relu A0_dw_relu -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1 Convolution A0_linear 1 1 A0_dw_relu A0_linear_bn -23330=4,3,112,112,16 0=16 1=1 5=1 6=512 Convolution B0_expand 1 1 A0_linear_bn B0_expand_relu -23330=4,3,112,112,48 0=48 1=1 5=1 6=768 9=1 ConvolutionDepthWise B0_dw 1 1 B0_expand_relu B0_dw_relu -23330=4,3,56,56,48 0=48 1=3 3=2 4=1 5=1 6=432 7=48 9=1 Convolution B0_linear 1 1 B0_dw_relu B0_linear_bn -23330=4,3,56,56,24 0=24 1=1 5=1 6=1152 Split splitncnn_0 1 2 B0_linear_bn B0_linear_bn_splitncnn_0 B0_linear_bn_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 Convolution B1_expand 1 1 
B0_linear_bn_splitncnn_1 B1_expand_relu -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1 ConvolutionDepthWise B1_dw 1 1 B1_expand_relu B1_dw_relu -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1 Convolution B1_linear 1 1 B1_dw_relu B1_linear_bn -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728 BinaryOp unknownncnn_0 2 1 B0_linear_bn_splitncnn_0 B1_linear_bn unknownncnn_0 -23330=4,3,56,56,24 Split splitncnn_1 1 2 unknownncnn_0 unknownncnn_0_splitncnn_0 unknownncnn_0_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 Convolution B2_expand 1 1 unknownncnn_0_splitncnn_1 B2_expand_relu -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1 ConvolutionDepthWise B2_dw 1 1 B2_expand_relu B2_dw_relu -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1 Convolution B2_linear 1 1 B2_dw_relu B2_linear_bn -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728 BinaryOp unknownncnn_1 2 1 unknownncnn_0_splitncnn_0 B2_linear_bn unknownncnn_1 -23330=4,3,56,56,24 Convolution C0_expand 1 1 unknownncnn_1 C0_expand_relu -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1 ConvolutionDepthWise C0_dw 1 1 C0_expand_relu C0_dw_relu -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72 9=1 Convolution C0_linear 1 1 C0_dw_relu C0_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880 Split splitncnn_2 1 2 C0_linear_bn C0_linear_bn_splitncnn_0 C0_linear_bn_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 Convolution C1_expand 1 1 C0_linear_bn_splitncnn_1 C1_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 ConvolutionDepthWise C1_dw 1 1 C1_expand_relu C1_dw_relu -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=1 Convolution C1_linear 1 1 C1_dw_relu C1_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 BinaryOp unknownncnn_2 2 1 C0_linear_bn_splitncnn_0 C1_linear_bn unknownncnn_2 -23330=4,3,28,28,40 Split splitncnn_3 1 2 unknownncnn_2 unknownncnn_2_splitncnn_0 unknownncnn_2_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 Convolution C2_expand 1 1 unknownncnn_2_splitncnn_1 C2_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 ConvolutionDepthWise 
C2_dw 1 1 C2_expand_relu C2_dw_relu -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=1 Convolution C2_linear 1 1 C2_dw_relu C2_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 BinaryOp unknownncnn_3 2 1 unknownncnn_2_splitncnn_0 C2_linear_bn unknownncnn_3 -23330=4,3,28,28,40 Convolution D0_expand 1 1 unknownncnn_3 D0_expand_relu -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600 9=1 ConvolutionDepthWise D0_dw 1 1 D0_expand_relu D0_dw_relu -23330=4,3,14,14,240 0=240 1=5 3=2 4=2 5=1 6=6000 7=240 9=1 Convolution D0_linear 1 1 D0_dw_relu D0_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 Split splitncnn_4 1 2 D0_linear_bn D0_linear_bn_splitncnn_0 D0_linear_bn_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution D1_expand 1 1 D0_linear_bn_splitncnn_1 D1_expand_relu -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 9=1 ConvolutionDepthWise D1_dw 1 1 D1_expand_relu D1_dw_relu -23330=4,3,14,14,480 0=480 1=5 4=2 5=1 6=12000 7=480 9=1 Convolution D1_linear 1 1 D1_dw_relu D1_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400 BinaryOp unknownncnn_4 2 1 D0_linear_bn_splitncnn_0 D1_linear_bn unknownncnn_4 -23330=4,3,14,14,80 Split splitncnn_5 1 2 unknownncnn_4 unknownncnn_4_splitncnn_0 unknownncnn_4_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution D2_expand 1 1 unknownncnn_4_splitncnn_1 D2_expand_relu -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 9=1 ConvolutionDepthWise D2_dw 1 1 D2_expand_relu D2_dw_relu -23330=4,3,14,14,480 0=480 1=5 4=2 5=1 6=12000 7=480 9=1 Convolution D2_linear 1 1 D2_dw_relu D2_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400 BinaryOp unknownncnn_5 2 1 unknownncnn_4_splitncnn_0 D2_linear_bn unknownncnn_5 -23330=4,3,14,14,80 Convolution E0_expand 1 1 unknownncnn_5 E0_expand_relu -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 9=1 ConvolutionDepthWise E0_dw 1 1 E0_expand_relu E0_dw_relu -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480 9=1 Convolution E0_linear 1 1 E0_dw_relu E0_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=46080 Split splitncnn_6 1 2 
E0_linear_bn E0_linear_bn_splitncnn_0 E0_linear_bn_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96 Convolution E1_expand 1 1 E0_linear_bn_splitncnn_1 E1_expand_relu -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1 ConvolutionDepthWise E1_dw 1 1 E1_expand_relu E1_dw_relu -23330=4,3,14,14,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1 Convolution E1_linear 1 1 E1_dw_relu E1_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=55296 BinaryOp unknownncnn_6 2 1 E0_linear_bn_splitncnn_0 E1_linear_bn unknownncnn_6 -23330=4,3,14,14,96 Convolution F0_expand 1 1 unknownncnn_6 F0_expand_relu -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1 ConvolutionDepthWise F0_dw 1 1 F0_expand_relu F0_dw_relu -23330=4,3,7,7,576 0=576 1=5 3=2 4=2 5=1 6=14400 7=576 9=1 Convolution F0_linear 1 1 F0_dw_relu F0_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=110592 Split splitncnn_7 1 2 F0_linear_bn F0_linear_bn_splitncnn_0 F0_linear_bn_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution F1_expand 1 1 F0_linear_bn_splitncnn_1 F1_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1 ConvolutionDepthWise F1_dw 1 1 F1_expand_relu F1_dw_relu -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152 9=1 Convolution F1_linear 1 1 F1_dw_relu F1_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp unknownncnn_7 2 1 F0_linear_bn_splitncnn_0 F1_linear_bn unknownncnn_7 -23330=4,3,7,7,192 Split splitncnn_8 1 2 unknownncnn_7 unknownncnn_7_splitncnn_0 unknownncnn_7_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution F2_expand 1 1 unknownncnn_7_splitncnn_1 F2_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1 ConvolutionDepthWise F2_dw 1 1 F2_expand_relu F2_dw_relu -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152 9=1 Convolution F2_linear 1 1 F2_dw_relu F2_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp unknownncnn_8 2 1 unknownncnn_7_splitncnn_0 F2_linear_bn unknownncnn_8 -23330=4,3,7,7,192 Split splitncnn_9 1 2 unknownncnn_8 unknownncnn_8_splitncnn_0 unknownncnn_8_splitncnn_1 
-23330=8,3,7,7,192,3,7,7,192 Convolution F3_expand 1 1 unknownncnn_8_splitncnn_1 F3_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1 ConvolutionDepthWise F3_dw 1 1 F3_expand_relu F3_dw_relu -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152 9=1 Convolution F3_linear 1 1 F3_dw_relu F3_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp unknownncnn_9 2 1 unknownncnn_8_splitncnn_0 F3_linear_bn unknownncnn_9 -23330=4,3,7,7,192 Convolution G0_expand 1 1 unknownncnn_9 G0_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1 ConvolutionDepthWise G0_dw 1 1 G0_expand_relu G0_dw_relu -23330=4,3,7,7,1152 0=1152 1=3 4=1 5=1 6=10368 7=1152 9=1 Convolution G0_linear 1 1 G0_dw_relu G0_linear_bn -23330=4,3,7,7,320 0=320 1=1 5=1 6=368640 Convolution last-1x1-conv 1 1 G0_linear_bn last-1x1-conv_relu -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600 9=1 Pooling avgpool 1 1 last-1x1-conv_relu flatten -23330=4,1,1280,1,1 0=1 1=7 4=1 5=1 InnerProduct fc 1 1 flatten fc -23330=4,1,1000,1,1 0=1000 1=1 2=1280000 Softmax prob 1 1 fc output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/mobilenet.param ================================================ 7767517 31 31 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution conv1 1 1 data conv1_relu1 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1 ConvolutionDepthWise conv2_1/dw 1 1 conv1_relu1 conv2_1/dw_relu2_1/dw -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1 Convolution conv2_1/sep 1 1 conv2_1/dw_relu2_1/dw conv2_1/sep_relu2_1/sep -23330=4,3,112,112,64 0=64 1=1 5=1 6=2048 9=1 ConvolutionDepthWise conv2_2/dw 1 1 conv2_1/sep_relu2_1/sep conv2_2/dw_relu2_2/dw -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1 Convolution conv2_2/sep 1 1 conv2_2/dw_relu2_2/dw conv2_2/sep_relu2_2/sep -23330=4,3,56,56,128 0=128 1=1 5=1 6=8192 9=1 ConvolutionDepthWise conv3_1/dw 1 1 conv2_2/sep_relu2_2/sep conv3_1/dw_relu3_1/dw -23330=4,3,56,56,128 0=128 1=3 4=1 5=1 6=1152 
7=128 9=1 Convolution conv3_1/sep 1 1 conv3_1/dw_relu3_1/dw conv3_1/sep_relu3_1/sep -23330=4,3,56,56,128 0=128 1=1 5=1 6=16384 9=1 ConvolutionDepthWise conv3_2/dw 1 1 conv3_1/sep_relu3_1/sep conv3_2/dw_relu3_2/dw -23330=4,3,28,28,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 9=1 Convolution conv3_2/sep 1 1 conv3_2/dw_relu3_2/dw conv3_2/sep_relu3_2/sep -23330=4,3,28,28,256 0=256 1=1 5=1 6=32768 9=1 ConvolutionDepthWise conv4_1/dw 1 1 conv3_2/sep_relu3_2/sep conv4_1/dw_relu4_1/dw -23330=4,3,28,28,256 0=256 1=3 4=1 5=1 6=2304 7=256 9=1 Convolution conv4_1/sep 1 1 conv4_1/dw_relu4_1/dw conv4_1/sep_relu4_1/sep -23330=4,3,28,28,256 0=256 1=1 5=1 6=65536 9=1 ConvolutionDepthWise conv4_2/dw 1 1 conv4_1/sep_relu4_1/sep conv4_2/dw_relu4_2/dw -23330=4,3,14,14,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 9=1 Convolution conv4_2/sep 1 1 conv4_2/dw_relu4_2/dw conv4_2/sep_relu4_2/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=131072 9=1 ConvolutionDepthWise conv5_1/dw 1 1 conv4_2/sep_relu4_2/sep conv5_1/dw_relu5_1/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv5_1/sep 1 1 conv5_1/dw_relu5_1/dw conv5_1/sep_relu5_1/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv5_2/dw 1 1 conv5_1/sep_relu5_1/sep conv5_2/dw_relu5_2/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv5_2/sep 1 1 conv5_2/dw_relu5_2/dw conv5_2/sep_relu5_2/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv5_3/dw 1 1 conv5_2/sep_relu5_2/sep conv5_3/dw_relu5_3/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv5_3/sep 1 1 conv5_3/dw_relu5_3/dw conv5_3/sep_relu5_3/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv5_4/dw 1 1 conv5_3/sep_relu5_3/sep conv5_4/dw_relu5_4/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv5_4/sep 1 1 conv5_4/dw_relu5_4/dw conv5_4/sep_relu5_4/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv5_5/dw 1 1 
conv5_4/sep_relu5_4/sep conv5_5/dw_relu5_5/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv5_5/sep 1 1 conv5_5/dw_relu5_5/dw conv5_5/sep_relu5_5/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv5_6/dw 1 1 conv5_5/sep_relu5_5/sep conv5_6/dw_relu5_6/dw -23330=4,3,7,7,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 9=1 Convolution conv5_6/sep 1 1 conv5_6/dw_relu5_6/dw conv5_6/sep_relu5_6/sep -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=524288 9=1 ConvolutionDepthWise conv6/dw 1 1 conv5_6/sep_relu5_6/sep conv6/dw_relu6/dw -23330=4,3,7,7,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 9=1 Convolution conv6/sep 1 1 conv6/dw_relu6/dw conv6/sep_relu6/sep -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=1048576 9=1 Pooling pool6 1 1 conv6/sep_relu6/sep pool6 -23330=4,1,1024,1,1 0=1 4=1 InnerProduct fc7 1 1 pool6 fc7 -23330=4,1,1000,1,1 0=1000 1=1 2=1024000 Softmax prob 1 1 fc7 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/mobilenet_int8.param ================================================ 7767517 31 31 Input data 0 1 data 0=224 1=224 2=3 Convolution conv1 1 1 data conv1_relu1 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1 ConvolutionDepthWise conv2_1/dw 1 1 conv1_relu1 conv2_1/dw_relu2_1/dw 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1 Convolution conv2_1/sep 1 1 conv2_1/dw_relu2_1/dw conv2_1/sep_relu2_1/sep 0=64 1=1 5=1 6=2048 8=102 9=1 ConvolutionDepthWise conv2_2/dw 1 1 conv2_1/sep_relu2_1/sep conv2_2/dw_relu2_2/dw 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1 Convolution conv2_2/sep 1 1 conv2_2/dw_relu2_2/dw conv2_2/sep_relu2_2/sep 0=128 1=1 5=1 6=8192 8=102 9=1 ConvolutionDepthWise conv3_1/dw 1 1 conv2_2/sep_relu2_2/sep conv3_1/dw_relu3_1/dw 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1 Convolution conv3_1/sep 1 1 conv3_1/dw_relu3_1/dw conv3_1/sep_relu3_1/sep 0=128 1=1 5=1 6=16384 8=102 9=1 ConvolutionDepthWise conv3_2/dw 1 1 conv3_1/sep_relu3_1/sep conv3_2/dw_relu3_2/dw 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1 Convolution 
conv3_2/sep 1 1 conv3_2/dw_relu3_2/dw conv3_2/sep_relu3_2/sep 0=256 1=1 5=1 6=32768 8=102 9=1 ConvolutionDepthWise conv4_1/dw 1 1 conv3_2/sep_relu3_2/sep conv4_1/dw_relu4_1/dw 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1 Convolution conv4_1/sep 1 1 conv4_1/dw_relu4_1/dw conv4_1/sep_relu4_1/sep 0=256 1=1 5=1 6=65536 8=102 9=1 ConvolutionDepthWise conv4_2/dw 1 1 conv4_1/sep_relu4_1/sep conv4_2/dw_relu4_2/dw 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1 Convolution conv4_2/sep 1 1 conv4_2/dw_relu4_2/dw conv4_2/sep_relu4_2/sep 0=512 1=1 5=1 6=131072 8=102 9=1 ConvolutionDepthWise conv5_1/dw 1 1 conv4_2/sep_relu4_2/sep conv5_1/dw_relu5_1/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv5_1/sep 1 1 conv5_1/dw_relu5_1/dw conv5_1/sep_relu5_1/sep 0=512 1=1 5=1 6=262144 8=102 9=1 ConvolutionDepthWise conv5_2/dw 1 1 conv5_1/sep_relu5_1/sep conv5_2/dw_relu5_2/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv5_2/sep 1 1 conv5_2/dw_relu5_2/dw conv5_2/sep_relu5_2/sep 0=512 1=1 5=1 6=262144 8=102 9=1 ConvolutionDepthWise conv5_3/dw 1 1 conv5_2/sep_relu5_2/sep conv5_3/dw_relu5_3/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv5_3/sep 1 1 conv5_3/dw_relu5_3/dw conv5_3/sep_relu5_3/sep 0=512 1=1 5=1 6=262144 8=102 9=1 ConvolutionDepthWise conv5_4/dw 1 1 conv5_3/sep_relu5_3/sep conv5_4/dw_relu5_4/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv5_4/sep 1 1 conv5_4/dw_relu5_4/dw conv5_4/sep_relu5_4/sep 0=512 1=1 5=1 6=262144 8=102 9=1 ConvolutionDepthWise conv5_5/dw 1 1 conv5_4/sep_relu5_4/sep conv5_5/dw_relu5_5/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv5_5/sep 1 1 conv5_5/dw_relu5_5/dw conv5_5/sep_relu5_5/sep 0=512 1=1 5=1 6=262144 8=102 9=1 ConvolutionDepthWise conv5_6/dw 1 1 conv5_5/sep_relu5_5/sep conv5_6/dw_relu5_6/dw 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv5_6/sep 1 1 conv5_6/dw_relu5_6/dw conv5_6/sep_relu5_6/sep 0=1024 1=1 5=1 6=524288 8=102 9=1 ConvolutionDepthWise conv6/dw 1 1 conv5_6/sep_relu5_6/sep 
conv6/dw_relu6/dw 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1 Convolution conv6/sep 1 1 conv6/dw_relu6/dw conv6/sep_relu6/sep 0=1024 1=1 5=1 6=1048576 8=2 9=1 Pooling pool6 1 1 conv6/sep_relu6/sep pool6 0=1 4=1 InnerProduct fc7 1 1 pool6 fc7 0=1000 1=1 2=1024000 8=2 Softmax prob 1 1 fc7 output ================================================ FILE: benchmark/mobilenet_ssd.param ================================================ 7767517 92 115 Input input 0 1 data -23330=4,3,300,300,3 0=300 1=300 2=3 Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -23330=28,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3 Convolution conv0 1 1 data_splitncnn_6 conv0_conv0/relu -23330=4,3,150,150,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1 ConvolutionDepthWise conv1/dw 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu -23330=4,3,150,150,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1 Convolution conv1 1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu -23330=4,3,150,150,64 0=64 1=1 5=1 6=2048 9=1 ConvolutionDepthWise conv2/dw 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu -23330=4,3,75,75,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1 Convolution conv2 1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu -23330=4,3,75,75,128 0=128 1=1 5=1 6=8192 9=1 ConvolutionDepthWise conv3/dw 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu -23330=4,3,75,75,128 0=128 1=3 4=1 5=1 6=1152 7=128 9=1 Convolution conv3 1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu -23330=4,3,75,75,128 0=128 1=1 5=1 6=16384 9=1 ConvolutionDepthWise conv4/dw 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu -23330=4,3,38,38,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 9=1 Convolution conv4 1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu -23330=4,3,38,38,256 0=256 1=1 5=1 6=32768 9=1 ConvolutionDepthWise conv5/dw 1 1 conv4_conv4/relu conv5/dw_conv5/dw/relu -23330=4,3,38,38,256 0=256 1=3 4=1 5=1 6=2304 7=256 9=1 Convolution conv5 1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu 
-23330=4,3,38,38,256 0=256 1=1 5=1 6=65536 9=1 ConvolutionDepthWise conv6/dw 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu -23330=4,3,19,19,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 9=1 Convolution conv6 1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=131072 9=1 ConvolutionDepthWise conv7/dw 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv7 1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv8/dw 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv8 1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv9/dw 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv9 1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv10/dw 1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv10 1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv11/dw 1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv11 1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1 Split splitncnn_1 1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3 -23330=16,3,19,19,512,3,19,19,512,3,19,19,512,3,19,19,512 ConvolutionDepthWise conv12/dw 1 1 conv11_conv11/relu_splitncnn_3 conv12/dw_conv12/dw/relu -23330=4,3,10,10,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 9=1 Convolution conv12 1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu -23330=4,3,10,10,1024 0=1024 1=1 5=1 6=524288 9=1 
ConvolutionDepthWise conv13/dw 1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu -23330=4,3,10,10,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 9=1 Convolution conv13 1 1 conv13/dw_conv13/dw/relu conv13_conv13/relu -23330=4,3,10,10,1024 0=1024 1=1 5=1 6=1048576 9=1 Split splitncnn_2 1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3 -23330=16,3,10,10,1024,3,10,10,1024,3,10,10,1024,3,10,10,1024 Convolution conv14_1 1 1 conv13_conv13/relu_splitncnn_3 conv14_1_conv14_1/relu -23330=4,3,10,10,256 0=256 1=1 5=1 6=262144 9=1 Convolution conv14_2 1 1 conv14_1_conv14_1/relu conv14_2_conv14_2/relu -23330=4,3,5,5,512 0=512 1=3 3=2 4=1 5=1 6=1179648 9=1 Split splitncnn_3 1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3 -23330=16,3,5,5,512,3,5,5,512,3,5,5,512,3,5,5,512 Convolution conv15_1 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1_conv15_1/relu -23330=4,3,5,5,128 0=128 1=1 5=1 6=65536 9=1 Convolution conv15_2 1 1 conv15_1_conv15_1/relu conv15_2_conv15_2/relu -23330=4,3,3,3,256 0=256 1=3 3=2 4=1 5=1 6=294912 9=1 Split splitncnn_4 1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3 -23330=16,3,3,3,256,3,3,3,256,3,3,3,256,3,3,3,256 Convolution conv16_1 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1_conv16_1/relu -23330=4,3,3,3,128 0=128 1=1 5=1 6=32768 9=1 Convolution conv16_2 1 1 conv16_1_conv16_1/relu conv16_2_conv16_2/relu -23330=4,3,2,2,256 0=256 1=3 3=2 4=1 5=1 6=294912 9=1 Split splitncnn_5 1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3 -23330=16,3,2,2,256,3,2,2,256,3,2,2,256,3,2,2,256 Convolution conv17_1 1 1 
conv16_2_conv16_2/relu_splitncnn_3 conv17_1_conv17_1/relu -23330=4,3,2,2,64 0=64 1=1 5=1 6=16384 9=1 Convolution conv17_2 1 1 conv17_1_conv17_1/relu conv17_2_conv17_2/relu -23330=4,3,1,1,128 0=128 1=3 3=2 4=1 5=1 6=73728 9=1 Split splitncnn_6 1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2 -23330=12,3,1,1,128,3,1,1,128,3,1,1,128 Convolution conv11_mbox_loc 1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc -23330=4,3,19,19,12 0=12 1=1 5=1 6=6144 Permute conv11_mbox_loc_perm 1 1 conv11_mbox_loc conv11_mbox_loc_perm -23330=4,3,12,19,19 0=3 Flatten conv11_mbox_loc_flat 1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat -23330=4,1,4332,1,1 Convolution conv11_mbox_conf 1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf -23330=4,3,19,19,63 0=63 1=1 5=1 6=32256 Permute conv11_mbox_conf_perm 1 1 conv11_mbox_conf conv11_mbox_conf_perm -23330=4,3,63,19,19 0=3 Flatten conv11_mbox_conf_flat 1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat -23330=4,1,22743,1,1 PriorBox conv11_mbox_priorbox 2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23330=4,2,4332,2,1 -23300=1,6.000000e+01 -23302=1,2.000000e+00 9=-233 10=-233 13=5.000000e-01 Convolution conv13_mbox_loc 1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc -23330=4,3,10,10,24 0=24 1=1 5=1 6=24576 Permute conv13_mbox_loc_perm 1 1 conv13_mbox_loc conv13_mbox_loc_perm -23330=4,3,24,10,10 0=3 Flatten conv13_mbox_loc_flat 1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat -23330=4,1,2400,1,1 Convolution conv13_mbox_conf 1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf -23330=4,3,10,10,126 0=126 1=1 5=1 6=129024 Permute conv13_mbox_conf_perm 1 1 conv13_mbox_conf conv13_mbox_conf_perm -23330=4,3,126,10,10 0=3 Flatten conv13_mbox_conf_flat 1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat -23330=4,1,12600,1,1 PriorBox conv13_mbox_priorbox 2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23330=4,2,2400,2,1 
-23300=1,1.050000e+02 -23301=1,1.500000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01 Convolution conv14_2_mbox_loc 1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc -23330=4,3,5,5,24 0=24 1=1 5=1 6=12288 Permute conv14_2_mbox_loc_perm 1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm -23330=4,3,24,5,5 0=3 Flatten conv14_2_mbox_loc_flat 1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat -23330=4,1,600,1,1 Convolution conv14_2_mbox_conf 1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf -23330=4,3,5,5,126 0=126 1=1 5=1 6=64512 Permute conv14_2_mbox_conf_perm 1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm -23330=4,3,126,5,5 0=3 Flatten conv14_2_mbox_conf_flat 1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat -23330=4,1,3150,1,1 PriorBox conv14_2_mbox_priorbox 2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23330=4,2,600,2,1 -23300=1,1.500000e+02 -23301=1,1.950000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01 Convolution conv15_2_mbox_loc 1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc -23330=4,3,3,3,24 0=24 1=1 5=1 6=6144 Permute conv15_2_mbox_loc_perm 1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm -23330=4,3,24,3,3 0=3 Flatten conv15_2_mbox_loc_flat 1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat -23330=4,1,216,1,1 Convolution conv15_2_mbox_conf 1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf -23330=4,3,3,3,126 0=126 1=1 5=1 6=32256 Permute conv15_2_mbox_conf_perm 1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm -23330=4,3,126,3,3 0=3 Flatten conv15_2_mbox_conf_flat 1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat -23330=4,1,1134,1,1 PriorBox conv15_2_mbox_priorbox 2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23330=4,2,216,2,1 -23300=1,1.950000e+02 -23301=1,2.400000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01 Convolution conv16_2_mbox_loc 1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 
-23330=4,3,2,2,24 0=24 1=1 5=1 6=6144 Permute conv16_2_mbox_loc_perm 1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm -23330=4,3,24,2,2 0=3 Flatten conv16_2_mbox_loc_flat 1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat -23330=4,1,96,1,1 Convolution conv16_2_mbox_conf 1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf -23330=4,3,2,2,126 0=126 1=1 5=1 6=32256 Permute conv16_2_mbox_conf_perm 1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm -23330=4,3,126,2,2 0=3 Flatten conv16_2_mbox_conf_flat 1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat -23330=4,1,504,1,1 PriorBox conv16_2_mbox_priorbox 2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23330=4,2,96,2,1 -23300=1,2.400000e+02 -23301=1,2.850000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01 Convolution conv17_2_mbox_loc 1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc -23330=4,3,1,1,24 0=24 1=1 5=1 6=3072 Permute conv17_2_mbox_loc_perm 1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm -23330=4,3,24,1,1 0=3 Flatten conv17_2_mbox_loc_flat 1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat -23330=4,1,24,1,1 Convolution conv17_2_mbox_conf 1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf -23330=4,3,1,1,126 0=126 1=1 5=1 6=16128 Permute conv17_2_mbox_conf_perm 1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm -23330=4,3,126,1,1 0=3 Flatten conv17_2_mbox_conf_flat 1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat -23330=4,1,126,1,1 PriorBox conv17_2_mbox_priorbox 2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23330=4,2,24,2,1 -23300=1,2.850000e+02 -23301=1,3.000000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01 Concat mbox_loc 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc -23330=4,1,7668,1,1 Concat mbox_conf 6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat 
conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf -23330=4,1,40257,1,1 Concat mbox_priorbox 6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox -23330=4,2,7668,2,1 0=1 Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape -23330=4,2,21,1917,1 0=21 1=-1 Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax -23330=4,2,21,1917,1 0=1 1=1 Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten -23330=4,1,40257,1,1 DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=4.500000e-01 2=100 4=2.500000e-01 ================================================ FILE: benchmark/mobilenet_ssd_int8.param ================================================ 7767517 92 115 Input input 0 1 data 0=300 1=300 2=3 Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 Convolution conv0 1 1 data_splitncnn_6 conv0_conv0/relu 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1 ConvolutionDepthWise conv1/dw 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1 Convolution conv1 1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu 0=64 1=1 5=1 6=2048 8=102 9=1 ConvolutionDepthWise conv2/dw 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1 Convolution conv2 1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu 0=128 1=1 5=1 6=8192 8=102 9=1 ConvolutionDepthWise conv3/dw 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1 Convolution conv3 1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu 0=128 1=1 5=1 6=16384 8=102 9=1 ConvolutionDepthWise conv4/dw 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1 Convolution conv4 1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu 0=256 1=1 5=1 6=32768 8=102 9=1 ConvolutionDepthWise conv5/dw 1 1 
conv4_conv4/relu conv5/dw_conv5/dw/relu 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1 Convolution conv5 1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu 0=256 1=1 5=1 6=65536 8=102 9=1 ConvolutionDepthWise conv6/dw 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1 Convolution conv6 1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu 0=512 1=1 5=1 6=131072 8=102 9=1 ConvolutionDepthWise conv7/dw 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv7 1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu 0=512 1=1 5=1 6=262144 8=102 9=1 ConvolutionDepthWise conv8/dw 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv8 1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu 0=512 1=1 5=1 6=262144 8=102 9=1 ConvolutionDepthWise conv9/dw 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv9 1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu 0=512 1=1 5=1 6=262144 8=102 9=1 ConvolutionDepthWise conv10/dw 1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv10 1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu 0=512 1=1 5=1 6=262144 8=102 9=1 ConvolutionDepthWise conv11/dw 1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv11 1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu 0=512 1=1 5=1 6=262144 8=2 9=1 Split splitncnn_1 1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3 ConvolutionDepthWise conv12/dw 1 1 conv11_conv11/relu_splitncnn_3 conv12/dw_conv12/dw/relu 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1 Convolution conv12 1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu 0=1024 1=1 5=1 6=524288 8=102 9=1 ConvolutionDepthWise conv13/dw 1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1 Convolution conv13 1 
1 conv13/dw_conv13/dw/relu conv13_conv13/relu 0=1024 1=1 5=1 6=1048576 8=2 9=1 Split splitncnn_2 1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3 Convolution conv14_1 1 1 conv13_conv13/relu_splitncnn_3 conv14_1_conv14_1/relu 0=256 1=1 5=1 6=262144 8=102 9=1 Convolution conv14_2 1 1 conv14_1_conv14_1/relu conv14_2_conv14_2/relu 0=512 1=3 3=2 4=1 5=1 6=1179648 8=2 9=1 Split splitncnn_3 1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3 Convolution conv15_1 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1_conv15_1/relu 0=128 1=1 5=1 6=65536 8=102 9=1 Convolution conv15_2 1 1 conv15_1_conv15_1/relu conv15_2_conv15_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1 Split splitncnn_4 1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3 Convolution conv16_1 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1_conv16_1/relu 0=128 1=1 5=1 6=32768 8=102 9=1 Convolution conv16_2 1 1 conv16_1_conv16_1/relu conv16_2_conv16_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1 Split splitncnn_5 1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3 Convolution conv17_1 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1_conv17_1/relu 0=64 1=1 5=1 6=16384 8=102 9=1 Convolution conv17_2 1 1 conv17_1_conv17_1/relu conv17_2_conv17_2/relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1 Split splitncnn_6 1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2 Convolution conv11_mbox_loc 1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc 0=12 1=1 5=1 6=6144 8=2 Permute conv11_mbox_loc_perm 1 1 conv11_mbox_loc 
conv11_mbox_loc_perm 0=3 Flatten conv11_mbox_loc_flat 1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat Convolution conv11_mbox_conf 1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf 0=63 1=1 5=1 6=32256 8=2 Permute conv11_mbox_conf_perm 1 1 conv11_mbox_conf conv11_mbox_conf_perm 0=3 Flatten conv11_mbox_conf_flat 1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat PriorBox conv11_mbox_priorbox 2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23300=1,60.000000 -23302=1,2.000000 9=-233 10=-233 13=0.500000 Convolution conv13_mbox_loc 1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc 0=24 1=1 5=1 6=24576 8=2 Permute conv13_mbox_loc_perm 1 1 conv13_mbox_loc conv13_mbox_loc_perm 0=3 Flatten conv13_mbox_loc_flat 1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat Convolution conv13_mbox_conf 1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf 0=126 1=1 5=1 6=129024 8=2 Permute conv13_mbox_conf_perm 1 1 conv13_mbox_conf conv13_mbox_conf_perm 0=3 Flatten conv13_mbox_conf_flat 1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat PriorBox conv13_mbox_priorbox 2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 Convolution conv14_2_mbox_loc 1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc 0=24 1=1 5=1 6=12288 8=2 Permute conv14_2_mbox_loc_perm 1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm 0=3 Flatten conv14_2_mbox_loc_flat 1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat Convolution conv14_2_mbox_conf 1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf 0=126 1=1 5=1 6=64512 8=2 Permute conv14_2_mbox_conf_perm 1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm 0=3 Flatten conv14_2_mbox_conf_flat 1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat PriorBox conv14_2_mbox_priorbox 2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 
13=0.500000 Convolution conv15_2_mbox_loc 1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc 0=24 1=1 5=1 6=6144 8=2 Permute conv15_2_mbox_loc_perm 1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm 0=3 Flatten conv15_2_mbox_loc_flat 1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat Convolution conv15_2_mbox_conf 1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf 0=126 1=1 5=1 6=32256 8=2 Permute conv15_2_mbox_conf_perm 1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm 0=3 Flatten conv15_2_mbox_conf_flat 1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat PriorBox conv15_2_mbox_priorbox 2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 Convolution conv16_2_mbox_loc 1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 0=24 1=1 5=1 6=6144 8=2 Permute conv16_2_mbox_loc_perm 1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm 0=3 Flatten conv16_2_mbox_loc_flat 1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat Convolution conv16_2_mbox_conf 1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf 0=126 1=1 5=1 6=32256 8=2 Permute conv16_2_mbox_conf_perm 1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm 0=3 Flatten conv16_2_mbox_conf_flat 1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat PriorBox conv16_2_mbox_priorbox 2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 Convolution conv17_2_mbox_loc 1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc 0=24 1=1 5=1 6=3072 8=2 Permute conv17_2_mbox_loc_perm 1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm 0=3 Flatten conv17_2_mbox_loc_flat 1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat Convolution conv17_2_mbox_conf 1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf 0=126 1=1 5=1 6=16128 8=2 Permute conv17_2_mbox_conf_perm 1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm 0=3 
Flatten conv17_2_mbox_conf_flat 1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat PriorBox conv17_2_mbox_priorbox 2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 Concat mbox_loc 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc Concat mbox_conf 6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf Concat mbox_priorbox 6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox 0=1 Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1 1=1 Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=0.450000 2=100 4=0.250000 ================================================ FILE: benchmark/mobilenet_v2.param ================================================ 7767517 77 87 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution conv1 1 1 data conv1/bn_relu1 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1 Convolution conv2_1/expand 1 1 conv1/bn_relu1 conv2_1/expand/bn_relu2_1/expand -23330=4,3,112,112,32 0=32 1=1 5=1 6=1024 9=1 ConvolutionDepthWise conv2_1/dwise 1 1 conv2_1/expand/bn_relu2_1/expand conv2_1/dwise/bn_relu2_1/dwise -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1 Convolution conv2_1/linear 1 1 conv2_1/dwise/bn_relu2_1/dwise conv2_1/linear/bn_conv2_1/linear/scale -23330=4,3,112,112,16 0=16 1=1 5=1 6=512 Convolution conv2_2/expand 1 1 conv2_1/linear/bn_conv2_1/linear/scale conv2_2/expand/bn_relu2_2/expand -23330=4,3,112,112,96 0=96 1=1 5=1 6=1536 9=1 
ConvolutionDepthWise conv2_2/dwise 1 1 conv2_2/expand/bn_relu2_2/expand conv2_2/dwise/bn_relu2_2/dwise -23330=4,3,56,56,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96 9=1 Convolution conv2_2/linear 1 1 conv2_2/dwise/bn_relu2_2/dwise conv2_2/linear/bn_conv2_2/linear/scale -23330=4,3,56,56,24 0=24 1=1 5=1 6=2304 Split splitncnn_0 1 2 conv2_2/linear/bn_conv2_2/linear/scale conv2_2/linear/bn_conv2_2/linear/scale_splitncnn_0 conv2_2/linear/bn_conv2_2/linear/scale_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 Convolution conv3_1/expand 1 1 conv2_2/linear/bn_conv2_2/linear/scale_splitncnn_1 conv3_1/expand/bn_relu3_1/expand -23330=4,3,56,56,144 0=144 1=1 5=1 6=3456 9=1 ConvolutionDepthWise conv3_1/dwise 1 1 conv3_1/expand/bn_relu3_1/expand conv3_1/dwise/bn_relu3_1/dwise -23330=4,3,56,56,144 0=144 1=3 4=1 5=1 6=1296 7=144 9=1 Convolution conv3_1/linear 1 1 conv3_1/dwise/bn_relu3_1/dwise conv3_1/linear/bn_conv3_1/linear/scale -23330=4,3,56,56,24 0=24 1=1 5=1 6=3456 Eltwise block_3_1 2 1 conv2_2/linear/bn_conv2_2/linear/scale_splitncnn_0 conv3_1/linear/bn_conv3_1/linear/scale block_3_1 -23330=4,3,56,56,24 0=1 Convolution conv3_2/expand 1 1 block_3_1 conv3_2/expand/bn_relu3_2/expand -23330=4,3,56,56,144 0=144 1=1 5=1 6=3456 9=1 ConvolutionDepthWise conv3_2/dwise 1 1 conv3_2/expand/bn_relu3_2/expand conv3_2/dwise/bn_relu3_2/dwise -23330=4,3,28,28,144 0=144 1=3 3=2 4=1 5=1 6=1296 7=144 9=1 Convolution conv3_2/linear 1 1 conv3_2/dwise/bn_relu3_2/dwise conv3_2/linear/bn_conv3_2/linear/scale -23330=4,3,28,28,32 0=32 1=1 5=1 6=4608 Split splitncnn_1 1 2 conv3_2/linear/bn_conv3_2/linear/scale conv3_2/linear/bn_conv3_2/linear/scale_splitncnn_0 conv3_2/linear/bn_conv3_2/linear/scale_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32 Convolution conv4_1/expand 1 1 conv3_2/linear/bn_conv3_2/linear/scale_splitncnn_1 conv4_1/expand/bn_relu4_1/expand -23330=4,3,28,28,192 0=192 1=1 5=1 6=6144 9=1 ConvolutionDepthWise conv4_1/dwise 1 1 conv4_1/expand/bn_relu4_1/expand conv4_1/dwise/bn_relu4_1/dwise 
-23330=4,3,28,28,192 0=192 1=3 4=1 5=1 6=1728 7=192 9=1 Convolution conv4_1/linear 1 1 conv4_1/dwise/bn_relu4_1/dwise conv4_1/linear/bn_conv4_1/linear/scale -23330=4,3,28,28,32 0=32 1=1 5=1 6=6144 Eltwise block_4_1 2 1 conv3_2/linear/bn_conv3_2/linear/scale_splitncnn_0 conv4_1/linear/bn_conv4_1/linear/scale block_4_1 -23330=4,3,28,28,32 0=1 Split splitncnn_2 1 2 block_4_1 block_4_1_splitncnn_0 block_4_1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32 Convolution conv4_2/expand 1 1 block_4_1_splitncnn_1 conv4_2/expand/bn_relu4_2/expand -23330=4,3,28,28,192 0=192 1=1 5=1 6=6144 9=1 ConvolutionDepthWise conv4_2/dwise 1 1 conv4_2/expand/bn_relu4_2/expand conv4_2/dwise/bn_relu4_2/dwise -23330=4,3,28,28,192 0=192 1=3 4=1 5=1 6=1728 7=192 9=1 Convolution conv4_2/linear 1 1 conv4_2/dwise/bn_relu4_2/dwise conv4_2/linear/bn_conv4_2/linear/scale -23330=4,3,28,28,32 0=32 1=1 5=1 6=6144 Eltwise block_4_2 2 1 block_4_1_splitncnn_0 conv4_2/linear/bn_conv4_2/linear/scale block_4_2 -23330=4,3,28,28,32 0=1 Convolution conv4_3/expand 1 1 block_4_2 conv4_3/expand/bn_relu4_3/expand -23330=4,3,28,28,192 0=192 1=1 5=1 6=6144 9=1 ConvolutionDepthWise conv4_3/dwise 1 1 conv4_3/expand/bn_relu4_3/expand conv4_3/dwise/bn_relu4_3/dwise -23330=4,3,14,14,192 0=192 1=3 3=2 4=1 5=1 6=1728 7=192 9=1 Convolution conv4_3/linear 1 1 conv4_3/dwise/bn_relu4_3/dwise conv4_3/linear/bn_conv4_3/linear/scale -23330=4,3,14,14,64 0=64 1=1 5=1 6=12288 Split splitncnn_3 1 2 conv4_3/linear/bn_conv4_3/linear/scale conv4_3/linear/bn_conv4_3/linear/scale_splitncnn_0 conv4_3/linear/bn_conv4_3/linear/scale_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64 Convolution conv4_4/expand 1 1 conv4_3/linear/bn_conv4_3/linear/scale_splitncnn_1 conv4_4/expand/bn_relu4_4/expand -23330=4,3,14,14,384 0=384 1=1 5=1 6=24576 9=1 ConvolutionDepthWise conv4_4/dwise 1 1 conv4_4/expand/bn_relu4_4/expand conv4_4/dwise/bn_relu4_4/dwise -23330=4,3,14,14,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1 Convolution conv4_4/linear 1 1 
conv4_4/dwise/bn_relu4_4/dwise conv4_4/linear/bn_conv4_4/linear/scale -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576 Eltwise block_4_4 2 1 conv4_3/linear/bn_conv4_3/linear/scale_splitncnn_0 conv4_4/linear/bn_conv4_4/linear/scale block_4_4 -23330=4,3,14,14,64 0=1 Split splitncnn_4 1 2 block_4_4 block_4_4_splitncnn_0 block_4_4_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64 Convolution conv4_5/expand 1 1 block_4_4_splitncnn_1 conv4_5/expand/bn_relu4_5/expand -23330=4,3,14,14,384 0=384 1=1 5=1 6=24576 9=1 ConvolutionDepthWise conv4_5/dwise 1 1 conv4_5/expand/bn_relu4_5/expand conv4_5/dwise/bn_relu4_5/dwise -23330=4,3,14,14,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1 Convolution conv4_5/linear 1 1 conv4_5/dwise/bn_relu4_5/dwise conv4_5/linear/bn_conv4_5/linear/scale -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576 Eltwise block_4_5 2 1 block_4_4_splitncnn_0 conv4_5/linear/bn_conv4_5/linear/scale block_4_5 -23330=4,3,14,14,64 0=1 Split splitncnn_5 1 2 block_4_5 block_4_5_splitncnn_0 block_4_5_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64 Convolution conv4_6/expand 1 1 block_4_5_splitncnn_1 conv4_6/expand/bn_relu4_6/expand -23330=4,3,14,14,384 0=384 1=1 5=1 6=24576 9=1 ConvolutionDepthWise conv4_6/dwise 1 1 conv4_6/expand/bn_relu4_6/expand conv4_6/dwise/bn_relu4_6/dwise -23330=4,3,14,14,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1 Convolution conv4_6/linear 1 1 conv4_6/dwise/bn_relu4_6/dwise conv4_6/linear/bn_conv4_6/linear/scale -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576 Eltwise block_4_6 2 1 block_4_5_splitncnn_0 conv4_6/linear/bn_conv4_6/linear/scale block_4_6 -23330=4,3,14,14,64 0=1 Convolution conv4_7/expand 1 1 block_4_6 conv4_7/expand/bn_relu4_7/expand -23330=4,3,14,14,384 0=384 1=1 5=1 6=24576 9=1 ConvolutionDepthWise conv4_7/dwise 1 1 conv4_7/expand/bn_relu4_7/expand conv4_7/dwise/bn_relu4_7/dwise -23330=4,3,14,14,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1 Convolution conv4_7/linear 1 1 conv4_7/dwise/bn_relu4_7/dwise conv4_7/linear/bn_conv4_7/linear/scale -23330=4,3,14,14,96 0=96 1=1 5=1 6=36864 
Split splitncnn_6 1 2 conv4_7/linear/bn_conv4_7/linear/scale conv4_7/linear/bn_conv4_7/linear/scale_splitncnn_0 conv4_7/linear/bn_conv4_7/linear/scale_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96 Convolution conv5_1/expand 1 1 conv4_7/linear/bn_conv4_7/linear/scale_splitncnn_1 conv5_1/expand/bn_relu5_1/expand -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1 ConvolutionDepthWise conv5_1/dwise 1 1 conv5_1/expand/bn_relu5_1/expand conv5_1/dwise/bn_relu5_1/dwise -23330=4,3,14,14,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1 Convolution conv5_1/linear 1 1 conv5_1/dwise/bn_relu5_1/dwise conv5_1/linear/bn_conv5_1/linear/scale -23330=4,3,14,14,96 0=96 1=1 5=1 6=55296 Eltwise block_5_1 2 1 conv4_7/linear/bn_conv4_7/linear/scale_splitncnn_0 conv5_1/linear/bn_conv5_1/linear/scale block_5_1 -23330=4,3,14,14,96 0=1 Split splitncnn_7 1 2 block_5_1 block_5_1_splitncnn_0 block_5_1_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96 Convolution conv5_2/expand 1 1 block_5_1_splitncnn_1 conv5_2/expand/bn_relu5_2/expand -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1 ConvolutionDepthWise conv5_2/dwise 1 1 conv5_2/expand/bn_relu5_2/expand conv5_2/dwise/bn_relu5_2/dwise -23330=4,3,14,14,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1 Convolution conv5_2/linear 1 1 conv5_2/dwise/bn_relu5_2/dwise conv5_2/linear/bn_conv5_2/linear/scale -23330=4,3,14,14,96 0=96 1=1 5=1 6=55296 Eltwise block_5_2 2 1 block_5_1_splitncnn_0 conv5_2/linear/bn_conv5_2/linear/scale block_5_2 -23330=4,3,14,14,96 0=1 Convolution conv5_3/expand 1 1 block_5_2 conv5_3/expand/bn_relu5_3/expand -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1 ConvolutionDepthWise conv5_3/dwise 1 1 conv5_3/expand/bn_relu5_3/expand conv5_3/dwise/bn_relu5_3/dwise -23330=4,3,7,7,576 0=576 1=3 3=2 4=1 5=1 6=5184 7=576 9=1 Convolution conv5_3/linear 1 1 conv5_3/dwise/bn_relu5_3/dwise conv5_3/linear/bn_conv5_3/linear/scale -23330=4,3,7,7,160 0=160 1=1 5=1 6=92160 Split splitncnn_8 1 2 conv5_3/linear/bn_conv5_3/linear/scale conv5_3/linear/bn_conv5_3/linear/scale_splitncnn_0 
conv5_3/linear/bn_conv5_3/linear/scale_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160 Convolution conv6_1/expand 1 1 conv5_3/linear/bn_conv5_3/linear/scale_splitncnn_1 conv6_1/expand/bn_relu6_1/expand -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 9=1 ConvolutionDepthWise conv6_1/dwise 1 1 conv6_1/expand/bn_relu6_1/expand conv6_1/dwise/bn_relu6_1/dwise -23330=4,3,7,7,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1 Convolution conv6_1/linear 1 1 conv6_1/dwise/bn_relu6_1/dwise conv6_1/linear/bn_conv6_1/linear/scale -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600 Eltwise block_6_1 2 1 conv5_3/linear/bn_conv5_3/linear/scale_splitncnn_0 conv6_1/linear/bn_conv6_1/linear/scale block_6_1 -23330=4,3,7,7,160 0=1 Split splitncnn_9 1 2 block_6_1 block_6_1_splitncnn_0 block_6_1_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160 Convolution conv6_2/expand 1 1 block_6_1_splitncnn_1 conv6_2/expand/bn_relu6_2/expand -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 9=1 ConvolutionDepthWise conv6_2/dwise 1 1 conv6_2/expand/bn_relu6_2/expand conv6_2/dwise/bn_relu6_2/dwise -23330=4,3,7,7,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1 Convolution conv6_2/linear 1 1 conv6_2/dwise/bn_relu6_2/dwise conv6_2/linear/bn_conv6_2/linear/scale -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600 Eltwise block_6_2 2 1 block_6_1_splitncnn_0 conv6_2/linear/bn_conv6_2/linear/scale block_6_2 -23330=4,3,7,7,160 0=1 Convolution conv6_3/expand 1 1 block_6_2 conv6_3/expand/bn_relu6_3/expand -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 9=1 ConvolutionDepthWise conv6_3/dwise 1 1 conv6_3/expand/bn_relu6_3/expand conv6_3/dwise/bn_relu6_3/dwise -23330=4,3,7,7,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1 Convolution conv6_3/linear 1 1 conv6_3/dwise/bn_relu6_3/dwise conv6_3/linear/bn_conv6_3/linear/scale -23330=4,3,7,7,320 0=320 1=1 5=1 6=307200 Convolution conv6_4 1 1 conv6_3/linear/bn_conv6_3/linear/scale conv6_4/bn_relu6_4 -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600 9=1 Pooling pool6 1 1 conv6_4/bn_relu6_4 pool6 -23330=4,1,1280,1,1 0=1 4=1 InnerProduct fc7 1 1 pool6 fc7 
-23330=4,1,1000,1,1 0=1000 1=1 2=1280000 Softmax prob 1 1 fc7 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/mobilenet_v3.param ================================================ 7767517 145 163 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution 313 1 1 data 313 -23330=4,3,112,112,16 0=16 1=3 3=2 4=1 5=1 6=432 Split splitncnn_0 1 2 313 313_splitncnn_0 313_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16 HardSigmoid 319 1 1 313_splitncnn_1 319 -23330=4,3,112,112,16 BinaryOp 320 2 1 313_splitncnn_0 319 320 -23330=4,3,112,112,16 0=2 Split splitncnn_1 1 2 320 320_splitncnn_0 320_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16 ConvolutionDepthWise 321 1 1 320_splitncnn_1 323 -23330=4,3,112,112,16 0=16 1=3 4=1 5=1 6=144 7=16 9=1 Convolution 324 1 1 323 324 -23330=4,3,112,112,16 0=16 1=1 5=1 6=256 BinaryOp 326 2 1 320_splitncnn_0 324 326 -23330=4,3,112,112,16 Convolution 327 1 1 326 329 -23330=4,3,112,112,64 0=64 1=1 5=1 6=1024 9=1 ConvolutionDepthWise 330 1 1 329 332 -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1 Convolution 333 1 1 332 333 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1536 Split splitncnn_2 1 2 333 333_splitncnn_0 333_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 Convolution 335 1 1 333_splitncnn_1 337 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1 ConvolutionDepthWise 338 1 1 337 340 -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1 Convolution 341 1 1 340 341 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728 BinaryOp 343 2 1 333_splitncnn_0 341 343 -23330=4,3,56,56,24 Convolution 344 1 1 343 346 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1 ConvolutionDepthWise 347 1 1 346 347 -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72 Split splitncnn_3 1 2 347 347_splitncnn_0 347_splitncnn_1 -23330=8,3,28,28,72,3,28,28,72 Pooling 355 1 1 347_splitncnn_1 359 -23330=4,1,72,1,1 0=1 4=1 InnerProduct 360 1 1 359 361 -23330=4,1,18,1,1 0=18 1=1 2=1296 9=1 InnerProduct 362 1 1 361 362 -23330=4,1,72,1,1 0=72 1=1 2=1296 
HardSigmoid 367 1 1 362 367 -23330=4,1,72,1,1 BinaryOp 376 2 1 347_splitncnn_0 367 376 -23330=4,3,28,28,72 0=2 ReLU 377 1 1 376 377 -23330=4,3,28,28,72 Convolution 378 1 1 377 378 -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880 Split splitncnn_4 1 2 378 378_splitncnn_0 378_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 Convolution 380 1 1 378_splitncnn_1 382 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 ConvolutionDepthWise 383 1 1 382 383 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 Split splitncnn_5 1 2 383 383_splitncnn_0 383_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120 Pooling 391 1 1 383_splitncnn_1 395 -23330=4,1,120,1,1 0=1 4=1 InnerProduct 396 1 1 395 397 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1 InnerProduct 398 1 1 397 398 -23330=4,1,120,1,1 0=120 1=1 2=3600 HardSigmoid 403 1 1 398 403 -23330=4,1,120,1,1 BinaryOp 412 2 1 383_splitncnn_0 403 412 -23330=4,3,28,28,120 0=2 ReLU 413 1 1 412 413 -23330=4,3,28,28,120 Convolution 414 1 1 413 414 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 BinaryOp 416 2 1 378_splitncnn_0 414 416 -23330=4,3,28,28,40 Split splitncnn_6 1 2 416 416_splitncnn_0 416_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 Convolution 417 1 1 416_splitncnn_1 419 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 ConvolutionDepthWise 420 1 1 419 420 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 Split splitncnn_7 1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120 Pooling 428 1 1 420_splitncnn_1 432 -23330=4,1,120,1,1 0=1 4=1 InnerProduct 433 1 1 432 434 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1 InnerProduct 435 1 1 434 435 -23330=4,1,120,1,1 0=120 1=1 2=3600 HardSigmoid 440 1 1 435 440 -23330=4,1,120,1,1 BinaryOp 449 2 1 420_splitncnn_0 440 449 -23330=4,3,28,28,120 0=2 ReLU 450 1 1 449 450 -23330=4,3,28,28,120 Convolution 451 1 1 450 451 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 BinaryOp 453 2 1 416_splitncnn_0 451 453 -23330=4,3,28,28,40 Convolution 454 1 1 453 454 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600 HardSwish 461 1 1 454 461 
-23330=4,3,28,28,240 ConvolutionDepthWise 462 1 1 461 462 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240 HardSwish 469 1 1 462 469 -23330=4,3,14,14,240 Convolution 470 1 1 469 470 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 Split splitncnn_8 1 2 470 470_splitncnn_0 470_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution 472 1 1 470_splitncnn_1 472 -23330=4,3,14,14,200 0=200 1=1 5=1 6=16000 HardSwish 479 1 1 472 479 -23330=4,3,14,14,200 ConvolutionDepthWise 480 1 1 479 480 -23330=4,3,14,14,200 0=200 1=3 4=1 5=1 6=1800 7=200 HardSwish 487 1 1 480 487 -23330=4,3,14,14,200 Convolution 488 1 1 487 488 -23330=4,3,14,14,80 0=80 1=1 5=1 6=16000 BinaryOp 490 2 1 470_splitncnn_0 488 490 -23330=4,3,14,14,80 Split splitncnn_9 1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution 491 1 1 490_splitncnn_1 491 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720 HardSwish 498 1 1 491 498 -23330=4,3,14,14,184 ConvolutionDepthWise 499 1 1 498 499 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184 HardSwish 506 1 1 499 506 -23330=4,3,14,14,184 Convolution 507 1 1 506 507 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720 BinaryOp 509 2 1 490_splitncnn_0 507 509 -23330=4,3,14,14,80 Split splitncnn_10 1 2 509 509_splitncnn_0 509_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution 510 1 1 509_splitncnn_1 510 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720 HardSwish 517 1 1 510 517 -23330=4,3,14,14,184 ConvolutionDepthWise 518 1 1 517 518 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184 HardSwish 525 1 1 518 525 -23330=4,3,14,14,184 Convolution 526 1 1 525 526 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720 BinaryOp 528 2 1 509_splitncnn_0 526 528 -23330=4,3,14,14,80 Convolution 529 1 1 528 529 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 HardSwish 536 1 1 529 536 -23330=4,3,14,14,480 ConvolutionDepthWise 537 1 1 536 537 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480 Split splitncnn_11 1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 Pooling 545 1 
1 537_splitncnn_1 549 -23330=4,1,480,1,1 0=1 4=1 InnerProduct 550 1 1 549 551 -23330=4,1,120,1,1 0=120 1=1 2=57600 9=1 InnerProduct 552 1 1 551 552 -23330=4,1,480,1,1 0=480 1=1 2=57600 HardSigmoid 557 1 1 552 557 -23330=4,1,480,1,1 BinaryOp 566 2 1 537_splitncnn_0 557 566 -23330=4,3,14,14,480 0=2 HardSwish 572 1 1 566 572 -23330=4,3,14,14,480 Convolution 573 1 1 572 573 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760 Split splitncnn_12 1 2 573 573_splitncnn_0 573_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 Convolution 575 1 1 573_splitncnn_1 575 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264 HardSwish 582 1 1 575 582 -23330=4,3,14,14,672 ConvolutionDepthWise 583 1 1 582 583 -23330=4,3,14,14,672 0=672 1=3 4=1 5=1 6=6048 7=672 Split splitncnn_13 1 2 583 583_splitncnn_0 583_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 Pooling 591 1 1 583_splitncnn_1 595 -23330=4,1,672,1,1 0=1 4=1 InnerProduct 596 1 1 595 597 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1 InnerProduct 598 1 1 597 598 -23330=4,1,672,1,1 0=672 1=1 2=112896 HardSigmoid 603 1 1 598 603 -23330=4,1,672,1,1 BinaryOp 612 2 1 583_splitncnn_0 603 612 -23330=4,3,14,14,672 0=2 HardSwish 618 1 1 612 618 -23330=4,3,14,14,672 Convolution 619 1 1 618 619 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264 BinaryOp 621 2 1 573_splitncnn_0 619 621 -23330=4,3,14,14,112 Convolution 622 1 1 621 622 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264 HardSwish 629 1 1 622 629 -23330=4,3,14,14,672 ConvolutionDepthWise 630 1 1 629 630 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672 Split splitncnn_14 1 2 630 630_splitncnn_0 630_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 Pooling 638 1 1 630_splitncnn_1 642 -23330=4,1,672,1,1 0=1 4=1 InnerProduct 643 1 1 642 644 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1 InnerProduct 645 1 1 644 645 -23330=4,1,672,1,1 0=672 1=1 2=112896 HardSigmoid 650 1 1 645 650 -23330=4,1,672,1,1 BinaryOp 659 2 1 630_splitncnn_0 650 659 -23330=4,3,14,14,672 0=2 HardSwish 665 1 1 659 665 -23330=4,3,14,14,672 Convolution 666 1 1 665 
666 -23330=4,3,14,14,160 0=160 1=1 5=1 6=107520 Convolution 668 1 1 666 668 -23330=4,3,14,14,672 0=672 1=1 5=1 6=107520 HardSwish 675 1 1 668 675 -23330=4,3,14,14,672 ConvolutionDepthWise 676 1 1 675 676 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672 Split splitncnn_15 1 2 676 676_splitncnn_0 676_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672 Pooling 684 1 1 676_splitncnn_1 688 -23330=4,1,672,1,1 0=1 4=1 InnerProduct 689 1 1 688 690 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1 InnerProduct 691 1 1 690 691 -23330=4,1,672,1,1 0=672 1=1 2=112896 HardSigmoid 696 1 1 691 696 -23330=4,1,672,1,1 BinaryOp 705 2 1 676_splitncnn_0 696 705 -23330=4,3,7,7,672 0=2 HardSwish 711 1 1 705 711 -23330=4,3,7,7,672 Convolution 712 1 1 711 712 -23330=4,3,7,7,160 0=160 1=1 5=1 6=107520 Split splitncnn_16 1 2 712 712_splitncnn_0 712_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160 Convolution 714 1 1 712_splitncnn_1 714 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 HardSwish 721 1 1 714 721 -23330=4,3,7,7,960 ConvolutionDepthWise 722 1 1 721 722 -23330=4,3,7,7,960 0=960 1=5 4=2 5=1 6=24000 7=960 Split splitncnn_17 1 2 722 722_splitncnn_0 722_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960 Pooling 730 1 1 722_splitncnn_1 734 -23330=4,1,960,1,1 0=1 4=1 InnerProduct 735 1 1 734 736 -23330=4,1,240,1,1 0=240 1=1 2=230400 9=1 InnerProduct 737 1 1 736 737 -23330=4,1,960,1,1 0=960 1=1 2=230400 HardSigmoid 742 1 1 737 742 -23330=4,1,960,1,1 BinaryOp 751 2 1 722_splitncnn_0 742 751 -23330=4,3,7,7,960 0=2 HardSwish 757 1 1 751 757 -23330=4,3,7,7,960 Convolution 758 1 1 757 758 -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600 BinaryOp 760 2 1 712_splitncnn_0 758 760 -23330=4,3,7,7,160 Convolution 761 1 1 760 761 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 HardSwish 768 1 1 761 768 -23330=4,3,7,7,960 Pooling 769 1 1 768 769 -23330=4,1,960,1,1 0=1 4=1 HardSwish 775 1 1 769 775 -23330=4,1,960,1,1 Reshape 783 1 1 775 783 -23330=4,1,960,1,1 0=-1 InnerProduct 784 1 1 783 784 -23330=4,1,1280,1,1 0=1280 1=1 2=1228800 HardSwish 790 1 1 
784 790 -23330=4,1,1280,1,1 InnerProduct 791 1 1 790 791 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000 Softmax prob 1 1 791 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/mobilenet_yolo.param ================================================ 7767517 39 41 Input data 0 1 data -23330=4,3,416,416,3 0=416 1=416 2=3 Convolution conv0 1 1 data conv0_conv0/relu -23330=4,3,208,208,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1 ConvolutionDepthWise conv1/dw 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu -23330=4,3,208,208,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1 Convolution conv1 1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu -23330=4,3,208,208,64 0=64 1=1 5=1 6=2048 9=1 ConvolutionDepthWise conv2/dw 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu -23330=4,3,104,104,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1 Convolution conv2 1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu -23330=4,3,104,104,128 0=128 1=1 5=1 6=8192 9=1 ConvolutionDepthWise conv3/dw 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu -23330=4,3,104,104,128 0=128 1=3 4=1 5=1 6=1152 7=128 9=1 Convolution conv3 1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu -23330=4,3,104,104,128 0=128 1=1 5=1 6=16384 9=1 ConvolutionDepthWise conv4/dw 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu -23330=4,3,52,52,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 9=1 Convolution conv4 1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu -23330=4,3,52,52,256 0=256 1=1 5=1 6=32768 9=1 ConvolutionDepthWise conv5/dw 1 1 conv4_conv4/relu conv5/dw_conv5/dw/relu -23330=4,3,52,52,256 0=256 1=3 4=1 5=1 6=2304 7=256 9=1 Convolution conv5 1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu -23330=4,3,52,52,256 0=256 1=1 5=1 6=65536 9=1 ConvolutionDepthWise conv6/dw 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu -23330=4,3,26,26,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 9=1 Convolution conv6 1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=131072 9=1 ConvolutionDepthWise conv7/dw 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu -23330=4,3,26,26,512 0=512 
1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv7 1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv8/dw 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv8 1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv9/dw 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv9 1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv10/dw 1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv10 1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1 ConvolutionDepthWise conv11/dw 1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv11 1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1 Split splitncnn_0 1 2 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 -23330=8,3,26,26,512,3,26,26,512 ConvolutionDepthWise conv12/dw 1 1 conv11_conv11/relu_splitncnn_1 conv12/dw_conv12/dw/relu -23330=4,3,13,13,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 9=1 Convolution conv12 1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu -23330=4,3,13,13,1024 0=1024 1=1 5=1 6=524288 9=1 ConvolutionDepthWise conv13/dw 1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu -23330=4,3,13,13,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 9=1 Convolution conv13 1 1 conv13/dw_conv13/dw/relu conv13_conv13/relu -23330=4,3,13,13,1024 0=1024 1=1 5=1 6=1048576 9=1 ConvolutionDepthWise conv16/dw 1 1 conv13_conv13/relu conv16/dw_conv16/dw/relu -23330=4,3,13,13,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 9=1 Convolution conv17 1 1 conv16/dw_conv16/dw/relu conv17_conv17/relu 
-23330=4,3,13,13,1024 0=1024 1=1 5=1 6=1048576 9=1 Split splitncnn_1 1 2 conv17_conv17/relu conv17_conv17/relu_splitncnn_0 conv17_conv17/relu_splitncnn_1 -23330=8,3,13,13,1024,3,13,13,1024 DeconvolutionDepthWise upsample 1 1 conv17_conv17/relu_splitncnn_1 upsample -23330=4,3,26,26,512 0=512 1=4 3=2 4=1 6=16384 7=512 Eltwise conv_18/sum 2 1 conv11_conv11/relu_splitncnn_0 upsample conv_18/sum -23330=4,3,26,26,512 0=1 ConvolutionDepthWise conv19/dw 1 1 conv_18/sum conv19/dw_conv19/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1 Convolution conv20 1 1 conv19/dw_conv19/dw/relu conv20_conv20/relu -23330=4,3,26,26,1024 0=1024 1=1 5=1 6=524288 9=1 Convolution conv22_indoor 1 1 conv17_conv17/relu_splitncnn_0 conv22 -23330=4,3,13,13,125 0=125 1=1 5=1 6=128000 Convolution conv23_indoor 1 1 conv20_conv20/relu conv23 -23330=4,3,26,26,125 0=125 1=1 5=1 6=128000 YoloDetectionOutput detection_out 2 1 conv22 conv23 output -23330=4,3,13,13,125 2=4.000000e-01 -23304=10,1.080000e+00,1.190000e+00,3.420000e+00,4.410000e+00,6.630000e+00,1.138000e+01,9.420000e+00,5.110000e+00,1.662000e+01,1.052000e+01 ================================================ FILE: benchmark/mobilenetv2_yolov3.param ================================================ 7767517 87 99 Input data 0 1 data -23330=4,3,352,352,3 0=352 1=352 2=3 Convolution conv1 1 1 data conv1_relu1 -23330=4,3,176,176,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1 ConvolutionDepthWise conv2 1 1 conv1_relu1 conv2_relu2 -23330=4,3,176,176,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1 Convolution conv3 1 1 conv2_relu2 conv3 -23330=4,3,176,176,16 0=16 1=1 5=1 6=512 Convolution conv4 1 1 conv3 conv4_relu3 -23330=4,3,176,176,96 0=96 1=1 5=1 6=1536 9=1 ConvolutionDepthWise conv5 1 1 conv4_relu3 conv5_relu4 -23330=4,3,88,88,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96 9=1 Convolution conv6 1 1 conv5_relu4 conv6 -23330=4,3,88,88,24 0=24 1=1 5=1 6=2304 Split splitncnn_0 1 2 conv6 conv6_splitncnn_0 conv6_splitncnn_1 -23330=8,3,88,88,24,3,88,88,24 Convolution conv7 1 1 
conv6_splitncnn_1 conv7_relu5 -23330=4,3,88,88,144 0=144 1=1 5=1 6=3456 9=1 ConvolutionDepthWise conv8 1 1 conv7_relu5 conv8_relu6 -23330=4,3,88,88,144 0=144 1=3 4=1 5=1 6=1296 7=144 9=1 Convolution conv9 1 1 conv8_relu6 conv9 -23330=4,3,88,88,24 0=24 1=1 5=1 6=3456 Eltwise add1 2 1 conv6_splitncnn_0 conv9 add1 -23330=4,3,88,88,24 0=1 Convolution conv10 1 1 add1 conv10_relu7 -23330=4,3,88,88,144 0=144 1=1 5=1 6=3456 9=1 ConvolutionDepthWise conv11 1 1 conv10_relu7 conv11_relu8 -23330=4,3,44,44,144 0=144 1=3 3=2 4=1 5=1 6=1296 7=144 9=1 Convolution conv12 1 1 conv11_relu8 conv12 -23330=4,3,44,44,32 0=32 1=1 5=1 6=4608 Split splitncnn_1 1 2 conv12 conv12_splitncnn_0 conv12_splitncnn_1 -23330=8,3,44,44,32,3,44,44,32 Convolution conv13 1 1 conv12_splitncnn_1 conv13_relu9 -23330=4,3,44,44,192 0=192 1=1 5=1 6=6144 9=1 ConvolutionDepthWise conv14 1 1 conv13_relu9 conv14_relu10 -23330=4,3,44,44,192 0=192 1=3 4=1 5=1 6=1728 7=192 9=1 Convolution conv15 1 1 conv14_relu10 conv15 -23330=4,3,44,44,32 0=32 1=1 5=1 6=6144 Eltwise add2 2 1 conv12_splitncnn_0 conv15 add2 -23330=4,3,44,44,32 0=1 Split splitncnn_2 1 2 add2 add2_splitncnn_0 add2_splitncnn_1 -23330=8,3,44,44,32,3,44,44,32 Convolution conv16 1 1 add2_splitncnn_1 conv16_relu11 -23330=4,3,44,44,192 0=192 1=1 5=1 6=6144 9=1 ConvolutionDepthWise conv17 1 1 conv16_relu11 conv17_relu12 -23330=4,3,44,44,192 0=192 1=3 4=1 5=1 6=1728 7=192 9=1 Convolution conv18 1 1 conv17_relu12 conv18 -23330=4,3,44,44,32 0=32 1=1 5=1 6=6144 Eltwise add3 2 1 add2_splitncnn_0 conv18 add3 -23330=4,3,44,44,32 0=1 Convolution conv19 1 1 add3 conv19_relu13 -23330=4,3,44,44,192 0=192 1=1 5=1 6=6144 9=1 ConvolutionDepthWise conv20 1 1 conv19_relu13 conv20_relu14 -23330=4,3,22,22,192 0=192 1=3 3=2 4=1 5=1 6=1728 7=192 9=1 Convolution conv21 1 1 conv20_relu14 conv21 -23330=4,3,22,22,64 0=64 1=1 5=1 6=12288 Split splitncnn_3 1 2 conv21 conv21_splitncnn_0 conv21_splitncnn_1 -23330=8,3,22,22,64,3,22,22,64 Convolution conv22 1 1 conv21_splitncnn_1 
conv22_relu15 -23330=4,3,22,22,384 0=384 1=1 5=1 6=24576 9=1 ConvolutionDepthWise conv23 1 1 conv22_relu15 conv23_relu16 -23330=4,3,22,22,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1 Convolution conv24 1 1 conv23_relu16 conv24 -23330=4,3,22,22,64 0=64 1=1 5=1 6=24576 Eltwise add4 2 1 conv21_splitncnn_0 conv24 add4 -23330=4,3,22,22,64 0=1 Split splitncnn_4 1 2 add4 add4_splitncnn_0 add4_splitncnn_1 -23330=8,3,22,22,64,3,22,22,64 Convolution conv25 1 1 add4_splitncnn_1 conv25_relu17 -23330=4,3,22,22,384 0=384 1=1 5=1 6=24576 9=1 ConvolutionDepthWise conv26 1 1 conv25_relu17 conv26_relu18 -23330=4,3,22,22,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1 Convolution conv27 1 1 conv26_relu18 conv27 -23330=4,3,22,22,64 0=64 1=1 5=1 6=24576 Eltwise add5 2 1 add4_splitncnn_0 conv27 add5 -23330=4,3,22,22,64 0=1 Split splitncnn_5 1 2 add5 add5_splitncnn_0 add5_splitncnn_1 -23330=8,3,22,22,64,3,22,22,64 Convolution conv28 1 1 add5_splitncnn_1 conv28_relu19 -23330=4,3,22,22,384 0=384 1=1 5=1 6=24576 9=1 ConvolutionDepthWise conv29 1 1 conv28_relu19 conv29_relu20 -23330=4,3,22,22,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1 Convolution conv30 1 1 conv29_relu20 conv30 -23330=4,3,22,22,64 0=64 1=1 5=1 6=24576 Eltwise add6 2 1 add5_splitncnn_0 conv30 add6 -23330=4,3,22,22,64 0=1 Convolution conv31 1 1 add6 conv31_relu21 -23330=4,3,22,22,384 0=384 1=1 5=1 6=24576 9=1 ConvolutionDepthWise conv32 1 1 conv31_relu21 conv32_relu22 -23330=4,3,22,22,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1 Convolution conv33 1 1 conv32_relu22 conv33 -23330=4,3,22,22,96 0=96 1=1 5=1 6=36864 Split splitncnn_6 1 2 conv33 conv33_splitncnn_0 conv33_splitncnn_1 -23330=8,3,22,22,96,3,22,22,96 Convolution conv34 1 1 conv33_splitncnn_1 conv34_relu23 -23330=4,3,22,22,576 0=576 1=1 5=1 6=55296 9=1 ConvolutionDepthWise conv35 1 1 conv34_relu23 conv35_relu24 -23330=4,3,22,22,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1 Convolution conv36 1 1 conv35_relu24 conv36 -23330=4,3,22,22,96 0=96 1=1 5=1 6=55296 Eltwise add7 2 1 conv33_splitncnn_0 conv36 
add7 -23330=4,3,22,22,96 0=1 Split splitncnn_7 1 2 add7 add7_splitncnn_0 add7_splitncnn_1 -23330=8,3,22,22,96,3,22,22,96 Convolution conv37 1 1 add7_splitncnn_1 conv37_relu25 -23330=4,3,22,22,576 0=576 1=1 5=1 6=55296 9=1 ConvolutionDepthWise conv38 1 1 conv37_relu25 conv38_relu26 -23330=4,3,22,22,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1 Convolution conv39 1 1 conv38_relu26 conv39 -23330=4,3,22,22,96 0=96 1=1 5=1 6=55296 Eltwise add8 2 1 add7_splitncnn_0 conv39 add8 -23330=4,3,22,22,96 0=1 Convolution conv40 1 1 add8 conv40_relu27 -23330=4,3,22,22,576 0=576 1=1 5=1 6=55296 9=1 Split splitncnn_8 1 2 conv40_relu27 conv40_relu27_splitncnn_0 conv40_relu27_splitncnn_1 -23330=8,3,22,22,576,3,22,22,576 ConvolutionDepthWise conv41 1 1 conv40_relu27_splitncnn_1 conv41_relu28 -23330=4,3,11,11,576 0=576 1=3 3=2 4=1 5=1 6=5184 7=576 9=1 Convolution conv42 1 1 conv41_relu28 conv42 -23330=4,3,11,11,160 0=160 1=1 5=1 6=92160 Split splitncnn_9 1 2 conv42 conv42_splitncnn_0 conv42_splitncnn_1 -23330=8,3,11,11,160,3,11,11,160 Convolution conv43 1 1 conv42_splitncnn_1 conv43_relu29 -23330=4,3,11,11,960 0=960 1=1 5=1 6=153600 9=1 ConvolutionDepthWise conv44 1 1 conv43_relu29 conv44_relu30 -23330=4,3,11,11,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1 Convolution conv45 1 1 conv44_relu30 conv45 -23330=4,3,11,11,160 0=160 1=1 5=1 6=153600 Eltwise add9 2 1 conv42_splitncnn_0 conv45 add9 -23330=4,3,11,11,160 0=1 Split splitncnn_10 1 2 add9 add9_splitncnn_0 add9_splitncnn_1 -23330=8,3,11,11,160,3,11,11,160 Convolution conv46 1 1 add9_splitncnn_1 conv46_relu31 -23330=4,3,11,11,960 0=960 1=1 5=1 6=153600 9=1 ConvolutionDepthWise conv47 1 1 conv46_relu31 conv47_relu32 -23330=4,3,11,11,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1 Convolution conv48 1 1 conv47_relu32 conv48 -23330=4,3,11,11,160 0=160 1=1 5=1 6=153600 Eltwise add10 2 1 add9_splitncnn_0 conv48 add10 -23330=4,3,11,11,160 0=1 Convolution conv49 1 1 add10 conv49_relu33 -23330=4,3,11,11,960 0=960 1=1 5=1 6=153600 9=1 ConvolutionDepthWise conv50 1 1 
conv49_relu33 conv50_relu34 -23330=4,3,11,11,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1 Convolution conv51 1 1 conv50_relu34 conv51 -23330=4,3,11,11,320 0=320 1=1 5=1 6=307200 Convolution conv52 1 1 conv51 conv52_relu35 -23330=4,3,11,11,1280 0=1280 1=1 5=1 6=409600 9=1 ConvolutionDepthWise yolo/conv1/dw 1 1 conv52_relu35 yolo/conv1/dw_yolo/conv1/dw/relu -23330=4,3,11,11,1280 0=1280 1=3 4=1 5=1 6=11520 7=1280 9=1 Convolution yolo/conv1 1 1 yolo/conv1/dw_yolo/conv1/dw/relu yolo/conv1_yolo/conv1/relu -23330=4,3,11,11,576 0=576 1=1 5=1 6=737280 9=1 Split splitncnn_11 1 2 yolo/conv1_yolo/conv1/relu yolo/conv1_yolo/conv1/relu_splitncnn_0 yolo/conv1_yolo/conv1/relu_splitncnn_1 -23330=8,3,11,11,576,3,11,11,576 DeconvolutionDepthWise upsample 1 1 yolo/conv1_yolo/conv1/relu_splitncnn_1 upsample -23330=4,3,21,21,576 0=576 1=1 3=2 6=576 7=576 Pooling maxpool 1 1 upsample maxpool -23330=4,3,22,22,576 1=2 3=1 ConvolutionDepthWise yolo/conv2/dw 1 1 conv40_relu27_splitncnn_0 yolo/conv2/dw_yolo/conv2/dw/relu -23330=4,3,22,22,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1 Convolution yolo/conv2 1 1 yolo/conv2/dw_yolo/conv2/dw/relu yolo/conv2_yolo/conv2/relu -23330=4,3,22,22,576 0=576 1=1 5=1 6=331776 9=1 Eltwise yolo/conv2/sum 2 1 maxpool yolo/conv2_yolo/conv2/relu yolo/conv2/sum -23330=4,3,22,22,576 0=1 ConvolutionDepthWise yolo/conv3/dw 1 1 yolo/conv2/sum yolo/conv3/dw_yolo/conv3/dw/relu -23330=4,3,22,22,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1 Convolution yolo/conv3 1 1 yolo/conv3/dw_yolo/conv3/dw/relu yolo/conv3_yolo/conv3/relu -23330=4,3,22,22,576 0=576 1=1 5=1 6=331776 9=1 Convolution yolo/conv4 1 1 yolo/conv1_yolo/conv1/relu_splitncnn_0 yolo/conv4 -23330=4,3,11,11,75 0=75 1=1 5=1 6=43200 Convolution yolo/conv5 1 1 yolo/conv3_yolo/conv3/relu yolo/conv5 -23330=4,3,22,22,75 0=75 1=1 5=1 6=43200 Yolov3DetectionOutput detection_out 2 1 yolo/conv4 yolo/conv5 output 1=3 2=3.000000e-01 
-23304=12,2.000000e+01,3.700000e+01,4.900000e+01,9.400000e+01,7.300000e+01,2.010000e+02,1.430000e+02,2.650000e+02,1.530000e+02,1.210000e+02,2.800000e+02,2.790000e+02 -23305=6,1077936128,1082130432,1084227584,0,1065353216,1073741824 -23306=2,3.200000e+01,1.600000e+01 ================================================ FILE: benchmark/nanodet_m.param ================================================ 7767517 179 204 Input input.1 0 1 input.1 -23330=4,3,320,320,3 0=320 1=320 2=3 Convolution Conv_0 1 1 input.1 424 -23330=4,3,160,160,24 0=24 1=3 3=2 4=1 5=1 6=648 9=2 -23310=1,1.000000e-01 Pooling MaxPool_2 1 1 424 425 -23330=4,3,80,80,24 1=3 2=2 3=1 5=1 Split splitncnn_0 1 2 425 425_splitncnn_0 425_splitncnn_1 -23330=8,3,80,80,24,3,80,80,24 ConvolutionDepthWise Conv_3 1 1 425_splitncnn_1 943 -23330=4,3,40,40,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24 Convolution Conv_4 1 1 943 430 -23330=4,3,40,40,58 0=58 1=1 5=1 6=1392 9=2 -23310=1,1.000000e-01 Convolution Conv_6 1 1 425_splitncnn_0 433 -23330=4,3,80,80,58 0=58 1=1 5=1 6=1392 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_8 1 1 433 952 -23330=4,3,40,40,58 0=58 1=3 3=2 4=1 5=1 6=522 7=58 Convolution Conv_9 1 1 952 438 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01 Concat Concat_11 2 1 430 438 439 -23330=4,3,40,40,116 ShuffleChannel Reshape_16 1 1 439 444 -23330=4,3,40,40,116 0=2 Split splitncnn_1 1 2 444 444_splitncnn_0 444_splitncnn_1 -23330=8,3,40,40,116,3,40,40,116 Crop Slice_27 1 1 444_splitncnn_1 455 -23330=4,3,40,40,58 -23309=1,0 -23310=1,58 -23311=1,0 Crop Slice_30 1 1 444_splitncnn_0 458 -23330=4,3,40,40,58 -23309=1,58 -23310=1,116 -23311=1,0 Convolution Conv_31 1 1 458 461 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_33 1 1 461 961 -23330=4,3,40,40,58 0=58 1=3 4=1 5=1 6=522 7=58 Convolution Conv_34 1 1 961 466 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01 Concat Concat_36 2 1 455 466 467 -23330=4,3,40,40,116 ShuffleChannel Reshape_41 
1 1 467 472 -23330=4,3,40,40,116 0=2 Split splitncnn_2 1 2 472 472_splitncnn_0 472_splitncnn_1 -23330=8,3,40,40,116,3,40,40,116 Crop Slice_52 1 1 472_splitncnn_1 483 -23330=4,3,40,40,58 -23309=1,0 -23310=1,58 -23311=1,0 Crop Slice_55 1 1 472_splitncnn_0 486 -23330=4,3,40,40,58 -23309=1,58 -23310=1,116 -23311=1,0 Convolution Conv_56 1 1 486 489 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_58 1 1 489 970 -23330=4,3,40,40,58 0=58 1=3 4=1 5=1 6=522 7=58 Convolution Conv_59 1 1 970 494 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01 Concat Concat_61 2 1 483 494 495 -23330=4,3,40,40,116 ShuffleChannel Reshape_66 1 1 495 500 -23330=4,3,40,40,116 0=2 Split splitncnn_3 1 2 500 500_splitncnn_0 500_splitncnn_1 -23330=8,3,40,40,116,3,40,40,116 Crop Slice_77 1 1 500_splitncnn_1 511 -23330=4,3,40,40,58 -23309=1,0 -23310=1,58 -23311=1,0 Crop Slice_80 1 1 500_splitncnn_0 514 -23330=4,3,40,40,58 -23309=1,58 -23310=1,116 -23311=1,0 Convolution Conv_81 1 1 514 517 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_83 1 1 517 979 -23330=4,3,40,40,58 0=58 1=3 4=1 5=1 6=522 7=58 Convolution Conv_84 1 1 979 522 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01 Concat Concat_86 2 1 511 522 523 -23330=4,3,40,40,116 ShuffleChannel Reshape_91 1 1 523 528 -23330=4,3,40,40,116 0=2 Split splitncnn_4 1 3 528 528_splitncnn_0 528_splitncnn_1 528_splitncnn_2 -23330=12,3,40,40,116,3,40,40,116,3,40,40,116 ConvolutionDepthWise Conv_92 1 1 528_splitncnn_2 985 -23330=4,3,20,20,116 0=116 1=3 3=2 4=1 5=1 6=1044 7=116 Convolution Conv_93 1 1 985 533 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 Convolution Conv_95 1 1 528_splitncnn_1 536 -23330=4,3,40,40,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_97 1 1 536 994 -23330=4,3,20,20,116 0=116 1=3 3=2 4=1 5=1 6=1044 7=116 Convolution Conv_98 1 1 994 541 -23330=4,3,20,20,116 0=116 1=1 
5=1 6=13456 9=2 -23310=1,1.000000e-01 Concat Concat_100 2 1 533 541 542 -23330=4,3,20,20,232 ShuffleChannel Reshape_105 1 1 542 547 -23330=4,3,20,20,232 0=2 Split splitncnn_5 1 2 547 547_splitncnn_0 547_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232 Crop Slice_116 1 1 547_splitncnn_1 558 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0 Crop Slice_119 1 1 547_splitncnn_0 561 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0 Convolution Conv_120 1 1 561 564 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_122 1 1 564 1003 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution Conv_123 1 1 1003 569 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 Concat Concat_125 2 1 558 569 570 -23330=4,3,20,20,232 ShuffleChannel Reshape_130 1 1 570 575 -23330=4,3,20,20,232 0=2 Split splitncnn_6 1 2 575 575_splitncnn_0 575_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232 Crop Slice_141 1 1 575_splitncnn_1 586 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0 Crop Slice_144 1 1 575_splitncnn_0 589 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0 Convolution Conv_145 1 1 589 592 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_147 1 1 592 1012 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution Conv_148 1 1 1012 597 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 Concat Concat_150 2 1 586 597 598 -23330=4,3,20,20,232 ShuffleChannel Reshape_155 1 1 598 603 -23330=4,3,20,20,232 0=2 Split splitncnn_7 1 2 603 603_splitncnn_0 603_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232 Crop Slice_166 1 1 603_splitncnn_1 614 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0 Crop Slice_169 1 1 603_splitncnn_0 617 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0 Convolution Conv_170 1 1 617 620 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_172 1 1 620 1021 
-23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution Conv_173 1 1 1021 625 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 Concat Concat_175 2 1 614 625 626 -23330=4,3,20,20,232 ShuffleChannel Reshape_180 1 1 626 631 -23330=4,3,20,20,232 0=2 Split splitncnn_8 1 2 631 631_splitncnn_0 631_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232 Crop Slice_191 1 1 631_splitncnn_1 642 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0 Crop Slice_194 1 1 631_splitncnn_0 645 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0 Convolution Conv_195 1 1 645 648 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_197 1 1 648 1030 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution Conv_198 1 1 1030 653 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 Concat Concat_200 2 1 642 653 654 -23330=4,3,20,20,232 ShuffleChannel Reshape_205 1 1 654 659 -23330=4,3,20,20,232 0=2 Split splitncnn_9 1 2 659 659_splitncnn_0 659_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232 Crop Slice_216 1 1 659_splitncnn_1 670 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0 Crop Slice_219 1 1 659_splitncnn_0 673 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0 Convolution Conv_220 1 1 673 676 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_222 1 1 676 1039 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution Conv_223 1 1 1039 681 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 Concat Concat_225 2 1 670 681 682 -23330=4,3,20,20,232 ShuffleChannel Reshape_230 1 1 682 687 -23330=4,3,20,20,232 0=2 Split splitncnn_10 1 2 687 687_splitncnn_0 687_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232 Crop Slice_241 1 1 687_splitncnn_1 698 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0 Crop Slice_244 1 1 687_splitncnn_0 701 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0 Convolution Conv_245 1 1 701 704 
-23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_247 1 1 704 1048 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution Conv_248 1 1 1048 709 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 Concat Concat_250 2 1 698 709 710 -23330=4,3,20,20,232 ShuffleChannel Reshape_255 1 1 710 715 -23330=4,3,20,20,232 0=2 Split splitncnn_11 1 2 715 715_splitncnn_0 715_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232 Crop Slice_266 1 1 715_splitncnn_1 726 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0 Crop Slice_269 1 1 715_splitncnn_0 729 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0 Convolution Conv_270 1 1 729 732 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_272 1 1 732 1057 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution Conv_273 1 1 1057 737 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01 Concat Concat_275 2 1 726 737 738 -23330=4,3,20,20,232 ShuffleChannel Reshape_280 1 1 738 743 -23330=4,3,20,20,232 0=2 Split splitncnn_12 1 3 743 743_splitncnn_0 743_splitncnn_1 743_splitncnn_2 -23330=12,3,20,20,232,3,20,20,232,3,20,20,232 ConvolutionDepthWise Conv_281 1 1 743_splitncnn_2 1063 -23330=4,3,10,10,232 0=232 1=3 3=2 4=1 5=1 6=2088 7=232 Convolution Conv_282 1 1 1063 748 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01 Convolution Conv_284 1 1 743_splitncnn_1 751 -23330=4,3,20,20,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_286 1 1 751 1072 -23330=4,3,10,10,232 0=232 1=3 3=2 4=1 5=1 6=2088 7=232 Convolution Conv_287 1 1 1072 756 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01 Concat Concat_289 2 1 748 756 757 -23330=4,3,10,10,464 ShuffleChannel Reshape_294 1 1 757 762 -23330=4,3,10,10,464 0=2 Split splitncnn_13 1 2 762 762_splitncnn_0 762_splitncnn_1 -23330=8,3,10,10,464,3,10,10,464 Crop Slice_305 1 1 762_splitncnn_1 773 
-23330=4,3,10,10,232 -23309=1,0 -23310=1,232 -23311=1,0 Crop Slice_308 1 1 762_splitncnn_0 776 -23330=4,3,10,10,232 -23309=1,232 -23310=1,464 -23311=1,0 Convolution Conv_309 1 1 776 779 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_311 1 1 779 1081 -23330=4,3,10,10,232 0=232 1=3 4=1 5=1 6=2088 7=232 Convolution Conv_312 1 1 1081 784 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01 Concat Concat_314 2 1 773 784 785 -23330=4,3,10,10,464 ShuffleChannel Reshape_319 1 1 785 790 -23330=4,3,10,10,464 0=2 Split splitncnn_14 1 2 790 790_splitncnn_0 790_splitncnn_1 -23330=8,3,10,10,464,3,10,10,464 Crop Slice_330 1 1 790_splitncnn_1 801 -23330=4,3,10,10,232 -23309=1,0 -23310=1,232 -23311=1,0 Crop Slice_333 1 1 790_splitncnn_0 804 -23330=4,3,10,10,232 -23309=1,232 -23310=1,464 -23311=1,0 Convolution Conv_334 1 1 804 807 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_336 1 1 807 1090 -23330=4,3,10,10,232 0=232 1=3 4=1 5=1 6=2088 7=232 Convolution Conv_337 1 1 1090 812 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01 Concat Concat_339 2 1 801 812 813 -23330=4,3,10,10,464 ShuffleChannel Reshape_344 1 1 813 818 -23330=4,3,10,10,464 0=2 Split splitncnn_15 1 2 818 818_splitncnn_0 818_splitncnn_1 -23330=8,3,10,10,464,3,10,10,464 Crop Slice_355 1 1 818_splitncnn_1 829 -23330=4,3,10,10,232 -23309=1,0 -23310=1,232 -23311=1,0 Crop Slice_358 1 1 818_splitncnn_0 832 -23330=4,3,10,10,232 -23309=1,232 -23310=1,464 -23311=1,0 Convolution Conv_359 1 1 832 835 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_361 1 1 835 1099 -23330=4,3,10,10,232 0=232 1=3 4=1 5=1 6=2088 7=232 Convolution Conv_362 1 1 1099 840 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01 Concat Concat_364 2 1 829 840 841 -23330=4,3,10,10,464 ShuffleChannel Reshape_369 1 1 841 846 -23330=4,3,10,10,464 0=2 Convolution Conv_370 
1 1 528_splitncnn_0 847 -23330=4,3,40,40,96 0=96 1=1 5=1 6=11136 Convolution Conv_371 1 1 743_splitncnn_0 848 -23330=4,3,20,20,96 0=96 1=1 5=1 6=22272 Convolution Conv_372 1 1 846 849 -23330=4,3,10,10,96 0=96 1=1 5=1 6=44544 Split splitncnn_16 1 2 849 849_splitncnn_0 849_splitncnn_1 -23330=8,3,10,10,96,3,10,10,96 Interp Resize_374 1 1 849_splitncnn_1 854 -23330=4,3,20,20,96 0=2 1=2.000000e+00 2=2.000000e+00 BinaryOp Add_375 2 1 848 854 855 -23330=4,3,20,20,96 Split splitncnn_17 1 2 855 855_splitncnn_0 855_splitncnn_1 -23330=8,3,20,20,96,3,20,20,96 Interp Resize_377 1 1 855_splitncnn_1 860 -23330=4,3,40,40,96 0=2 1=2.000000e+00 2=2.000000e+00 BinaryOp Add_378 2 1 847 860 861 -23330=4,3,40,40,96 Split splitncnn_18 1 2 861 861_splitncnn_0 861_splitncnn_1 -23330=8,3,40,40,96,3,40,40,96 Interp Resize_380 1 1 861_splitncnn_1 866 -23330=4,3,20,20,96 0=2 1=5.000000e-01 2=5.000000e-01 BinaryOp Add_381 2 1 855_splitncnn_0 866 867 -23330=4,3,20,20,96 Split splitncnn_19 1 2 867 867_splitncnn_0 867_splitncnn_1 -23330=8,3,20,20,96,3,20,20,96 Interp Resize_383 1 1 867_splitncnn_1 872 -23330=4,3,10,10,96 0=2 1=5.000000e-01 2=5.000000e-01 BinaryOp Add_384 2 1 849_splitncnn_0 872 873 -23330=4,3,10,10,96 ConvolutionDepthWise Conv_385 1 1 861_splitncnn_0 876 -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution Conv_387 1 1 876 879 -23330=4,3,40,40,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_389 1 1 879 882 -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution Conv_391 1 1 882 885 -23330=4,3,40,40,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01 Convolution Conv_393 1 1 885 886 -23330=4,3,40,40,112 0=112 1=1 5=1 6=10752 Slice Split_394 1 2 886 887 888 -23330=8,3,40,40,80,3,40,40,32 -23300=2,80,-233 Sigmoid Sigmoid_395 1 1 887 889 -23330=4,3,40,40,80 Reshape Reshape_397 1 1 889 891 -23330=4,2,1600,80,1 0=-1 1=80 Permute Transpose_398 1 1 891 cls_pred_stride_8 -23330=4,2,80,1600,1 0=1 Reshape 
Reshape_400 1 1 888 894 -23330=4,2,1600,32,1 0=-1 1=32 Permute Transpose_401 1 1 894 dis_pred_stride_8 -23330=4,2,32,1600,1 0=1 ConvolutionDepthWise Conv_402 1 1 867_splitncnn_0 898 -23330=4,3,20,20,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution Conv_404 1 1 898 901 -23330=4,3,20,20,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_406 1 1 901 904 -23330=4,3,20,20,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution Conv_408 1 1 904 907 -23330=4,3,20,20,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01 Convolution Conv_410 1 1 907 908 -23330=4,3,20,20,112 0=112 1=1 5=1 6=10752 Slice Split_411 1 2 908 909 910 -23330=8,3,20,20,80,3,20,20,32 -23300=2,80,-233 Sigmoid Sigmoid_412 1 1 909 911 -23330=4,3,20,20,80 Reshape Reshape_414 1 1 911 913 -23330=4,2,400,80,1 0=-1 1=80 Permute Transpose_415 1 1 913 cls_pred_stride_16 -23330=4,2,80,400,1 0=1 Reshape Reshape_417 1 1 910 916 -23330=4,2,400,32,1 0=-1 1=32 Permute Transpose_418 1 1 916 dis_pred_stride_16 -23330=4,2,32,400,1 0=1 ConvolutionDepthWise Conv_419 1 1 873 920 -23330=4,3,10,10,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution Conv_421 1 1 920 923 -23330=4,3,10,10,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise Conv_423 1 1 923 926 -23330=4,3,10,10,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution Conv_425 1 1 926 929 -23330=4,3,10,10,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01 Convolution Conv_427 1 1 929 930 -23330=4,3,10,10,112 0=112 1=1 5=1 6=10752 Slice Split_428 1 2 930 931 932 -23330=8,3,10,10,80,3,10,10,32 -23300=2,80,-233 Sigmoid Sigmoid_429 1 1 931 933 -23330=4,3,10,10,80 Reshape Reshape_431 1 1 933 935 -23330=4,2,100,80,1 0=-1 1=80 Permute Transpose_432 1 1 935 cls_pred_stride_32 -23330=4,2,80,100,1 0=1 Reshape Reshape_434 1 1 932 938 -23330=4,2,100,32,1 0=-1 1=32 Permute Transpose_435 1 1 938 dis_pred_stride_32 -23330=4,2,32,100,1 0=1 Noop Output 6 1 cls_pred_stride_8 
cls_pred_stride_16 cls_pred_stride_32 dis_pred_stride_8 dis_pred_stride_16 dis_pred_stride_32 output ================================================ FILE: benchmark/proxylessnasnet.param ================================================ 7767517 91 104 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution first-3x3-conv 1 1 data first-3x3-conv_relu -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1 ConvolutionDepthWise A0_dw 1 1 first-3x3-conv_relu A0_dw_relu -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1 Convolution A0_linear 1 1 A0_dw_relu A0_linear_bn -23330=4,3,112,112,32 0=32 1=1 5=1 6=1024 Convolution B0_expand 1 1 A0_linear_bn B0_expand_relu -23330=4,3,112,112,48 0=48 1=1 5=1 6=1536 9=1 ConvolutionDepthWise B0_dw 1 1 B0_expand_relu B0_dw_relu -23330=4,3,56,56,48 0=48 1=5 3=2 4=2 5=1 6=1200 7=48 9=1 Convolution B0_linear 1 1 B0_dw_relu B0_linear_bn -23330=4,3,56,56,32 0=32 1=1 5=1 6=1536 Split splitncnn_0 1 2 B0_linear_bn B0_linear_bn_splitncnn_0 B0_linear_bn_splitncnn_1 -23330=8,3,56,56,32,3,56,56,32 Convolution B1_expand 1 1 B0_linear_bn_splitncnn_1 B1_expand_relu -23330=4,3,56,56,96 0=96 1=1 5=1 6=3072 9=1 ConvolutionDepthWise B1_dw 1 1 B1_expand_relu B1_dw_relu -23330=4,3,56,56,96 0=96 1=3 4=1 5=1 6=864 7=96 9=1 Convolution B1_linear 1 1 B1_dw_relu B1_linear_bn -23330=4,3,56,56,32 0=32 1=1 5=1 6=3072 BinaryOp unknownncnn_0 2 1 B0_linear_bn_splitncnn_0 B1_linear_bn unknownncnn_0 -23330=4,3,56,56,32 Convolution C0_expand 1 1 unknownncnn_0 C0_expand_relu -23330=4,3,56,56,96 0=96 1=1 5=1 6=3072 9=1 ConvolutionDepthWise C0_dw 1 1 C0_expand_relu C0_dw_relu -23330=4,3,28,28,96 0=96 1=7 3=2 4=3 5=1 6=4704 7=96 9=1 Convolution C0_linear 1 1 C0_dw_relu C0_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=3840 Split splitncnn_1 1 2 C0_linear_bn C0_linear_bn_splitncnn_0 C0_linear_bn_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 Convolution C1_expand 1 1 C0_linear_bn_splitncnn_1 C1_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 
ConvolutionDepthWise C1_dw 1 1 C1_expand_relu C1_dw_relu -23330=4,3,28,28,120 0=120 1=3 4=1 5=1 6=1080 7=120 9=1 Convolution C1_linear 1 1 C1_dw_relu C1_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 BinaryOp unknownncnn_1 2 1 C0_linear_bn_splitncnn_0 C1_linear_bn unknownncnn_1 -23330=4,3,28,28,40 Split splitncnn_2 1 2 unknownncnn_1 unknownncnn_1_splitncnn_0 unknownncnn_1_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 Convolution C2_expand 1 1 unknownncnn_1_splitncnn_1 C2_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 ConvolutionDepthWise C2_dw 1 1 C2_expand_relu C2_dw_relu -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=1 Convolution C2_linear 1 1 C2_dw_relu C2_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 BinaryOp unknownncnn_2 2 1 unknownncnn_1_splitncnn_0 C2_linear_bn unknownncnn_2 -23330=4,3,28,28,40 Split splitncnn_3 1 2 unknownncnn_2 unknownncnn_2_splitncnn_0 unknownncnn_2_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 Convolution C3_expand 1 1 unknownncnn_2_splitncnn_1 C3_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 ConvolutionDepthWise C3_dw 1 1 C3_expand_relu C3_dw_relu -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=1 Convolution C3_linear 1 1 C3_dw_relu C3_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 BinaryOp unknownncnn_3 2 1 unknownncnn_2_splitncnn_0 C3_linear_bn unknownncnn_3 -23330=4,3,28,28,40 Convolution D0_expand 1 1 unknownncnn_3 D0_expand_relu -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600 9=1 ConvolutionDepthWise D0_dw 1 1 D0_expand_relu D0_dw_relu -23330=4,3,14,14,240 0=240 1=7 3=2 4=3 5=1 6=11760 7=240 9=1 Convolution D0_linear 1 1 D0_dw_relu D0_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 Split splitncnn_4 1 2 D0_linear_bn D0_linear_bn_splitncnn_0 D0_linear_bn_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution D1_expand 1 1 D0_linear_bn_splitncnn_1 D1_expand_relu -23330=4,3,14,14,240 0=240 1=1 5=1 6=19200 9=1 ConvolutionDepthWise D1_dw 1 1 D1_expand_relu D1_dw_relu -23330=4,3,14,14,240 0=240 1=5 
4=2 5=1 6=6000 7=240 9=1 Convolution D1_linear 1 1 D1_dw_relu D1_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 BinaryOp unknownncnn_4 2 1 D0_linear_bn_splitncnn_0 D1_linear_bn unknownncnn_4 -23330=4,3,14,14,80 Split splitncnn_5 1 2 unknownncnn_4 unknownncnn_4_splitncnn_0 unknownncnn_4_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution D2_expand 1 1 unknownncnn_4_splitncnn_1 D2_expand_relu -23330=4,3,14,14,240 0=240 1=1 5=1 6=19200 9=1 ConvolutionDepthWise D2_dw 1 1 D2_expand_relu D2_dw_relu -23330=4,3,14,14,240 0=240 1=5 4=2 5=1 6=6000 7=240 9=1 Convolution D2_linear 1 1 D2_dw_relu D2_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 BinaryOp unknownncnn_5 2 1 unknownncnn_4_splitncnn_0 D2_linear_bn unknownncnn_5 -23330=4,3,14,14,80 Split splitncnn_6 1 2 unknownncnn_5 unknownncnn_5_splitncnn_0 unknownncnn_5_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 Convolution D3_expand 1 1 unknownncnn_5_splitncnn_1 D3_expand_relu -23330=4,3,14,14,240 0=240 1=1 5=1 6=19200 9=1 ConvolutionDepthWise D3_dw 1 1 D3_expand_relu D3_dw_relu -23330=4,3,14,14,240 0=240 1=5 4=2 5=1 6=6000 7=240 9=1 Convolution D3_linear 1 1 D3_dw_relu D3_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 BinaryOp unknownncnn_6 2 1 unknownncnn_5_splitncnn_0 D3_linear_bn unknownncnn_6 -23330=4,3,14,14,80 Convolution E0_expand 1 1 unknownncnn_6 E0_expand_relu -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 9=1 ConvolutionDepthWise E0_dw 1 1 E0_expand_relu E0_dw_relu -23330=4,3,14,14,480 0=480 1=5 4=2 5=1 6=12000 7=480 9=1 Convolution E0_linear 1 1 E0_dw_relu E0_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=46080 Split splitncnn_7 1 2 E0_linear_bn E0_linear_bn_splitncnn_0 E0_linear_bn_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96 Convolution E1_expand 1 1 E0_linear_bn_splitncnn_1 E1_expand_relu -23330=4,3,14,14,288 0=288 1=1 5=1 6=27648 9=1 ConvolutionDepthWise E1_dw 1 1 E1_expand_relu E1_dw_relu -23330=4,3,14,14,288 0=288 1=5 4=2 5=1 6=7200 7=288 9=1 Convolution E1_linear 1 1 E1_dw_relu E1_linear_bn 
-23330=4,3,14,14,96 0=96 1=1 5=1 6=27648 BinaryOp unknownncnn_7 2 1 E0_linear_bn_splitncnn_0 E1_linear_bn unknownncnn_7 -23330=4,3,14,14,96 Split splitncnn_8 1 2 unknownncnn_7 unknownncnn_7_splitncnn_0 unknownncnn_7_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96 Convolution E2_expand 1 1 unknownncnn_7_splitncnn_1 E2_expand_relu -23330=4,3,14,14,288 0=288 1=1 5=1 6=27648 9=1 ConvolutionDepthWise E2_dw 1 1 E2_expand_relu E2_dw_relu -23330=4,3,14,14,288 0=288 1=5 4=2 5=1 6=7200 7=288 9=1 Convolution E2_linear 1 1 E2_dw_relu E2_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=27648 BinaryOp unknownncnn_8 2 1 unknownncnn_7_splitncnn_0 E2_linear_bn unknownncnn_8 -23330=4,3,14,14,96 Split splitncnn_9 1 2 unknownncnn_8 unknownncnn_8_splitncnn_0 unknownncnn_8_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96 Convolution E3_expand 1 1 unknownncnn_8_splitncnn_1 E3_expand_relu -23330=4,3,14,14,288 0=288 1=1 5=1 6=27648 9=1 ConvolutionDepthWise E3_dw 1 1 E3_expand_relu E3_dw_relu -23330=4,3,14,14,288 0=288 1=5 4=2 5=1 6=7200 7=288 9=1 Convolution E3_linear 1 1 E3_dw_relu E3_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=27648 BinaryOp unknownncnn_9 2 1 unknownncnn_8_splitncnn_0 E3_linear_bn unknownncnn_9 -23330=4,3,14,14,96 Convolution F0_expand 1 1 unknownncnn_9 F0_expand_relu -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1 ConvolutionDepthWise F0_dw 1 1 F0_expand_relu F0_dw_relu -23330=4,3,7,7,576 0=576 1=7 3=2 4=3 5=1 6=28224 7=576 9=1 Convolution F0_linear 1 1 F0_dw_relu F0_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=110592 Split splitncnn_10 1 2 F0_linear_bn F0_linear_bn_splitncnn_0 F0_linear_bn_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution F1_expand 1 1 F0_linear_bn_splitncnn_1 F1_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1 ConvolutionDepthWise F1_dw 1 1 F1_expand_relu F1_dw_relu -23330=4,3,7,7,1152 0=1152 1=7 4=3 5=1 6=56448 7=1152 9=1 Convolution F1_linear 1 1 F1_dw_relu F1_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp unknownncnn_10 2 1 
F0_linear_bn_splitncnn_0 F1_linear_bn unknownncnn_10 -23330=4,3,7,7,192 Split splitncnn_11 1 2 unknownncnn_10 unknownncnn_10_splitncnn_0 unknownncnn_10_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution F2_expand 1 1 unknownncnn_10_splitncnn_1 F2_expand_relu -23330=4,3,7,7,576 0=576 1=1 5=1 6=110592 9=1 ConvolutionDepthWise F2_dw 1 1 F2_expand_relu F2_dw_relu -23330=4,3,7,7,576 0=576 1=7 4=3 5=1 6=28224 7=576 9=1 Convolution F2_linear 1 1 F2_dw_relu F2_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=110592 BinaryOp unknownncnn_11 2 1 unknownncnn_10_splitncnn_0 F2_linear_bn unknownncnn_11 -23330=4,3,7,7,192 Split splitncnn_12 1 2 unknownncnn_11 unknownncnn_11_splitncnn_0 unknownncnn_11_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 Convolution F3_expand 1 1 unknownncnn_11_splitncnn_1 F3_expand_relu -23330=4,3,7,7,576 0=576 1=1 5=1 6=110592 9=1 ConvolutionDepthWise F3_dw 1 1 F3_expand_relu F3_dw_relu -23330=4,3,7,7,576 0=576 1=7 4=3 5=1 6=28224 7=576 9=1 Convolution F3_linear 1 1 F3_dw_relu F3_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=110592 BinaryOp unknownncnn_12 2 1 unknownncnn_11_splitncnn_0 F3_linear_bn unknownncnn_12 -23330=4,3,7,7,192 Convolution G0_expand 1 1 unknownncnn_12 G0_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1 ConvolutionDepthWise G0_dw 1 1 G0_expand_relu G0_dw_relu -23330=4,3,7,7,1152 0=1152 1=7 4=3 5=1 6=56448 7=1152 9=1 Convolution G0_linear 1 1 G0_dw_relu G0_linear_bn -23330=4,3,7,7,320 0=320 1=1 5=1 6=368640 Convolution last-1x1-conv 1 1 G0_linear_bn last-1x1-conv_relu -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600 9=1 Pooling avgpool 1 1 last-1x1-conv_relu flatten -23330=4,1,1280,1,1 0=1 1=7 4=1 5=1 InnerProduct fc 1 1 flatten fc -23330=4,1,1000,1,1 0=1000 1=1 2=1280000 Softmax prob 1 1 fc output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/regnety_400m.param ================================================ 7767517 185 217 Input input.1 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution 
Conv_0 1 1 data 387 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1 Split splitncnn_0 1 2 387 387_splitncnn_0 387_splitncnn_1 -23330=8,3,112,112,32,3,112,112,32 Convolution Conv_3 1 1 387_splitncnn_1 389 -23330=4,3,56,56,48 0=48 1=1 3=2 5=1 6=1536 Convolution Conv_5 1 1 387_splitncnn_0 392 -23330=4,3,112,112,48 0=48 1=1 5=1 6=1536 9=1 ConvolutionDepthWise Conv_8 1 1 392 395 -23330=4,3,56,56,48 0=48 1=3 3=2 4=1 5=1 6=3456 7=6 9=1 Split splitncnn_1 1 2 395 395_splitncnn_0 395_splitncnn_1 -23330=8,3,56,56,48,3,56,56,48 Pooling GlobalAveragePool_11 1 1 395_splitncnn_1 396 -23330=4,1,48,1,1 0=1 4=1 InnerProduct Conv_12 1 1 396 398 -23330=4,1,8,1,1 0=8 1=1 2=384 9=1 InnerProduct Conv_14 1 1 398 400 -23330=4,1,48,1,1 0=48 1=1 2=384 9=4 BinaryOp Mul_16 2 1 395_splitncnn_0 400 401 -23330=4,3,56,56,48 0=2 Convolution Conv_17 1 1 401 403 -23330=4,3,56,56,48 0=48 1=1 5=1 6=2304 BinaryOp Add_19 2 1 389 403 404 -23330=4,3,56,56,48 ReLU Relu_20 1 1 404 405 -23330=4,3,56,56,48 Split splitncnn_2 1 2 405 405_splitncnn_0 405_splitncnn_1 -23330=8,3,56,56,48,3,56,56,48 Convolution Conv_21 1 1 405_splitncnn_1 407 -23330=4,3,28,28,104 0=104 1=1 3=2 5=1 6=4992 Convolution Conv_23 1 1 405_splitncnn_0 410 -23330=4,3,56,56,104 0=104 1=1 5=1 6=4992 9=1 ConvolutionDepthWise Conv_26 1 1 410 413 -23330=4,3,28,28,104 0=104 1=3 3=2 4=1 5=1 6=7488 7=13 9=1 Split splitncnn_3 1 2 413 413_splitncnn_0 413_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104 Pooling GlobalAveragePool_29 1 1 413_splitncnn_1 414 -23330=4,1,104,1,1 0=1 4=1 InnerProduct Conv_30 1 1 414 416 -23330=4,1,12,1,1 0=12 1=1 2=1248 9=1 InnerProduct Conv_32 1 1 416 418 -23330=4,1,104,1,1 0=104 1=1 2=1248 9=4 BinaryOp Mul_34 2 1 413_splitncnn_0 418 419 -23330=4,3,28,28,104 0=2 Convolution Conv_35 1 1 419 421 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816 BinaryOp Add_37 2 1 407 421 422 -23330=4,3,28,28,104 ReLU Relu_38 1 1 422 423 -23330=4,3,28,28,104 Split splitncnn_4 1 2 423 423_splitncnn_0 423_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104 
Convolution Conv_39 1 1 423_splitncnn_1 426 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816 9=1 ConvolutionDepthWise Conv_42 1 1 426 429 -23330=4,3,28,28,104 0=104 1=3 4=1 5=1 6=7488 7=13 9=1 Split splitncnn_5 1 2 429 429_splitncnn_0 429_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104 Pooling GlobalAveragePool_45 1 1 429_splitncnn_1 430 -23330=4,1,104,1,1 0=1 4=1 InnerProduct Conv_46 1 1 430 432 -23330=4,1,26,1,1 0=26 1=1 2=2704 9=1 InnerProduct Conv_48 1 1 432 434 -23330=4,1,104,1,1 0=104 1=1 2=2704 9=4 BinaryOp Mul_50 2 1 429_splitncnn_0 434 435 -23330=4,3,28,28,104 0=2 Convolution Conv_51 1 1 435 437 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816 BinaryOp Add_53 2 1 423_splitncnn_0 437 438 -23330=4,3,28,28,104 ReLU Relu_54 1 1 438 439 -23330=4,3,28,28,104 Split splitncnn_6 1 2 439 439_splitncnn_0 439_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104 Convolution Conv_55 1 1 439_splitncnn_1 442 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816 9=1 ConvolutionDepthWise Conv_58 1 1 442 445 -23330=4,3,28,28,104 0=104 1=3 4=1 5=1 6=7488 7=13 9=1 Split splitncnn_7 1 2 445 445_splitncnn_0 445_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104 Pooling GlobalAveragePool_61 1 1 445_splitncnn_1 446 -23330=4,1,104,1,1 0=1 4=1 InnerProduct Conv_62 1 1 446 448 -23330=4,1,26,1,1 0=26 1=1 2=2704 9=1 InnerProduct Conv_64 1 1 448 450 -23330=4,1,104,1,1 0=104 1=1 2=2704 9=4 BinaryOp Mul_66 2 1 445_splitncnn_0 450 451 -23330=4,3,28,28,104 0=2 Convolution Conv_67 1 1 451 453 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816 BinaryOp Add_69 2 1 439_splitncnn_0 453 454 -23330=4,3,28,28,104 ReLU Relu_70 1 1 454 455 -23330=4,3,28,28,104 Split splitncnn_8 1 2 455 455_splitncnn_0 455_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104 Convolution Conv_71 1 1 455_splitncnn_1 457 -23330=4,3,14,14,208 0=208 1=1 3=2 5=1 6=21632 Convolution Conv_73 1 1 455_splitncnn_0 460 -23330=4,3,28,28,208 0=208 1=1 5=1 6=21632 9=1 ConvolutionDepthWise Conv_76 1 1 460 463 -23330=4,3,14,14,208 0=208 1=3 3=2 4=1 5=1 6=14976 7=26 9=1 Split splitncnn_9 1 
2 463 463_splitncnn_0 463_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Pooling GlobalAveragePool_79 1 1 463_splitncnn_1 464 -23330=4,1,208,1,1 0=1 4=1 InnerProduct Conv_80 1 1 464 466 -23330=4,1,26,1,1 0=26 1=1 2=5408 9=1 InnerProduct Conv_82 1 1 466 468 -23330=4,1,208,1,1 0=208 1=1 2=5408 9=4 BinaryOp Mul_84 2 1 463_splitncnn_0 468 469 -23330=4,3,14,14,208 0=2 Convolution Conv_85 1 1 469 471 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 BinaryOp Add_87 2 1 457 471 472 -23330=4,3,14,14,208 ReLU Relu_88 1 1 472 473 -23330=4,3,14,14,208 Split splitncnn_10 1 2 473 473_splitncnn_0 473_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Convolution Conv_89 1 1 473_splitncnn_1 476 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1 ConvolutionDepthWise Conv_92 1 1 476 479 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1 Split splitncnn_11 1 2 479 479_splitncnn_0 479_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Pooling GlobalAveragePool_95 1 1 479_splitncnn_1 480 -23330=4,1,208,1,1 0=1 4=1 InnerProduct Conv_96 1 1 480 482 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1 InnerProduct Conv_98 1 1 482 484 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4 BinaryOp Mul_100 2 1 479_splitncnn_0 484 485 -23330=4,3,14,14,208 0=2 Convolution Conv_101 1 1 485 487 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 BinaryOp Add_103 2 1 473_splitncnn_0 487 488 -23330=4,3,14,14,208 ReLU Relu_104 1 1 488 489 -23330=4,3,14,14,208 Split splitncnn_12 1 2 489 489_splitncnn_0 489_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Convolution Conv_105 1 1 489_splitncnn_1 492 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1 ConvolutionDepthWise Conv_108 1 1 492 495 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1 Split splitncnn_13 1 2 495 495_splitncnn_0 495_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Pooling GlobalAveragePool_111 1 1 495_splitncnn_1 496 -23330=4,1,208,1,1 0=1 4=1 InnerProduct Conv_112 1 1 496 498 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1 InnerProduct Conv_114 1 1 498 500 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4 
BinaryOp Mul_116 2 1 495_splitncnn_0 500 501 -23330=4,3,14,14,208 0=2 Convolution Conv_117 1 1 501 503 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 BinaryOp Add_119 2 1 489_splitncnn_0 503 504 -23330=4,3,14,14,208 ReLU Relu_120 1 1 504 505 -23330=4,3,14,14,208 Split splitncnn_14 1 2 505 505_splitncnn_0 505_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Convolution Conv_121 1 1 505_splitncnn_1 508 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1 ConvolutionDepthWise Conv_124 1 1 508 511 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1 Split splitncnn_15 1 2 511 511_splitncnn_0 511_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Pooling GlobalAveragePool_127 1 1 511_splitncnn_1 512 -23330=4,1,208,1,1 0=1 4=1 InnerProduct Conv_128 1 1 512 514 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1 InnerProduct Conv_130 1 1 514 516 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4 BinaryOp Mul_132 2 1 511_splitncnn_0 516 517 -23330=4,3,14,14,208 0=2 Convolution Conv_133 1 1 517 519 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 BinaryOp Add_135 2 1 505_splitncnn_0 519 520 -23330=4,3,14,14,208 ReLU Relu_136 1 1 520 521 -23330=4,3,14,14,208 Split splitncnn_16 1 2 521 521_splitncnn_0 521_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Convolution Conv_137 1 1 521_splitncnn_1 524 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1 ConvolutionDepthWise Conv_140 1 1 524 527 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1 Split splitncnn_17 1 2 527 527_splitncnn_0 527_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Pooling GlobalAveragePool_143 1 1 527_splitncnn_1 528 -23330=4,1,208,1,1 0=1 4=1 InnerProduct Conv_144 1 1 528 530 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1 InnerProduct Conv_146 1 1 530 532 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4 BinaryOp Mul_148 2 1 527_splitncnn_0 532 533 -23330=4,3,14,14,208 0=2 Convolution Conv_149 1 1 533 535 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 BinaryOp Add_151 2 1 521_splitncnn_0 535 536 -23330=4,3,14,14,208 ReLU Relu_152 1 1 536 537 -23330=4,3,14,14,208 Split 
splitncnn_18 1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Convolution Conv_153 1 1 537_splitncnn_1 540 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1 ConvolutionDepthWise Conv_156 1 1 540 543 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1 Split splitncnn_19 1 2 543 543_splitncnn_0 543_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Pooling GlobalAveragePool_159 1 1 543_splitncnn_1 544 -23330=4,1,208,1,1 0=1 4=1 InnerProduct Conv_160 1 1 544 546 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1 InnerProduct Conv_162 1 1 546 548 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4 BinaryOp Mul_164 2 1 543_splitncnn_0 548 549 -23330=4,3,14,14,208 0=2 Convolution Conv_165 1 1 549 551 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 BinaryOp Add_167 2 1 537_splitncnn_0 551 552 -23330=4,3,14,14,208 ReLU Relu_168 1 1 552 553 -23330=4,3,14,14,208 Split splitncnn_20 1 2 553 553_splitncnn_0 553_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208 Convolution Conv_169 1 1 553_splitncnn_1 555 -23330=4,3,7,7,440 0=440 1=1 3=2 5=1 6=91520 Convolution Conv_171 1 1 553_splitncnn_0 558 -23330=4,3,14,14,440 0=440 1=1 5=1 6=91520 9=1 ConvolutionDepthWise Conv_174 1 1 558 561 -23330=4,3,7,7,440 0=440 1=3 3=2 4=1 5=1 6=31680 7=55 9=1 Split splitncnn_21 1 2 561 561_splitncnn_0 561_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Pooling GlobalAveragePool_177 1 1 561_splitncnn_1 562 -23330=4,1,440,1,1 0=1 4=1 InnerProduct Conv_178 1 1 562 564 -23330=4,1,52,1,1 0=52 1=1 2=22880 9=1 InnerProduct Conv_180 1 1 564 566 -23330=4,1,440,1,1 0=440 1=1 2=22880 9=4 BinaryOp Mul_182 2 1 561_splitncnn_0 566 567 -23330=4,3,7,7,440 0=2 Convolution Conv_183 1 1 567 569 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 BinaryOp Add_185 2 1 555 569 570 -23330=4,3,7,7,440 ReLU Relu_186 1 1 570 571 -23330=4,3,7,7,440 Split splitncnn_22 1 2 571 571_splitncnn_0 571_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Convolution Conv_187 1 1 571_splitncnn_1 574 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1 ConvolutionDepthWise Conv_190 1 1 
574 577 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1 Split splitncnn_23 1 2 577 577_splitncnn_0 577_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Pooling GlobalAveragePool_193 1 1 577_splitncnn_1 578 -23330=4,1,440,1,1 0=1 4=1 InnerProduct Conv_194 1 1 578 580 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1 InnerProduct Conv_196 1 1 580 582 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4 BinaryOp Mul_198 2 1 577_splitncnn_0 582 583 -23330=4,3,7,7,440 0=2 Convolution Conv_199 1 1 583 585 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 BinaryOp Add_201 2 1 571_splitncnn_0 585 586 -23330=4,3,7,7,440 ReLU Relu_202 1 1 586 587 -23330=4,3,7,7,440 Split splitncnn_24 1 2 587 587_splitncnn_0 587_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Convolution Conv_203 1 1 587_splitncnn_1 590 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1 ConvolutionDepthWise Conv_206 1 1 590 593 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1 Split splitncnn_25 1 2 593 593_splitncnn_0 593_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Pooling GlobalAveragePool_209 1 1 593_splitncnn_1 594 -23330=4,1,440,1,1 0=1 4=1 InnerProduct Conv_210 1 1 594 596 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1 InnerProduct Conv_212 1 1 596 598 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4 BinaryOp Mul_214 2 1 593_splitncnn_0 598 599 -23330=4,3,7,7,440 0=2 Convolution Conv_215 1 1 599 601 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 BinaryOp Add_217 2 1 587_splitncnn_0 601 602 -23330=4,3,7,7,440 ReLU Relu_218 1 1 602 603 -23330=4,3,7,7,440 Split splitncnn_26 1 2 603 603_splitncnn_0 603_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Convolution Conv_219 1 1 603_splitncnn_1 606 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1 ConvolutionDepthWise Conv_222 1 1 606 609 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1 Split splitncnn_27 1 2 609 609_splitncnn_0 609_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Pooling GlobalAveragePool_225 1 1 609_splitncnn_1 610 -23330=4,1,440,1,1 0=1 4=1 InnerProduct Conv_226 1 1 610 612 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1 
InnerProduct Conv_228 1 1 612 614 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4 BinaryOp Mul_230 2 1 609_splitncnn_0 614 615 -23330=4,3,7,7,440 0=2 Convolution Conv_231 1 1 615 617 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 BinaryOp Add_233 2 1 603_splitncnn_0 617 618 -23330=4,3,7,7,440 ReLU Relu_234 1 1 618 619 -23330=4,3,7,7,440 Split splitncnn_28 1 2 619 619_splitncnn_0 619_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Convolution Conv_235 1 1 619_splitncnn_1 622 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1 ConvolutionDepthWise Conv_238 1 1 622 625 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1 Split splitncnn_29 1 2 625 625_splitncnn_0 625_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Pooling GlobalAveragePool_241 1 1 625_splitncnn_1 626 -23330=4,1,440,1,1 0=1 4=1 InnerProduct Conv_242 1 1 626 628 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1 InnerProduct Conv_244 1 1 628 630 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4 BinaryOp Mul_246 2 1 625_splitncnn_0 630 631 -23330=4,3,7,7,440 0=2 Convolution Conv_247 1 1 631 633 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 BinaryOp Add_249 2 1 619_splitncnn_0 633 634 -23330=4,3,7,7,440 ReLU Relu_250 1 1 634 635 -23330=4,3,7,7,440 Split splitncnn_30 1 2 635 635_splitncnn_0 635_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Convolution Conv_251 1 1 635_splitncnn_1 638 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1 ConvolutionDepthWise Conv_254 1 1 638 641 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1 Split splitncnn_31 1 2 641 641_splitncnn_0 641_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440 Pooling GlobalAveragePool_257 1 1 641_splitncnn_1 642 -23330=4,1,440,1,1 0=1 4=1 InnerProduct Conv_258 1 1 642 644 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1 InnerProduct Conv_260 1 1 644 646 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4 BinaryOp Mul_262 2 1 641_splitncnn_0 646 647 -23330=4,3,7,7,440 0=2 Convolution Conv_263 1 1 647 649 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 BinaryOp Add_265 2 1 635_splitncnn_0 649 650 -23330=4,3,7,7,440 ReLU Relu_266 1 1 650 651 
-23330=4,3,7,7,440 Pooling GlobalAveragePool_267 1 1 651 660 -23330=4,1,440,1,1 0=1 4=1 InnerProduct Gemm_274 1 1 660 661 -23330=4,1,1000,1,1 0=1000 1=1 2=440000 Softmax prob 1 1 661 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/resnet18.param ================================================ 7767517 50 58 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution conv1 1 1 data conv1_conv1_relu -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 9=1 Pooling pool1 1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,64 1=3 2=2 Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64 Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1 Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_scale2a_branch2b -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a -23330=4,3,56,56,64 0=1 ReLU res2a_relu 1 1 res2a res2a_res2a_relu -23330=4,3,56,56,64 Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64 Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1 Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_scale2b_branch2b -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b -23330=4,3,56,56,64 0=1 ReLU res2b_relu 1 1 res2b res2b_res2b_relu -23330=4,3,56,56,64 Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64 Convolution res3a_branch1 1 1 
res2b_res2b_relu_splitncnn_1 res3a_branch1_scale3a_branch1 -23330=4,3,28,28,128 0=128 1=1 3=2 5=1 6=8192 Convolution res3a_branch2a 1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu -23330=4,3,28,28,128 0=128 1=3 3=2 4=1 5=1 6=73728 9=1 Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_scale3a_branch2b -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a -23330=4,3,28,28,128 0=1 ReLU res3a_relu 1 1 res3a res3a_res3a_relu -23330=4,3,28,28,128 Split splitncnn_3 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -23330=8,3,28,28,128,3,28,28,128 Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1 Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_scale3b_branch2b -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b -23330=4,3,28,28,128 0=1 ReLU res3b_relu 1 1 res3b res3b_res3b_relu -23330=4,3,28,28,128 Split splitncnn_4 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -23330=8,3,28,28,128,3,28,28,128 Convolution res4a_branch1 1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1_scale4a_branch1 -23330=4,3,14,14,256 0=256 1=1 3=2 5=1 6=32768 Convolution res4a_branch2a 1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu -23330=4,3,14,14,256 0=256 1=3 3=2 4=1 5=1 6=294912 9=1 Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_scale4a_branch2b -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a -23330=4,3,14,14,256 0=1 ReLU res4a_relu 1 1 res4a res4a_res4a_relu -23330=4,3,14,14,256 Split splitncnn_5 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 
res4a_res4a_relu_splitncnn_1 -23330=8,3,14,14,256,3,14,14,256 Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1 Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_scale4b_branch2b -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b -23330=4,3,14,14,256 0=1 ReLU res4b_relu 1 1 res4b res4b_res4b_relu -23330=4,3,14,14,256 Split splitncnn_6 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -23330=8,3,14,14,256,3,14,14,256 Convolution res5a_branch1 1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1_scale5a_branch1 -23330=4,3,7,7,512 0=512 1=1 3=2 5=1 6=131072 Convolution res5a_branch2a 1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu -23330=4,3,7,7,512 0=512 1=3 3=2 4=1 5=1 6=1179648 9=1 Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_scale5a_branch2b -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a -23330=4,3,7,7,512 0=1 ReLU res5a_relu 1 1 res5a res5a_res5a_relu -23330=4,3,7,7,512 Split splitncnn_7 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -23330=8,3,7,7,512,3,7,7,512 Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 9=1 Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_scale5b_branch2b -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b -23330=4,3,7,7,512 0=1 ReLU res5b_relu 1 1 res5b res5b_res5b_relu -23330=4,3,7,7,512 Pooling pool5 1 1 res5b_res5b_relu pool5 -23330=4,3,1,1,512 0=1 1=7 InnerProduct fc1000 1 1 pool5 fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=512000 Softmax 
prob 1 1 fc1000 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/resnet18_int8.param ================================================ 7767517 50 58 Input data 0 1 data 0=224 1=224 2=3 Convolution conv1 1 1 data conv1_conv1_relu 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1 Pooling pool1 1 1 conv1_conv1_relu pool1 1=3 2=2 Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 0=64 1=1 5=1 6=4096 8=2 Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_scale2a_branch2b 0=64 1=3 4=1 5=1 6=36864 8=2 Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a 0=1 ReLU res2a_relu 1 1 res2a res2a_res2a_relu Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_scale2b_branch2b 0=64 1=3 4=1 5=1 6=36864 8=2 Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b 0=1 ReLU res2b_relu 1 1 res2b res2b_res2b_relu Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 Convolution res3a_branch1 1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1_scale3a_branch1 0=128 1=1 3=2 5=1 6=8192 8=2 Convolution res3a_branch2a 1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=102 9=1 Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_scale3a_branch2b 0=128 1=3 4=1 5=1 6=147456 8=2 Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a 0=1 ReLU res3a_relu 1 1 res3a res3a_res3a_relu 
Split splitncnn_3 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_scale3b_branch2b 0=128 1=3 4=1 5=1 6=147456 8=2 Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b 0=1 ReLU res3b_relu 1 1 res3b res3b_res3b_relu Split splitncnn_4 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 Convolution res4a_branch1 1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1_scale4a_branch1 0=256 1=1 3=2 5=1 6=32768 8=2 Convolution res4a_branch2a 1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=102 9=1 Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_scale4a_branch2b 0=256 1=3 4=1 5=1 6=589824 8=2 Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a 0=1 ReLU res4a_relu 1 1 res4a res4a_res4a_relu Split splitncnn_5 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_scale4b_branch2b 0=256 1=3 4=1 5=1 6=589824 8=2 Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b 0=1 ReLU res4b_relu 1 1 res4b res4b_res4b_relu Split splitncnn_6 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 Convolution res5a_branch1 1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1_scale5a_branch1 0=512 1=1 3=2 5=1 6=131072 8=2 Convolution res5a_branch2a 1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu 0=512 1=3 3=2 4=1 5=1 6=1179648 8=102 9=1 Convolution res5a_branch2b 1 1 
res5a_branch2a_res5a_branch2a_relu res5a_branch2b_scale5a_branch2b 0=512 1=3 4=1 5=1 6=2359296 8=2 Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a 0=1 ReLU res5a_relu 1 1 res5a res5a_res5a_relu Split splitncnn_7 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_scale5b_branch2b 0=512 1=3 4=1 5=1 6=2359296 8=2 Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b 0=1 ReLU res5b_relu 1 1 res5b res5b_res5b_relu Pooling pool5 1 1 res5b_res5b_relu pool5 0=1 1=7 InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=512000 Softmax prob 1 1 fc1000 output ================================================ FILE: benchmark/resnet50.param ================================================ 7767517 106 122 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution conv1 1 1 data conv1_conv1_relu -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 9=1 Pooling pool1 1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,64 1=3 2=2 Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64 Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 9=1 Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_res2a_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1 Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c_scale2a_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a -23330=4,3,56,56,256 0=1 ReLU res2a_relu 1 1 
res2a res2a_res2a_relu -23330=4,3,56,56,256 Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256 Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=16384 9=1 Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_res2b_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1 Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c_scale2b_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b -23330=4,3,56,56,256 0=1 ReLU res2b_relu 1 1 res2b res2b_res2b_relu -23330=4,3,56,56,256 Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256 Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a_res2c_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=16384 9=1 Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b_res2c_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1 Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c_scale2c_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 Eltwise res2c 2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c -23330=4,3,56,56,256 0=1 ReLU res2c_relu 1 1 res2c res2c_res2c_relu -23330=4,3,56,56,256 Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256 Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1_scale3a_branch1 -23330=4,3,28,28,512 0=512 1=1 3=2 5=1 6=131072 Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 3=2 5=1 6=32768 9=1 Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu 
res3a_branch2b_res3a_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1 Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c_scale3a_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a -23330=4,3,28,28,512 0=1 ReLU res3a_relu 1 1 res3a res3a_res3a_relu -23330=4,3,28,28,512 Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512 Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 9=1 Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_res3b_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1 Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c_scale3b_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b -23330=4,3,28,28,512 0=1 ReLU res3b_relu 1 1 res3b res3b_res3b_relu -23330=4,3,28,28,512 Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512 Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a_res3c_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 9=1 Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b_res3c_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1 Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c_scale3c_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 Eltwise res3c 2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c -23330=4,3,28,28,512 0=1 ReLU res3c_relu 1 1 res3c res3c_res3c_relu -23330=4,3,28,28,512 Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1 
-23330=8,3,28,28,512,3,28,28,512 Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a_res3d_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 9=1 Convolution res3d_branch2b 1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b_res3d_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1 Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c_scale3d_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d -23330=4,3,28,28,512 0=1 ReLU res3d_relu 1 1 res3d res3d_res3d_relu -23330=4,3,28,28,512 Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512 Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1_scale4a_branch1 -23330=4,3,14,14,1024 0=1024 1=1 3=2 5=1 6=524288 Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 3=2 5=1 6=131072 9=1 Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_res4a_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1 Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c_scale4a_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a -23330=4,3,14,14,1024 0=1 ReLU res4a_relu 1 1 res4a res4a_res4a_relu -23330=4,3,14,14,1024 Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1 Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_res4b_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1 Convolution res4b_branch2c 1 1 
res4b_branch2b_res4b_branch2b_relu res4b_branch2c_scale4b_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b -23330=4,3,14,14,1024 0=1 ReLU res4b_relu 1 1 res4b res4b_res4b_relu -23330=4,3,14,14,1024 Split splitncnn_9 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a_res4c_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1 Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b_res4c_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1 Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c_scale4c_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c -23330=4,3,14,14,1024 0=1 ReLU res4c_relu 1 1 res4c res4c_res4c_relu -23330=4,3,14,14,1024 Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a_res4d_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1 Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b_res4d_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1 Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c_scale4d_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d -23330=4,3,14,14,1024 0=1 ReLU res4d_relu 1 1 res4d res4d_res4d_relu -23330=4,3,14,14,1024 Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 
res4e_branch2a_res4e_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1 Convolution res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b_res4e_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1 Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c_scale4e_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e -23330=4,3,14,14,1024 0=1 ReLU res4e_relu 1 1 res4e res4e_res4e_relu -23330=4,3,14,14,1024 Split splitncnn_12 1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 Convolution res4f_branch2a 1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a_res4f_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1 Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b_res4f_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1 Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c_scale4f_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f -23330=4,3,14,14,1024 0=1 ReLU res4f_relu 1 1 res4f res4f_res4f_relu -23330=4,3,14,14,1024 Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1_scale5a_branch1 -23330=4,3,7,7,2048 0=2048 1=1 3=2 5=1 6=2097152 Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 3=2 5=1 6=524288 9=1 Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_res5a_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 9=1 Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c_scale5a_branch2c -23330=4,3,7,7,2048 0=2048 
1=1 5=1 6=1048576 Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a -23330=4,3,7,7,2048 0=1 ReLU res5a_relu 1 1 res5a res5a_res5a_relu -23330=4,3,7,7,2048 Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -23330=8,3,7,7,2048,3,7,7,2048 Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 5=1 6=1048576 9=1 Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_res5b_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 9=1 Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c_scale5b_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576 Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b -23330=4,3,7,7,2048 0=1 ReLU res5b_relu 1 1 res5b res5b_res5b_relu -23330=4,3,7,7,2048 Split splitncnn_15 1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 -23330=8,3,7,7,2048,3,7,7,2048 Convolution res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a_res5c_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 5=1 6=1048576 9=1 Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b_res5c_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 9=1 Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c_scale5c_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576 Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c -23330=4,3,7,7,2048 0=1 ReLU res5c_relu 1 1 res5c res5c_res5c_relu -23330=4,3,7,7,2048 Pooling pool5 1 1 res5c_res5c_relu pool5 -23330=4,3,1,1,2048 0=1 1=7 InnerProduct fc1000 1 1 pool5 fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=2048000 Softmax prob 1 1 fc1000 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/resnet50_int8.param ================================================ 
7767517 106 122 Input data 0 1 data 0=224 1=224 2=3 Convolution conv1 1 1 data conv1_conv1_relu 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1 Pooling pool1 1 1 conv1_conv1_relu pool1 1=3 2=2 Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 0=256 1=1 5=1 6=16384 8=2 Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu 0=64 1=1 5=1 6=4096 8=102 9=1 Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_res2a_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c_scale2a_branch2c 0=256 1=1 5=1 6=16384 8=2 Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 ReLU res2a_relu 1 1 res2a res2a_res2a_relu Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu 0=64 1=1 5=1 6=16384 8=102 9=1 Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_res2b_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c_scale2b_branch2c 0=256 1=1 5=1 6=16384 8=2 Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 ReLU res2b_relu 1 1 res2b res2b_res2b_relu Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a_res2c_branch2a_relu 0=64 1=1 5=1 6=16384 8=102 9=1 Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b_res2c_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c_scale2c_branch2c 0=256 1=1 5=1 6=16384 8=2 Eltwise res2c 2 1 
res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 ReLU res2c_relu 1 1 res2c res2c_res2c_relu Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1 Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1_scale3a_branch1 0=512 1=1 3=2 5=1 6=131072 8=2 Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu 0=128 1=1 3=2 5=1 6=32768 8=102 9=1 Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_res3a_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c_scale3a_branch2c 0=512 1=1 5=1 6=65536 8=2 Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 ReLU res3a_relu 1 1 res3a res3a_res3a_relu Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1 Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_res3b_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c_scale3b_branch2c 0=512 1=1 5=1 6=65536 8=2 Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 ReLU res3b_relu 1 1 res3b res3b_res3b_relu Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a_res3c_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1 Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b_res3c_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c_scale3c_branch2c 0=512 1=1 5=1 6=65536 8=2 Eltwise res3c 2 1 
res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 ReLU res3c_relu 1 1 res3c res3c_res3c_relu Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1 Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a_res3d_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1 Convolution res3d_branch2b 1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b_res3d_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c_scale3d_branch2c 0=512 1=1 5=1 6=65536 8=2 Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 ReLU res3d_relu 1 1 res3d res3d_res3d_relu Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1_scale4a_branch1 0=1024 1=1 3=2 5=1 6=524288 8=2 Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu 0=256 1=1 3=2 5=1 6=131072 8=102 9=1 Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_res4a_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c_scale4a_branch2c 0=1024 1=1 5=1 6=262144 8=2 Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 ReLU res4a_relu 1 1 res4a res4a_res4a_relu Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_res4b_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 Convolution res4b_branch2c 1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c_scale4b_branch2c 0=1024 1=1 5=1 6=262144 8=2 Eltwise res4b 2 1 
res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 ReLU res4b_relu 1 1 res4b res4b_res4b_relu Split splitncnn_9 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a_res4c_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b_res4c_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c_scale4c_branch2c 0=1024 1=1 5=1 6=262144 8=2 Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 ReLU res4c_relu 1 1 res4c res4c_res4c_relu Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a_res4d_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b_res4d_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c_scale4d_branch2c 0=1024 1=1 5=1 6=262144 8=2 Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 ReLU res4d_relu 1 1 res4d res4d_res4d_relu Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a_res4e_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 Convolution res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b_res4e_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c_scale4e_branch2c 0=1024 1=1 5=1 6=262144 8=2 Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 ReLU res4e_relu 1 1 res4e res4e_res4e_relu Split splitncnn_12 1 
2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 Convolution res4f_branch2a 1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a_res4f_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b_res4f_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c_scale4f_branch2c 0=1024 1=1 5=1 6=262144 8=2 Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 ReLU res4f_relu 1 1 res4f res4f_res4f_relu Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1_scale5a_branch1 0=2048 1=1 3=2 5=1 6=2097152 8=2 Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu 0=512 1=1 3=2 5=1 6=524288 8=102 9=1 Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_res5a_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c_scale5a_branch2c 0=2048 1=1 5=1 6=1048576 8=2 Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 ReLU res5a_relu 1 1 res5a res5a_res5a_relu Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu 0=512 1=1 5=1 6=1048576 8=102 9=1 Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_res5b_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c_scale5b_branch2c 0=2048 1=1 5=1 6=1048576 8=2 Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 ReLU res5b_relu 1 1 res5b res5b_res5b_relu Split splitncnn_15 1 2 
res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 Convolution res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a_res5c_branch2a_relu 0=512 1=1 5=1 6=1048576 8=102 9=1 Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b_res5c_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c_scale5c_branch2c 0=2048 1=1 5=1 6=1048576 8=2 Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 ReLU res5c_relu 1 1 res5c res5c_res5c_relu Pooling pool5 1 1 res5c_res5c_relu pool5 0=1 1=7 InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=2048000 Softmax prob 1 1 fc1000 output ================================================ FILE: benchmark/shufflenet.param ================================================ 7767517 120 136 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution conv1 1 1 data conv1_conv1_relu -23330=4,3,112,112,24 0=24 1=3 3=2 4=1 5=1 6=648 9=1 Pooling pool1 1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,24 1=3 2=2 Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 Pooling resx1_match_conv 1 1 pool1_splitncnn_1 resx1_match_conv -23330=4,3,28,28,24 0=1 1=3 2=2 Convolution resx1_conv1 1 1 pool1_splitncnn_0 resx1_conv1_resx1_conv1_relu -23330=4,3,56,56,54 0=54 1=1 5=1 6=1296 9=1 ConvolutionDepthWise resx1_conv2 1 1 resx1_conv1_resx1_conv1_relu resx1_conv2_resx1_conv2_scale -23330=4,3,28,28,54 0=54 1=3 3=2 4=1 5=1 6=486 7=54 ConvolutionDepthWise resx1_conv3 1 1 resx1_conv2_resx1_conv2_scale resx1_conv3_resx1_conv3_scale -23330=4,3,28,28,216 0=216 1=1 5=1 6=3888 7=3 Concat resx1_concat 2 1 resx1_match_conv resx1_conv3_resx1_conv3_scale resx1_concat -23330=4,3,28,28,240 ReLU resx1_concat_relu 1 1 resx1_concat resx1_concat_resx1_concat_relu -23330=4,3,28,28,240 Split splitncnn_1 1 2 resx1_concat_resx1_concat_relu 
resx1_concat_resx1_concat_relu_splitncnn_0 resx1_concat_resx1_concat_relu_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240 ConvolutionDepthWise resx2_conv1 1 1 resx1_concat_resx1_concat_relu_splitncnn_1 resx2_conv1_resx2_conv1_relu -23330=4,3,28,28,60 0=60 1=1 5=1 6=4800 7=3 9=1 ShuffleChannel shuffle2 1 1 resx2_conv1_resx2_conv1_relu shuffle2 -23330=4,3,28,28,60 0=3 ConvolutionDepthWise resx2_conv2 1 1 shuffle2 resx2_conv2_resx2_conv2_scale -23330=4,3,28,28,60 0=60 1=3 4=1 5=1 6=540 7=60 ConvolutionDepthWise resx2_conv3 1 1 resx2_conv2_resx2_conv2_scale resx2_conv3_resx2_conv3_scale -23330=4,3,28,28,240 0=240 1=1 5=1 6=4800 7=3 Eltwise resx2_elewise 2 1 resx1_concat_resx1_concat_relu_splitncnn_0 resx2_conv3_resx2_conv3_scale resx2_elewise -23330=4,3,28,28,240 0=1 ReLU resx2_elewise_relu 1 1 resx2_elewise resx2_elewise_resx2_elewise_relu -23330=4,3,28,28,240 Split splitncnn_2 1 2 resx2_elewise_resx2_elewise_relu resx2_elewise_resx2_elewise_relu_splitncnn_0 resx2_elewise_resx2_elewise_relu_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240 ConvolutionDepthWise resx3_conv1 1 1 resx2_elewise_resx2_elewise_relu_splitncnn_1 resx3_conv1_resx3_conv1_relu -23330=4,3,28,28,60 0=60 1=1 5=1 6=4800 7=3 9=1 ShuffleChannel shuffle3 1 1 resx3_conv1_resx3_conv1_relu shuffle3 -23330=4,3,28,28,60 0=3 ConvolutionDepthWise resx3_conv2 1 1 shuffle3 resx3_conv2_resx3_conv2_scale -23330=4,3,28,28,60 0=60 1=3 4=1 5=1 6=540 7=60 ConvolutionDepthWise resx3_conv3 1 1 resx3_conv2_resx3_conv2_scale resx3_conv3_resx3_conv3_scale -23330=4,3,28,28,240 0=240 1=1 5=1 6=4800 7=3 Eltwise resx3_elewise 2 1 resx2_elewise_resx2_elewise_relu_splitncnn_0 resx3_conv3_resx3_conv3_scale resx3_elewise -23330=4,3,28,28,240 0=1 ReLU resx3_elewise_relu 1 1 resx3_elewise resx3_elewise_resx3_elewise_relu -23330=4,3,28,28,240 Split splitncnn_3 1 2 resx3_elewise_resx3_elewise_relu resx3_elewise_resx3_elewise_relu_splitncnn_0 resx3_elewise_resx3_elewise_relu_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240 ConvolutionDepthWise 
resx4_conv1 1 1 resx3_elewise_resx3_elewise_relu_splitncnn_1 resx4_conv1_resx4_conv1_relu -23330=4,3,28,28,60 0=60 1=1 5=1 6=4800 7=3 9=1 ShuffleChannel shuffle4 1 1 resx4_conv1_resx4_conv1_relu shuffle4 -23330=4,3,28,28,60 0=3 ConvolutionDepthWise resx4_conv2 1 1 shuffle4 resx4_conv2_resx4_conv2_scale -23330=4,3,28,28,60 0=60 1=3 4=1 5=1 6=540 7=60 ConvolutionDepthWise resx4_conv3 1 1 resx4_conv2_resx4_conv2_scale resx4_conv3_resx4_conv3_scale -23330=4,3,28,28,240 0=240 1=1 5=1 6=4800 7=3 Eltwise resx4_elewise 2 1 resx3_elewise_resx3_elewise_relu_splitncnn_0 resx4_conv3_resx4_conv3_scale resx4_elewise -23330=4,3,28,28,240 0=1 ReLU resx4_elewise_relu 1 1 resx4_elewise resx4_elewise_resx4_elewise_relu -23330=4,3,28,28,240 Split splitncnn_4 1 2 resx4_elewise_resx4_elewise_relu resx4_elewise_resx4_elewise_relu_splitncnn_0 resx4_elewise_resx4_elewise_relu_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240 Pooling resx5_match_conv 1 1 resx4_elewise_resx4_elewise_relu_splitncnn_1 resx5_match_conv -23330=4,3,14,14,240 0=1 1=3 2=2 ConvolutionDepthWise resx5_conv1 1 1 resx4_elewise_resx4_elewise_relu_splitncnn_0 resx5_conv1_resx5_conv1_relu -23330=4,3,28,28,60 0=60 1=1 5=1 6=4800 7=3 9=1 ShuffleChannel shuffle5 1 1 resx5_conv1_resx5_conv1_relu shuffle5 -23330=4,3,28,28,60 0=3 ConvolutionDepthWise resx5_conv2 1 1 shuffle5 resx5_conv2_resx5_conv2_scale -23330=4,3,14,14,60 0=60 1=3 3=2 4=1 5=1 6=540 7=60 ConvolutionDepthWise resx5_conv3 1 1 resx5_conv2_resx5_conv2_scale resx5_conv3_resx5_conv3_scale -23330=4,3,14,14,240 0=240 1=1 5=1 6=4800 7=3 Concat resx5_concat 2 1 resx5_match_conv resx5_conv3_resx5_conv3_scale resx5_concat -23330=4,3,14,14,480 ReLU resx5_concat_relu 1 1 resx5_concat resx5_concat_resx5_concat_relu -23330=4,3,14,14,480 Split splitncnn_5 1 2 resx5_concat_resx5_concat_relu resx5_concat_resx5_concat_relu_splitncnn_0 resx5_concat_resx5_concat_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 ConvolutionDepthWise resx6_conv1 1 1 
resx5_concat_resx5_concat_relu_splitncnn_1 resx6_conv1_resx6_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1 ShuffleChannel shuffle6 1 1 resx6_conv1_resx6_conv1_relu shuffle6 -23330=4,3,14,14,120 0=3 ConvolutionDepthWise resx6_conv2 1 1 shuffle6 resx6_conv2_resx6_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120 ConvolutionDepthWise resx6_conv3 1 1 resx6_conv2_resx6_conv2_scale resx6_conv3_resx6_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3 Eltwise resx6_elewise 2 1 resx5_concat_resx5_concat_relu_splitncnn_0 resx6_conv3_resx6_conv3_scale resx6_elewise -23330=4,3,14,14,480 0=1 ReLU resx6_elewise_relu 1 1 resx6_elewise resx6_elewise_resx6_elewise_relu -23330=4,3,14,14,480 Split splitncnn_6 1 2 resx6_elewise_resx6_elewise_relu resx6_elewise_resx6_elewise_relu_splitncnn_0 resx6_elewise_resx6_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 ConvolutionDepthWise resx7_conv1 1 1 resx6_elewise_resx6_elewise_relu_splitncnn_1 resx7_conv1_resx7_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1 ShuffleChannel shuffle7 1 1 resx7_conv1_resx7_conv1_relu shuffle7 -23330=4,3,14,14,120 0=3 ConvolutionDepthWise resx7_conv2 1 1 shuffle7 resx7_conv2_resx7_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120 ConvolutionDepthWise resx7_conv3 1 1 resx7_conv2_resx7_conv2_scale resx7_conv3_resx7_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3 Eltwise resx7_elewise 2 1 resx6_elewise_resx6_elewise_relu_splitncnn_0 resx7_conv3_resx7_conv3_scale resx7_elewise -23330=4,3,14,14,480 0=1 ReLU resx7_elewise_relu 1 1 resx7_elewise resx7_elewise_resx7_elewise_relu -23330=4,3,14,14,480 Split splitncnn_7 1 2 resx7_elewise_resx7_elewise_relu resx7_elewise_resx7_elewise_relu_splitncnn_0 resx7_elewise_resx7_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 ConvolutionDepthWise resx8_conv1 1 1 resx7_elewise_resx7_elewise_relu_splitncnn_1 resx8_conv1_resx8_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1 
ShuffleChannel shuffle8 1 1 resx8_conv1_resx8_conv1_relu shuffle8 -23330=4,3,14,14,120 0=3 ConvolutionDepthWise resx8_conv2 1 1 shuffle8 resx8_conv2_resx8_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120 ConvolutionDepthWise resx8_conv3 1 1 resx8_conv2_resx8_conv2_scale resx8_conv3_resx8_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3 Eltwise resx8_elewise 2 1 resx7_elewise_resx7_elewise_relu_splitncnn_0 resx8_conv3_resx8_conv3_scale resx8_elewise -23330=4,3,14,14,480 0=1 ReLU resx8_elewise_relu 1 1 resx8_elewise resx8_elewise_resx8_elewise_relu -23330=4,3,14,14,480 Split splitncnn_8 1 2 resx8_elewise_resx8_elewise_relu resx8_elewise_resx8_elewise_relu_splitncnn_0 resx8_elewise_resx8_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 ConvolutionDepthWise resx9_conv1 1 1 resx8_elewise_resx8_elewise_relu_splitncnn_1 resx9_conv1_resx9_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1 ShuffleChannel shuffle9 1 1 resx9_conv1_resx9_conv1_relu shuffle9 -23330=4,3,14,14,120 0=3 ConvolutionDepthWise resx9_conv2 1 1 shuffle9 resx9_conv2_resx9_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120 ConvolutionDepthWise resx9_conv3 1 1 resx9_conv2_resx9_conv2_scale resx9_conv3_resx9_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3 Eltwise resx9_elewise 2 1 resx8_elewise_resx8_elewise_relu_splitncnn_0 resx9_conv3_resx9_conv3_scale resx9_elewise -23330=4,3,14,14,480 0=1 ReLU resx9_elewise_relu 1 1 resx9_elewise resx9_elewise_resx9_elewise_relu -23330=4,3,14,14,480 Split splitncnn_9 1 2 resx9_elewise_resx9_elewise_relu resx9_elewise_resx9_elewise_relu_splitncnn_0 resx9_elewise_resx9_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 ConvolutionDepthWise resx10_conv1 1 1 resx9_elewise_resx9_elewise_relu_splitncnn_1 resx10_conv1_resx10_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1 ShuffleChannel shuffle10 1 1 resx10_conv1_resx10_conv1_relu shuffle10 -23330=4,3,14,14,120 0=3 ConvolutionDepthWise 
resx10_conv2 1 1 shuffle10 resx10_conv2_resx10_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120 ConvolutionDepthWise resx10_conv3 1 1 resx10_conv2_resx10_conv2_scale resx10_conv3_resx10_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3 Eltwise resx10_elewise 2 1 resx9_elewise_resx9_elewise_relu_splitncnn_0 resx10_conv3_resx10_conv3_scale resx10_elewise -23330=4,3,14,14,480 0=1 ReLU resx10_elewise_relu 1 1 resx10_elewise resx10_elewise_resx10_elewise_relu -23330=4,3,14,14,480 Split splitncnn_10 1 2 resx10_elewise_resx10_elewise_relu resx10_elewise_resx10_elewise_relu_splitncnn_0 resx10_elewise_resx10_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 ConvolutionDepthWise resx11_conv1 1 1 resx10_elewise_resx10_elewise_relu_splitncnn_1 resx11_conv1_resx11_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1 ShuffleChannel shuffle11 1 1 resx11_conv1_resx11_conv1_relu shuffle11 -23330=4,3,14,14,120 0=3 ConvolutionDepthWise resx11_conv2 1 1 shuffle11 resx11_conv2_resx11_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120 ConvolutionDepthWise resx11_conv3 1 1 resx11_conv2_resx11_conv2_scale resx11_conv3_resx11_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3 Eltwise resx11_elewise 2 1 resx10_elewise_resx10_elewise_relu_splitncnn_0 resx11_conv3_resx11_conv3_scale resx11_elewise -23330=4,3,14,14,480 0=1 ReLU resx11_elewise_relu 1 1 resx11_elewise resx11_elewise_resx11_elewise_relu -23330=4,3,14,14,480 Split splitncnn_11 1 2 resx11_elewise_resx11_elewise_relu resx11_elewise_resx11_elewise_relu_splitncnn_0 resx11_elewise_resx11_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 ConvolutionDepthWise resx12_conv1 1 1 resx11_elewise_resx11_elewise_relu_splitncnn_1 resx12_conv1_resx12_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1 ShuffleChannel shuffle12 1 1 resx12_conv1_resx12_conv1_relu shuffle12 -23330=4,3,14,14,120 0=3 ConvolutionDepthWise resx12_conv2 1 1 shuffle12 
resx12_conv2_resx12_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120 ConvolutionDepthWise resx12_conv3 1 1 resx12_conv2_resx12_conv2_scale resx12_conv3_resx12_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3 Eltwise resx12_elewise 2 1 resx11_elewise_resx11_elewise_relu_splitncnn_0 resx12_conv3_resx12_conv3_scale resx12_elewise -23330=4,3,14,14,480 0=1 ReLU resx12_elewise_relu 1 1 resx12_elewise resx12_elewise_resx12_elewise_relu -23330=4,3,14,14,480 Split splitncnn_12 1 2 resx12_elewise_resx12_elewise_relu resx12_elewise_resx12_elewise_relu_splitncnn_0 resx12_elewise_resx12_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 Pooling resx13_match_conv 1 1 resx12_elewise_resx12_elewise_relu_splitncnn_1 resx13_match_conv -23330=4,3,7,7,480 0=1 1=3 2=2 ConvolutionDepthWise resx13_conv1 1 1 resx12_elewise_resx12_elewise_relu_splitncnn_0 resx13_conv1_resx13_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1 ShuffleChannel shuffle13 1 1 resx13_conv1_resx13_conv1_relu shuffle13 -23330=4,3,14,14,120 0=3 ConvolutionDepthWise resx13_conv2 1 1 shuffle13 resx13_conv2_resx13_conv2_scale -23330=4,3,7,7,120 0=120 1=3 3=2 4=1 5=1 6=1080 7=120 ConvolutionDepthWise resx13_conv3 1 1 resx13_conv2_resx13_conv2_scale resx13_conv3_resx13_conv3_scale -23330=4,3,7,7,480 0=480 1=1 5=1 6=19200 7=3 Concat resx13_concat 2 1 resx13_match_conv resx13_conv3_resx13_conv3_scale resx13_concat -23330=4,3,7,7,960 ReLU resx13_concat_relu 1 1 resx13_concat resx13_concat_resx13_concat_relu -23330=4,3,7,7,960 Split splitncnn_13 1 2 resx13_concat_resx13_concat_relu resx13_concat_resx13_concat_relu_splitncnn_0 resx13_concat_resx13_concat_relu_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960 ConvolutionDepthWise resx14_conv1 1 1 resx13_concat_resx13_concat_relu_splitncnn_1 resx14_conv1_resx14_conv1_relu -23330=4,3,7,7,240 0=240 1=1 5=1 6=76800 7=3 9=1 ShuffleChannel shuffle14 1 1 resx14_conv1_resx14_conv1_relu shuffle14 -23330=4,3,7,7,240 0=3 ConvolutionDepthWise resx14_conv2 
1 1 shuffle14 resx14_conv2_resx14_conv2_scale -23330=4,3,7,7,240 0=240 1=3 4=1 5=1 6=2160 7=240 ConvolutionDepthWise resx14_conv3 1 1 resx14_conv2_resx14_conv2_scale resx14_conv3_resx14_conv3_scale -23330=4,3,7,7,960 0=960 1=1 5=1 6=76800 7=3 Eltwise resx14_elewise 2 1 resx13_concat_resx13_concat_relu_splitncnn_0 resx14_conv3_resx14_conv3_scale resx14_elewise -23330=4,3,7,7,960 0=1 ReLU resx14_elewise_relu 1 1 resx14_elewise resx14_elewise_resx14_elewise_relu -23330=4,3,7,7,960 Split splitncnn_14 1 2 resx14_elewise_resx14_elewise_relu resx14_elewise_resx14_elewise_relu_splitncnn_0 resx14_elewise_resx14_elewise_relu_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960 ConvolutionDepthWise resx15_conv1 1 1 resx14_elewise_resx14_elewise_relu_splitncnn_1 resx15_conv1_resx15_conv1_relu -23330=4,3,7,7,240 0=240 1=1 5=1 6=76800 7=3 9=1 ShuffleChannel shuffle15 1 1 resx15_conv1_resx15_conv1_relu shuffle15 -23330=4,3,7,7,240 0=3 ConvolutionDepthWise resx15_conv2 1 1 shuffle15 resx15_conv2_resx15_conv2_scale -23330=4,3,7,7,240 0=240 1=3 4=1 5=1 6=2160 7=240 ConvolutionDepthWise resx15_conv3 1 1 resx15_conv2_resx15_conv2_scale resx15_conv3_resx15_conv3_scale -23330=4,3,7,7,960 0=960 1=1 5=1 6=76800 7=3 Eltwise resx15_elewise 2 1 resx14_elewise_resx14_elewise_relu_splitncnn_0 resx15_conv3_resx15_conv3_scale resx15_elewise -23330=4,3,7,7,960 0=1 ReLU resx15_elewise_relu 1 1 resx15_elewise resx15_elewise_resx15_elewise_relu -23330=4,3,7,7,960 Split splitncnn_15 1 2 resx15_elewise_resx15_elewise_relu resx15_elewise_resx15_elewise_relu_splitncnn_0 resx15_elewise_resx15_elewise_relu_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960 ConvolutionDepthWise resx16_conv1 1 1 resx15_elewise_resx15_elewise_relu_splitncnn_1 resx16_conv1_resx16_conv1_relu -23330=4,3,7,7,240 0=240 1=1 5=1 6=76800 7=3 9=1 ShuffleChannel shuffle16 1 1 resx16_conv1_resx16_conv1_relu shuffle16 -23330=4,3,7,7,240 0=3 ConvolutionDepthWise resx16_conv2 1 1 shuffle16 resx16_conv2_resx16_conv2_scale -23330=4,3,7,7,240 0=240 1=3 4=1 5=1 
6=2160 7=240 ConvolutionDepthWise resx16_conv3 1 1 resx16_conv2_resx16_conv2_scale resx16_conv3_resx16_conv3_scale -23330=4,3,7,7,960 0=960 1=1 5=1 6=76800 7=3 Eltwise resx16_elewise 2 1 resx15_elewise_resx15_elewise_relu_splitncnn_0 resx16_conv3_resx16_conv3_scale resx16_elewise -23330=4,3,7,7,960 0=1 ReLU resx16_elewise_relu 1 1 resx16_elewise resx16_elewise_resx16_elewise_relu -23330=4,3,7,7,960 Pooling pool_ave 1 1 resx16_elewise_resx16_elewise_relu pool_ave -23330=4,1,960,1,1 0=1 4=1 InnerProduct fc1000 1 1 pool_ave fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=960000 Softmax prob 1 1 fc1000 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/shufflenet_v2.param ================================================ 7767517 109 125 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution conv1 1 1 data conv1_conv1_relu -23330=4,3,112,112,24 0=24 1=3 3=2 4=1 5=1 6=648 9=1 Pooling pool1 1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,24 1=3 2=2 Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 ConvolutionDepthWise branch1_1_conv1 1 1 pool1_splitncnn_1 branch1_1_conv1_branch1_1_conv1_scale -23330=4,3,28,28,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24 Convolution branch1_1_conv2 1 1 branch1_1_conv1_branch1_1_conv1_scale branch1_1_conv2_branch1_1_conv2_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=1392 9=1 Convolution branch1_2_conv1 1 1 pool1_splitncnn_0 branch1_2_conv1_branch1_2_conv1_relu -23330=4,3,56,56,58 0=58 1=1 5=1 6=1392 9=1 ConvolutionDepthWise branch1_2_conv2 1 1 branch1_2_conv1_branch1_2_conv1_relu branch1_2_conv2_branch1_2_conv2_scale -23330=4,3,28,28,58 0=58 1=3 3=2 4=1 5=1 6=522 7=58 Convolution branch1_2_conv3 1 1 branch1_2_conv2_branch1_2_conv2_scale branch1_2_conv3_branch1_2_conv3_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1 Concat concat1 2 1 branch1_1_conv2_branch1_1_conv2_relu branch1_2_conv3_branch1_2_conv3_relu concat1 -23330=4,3,28,28,116 ShuffleChannel shuffle1 1 1 
concat1 shuffle1 -23330=4,3,28,28,116 0=2 Slice slice2 1 2 shuffle1 branch2_1 branch2_2 -23330=8,3,28,28,58,3,28,28,58 -23300=2,58,-233 Convolution branch2_2_conv1 1 1 branch2_2 branch2_2_conv1_branch2_2_conv1_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1 ConvolutionDepthWise branch2_2_conv2 1 1 branch2_2_conv1_branch2_2_conv1_relu branch2_2_conv2_branch2_2_conv2_scale -23330=4,3,28,28,58 0=58 1=3 4=1 5=1 6=522 7=58 Convolution branch2_2_conv3 1 1 branch2_2_conv2_branch2_2_conv2_scale branch2_2_conv3_branch2_2_conv3_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1 Concat concat2 2 1 branch2_1 branch2_2_conv3_branch2_2_conv3_relu concat2 -23330=4,3,28,28,116 ShuffleChannel shuffle2 1 1 concat2 shuffle2 -23330=4,3,28,28,116 0=2 Slice slice3 1 2 shuffle2 branch3_1 branch3_2 -23330=8,3,28,28,58,3,28,28,58 -23300=2,58,-233 Convolution branch3_2_conv1 1 1 branch3_2 branch3_2_conv1_branch3_2_conv1_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1 ConvolutionDepthWise branch3_2_conv2 1 1 branch3_2_conv1_branch3_2_conv1_relu branch3_2_conv2_branch3_2_conv2_scale -23330=4,3,28,28,58 0=58 1=3 4=1 5=1 6=522 7=58 Convolution branch3_2_conv3 1 1 branch3_2_conv2_branch3_2_conv2_scale branch3_2_conv3_branch3_2_conv3_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1 Concat concat3 2 1 branch3_1 branch3_2_conv3_branch3_2_conv3_relu concat3 -23330=4,3,28,28,116 ShuffleChannel shuffle3 1 1 concat3 shuffle3 -23330=4,3,28,28,116 0=2 Slice slice4 1 2 shuffle3 branch4_1 branch4_2 -23330=8,3,28,28,58,3,28,28,58 -23300=2,58,-233 Convolution branch4_2_conv1 1 1 branch4_2 branch4_2_conv1_branch4_2_conv1_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1 ConvolutionDepthWise branch4_2_conv2 1 1 branch4_2_conv1_branch4_2_conv1_relu branch4_2_conv2_branch4_2_conv2_scale -23330=4,3,28,28,58 0=58 1=3 4=1 5=1 6=522 7=58 Convolution branch4_2_conv3 1 1 branch4_2_conv2_branch4_2_conv2_scale branch4_2_conv3_branch4_2_conv3_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1 Concat concat4 2 1 branch4_1 
branch4_2_conv3_branch4_2_conv3_relu concat4 -23330=4,3,28,28,116 ShuffleChannel shuffle4 1 1 concat4 shuffle4 -23330=4,3,28,28,116 0=2 Split splitncnn_1 1 2 shuffle4 shuffle4_splitncnn_0 shuffle4_splitncnn_1 -23330=8,3,28,28,116,3,28,28,116 ConvolutionDepthWise branch5_1_conv1 1 1 shuffle4_splitncnn_1 branch5_1_conv1_branch5_1_conv1_scale -23330=4,3,14,14,116 0=116 1=3 3=2 4=1 5=1 6=1044 7=116 Convolution branch5_1_conv2 1 1 branch5_1_conv1_branch5_1_conv1_scale branch5_1_conv2_branch5_1_conv2_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 Convolution branch5_2_conv1 1 1 shuffle4_splitncnn_0 branch5_2_conv1_branch5_2_conv1_relu -23330=4,3,28,28,116 0=116 1=1 5=1 6=13456 9=1 ConvolutionDepthWise branch5_2_conv2 1 1 branch5_2_conv1_branch5_2_conv1_relu branch5_2_conv2_branch5_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 3=2 4=1 5=1 6=1044 7=116 Convolution branch5_2_conv3 1 1 branch5_2_conv2_branch5_2_conv2_scale branch5_2_conv3_branch5_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 Concat concat5 2 1 branch5_1_conv2_branch5_1_conv2_relu branch5_2_conv3_branch5_2_conv3_relu concat5 -23330=4,3,14,14,232 ShuffleChannel shuffle5 1 1 concat5 shuffle5 -23330=4,3,14,14,232 0=2 Slice slice6 1 2 shuffle5 branch6_1 branch6_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233 Convolution branch6_2_conv1 1 1 branch6_2 branch6_2_conv1_branch6_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 ConvolutionDepthWise branch6_2_conv2 1 1 branch6_2_conv1_branch6_2_conv1_relu branch6_2_conv2_branch6_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution branch6_2_conv3 1 1 branch6_2_conv2_branch6_2_conv2_scale branch6_2_conv3_branch6_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 Concat concat6 2 1 branch6_1 branch6_2_conv3_branch6_2_conv3_relu concat6 -23330=4,3,14,14,232 ShuffleChannel shuffle6 1 1 concat6 shuffle6 -23330=4,3,14,14,232 0=2 Slice slice7 1 2 shuffle6 branch7_1 branch7_2 -23330=8,3,14,14,116,3,14,14,116 
-23300=2,116,-233 Convolution branch7_2_conv1 1 1 branch7_2 branch7_2_conv1_branch7_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 ConvolutionDepthWise branch7_2_conv2 1 1 branch7_2_conv1_branch7_2_conv1_relu branch7_2_conv2_branch7_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution branch7_2_conv3 1 1 branch7_2_conv2_branch7_2_conv2_scale branch7_2_conv3_branch7_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 Concat concat7 2 1 branch7_1 branch7_2_conv3_branch7_2_conv3_relu concat7 -23330=4,3,14,14,232 ShuffleChannel shuffle7 1 1 concat7 shuffle7 -23330=4,3,14,14,232 0=2 Slice slice8 1 2 shuffle7 branch8_1 branch8_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233 Convolution branch8_2_conv1 1 1 branch8_2 branch8_2_conv1_branch8_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 ConvolutionDepthWise branch8_2_conv2 1 1 branch8_2_conv1_branch8_2_conv1_relu branch8_2_conv2_branch8_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution branch8_2_conv3 1 1 branch8_2_conv2_branch8_2_conv2_scale branch8_2_conv3_branch8_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 Concat concat8 2 1 branch8_1 branch8_2_conv3_branch8_2_conv3_relu concat8 -23330=4,3,14,14,232 ShuffleChannel shuffle8 1 1 concat8 shuffle8 -23330=4,3,14,14,232 0=2 Slice slice9 1 2 shuffle8 branch9_1 branch9_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233 Convolution branch9_2_conv1 1 1 branch9_2 branch9_2_conv1_branch9_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 ConvolutionDepthWise branch9_2_conv2 1 1 branch9_2_conv1_branch9_2_conv1_relu branch9_2_conv2_branch9_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution branch9_2_conv3 1 1 branch9_2_conv2_branch9_2_conv2_scale branch9_2_conv3_branch9_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 Concat concat9 2 1 branch9_1 branch9_2_conv3_branch9_2_conv3_relu concat9 -23330=4,3,14,14,232 ShuffleChannel 
shuffle9 1 1 concat9 shuffle9 -23330=4,3,14,14,232 0=2 Slice slice10 1 2 shuffle9 branch10_1 branch10_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233 Convolution branch10_2_conv1 1 1 branch10_2 branch10_2_conv1_branch10_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 ConvolutionDepthWise branch10_2_conv2 1 1 branch10_2_conv1_branch10_2_conv1_relu branch10_2_conv2_branch10_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution branch10_2_conv3 1 1 branch10_2_conv2_branch10_2_conv2_scale branch10_2_conv3_branch10_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 Concat concat10 2 1 branch10_1 branch10_2_conv3_branch10_2_conv3_relu concat10 -23330=4,3,14,14,232 ShuffleChannel shuffle10 1 1 concat10 shuffle10 -23330=4,3,14,14,232 0=2 Slice slice11 1 2 shuffle10 branch11_1 branch11_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233 Convolution branch11_2_conv1 1 1 branch11_2 branch11_2_conv1_branch11_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 ConvolutionDepthWise branch11_2_conv2 1 1 branch11_2_conv1_branch11_2_conv1_relu branch11_2_conv2_branch11_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution branch11_2_conv3 1 1 branch11_2_conv2_branch11_2_conv2_scale branch11_2_conv3_branch11_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 Concat concat11 2 1 branch11_1 branch11_2_conv3_branch11_2_conv3_relu concat11 -23330=4,3,14,14,232 ShuffleChannel shuffle11 1 1 concat11 shuffle11 -23330=4,3,14,14,232 0=2 Slice slice12 1 2 shuffle11 branch12_1 branch12_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233 Convolution branch12_2_conv1 1 1 branch12_2 branch12_2_conv1_branch12_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 ConvolutionDepthWise branch12_2_conv2 1 1 branch12_2_conv1_branch12_2_conv1_relu branch12_2_conv2_branch12_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116 Convolution branch12_2_conv3 1 1 
branch12_2_conv2_branch12_2_conv2_scale branch12_2_conv3_branch12_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1 Concat concat12 2 1 branch12_1 branch12_2_conv3_branch12_2_conv3_relu concat12 -23330=4,3,14,14,232 ShuffleChannel shuffle12 1 1 concat12 shuffle12 -23330=4,3,14,14,232 0=2 Split splitncnn_2 1 2 shuffle12 shuffle12_splitncnn_0 shuffle12_splitncnn_1 -23330=8,3,14,14,232,3,14,14,232 ConvolutionDepthWise branch13_1_conv1 1 1 shuffle12_splitncnn_1 branch13_1_conv1_branch13_1_conv1_scale -23330=4,3,7,7,232 0=232 1=3 3=2 4=1 5=1 6=2088 7=232 Convolution branch13_1_conv2 1 1 branch13_1_conv1_branch13_1_conv1_scale branch13_1_conv2_branch13_1_conv2_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1 Convolution branch13_2_conv1 1 1 shuffle12_splitncnn_0 branch13_2_conv1_branch13_2_conv1_relu -23330=4,3,14,14,232 0=232 1=1 5=1 6=53824 9=1 ConvolutionDepthWise branch13_2_conv2 1 1 branch13_2_conv1_branch13_2_conv1_relu branch13_2_conv2_branch13_2_conv2_scale -23330=4,3,7,7,232 0=232 1=3 3=2 4=1 5=1 6=2088 7=232 Convolution branch13_2_conv3 1 1 branch13_2_conv2_branch13_2_conv2_scale branch13_2_conv3_branch13_2_conv3_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1 Concat concat13 2 1 branch13_1_conv2_branch13_1_conv2_relu branch13_2_conv3_branch13_2_conv3_relu concat13 -23330=4,3,7,7,464 ShuffleChannel shuffle13 1 1 concat13 shuffle13 -23330=4,3,7,7,464 0=2 Slice slice14 1 2 shuffle13 branch14_1 branch14_2 -23330=8,3,7,7,232,3,7,7,232 -23300=2,232,-233 Convolution branch14_2_conv1 1 1 branch14_2 branch14_2_conv1_branch14_2_conv1_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1 ConvolutionDepthWise branch14_2_conv2 1 1 branch14_2_conv1_branch14_2_conv1_relu branch14_2_conv2_branch14_2_conv2_scale -23330=4,3,7,7,232 0=232 1=3 4=1 5=1 6=2088 7=232 Convolution branch14_2_conv3 1 1 branch14_2_conv2_branch14_2_conv2_scale branch14_2_conv3_branch14_2_conv3_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1 Concat concat14 2 1 branch14_1 
branch14_2_conv3_branch14_2_conv3_relu concat14 -23330=4,3,7,7,464 ShuffleChannel shuffle14 1 1 concat14 shuffle14 -23330=4,3,7,7,464 0=2 Slice slice15 1 2 shuffle14 branch15_1 branch15_2 -23330=8,3,7,7,232,3,7,7,232 -23300=2,232,-233 Convolution branch15_2_conv1 1 1 branch15_2 branch15_2_conv1_branch15_2_conv1_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1 ConvolutionDepthWise branch15_2_conv2 1 1 branch15_2_conv1_branch15_2_conv1_relu branch15_2_conv2_branch15_2_conv2_scale -23330=4,3,7,7,232 0=232 1=3 4=1 5=1 6=2088 7=232 Convolution branch15_2_conv3 1 1 branch15_2_conv2_branch15_2_conv2_scale branch15_2_conv3_branch15_2_conv3_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1 Concat concat15 2 1 branch15_1 branch15_2_conv3_branch15_2_conv3_relu concat15 -23330=4,3,7,7,464 ShuffleChannel shuffle15 1 1 concat15 shuffle15 -23330=4,3,7,7,464 0=2 Slice slice16 1 2 shuffle15 branch16_1 branch16_2 -23330=8,3,7,7,232,3,7,7,232 -23300=2,232,-233 Convolution branch16_2_conv1 1 1 branch16_2 branch16_2_conv1_branch16_2_conv1_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1 ConvolutionDepthWise branch16_2_conv2 1 1 branch16_2_conv1_branch16_2_conv1_relu branch16_2_conv2_branch16_2_conv2_scale -23330=4,3,7,7,232 0=232 1=3 4=1 5=1 6=2088 7=232 Convolution branch16_2_conv3 1 1 branch16_2_conv2_branch16_2_conv2_scale branch16_2_conv3_branch16_2_conv3_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1 Concat concat16 2 1 branch16_1 branch16_2_conv3_branch16_2_conv3_relu concat16 -23330=4,3,7,7,464 ShuffleChannel shuffle16 1 1 concat16 shuffle16 -23330=4,3,7,7,464 0=2 Convolution conv5 1 1 shuffle16 conv5_conv5_relu -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=475136 9=1 Pooling pool_ave 1 1 conv5_conv5_relu pool_ave -23330=4,1,1024,1,1 0=1 4=1 InnerProduct fc1000 1 1 pool_ave fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=1024000 Softmax prob 1 1 fc1000 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/squeezenet.param 
================================================ 7767517 48 56 Input data 0 1 data -23330=4,3,227,227,3 0=227 1=227 2=3 Convolution conv1 1 1 data conv1_relu_conv1 -23330=4,3,113,113,64 0=64 1=3 3=2 5=1 6=1728 9=1 Pooling pool1 1 1 conv1_relu_conv1 pool1 -23330=4,3,56,56,64 1=3 2=2 Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 -23330=4,3,56,56,16 0=16 1=1 5=1 6=1024 9=1 Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -23330=8,3,56,56,16,3,56,56,16 Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=1024 9=1 Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=9216 9=1 Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -23330=4,3,56,56,128 Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 -23330=4,3,56,56,16 0=16 1=1 5=1 6=2048 9=1 Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -23330=8,3,56,56,16,3,56,56,16 Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=1024 9=1 Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=9216 9=1 Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -23330=4,3,56,56,128 Pooling pool3 1 1 fire3/concat pool3 -23330=4,3,28,28,128 1=3 2=2 Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 
-23330=4,3,28,28,32 0=32 1=1 5=1 6=4096 9=1 Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32 Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=4096 9=1 Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=36864 9=1 Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -23330=4,3,28,28,256 Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 -23330=4,3,28,28,32 0=32 1=1 5=1 6=8192 9=1 Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32 Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=4096 9=1 Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=36864 9=1 Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -23330=4,3,28,28,256 Pooling pool5 1 1 fire5/concat pool5 -23330=4,3,14,14,256 1=3 2=2 Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 -23330=4,3,14,14,48 0=48 1=1 5=1 6=12288 9=1 Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,48,3,14,14,48 Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 
fire6/expand1x1_fire6/relu_expand1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=9216 9=1 Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 -23330=4,3,14,14,192 0=192 1=3 4=1 5=1 6=82944 9=1 Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -23330=4,3,14,14,384 Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 -23330=4,3,14,14,48 0=48 1=1 5=1 6=18432 9=1 Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,48,3,14,14,48 Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=9216 9=1 Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 -23330=4,3,14,14,192 0=192 1=3 4=1 5=1 6=82944 9=1 Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -23330=4,3,14,14,384 Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576 9=1 Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64 Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=16384 9=1 Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=147456 9=1 Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -23330=4,3,14,14,512 Convolution 
fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 9=1 Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64 Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=16384 9=1 Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=147456 9=1 Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat_drop9 -23330=4,3,14,14,512 Convolution conv10 1 1 fire9/concat_drop9 conv10_relu_conv10 -23330=4,3,16,16,1000 0=1000 1=1 4=1 5=1 6=512000 9=1 Pooling pool10 1 1 conv10_relu_conv10 pool10 -23330=4,1,1000,1,1 0=1 4=1 Softmax prob 1 1 pool10 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/squeezenet_int8.param ================================================ 7767517 48 56 Input data 0 1 data 0=227 1=227 2=3 Convolution conv1 1 1 data conv1_relu_conv1 0=64 1=3 3=2 5=1 6=1728 8=2 9=1 Pooling pool1 1 1 conv1_relu_conv1 pool1 1=3 2=2 Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=16 1=1 5=1 6=1024 8=102 9=1 Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1 Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 
fire2/expand3x3_fire2/relu_expand3x3 fire2/concat Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 0=16 1=1 5=1 6=2048 8=102 9=1 Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1 Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat Pooling pool3 1 1 fire3/concat pool3 1=3 2=2 Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=32 1=1 5=1 6=4096 8=102 9=1 Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1 Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 0=32 1=1 5=1 6=8192 8=102 9=1 Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1 Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 
fire5/expand3x3_fire5/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat Pooling pool5 1 1 fire5/concat pool5 1=3 2=2 Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=48 1=1 5=1 6=12288 8=102 9=1 Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1 Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 0=48 1=1 5=1 6=18432 8=102 9=1 Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1 Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 0=64 1=1 5=1 6=24576 8=102 9=1 Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 
fire8/expand1x1_fire8/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1 Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 0=64 1=1 5=1 6=32768 8=102 9=1 Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1 Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat_drop9 Convolution conv10 1 1 fire9/concat_drop9 conv10_relu_conv10 0=1000 1=1 4=1 5=1 6=512000 8=2 9=1 Pooling pool10 1 1 conv10_relu_conv10 pool10 0=1 4=1 Softmax prob 1 1 pool10 output ================================================ FILE: benchmark/squeezenet_ssd.param ================================================ 7767517 119 152 Input data 0 1 data -23330=4,3,300,300,3 0=300 1=300 2=3 Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -23330=28,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3 Convolution conv1 1 1 data_splitncnn_6 conv1_relu_conv1 -23330=4,3,149,149,64 0=64 1=3 3=2 5=1 6=1728 9=1 Pooling pool1 1 1 conv1_relu_conv1 pool1 -23330=4,3,74,74,64 1=3 2=2 Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 -23330=4,3,74,74,16 0=16 1=1 5=1 6=1024 9=1 Split splitncnn_1 1 2 
fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -23330=8,3,74,74,16,3,74,74,16 Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 -23330=4,3,74,74,64 0=64 1=1 5=1 6=1024 9=1 Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 -23330=4,3,74,74,64 0=64 1=3 4=1 5=1 6=9216 9=1 Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -23330=4,3,74,74,128 Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 -23330=4,3,74,74,16 0=16 1=1 5=1 6=2048 9=1 Split splitncnn_2 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -23330=8,3,74,74,16,3,74,74,16 Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 -23330=4,3,74,74,64 0=64 1=1 5=1 6=1024 9=1 Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 -23330=4,3,74,74,64 0=64 1=3 4=1 5=1 6=9216 9=1 Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -23330=4,3,74,74,128 Pooling pool3 1 1 fire3/concat pool3 -23330=4,3,37,37,128 1=3 2=2 Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 -23330=4,3,37,37,32 0=32 1=1 5=1 6=4096 9=1 Split splitncnn_3 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -23330=8,3,37,37,32,3,37,37,32 Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 -23330=4,3,37,37,128 0=128 1=1 5=1 6=4096 9=1 Convolution fire4/expand3x3 
1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 -23330=4,3,37,37,128 0=128 1=3 4=1 5=1 6=36864 9=1 Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -23330=4,3,37,37,256 Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 -23330=4,3,37,37,32 0=32 1=1 5=1 6=8192 9=1 Split splitncnn_4 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -23330=8,3,37,37,32,3,37,37,32 Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 -23330=4,3,37,37,128 0=128 1=1 5=1 6=4096 9=1 Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 -23330=4,3,37,37,128 0=128 1=3 4=1 5=1 6=36864 9=1 Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -23330=4,3,37,37,256 Split splitncnn_5 1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1 -23330=8,3,37,37,256,3,37,37,256 Pooling pool5 1 1 fire5/concat_splitncnn_1 pool5 -23330=4,3,18,18,256 1=3 2=2 Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 -23330=4,3,18,18,48 0=48 1=1 5=1 6=12288 9=1 Split splitncnn_6 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,48,3,18,18,48 Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 -23330=4,3,18,18,192 0=192 1=1 5=1 6=9216 9=1 Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 -23330=4,3,18,18,192 0=192 1=3 4=1 5=1 6=82944 9=1 Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 
fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -23330=4,3,18,18,384 Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 -23330=4,3,18,18,48 0=48 1=1 5=1 6=18432 9=1 Split splitncnn_7 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,48,3,18,18,48 Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 -23330=4,3,18,18,192 0=192 1=1 5=1 6=9216 9=1 Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 -23330=4,3,18,18,192 0=192 1=3 4=1 5=1 6=82944 9=1 Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -23330=4,3,18,18,384 Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 -23330=4,3,18,18,64 0=64 1=1 5=1 6=24576 9=1 Split splitncnn_8 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,64,3,18,18,64 Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 -23330=4,3,18,18,256 0=256 1=1 5=1 6=16384 9=1 Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 -23330=4,3,18,18,256 0=256 1=3 4=1 5=1 6=147456 9=1 Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -23330=4,3,18,18,512 Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 -23330=4,3,18,18,64 0=64 1=1 5=1 6=32768 9=1 Split splitncnn_9 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,64,3,18,18,64 Convolution 
fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 -23330=4,3,18,18,256 0=256 1=1 5=1 6=16384 9=1 Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 -23330=4,3,18,18,256 0=256 1=3 4=1 5=1 6=147456 9=1 Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat -23330=4,3,18,18,512 Split splitncnn_10 1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3 -23330=16,3,18,18,512,3,18,18,512,3,18,18,512,3,18,18,512 Pooling pool9 1 1 fire9/concat_splitncnn_3 pool9 -23330=4,3,9,9,512 1=3 2=2 Convolution fire10/squeeze1x1 1 1 pool9 fire10/squeeze1x1_fire10/relu_squeeze1x1 -23330=4,3,9,9,96 0=96 1=1 5=1 6=49152 9=1 Split splitncnn_11 1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 -23330=8,3,9,9,96,3,9,9,96 Convolution fire10/expand1x1 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1_fire10/relu_expand1x1 -23330=4,3,9,9,384 0=384 1=1 5=1 6=36864 9=1 Convolution fire10/expand3x3 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3_fire10/relu_expand3x3 -23330=4,3,9,9,384 0=384 1=3 4=1 5=1 6=331776 9=1 Concat fire10/concat 2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat -23330=4,3,9,9,768 Split splitncnn_12 1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3 -23330=16,3,9,9,768,3,9,9,768,3,9,9,768,3,9,9,768 Pooling pool10 1 1 fire10/concat_splitncnn_3 pool10 -23330=4,3,4,4,768 1=3 2=2 Convolution fire11/squeeze1x1 1 1 pool10 fire11/squeeze1x1_fire11/relu_squeeze1x1 -23330=4,3,4,4,96 0=96 1=1 5=1 6=73728 9=1 Split splitncnn_13 1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 
fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 -23330=8,3,4,4,96,3,4,4,96 Convolution fire11/expand1x1 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1_fire11/relu_expand1x1 -23330=4,3,4,4,384 0=384 1=1 5=1 6=36864 9=1 Convolution fire11/expand3x3 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3_fire11/relu_expand3x3 -23330=4,3,4,4,384 0=384 1=3 4=1 5=1 6=331776 9=1 Concat fire11/concat 2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat -23330=4,3,4,4,768 Split splitncnn_14 1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3 -23330=16,3,4,4,768,3,4,4,768,3,4,4,768,3,4,4,768 Convolution conv12_1 1 1 fire11/concat_splitncnn_3 conv12_1_conv12_1/relu -23330=4,3,4,4,128 0=128 1=1 5=1 6=98304 9=1 Convolution conv12_2 1 1 conv12_1_conv12_1/relu conv12_2_conv12_2/relu -23330=4,3,2,2,256 0=256 1=3 3=2 4=1 5=1 6=294912 9=1 Split splitncnn_15 1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3 -23330=16,3,2,2,256,3,2,2,256,3,2,2,256,3,2,2,256 Convolution conv13_1 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1_conv13_1/relu -23330=4,3,2,2,64 0=64 1=1 5=1 6=16384 9=1 Convolution conv13_2 1 1 conv13_1_conv13_1/relu conv13_2_conv13_2/relu -23330=4,3,1,1,128 0=128 1=3 3=2 4=1 5=1 6=73728 9=1 Split splitncnn_16 1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2 -23330=12,3,1,1,128,3,1,1,128,3,1,1,128 BatchNorm fire5/bn 1 1 fire5/concat_splitncnn_0 fire5/normal_fire5/scale -23330=4,3,37,37,256 0=256 Split splitncnn_17 1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2 
-23330=12,3,37,37,256,3,37,37,256,3,37,37,256 Convolution fire5_mbox_loc 1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc -23330=4,3,37,37,16 0=16 1=3 4=1 5=1 6=36864 Permute fire5_mbox_loc_perm 1 1 fire5_mbox_loc fire5_mbox_loc_perm -23330=4,3,16,37,37 0=3 Flatten fire5_mbox_loc_flat 1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat -23330=4,1,21904,1,1 Convolution fire5_mbox_conf 1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf -23330=4,3,37,37,84 0=84 1=3 4=1 5=1 6=193536 Permute fire5_mbox_conf_perm 1 1 fire5_mbox_conf fire5_mbox_conf_perm -23330=4,3,84,37,37 0=3 Flatten fire5_mbox_conf_flat 1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat -23330=4,1,114996,1,1 PriorBox fire5_mbox_priorbox 2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23330=4,2,21904,2,1 -23300=1,2.100000e+01 -23301=1,4.500000e+01 -23302=1,2.000000e+00 9=-233 10=-233 11=8.000000e+00 12=8.000000e+00 13=5.000000e-01 Convolution fire9_mbox_loc 1 1 fire9/concat_splitncnn_2 fire9_mbox_loc -23330=4,3,18,18,24 0=24 1=3 4=1 5=1 6=110592 Permute fire9_mbox_loc_perm 1 1 fire9_mbox_loc fire9_mbox_loc_perm -23330=4,3,24,18,18 0=3 Flatten fire9_mbox_loc_flat 1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat -23330=4,1,7776,1,1 Convolution fire9_mbox_conf 1 1 fire9/concat_splitncnn_1 fire9_mbox_conf -23330=4,3,18,18,126 0=126 1=3 4=1 5=1 6=580608 Permute fire9_mbox_conf_perm 1 1 fire9_mbox_conf fire9_mbox_conf_perm -23330=4,3,126,18,18 0=3 Flatten fire9_mbox_conf_flat 1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat -23330=4,1,40824,1,1 PriorBox fire9_mbox_priorbox 2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23330=4,2,7776,2,1 -23300=1,4.500000e+01 -23301=1,9.900000e+01 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 11=1.600000e+01 12=1.600000e+01 13=5.000000e-01 Convolution fire10_mbox_loc 1 1 fire10/concat_splitncnn_2 fire10_mbox_loc -23330=4,3,9,9,24 0=24 1=3 4=1 5=1 6=165888 Permute fire10_mbox_loc_perm 1 1 fire10_mbox_loc fire10_mbox_loc_perm 
-23330=4,3,24,9,9 0=3 Flatten fire10_mbox_loc_flat 1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat -23330=4,1,1944,1,1 Convolution fire10_mbox_conf 1 1 fire10/concat_splitncnn_1 fire10_mbox_conf -23330=4,3,9,9,126 0=126 1=3 4=1 5=1 6=870912 Permute fire10_mbox_conf_perm 1 1 fire10_mbox_conf fire10_mbox_conf_perm -23330=4,3,126,9,9 0=3 Flatten fire10_mbox_conf_flat 1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat -23330=4,1,10206,1,1 PriorBox fire10_mbox_priorbox 2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23330=4,2,1944,2,1 -23300=1,9.900000e+01 -23301=1,1.530000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 11=3.200000e+01 12=3.200000e+01 13=5.000000e-01 Convolution fire11_mbox_loc 1 1 fire11/concat_splitncnn_2 fire11_mbox_loc -23330=4,3,4,4,24 0=24 1=3 4=1 5=1 6=165888 Permute fire11_mbox_loc_perm 1 1 fire11_mbox_loc fire11_mbox_loc_perm -23330=4,3,24,4,4 0=3 Flatten fire11_mbox_loc_flat 1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat -23330=4,1,384,1,1 Convolution fire11_mbox_conf 1 1 fire11/concat_splitncnn_1 fire11_mbox_conf -23330=4,3,4,4,126 0=126 1=3 4=1 5=1 6=870912 Permute fire11_mbox_conf_perm 1 1 fire11_mbox_conf fire11_mbox_conf_perm -23330=4,3,126,4,4 0=3 Flatten fire11_mbox_conf_flat 1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat -23330=4,1,2016,1,1 PriorBox fire11_mbox_priorbox 2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23330=4,2,384,2,1 -23300=1,1.530000e+02 -23301=1,2.070000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 11=6.400000e+01 12=6.400000e+01 13=5.000000e-01 Convolution conv12_2_mbox_loc 1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc -23330=4,3,2,2,24 0=24 1=3 4=1 5=1 6=55296 Permute conv12_2_mbox_loc_perm 1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm -23330=4,3,24,2,2 0=3 Flatten conv12_2_mbox_loc_flat 1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat -23330=4,1,96,1,1 Convolution conv12_2_mbox_conf 1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 
-23330=4,3,2,2,126 0=126 1=3 4=1 5=1 6=290304 Permute conv12_2_mbox_conf_perm 1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm -23330=4,3,126,2,2 0=3 Flatten conv12_2_mbox_conf_flat 1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat -23330=4,1,504,1,1 PriorBox conv12_2_mbox_priorbox 2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23330=4,2,96,2,1 -23300=1,2.070000e+02 -23301=1,2.610000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 11=1.000000e+02 12=1.000000e+02 13=5.000000e-01 Convolution conv13_2_mbox_loc 1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc -23330=4,3,1,1,16 0=16 1=3 4=1 5=1 6=18432 Permute conv13_2_mbox_loc_perm 1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm -23330=4,3,16,1,1 0=3 Flatten conv13_2_mbox_loc_flat 1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat -23330=4,1,16,1,1 Convolution conv13_2_mbox_conf 1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf -23330=4,3,1,1,84 0=84 1=3 4=1 5=1 6=96768 Permute conv13_2_mbox_conf_perm 1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm -23330=4,3,84,1,1 0=3 Flatten conv13_2_mbox_conf_flat 1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat -23330=4,1,84,1,1 PriorBox conv13_2_mbox_priorbox 2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23330=4,2,16,2,1 -23300=1,2.610000e+02 -23301=1,3.150000e+02 -23302=1,2.000000e+00 9=-233 10=-233 11=3.000000e+02 12=3.000000e+02 13=5.000000e-01 Concat mbox_loc 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc -23330=4,1,32120,1,1 Concat mbox_conf 6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf -23330=4,1,168630,1,1 Concat mbox_priorbox 6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox -23330=4,2,32120,2,1 0=1 
Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape -23330=4,2,21,8030,1 0=21 1=-1 Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax -23330=4,2,21,8030,1 0=1 1=1 Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten -23330=4,1,168630,1,1 DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=4.500000e-01 2=100 4=2.500000e-01 ================================================ FILE: benchmark/squeezenet_ssd_int8.param ================================================ 7767517 119 152 Input data 0 1 data 0=300 1=300 2=3 Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 Convolution conv1 1 1 data_splitncnn_6 conv1_relu_conv1 0=64 1=3 3=2 5=1 6=1728 8=2 9=1 Pooling pool1 1 1 conv1_relu_conv1 pool1 1=3 2=2 Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=16 1=1 5=1 6=1024 8=102 9=1 Split splitncnn_1 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1 Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 0=16 1=1 5=1 6=2048 8=102 9=1 Split splitncnn_2 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1 Convolution 
fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat Pooling pool3 1 1 fire3/concat pool3 1=3 2=2 Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=32 1=1 5=1 6=4096 8=102 9=1 Split splitncnn_3 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1 Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 0=32 1=1 5=1 6=8192 8=102 9=1 Split splitncnn_4 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1 Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat Split splitncnn_5 1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1 Pooling pool5 1 1 fire5/concat_splitncnn_1 pool5 1=3 2=2 Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=48 1=1 5=1 6=12288 8=102 9=1 Split splitncnn_6 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 
fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1 Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 0=48 1=1 5=1 6=18432 8=102 9=1 Split splitncnn_7 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1 Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 0=64 1=1 5=1 6=24576 8=102 9=1 Split splitncnn_8 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1 Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 0=64 1=1 
5=1 6=32768 8=102 9=1 Split splitncnn_9 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1 Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat Split splitncnn_10 1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3 Pooling pool9 1 1 fire9/concat_splitncnn_3 pool9 1=3 2=2 Convolution fire10/squeeze1x1 1 1 pool9 fire10/squeeze1x1_fire10/relu_squeeze1x1 0=96 1=1 5=1 6=49152 8=102 9=1 Split splitncnn_11 1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 Convolution fire10/expand1x1 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1_fire10/relu_expand1x1 0=384 1=1 5=1 6=36864 8=2 9=1 Convolution fire10/expand3x3 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3_fire10/relu_expand3x3 0=384 1=3 4=1 5=1 6=331776 8=2 9=1 Concat fire10/concat 2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat Split splitncnn_12 1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3 Pooling pool10 1 1 fire10/concat_splitncnn_3 pool10 1=3 2=2 Convolution fire11/squeeze1x1 1 1 pool10 fire11/squeeze1x1_fire11/relu_squeeze1x1 0=96 1=1 5=1 6=73728 8=102 9=1 Split splitncnn_13 1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 
Convolution fire11/expand1x1 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1_fire11/relu_expand1x1 0=384 1=1 5=1 6=36864 8=2 9=1 Convolution fire11/expand3x3 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3_fire11/relu_expand3x3 0=384 1=3 4=1 5=1 6=331776 8=2 9=1 Concat fire11/concat 2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat Split splitncnn_14 1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3 Convolution conv12_1 1 1 fire11/concat_splitncnn_3 conv12_1_conv12_1/relu 0=128 1=1 5=1 6=98304 8=102 9=1 Convolution conv12_2 1 1 conv12_1_conv12_1/relu conv12_2_conv12_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1 Split splitncnn_15 1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3 Convolution conv13_1 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1_conv13_1/relu 0=64 1=1 5=1 6=16384 8=102 9=1 Convolution conv13_2 1 1 conv13_1_conv13_1/relu conv13_2_conv13_2/relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1 Split splitncnn_16 1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2 BatchNorm fire5/bn 1 1 fire5/concat_splitncnn_0 fire5/normal_fire5/scale 0=256 Split splitncnn_17 1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2 Convolution fire5_mbox_loc 1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc 0=16 1=3 4=1 5=1 6=36864 8=2 Permute fire5_mbox_loc_perm 1 1 fire5_mbox_loc fire5_mbox_loc_perm 0=3 Flatten fire5_mbox_loc_flat 1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat Convolution fire5_mbox_conf 1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf 0=84 1=3 4=1 5=1 6=193536 8=2 Permute fire5_mbox_conf_perm 1 1 
fire5_mbox_conf fire5_mbox_conf_perm 0=3 Flatten fire5_mbox_conf_flat 1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat PriorBox fire5_mbox_priorbox 2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000 Convolution fire9_mbox_loc 1 1 fire9/concat_splitncnn_2 fire9_mbox_loc 0=24 1=3 4=1 5=1 6=110592 8=2 Permute fire9_mbox_loc_perm 1 1 fire9_mbox_loc fire9_mbox_loc_perm 0=3 Flatten fire9_mbox_loc_flat 1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat Convolution fire9_mbox_conf 1 1 fire9/concat_splitncnn_1 fire9_mbox_conf 0=126 1=3 4=1 5=1 6=580608 8=2 Permute fire9_mbox_conf_perm 1 1 fire9_mbox_conf fire9_mbox_conf_perm 0=3 Flatten fire9_mbox_conf_flat 1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat PriorBox fire9_mbox_priorbox 2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000 Convolution fire10_mbox_loc 1 1 fire10/concat_splitncnn_2 fire10_mbox_loc 0=24 1=3 4=1 5=1 6=165888 8=2 Permute fire10_mbox_loc_perm 1 1 fire10_mbox_loc fire10_mbox_loc_perm 0=3 Flatten fire10_mbox_loc_flat 1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat Convolution fire10_mbox_conf 1 1 fire10/concat_splitncnn_1 fire10_mbox_conf 0=126 1=3 4=1 5=1 6=870912 8=2 Permute fire10_mbox_conf_perm 1 1 fire10_mbox_conf fire10_mbox_conf_perm 0=3 Flatten fire10_mbox_conf_flat 1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat PriorBox fire10_mbox_priorbox 2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000 Convolution fire11_mbox_loc 1 1 fire11/concat_splitncnn_2 fire11_mbox_loc 0=24 1=3 4=1 5=1 6=165888 8=2 Permute fire11_mbox_loc_perm 1 1 fire11_mbox_loc fire11_mbox_loc_perm 0=3 Flatten fire11_mbox_loc_flat 1 1 
fire11_mbox_loc_perm fire11_mbox_loc_flat Convolution fire11_mbox_conf 1 1 fire11/concat_splitncnn_1 fire11_mbox_conf 0=126 1=3 4=1 5=1 6=870912 8=2 Permute fire11_mbox_conf_perm 1 1 fire11_mbox_conf fire11_mbox_conf_perm 0=3 Flatten fire11_mbox_conf_flat 1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat PriorBox fire11_mbox_priorbox 2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000 Convolution conv12_2_mbox_loc 1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc 0=24 1=3 4=1 5=1 6=55296 8=2 Permute conv12_2_mbox_loc_perm 1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm 0=3 Flatten conv12_2_mbox_loc_flat 1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat Convolution conv12_2_mbox_conf 1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 0=126 1=3 4=1 5=1 6=290304 8=2 Permute conv12_2_mbox_conf_perm 1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm 0=3 Flatten conv12_2_mbox_conf_flat 1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat PriorBox conv12_2_mbox_priorbox 2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000 Convolution conv13_2_mbox_loc 1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc 0=16 1=3 4=1 5=1 6=18432 8=2 Permute conv13_2_mbox_loc_perm 1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm 0=3 Flatten conv13_2_mbox_loc_flat 1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat Convolution conv13_2_mbox_conf 1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf 0=84 1=3 4=1 5=1 6=96768 8=2 Permute conv13_2_mbox_conf_perm 1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm 0=3 Flatten conv13_2_mbox_conf_flat 1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat PriorBox conv13_2_mbox_priorbox 2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox 
-23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000 Concat mbox_loc 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc Concat mbox_conf 6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf Concat mbox_priorbox 6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox 0=1 Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1 1=1 Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=0.450000 2=100 4=0.250000 ================================================ FILE: benchmark/vgg16.param ================================================ 7767517 23 23 Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 Convolution conv1_1 1 1 data conv1_1_relu1_1 -23330=4,3,224,224,64 0=64 1=3 4=1 5=1 6=1728 9=1 Convolution conv1_2 1 1 conv1_1_relu1_1 conv1_2_relu1_2 -23330=4,3,224,224,64 0=64 1=3 4=1 5=1 6=36864 9=1 Pooling pool1 1 1 conv1_2_relu1_2 pool1 -23330=4,3,112,112,64 1=2 2=2 Convolution conv2_1 1 1 pool1 conv2_1_relu2_1 -23330=4,3,112,112,128 0=128 1=3 4=1 5=1 6=73728 9=1 Convolution conv2_2 1 1 conv2_1_relu2_1 conv2_2_relu2_2 -23330=4,3,112,112,128 0=128 1=3 4=1 5=1 6=147456 9=1 Pooling pool2 1 1 conv2_2_relu2_2 pool2 -23330=4,3,56,56,128 1=2 2=2 Convolution conv3_1 1 1 pool2 conv3_1_relu3_1 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=294912 9=1 Convolution conv3_2 1 1 conv3_1_relu3_1 conv3_2_relu3_2 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=589824 9=1 Convolution conv3_3 1 1 conv3_2_relu3_2 conv3_3_relu3_3 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=589824 9=1 Pooling 
pool3 1 1 conv3_3_relu3_3 pool3 -23330=4,3,28,28,256 1=2 2=2 Convolution conv4_1 1 1 pool3 conv4_1_relu4_1 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=1179648 9=1 Convolution conv4_2 1 1 conv4_1_relu4_1 conv4_2_relu4_2 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=2359296 9=1 Convolution conv4_3 1 1 conv4_2_relu4_2 conv4_3_relu4_3 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=2359296 9=1 Pooling pool4 1 1 conv4_3_relu4_3 pool4 -23330=4,3,14,14,512 1=2 2=2 Convolution conv5_1 1 1 pool4 conv5_1_relu5_1 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 9=1 Convolution conv5_2 1 1 conv5_1_relu5_1 conv5_2_relu5_2 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 9=1 Convolution conv5_3 1 1 conv5_2_relu5_2 conv5_3_relu5_3 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 9=1 Pooling pool5 1 1 conv5_3_relu5_3 pool5 -23330=4,3,7,7,512 1=2 2=2 InnerProduct fc6 1 1 pool5 fc6_drop6 -23330=4,1,4096,1,1 0=4096 1=1 2=102760448 9=1 InnerProduct fc7 1 1 fc6_drop6 fc7_drop7 -23330=4,1,4096,1,1 0=4096 1=1 2=16777216 9=1 InnerProduct fc8 1 1 fc7_drop7 fc8 -23330=4,1,1000,1,1 0=1000 1=1 2=4096000 Softmax prob 1 1 fc8 output -23330=4,1,1000,1,1 ================================================ FILE: benchmark/vgg16_int8.param ================================================ 7767517 23 23 Input data 0 1 data 0=224 1=224 2=3 Convolution conv1_1 1 1 data conv1_1_relu1_1 0=64 1=3 4=1 5=1 6=1728 8=102 9=1 Convolution conv1_2 1 1 conv1_1_relu1_1 conv1_2_relu1_2 0=64 1=3 4=1 5=1 6=36864 8=2 9=1 Pooling pool1 1 1 conv1_2_relu1_2 pool1 1=2 2=2 Convolution conv2_1 1 1 pool1 conv2_1_relu2_1 0=128 1=3 4=1 5=1 6=73728 8=102 9=1 Convolution conv2_2 1 1 conv2_1_relu2_1 conv2_2_relu2_2 0=128 1=3 4=1 5=1 6=147456 8=2 9=1 Pooling pool2 1 1 conv2_2_relu2_2 pool2 1=2 2=2 Convolution conv3_1 1 1 pool2 conv3_1_relu3_1 0=256 1=3 4=1 5=1 6=294912 8=102 9=1 Convolution conv3_2 1 1 conv3_1_relu3_1 conv3_2_relu3_2 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 Convolution conv3_3 1 1 conv3_2_relu3_2 conv3_3_relu3_3 0=256 1=3 4=1 5=1 
6=589824 8=2 9=1 Pooling pool3 1 1 conv3_3_relu3_3 pool3 1=2 2=2 Convolution conv4_1 1 1 pool3 conv4_1_relu4_1 0=512 1=3 4=1 5=1 6=1179648 8=102 9=1 Convolution conv4_2 1 1 conv4_1_relu4_1 conv4_2_relu4_2 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 Convolution conv4_3 1 1 conv4_2_relu4_2 conv4_3_relu4_3 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1 Pooling pool4 1 1 conv4_3_relu4_3 pool4 1=2 2=2 Convolution conv5_1 1 1 pool4 conv5_1_relu5_1 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 Convolution conv5_2 1 1 conv5_1_relu5_1 conv5_2_relu5_2 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 Convolution conv5_3 1 1 conv5_2_relu5_2 conv5_3_relu5_3 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1 Pooling pool5 1 1 conv5_3_relu5_3 pool5 1=2 2=2 InnerProduct fc6 1 1 pool5 fc6_drop6 0=4096 1=1 2=102760448 8=2 9=1 InnerProduct fc7 1 1 fc6_drop6 fc7_drop7 0=4096 1=1 2=16777216 8=2 9=1 InnerProduct fc8 1 1 fc7_drop7 fc8 0=1000 1=1 2=4096000 8=2 Softmax prob 1 1 fc8 output ================================================ FILE: benchmark/vision_transformer.param ================================================ 7767517 144 192 Input input 0 1 input MemoryData backbone.cls_token 0 1 backbone.cls_token 0=768 1=1 MemoryData backbone.pos_embed 0 1 backbone.pos_embed 0=768 1=145 Convolution Conv_0 1 1 input onnx::Shape_153 0=768 1=32 11=32 2=1 12=1 3=32 13=32 4=0 14=0 15=0 16=0 5=1 6=2359296 Reshape Reshape_8 1 1 onnx::Shape_153 onnx::Transpose_161 0=-1 1=768 Permute Transpose_9 1 1 onnx::Transpose_161 onnx::Concat_162 0=1 Concat Concat_10 2 1 backbone.cls_token onnx::Concat_162 onnx::Add_163 0=0 BinaryOp Add_11 2 1 onnx::Add_163 backbone.pos_embed input.1 0=0 Split splitncnn_0 1 2 input.1 input.1_splitncnn_0 input.1_splitncnn_1 LayerNorm LayerNorm_12 1 1 input.1_splitncnn_1 qkv_input 0=768 1=1.000000e-06 2=1 Split splitncnn_1 1 3 qkv_input qkv_input_splitncnn_0 qkv_input_splitncnn_1 qkv_input_splitncnn_2 MultiHeadAttention MultiHeadAttention_21 3 1 qkv_input_splitncnn_2 qkv_input_splitncnn_1 qkv_input_splitncnn_0 onnx::Add_174 
0=768 1=12 2=589824 BinaryOp Add_22 2 1 input.1_splitncnn_0 onnx::Add_174 input.4 0=0 Split splitncnn_2 1 2 input.4 input.4_splitncnn_0 input.4_splitncnn_1 LayerNorm LayerNorm_23 1 1 input.4_splitncnn_1 mmdeploy::Gemm_176 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_24 1 1 mmdeploy::Gemm_176 mmdeploy::Gelu_177 0=3072 1=1 2=2359296 GELU Gelu_25 1 1 mmdeploy::Gelu_177 input.8 0=1 InnerProduct Gemm_26 1 1 input.8 input.12 0=768 1=1 2=2359296 BinaryOp Add_27 2 1 input.4_splitncnn_0 input.12 input.16 0=0 Split splitncnn_3 1 2 input.16 input.16_splitncnn_0 input.16_splitncnn_1 LayerNorm LayerNorm_28 1 1 input.16_splitncnn_1 qkv_input.3 0=768 1=1.000000e-06 2=1 Split splitncnn_4 1 3 qkv_input.3 qkv_input.3_splitncnn_0 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_2 MultiHeadAttention MultiHeadAttention_37 3 1 qkv_input.3_splitncnn_2 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_0 onnx::Add_190 0=768 1=12 2=589824 BinaryOp Add_38 2 1 input.16_splitncnn_0 onnx::Add_190 input.20 0=0 Split splitncnn_5 1 2 input.20 input.20_splitncnn_0 input.20_splitncnn_1 LayerNorm LayerNorm_39 1 1 input.20_splitncnn_1 mmdeploy::Gemm_192 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_40 1 1 mmdeploy::Gemm_192 mmdeploy::Gelu_193 0=3072 1=1 2=2359296 GELU Gelu_41 1 1 mmdeploy::Gelu_193 input.24 0=1 InnerProduct Gemm_42 1 1 input.24 input.28 0=768 1=1 2=2359296 BinaryOp Add_43 2 1 input.20_splitncnn_0 input.28 input.32 0=0 Split splitncnn_6 1 2 input.32 input.32_splitncnn_0 input.32_splitncnn_1 LayerNorm LayerNorm_44 1 1 input.32_splitncnn_1 qkv_input.7 0=768 1=1.000000e-06 2=1 Split splitncnn_7 1 3 qkv_input.7 qkv_input.7_splitncnn_0 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_2 MultiHeadAttention MultiHeadAttention_53 3 1 qkv_input.7_splitncnn_2 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_0 onnx::Add_206 0=768 1=12 2=589824 BinaryOp Add_54 2 1 input.32_splitncnn_0 onnx::Add_206 input.36 0=0 Split splitncnn_8 1 2 input.36 input.36_splitncnn_0 input.36_splitncnn_1 LayerNorm LayerNorm_55 1 1 
input.36_splitncnn_1 mmdeploy::Gemm_208 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_56 1 1 mmdeploy::Gemm_208 mmdeploy::Gelu_209 0=3072 1=1 2=2359296 GELU Gelu_57 1 1 mmdeploy::Gelu_209 input.40 0=1 InnerProduct Gemm_58 1 1 input.40 input.44 0=768 1=1 2=2359296 BinaryOp Add_59 2 1 input.36_splitncnn_0 input.44 input.48 0=0 Split splitncnn_9 1 2 input.48 input.48_splitncnn_0 input.48_splitncnn_1 LayerNorm LayerNorm_60 1 1 input.48_splitncnn_1 qkv_input.11 0=768 1=1.000000e-06 2=1 Split splitncnn_10 1 3 qkv_input.11 qkv_input.11_splitncnn_0 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_2 MultiHeadAttention MultiHeadAttention_69 3 1 qkv_input.11_splitncnn_2 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_0 onnx::Add_222 0=768 1=12 2=589824 BinaryOp Add_70 2 1 input.48_splitncnn_0 onnx::Add_222 input.52 0=0 Split splitncnn_11 1 2 input.52 input.52_splitncnn_0 input.52_splitncnn_1 LayerNorm LayerNorm_71 1 1 input.52_splitncnn_1 mmdeploy::Gemm_224 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_72 1 1 mmdeploy::Gemm_224 mmdeploy::Gelu_225 0=3072 1=1 2=2359296 GELU Gelu_73 1 1 mmdeploy::Gelu_225 input.56 0=1 InnerProduct Gemm_74 1 1 input.56 input.60 0=768 1=1 2=2359296 BinaryOp Add_75 2 1 input.52_splitncnn_0 input.60 input.64 0=0 Split splitncnn_12 1 2 input.64 input.64_splitncnn_0 input.64_splitncnn_1 LayerNorm LayerNorm_76 1 1 input.64_splitncnn_1 qkv_input.15 0=768 1=1.000000e-06 2=1 Split splitncnn_13 1 3 qkv_input.15 qkv_input.15_splitncnn_0 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_2 MultiHeadAttention MultiHeadAttention_85 3 1 qkv_input.15_splitncnn_2 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_0 onnx::Add_238 0=768 1=12 2=589824 BinaryOp Add_86 2 1 input.64_splitncnn_0 onnx::Add_238 input.68 0=0 Split splitncnn_14 1 2 input.68 input.68_splitncnn_0 input.68_splitncnn_1 LayerNorm LayerNorm_87 1 1 input.68_splitncnn_1 mmdeploy::Gemm_240 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_88 1 1 mmdeploy::Gemm_240 mmdeploy::Gelu_241 0=3072 1=1 2=2359296 GELU Gelu_89 1 
1 mmdeploy::Gelu_241 input.72 0=1 InnerProduct Gemm_90 1 1 input.72 input.76 0=768 1=1 2=2359296 BinaryOp Add_91 2 1 input.68_splitncnn_0 input.76 input.80 0=0 Split splitncnn_15 1 2 input.80 input.80_splitncnn_0 input.80_splitncnn_1 LayerNorm LayerNorm_92 1 1 input.80_splitncnn_1 qkv_input.19 0=768 1=1.000000e-06 2=1 Split splitncnn_16 1 3 qkv_input.19 qkv_input.19_splitncnn_0 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_2 MultiHeadAttention MultiHeadAttention_101 3 1 qkv_input.19_splitncnn_2 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_0 onnx::Add_254 0=768 1=12 2=589824 BinaryOp Add_102 2 1 input.80_splitncnn_0 onnx::Add_254 input.84 0=0 Split splitncnn_17 1 2 input.84 input.84_splitncnn_0 input.84_splitncnn_1 LayerNorm LayerNorm_103 1 1 input.84_splitncnn_1 mmdeploy::Gemm_256 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_104 1 1 mmdeploy::Gemm_256 mmdeploy::Gelu_257 0=3072 1=1 2=2359296 GELU Gelu_105 1 1 mmdeploy::Gelu_257 input.88 0=1 InnerProduct Gemm_106 1 1 input.88 input.92 0=768 1=1 2=2359296 BinaryOp Add_107 2 1 input.84_splitncnn_0 input.92 input.96 0=0 Split splitncnn_18 1 2 input.96 input.96_splitncnn_0 input.96_splitncnn_1 LayerNorm LayerNorm_108 1 1 input.96_splitncnn_1 qkv_input.23 0=768 1=1.000000e-06 2=1 Split splitncnn_19 1 3 qkv_input.23 qkv_input.23_splitncnn_0 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_2 MultiHeadAttention MultiHeadAttention_117 3 1 qkv_input.23_splitncnn_2 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_0 onnx::Add_270 0=768 1=12 2=589824 BinaryOp Add_118 2 1 input.96_splitncnn_0 onnx::Add_270 input.100 0=0 Split splitncnn_20 1 2 input.100 input.100_splitncnn_0 input.100_splitncnn_1 LayerNorm LayerNorm_119 1 1 input.100_splitncnn_1 mmdeploy::Gemm_272 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_120 1 1 mmdeploy::Gemm_272 mmdeploy::Gelu_273 0=3072 1=1 2=2359296 GELU Gelu_121 1 1 mmdeploy::Gelu_273 input.104 0=1 InnerProduct Gemm_122 1 1 input.104 input.108 0=768 1=1 2=2359296 BinaryOp Add_123 2 1 input.100_splitncnn_0 
input.108 input.112 0=0 Split splitncnn_21 1 2 input.112 input.112_splitncnn_0 input.112_splitncnn_1 LayerNorm LayerNorm_124 1 1 input.112_splitncnn_1 qkv_input.27 0=768 1=1.000000e-06 2=1 Split splitncnn_22 1 3 qkv_input.27 qkv_input.27_splitncnn_0 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_2 MultiHeadAttention MultiHeadAttention_133 3 1 qkv_input.27_splitncnn_2 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_0 onnx::Add_286 0=768 1=12 2=589824 BinaryOp Add_134 2 1 input.112_splitncnn_0 onnx::Add_286 input.116 0=0 Split splitncnn_23 1 2 input.116 input.116_splitncnn_0 input.116_splitncnn_1 LayerNorm LayerNorm_135 1 1 input.116_splitncnn_1 mmdeploy::Gemm_288 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_136 1 1 mmdeploy::Gemm_288 mmdeploy::Gelu_289 0=3072 1=1 2=2359296 GELU Gelu_137 1 1 mmdeploy::Gelu_289 input.120 0=1 InnerProduct Gemm_138 1 1 input.120 input.124 0=768 1=1 2=2359296 BinaryOp Add_139 2 1 input.116_splitncnn_0 input.124 input.128 0=0 Split splitncnn_24 1 2 input.128 input.128_splitncnn_0 input.128_splitncnn_1 LayerNorm LayerNorm_140 1 1 input.128_splitncnn_1 qkv_input.31 0=768 1=1.000000e-06 2=1 Split splitncnn_25 1 3 qkv_input.31 qkv_input.31_splitncnn_0 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_2 MultiHeadAttention MultiHeadAttention_149 3 1 qkv_input.31_splitncnn_2 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_0 onnx::Add_302 0=768 1=12 2=589824 BinaryOp Add_150 2 1 input.128_splitncnn_0 onnx::Add_302 input.132 0=0 Split splitncnn_26 1 2 input.132 input.132_splitncnn_0 input.132_splitncnn_1 LayerNorm LayerNorm_151 1 1 input.132_splitncnn_1 mmdeploy::Gemm_304 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_152 1 1 mmdeploy::Gemm_304 mmdeploy::Gelu_305 0=3072 1=1 2=2359296 GELU Gelu_153 1 1 mmdeploy::Gelu_305 input.136 0=1 InnerProduct Gemm_154 1 1 input.136 input.140 0=768 1=1 2=2359296 BinaryOp Add_155 2 1 input.132_splitncnn_0 input.140 input.144 0=0 Split splitncnn_27 1 2 input.144 input.144_splitncnn_0 input.144_splitncnn_1 LayerNorm 
LayerNorm_156 1 1 input.144_splitncnn_1 qkv_input.35 0=768 1=1.000000e-06 2=1 Split splitncnn_28 1 3 qkv_input.35 qkv_input.35_splitncnn_0 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_2 MultiHeadAttention MultiHeadAttention_165 3 1 qkv_input.35_splitncnn_2 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_0 onnx::Add_318 0=768 1=12 2=589824 BinaryOp Add_166 2 1 input.144_splitncnn_0 onnx::Add_318 input.148 0=0 Split splitncnn_29 1 2 input.148 input.148_splitncnn_0 input.148_splitncnn_1 LayerNorm LayerNorm_167 1 1 input.148_splitncnn_1 mmdeploy::Gemm_320 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_168 1 1 mmdeploy::Gemm_320 mmdeploy::Gelu_321 0=3072 1=1 2=2359296 GELU Gelu_169 1 1 mmdeploy::Gelu_321 input.152 0=1 InnerProduct Gemm_170 1 1 input.152 input.156 0=768 1=1 2=2359296 BinaryOp Add_171 2 1 input.148_splitncnn_0 input.156 input.160 0=0 Split splitncnn_30 1 2 input.160 input.160_splitncnn_0 input.160_splitncnn_1 LayerNorm LayerNorm_172 1 1 input.160_splitncnn_1 qkv_input.39 0=768 1=1.000000e-06 2=1 Split splitncnn_31 1 3 qkv_input.39 qkv_input.39_splitncnn_0 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_2 MultiHeadAttention MultiHeadAttention_181 3 1 qkv_input.39_splitncnn_2 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_0 onnx::Add_334 0=768 1=12 2=589824 BinaryOp Add_182 2 1 input.160_splitncnn_0 onnx::Add_334 input.164 0=0 Split splitncnn_32 1 2 input.164 input.164_splitncnn_0 input.164_splitncnn_1 LayerNorm LayerNorm_183 1 1 input.164_splitncnn_1 mmdeploy::Gemm_336 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_184 1 1 mmdeploy::Gemm_336 mmdeploy::Gelu_337 0=3072 1=1 2=2359296 GELU Gelu_185 1 1 mmdeploy::Gelu_337 input.168 0=1 InnerProduct Gemm_186 1 1 input.168 input.172 0=768 1=1 2=2359296 BinaryOp Add_187 2 1 input.164_splitncnn_0 input.172 input.176 0=0 Split splitncnn_33 1 2 input.176 input.176_splitncnn_0 input.176_splitncnn_1 LayerNorm LayerNorm_188 1 1 input.176_splitncnn_1 qkv_input.43 0=768 1=1.000000e-06 2=1 Split splitncnn_34 1 3 qkv_input.43 
qkv_input.43_splitncnn_0 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_2 MultiHeadAttention MultiHeadAttention_197 3 1 qkv_input.43_splitncnn_2 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_0 onnx::Add_350 0=768 1=12 2=589824 BinaryOp Add_198 2 1 input.176_splitncnn_0 onnx::Add_350 input.180 0=0 Split splitncnn_35 1 2 input.180 input.180_splitncnn_0 input.180_splitncnn_1 LayerNorm LayerNorm_199 1 1 input.180_splitncnn_1 mmdeploy::Gemm_352 0=768 1=1.000000e-06 2=1 InnerProduct Gemm_200 1 1 mmdeploy::Gemm_352 mmdeploy::Gelu_353 0=3072 1=1 2=2359296 GELU Gelu_201 1 1 mmdeploy::Gelu_353 input.184 0=1 InnerProduct Gemm_202 1 1 input.184 input.188 0=768 1=1 2=2359296 BinaryOp Add_203 2 1 input.180_splitncnn_0 input.188 input.192 0=0 LayerNorm LayerNorm_204 1 1 input.192 onnx::Gather_357 0=768 1=1.000000e-06 2=1 Crop Gather_206 1 1 onnx::Gather_357 mmdeploy::Gemm_359 -23309=1,0 -23310=1,1 -23311=1,0 InnerProduct Gemm_207 1 1 mmdeploy::Gemm_359 cls_score 0=1000 1=1 2=768000 Softmax Softmax_208 1 1 cls_score output 0=0 1=1 ================================================ FILE: benchmark/yolo-fastest-1.1.param ================================================ 7767517 131 154 Input data 0 1 data -23330=4,3,320,320,3 0=320 1=320 2=3 Convolution 0_22 1 1 data 0_22_bn_leaky -23330=4,3,160,160,8 0=8 1=3 3=2 4=1 5=1 6=216 9=2 -23310=1,1.000000e-01 Convolution 1_31 1 1 0_22_bn_leaky 1_31_bn_leaky -23330=4,3,160,160,8 0=8 1=1 5=1 6=64 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 2_39 1 1 1_31_bn_leaky 2_39_bn_leaky -23330=4,3,160,160,8 0=8 1=3 4=1 5=1 6=72 7=8 9=2 -23310=1,1.000000e-01 Convolution 3_48 1 1 2_39_bn_leaky 3_48_bn -23330=4,3,160,160,4 0=4 1=1 5=1 6=32 Split 3_48_bn_split 1 2 3_48_bn 3_48_bn_split_0 3_48_bn_split_1 -23330=8,3,160,160,4,3,160,160,4 Convolution 4_57 1 1 3_48_bn_split_0 4_57_bn_leaky -23330=4,3,160,160,8 0=8 1=1 5=1 6=32 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 5_65 1 1 4_57_bn_leaky 5_65_bn_leaky -23330=4,3,160,160,8 0=8 1=3 4=1 5=1 6=72 7=8 
9=2 -23310=1,1.000000e-01 Convolution 6_74 1 1 5_65_bn_leaky 6_74_bn -23330=4,3,160,160,4 0=4 1=1 5=1 6=32 Eltwise 8_86 2 1 6_74_bn 3_48_bn_split_1 8_86 -23330=4,3,160,160,4 0=1 Convolution 9_90 1 1 8_86 9_90_bn_leaky -23330=4,3,160,160,24 0=24 1=1 5=1 6=96 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 10_98 1 1 9_90_bn_leaky 10_98_bn_leaky -23330=4,3,80,80,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24 9=2 -23310=1,1.000000e-01 Convolution 11_107 1 1 10_98_bn_leaky 11_107_bn -23330=4,3,80,80,8 0=8 1=1 5=1 6=192 Split 11_107_bn_split 1 2 11_107_bn 11_107_bn_split_0 11_107_bn_split_1 -23330=8,3,80,80,8,3,80,80,8 Convolution 12_116 1 1 11_107_bn_split_0 12_116_bn_leaky -23330=4,3,80,80,32 0=32 1=1 5=1 6=256 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 13_124 1 1 12_116_bn_leaky 13_124_bn_leaky -23330=4,3,80,80,32 0=32 1=3 4=1 5=1 6=288 7=32 9=2 -23310=1,1.000000e-01 Convolution 14_133 1 1 13_124_bn_leaky 14_133_bn -23330=4,3,80,80,8 0=8 1=1 5=1 6=256 Eltwise 16_145 2 1 14_133_bn 11_107_bn_split_1 16_145 -23330=4,3,80,80,8 0=1 Split 16_145_split 1 2 16_145 16_145_split_0 16_145_split_1 -23330=8,3,80,80,8,3,80,80,8 Convolution 17_149 1 1 16_145_split_0 17_149_bn_leaky -23330=4,3,80,80,32 0=32 1=1 5=1 6=256 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 18_157 1 1 17_149_bn_leaky 18_157_bn_leaky -23330=4,3,80,80,32 0=32 1=3 4=1 5=1 6=288 7=32 9=2 -23310=1,1.000000e-01 Convolution 19_166 1 1 18_157_bn_leaky 19_166_bn -23330=4,3,80,80,8 0=8 1=1 5=1 6=256 Eltwise 21_179 2 1 19_166_bn 16_145_split_1 21_179 -23330=4,3,80,80,8 0=1 Convolution 22_183 1 1 21_179 22_183_bn_leaky -23330=4,3,80,80,32 0=32 1=1 5=1 6=256 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 23_191 1 1 22_183_bn_leaky 23_191_bn_leaky -23330=4,3,40,40,32 0=32 1=3 3=2 4=1 5=1 6=288 7=32 9=2 -23310=1,1.000000e-01 Convolution 24_200 1 1 23_191_bn_leaky 24_200_bn -23330=4,3,40,40,8 0=8 1=1 5=1 6=256 Split 24_200_bn_split 1 2 24_200_bn 24_200_bn_split_0 24_200_bn_split_1 -23330=8,3,40,40,8,3,40,40,8 Convolution 25_209 1 1 
24_200_bn_split_0 25_209_bn_leaky -23330=4,3,40,40,48 0=48 1=1 5=1 6=384 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 26_217 1 1 25_209_bn_leaky 26_217_bn_leaky -23330=4,3,40,40,48 0=48 1=3 4=1 5=1 6=432 7=48 9=2 -23310=1,1.000000e-01 Convolution 27_226 1 1 26_217_bn_leaky 27_226_bn -23330=4,3,40,40,8 0=8 1=1 5=1 6=384 Eltwise 29_238 2 1 27_226_bn 24_200_bn_split_1 29_238 -23330=4,3,40,40,8 0=1 Split 29_238_split 1 2 29_238 29_238_split_0 29_238_split_1 -23330=8,3,40,40,8,3,40,40,8 Convolution 30_242 1 1 29_238_split_0 30_242_bn_leaky -23330=4,3,40,40,48 0=48 1=1 5=1 6=384 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 31_250 1 1 30_242_bn_leaky 31_250_bn_leaky -23330=4,3,40,40,48 0=48 1=3 4=1 5=1 6=432 7=48 9=2 -23310=1,1.000000e-01 Convolution 32_259 1 1 31_250_bn_leaky 32_259_bn -23330=4,3,40,40,8 0=8 1=1 5=1 6=384 Eltwise 34_273 2 1 32_259_bn 29_238_split_1 34_273 -23330=4,3,40,40,8 0=1 Convolution 35_277 1 1 34_273 35_277_bn_leaky -23330=4,3,40,40,48 0=48 1=1 5=1 6=384 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 36_285 1 1 35_277_bn_leaky 36_285_bn_leaky -23330=4,3,40,40,48 0=48 1=3 4=1 5=1 6=432 7=48 9=2 -23310=1,1.000000e-01 Convolution 37_294 1 1 36_285_bn_leaky 37_294_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=768 Split 37_294_bn_split 1 2 37_294_bn 37_294_bn_split_0 37_294_bn_split_1 -23330=8,3,40,40,16,3,40,40,16 Convolution 38_303 1 1 37_294_bn_split_0 38_303_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 39_311 1 1 38_303_bn_leaky 39_311_bn_leaky -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution 40_320 1 1 39_311_bn_leaky 40_320_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=1536 Eltwise 42_332 2 1 40_320_bn 37_294_bn_split_1 42_332 -23330=4,3,40,40,16 0=1 Split 42_332_split 1 2 42_332 42_332_split_0 42_332_split_1 -23330=8,3,40,40,16,3,40,40,16 Convolution 43_336 1 1 42_332_split_0 43_336_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01 
ConvolutionDepthWise 44_344 1 1 43_336_bn_leaky 44_344_bn_leaky -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution 45_353 1 1 44_344_bn_leaky 45_353_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=1536 Eltwise 47_365 2 1 45_353_bn 42_332_split_1 47_365 -23330=4,3,40,40,16 0=1 Split 47_365_split 1 2 47_365 47_365_split_0 47_365_split_1 -23330=8,3,40,40,16,3,40,40,16 Convolution 48_369 1 1 47_365_split_0 48_369_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 49_377 1 1 48_369_bn_leaky 49_377_bn_leaky -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution 50_386 1 1 49_377_bn_leaky 50_386_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=1536 Eltwise 52_399 2 1 50_386_bn 47_365_split_1 52_399 -23330=4,3,40,40,16 0=1 Split 52_399_split 1 2 52_399 52_399_split_0 52_399_split_1 -23330=8,3,40,40,16,3,40,40,16 Convolution 53_403 1 1 52_399_split_0 53_403_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 54_411 1 1 53_403_bn_leaky 54_411_bn_leaky -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution 55_420 1 1 54_411_bn_leaky 55_420_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=1536 Eltwise 57_433 2 1 55_420_bn 52_399_split_1 57_433 -23330=4,3,40,40,16 0=1 Convolution 58_437 1 1 57_433 58_437_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 59_445 1 1 58_437_bn_leaky 59_445_bn_leaky -23330=4,3,20,20,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01 Convolution 60_454 1 1 59_445_bn_leaky 60_454_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=2304 Split 60_454_bn_split 1 2 60_454_bn 60_454_bn_split_0 60_454_bn_split_1 -23330=8,3,20,20,24,3,20,20,24 Convolution 61_463 1 1 60_454_bn_split_0 61_463_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 62_471 1 1 61_463_bn_leaky 62_471_bn_leaky -23330=4,3,20,20,136 0=136 1=3 4=1 
5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01 Convolution 63_480 1 1 62_471_bn_leaky 63_480_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=3264 Eltwise 65_492 2 1 63_480_bn 60_454_bn_split_1 65_492 -23330=4,3,20,20,24 0=1 Split 65_492_split 1 2 65_492 65_492_split_0 65_492_split_1 -23330=8,3,20,20,24,3,20,20,24 Convolution 66_496 1 1 65_492_split_0 66_496_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 67_504 1 1 66_496_bn_leaky 67_504_bn_leaky -23330=4,3,20,20,136 0=136 1=3 4=1 5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01 Convolution 68_513 1 1 67_504_bn_leaky 68_513_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=3264 Eltwise 70_526 2 1 68_513_bn 65_492_split_1 70_526 -23330=4,3,20,20,24 0=1 Split 70_526_split 1 2 70_526 70_526_split_0 70_526_split_1 -23330=8,3,20,20,24,3,20,20,24 Convolution 71_530 1 1 70_526_split_0 71_530_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 72_538 1 1 71_530_bn_leaky 72_538_bn_leaky -23330=4,3,20,20,136 0=136 1=3 4=1 5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01 Convolution 73_547 1 1 72_538_bn_leaky 73_547_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=3264 Eltwise 75_559 2 1 73_547_bn 70_526_split_1 75_559 -23330=4,3,20,20,24 0=1 Split 75_559_split 1 2 75_559 75_559_split_0 75_559_split_1 -23330=8,3,20,20,24,3,20,20,24 Convolution 76_563 1 1 75_559_split_0 76_563_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 77_571 1 1 76_563_bn_leaky 77_571_bn_leaky -23330=4,3,20,20,136 0=136 1=3 4=1 5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01 Convolution 78_580 1 1 77_571_bn_leaky 78_580_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=3264 Eltwise 80_593 2 1 78_580_bn 75_559_split_1 80_593 -23330=4,3,20,20,24 0=1 Split 80_593_split 1 2 80_593 80_593_split_0 80_593_split_1 -23330=8,3,20,20,24,3,20,20,24 Convolution 81_597 1 1 80_593_split_0 81_597_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01 
ConvolutionDepthWise 82_605 1 1 81_597_bn_leaky 82_605_bn_leaky -23330=4,3,10,10,136 0=136 1=3 3=2 4=1 5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01 Convolution 83_615 1 1 82_605_bn_leaky 83_615_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=6528 Split 83_615_bn_split 1 2 83_615_bn 83_615_bn_split_0 83_615_bn_split_1 -23330=8,3,10,10,48,3,10,10,48 Convolution 84_624 1 1 83_615_bn_split_0 84_624_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 6=10752 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 85_632 1 1 84_624_bn_leaky 85_632_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01 Convolution 86_641 1 1 85_632_bn_leaky 86_641_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752 Eltwise 88_653 2 1 86_641_bn 83_615_bn_split_1 88_653 -23330=4,3,10,10,48 0=1 Split 88_653_split 1 2 88_653 88_653_split_0 88_653_split_1 -23330=8,3,10,10,48,3,10,10,48 Convolution 89_657 1 1 88_653_split_0 89_657_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 6=10752 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 90_665 1 1 89_657_bn_leaky 90_665_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01 Convolution 91_674 1 1 90_665_bn_leaky 91_674_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752 Eltwise 93_686 2 1 91_674_bn 88_653_split_1 93_686 -23330=4,3,10,10,48 0=1 Split 93_686_split 1 2 93_686 93_686_split_0 93_686_split_1 -23330=8,3,10,10,48,3,10,10,48 Convolution 94_690 1 1 93_686_split_0 94_690_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 6=10752 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 95_698 1 1 94_690_bn_leaky 95_698_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01 Convolution 96_707 1 1 95_698_bn_leaky 96_707_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752 Eltwise 98_719 2 1 96_707_bn 93_686_split_1 98_719 -23330=4,3,10,10,48 0=1 Split 98_719_split 1 2 98_719 98_719_split_0 98_719_split_1 -23330=8,3,10,10,48,3,10,10,48 Convolution 99_723 1 1 98_719_split_0 99_723_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 
6=10752 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 100_731 1 1 99_723_bn_leaky 100_731_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01 Convolution 101_740 1 1 100_731_bn_leaky 101_740_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752 Eltwise 103_752 2 1 101_740_bn 98_719_split_1 103_752 -23330=4,3,10,10,48 0=1 Split 103_752_split 1 2 103_752 103_752_split_0 103_752_split_1 -23330=8,3,10,10,48,3,10,10,48 Convolution 104_756 1 1 103_752_split_0 104_756_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 6=10752 9=2 -23310=1,1.000000e-01 ConvolutionDepthWise 105_764 1 1 104_756_bn_leaky 105_764_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01 Convolution 106_773 1 1 105_764_bn_leaky 106_773_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752 Eltwise 108_784 2 1 106_773_bn 103_752_split_1 108_784 -23330=4,3,10,10,48 0=1 Split 108_784_split 1 4 108_784 108_784_split_0 108_784_split_1 108_784_split_2 108_784_split_3 -23330=16,3,10,10,48,3,10,10,48,3,10,10,48,3,10,10,48 Pooling 109_788 1 1 108_784_split_0 109_788 -23330=4,3,10,10,48 1=3 3=1 5=1 Pooling 111_795 1 1 108_784_split_1 111_795 -23330=4,3,10,10,48 1=5 3=2 5=1 Pooling 113_802 1 1 108_784_split_2 113_802 -23330=4,3,10,10,48 1=9 3=4 5=1 Concat 114_806 4 1 113_802 111_795 109_788 108_784_split_3 114_806 -23330=4,3,10,10,192 Convolution 115_811 1 1 114_806 115_811_bn_leaky -23330=4,3,10,10,96 0=96 1=1 5=1 6=18432 9=2 -23310=1,1.000000e-01 Split 115_811_bn_leaky_split 1 2 115_811_bn_leaky 115_811_bn_leaky_split_0 115_811_bn_leaky_split_1 -23330=8,3,10,10,96,3,10,10,96 ConvolutionDepthWise 116_819 1 1 115_811_bn_leaky_split_0 116_819_bn_leaky -23330=4,3,10,10,96 0=96 1=5 4=2 5=1 6=2400 7=96 9=2 -23310=1,1.000000e-01 Convolution 117_828 1 1 116_819_bn_leaky 117_828_bn -23330=4,3,10,10,96 0=96 1=1 5=1 6=9216 ConvolutionDepthWise 118_836 1 1 117_828_bn 118_836_bn_leaky -23330=4,3,10,10,96 0=96 1=5 4=2 5=1 6=2400 7=96 9=2 -23310=1,1.000000e-01 Convolution 119_845 1 
1 118_836_bn_leaky 119_845_bn -23330=4,3,10,10,96 0=96 1=1 5=1 6=9216 Convolution 120_854 1 1 119_845_bn 120_854 -23330=4,3,10,10,255 0=255 1=1 5=1 6=24480 Interp 123_882 1 1 115_811_bn_leaky_split_1 123_882 -23330=4,3,20,20,96 0=1 1=2.000000e+00 2=2.000000e+00 Concat 124_885 2 1 123_882 80_593_split_1 124_885 -23330=4,3,20,20,120 ConvolutionDepthWise 125_888 1 1 124_885 125_888_bn_leaky -23330=4,3,20,20,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=2 -23310=1,1.000000e-01 Convolution 126_897 1 1 125_888_bn_leaky 126_897_bn -23330=4,3,20,20,120 0=120 1=1 5=1 6=14400 ConvolutionDepthWise 127_905 1 1 126_897_bn 127_905_bn_leaky -23330=4,3,20,20,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=2 -23310=1,1.000000e-01 Convolution 128_914 1 1 127_905_bn_leaky 128_914_bn -23330=4,3,20,20,120 0=120 1=1 5=1 6=14400 Convolution 129_922 1 1 128_914_bn 129_922 -23330=4,3,20,20,255 0=255 1=1 5=1 6=30600 Yolov3DetectionOutput detection_out 2 1 120_854 129_922 output -23330=4,2,6,1431,1 0=80 1=3 2=5.500000e-01 -23304=12,1.200000e+01,1.800000e+01,3.700000e+01,4.900000e+01,5.200000e+01,1.320000e+02,1.150000e+02,7.300000e+01,1.190000e+02,1.990000e+02,2.420000e+02,2.380000e+02 -23305=6,1077936128,1082130432,1084227584,0,1065353216,1073741824 -23306=2,3.200000e+01,1.600000e+01 ================================================ FILE: benchmark/yolo-fastestv2.param ================================================ 7767517 144 166 Input input.1 0 1 input.1 -23330=4,3,352,352,3 0=352 1=352 2=3 Convolution Conv_0 1 1 input.1 447 -23330=4,3,176,176,24 0=24 1=3 3=2 4=1 5=1 6=648 9=1 Pooling MaxPool_2 1 1 447 448 -23330=4,3,88,88,24 1=3 2=2 3=1 5=1 Split splitncnn_0 1 2 448 448_splitncnn_0 448_splitncnn_1 -23330=8,3,88,88,24,3,88,88,24 ConvolutionDepthWise Conv_3 1 1 448_splitncnn_1 800 -23330=4,3,44,44,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24 Convolution Conv_4 1 1 800 453 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1 Convolution Conv_6 1 1 448_splitncnn_0 456 -23330=4,3,88,88,24 0=24 1=1 5=1 6=576 9=1 ConvolutionDepthWise 
Conv_8 1 1 456 809 -23330=4,3,44,44,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24 Convolution Conv_9 1 1 809 461 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1 Concat Concat_11 2 1 453 461 462 -23330=4,3,44,44,48 ShuffleChannel Reshape_16 1 1 462 467 -23330=4,3,44,44,48 0=2 1=1 Slice Gather_20 1 2 467 469 471 -23330=8,3,44,44,24,3,44,44,24 -23300=2,-233,-233 Convolution Conv_21 1 1 471 474 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1 ConvolutionDepthWise Conv_23 1 1 474 818 -23330=4,3,44,44,24 0=24 1=3 4=1 5=1 6=216 7=24 Convolution Conv_24 1 1 818 479 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1 Concat Concat_26 2 1 469 479 480 -23330=4,3,44,44,48 ShuffleChannel Reshape_31 1 1 480 485 -23330=4,3,44,44,48 0=2 1=1 Slice Gather_35 1 2 485 487 489 -23330=8,3,44,44,24,3,44,44,24 -23300=2,-233,-233 Convolution Conv_36 1 1 489 492 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1 ConvolutionDepthWise Conv_38 1 1 492 827 -23330=4,3,44,44,24 0=24 1=3 4=1 5=1 6=216 7=24 Convolution Conv_39 1 1 827 497 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1 Concat Concat_41 2 1 487 497 498 -23330=4,3,44,44,48 ShuffleChannel Reshape_46 1 1 498 503 -23330=4,3,44,44,48 0=2 1=1 Slice Gather_50 1 2 503 505 507 -23330=8,3,44,44,24,3,44,44,24 -23300=2,-233,-233 Convolution Conv_51 1 1 507 510 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1 ConvolutionDepthWise Conv_53 1 1 510 836 -23330=4,3,44,44,24 0=24 1=3 4=1 5=1 6=216 7=24 Convolution Conv_54 1 1 836 515 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1 Concat Concat_56 2 1 505 515 516 -23330=4,3,44,44,48 Split splitncnn_1 1 2 516 516_splitncnn_0 516_splitncnn_1 -23330=8,3,44,44,48,3,44,44,48 ConvolutionDepthWise Conv_57 1 1 516_splitncnn_1 842 -23330=4,3,22,22,48 0=48 1=3 3=2 4=1 5=1 6=432 7=48 Convolution Conv_58 1 1 842 521 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 Convolution Conv_60 1 1 516_splitncnn_0 524 -23330=4,3,44,44,48 0=48 1=1 5=1 6=2304 9=1 ConvolutionDepthWise Conv_62 1 1 524 851 -23330=4,3,22,22,48 0=48 1=3 3=2 4=1 5=1 6=432 7=48 Convolution Conv_63 1 1 851 
529 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 Concat Concat_65 2 1 521 529 530 -23330=4,3,22,22,96 ShuffleChannel Reshape_70 1 1 530 535 -23330=4,3,22,22,96 0=2 1=1 Slice Gather_74 1 2 535 537 539 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233 Convolution Conv_75 1 1 539 542 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 ConvolutionDepthWise Conv_77 1 1 542 860 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48 Convolution Conv_78 1 1 860 547 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 Concat Concat_80 2 1 537 547 548 -23330=4,3,22,22,96 ShuffleChannel Reshape_85 1 1 548 553 -23330=4,3,22,22,96 0=2 1=1 Slice Gather_89 1 2 553 555 557 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233 Convolution Conv_90 1 1 557 560 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 ConvolutionDepthWise Conv_92 1 1 560 869 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48 Convolution Conv_93 1 1 869 565 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 Concat Concat_95 2 1 555 565 566 -23330=4,3,22,22,96 ShuffleChannel Reshape_100 1 1 566 571 -23330=4,3,22,22,96 0=2 1=1 Slice Gather_104 1 2 571 573 575 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233 Convolution Conv_105 1 1 575 578 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 ConvolutionDepthWise Conv_107 1 1 578 878 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48 Convolution Conv_108 1 1 878 583 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 Concat Concat_110 2 1 573 583 584 -23330=4,3,22,22,96 ShuffleChannel Reshape_115 1 1 584 589 -23330=4,3,22,22,96 0=2 1=1 Slice Gather_119 1 2 589 591 593 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233 Convolution Conv_120 1 1 593 596 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 ConvolutionDepthWise Conv_122 1 1 596 887 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48 Convolution Conv_123 1 1 887 601 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 Concat Concat_125 2 1 591 601 602 -23330=4,3,22,22,96 ShuffleChannel Reshape_130 1 1 602 607 -23330=4,3,22,22,96 0=2 1=1 Slice Gather_134 1 2 607 609 611 
-23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233 Convolution Conv_135 1 1 611 614 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 ConvolutionDepthWise Conv_137 1 1 614 896 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48 Convolution Conv_138 1 1 896 619 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 Concat Concat_140 2 1 609 619 620 -23330=4,3,22,22,96 ShuffleChannel Reshape_145 1 1 620 625 -23330=4,3,22,22,96 0=2 1=1 Slice Gather_149 1 2 625 627 629 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233 Convolution Conv_150 1 1 629 632 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 ConvolutionDepthWise Conv_152 1 1 632 905 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48 Convolution Conv_153 1 1 905 637 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 Concat Concat_155 2 1 627 637 638 -23330=4,3,22,22,96 ShuffleChannel Reshape_160 1 1 638 643 -23330=4,3,22,22,96 0=2 1=1 Slice Gather_164 1 2 643 645 647 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233 Convolution Conv_165 1 1 647 650 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 ConvolutionDepthWise Conv_167 1 1 650 914 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48 Convolution Conv_168 1 1 914 655 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1 Concat Concat_170 2 1 645 655 656 -23330=4,3,22,22,96 Split splitncnn_2 1 3 656 656_splitncnn_0 656_splitncnn_1 656_splitncnn_2 -23330=12,3,22,22,96,3,22,22,96,3,22,22,96 ConvolutionDepthWise Conv_171 1 1 656_splitncnn_2 920 -23330=4,3,11,11,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96 Convolution Conv_172 1 1 920 661 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1 Convolution Conv_174 1 1 656_splitncnn_1 664 -23330=4,3,22,22,96 0=96 1=1 5=1 6=9216 9=1 ConvolutionDepthWise Conv_176 1 1 664 929 -23330=4,3,11,11,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96 Convolution Conv_177 1 1 929 669 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1 Concat Concat_179 2 1 661 669 670 -23330=4,3,11,11,192 ShuffleChannel Reshape_184 1 1 670 675 -23330=4,3,11,11,192 0=2 1=1 Slice Gather_188 1 2 675 677 679 -23330=8,3,11,11,96,3,11,11,96 
-23300=2,-233,-233 Convolution Conv_189 1 1 679 682 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1 ConvolutionDepthWise Conv_191 1 1 682 938 -23330=4,3,11,11,96 0=96 1=3 4=1 5=1 6=864 7=96 Convolution Conv_192 1 1 938 687 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1 Concat Concat_194 2 1 677 687 688 -23330=4,3,11,11,192 ShuffleChannel Reshape_199 1 1 688 693 -23330=4,3,11,11,192 0=2 1=1 Slice Gather_203 1 2 693 695 697 -23330=8,3,11,11,96,3,11,11,96 -23300=2,-233,-233 Convolution Conv_204 1 1 697 700 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1 ConvolutionDepthWise Conv_206 1 1 700 947 -23330=4,3,11,11,96 0=96 1=3 4=1 5=1 6=864 7=96 Convolution Conv_207 1 1 947 705 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1 Concat Concat_209 2 1 695 705 706 -23330=4,3,11,11,192 ShuffleChannel Reshape_214 1 1 706 711 -23330=4,3,11,11,192 0=2 1=1 Slice Gather_218 1 2 711 713 715 -23330=8,3,11,11,96,3,11,11,96 -23300=2,-233,-233 Convolution Conv_219 1 1 715 718 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1 ConvolutionDepthWise Conv_221 1 1 718 956 -23330=4,3,11,11,96 0=96 1=3 4=1 5=1 6=864 7=96 Convolution Conv_222 1 1 956 723 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1 Concat Concat_224 2 1 713 723 724 -23330=4,3,11,11,192 Split splitncnn_3 1 2 724 724_splitncnn_0 724_splitncnn_1 -23330=8,3,11,11,192,3,11,11,192 Convolution Conv_225 1 1 724_splitncnn_1 727 -23330=4,3,11,11,72 0=72 1=1 5=1 6=13824 9=1 Split splitncnn_4 1 2 727 727_splitncnn_0 727_splitncnn_1 -23330=8,3,11,11,72,3,11,11,72 ConvolutionDepthWise Conv_227 1 1 727_splitncnn_1 730 -23330=4,3,11,11,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1 Convolution Conv_229 1 1 730 968 -23330=4,3,11,11,72 0=72 1=1 5=1 6=5184 ConvolutionDepthWise Conv_230 1 1 968 735 -23330=4,3,11,11,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1 Convolution Conv_232 1 1 735 974 -23330=4,3,11,11,72 0=72 1=1 5=1 6=5184 Split splitncnn_5 1 2 974 974_splitncnn_0 974_splitncnn_1 -23330=8,3,11,11,72,3,11,11,72 ConvolutionDepthWise Conv_233 1 1 727_splitncnn_0 740 -23330=4,3,11,11,72 
0=72 1=5 4=2 5=1 6=1800 7=72 9=1 Convolution Conv_235 1 1 740 980 -23330=4,3,11,11,72 0=72 1=1 5=1 6=5184 ConvolutionDepthWise Conv_236 1 1 980 745 -23330=4,3,11,11,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1 Convolution Conv_238 1 1 745 986 -23330=4,3,11,11,72 0=72 1=1 5=1 6=5184 Interp Resize_240 1 1 724_splitncnn_0 752 -23330=4,3,22,22,192 0=1 1=2.000000e+00 2=2.000000e+00 Concat Concat_241 2 1 752 656_splitncnn_0 753 -23330=4,3,22,22,288 Convolution Conv_242 1 1 753 756 -23330=4,3,22,22,72 0=72 1=1 5=1 6=20736 9=1 Split splitncnn_6 1 2 756 756_splitncnn_0 756_splitncnn_1 -23330=8,3,22,22,72,3,22,22,72 ConvolutionDepthWise Conv_244 1 1 756_splitncnn_1 759 -23330=4,3,22,22,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1 Convolution Conv_246 1 1 759 995 -23330=4,3,22,22,72 0=72 1=1 5=1 6=5184 ConvolutionDepthWise Conv_247 1 1 995 764 -23330=4,3,22,22,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1 Convolution Conv_249 1 1 764 1001 -23330=4,3,22,22,72 0=72 1=1 5=1 6=5184 Split splitncnn_7 1 2 1001 1001_splitncnn_0 1001_splitncnn_1 -23330=8,3,22,22,72,3,22,22,72 ConvolutionDepthWise Conv_250 1 1 756_splitncnn_0 769 -23330=4,3,22,22,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1 Convolution Conv_252 1 1 769 1007 -23330=4,3,22,22,72 0=72 1=1 5=1 6=5184 ConvolutionDepthWise Conv_253 1 1 1007 774 -23330=4,3,22,22,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1 Convolution Conv_255 1 1 774 1013 -23330=4,3,22,22,72 0=72 1=1 5=1 6=5184 Convolution Conv_256 1 1 1013 783 -23330=4,3,22,22,12 0=12 1=1 5=1 6=864 9=4 Convolution Conv_257 1 1 1001_splitncnn_1 784 -23330=4,3,22,22,3 0=3 1=1 5=1 6=216 9=4 Convolution Conv_258 1 1 1001_splitncnn_0 779 -23330=4,3,22,22,80 0=80 1=1 5=1 6=5760 Convolution Conv_259 1 1 986 788 -23330=4,3,11,11,12 0=12 1=1 5=1 6=864 9=4 Convolution Conv_260 1 1 974_splitncnn_1 789 -23330=4,3,11,11,3 0=3 1=1 5=1 6=216 9=4 Convolution Conv_261 1 1 974_splitncnn_0 782 -23330=4,3,11,11,80 0=80 1=1 5=1 6=5760 Permute Transpose_264 1 1 779 785 -23330=4,3,80,22,22 0=5 Softmax Softmax_265 1 1 785 786 
-23330=4,3,80,22,22 0=2 1=1 Permute Transpose_266 1 1 786 787 -23330=4,3,22,22,80 0=5 Permute Transpose_269 1 1 782 790 -23330=4,3,80,11,11 0=5 Softmax Softmax_270 1 1 790 791 -23330=4,3,80,11,11 0=2 1=1 Permute Transpose_271 1 1 791 792 -23330=4,3,11,11,80 0=5 Concat Concat_272 3 1 783 784 787 793 -23330=4,3,22,22,95 Permute Transpose_273 1 1 793 794 -23330=4,3,95,22,22 0=3 Concat Concat_274 3 1 788 789 792 795 -23330=4,3,11,11,95 Permute Transpose_275 1 1 795 796 -23330=4,3,95,11,11 0=3 Noop output 2 1 794 796 output ================================================ FILE: benchmark/yolov4-tiny.param ================================================ 7767517 45 53 Input data 0 1 data -23330=4,3,416,416,3 0=416 1=416 2=3 Convolution 0_25 1 1 data 0_25_bn_leaky -23330=4,3,208,208,32 0=32 1=3 3=2 4=1 5=1 6=864 9=2 -23310=1,1.000000e-01 Convolution 1_33 1 1 0_25_bn_leaky 1_33_bn_leaky -23330=4,3,104,104,64 0=64 1=3 3=2 4=1 5=1 6=18432 9=2 -23310=1,1.000000e-01 Convolution 2_41 1 1 1_33_bn_leaky 2_41_bn_leaky -23330=4,3,104,104,64 0=64 1=3 4=1 5=1 6=36864 9=2 -23310=1,1.000000e-01 Split 2_41_bn_leaky_split 1 2 2_41_bn_leaky 2_41_bn_leaky_split_0 2_41_bn_leaky_split_1 -23330=8,3,104,104,64,3,104,104,64 Crop 3_49 1 1 2_41_bn_leaky_split_0 3_49 -23330=4,3,104,104,32 2=32 3=104 4=104 5=32 Convolution 4_54 1 1 3_49 4_54_bn_leaky -23330=4,3,104,104,32 0=32 1=3 4=1 5=1 6=9216 9=2 -23310=1,1.000000e-01 Split 4_54_bn_leaky_split 1 2 4_54_bn_leaky 4_54_bn_leaky_split_0 4_54_bn_leaky_split_1 -23330=8,3,104,104,32,3,104,104,32 Convolution 5_62 1 1 4_54_bn_leaky_split_0 5_62_bn_leaky -23330=4,3,104,104,32 0=32 1=3 4=1 5=1 6=9216 9=2 -23310=1,1.000000e-01 Concat 6_70 2 1 5_62_bn_leaky 4_54_bn_leaky_split_1 6_70 -23330=4,3,104,104,64 Convolution 7_73 1 1 6_70 7_73_bn_leaky -23330=4,3,104,104,64 0=64 1=1 5=1 6=4096 9=2 -23310=1,1.000000e-01 Concat 8_81 2 1 2_41_bn_leaky_split_1 7_73_bn_leaky 8_81 -23330=4,3,104,104,128 Pooling 9_84 1 1 8_81 9_84 -23330=4,3,52,52,128 1=2 2=2 14=1 15=1 5=1 
Convolution 10_88 1 1 9_84 10_88_bn_leaky -23330=4,3,52,52,128 0=128 1=3 4=1 5=1 6=147456 9=2 -23310=1,1.000000e-01 Split 10_88_bn_leaky_split 1 2 10_88_bn_leaky 10_88_bn_leaky_split_0 10_88_bn_leaky_split_1 -23330=8,3,52,52,128,3,52,52,128 Crop 11_96 1 1 10_88_bn_leaky_split_0 11_96 -23330=4,3,52,52,64 2=64 3=52 4=52 5=64 Convolution 12_101 1 1 11_96 12_101_bn_leaky -23330=4,3,52,52,64 0=64 1=3 4=1 5=1 6=36864 9=2 -23310=1,1.000000e-01 Split 12_101_bn_leaky_split 1 2 12_101_bn_leaky 12_101_bn_leaky_split_0 12_101_bn_leaky_split_1 -23330=8,3,52,52,64,3,52,52,64 Convolution 13_109 1 1 12_101_bn_leaky_split_0 13_109_bn_leaky -23330=4,3,52,52,64 0=64 1=3 4=1 5=1 6=36864 9=2 -23310=1,1.000000e-01 Concat 14_117 2 1 13_109_bn_leaky 12_101_bn_leaky_split_1 14_117 -23330=4,3,52,52,128 Convolution 15_120 1 1 14_117 15_120_bn_leaky -23330=4,3,52,52,128 0=128 1=1 5=1 6=16384 9=2 -23310=1,1.000000e-01 Concat 16_128 2 1 10_88_bn_leaky_split_1 15_120_bn_leaky 16_128 -23330=4,3,52,52,256 Pooling 17_131 1 1 16_128 17_131 -23330=4,3,26,26,256 1=2 2=2 14=1 15=1 5=1 Convolution 18_135 1 1 17_131 18_135_bn_leaky -23330=4,3,26,26,256 0=256 1=3 4=1 5=1 6=589824 9=2 -23310=1,1.000000e-01 Split 18_135_bn_leaky_split 1 2 18_135_bn_leaky 18_135_bn_leaky_split_0 18_135_bn_leaky_split_1 -23330=8,3,26,26,256,3,26,26,256 Crop 19_143 1 1 18_135_bn_leaky_split_0 19_143 -23330=4,3,26,26,128 2=128 3=26 4=26 5=128 Convolution 20_148 1 1 19_143 20_148_bn_leaky -23330=4,3,26,26,128 0=128 1=3 4=1 5=1 6=147456 9=2 -23310=1,1.000000e-01 Split 20_148_bn_leaky_split 1 2 20_148_bn_leaky 20_148_bn_leaky_split_0 20_148_bn_leaky_split_1 -23330=8,3,26,26,128,3,26,26,128 Convolution 21_156 1 1 20_148_bn_leaky_split_0 21_156_bn_leaky -23330=4,3,26,26,128 0=128 1=3 4=1 5=1 6=147456 9=2 -23310=1,1.000000e-01 Concat 22_164 2 1 21_156_bn_leaky 20_148_bn_leaky_split_1 22_164 -23330=4,3,26,26,256 Convolution 23_167 1 1 22_164 23_167_bn_leaky -23330=4,3,26,26,256 0=256 1=1 5=1 6=65536 9=2 -23310=1,1.000000e-01 Split 
23_167_bn_leaky_split 1 2 23_167_bn_leaky 23_167_bn_leaky_split_0 23_167_bn_leaky_split_1 -23330=8,3,26,26,256,3,26,26,256 Concat 24_175 2 1 18_135_bn_leaky_split_1 23_167_bn_leaky_split_0 24_175 -23330=4,3,26,26,512 Pooling 25_178 1 1 24_175 25_178 -23330=4,3,13,13,512 1=2 2=2 14=1 15=1 5=1 Convolution 26_182 1 1 25_178 26_182_bn_leaky -23330=4,3,13,13,512 0=512 1=3 4=1 5=1 6=2359296 9=2 -23310=1,1.000000e-01 Convolution 27_192 1 1 26_182_bn_leaky 27_192_bn_leaky -23330=4,3,13,13,256 0=256 1=1 5=1 6=131072 9=2 -23310=1,1.000000e-01 Split 27_192_bn_leaky_split 1 2 27_192_bn_leaky 27_192_bn_leaky_split_0 27_192_bn_leaky_split_1 -23330=8,3,13,13,256,3,13,13,256 Convolution 28_200 1 1 27_192_bn_leaky_split_0 28_200_bn_leaky -23330=4,3,13,13,512 0=512 1=3 4=1 5=1 6=1179648 9=2 -23310=1,1.000000e-01 Convolution 29_208 1 1 28_200_bn_leaky 29_208 -23330=4,3,13,13,255 0=255 1=1 5=1 6=130560 Convolution 32_237 1 1 27_192_bn_leaky_split_1 32_237_bn_leaky -23330=4,3,13,13,128 0=128 1=1 5=1 6=32768 9=2 -23310=1,1.000000e-01 Interp 33_245 1 1 32_237_bn_leaky 33_245 -23330=4,3,26,26,128 0=1 1=2.000000e+00 2=2.000000e+00 Concat 34_248 2 1 33_245 23_167_bn_leaky_split_1 34_248 -23330=4,3,26,26,384 Convolution 35_251 1 1 34_248 35_251_bn_leaky -23330=4,3,26,26,256 0=256 1=3 4=1 5=1 6=884736 9=2 -23310=1,1.000000e-01 Convolution 36_259 1 1 35_251_bn_leaky 36_259 -23330=4,3,26,26,255 0=255 1=1 5=1 6=65280 Yolov3DetectionOutput detection_out 2 1 29_208 36_259 output -23330=4,2,6,1637,1 0=80 1=3 2=3.000001e-01 -23304=12,1.000000e+01,1.400000e+01,2.300000e+01,2.700000e+01,3.700000e+01,5.800000e+01,8.100000e+01,8.200000e+01,1.350000e+02,1.690000e+02,3.440000e+02,3.190000e+02 -23305=6,1077936128,1082130432,1084227584,1065353216,1073741824,1077936128 -23306=2,3.360000e+01,1.680000e+01 ================================================ FILE: build-android.cmd ================================================ :: Set android ndk root @ECHO OFF @SETLOCAL @SET ANDROID_NDK= :: Set ninja.exe :: @SET 
NINJA_EXE= :: android armv7 mkdir build-android-armv7-vulkan pushd build-android-armv7-vulkan cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON .. cmake --build . --parallel %NUMBER_OF_PROCESSORS% cmake --build . --target install popd :: android aarch64 mkdir build-android-aarch64-vulkan pushd build-android-aarch64-vulkan cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON .. cmake --build . --parallel %NUMBER_OF_PROCESSORS% cmake --build . --target install popd :: android x86 mkdir build-android-x86 pushd build-android-x86 cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON .. cmake --build . --parallel %NUMBER_OF_PROCESSORS% cmake --build . --target install popd :: android x86_64 mkdir build-android-x86_64 pushd build-android-x86_64 cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON .. cmake --build . --parallel %NUMBER_OF_PROCESSORS% cmake --build . 
--target install popd :: android riscv64 mkdir build-android-riscv64 pushd build-android-riscv64 cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="riscv64" -DANDROID_PLATFORM=android-35 -DNCNN_VULKAN=ON .. cmake --build . --parallel %NUMBER_OF_PROCESSORS% cmake --build . --target install popd @ENDLOCAL ================================================ FILE: build.sh ================================================ #!/usr/bin/env bash ##### android armv7 without neon mkdir -p build-android-armv7-without-neon pushd build-android-armv7-without-neon cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON .. make -j4 make install popd ##### android armv7 mkdir -p build-android-armv7 pushd build-android-armv7 cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON .. make -j4 make install popd ##### android aarch64 mkdir -p build-android-aarch64 pushd build-android-aarch64 cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON .. make -j4 make install popd ##### android x86 mkdir -p build-android-x86 pushd build-android-x86 cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON .. make -j4 make install popd ##### android x86_64 mkdir -p build-android-x86_64 pushd build-android-x86_64 cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON .. 
make -j4 make install popd ##### android riscv64 mkdir -p build-android-riscv64 pushd build-android-riscv64 cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="riscv64" -DANDROID_PLATFORM=android-35 -DNCNN_VULKAN=ON .. make -j4 make install popd ##### linux of hisiv300 (forgot the chip name) toolchain with neon and openmp mkdir -p build-hisiv300-linux pushd build-hisiv300-linux cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv300.toolchain.cmake .. make -j4 make install popd ##### linux of hisiv500 (Hi3516CV200 and Hi3519V101) toolchain with neon and openmp mkdir -p build-hisiv500-linux pushd build-hisiv500-linux cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv500.toolchain.cmake .. make -j4 make install popd ##### linux of hisiv600 (Hi3559V100) toolchain with neon and no openmp (due to only one cpu, close openmp) mkdir -p build-hisiv600-linux pushd build-hisiv600-linux cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv600.toolchain.cmake .. make -j4 make install popd ##### linux of himix100 (Hi3559a) toolchain with neon and openmp mkdir -p build-himix100-linux pushd build-himix100-linux cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix100.toolchain.cmake .. make -j4 make install popd ##### linux of arm-linux-gnueabi toolchain mkdir -p build-arm-linux-gnueabi pushd build-arm-linux-gnueabi cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake .. make -j4 make install popd ##### linux of arm-linux-gnueabihf toolchain mkdir -p build-arm-linux-gnueabihf pushd build-arm-linux-gnueabihf cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake .. make -j4 make install popd ##### linux of v831 toolchain with neon and openmp mkdir -p build-v831-linux pushd build-v831-linux cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/v831.toolchain.cmake .. 
make -j4 make install popd ##### linux for aarch64-linux-gnu toolchain mkdir -p build-aarch64-linux-gnu pushd build-aarch64-linux-gnu cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake .. make -j4 make install popd ##### linux host system with gcc/g++ mkdir -p build-host-gcc-linux pushd build-host-gcc-linux cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc.toolchain.cmake .. make -j4 make install popd ##### MacOS mkdir -p build-mac pushd build-mac cmake -DNCNN_OPENMP=OFF \ -DNCNN_BENCHMARK=ON \ .. make -j8 make install popd ================================================ FILE: cmake/ncnnConfig.cmake.in ================================================ set(NCNN_VERSION @NCNN_VERSION@) set(NCNN_OPENMP @NCNN_OPENMP@) set(NCNN_THREADS @NCNN_THREADS@) set(NCNN_VULKAN @NCNN_VULKAN@) set(NCNN_SHARED_LIB @NCNN_SHARED_LIB@) set(NCNN_SYSTEM_GLSLANG @NCNN_SYSTEM_GLSLANG@) set(NCNN_SIMPLEVK @NCNN_SIMPLEVK@) if(NCNN_OPENMP) find_package(OpenMP) endif() if(NCNN_THREADS) set(CMAKE_THREAD_PREFER_PTHREAD TRUE) set(THREADS_PREFER_PTHREAD_FLAG TRUE) find_package(Threads REQUIRED) endif() if(NCNN_VULKAN) if(NOT NCNN_SIMPLEVK) find_package(Vulkan REQUIRED) endif() if(NOT NCNN_SHARED_LIB) if(NCNN_SYSTEM_GLSLANG) find_package(SPIRV-Tools QUIET) find_package(SPIRV-Tools-opt QUIET) find_package(glslang QUIET) if(NOT glslang_FOUND) set(GLSLANG_TARGET_DIR "@GLSLANG_TARGET_DIR@") include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake) include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake) if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") # hlsl support can be optional include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") endif() include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake) include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake) endif() else() set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_LIBDIR@/cmake/glslang") find_package(glslang QUIET) endif() endif() endif() include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake) if(TARGET ncnn) set(ncnn_FOUND 
TRUE)
    if(NOT ncnn_FIND_QUIETLY)
        message(STATUS "Found ncnn: ${NCNN_VERSION}")
    endif()
endif()

================================================
FILE: cmake/ncnn_add_layer.cmake
================================================

# ncnn_add_arch_opt_layer(<class> <arch_opt> <cflags>)
#
# Registers an ISA-specific build of a layer that is generated from the generic
# arch implementation layer/<arch>/<name>_<arch>.{h,cpp}.
#
# This is a macro, so it reads `name` and NCNN_TARGET_ARCH from the caller's
# scope (ncnn_add_layer derives `name` from the class with string(TOLOWER)).
#
# When WITH_LAYER_<name> is on and both the arch header and source exist:
#   - a header and a source are generated into the binary dir by running
#     cmake/ncnn_generate_<arch_opt>_source.cmake with SRC/DST/CLASS set,
#   - the generated source is compiled with <cflags>,
#   - both generated files are appended to ncnn_SRCS,
#   - layer_declaration and layer_registry_<arch_opt> receive the new creator.
# Otherwise layer_registry_<arch_opt> receives the plain <class> creator when
# the layer is enabled, or a 0 entry when it is disabled.
macro(ncnn_add_arch_opt_layer class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CFLAGS)
    set(NCNN_${NCNN_TARGET_ARCH}_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h)
    set(NCNN_${NCNN_TARGET_ARCH}_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp)

    if(WITH_LAYER_${name} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH}_HEADER} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH}_SOURCE})

        set(NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.h)
        set(NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.cpp)

        add_custom_command(
            OUTPUT ${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER}
            COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_${NCNN_TARGET_ARCH}_HEADER} -DDST=${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_${NCNN_TARGET_ARCH_OPT}_source.cmake"
            DEPENDS ${NCNN_${NCNN_TARGET_ARCH}_HEADER}
            COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.h"
            VERBATIM
        )
        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER} PROPERTIES GENERATED TRUE)

        add_custom_command(
            OUTPUT ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE}
            COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_${NCNN_TARGET_ARCH}_SOURCE} -DDST=${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_${NCNN_TARGET_ARCH_OPT}_source.cmake"
            DEPENDS ${NCNN_${NCNN_TARGET_ARCH}_SOURCE}
            COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.cpp"
            VERBATIM
        )
        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES GENERATED TRUE)

        # Quoted so the flags always arrive as exactly one property value, even
        # when the caller passes an empty string or a value containing ';'.
        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES COMPILE_FLAGS "${NCNN_TARGET_ARCH_OPT_CFLAGS}")

        list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER} ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE})

        # generate layer_declaration and layer_registry file
        set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.h\"\n")
        set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}) }\n")

        set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#endif\n")

    else()

        # no isa optimized version
        if(WITH_LAYER_${name})
            set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n")
        else()
            set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
        endif()

    endif()
endmacro()

# ncnn_add_arch_opt_source(<class> <arch_opt> <cflags>)
#
# Adds the hand-written source layer/<arch>/<name>_<arch>_<arch_opt>.cpp to
# ncnn_SRCS when WITH_LAYER_<name> is on and that file exists. <cflags> is
# applied to the file only when NCNN_RUNTIME_CPU is on. Reads `name` and
# NCNN_TARGET_ARCH from the caller's scope.
macro(ncnn_add_arch_opt_source class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CFLAGS)
    set(NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.cpp)

    if(WITH_LAYER_${name} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE})
        if(NCNN_RUNTIME_CPU)
            # Quoted so an empty or ';'-containing flags value stays one argument.
            set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES COMPILE_FLAGS "${NCNN_TARGET_ARCH_OPT_CFLAGS}")
        endif()

        list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE})
    endif()
endmacro()

# ncnn_add_arch_opt_layer_source(<class> <arch_opt_base> <arch_opt> <cflags>)
#
# When WITH_LAYER_<name> is on and layer/<arch>/<name>_<arch>_<arch_opt_base>.cpp
# exists, generates <name>_<arch>_<arch_opt_base>_<arch_opt>.cpp in the binary
# dir by running cmake/ncnn_generate_<arch_opt>_source.cmake on it, and appends
# the generated file to ncnn_SRCS. <cflags> is applied to the generated file
# only when NCNN_RUNTIME_CPU is on. Reads `name` and NCNN_TARGET_ARCH from the
# caller's scope.
macro(ncnn_add_arch_opt_layer_source class NCNN_TARGET_ARCH_OPT_BASE NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CFLAGS)
    set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}.cpp)

    if(WITH_LAYER_${name} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE})

        set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp)

        add_custom_command(
            OUTPUT ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE}
            COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE} -DDST=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_${NCNN_TARGET_ARCH_OPT}_source.cmake"
            DEPENDS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE}
            COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp"
            VERBATIM
        )
        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES GENERATED TRUE)

        if(NCNN_RUNTIME_CPU)
            # Quoted so an empty or ';'-containing flags value stays one argument.
            set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES COMPILE_FLAGS "${NCNN_TARGET_ARCH_OPT_CFLAGS}")
        endif()

        list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE})
    endif()
endmacro()

macro(ncnn_add_layer class) string(TOLOWER ${class} name) # WITH_LAYER_xxx option if(${ARGC} EQUAL 2) option(WITH_LAYER_${name} "build with layer ${name}" ${ARGV1}) else() option(WITH_LAYER_${name} "build with layer ${name}" ON) endif() if(NCNN_CMAKE_VERBOSE) message(STATUS "WITH_LAYER_${name} = ${WITH_LAYER_${name}}") endif() if(WITH_LAYER_${name}) list(APPEND ncnn_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp) # look for arch specific implementation and append source # optimized implementation for armv7, aarch64 or x86 set(LAYER_ARCH_SRC ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp) if(EXISTS
${LAYER_ARCH_SRC}) set(WITH_LAYER_${name}_${NCNN_TARGET_ARCH} 1) list(APPEND ncnn_SRCS ${LAYER_ARCH_SRC}) endif() set(LAYER_VULKAN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/${name}_vulkan.cpp) if(NCNN_VULKAN AND EXISTS ${LAYER_VULKAN_SRC}) set(WITH_LAYER_${name}_vulkan 1) list(APPEND ncnn_SRCS ${LAYER_VULKAN_SRC}) endif() endif() # generate layer_declaration and layer_registry file if(WITH_LAYER_${name}) set(layer_declaration "${layer_declaration}#include \"layer/${name}.h\"\n") set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}) }\n") source_group ("sources\\\\layers" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp") endif() if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH}) set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h\"\n") set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}) }\n") source_group ("sources\\\\layers\\\\${NCNN_TARGET_ARCH}" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp") endif() if(WITH_LAYER_${name}_vulkan) set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n") set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_vulkan) }\n") file(GLOB NCNN_SHADER_SRCS "layer/vulkan/shader/${name}.comp") file(GLOB NCNN_SHADER_SUBSRCS "layer/vulkan/shader/${name}_*.comp") list(APPEND NCNN_SHADER_SRCS ${NCNN_SHADER_SUBSRCS}) foreach(NCNN_SHADER_SRC ${NCNN_SHADER_SRCS}) ncnn_add_shader(${NCNN_SHADER_SRC}) endforeach() source_group ("sources\\\\layers\\\\vulkan" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/${name}_vulkan.cpp") endif() if(WITH_LAYER_${name}) set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n") else() set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", 
0},\n#else\n{0},\n#endif\n") endif() if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH}) set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#endif\n") else() set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n") endif() if(WITH_LAYER_${name}_vulkan) set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", ${class}_vulkan_layer_creator},\n#else\n{${class}_vulkan_layer_creator},\n#endif\n") else() set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n") endif() if(NCNN_TARGET_ARCH STREQUAL "x86") if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") if(NCNN_RUNTIME_CPU AND NCNN_AVX512) ncnn_add_arch_opt_layer(${class} avx512 "/arch:AVX512 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__") endif() if(NCNN_RUNTIME_CPU AND NCNN_FMA) ncnn_add_arch_opt_layer(${class} fma "/arch:AVX /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX) ncnn_add_arch_opt_layer(${class} avx "/arch:AVX /D__SSSE3__ /D__SSE4_1__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX512VNNI) ncnn_add_arch_opt_source(${class} avx512vnni "/arch:AVX512 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512VNNI__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX512BF16) ncnn_add_arch_opt_source(${class} avx512bf16 "/arch:AVX512 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512BF16__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX512FP16) ncnn_add_arch_opt_source(${class} avx512fp16 "/arch:AVX512 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512FP16__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNI) ncnn_add_arch_opt_source(${class} avxvnni "/arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT8) ncnn_add_arch_opt_source(${class} avxvnniint8 "/arch:AVX2 /D__SSSE3__ 
/D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__ /D__AVXVNNIINT8__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT16) ncnn_add_arch_opt_source(${class} avxvnniint16 "/arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__ /D__AVXVNNIINT16__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXNECONVERT) ncnn_add_arch_opt_source(${class} avxneconvert "/arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXNECONVERT__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX2) ncnn_add_arch_opt_source(${class} avx2 "/arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__") endif() if(NCNN_RUNTIME_CPU AND NCNN_XOP) ncnn_add_arch_opt_source(${class} xop "/arch:AVX /D__SSSE3__ /D__SSE4_1__ /D__XOP__") endif() if(NCNN_RUNTIME_CPU AND NCNN_F16C) ncnn_add_arch_opt_source(${class} f16c "/arch:AVX /D__SSSE3__ /D__SSE4_1__ /D__F16C__") endif() elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC") if(NCNN_RUNTIME_CPU AND NCNN_AVX512) ncnn_add_arch_opt_layer(${class} avx512 "/arch:AVX512 -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__") endif() if(NCNN_RUNTIME_CPU AND NCNN_FMA) ncnn_add_arch_opt_layer(${class} fma "/arch:AVX -mfma -mf16c /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX) ncnn_add_arch_opt_layer(${class} avx "/arch:AVX /D__SSSE3__ /D__SSE4_1__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX512VNNI) ncnn_add_arch_opt_source(${class} avx512vnni "/arch:AVX512 -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512vnni /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512VNNI__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX512BF16) ncnn_add_arch_opt_source(${class} avx512bf16 "/arch:AVX512 -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512bf16 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512BF16__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX512FP16) 
ncnn_add_arch_opt_source(${class} avx512fp16 "/arch:AVX512 -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512fp16 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512FP16__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNI) ncnn_add_arch_opt_source(${class} avxvnni "/arch:AVX2 -mfma -mf16c -mavxvnni /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT8) ncnn_add_arch_opt_source(${class} avxvnniint8 "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint8 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__ /D__AVXVNNIINT8__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT16) ncnn_add_arch_opt_source(${class} avxvnniint16 "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint16 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__ /D__AVXVNNIINT16__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXNECONVERT) ncnn_add_arch_opt_source(${class} avxneconvert "/arch:AVX2 -mfma -mf16c -mavxneconvert /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXNECONVERT__") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX2) ncnn_add_arch_opt_source(${class} avx2 "/arch:AVX2 -mfma -mf16c /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__") endif() if(NCNN_RUNTIME_CPU AND NCNN_XOP) ncnn_add_arch_opt_source(${class} xop "/arch:AVX -mxop /D__SSSE3__ /D__SSE4_1__ /D__XOP__") endif() if(NCNN_RUNTIME_CPU AND NCNN_F16C) ncnn_add_arch_opt_source(${class} f16c "/arch:AVX -mf16c /D__SSSE3__ /D__SSE4_1__ /D__F16C__") endif() else() if(NCNN_RUNTIME_CPU AND NCNN_AVX512) ncnn_add_arch_opt_layer(${class} avx512 "-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c") endif() if(NCNN_RUNTIME_CPU AND NCNN_FMA) ncnn_add_arch_opt_layer(${class} fma "-mavx -mfma -mf16c") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX) ncnn_add_arch_opt_layer(${class} avx "-mavx") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX512VNNI) ncnn_add_arch_opt_source(${class} avx512vnni "-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512vnni") 
endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX512BF16) ncnn_add_arch_opt_source(${class} avx512bf16 "-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512bf16") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX512FP16) ncnn_add_arch_opt_source(${class} avx512fp16 "-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512fp16") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNI) ncnn_add_arch_opt_source(${class} avxvnni "-mavx2 -mfma -mf16c -mavxvnni") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT8) ncnn_add_arch_opt_source(${class} avxvnniint8 "-mavx2 -mfma -mf16c -mavxvnni -mavxvnniint8") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT16) ncnn_add_arch_opt_source(${class} avxvnniint16 "-mavx2 -mfma -mf16c -mavxvnni -mavxvnniint16") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVXNECONVERT) ncnn_add_arch_opt_source(${class} avxneconvert "-mavx2 -mfma -mf16c -mavxneconvert") endif() if(NCNN_RUNTIME_CPU AND NCNN_AVX2) ncnn_add_arch_opt_source(${class} avx2 "-mavx2 -mfma -mf16c") endif() if(NCNN_RUNTIME_CPU AND NCNN_XOP) ncnn_add_arch_opt_source(${class} xop "-mavx -mxop") endif() if(NCNN_RUNTIME_CPU AND NCNN_F16C) ncnn_add_arch_opt_source(${class} f16c "-mavx -mf16c") endif() endif() endif() if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT NCNN_TARGET_ILP32)) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) if(NCNN_VFPV4) ncnn_add_arch_opt_source(${class} vfpv4 "/arch:VFPv4 /D__ARM_FP=0x0E") endif() else() if(NCNN_VFPV4) if(NCNN_COMPILER_SUPPORT_ARM_VFPV4) ncnn_add_arch_opt_source(${class} vfpv4 "-mfpu=neon-vfpv4") elseif(NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16) ncnn_add_arch_opt_source(${class} vfpv4 "-mfpu=neon-vfpv4 -mfp16-format=ieee") endif() endif() endif() endif() if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARGET_ILP32)) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") 
if(NCNN_VFPV4) ncnn_add_arch_opt_source(${class} vfpv4 " ") endif() if(NCNN_ARM82) ncnn_add_arch_opt_source(${class} asimdhp "/arch:armv8.2 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM82DOT) ncnn_add_arch_opt_source(${class} asimddp "/arch:armv8.2 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM82FP16FML) ncnn_add_arch_opt_source(${class} asimdfhm "/arch:armv8.2 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_FP16_FML") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM84BF16) ncnn_add_arch_opt_source(${class} bf16 "/arch:armv8.4 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_BF16_VECTOR_ARITHMETIC") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM84I8MM) ncnn_add_arch_opt_source(${class} i8mm "/arch:armv8.4 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_MATMUL_INT8") endif() # TODO add support for sve family if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE) endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE2) endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEBF16) endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEI8MM) endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEF32MM) endif() elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC") if(NCNN_VFPV4) ncnn_add_arch_opt_source(${class} vfpv4 " ") endif() if(NCNN_ARM82) ncnn_add_arch_opt_source(${class} asimdhp "/arch:armv8.2 -march=armv8.2-a+fp16 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM82DOT) ncnn_add_arch_opt_source(${class} asimddp "/arch:armv8.2 -march=armv8.2-a+fp16+dotprod /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM82FP16FML) ncnn_add_arch_opt_source(${class} asimdfhm "/arch:armv8.2 -march=armv8.2-a+fp16+fp16fml /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC 
/D__ARM_FEATURE_FP16_FML") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM84BF16) ncnn_add_arch_opt_source(${class} bf16 "/arch:armv8.4 -march=armv8.4-a+fp16+dotprod+bf16 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_BF16_VECTOR_ARITHMETIC") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM84I8MM) ncnn_add_arch_opt_source(${class} i8mm "/arch:armv8.4 -march=armv8.4-a+fp16+dotprod+i8mm /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_MATMUL_INT8") endif() # TODO add support for sve family if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE) endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE2) endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEBF16) endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEI8MM) endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEF32MM) endif() else() if(NCNN_VFPV4) ncnn_add_arch_opt_source(${class} vfpv4 " ") endif() if(NCNN_ARM82) ncnn_add_arch_opt_source(${class} asimdhp "-march=armv8.2-a+fp16") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM82DOT) ncnn_add_arch_opt_source(${class} asimddp "-march=armv8.2-a+fp16+dotprod") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM82FP16FML) # clang 9.0.9 shipped with android ndk-r21 is missing __ARM_FEATURE_FP16_FML macro for asimdfhm target ncnn_add_arch_opt_source(${class} asimdfhm "-march=armv8.2-a+fp16+fp16fml -D__ARM_FEATURE_FP16_FML") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM84BF16) ncnn_add_arch_opt_source(${class} bf16 "-march=armv8.4-a+fp16+dotprod+bf16") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM84I8MM) ncnn_add_arch_opt_source(${class} i8mm "-march=armv8.4-a+fp16+dotprod+i8mm") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE) ncnn_add_arch_opt_source(${class} sve "-march=armv8.6-a+fp16+dotprod+sve") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE2) ncnn_add_arch_opt_source(${class} sve2 "-march=armv8.6-a+fp16+dotprod+sve2") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEBF16) ncnn_add_arch_opt_source(${class} svebf16 
"-march=armv8.6-a+fp16+dotprod+sve+bf16") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEI8MM) ncnn_add_arch_opt_source(${class} svei8mm "-march=armv8.6-a+fp16+dotprod+sve+i8mm") endif() if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEF32MM) ncnn_add_arch_opt_source(${class} svef32mm "-march=armv8.6-a+fp16+dotprod+sve+f32mm") endif() endif() endif() if(NCNN_TARGET_ARCH STREQUAL "mips") if(NCNN_RUNTIME_CPU AND NCNN_MSA) ncnn_add_arch_opt_layer(${class} msa "-mmsa") endif() if(NCNN_MMI) ncnn_add_arch_opt_source(${class} mmi "-mloongson-mmi") endif() endif() if(NCNN_TARGET_ARCH STREQUAL "loongarch") if(NCNN_RUNTIME_CPU AND NCNN_LASX) ncnn_add_arch_opt_layer(${class} lasx "-mlasx -mlsx") endif() if(NCNN_RUNTIME_CPU AND NCNN_LSX) ncnn_add_arch_opt_layer(${class} lsx "-mlsx") endif() endif() if(NCNN_TARGET_ARCH STREQUAL "riscv") if(CMAKE_SIZEOF_VOID_P EQUAL 8) if(NCNN_RUNTIME_CPU AND NCNN_RVV) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv") endif() if(NCNN_ZFH) if(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH) ncnn_add_arch_opt_source(${class} zfh "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16") elseif(NOT NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR) ncnn_add_arch_opt_source(${class} zfh "-march=rv64gc_zfh_xtheadvector -D__riscv_zvfh=1 -D__fp16=_Float16") else() ncnn_add_arch_opt_source(${class} zfh "-march=rv64gc_zfh -D__fp16=_Float16") endif() endif() if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR) # linker complains the conflict of v and xtheadvector, so disable generating any riscv attributes ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr") ncnn_add_arch_opt_layer_source(${class} zfh xtheadvector "-march=rv64gc_zfh_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr -D__fp16=_Float16") endif() if(NCNN_RUNTIME_CPU AND NCNN_ZVFH) ncnn_add_arch_opt_layer_source(${class} zfh rvv "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16") endif() elseif(CMAKE_SIZEOF_VOID_P EQUAL 4) if(NCNN_RUNTIME_CPU AND NCNN_RVV) ncnn_add_arch_opt_layer(${class} 
rvv "-march=rv32gcv") endif() if(NCNN_ZFH) if(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH) ncnn_add_arch_opt_source(${class} zfh "-march=rv32gcv_zfh_zvfh -D__fp16=_Float16") elseif(NOT NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR) ncnn_add_arch_opt_source(${class} zfh "-march=rv32gc_zfh_xtheadvector -D__riscv_zvfh=1 -D__fp16=_Float16") else() ncnn_add_arch_opt_source(${class} zfh "-march=rv32gc_zfh -D__fp16=_Float16") endif() endif() if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR) # linker complains the conflict of v and xtheadvector, so disable generating any riscv attributes ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv32gc_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr") ncnn_add_arch_opt_layer_source(${class} zfh xtheadvector "-march=rv32gc_zfh_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr -D__fp16=_Float16") endif() if(NCNN_RUNTIME_CPU AND NCNN_ZVFH) ncnn_add_arch_opt_layer_source(${class} zfh rvv "-march=rv32gcv_zfh_zvfh -D__fp16=_Float16") endif() endif() endif() # generate layer_type_enum file set(layer_type_enum "${layer_type_enum}${class} = ${__LAYER_TYPE_ENUM_INDEX},\n") math(EXPR __LAYER_TYPE_ENUM_INDEX "${__LAYER_TYPE_ENUM_INDEX}+1") endmacro() ================================================ FILE: cmake/ncnn_add_param.cmake ================================================
# ncnn_add_param(<param file>)
#
# Registers a ".param" model description to be embedded as a C array:
#   * derives a C identifier from the file name ("a-b.1.param" -> "a_b_1")
#   * adds a custom command that runs ncnn_generate_param_header.cmake to
#     produce ${CMAKE_CURRENT_BINARY_DIR}/param/<identifier>.hex.h
#   * appends the matching #include line to param_header_data
#   * appends the generated header to NCNN_PARAM_HEX_FILES
macro(ncnn_add_param NCNN_PARAM_SRC)
    # Get the file name with extension
    get_filename_component(NCNN_PARAM_SRC_NAME_WE ${NCNN_PARAM_SRC} NAME)

    # Manually remove the trailing ".param" since NAME_WE treats ".1.param" as a
    # multi-extension. The pattern is anchored so that only the suffix is removed.
    string(REGEX REPLACE "\\.param$" "" NCNN_PARAM_SRC_NAME_WE "${NCNN_PARAM_SRC_NAME_WE}")

    # Replace characters invalid in C identifiers ('.' and '-') with underscores
    string(REPLACE "." "_" NCNN_PARAM_SRC_NAME_WE "${NCNN_PARAM_SRC_NAME_WE}")
    string(REPLACE "-" "_" NCNN_PARAM_SRC_NAME_WE "${NCNN_PARAM_SRC_NAME_WE}")

    # Check if the result is empty. Compare as a string: a plain truthiness test
    # would also reject legitimate names such as "no", "off" or "false".
    if(NCNN_PARAM_SRC_NAME_WE STREQUAL "")
        message(FATAL_ERROR "Failed to extract valid filename from '${NCNN_PARAM_SRC}'")
    endif()

    # Check if the extracted filename is a valid C identifier
    if(NOT NCNN_PARAM_SRC_NAME_WE MATCHES "^[A-Za-z_][A-Za-z0-9_]*$")
        message(FATAL_ERROR "Extracted filename '${NCNN_PARAM_SRC_NAME_WE}' is not a valid C identifier")
    endif()

    set(NCNN_PARAM_HEADER ${CMAKE_CURRENT_BINARY_DIR}/param/${NCNN_PARAM_SRC_NAME_WE}.hex.h)

    # The header is regenerated whenever the .param source changes
    add_custom_command(
        OUTPUT ${NCNN_PARAM_HEADER}
        COMMAND ${CMAKE_COMMAND} -DPARAM_SRC=${NCNN_PARAM_SRC} -DPARAM_SRC_NAME_WE=${NCNN_PARAM_SRC_NAME_WE} -DPARAM_HEADER=${NCNN_PARAM_HEADER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_param_header.cmake"
        DEPENDS ${NCNN_PARAM_SRC}
        COMMENT "Preprocessing param source ${NCNN_PARAM_SRC_NAME_WE}.param"
        VERBATIM
    )
    set_source_files_properties(${NCNN_PARAM_HEADER} PROPERTIES GENERATED TRUE)

    get_filename_component(NCNN_PARAM_HEADER_NAME ${NCNN_PARAM_HEADER} NAME)
    string(APPEND param_header_data "#include \"param/${NCNN_PARAM_HEADER_NAME}\"\n")

    list(APPEND NCNN_PARAM_HEX_FILES ${NCNN_PARAM_HEADER})
endmacro()
================================================ FILE: cmake/ncnn_add_shader.cmake ================================================ macro(ncnn_add_shader NCNN_SHADER_SRC) get_filename_component(NCNN_SHADER_SRC_NAME_WE ${NCNN_SHADER_SRC} NAME_WE) set(NCNN_SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${NCNN_SHADER_SRC_NAME_WE}.comp.hex.h) add_custom_command( OUTPUT ${NCNN_SHADER_COMP_HEADER} COMMAND ${CMAKE_COMMAND} -DSHADER_SRC=${NCNN_SHADER_SRC} -DSHADER_COMP_HEADER=${NCNN_SHADER_COMP_HEADER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_shader_comp_header.cmake" DEPENDS ${NCNN_SHADER_SRC} COMMENT "Preprocessing shader source ${NCNN_SHADER_SRC_NAME_WE}.comp"
VERBATIM ) set_source_files_properties(${NCNN_SHADER_COMP_HEADER} PROPERTIES GENERATED TRUE) get_filename_component(NCNN_SHADER_COMP_HEADER_NAME ${NCNN_SHADER_COMP_HEADER} NAME) string(APPEND layer_shader_spv_data "#include \"layer/vulkan/shader/${NCNN_SHADER_COMP_HEADER_NAME}\"\n") get_filename_component(NCNN_SHADER_SRC_NAME_WE ${NCNN_SHADER_SRC} NAME_WE) string(APPEND layer_shader_registry "{${NCNN_SHADER_SRC_NAME_WE}_comp_data,sizeof(${NCNN_SHADER_SRC_NAME_WE}_comp_data)},\n") list(APPEND NCNN_SHADER_SPV_HEX_FILES ${NCNN_SHADER_COMP_HEADER}) # generate layer_shader_type_enum file set(layer_shader_type_enum "${layer_shader_type_enum}${NCNN_SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") endmacro() ================================================ FILE: cmake/ncnn_generate_avx512_source.cmake ================================================ # must define SRC DST CLASS file(READ ${SRC} source_data) # replace string(TOUPPER ${CLASS} CLASS_UPPER) string(TOLOWER ${CLASS} CLASS_LOWER) string(REGEX REPLACE "LAYER_${CLASS_UPPER}_X86_H" "LAYER_${CLASS_UPPER}_X86_AVX512_H" source_data "${source_data}") string(REGEX REPLACE "${CLASS}_x86" "${CLASS}_x86_avx512" source_data "${source_data}") string(REGEX REPLACE "#include \"${CLASS_LOWER}_x86.h\"" "#include \"${CLASS_LOWER}_x86_avx512.h\"" source_data "${source_data}") file(WRITE ${DST} "${source_data}") ================================================ FILE: cmake/ncnn_generate_avx_source.cmake ================================================ # must define SRC DST CLASS file(READ ${SRC} source_data) # replace string(TOUPPER ${CLASS} CLASS_UPPER) string(TOLOWER ${CLASS} CLASS_LOWER) string(REGEX REPLACE "LAYER_${CLASS_UPPER}_X86_H" "LAYER_${CLASS_UPPER}_X86_AVX_H" source_data "${source_data}") string(REGEX REPLACE "${CLASS}_x86" "${CLASS}_x86_avx" source_data "${source_data}") string(REGEX REPLACE "#include \"${CLASS_LOWER}_x86.h\"" 
"#include \"${CLASS_LOWER}_x86_avx.h\"" source_data "${source_data}") file(WRITE ${DST} "${source_data}") ================================================ FILE: cmake/ncnn_generate_fma_source.cmake ================================================ # must define SRC DST CLASS file(READ ${SRC} source_data) # replace string(TOUPPER ${CLASS} CLASS_UPPER) string(TOLOWER ${CLASS} CLASS_LOWER) string(REGEX REPLACE "LAYER_${CLASS_UPPER}_X86_H" "LAYER_${CLASS_UPPER}_X86_FMA_H" source_data "${source_data}") string(REGEX REPLACE "${CLASS}_x86" "${CLASS}_x86_fma" source_data "${source_data}") string(REGEX REPLACE "#include \"${CLASS_LOWER}_x86.h\"" "#include \"${CLASS_LOWER}_x86_fma.h\"" source_data "${source_data}") file(WRITE ${DST} "${source_data}") ================================================ FILE: cmake/ncnn_generate_lasx_source.cmake ================================================ # must define SRC DST CLASS file(READ ${SRC} source_data) # replace string(TOUPPER ${CLASS} CLASS_UPPER) string(TOLOWER ${CLASS} CLASS_LOWER) string(REGEX REPLACE "LAYER_${CLASS_UPPER}_LOONGARCH_H" "LAYER_${CLASS_UPPER}_LOONGARCH_LASX_H" source_data "${source_data}") string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_lasx" source_data "${source_data}") string(REGEX REPLACE "#include \"${CLASS_LOWER}_loongarch.h\"" "#include \"${CLASS_LOWER}_loongarch_lasx.h\"" source_data "${source_data}") file(WRITE ${DST} "${source_data}") ================================================ FILE: cmake/ncnn_generate_lsx_source.cmake ================================================ # must define SRC DST CLASS file(READ ${SRC} source_data) # replace string(TOUPPER ${CLASS} CLASS_UPPER) string(TOLOWER ${CLASS} CLASS_LOWER) string(REGEX REPLACE "LAYER_${CLASS_UPPER}_LOONGARCH_H" "LAYER_${CLASS_UPPER}_LOONGARCH_LSX_H" source_data "${source_data}") string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_lsx" source_data "${source_data}") string(REGEX REPLACE "#include 
\"${CLASS_LOWER}_loongarch.h\"" "#include \"${CLASS_LOWER}_loongarch_lsx.h\"" source_data "${source_data}") file(WRITE ${DST} "${source_data}") ================================================ FILE: cmake/ncnn_generate_msa_source.cmake ================================================ # must define SRC DST CLASS file(READ ${SRC} source_data) # replace string(TOUPPER ${CLASS} CLASS_UPPER) string(TOLOWER ${CLASS} CLASS_LOWER) string(REGEX REPLACE "LAYER_${CLASS_UPPER}_MIPS_H" "LAYER_${CLASS_UPPER}_MIPS_MSA_H" source_data "${source_data}") string(REGEX REPLACE "${CLASS}_mips" "${CLASS}_mips_msa" source_data "${source_data}") string(REGEX REPLACE "#include \"${CLASS_LOWER}_mips.h\"" "#include \"${CLASS_LOWER}_mips_msa.h\"" source_data "${source_data}") file(WRITE ${DST} "${source_data}") ================================================ FILE: cmake/ncnn_generate_param_header.cmake ================================================
# must define PARAM_HEADER PARAM_SRC PARAM_SRC_NAME_WE
#
# Reads the text file PARAM_SRC, normalizes its whitespace and writes
# PARAM_HEADER, a C header defining "${PARAM_SRC_NAME_WE}_param_data" as a
# 0x00-terminated char array holding the normalized text.

file(READ ${PARAM_SRC} param_data)

# remove whitespace
# The input must be quoted: unquoted, a ';' in the file acts as a list
# separator and is dropped when string() concatenates its inputs, and an
# empty file leaves the command without an input argument at all.
string(REGEX REPLACE "\n +" "\n" param_data "${param_data}")

# replace more spaces to one space
string(REGEX REPLACE "[ \t]+" " " param_data "${param_data}")

# remove empty line
string(REGEX REPLACE "\n[\n]+" "\n" param_data "${param_data}")

# text to hex
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/param/${PARAM_SRC_NAME_WE}.text2hex.txt "${param_data}")
file(READ ${CMAKE_CURRENT_BINARY_DIR}/param/${PARAM_SRC_NAME_WE}.text2hex.txt param_data_hex HEX)

# turn every hex pair into "0xNN," (each byte keeps its trailing comma)
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," param_data_hex "${param_data_hex}")

# generate model param header file
# The terminating 0x00 is appended right after the last "0xNN,", which gives
# the same text as before for non-empty input and a valid "{0x00}" for an
# empty one.
file(WRITE ${PARAM_HEADER} "static const char ${PARAM_SRC_NAME_WE}_param_data[] = {${param_data_hex}0x00};\n")
================================================ FILE: cmake/ncnn_generate_rvv_source.cmake ================================================ # must
define SRC DST CLASS file(READ ${SRC} source_data) # replace string(TOUPPER ${CLASS} CLASS_UPPER) string(TOLOWER ${CLASS} CLASS_LOWER) string(REGEX REPLACE "LAYER_${CLASS_UPPER}_RISCV_H" "LAYER_${CLASS_UPPER}_RISCV_RVV_H" source_data "${source_data}") string(REGEX REPLACE "${CLASS}_riscv" "${CLASS}_riscv_rvv" source_data "${source_data}") string(REGEX REPLACE "#include \"${CLASS_LOWER}_riscv.h\"" "#include \"${CLASS_LOWER}_riscv_rvv.h\"" source_data "${source_data}") file(WRITE ${DST} "${source_data}") ================================================ FILE: cmake/ncnn_generate_shader_comp_header.cmake ================================================ # must define SHADER_COMP_HEADER SHADER_SRC file(READ ${SHADER_SRC} comp_data) # skip leading comment string(FIND "${comp_data}" "#version" version_start) if(NOT ${version_start} EQUAL -1) string(SUBSTRING "${comp_data}" ${version_start} -1 comp_data) endif() # remove whitespace string(REGEX REPLACE "\n +" "\n" comp_data "${comp_data}") # remove comments string(REGEX REPLACE "//[^\n]*" "" comp_data "${comp_data}") # replace more spaces to one space string(REGEX REPLACE "[ \t]+" " " comp_data "${comp_data}") # remove empty line string(REGEX REPLACE "\n[\n]+" "\n" comp_data "${comp_data}") get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE) # text to hex file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt "${comp_data}") file(READ ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt comp_data_hex HEX) string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," comp_data_hex ${comp_data_hex}) string(FIND "${comp_data_hex}" "," tail_comma REVERSE) string(SUBSTRING "${comp_data_hex}" 0 ${tail_comma} comp_data_hex) file(WRITE ${SHADER_COMP_HEADER} "static const char ${SHADER_SRC_NAME_WE}_comp_data[] = {${comp_data_hex}};\n") ================================================ FILE: cmake/ncnn_generate_xtheadvector_source.cmake 
================================================ # must define SRC DST CLASS file(READ ${SRC} source_data) # replace string(TOUPPER ${CLASS} CLASS_UPPER) string(TOLOWER ${CLASS} CLASS_LOWER) string(REGEX REPLACE "LAYER_${CLASS_UPPER}_RISCV_H" "LAYER_${CLASS_UPPER}_RISCV_XTHEADVECTOR_H" source_data "${source_data}") string(REGEX REPLACE "${CLASS}_riscv" "${CLASS}_riscv_xtheadvector" source_data "${source_data}") string(REGEX REPLACE "#include \"${CLASS_LOWER}_riscv.h\"" "#include \"${CLASS_LOWER}_riscv_xtheadvector.h\"" source_data "${source_data}") file(WRITE ${DST} "${source_data}") ================================================ FILE: cmake/run_test.cmake ================================================ execute_process(COMMAND $ENV{TESTS_EXECUTABLE_LOADER} $ENV{TESTS_EXECUTABLE_LOADER_ARGUMENTS} ${TEST_EXECUTABLE} $ENV{TESTS_ARGUMENTS} RESULT_VARIABLE result) if(NOT "${result}" STREQUAL "0") message(FATAL_ERROR "Test failed with return value '${result}'") endif() ================================================ FILE: codeformat.sh ================================================
#!/usr/bin/env bash

# we run clang-format and astyle twice to get stable format output

# Formats the C/C++ sources in place: clang-format over every matching file,
# then astyle per source tree. Vendored code (pybind11, stb_image, ruapu) is
# left alone.
# Note: the function's exit status is that of the last astyle call only.
format_code() {
    # The -name alternatives are grouped so that -type f applies to all of
    # them; without the parentheses it only constrains the '*.c' test.
    find src/ tools/ tests/ examples/ benchmark/ python/ -type f \( -name '*.c' -o -name '*.cpp' -o -name '*.cc' -o -name '*.h' \) | grep -v python/pybind11 | grep -v stb_image | grep -v ruapu | xargs -I {} clang-format -i {}
    astyle -n -r "benchmark/*.h,*.cpp,*.cc" "tests/*.h,*.cpp,*.cc" "tools/*.h,*.cpp,*.cc" "examples/*.h,*.cpp,*.cc"
    astyle -n -r "src/*.h,*.cpp,*.cc" --exclude=src/stb_image.h --exclude=src/stb_image_write.h --exclude=src/ruapu.h
    astyle -n -r "python/*.h,*.cpp,*.cc" --exclude=python/pybind11
}

format_code || { echo 'Formatting failed' ; exit 1; } #first time execute
format_code || { echo 'Formatting failed' ; exit 1; } #second time execute
================================================ FILE: docs/Home.md ================================================ ### input data and extract
output ```cpp #include #include #include "net.h" int main() { cv::Mat img = cv::imread("image.ppm", CV_LOAD_IMAGE_GRAYSCALE); int w = img.cols; int h = img.rows; // subtract 128, norm to -1 ~ 1 ncnn::Mat in = ncnn::Mat::from_pixels_resize(img.data, ncnn::Mat::PIXEL_GRAY, w, h, 60, 60); float mean[1] = { 128.f }; float norm[1] = { 1/128.f }; in.substract_mean_normalize(mean, norm); ncnn::Net net; net.load_param("model.param"); net.load_model("model.bin"); ncnn::Extractor ex = net.create_extractor(); ex.input("data", in); ncnn::Mat feat; ex.extract("output", feat); return 0; } ``` ### print Mat content ```cpp void pretty_print(const ncnn::Mat& m) { for (int q=0; q normed_feats(m.c); for (int i=0; i(y); uchar* sp = normed_feats[i].ptr(y); for (int x=0; x /sys/class/power_supply/battery/charging_enabled current(μA) = adb shell cat /sys/class/power_supply/battery/current_now (multiply -1 for 810 chip) voltage(μV) = adb shell cat /sys/class/power_supply/battery/voltage_now power consumption(mW) = current / 1000 * voltage / 1000 / 1000 performance per watt(1000fps/W) = fps / power consumption * 1000 *** The binary size after debug stripping ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/1.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/2.jpg) *** squeezenet ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/3.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/4.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/5.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/6.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/7.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/8.jpg) *** mobilenet ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/9.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/10.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/11.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/12.jpg) 
![](https://github.com/nihui/ncnn-assets/raw/master/20180413/13.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/14.jpg) *** vgg16 ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/15.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/16.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/17.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/18.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/19.jpg) ![](https://github.com/nihui/ncnn-assets/raw/master/20180413/20.jpg) ================================================ FILE: docs/benchmark/vulkan-conformance-test.md ================================================ |device|gpu|api version|driver version|squeezenet|mobilenetssd|yolov3| |---|---|---|---|---|---|---| |intel-i7-7700|Intel(R) HD Graphics 630 (Kaby Lake GT2)|1.1.90|18.3.4|y|y|y| |GTX-1060|GeForce GTX 1060 3GB|1.1.95|418.172.0|y|y|y| |AMD-Radeon R9 M290X|AMD RADV PITCAIRN (LLVM 7.0.1)|1.1.70|18.3.4|y|y|y| |iphone-5s|Apple A7 GPU|1.0.82|0.2.1825|y|y|y| |huawei-nexus6p|Adreno (TM) 430|1.0.49|35.601.2388|y|y|y| |vivo-y1731ca|Adreno (TM) 505|1.0.61|37.845.1429|y|n|n| |vivo-y85a|Adreno (TM) 506|1.0.61|2.944.3349|y|n|n| |vivo-x9s|Adreno (TM) 510|1.0.61|42.917.1172|y|y|y| |meizu-15|Adreno (TM) 512|1.0.38|29.189.223|n|n|n| |chuizi-jianguo-pro2|Adreno (TM) 512|1.0.38|21.219.2615|n|n|n| |xiaomi-note3|Adreno (TM) 512|1.0.38|39.369.2305|n|n|n| |oppo-r11|Adreno (TM) 512|1.0.38|42.977.756|n|n|n| |xiaomi-6x|Adreno (TM) 512|1.0.61|14.322.3739|y|y|y| |oppo-r11s+|Adreno (TM) 512|1.0.61|35.1004.3936|y|y|y| |vivo-x20a|Adreno (TM) 512|1.0.61|43.10.3141|y|y|y| |vivo-v1816a|Adreno (TM) 512|1.0.61|43.10.3141|y|y|y| |vivo-z1|Adreno (TM) 512|1.0.61|43.10.3141|y|y|y| |xiaomi-redmi-note5|Adreno (TM) 512|1.0.61|63.219.2354|y|y|y| |google-pixel|Adreno (TM) 530|1.1.87|512.354.0|y|y|y| |nubia-z17|Adreno (TM) 540|1.0.38|1.28.32|n|n|n| |samsung-galaxys8+|Adreno (TM) 540|1.0.61|29.896.3583|y|y|y|
|oneplus-5t|Adreno (TM) 540|1.0.61|18.1023.2233|y|y|y| |google-pixel2|Adreno (TM) 540|1.1.66|512.313.0|y|y|y| |essential-ph-1|Adreno (TM) 540|1.1.66|512.319.0|y|y|y| |vivo-x23|Adreno (TM) 615|1.0.66|33.870.3328|y|y|y| |vivo-v1813ba|Adreno (TM) 615|1.0.66|33.870.3328|y|y|y| |xiaomi-8se|Adreno (TM) 616|1.0.66|30.913.18|y|y|y| |vivo-nex-a|Adreno (TM) 616|1.0.66|33.870.3328|y|y|y| |xiaomi-mix2s|Adreno (TM) 630|1.0.61|4.91.2976|y|y|y| |heisha-SKR-A0|Adreno (TM) 630|1.0.61|36.173.3586|y|y|y| |heisha-SKR-A0|Adreno (TM) 630|1.0.66|47.448.1532|y|y|y| |oneplus-6|Adreno (TM) 630|1.1.66|512.324.0|y|y|y| |vivo-iQOO|Adreno (TM) 640|1.1.87|512.361.0|y|y|y| |meitu-m8s|Mali-T880|1.0.14|500.910.1017|n|n|n| |huawei-p10|Mali-G71|1.0.53|151.949.2145|n|n|n| |huawei-mate9|Mali-G71|1.0.53|151.949.2145|n|n|n| |oppo-a73|Mali-G71|1.0.47|575.795.1934|n|n|n| |vivo-y97|Mali-G72|1.0.58|240.537.3580|n|n|n| |huawei-mate10|Mali-G72|1.0.66|14.0.0|y|y|y| |huawei-v10|Mali-G72|1.0.66|14.0.0|y|y|y| |huawei-vce-al00|Mali-G72|1.0.66|14.0.0|y|y|y| |huawei-mate20|Mali-G76|1.0.66|14.0.0|y|y|y| |huawei-pct-al10|Mali-G76|1.0.66|14.0.0|y|y|y| ================================================ FILE: docs/developer-guide/aarch64-mix-assembly-and-intrinsic.md ================================================ ```c // v寄存器全部使用 %.4s // 128-bit vreg matches %.4s // a += b * c float32x4_t _a = vld1q_f32(a); float32x4_t _b = vld1q_f32(b); float32x4_t _c = vld1q_f32(c); asm volatile( "fmla %0.4s, %2.4s, %3.4s" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` ```c // v寄存器使用低64位 %.2s // low 64-bit vreg matches %.2s // a += b * c float32x2_t _a = vld1_f32(a); float32x2_t _b = vld1_f32(b); float32x2_t _c = vld1_f32(c); asm volatile( "fmla %0.2s, %2.2s, %3.2s" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` ```c // v寄存器单路使用 %.s[0] %.s[1] %.s[2] %.s[3] // 32-bit register matches %.s[0] // a += b * c[0] // a += b * c[1] // a += b * c[2] // a += b * c[3] float32x4_t _a = vld1q_f32(a); float32x4_t
_b = vld1q_f32(b); float32x4_t _c = vld1q_f32(c); asm volatile( "fmla %0.4s, %2.4s, %3.s[0] \n" "fmla %0.4s, %2.4s, %3.s[1] \n" "fmla %0.4s, %2.4s, %3.s[2] \n" "fmla %0.4s, %2.4s, %3.s[3] \n" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` qwq ================================================ FILE: docs/developer-guide/add-custom-layer.zh.md ================================================ # NCNN增加自定义层 ## 举例 这里举个例子添加自定义层次 如Relu6,即 std::min(6.f, std::max(0.f, val)) ``` Input input 0 1 input Convolution conv2d 1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768 Relu6 relu6 1 1 conv2d relu6 Pooling maxpool 1 1 relu6 maxpool 0=0 1=3 2=2 3=-233 4=0 ``` ## 定义源码h文件:src/layer/relu6.h ```CPP #ifndef LAYER_RELU6_H #define LAYER_RELU6_H #include "layer.h" namespace ncnn { class Relu6 : public Layer { public: Relu6(); virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; }; } // namespace ncnn #endif // LAYER_RELU6_H ``` ## 定义源码CPP文件:src/layer/relu6.cpp ```CPP #include "relu6.h" #include namespace ncnn { Relu6::Relu6() { one_blob_only = true; support_inplace = true; } int Relu6::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; int size = w * h; #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q < channels; q++) { float* ptr = bottom_top_blob.channel(q); for (int i=0; i weights(0); int ret = test_layer("Relu6", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_relu6 failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); } return ret; } static int test_relu6_0() { return 0 || test_relu6(RandomMat(5, 7, 24)) || test_relu6(RandomMat(7, 9, 12)) || test_relu6(RandomMat(3, 5, 13)); } static int test_relu6_1() { return 0 || test_relu6(RandomMat(15, 24)) || test_relu6(RandomMat(17, 12)) || test_relu6(RandomMat(19, 15)); } static int test_relu6_2() { return 0 || test_relu6(RandomMat(128)) || test_relu6(RandomMat(124)) ||
test_relu6(RandomMat(127)); } int main() { SRAND(7767517); return 0 || test_relu6_0() || test_relu6_1() || test_relu6_2(); } ``` ## 修改tests/CMakeLists.txt 注册Relu6测试用例 ```CPP ncnn_add_layer_test(LSTM) ncnn_add_layer_test(Yolov3DetectionOutput) ncnn_add_layer_test(Relu6) ``` ## 编译 ``` 按原NCNN步骤编译 ``` ## 单元测试 ``` ./test_relu6 ``` ================================================ FILE: docs/developer-guide/arm-a53-a55-dual-issue.md ================================================ ## natural assembly * no register dependency, no penalty ``` ld1 {v0.4s}, [r0], #16 fmla v10.4s, v16.4s, v24.s[0] fmla v11.4s, v16.4s, v24.s[1] fmla v12.4s, v16.4s, v24.s[2] fmla v13.4s, v16.4s, v24.s[3] ``` ## A53 * 128bit vector load cannot be dual issued with fmla, wait 2 cycles * 64bit vector load cannot be dual issued with fmla, wait 1 cycle * 64bit integer load can be dual issued with fmla, no penalty * pointer update can be dual issued with fmla, no penalty * 64bit vector load and 64bit vector insert can be dual issued, no penalty * any vector load cannot be issued on the 4th cycle of each fmla (enters the accumulator pipeline) ### practical guide * use 64bit vector load only * issue vector load every three fmla * 1 cycle to load 64bit, dual issue with the previous interleaved 64bit insert * load the remaining 64bit into integer register, dual issue with fmla * update pointer, dual issue with fmla * insert 64bit into vector from integer register, dual issue with the next interleaved 64bit load * add nop every three fmla if no load, seems to be faster ``` ldr d0, [r0] // 1 cycle, v0 first 64bit fmla ldr x23, [r0, #8] // 0 cycle, v0 second 64bit to temp register fmla add r0, r0, #16 // 0 cycle, update pointer fmla ldr d1, [r0] // 1 cycle, v1 first 64bit ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete fmla ldr x23, [r0, #8] // 0 cycle, v1 second 64bit to temp register fmla add r0, r0, #16 // 0 cycle, update pointer fmla ins v1.d[1], x23 // 1 cycle, v1 second 64bit complete nop fmla 
fmla fmla nop nop fmla fmla fmla ``` ## A55 * Limited by the number of neon register read and write ports, most neon instructions cannot be dual-issued. * neon instructions have different latencies * 128bit vector load cannot be issued with fmla, WAR wait 2 cycles * 64bit integer load can be dual issued with fmla, no penalty * pointer update can be dual issued with fmla, no penalty * 64bit vector insert can be dual issued with fmla, no penalty ### practical guide * A55 supports 128bit load and 256bit write in one clock. Support dual emission of two 64bit vector loads or single emission of 128bit vector load * `ldr`, dual issue with fmla * load the remaining 64bit into integer register, dual issue with fmla * update pointer, dual issue with fmla * insert 64bit into vector from integer register, dual issue with fmla * interleaved load loose register dependency * nop trick is not needed * Loop unrolling fma reduces pipeline bubbles * Some data type conversion neon instructions can be dual issued, such as `fsvts` ``` ldr d0, [r0] // 0 cycle, v0 first 64bit fmla ldr x23, [r0, #8] // 0 cycle, v0 second 64bit to temp register fmla add r0, r0, #16 // 0 cycle, update pointer fmla ldr d1, [r0] // 0 cycle, v1 first 64bit fmla ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete fmla ldr x23, [r0, #8] // 0 cycle, v1 second 64bit to temp register fmla add r0, r0, #16 // 0 cycle, update pointer fmla ins v1.d[1], x23 // 0 cycle, v1 second 64bit complete fmla ``` ================================================ FILE: docs/developer-guide/armv7-mix-assembly-and-intrinsic.md ================================================ ```c // d寄存器全部使用 %P // d reg matches %P // a += b * c float32x2_t _a = vld1_f32(a); float32x2_t _b = vld1_f32(b); float32x2_t _c = vld1_f32(c); asm volatile( "vmla.f32 %P0, %P2, %P3" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` ```c // q寄存器全部使用 %q // q reg matches %q // a += b * c float32x4_t _a = vld1q_f32(a); float32x4_t _b = vld1q_f32(b); 
float32x4_t _c = vld1q_f32(c); asm volatile( "vmla.f32 %q0, %q2, %q3" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` ```c // d寄存器单路使用 %P[0] %P[1] // 32bit d reg matches %P[0] // a += b * c[0] // a += b * c[1] float32x2_t _a = vld1_f32(a); float32x2_t _b = vld1_f32(b); float32x2_t _c = vld1_f32(c); asm volatile( "vmla.f32 %P0, %P2, %P3[0] \n" "vmla.f32 %P0, %P2, %P3[1] \n" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` ```c // q寄存器单路使用 %e[0] %e[1] %f[0] %f[1] // 32-bit q reg matches %e[0] // a += b * c[0] // a += b * c[1] // a += b * c[2] // a += b * c[3] float32x4_t _a = vld1q_f32(a); float32x4_t _b = vld1q_f32(b); float32x4_t _c = vld1q_f32(c); asm volatile( "vmla.f32 %q0, %q2, %e3[0] \n" "vmla.f32 %q0, %q2, %e3[1] \n" "vmla.f32 %q0, %q2, %f3[0] \n" "vmla.f32 %q0, %q2, %f3[1] \n" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` ```c // q寄存器拆分d寄存器使用 %e %f // use %e %f to split q reg into two d regs // a += b * c[0]c[1] // a += b * c[2]c[3] float32x2_t _a = vld1_f32(a); float32x2_t _b = vld1_f32(b); float32x4_t _c = vld1q_f32(c); asm volatile( "vmla.f32 %P0, %P2, %e3 \n" "vmla.f32 %P0, %P2, %f3 \n" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` ```c // d寄存器声明绑定 // specify concrete d reg which want to save // vmla.f32 d0, d2, d4 register float32x2_t _a asm("d0") = vld1_f32(a); register float32x2_t _b asm("d2") = vld1_f32(b); register float32x2_t _c asm("d4") = vld1_f32(c); asm volatile( "vmla.f32 %P0, %P2, %P3" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` ```c // q寄存器声明绑定 // bind q reg with data // vmla.f32 q0, q1, q2 register float32x4_t _a asm("q0") = vld1q_f32(a); register float32x4_t _b asm("q1") = vld1q_f32(b); register float32x4_t _c asm("q2") = vld1q_f32(c); asm volatile( "vmla.f32 %q0, %q2, %q3" : "=w"(_a) // %0 : "0"(_a), "w"(_b), // %2 "w"(_c) // %3 : ); ``` 如果不是因为编译器的bug,寄存器绑定是用不着的,然而。。。 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41538 qwq
================================================ FILE: docs/developer-guide/binaryop-broadcasting.md ================================================ ### broadcasting rule ncnn BinaryOp accepts blobs with different shape C = BinaryOp(A, B) shape notation convention is [w], [w,h], [w,h,c], [w,h,d,c] * binaryop with scalar and scalar-like |A|B|C| |---|---|---| |[2]|scalar / [1]|[2]| |[2,3]|scalar / [1] / [1,1]|[2,3]| |[2,3,4]|scalar / [1] / [1,1] / [1,1,1]|[2,3,4]| |[2,3,4,5]|scalar / [1] / [1,1] / [1,1,1] / [1,1,1,1]|[2,3,4,5]| * no broadcast |A|B|C| |---|---|---| |[2]|[2]|[2]| |[2,3]|[2,3]|[2,3]| |[2,3,4]|[2,3,4]|[2,3,4]| |[2,3,4,5]|[2,3,4,5]|[2,3,4,5]| * explicit broadcast B |A|B|C| |---|---|---| |[2,3]|[1,3]|[2,3]| |[2,3]|[2,1]|[2,3]| |[2,3,4]|[1,3,4]|[2,3,4]| |[2,3,4]|[2,1,4]|[2,3,4]| |[2,3,4]|[2,3,1]|[2,3,4]| |[2,3,4]|[1,1,4]|[2,3,4]| |[2,3,4]|[1,3,1]|[2,3,4]| |[2,3,4]|[2,1,1]|[2,3,4]| |[2,3,4,5]|[1,3,4,5]|[2,3,4,5]| |[2,3,4,5]|[2,1,4,5]|[2,3,4,5]| |[2,3,4,5]|[2,3,1,5]|[2,3,4,5]| |[2,3,4,5]|[2,3,4,1]|[2,3,4,5]| |[2,3,4,5]|[1,1,4,5]|[2,3,4,5]| |[2,3,4,5]|[1,3,1,5]|[2,3,4,5]| |[2,3,4,5]|[1,3,4,1]|[2,3,4,5]| |[2,3,4,5]|[2,1,1,5]|[2,3,4,5]| |[2,3,4,5]|[2,1,4,1]|[2,3,4,5]| |[2,3,4,5]|[2,3,1,1]|[2,3,4,5]| |[2,3,4,5]|[1,1,1,5]|[2,3,4,5]| |[2,3,4,5]|[1,1,4,1]|[2,3,4,5]| |[2,3,4,5]|[1,3,1,1]|[2,3,4,5]| |[2,3,4,5]|[2,1,1,1]|[2,3,4,5]| * implicit broadcast B for inner axis It broadcasts in the opposite direction of the numpy's implicit broadcasting behavior. pnnx will insert reshape operator at the appropriate position to convert it to explicit broadcast automatically. |A|B|C| |---|---|---| |[2,3]|[3]|[2,3]| |[2,3,4]|[4]|[2,3,4]| |[2,3,4]|[3,4]|[2,3,4]| |[2,3,4,5]|[5]|[2,3,4,5]| |[2,3,4,5]|[4,5]|[2,3,4,5]| |[2,3,4,5]|[3,4,5]|[2,3,4,5]| * implicit broadcast B with 1 dimension rank for outer axis This exists only for compatibility. When the size is the same, eg. [2,2] and [2], broadcast B for inner axis will be prioritized. 
|A|B|C| |---|---|---| |[2,3]|[2]|[2,3]| |[2,3,4]|[2]|[2,3,4]| |[2,3,4,5]|[2]|[2,3,4,5]| ================================================ FILE: docs/developer-guide/build-ncnn-on-windows-xp.zh.md ================================================ # Build ncnn on Windows XP > **Contributors:** [@Sugar-Baby](https://github.com/Sugar-Baby) and [@AtomAlpaca](https://github.com/AtomAlpaca) ## 0. 环境准备 #### 0.1 虚拟机设置 我使用的是[我的MSDN](https://www.imsdn.cn/)提供的[Windows XP SP3 x64版本](https://www.imsdn.cn/operating-systems/windows-xp/)。虚拟机使用Oracle VM VirtualBox,内存4GB,存储空间64GB(C盘16GB,D盘48GB)。 **在虚拟机关机的情况下**,点击虚拟机管理器界面的"设置"-"网络"-"高级",将控制芯片改为PCnet-FAST III,混杂模式设置为拒绝,勾选接入网线,点击"OK"保存。重启虚拟机就可以连接上网络了。 点击虚拟机界面的"设备"-"安装增强功能...",在虚拟机中进入"我的电脑",刷新后出现"VirtualBox Guest Additions (D: )",右键选择"自动播放",完成安装后重启。 点击虚拟机界面的"设备"-"共享粘贴板",设置为"双向"。点击"设备"-"共享文件夹"-"共享文件夹..",点击右侧加号,在"共享文件夹路径"中选择"其他...",然后选择需要共享的主机文件夹。勾选"自动挂载"和"固定分配",点击"OK"保存。在虚拟机中进入"我的电脑",刷新后出现'VBoxSvr' 上的 <主机文件夹名称>,双击进入就可以双向传输文件了。 #### 0.2 开发环境配置 浏览器推荐[Mypal 68](https://www.mypal-browser.org/download.html),注意要选择32位版本。Windows XP自带ZIP文件解压。安装后就可以访问互联网了。 从Github下载[w64devkit](https://github.com/skeeto/w64devkit),选择x86版本。这里下载的是一个自解压的7z文件,在虚拟机中解压即可。 在"开始"-"控制面板"-"切换到经典视图"-"系统"-"高级"-"环境变量"-"系统变量"中,选择Path,点击"编辑",在字符串末尾加入一个分号(;),然后粘贴w64devkit下bin文件夹的目录。点击"确定"保存之后可以打开命令提示符输入例如c++的命令验证是否成功加入环境变量。 由于年代过于久远,Git的官方release已经没有兼容Windows XP的版本了。最后一个兼容的版本(1.9.5)可以在[这里](https://www.xiazaiba.com/html/29352.html)下载。 为了使用Git,需要安装[Win32 OpenSSL](https://slproweb.com/products/Win32OpenSSL.html)。选择Win32 OpenSSL Light版本。这个过程中会附带安装VC++ 2022运行时库。 如果因为协议、代理等问题不能在虚拟机中使用Git,也可以下载ZIP版本后在虚拟机中解压。 需要手动下载[CMake最后支持Windows XP的版本](https://github.com/Kitware/CMake/releases/download/v3.10.3/cmake-3.10.3-win32-x86.zip)。建议解压在C:\Program Files下,并且需要设置系统变量,到CMake目录下的bin文件夹。具体可以参考上面w64devkit的方法。 ## 1. 
编译 ### 1.1 使用 MinGW-w64 运行 ```bash cd mkdir build cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/windows-xp-mingw.toolchain.cmake -DNCNN_VULKAN=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF -DCMAKE_BUILD_TYPE=Release -G "MinGW Makefiles" .. make -j2 make install ``` 由于平台性能的限制,Vulkan SDK 最低要求 Windows 7 SP1,XP 无法安装官方驱动和工具链,因此需要关闭Vulkan选项。同时需要使用简化版 OpenCV 替代库NCNN_SIMPLEOCV。 ### 1.2 使用 Clang 需要先配置 MinGW-w64 环境,然后安装 Clang 6.0 或更高版本。 ```bash cd mkdir build cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/windows-xp-clang.toolchain.cmake -DNCNN_SIMPLEOCV=ON -DNCNN_SIMPLEOMP=ON -DNCNN_AVX=OFF -DCMAKE_BUILD_TYPE=Release -G "MinGW Makefiles" .. make -j2 make install ``` ### 1.3 使用 Visual Studio (MSVC) 需要安装支持 Windows XP 的 v141_xp 工具集: 1. 打开 Visual Studio 安装程序(工具 → 获取工具和功能) 2. 选择"使用 C++ 的桌面开发" 3. 在摘要部分选择"对 C++ 的 Windows XP 支持" 4. 点击修改 ```bash cd mkdir build cd build cmake -A WIN32 -G "Visual Studio 17 2022" -T v141_xp -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_WITH_STATIC_CRT=ON -DCMAKE_TOOLCHAIN_FILE=../toolchains/windows-xp-msvc.toolchain.cmake .. cmake --build . --config Release -j 2 cmake --build . --config Release --target install ``` ## 2. 测试 ### 2.1 benchncnn 将benchmark目录下的所有文件复制到build/benchmark目录下。在命令提示符中cd到build/benchmark, 然后运行 ```bash benchncnn [测试的循环次数] [线程数] [节能模式] ``` 其中,节能模式取值为0时关闭,为1时打开。 ### 2.2 examples 从[这里](https://github.com/nihui/ncnn-assets/tree/master/models)可以下载到所有需要的param和bin文件。需要注意的是,ZF_faster_rcnn_final.bin开头的三个文件(.zip,.z01,.z02)最好先放在主机上解压出bin文件再传进虚拟机。 把这些文件放在build/examples目录下。 我写了一个bat脚本来批量测试这些模型: ```batch @echo off setlocal enabledelayedexpansion set EXAMPLES_DIR=\BUILD\EXAMPLES set IMAGE_PATH=\IMAGES\256-ncnn.png set LOG_FILE=test_results.log echo NCNN Examples Test Results > %LOG_FILE% echo ========================= >> %LOG_FILE% echo Test started: %date% %time% >> %LOG_FILE% echo. >> %LOG_FILE% for %%f in ("%EXAMPLES_DIR%\*.exe") do ( set EXE_NAME=%%~nf set EXE_PATH=%%f echo Testing: !EXE_NAME! 
>> %LOG_FILE% echo -------------------------------- >> %LOG_FILE% !EXE_PATH! "%IMAGE_PATH%" >> %LOG_FILE% 2>&1 if errorlevel 1 ( echo [ERROR] !EXE_NAME! failed to run. >> %LOG_FILE% ) else ( echo [SUCCESS] !EXE_NAME! completed. >> %LOG_FILE% ) echo. >> %LOG_FILE% ) echo Test finished: %date% %time% >> %LOG_FILE% echo Results saved to %LOG_FILE% endlocal ``` 把这个bat脚本放在build/examples目录下,替换掉所有的``,双击运行。通过生成的test_results.log即可查看所有模型的结果。 通过修改`set IMAGE_PATH=\IMAGES\256-ncnn.png`中的路径来更换需要测试的文件。 ================================================ FILE: docs/developer-guide/custom-allocator.md ================================================ Mat structure is now allocator-aware via an extra allocator parameter with default zero value. The good-old ncnn::fastMalloc()/ncnn::fastFree() will be used for a null allocator. You could pass a custom allocator to delegate all memory allocation and deallocation. ```cpp class Allocator { public: virtual void* fastMalloc(size_t size) = 0; virtual void fastFree(void* ptr) = 0; }; ``` ncnn has already implemented two simple pooled Allocator class, with mutex lock or without it. 
```cpp ncnn::PoolAllocator locked_mempool; ncnn::UnlockedPoolAllocator unlocked_mempool; ``` the two allocator types in ncnn * blob allocator used to allocate memory for all named blobs, which you could retrieve by Extractor::extract() * workspace allocator used to allocate memory for internal temporary use in layer implementation, such as the temp blob after padding in convolution by default, all Extractor instance use the two allocator in the default option You can alter them by ncnn::set_default_option() or you can set them per Extractor by Extractor::set_blob_allocator()/Extractor::set_workspace_allocator() blob allocator is guaranteed to be called in-order in layer implementation during each Extractor lifecycle while workspace allocator may be called synchronously the practical usage * one network, one-by-one inference shared unlocked blob allocator for all Extractor shared locked workspace allocator for all Extractor * one network, concurrent inference shared unlocked blob allocator for all Extractor in each thread shared locked workspace allocator for all Extractor among all threads * concurrent multiple networks, one-by-one inference for each network shared unlocked blob allocator for all Extractor of each network shared locked workspace allocator for all Extractor among all networks (for saving memory) * concurrent multiple networks, concurrent inference for each network shared unlocked blob allocator for all Extractor of each network in each thread shared locked workspace allocator for all Extractor among all networks (for saving memory) ================================================ FILE: docs/developer-guide/element-packing.md ================================================ ### what is packing and why packing is the form of storing multiple short-sized values as one long-sized value. element packing is well mapped with the underlying simd register, which usually use one very wide register to store different types of values. 
|C|elemsize|elempack| |---|---|---| |double|8|1| |float|4|1| |int|4|1| |short|2|1| |signed char|1|1| |arm neon|elemsize|elempack| |---|---|---| |float64x2_t|16|2| |float32x4_t|16|4| |int32x4_t|16|4| |float16x4_t|8|4| |int8x8_t|8|8| Though the real count of values doubles when elempack is two, the wide-sized value is still treated as one value in the view of Mat structure. For example, we want to store 40 float values in Mat object, if elempack 1 is used, Mat width is then 40, while 10 if elempack 4 is used. |dims|w|h|c|cstep|elemsize|elempack| |---|---|---|---|---|---|---| |1|40|1|1|40|4|1| |1|10|1|1|10|16|4| ### packing style convention In practice, elempack 1, 4, 8 are the most common cases. It is possible to use any other packing style in theory. The following table show the packing axis used in ncnn for different dimension. |dims|packing axis|shape before packing|shape after packing| |---|---|---|---| |1|w|w|w/elempack| |2|h|w, h|w, h/elempack| |3|c|w, h, c|w, h, c/elempack| If the packing axis dim is not evenly divisible by elempack, zero padding may be used. 
``` outw = (w + elempack - 1) / elempack; ``` The following snippet shows the memory layout after elempack=4 on 3-dim Mat ``` // w=2 h=3 c=4 elempack=1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 // w=2 h=3 c=1 elempack=4 (0,6,12,18) (1,7,13,19) (2,8,14,20) (3,9,15,21) (4,10,16,22) (5,11,17,23) ``` ### how to convert elempack There is a convenient wrapper function provided ``` // convert to elempack 4 if packing axis dim is evenly divisible by elempack // return the identity Mat otherwise ncnn::Mat a; ncnn::Mat a_packed; ncnn::convert_packing(a, a_packed, 4); if (a_packed.elempack == 4) { // check if packing is successful } // convert to packing 1, aka unpacking, shall be always successful ncnn::Mat b; ncnn::Mat b_unpacked; ncnn::convert_packing(b, b_unpacked, 1); ``` ### handle general interleaved data Here is an example of using convert packing to convert RGB interleaved data to planar **NOTE:** The following code is just presented to explain what packing is and the conversion process. Do not use it in production due to its poor performance. Do use ncnn::Mat::from_pixels() ```cpp // rgb_interleaved_u8 is RGB RGB RGB ... // rgb_interleaved_u8.w = w; // rgb_interleaved_u8.h = h; // rgb_interleaved_u8.c = 1; // rgb_interleaved_u8.elemsize = 3; // rgb_interleaved_u8.elempack = 3; ncnn::Mat rgb_interleaved_u8(w, h, 1, 3, 3); ncnn::Mat rgb_planar_u8; ncnn::convert_packing(rgb_interleaved_u8, rgb_planar_u8, 1); // rgb_planar_u8 is now RRR ... GGG ... BBB ... 
// rgb_planar_u8.w = w; // rgb_planar_u8.h = h; // rgb_planar_u8.c = 3; // rgb_planar_u8.elemsize = 1; // rgb_planar_u8.elempack = 1; ``` ================================================ FILE: docs/developer-guide/expression.md ================================================ ### expression expression is used in the reshape slice parameter to express the dynamic shape or subscript value based on the expression formula and input shape Compared with directly converting the expression calculation process into multiple operators, the motivation for using expression * No additional shape concat and other operators will be generated due to dynamic calculation, which greatly reduces the number of layers of the ncnn model and makes it easier to view the model structure and modify expression * Shape or subscript evaluations are usually single-digit operations, which are more suitable for direct completion on the CPU without layout conversion and kernel call overhead In the param file, `Reshape` layer can contain 6=expression The pnnx tool can automatically convert `pnnx.Expression` to the expr parameter of ncnn `Reshape` * Convert to 0w, 0h, 0d or 0c according to the input shape rank and `size(@0,1)` * Automatically remove the batch dimension according to the input batch index * Convert `pnnx.Expression` and `Tensor.reshape`/`Tensor.view` two operators are fused into ncnn `Reshape` * Automatically summarize the number of references, exclude duplicate references and sort the indexes of references * Convert the customary shape representation order, such as CHW to WHC Example pnnx.param where A and B are 3D tensors ``` pnnx.Expression expr 2 1 A B shape expr=[add(size(@1,0),2),mul(size(@0,1),2),-1] Tensor.reshape reshape 2 1 A shape out ``` pnnx.py ```python shape = [(B.size(0) + 2), (A.size(1) * 2), -1] out = A.reshape(*shape) ``` Converted to ncnn.param ``` Reshape reshape 2 1 A B out 6="-1,*(0h,2),+(1c,2)" ``` ### syntax Use infix expression, format is `op(arg0,arg1,...)`, 
multiple operations can be nested, multiple sizes are separated by commas, and numbers can be integers or decimals Among them, the commonly used `add` `sub` `mul` `div` `floor_div` are abbreviated as `+` `-` `*` `/` `//`, and other arithmetic operations use names, such as `sin` `ceil` `max`, etc. * `max(2,3)` * `floor(sin(3.14))` * `+(*(-2,1),10)` means (-2 * 1) + 10 * `1,2,+(3,2)` list can represent output shape with 3-rank The input shape can be referenced at runtime, format is `id(w|h|d|c)`, the maximum id is 9, which means that up to 10 inputs can be referenced Assuming that the Reshape layer has two input blobs, A and B, then * `0w,1h` means A.w, B.h * `*(+(0c,1c),2)` means (A.c + B.c) * 2 ### helper api ```cpp #include "expression.h" int count_expression_blobs(const std::string& expr); int eval_list_expression(const std::string& expr, const std::vector& blobs, std::vector& outlist); ``` * `count_expression_blobs` Pass expression to get the number of inputs it references, such as `0w,1h` returns 2 * `eval_list_expression` Evaluate the result list according to expression and input blob calculate. If the calculation result is a floating point number, it will be automatically truncated to an integer. 
### supported operator |type|operators| |---|---| |float to int|`trunc` `ceil` `floor` `round`| |binary arithmetic|`+` `-` `*` `/` `//` `max` `min` `pow` `fmod` `remainder` `atan2` `logaddexp`| |unary arithmetic|`abs` `neg` `sign` `square` `sqrt` `rsqrt` `reciprocal` `exp` `log` `log10` `sin` `asin` `cos` `acos` `tan` `atan` `sinh` `asinh` `cosh` `acosh` `tanh` `atanh`| |integer bitwise|`and` `or` `xor` `lshift` `rshift`| ================================================ FILE: docs/developer-guide/glsl-extension.md ================================================ # ncnn GLSL extension ## rationale Different GPUs support different features, some support fp16 as buffer storage type, some support fp16 as operand variable, some old GPUs only support fp32 When the GPU supports the `VK_KHR_16bit_storage` extension, in order to minimize the memory bandwidth consumption of the GPU, we will give priority to using fp16 as the storage type. Otherwise, we use `packHalf2x16` and `unpackHalf2x16` in GLSL 4.2 to compress 2 fp32 to uint, reducing read and write bandwidth. Similarly, when the gpu supports the `VK_KHR_shader_float16_int8` extension, in order to speed up the calculation efficiency, we will give priority to using fp16 as the operation operand, which usually doubles the speed. Otherwise, we use fp32. 
To ensure the widest compatibility, the following code would have to be written to declare the descriptor binding and load data
We also provide some buffer and image copy functions to prevent loss of precision when using fp16 as the intermediate data type, and to avoid unnecessary `unpackHalf2x16` and `packHalf2x16` pair. # entrypoint for compiling GLSL The gpu.h header in the ncnn library exposes 3 APIs for compiling glsl code into spir-v binary, they support ncnn glsl extension, these 3 functions accept opt switch to control the expansion form of ncnn glsl extension. The first two accept raw glsl code strings, and the last one is used to create ncnn's built-in shader. ```cpp namespace ncnn { // online spirv compilation NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector& spirv); NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector& spirv); NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector& spirv); } // namespace ncnn ``` ## compile ncnn extended GLSL code directly You can write shader code with ncnn glsl extension, compiled to spir-v using ncnn functions. 
The compiled product is a standard-compliant spir-v binary, which can be directly used to create a pipeline object in the vulkan api ```cpp static const char my_glsl_data[] = R"( #version 450 layout (binding = 0) readonly buffer a_blob { sfpvec4 a_blob_data[]; }; layout (binding = 1) writeonly buffer b_blob { sfpvec4 b_blob_data[]; }; void main() { const int i = int(gl_GlobalInvocationID.x); afpvec4 v = buffer_ld4(a_blob_data, i); v = v + 123; buffer_st4(b_blob_data, i, v); } )"; Option opt; // you can control the extension behavior // even if the gpu supports 16bit storage opt.use_fp16_storage = false; std::vector spirv; ncnn::compile_spirv_module(my_glsl_data, sizeof(my_glsl_data) - 1, opt, spirv); // To create pipeline object later // ncnn::Pipeline pipeline(vkdev); // pipeline.set_local_size_xyz(64, 1, 1); // pipeline.create(spirv.data(), spirv.size() * 4, specializations); ``` ## ncnn built-in shader The shader index inside ncnn is exposed in the `layer_shader_type.h` header and can be used if needed ```cpp #include "layer_shader_type.h" int shader_type_index = LayerShaderType::convert_ycbcr; Option opt; std::vector spirv; int retc = compile_spirv_module(shader_type_index, opt, spirv); ``` # data types ## storage type declare buffer data layout in descriptor binding ```c layout (binding = 0) buffer top_blob { sfpvec4 top_blob_data[]; }; ``` |storage type|fp32|fp16p|fp16s|bf16p|bf16s| |---|---|---|---|---|---| |sfp|float|uint|float16_t|uint|bfloat16_t| |sfpvec2|vec2|uint|f16vec2|uint|bf16vec2| |sfpvec4|vec4|uvec2|f16vec4|uvec2|bf16vec4| ## arithmetic type declare local variable in glsl code ```c void main() { afpvec4 v = a * b; } ``` |arithmetic type|fp32|fp16a| |---|---|---| |afp|float|float16_t| |afpvec2|vec2|f16vec2| |afpvec4|vec4|f16vec4| ## local type declare variable in shared local memory ```c shared lfp tmp_a[8][4][2]; ``` |local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|bf16p|bf16s| |---|---|---|---|---|---|---| 
|lfp|float|float|float|float16_t|float|bfloat16_t| |lfpvec4|vec4|uvec2|uint64_t|f16vec4|uvec2|bf16vec4| # buffer functions - load typed value from src[offset] ```c afp buffer_ld1(sfp src, int offset); afpvec2 buffer_ld2(sfpvec2 src, int offset); afpvec4 buffer_ld4(sfpvec4 src, int offset); ``` - store typed value to dst[offset] ```c void buffer_st1(sfp dst, int offset, afp v); void buffer_st2(sfpvec2 dst, int offset, afpvec2 v); void buffer_st4(sfpvec4 dst, int offset, afpvec4 v); ``` - copy typed value from src[src_offset] to dst[dst_offset] ```c void buffer_cp1(sfp dst, int dst_offset, sfp src, int src_offset); void buffer_cp2(sfpvec2 dst, int dst_offset, sfpvec2 src, int src_offset); void buffer_cp4(sfpvec4 dst, int dst_offset, sfpvec4 src, int src_offset); ``` - copy and pack value from src[src_offsets[0],src_offsets[1],...] to dst[dst_offset] ```c void buffer_cp1to4(sfpvec4 dst, int dst_offset, sfp src, ivec4 src_offsets); ``` - copy and unpack value from src[src_offset] to dst[dst_offsets[0],dst_offsets[1],...] ```c void buffer_cp4to1(sfp dst, ivec4 dst_offsets, sfpvec4 src, int src_offset); ``` # local data conversion functions - storage buffer to local memory ```c lfp buffer_sm1(sfp src, int offset); lfpvec4 buffer_sm4(sfpvec4 src, int offset); ``` - local memory to local variable ```c afp lfp2afp(lfp v); afpvec4 lfp2afpvec4(lfpvec4 v); ``` - local variable to local memory ```c lfp afp2lfp(afp v); lfpvec4 afp2lfpvec4(afpvec4 v); ``` Note: The common usage of local memory is to read from global memory first, store it in local memory, and then read local variables from local memory for subsequent use. Therefore, only storage type to local type and local type to arithmetic type conversion functions are provided here. 
# misc functions - prefer specialization constant over push constant ```c T psc(T x) ``` Declare the same variable in specialization constant AND push constant section, then `psc(x)` will become a compile-time constant when specialization constant given non-zero or be dynamic via push constant otherwise. This is often used for tensor shape specialization. We can usually resolve all shape information and make them be compile-time constants for more aggressive shader optimization. ```c layout (constant_id = 0) const int size = 0; layout (push_constant) uniform parameter { int size; } p; void main() { const int s = psc(size); } ``` # platform macros judge if the current platform is moltenvk, for enabling some platform-specific workaround ```c #if NCNN_moltenvk // enable workaround for moltenvk #endif ``` ncnn adds additional macro definitions in the new version, which may conflict or confuse the existing glsl code. In order to obtain cross-version compatibility of ncnn, you can switch between the old and new codes according to the `ncnn_glsl_version` macro version. ```c #if ncnn_glsl_version >= 1 // use device macros introduced since version 1 #endif ``` ncnn additionally defines most of the vulkan device-related features as macros, which we can use to distinguish different platforms, device extensions, features, and properties ### extension macros When the device supports an extension, `ncnn_` is defined as the extension version ```c void main() { #if ncnn_VK_KHR_16bit_storage // here is the code for any device that supports VK_KHR_16bit_storage #endif #if ncnn_VK_KHR_sampler_ycbcr_conversion >= 10 // here is the code for any device that supports VK_KHR_sampler_ycbcr_conversion and version >= 10 #endif } ``` ### device feature and property macros ncnn will query device features and properties and then define them as macros. 
The macro name is `ncnn_<feature name>` or `ncnn_<property name>`
```c void main() { int gx = int(gl_GlobalInvocationID.x); #if ncnn_enable_validation_layer NCNN_LOGE("gx = %d\n", gx); #endif } ``` At runtime, `NCNN_LOGE` will print out the value of `gx` ### option macros enable glsl extension only if user enable some options The `GL_EXT_shader_16bit_storage` extension will be automatically enabled without explicit code indication when the device supports 16-bit storage and the user turns on `opt.use_fp16_storage` or `opt.use_bf16_storage` The `GL_EXT_shader_explicit_arithmetic_types_float16` extension will be automatically enabled without explicit code indication when the device supports 16-bit arithmetic and the user turns on `opt.use_fp16_arithmetic` The `GL_EXT_shader_8bit_storage` extension will be automatically enabled without explicit code indication when the device supports 8-bit storage and the user turns on `opt.use_int8_storage` The `GL_EXT_shader_explicit_arithmetic_types_int8` extension will be automatically enabled without explicit code indication when the device supports 8-bit arithmetic and the user turns on `opt.use_int8_arithmetic` The `GL_EXT_bfloat16` extension will be automatically enabled without explicit code indication when the device supports bfloat16 storage and the user turns on `opt.use_bf16_storage` ```c void main() { #if NCNN_fp16_storage // the user enable fp16 storage option and the device has fp16 storage support #endif #if NCNN_fp16_arithmetic // the user enable fp16 arithmetic option and the device has fp16 arithmetic support #endif } ``` |macro|defined by option| |---|---| |NCNN_fp16_packed|opt.use_fp16_packed| |NCNN_fp16_storage|opt.use_fp16_storage| |NCNN_fp16_arithmetic|opt.use_fp16_arithmetic| |NCNN_int8_packed|opt.use_int8_packed| |NCNN_int8_storage|opt.use_int8_storage| |NCNN_int8_arithmetic|opt.use_int8_arithmetic| |NCNN_bf16_packed|opt.use_bf16_packed| |NCNN_bf16_storage|opt.use_bf16_storage| |NCNN_shader_local_memory|opt.use_shader_local_memory| 
================================================ FILE: docs/developer-guide/glsl-extension.zh.md ================================================ # ncnn GLSL 扩展 ## 理由 不同的 GPU 支持不同的功能,有的支持 fp16 作为缓冲存储类型,有的支持 fp16 作为操作数变量,有的老 GPU 只支持 fp32。 当 GPU 支持 `VK_KHR_16bit_storage` 扩展时,为了尽量减少 GPU 的内存带宽消耗,我们会优先使用 fp16 作为存储类型。否则,我们使用 `packHalf2x16` 和 `unpackHalf2x16` 在 GLSL 4.2 中将 2 个 fp32 压缩为 uint,从而减少读写带宽。 同样,当 GPU 支持 `VK_KHR_shader_float16_int8` 扩展时,为了加快计算效率,我们会优先使用 fp16 作为运算操作数,这通常会使速度翻倍。否则,我们使用 fp32。 为了确保最广泛的兼容性,将编写以下用于声明描述符绑定和加载数据的代码 ```c #if NCNN_fp16_storage // GPU支持 16bit storage layout (binding = 0) buffer blob { f16vec4 blob_data[]; }; #elif NCNN_fp16_packed // GPU支持 GLSL 4.2 layout (binding = 0) buffer blob { uvec2 blob_data[]; }; #else // GPU仅支持 fp32 layout (binding = 0) buffer blob { vec4 blob_data[]; }; #endif void main() { const int i = int(gl_GlobalInvocationID.x); #if NCNN_fp16_storage && NCNN_fp16_arithmetic // GPU支持 16bit storage 和 shader float16 f16vec4 x = blob_data[i]; #elif NCNN_fp16_storage // GPU支持 16bit storage 但不包含 shader float16 vec4 x = vec4(blob_data[i]); #elif NCNN_fp16_packed && NCNN_fp16_arithmetic // GPU支持 GLSL 4.2 和 shader float16 f16vec4 x = f16vec4(unpackFloat2x16(blob_data[i].x), unpackFloat2x16(blob_data[i].y)); #elif NCNN_fp16_packed // GPU支持 GLSL 4.2 vec4 x = vec4(unpackHalf2x16(blob_data[i].x), unpackHalf2x16(blob_data[i].y)); #else // GPU仅支持 fp32 vec4 x = blob_data[i]; #endif } ``` 如您所见,仅声明缓冲区类型并读取值会消耗大量代码行,这是项目维护的噩梦。因此,ncnn 增加了更灵活的数据类型和辅助函数,以减小代码的大小并提高可读性,并且会根据 GPU 支持的功能级别自动扩展到最高效的实现。 上面的代码,通过使用 ncnn GLSL 扩展,可以简化为 ```c layout (binding = 0) buffer blob { sfpvec4 blob_data[]; }; void main() { const int i = int(gl_GlobalInvocationID.x); afpvec4 x = buffer_ld4(blob_data, i); } ``` ncnn GLSL 扩展为存储、计算、共享内存以及缓冲区和图像的加载、存储、转换函数提供了必要的数据类型。我们还提供了一些缓冲区和图像复制函数,以防止在使用 fp16 作为中间数据类型时丢失精度,并避免不必要的 `unpackHalf2x16` 和 `packHalf2x16` 配对。 # 编译GLSL的入口点 ncnn库中的 gpu.h 头文件公开了3个用于将 GLSL 代码编译为 Spir-V 二进制的API函数,它们支持 ncnn GLSL 扩展,这3个函数接受 opt switch 来控制 ncnn GLSL 
扩展形式。前两个函数接受原始 GLSL 代码字符串作为参数,最后一个函数用于创建 ncnn 的已存在的内置着色器。 ```cpp namespace ncnn { // 在线 Spir-V 编译器 NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector& spirv); NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector& spirv); NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector& spirv); } // namespace ncnn ``` ## 直接编译ncnn扩展GLSL代码 您可以使用 ncnn GLSL 扩展编写着色器代码,使用 ncnn 函数编译为 Spir-V。编译后的产品是符合标准的 Spir-V 二进制文件,可以直接用于在 Vulkan API 中创建流水线对象 ```cpp static const char my_glsl_data[] = R"( #version 450 layout (binding = 0) readonly buffer a_blob { sfpvec4 a_blob_data[]; }; layout (binding = 1) writeonly buffer b_blob { sfpvec4 b_blob_data[]; }; void main() { const int i = int(gl_GlobalInvocationID.x); afpvec4 v = buffer_ld4(a_blob_data, i); v = v + 123; buffer_st4(b_blob_data, i, v); } )"; Option opt; // 您可以控制Vulkan扩展行为 // 当GPU支持16位存储的话 opt.use_fp16_storage = false; std::vector spirv; ncnn::compile_spirv_module(my_glsl_data, sizeof(my_glsl_data) - 1, opt, spirv); // 稍后再创建管道对象 // ncnn::Pipeline pipeline(vkdev); // pipeline.set_local_size_xyz(64, 1, 1); // pipeline.create(spirv.data(), spirv.size() * 4, specializations); ``` ## ncnn内置着色器 ncnn内部的着色器索引在标头中公开,如果需要可以使用 `layer_shader_type.h` ```cpp #include "layer_shader_type.h" int shader_type_index = LayerShaderType::convert_ycbcr; Option opt; std::vector spirv; int retc = compile_spirv_module(shader_type_index, opt, spirv); ``` # 数据类型 ## 存储类型(storage type) 在描述符绑定中声明缓冲区数据布局 ```c layout (binding = 0) buffer top_blob { sfpvec4 top_blob_data[]; }; ``` |存储类型|fp32|fp16p|fp16s|bf16p|bf16s| |---|---|---|---|---|---| |sfp|float|uint|float16_t|uint|bfloat16_t| |sfpvec2|vec2|uint|f16vec2|uint|bf16vec2| |sfpvec4|vec4|uvec2|f16vec4|uvec2|bf16vec4| ## 算术类型(arithmetic type) 在 GLSL 代码中声明局部变量 ```c void main() { afpvec4 v = a * b; } ``` |算术类型|fp32|fp16a| |---|---|---| |afp|float|float16_t| |afpvec2|vec2|f16vec2| 
|afpvec4|vec4|f16vec4| ## 本地类型(local type) 在共享本地内存中声明变量 ```c shared lfp tmp_a[8][4][2]; ``` |本地类型|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|bf16p|bf16s| |---|---|---|---|---|---|---| |lfp|float|float|float|float16_t|float|bfloat16_t| |lfpvec4|vec4|uvec2|uint64_t|f16vec4|uvec2|bf16vec4| # 缓冲区函数(buffer functions) - 从 src[offset] 加载已经确定类型的值 ```c afp buffer_ld1(sfp src, int offset); afpvec2 buffer_ld2(sfpvec2 src, int offset); afpvec4 buffer_ld4(sfpvec4 src, int offset); ``` - 将已确定类型的值存储到 dst[偏移量] ```c void buffer_st1(sfp dst, int offset, afp v); void buffer_st2(sfpvec2 dst, int offset, afpvec2 v); void buffer_st4(sfpvec4 dst, int offset, afpvec4 v); ``` - 从已确定类型 src[src_offset] 的值拷贝到 dst[dst_offset] ```c void buffer_cp1(sfp dst, int dst_offset, sfp src, int src_offset); void buffer_cp2(sfpvec2 dst, int dst_offset, sfpvec2 src, int src_offset); void buffer_cp4(sfpvec4 dst, int dst_offset, sfpvec4 src, int src_offset); ``` - 从 src[src_offsets[0],src_offsets[1],...] 的值拷贝并打包到 dst[dst_offset] ```c void buffer_cp1to4(sfpvec4 dst, int dst_offset, sfp src, ivec4 src_offsets); ``` - 从 src[src_offset] 的值拷贝并解包到 dst[dst_offsets[0],dst_offsets[1],...] 
```c void buffer_cp4to1(sfp dst, ivec4 dst_offsets, sfpvec4 src, int src_offset); ``` # 本地数据转换函数 - 存储缓冲区转换到本地内存 ```c lfp buffer_sm1(sfp src, int offset); lfpvec4 buffer_sm4(sfpvec4 src, int offset); ``` - 本地内存转换到局部变量 ```c afp lfp2afp(lfp v); afpvec4 lfp2afpvec4(lfpvec4 v); ``` - 局部变量转换到本地内存 ```c lfp afp2lfp(afp v); lfpvec4 afp2lfpvec4(afpvec4 v); ``` 注意:本地内存的常见用法是先从全局内存中读取,存储在本地内存中,然后再从本地内存中读取局部变量以供后续使用。因此,此处仅提供存储类型到本地类型和本地类型到算术类型的转换函数。 # 杂项函数 - 更推荐使用专用常量(specialization constants),而不是推送常量(push constants) ```c T psc(T x) ``` 在 `专用常量` 和 `推送常量` 部分中声明相同的变量,然后在专用常量给定非零时 `psc(x)` 将成为编译时常量,否则将通过推送常量动态传入。这通常用于张量形状特化。我们通常可以解析所有形状信息,并使它们成为编译时常量,以实现让着色器得到更积极的优化。 ```c layout (constant_id = 0) const int size = 0; layout (push_constant) uniform parameter { int size; } p; void main() { const int s = psc(size); } ``` # 平台宏定义 判断当前平台是否为 moltenvk,以启用对于某些特定于平台的解决方法 ```c #if NCNN_moltenvk // 启用moltenvk的解决方法 #endif ``` ncnn 在新版本中添加了额外的宏定义,可能与现在的 glsl 代码冲突或引起混淆。为了实现 ncnn 的跨版本兼容性,可以根据 `ncnn_glsl_version` 宏的版本号在新旧代码之间进行切换。 ```c #if ncnn_glsl_version >= 1 // 使用自版本 1 起引入的设备宏 #endif ``` ncnn 额外定义了大多数 vulkan 设备相关功能作为宏,我们可以用来区分不同的平台、设备扩展、功能和属性。 ### 扩展宏定义 当设备支持某个扩展时,`ncnn_<extension name>` 被定义为扩展版本 ```c void main() { #if ncnn_VK_KHR_16bit_storage // 支持 VK_KHR_16bit_storage 设备的代码 #endif #if ncnn_VK_KHR_sampler_ycbcr_conversion >= 10 // 支持 VK_KHR_sampler_ycbcr_conversion 且版本 >=10 的代码 #endif } ``` ### 设备特性和属性宏 ncnn 会查询设备特性和属性,然后将它们定义为宏。 宏名称为 `ncnn_<feature name>` 或 `ncnn_<property name>`。 当设备支持 `shaderInt64` 时,`GL_EXT_shader_explicit_arithmetic_types_int64` 扩展会自动启用,无需显式代码指示。 当设备支持 `shaderInt16` 时,`GL_EXT_shader_explicit_arithmetic_types_int16` 扩展会自动启用,无需显式代码指示。 ```c void main() { #if ncnn_robustBufferAccess // 支持 robustBufferAccess 特性的设备代码 #endif #if ncnn_vendorID == 4318 // 供应商特定代码,4318 是 nvidia 显卡 #endif #if ncnn_subgroupSize == 32 // 为 subgroup_size == 32 优化的代码路径 #endif // 使用宏定义 uint size; // 来自先前例程的动态值 if (size < ncnn_subgroupSize) { #if ncnn_supportedOperations & 4 // subgroup 支持算术运算 #endif #if ncnn_subgroup_arithmetic // 检查 subgroup
算术运算的简写形式 #endif } } ``` ### 验证层宏定义 当启用 vulkan 验证层时,ncnn 会定义一些额外的便捷宏 * `ncnn_enable_validation_layer` * `NCNN_LOGE` 目前,你必须将 `src/gpu.cpp` 开头的 `ENABLE_VALIDATION_LAYER` 定义修改为 `1` 才能启用这些宏。 `GL_EXT_debug_printf` 扩展会自动启用,无需在代码中显式指定。 ```c void main() { int gx = int(gl_GlobalInvocationID.x); #if ncnn_enable_validation_layer NCNN_LOGE("gx = %d\n", gx); #endif } ``` 在运行时,`NCNN_LOGE` 将打印出 `gx` 的值 ### 选项宏 仅当用户启用某些选项时才启用 GLSL 扩展 `GL_EXT_shader_16bit_storage` 扩展会在设备支持 16 位存储且用户开启了 `opt.use_fp16_storage` 或 `opt.use_bf16_storage` 选项时,自动启用,无需显式代码指示。 `GL_EXT_shader_explicit_arithmetic_types_float16` 扩展会在设备支持 16 位算术运算且用户开启了 `opt.use_fp16_arithmetic` 选项时,自动启用,无需显式代码指示。 `GL_EXT_shader_8bit_storage` 扩展会在设备支持 8 位存储且用户开启了 `opt.use_int8_storage` 选项时,自动启用,无需显式代码指示。 `GL_EXT_shader_explicit_arithmetic_types_int8` 扩展会在设备支持 8 位算术运算且用户开启了 `opt.use_int8_arithmetic` 选项时,自动启用,无需显式代码指示。 `GL_EXT_bfloat16` 扩展会在设备支持 bfloat16 存储且用户开启了 `opt.use_bf16_storage` 选项时,自动启用,无需显式代码指示。 ```c void main() { #if NCNN_fp16_storage // 用户启用 fp16 存储选项,且设备支持 fp16 存储 #endif #if NCNN_fp16_arithmetic // 用户启用 fp16 算术选项,且设备支持 fp16 算术运算 #endif } ``` |宏定义|option中所定义的变量| |---|---| |NCNN_fp16_packed|opt.use_fp16_packed| |NCNN_fp16_storage|opt.use_fp16_storage| |NCNN_fp16_arithmetic|opt.use_fp16_arithmetic| |NCNN_int8_packed|opt.use_int8_packed| |NCNN_int8_storage|opt.use_int8_storage| |NCNN_int8_arithmetic|opt.use_int8_arithmetic| |NCNN_bf16_packed|opt.use_bf16_packed| |NCNN_bf16_storage|opt.use_bf16_storage| |NCNN_shader_local_memory|opt.use_shader_local_memory| ================================================ FILE: docs/developer-guide/how-to-be-a-contributor.zh.md ================================================ ### 如何提交代码 #### 一、fork 分支 在浏览器中打开 [ncnn](https://github.com/tencent/ncnn), `fork` 到自己的 repositories,例如 ``` https://github.com/user/ncnn ``` clone 项目到本地,添加官方 remote 并 fetch: ``` $ git clone https://github.com/user/ncnn && cd ncnn $ git remote add tencent https://github.com/tencent/ncnn $ git fetch tencent ``` 对于 `git 
clone` 下来的项目,它现在有两个 remote,分别是 origin 和 tencent: ``` $ git remote -v origin https://github.com/user/ncnn (fetch) origin https://github.com/user/ncnn (push) tencent https://github.com/Tencent/ncnn (fetch) tencent https://github.com/Tencent/ncnn (push) ``` origin 指向你 fork 的仓库地址;tencent 即官方 repo。可以基于不同的 remote 创建和提交分支。 例如切换到官方 master 分支,并基于此创建自己的分支(命名尽量言简意赅。一个分支只做一件事,方便 review 和 revert) ``` $ git checkout tencent/master $ git checkout -b add-conv-int8 ``` 或创建分支时指定基于官方 master 分支: ``` $ git checkout -b fix-typo-in-document tencent/master ``` > `git fetch` 是从远程获取最新代码到本地。如果是第二次 pr ncnn,直接从 `git fetch tencent` 开始即可,不需要 `git remote add tencent`,也不需要修改 `github.com/user/ncnn`。 #### 二、代码习惯 为了增加沟通效率,reviewer 一般要求 contributor 遵从以下规则 * `if-else`和花括号`{`中间需要换行 * 不能随意增删空行 * tab 替换为 4 个空格 * 为了保证平台兼容性,目前不使用`c++11`,`src`目录下尽量避免使用`template` * 若是新增功能或平台,`test`目录需有对应测试用例 * 文档放到`docs`对应目录下,中文用`.zh.md`做后缀;英文直接用`.md`后缀 开发完成后提交到自己的 repository ``` $ git commit -a $ git push origin add-conv-int8 ``` 推荐使用 [`commitizen`](https://pypi.org/project/commitizen/) 或 [`gitlint`](https://jorisroovers.com/gitlint/) 等工具格式化 commit message,方便事后检索海量提交记录 #### 三、代码提交 浏览器中打开 [ncnn pulls](https://github.com/Tencent/ncnn/pulls) ,此时应有此分支 pr 提示,点击 `Compare & pull request` * 标题**必须**是英文。未完成的分支应以 `WIP:` 开头,例如 `WIP: add conv int8` * 正文宜包含以下内容,中英不限 * 内容概述和实现方式 * 功能或性能测试 * 测试结果 CI 已集成了自动格式化,restyled-io 会在 pr 的同时生成 `Restyled add conv int8`,需要 merge 自动 restyled 的分支,例如 ``` $ git fetch tencent $ git checkout add-conv-int8 $ git merge tencent/restyled/pull-2078 $ git push origin add-conv-int8 ``` 回到浏览器签署 CLA,所有 CI 测试通过后通知 reviewer merge 此分支。 #### 四、彩蛋 留下个人 qq 号会触发隐藏事件。 ================================================ FILE: docs/developer-guide/how-to-implement-custom-layer-step-by-step.md ================================================ # step1 create a new empty class ```cpp // mylayer.h #include "layer.h" using namespace ncnn; // a new layer type called MyLayer class MyLayer : public Layer { }; // mylayer.cpp #include
"mylayer.h" DEFINE_LAYER_CREATOR(MyLayer) ``` # step2 declare layer parameters and weights ```cpp // mylayer.h #include "layer.h" using namespace ncnn; class MyLayer : public Layer { private: int channels;// new code float gamma;// new code Mat weight;// new code }; // mylayer.cpp #include "mylayer.h" DEFINE_LAYER_CREATOR(MyLayer) ``` # step3 implement load functions for parameters and weights ```cpp // mylayer.h #include "layer.h" using namespace ncnn; class MyLayer : public Layer { public: virtual int load_param(const ParamDict& pd);// new code virtual int load_model(const ModelBin& mb);// new code private: int channels; float eps; Mat gamma_data; }; // mylayer.cpp #include "mylayer.h" DEFINE_LAYER_CREATOR(MyLayer) // new routine for loading parameters int MyLayer::load_param(const ParamDict& pd) { // details about the relations with param file // https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure // channels = pd.get(0, 0);// parse 0= entry, default value 0 eps = pd.get(1, 0.001f);// parse 1= entry, default value 0.001f return 0;// return zero if success } // new routine for loading weights int MyLayer::load_model(const ModelBin& mb) { // details about the relations with model file // https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure // // read weights with length of channels * sizeof(float) // the second argument explains as follows // 0 judge the value type automatically, you may get float or float16 or uint8 etc // depends on the model storage and the supporting target hardware // 1 read float values anyway // 2 read float16 values anyway // 3 read uint8 values anyway gamma_data = mb.load(channels, 1); if (gamma_data.empty()) return -100;// return non-zero on error, -100 indicates out-of-memory return 0;// return zero if success } ``` # step4 determine forward behavior ```cpp // mylayer.h #include "layer.h" using namespace ncnn; class MyLayer : public Layer { public: MyLayer();// new code virtual int load_param(const 
ParamDict& pd); virtual int load_model(const ModelBin& mb); private: int channels; float eps; Mat gamma_data; }; // mylayer.cpp #include "mylayer.h" DEFINE_LAYER_CREATOR(MyLayer) // new routine for setting forward behavior MyLayer::MyLayer() { // one input and one output // typical one_blob_only type: Convolution, Pooling, ReLU, Softmax ... // typical non-one_blob_only type: Eltwise, Split, Concat, Slice ... one_blob_only = true; // do not change the blob size, modify data in-place // typical support_inplace type: ReLU, Sigmoid ... // typical non-support_inplace type: Convolution, Pooling ... support_inplace = true; } int MyLayer::load_param(const ParamDict& pd) { channels = pd.get(0, 0); eps = pd.get(1, 0.001f); // you could alter the behavior based on loaded parameter // if (eps == 0.001f) // { // one_blob_only = false; // support_inplace = false; // } return 0; } int MyLayer::load_model(const ModelBin& mb) { gamma_data = mb.load(channels, 1); if (gamma_data.empty()) return -100; // you could alter the behavior based on loaded weight // if (gamma_data[0] == 0.f) // { // one_blob_only = false; // support_inplace = false; // } return 0; } ``` # step5 choose proper interface based on forward behavior ```cpp // The base class Layer defines four interfaces for each forward behavior combination // 1 virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; // 2 virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; // 3 virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const; // 4 virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; ``` **must** = layer must implement this function **optional** = layer may implement this function for optimal performance sometimes the graph inference path cannot call forward_inplace directly due to data sharing, in this situation the non-inplace forward routine will be used, which deep-copy the input blob and
call inplace forward on it if the optional routine is not implemented. Thus, you could avoid this deep-copy by process input to output on-the-fly. |one_blob_only|support_inplace|1|2|3|4| |---|---|---|---|---|---| |false|false|must| | | | |false|true|optional| |must| | |true|false| |must| | | |true|true| |optional| |must| # step6 implement forward function ```cpp // mylayer.h #include "layer.h" using namespace ncnn; class MyLayer : public Layer { public: MyLayer(); virtual int load_param(const ParamDict& pd); virtual int load_model(const ModelBin& mb); virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;// new code, optional virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;// new code private: int channels; float eps; Mat gamma_data; }; // mylayer.cpp #include "mylayer.h" DEFINE_LAYER_CREATOR(MyLayer) MyLayer::MyLayer() { one_blob_only = true; support_inplace = true; } int MyLayer::load_param(const ParamDict& pd) { channels = pd.get(0, 0); eps = pd.get(1, 0.001f); return 0; } int MyLayer::load_model(const ModelBin& mb) { gamma_data = mb.load(channels, 1); if (gamma_data.empty()) return -100; return 0; } // optional new routine for layer forward function, non-inplace version int MyLayer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // check input dims, return non-zero on error if (bottom_blob.c != channels) return -1; // x = (x + eps) * gamma_per_channel int w = bottom_blob.w; int h = bottom_blob.h; size_t elemsize = bottom_blob.elemsize; int size = w * h; top_blob.create(w, h, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100;// return non-zero on error, -100 indicates out-of-memory #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q使用SSE优化之前,确保理解代码指针位置和移动原理,原生代码已经完成测试,输出结果正确。 ### 1.SSE数据类型 ​ SSE数据类型形如: ```c++ __m //__m适用代表申请mm寄存器 // bit 代表数据类型的字节长度,在SSE中为128 或 64 // 默认type为单精度浮点(f32),其余为int 或double // 
另外要注意所有SSE的类型除__m128和__m64外,随着版本更新有不同的类型,建议根据需要且确定硬件性能后选择合适的类型 // 举例如下: __m128 //4xf32 含有4个单精度浮点数;SSE __m64 //4xf32 含有2个单精度浮点数;SSE __m128i //8个int类型(8x16) ;SSE3 __m128d //2个double类型(2x64) ``` ### 2.SSE内联函数结构 ​ SSE内联函数在线查询:[Intel® Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE,SSE2,SSE3,SSSE3,SSE4_1,SSE4_2) 在此 单个指令的结构如下: - Synopsis :摘要。描述指令的接口定义,需要引入的头文件,对应的指令,CPU必须支持的标志; - Description:描述该指令的行为; - Operation:逻辑层面描述指令行为; - Performance:在不同架构中所需要的延迟和执行所需要的时钟周期数(CPI)。 ​ 值得指出的的是此处默认使用小端存储,即左边为高位,右边为低位。 ​ 相似的内联函数有很多,在使用时候一定要注意Operation中的逻辑满足您的要求。 ​ 另外,在ncnn中,ncnn已经将部分SSE内联函数以NCNN内联的方式封装。在为NCNN添加SSE优化的算法的过程中,请首先考虑搜索“NCNNINLINE”宏封装的SSE函数。 ## 四:样例 ### 1.一个简单的样例:4x4矩阵乘法 ​ 矩阵乘法方面,已经有很多出色的成果。值得一读的比如[how-to-optimize-gemm](https://github.com/flame/how-to-optimize-gemm),及 [以Arm Intrinsic优化矩阵乘法](https://github.com/tpoisonooo/how-to-optimize-gemm)。我建议感兴趣同学参考和学习这两份项目,来探究如何从0到1优化一份算法; ​ 矩阵乘法原理很简单: ​ 假设有A,B两个矩阵,如下: $$ A_{[4][4]} = \begin{bmatrix} a_0 & a_4 & a_8 & a_{12} \\ a_1 & a_5 & a_9 & a_{13} \\ a_2 & a_6 & a_{10} & a_{14} \\ a_3 & a_7 & a_{11} & a_{15} \end{bmatrix} ~~ B_{[4][4]} = \begin{bmatrix} b_0 & b_4 & b_8 & b_{12} \\ b_1 & b_5 & b_9 & b_{13} \\ b_2 & b_6 & b_{10} & b_{14} \\ b_3 & b_7 & b_{11} & b_{15} \end{bmatrix} ~~ C_{[4][4]} = \begin{bmatrix} c_0 & c_4 & c_8 & c_{12} \\ c_1 & c_5 & c_9 & c_{13} \\ c_2 & c_6 & c_{10} & c_{14} \\ c_3 & c_7 & c_{11} & c_{15} \end{bmatrix} $$ ​ 对于C 矩阵的第一列,我们有: $$ c_0 = a_0b_0 + a_4b_1 + a_8b_2 + a_{12}b_3 \\ c_1 = a_1b_0 + a_5b_1 + a_9b_2 + a_{13}b_3 \\ c_2 = a_2b_0 + a_6b_1 + a_{10}b_2 + a_{14}b_3 \\ c_3 = a_3b_0 + a_7b_1 + a_{11}b_2 + a_{15}b_3 $$ #### 1.编写测试代码和基准测量程序 ​ 在该样例中,测试代码很容易编写出来,我们只需要初始化4x4的二维数组,并返回指针即可。此时,可以不考虑泛用性,初始化为固定值即可。 ```c // <代码片段> ... 
float A[16] = {0.0f}; // 此处已经将输入和输出的矩阵默认展开成im2col 后的单行(inch = 1) 宽度为h*w = 16的矩阵 float B[16] = {0.0f}; float C[16] = {0.0f}; matrix_init_rand(A, 4, 4); // 随机初始化A数组 matrix_init_rand(B, 4, 4); // 随机初始化B数组 ``` 编写验证正确性的测试代码。 ```c // <代码片段> ... float T[16] = {...}; // Target即为预测的C的结果数组,可用numpy或者纸笔计算 ... float error = 0.0001; bool CheckAuc(T, C, error); // 注意:float在计算机中不能完全表示,只能使用绝对误差的判别方法。gtest等测试框架的EXPECT宏无法处理1.234e5这样结构的float数的对比。 ``` 同样,编写计算耗时的基准测量代码,此处使用1000次操作所占的平均时间来作为基准。 ```c // <代码片段> ... const int loop = 1000; clock_gettime_(CLOCK_REALTIME, &time_start); for(int i = 0; i < loop; i++) { matrix_mult_native(C, A, B); } clock_gettime_(CLOCK_REALTIME, &time_end); clocks_c = (time_end.tv_sec - time_start.tv_sec) * 1000000 + (time_end.tv_nsec - time_start.tv_nsec) /1000; ``` #### 2.编写原生代码 编写原生代码,使得正确性测试能够通过。 ```c // <代码片段> static void matrix_mult_native(float *C, float *A, float *B) { for(int i_idx = 0; i_idx < 4; i_idx++) { for(int j_idx = 0; j_idx < 4; j_idx++) { for(int k_idx = 0; k_idx < 4; k_idx++) { C[4*j_idx + i_idx] += A[4*k_idx + i_idx] * B[4*j_idx + k_idx]; } } } } ``` #### 3.优化原生代码 注意到上述代码中,先取c0 - c3 的计算作为样例考虑: $$ c_0 = a_0b_0 + a_4b_1 + a_8b_2 + a_{12}b_3 \\ c_1 = a_1b_0 + a_5b_1 + a_9b_2 + a_{13}b_3 \\ c_2 = a_2b_0 + a_6b_1 + a_{10}b_2 + a_{14}b_3 \\ c_3 = a_3b_0 + a_7b_1 + a_{11}b_2 + a_{15}b_3 $$ ##### 1.装载寄存器 - 考虑竖排a0-a1-a2-a3 为4个f32 数据,又因为SSE可以申请mm寄存器,单次保存128bit,那么不妨把a0-a3保存在寄存器中, - 对于b0-b3 则是,单次读取一个值,能够重复用4次,不妨考虑b0 重复4次,排满单个128bit的mm寄存器; - 同理把c0-c3也放入寄存器,从列方向上考虑,取名为_c0 ```c++ __m128 _a0 = _mm_load_ps(a_ptr); //a0 -a1 -a2 -a3 __m128 _a1 = _mm_load_ps(a_ptr + 4); //a4 -a5 -a6 -a7 __m128 _a2 = _mm_load_ps(a_ptr + 8); //a8 -a9 -a10-a11 __m128 _a3 = _mm_load_ps(a_ptr + 12); //a12-a13-a14-a15 __m128 _b0 = _mm_load_ps1(b_ptr); // b0 - b0 - b0 - b0 __m128 _b1 = _mm_load_ps1(b_ptr + 1); // b1 - b1 - b1 - b1 __m128 _b2 = _mm_load_ps1(b_ptr + 2); // b2 - b2 - b2 - b2 __m128 _b3 = _mm_load_ps1(b_ptr + 3); // b3 - b3 - b3 - b3 ``` ##### 2.编写第一列的计算结果 对于_a0 -\_a3
数据与\_b0 数据相乘 ,有: ```c++ // 保存结果新建一个_c0 作为临时变量 __m128 _c0 = _mm_set_ps1(0.0f); _c0 = _mm_mul_ps(_a0, _b0); _c0 = _mm_add_ps(_mm_mul_ps(_a1, _b1),_c0); _c0 = _mm_add_ps(_mm_mul_ps(_a2, _b2),_c0); _c0 = _mm_add_ps(_mm_mul_ps(_a3, _b3),_c0); // 把 _c0 存回以c指针开头的内存中,完美! _mm_store_ps(c_ptr, _c0); ``` ##### 3.将单列输出扩展到所有列: 我们针对剩下的c中的c1 列也做相同的操作: 对于C1 列 有: $$ c_4 = a_0b_4 + a_4b_5 + a_8b_6 + a_{12}b_7 \\ c_5 = a_1b_4 + a_5b_5 + a_9b_6 + a_{13}b_7 \\ c_6 = a_2b_4 + a_6b_5 + a_{10}b_6 + a_{14}b_7 \\ c_7 = a_3b_4 + a_7b_5 + a_{11}b_6 + a_{15}b_7 $$ ```c++ // a 系列不变 b系列指针+4 __m128 _b4 = _mm_load_ps1(b_ptr + 4); // b4 - b4 - b4 - b4 __m128 _b5 = _mm_load_ps1(b_ptr + 4 + 1); // b5 - b5 - b5 - b5 __m128 _b6 = _mm_load_ps1(b_ptr + 4 + 2); // b6 - b6 - b6 - b6 __m128 _b7 = _mm_load_ps1(b_ptr + 4 + 3); // b7 - b7 - b7 - b7 // 保存结果新建一个_c1 作为临时变量 __m128 _c1 = _mm_set_ps1(0.0f); _c1 = _mm_mul_ps(_a0, _b4); _c1 = _mm_add_ps(_mm_mul_ps(_a1, _b5),_c1); _c1 = _mm_add_ps(_mm_mul_ps(_a2, _b6),_c1); _c1 = _mm_add_ps(_mm_mul_ps(_a3, _b7),_c1); // 把 _c1 存回内存中,完美!
_mm_store_ps(c_ptr + 4, _c1); ``` 此时我们发现,对于C1列的操作与C0列极其相似,只不过是b_ptr的指针发生移动,不妨将其放到同一个循环中,有: ```C++ // a 系列不变 __m128 _a0 = _mm_load_ps(a_ptr); //a0 -a1 -a2 -a3 __m128 _a1 = _mm_load_ps(a_ptr + 4); //a4 -a5 -a6 -a7 __m128 _a2 = _mm_load_ps(a_ptr + 8); //a8 -a9 -a10-a11 __m128 _a3 = _mm_load_ps(a_ptr + 12); //a12-a13-a14-a15 for(int i = 0; i < 4; i++) { __m128 _b0 = _mm_load_ps1(b_ptr); // b0 - b0 - b0 - b0 __m128 _b1 = _mm_load_ps1(b_ptr + 1); // b1 - b1 - b1 - b1 __m128 _b2 = _mm_load_ps1(b_ptr + 2); // b2 - b2 - b2 - b2 __m128 _b3 = _mm_load_ps1(b_ptr + 3); // b3 - b3 - b3 - b3 __m128 _ci = _mm_set_ps1(0.0f); _ci = _mm_mul_ps(_a0, _b0); _ci = _mm_add_ps(_mm_mul_ps(_a1, _b1),_ci); _ci = _mm_add_ps(_mm_mul_ps(_a2, _b2),_ci); _ci = _mm_add_ps(_mm_mul_ps(_a3, _b3),_ci); _mm_store_ps(c_ptr, _ci); b_ptr += 4; // 移动b_ptr c_ptr += 4; // 移动保存内存的c_ptr } ``` ### 2.NCNN中以SSE优化算子的注意事项 #### 1.线程与openmp 以上计算Benchmark 和 SSE优化的方法大多集中在单个核心中,但是在实际使用ncnn中,ncnn使用Option opt 中提供的num_threads 给openmp赋值,以实现多线程并行化,同时运行在多个核心上。 ```c++ #pragma omp parallel for num_threads(opt.num_threads) ``` 在优化成SSE代码的初期,可以考虑锁定为单线程,或者直接不用考虑线程的影响,仅对单核以SSE优化,保证单核的结果正确后,再加上opt的多线程进行结果测试。 #### 2.展开循环 在实际ncnn实现的原生代码的算法中,循环是非常常见的。针对以SSE优化这类循环,遵循非常简单的原则:循环中,迭代器等于零时刻,整个输出的结果也是正确的。 那么,在我们使用SSE优化过程中,不妨以迭代器等于零的时刻,函数计算结果作为此时目标结果。在此基础上再利用SSE优化代码。与目标结果核对正确以后,再进一步去考虑迭代器等于1的情况(重复这个过程直到迭代器达到最大值)。在迭代器的每个元素下,SSE优化出的代码都与结果相等,那么我们可以说,该次优化是正确的,且完全覆盖了需执行代码。(一般来说不用考虑到最大值,根据数学归纳法,n有效,n+1有效,那么n的序列都是有效的) ## 五:总结 本文描述SSE的使用及以4x4矩阵乘法的样例来优化SSE代码。 值得注意的是,SSE只是128bit数据宽度的指令集,但是也可以用来模拟256bit 和 512bit数据宽度,来实现以pack4拼接成pack8,甚至pack16的做法,只不过在输出结果管理上更加繁琐而已。感兴趣的同学可以尝试一下。 ## 六:引用 1. [SSE指令扩展快查](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE,SSE2,SSE3,SSSE3,SSE4_1,SSE4_2); 2. 浮点性能基准计算-[浮点峰值那些事儿](https://zhuanlan.zhihu.com/p/28226956) 3. 硬件性能基准测试计算样例:[M1芯片搞数据科学好使吗?5种基准测试给你答案](https://mp.weixin.qq.com/s/2N5cl_Z1MRF8dfbRo-sb4A) 4.
讨论矩阵乘法如何优化的系列论文:[how-to-optimized-gemm](https://github.com/flame/how-to-optimize-gemm/wiki) 5. 讨论以Arm Intrinsic 优化gemm的系列文章:[OpenBLAS gemm从零入门](https://zhuanlan.zhihu.com/p/65436463) ================================================ FILE: docs/developer-guide/kvcache.md ================================================ # high-performance transformer inference with mha kv cache in ncnn This document details the implementation and usage of the key-value (kv) cache for the `MultiHeadAttention` and `SDPA` layer in ncnn. This feature significantly accelerates autoregressive inference for Transformer-based models, such as large language models and other encoder-decoder architectures. ## 1. what is kv cache? ### the challenge of autoregressive inference Transformer models generate output token by token in a process called autoregressive decoding. In each step, the model takes the previously generated tokens as input to predict the next one. A core component of this is the self-attention mechanism, which computes query (q), key (k), and value (v) matrices based on the sequence generated so far. Without optimization, the model must recompute the k and v matrices for all preceding tokens at every single step. For a sequence of length `N`, the computational cost for the self-attention mechanism is roughly proportional to `N^2`. As the sequence grows, this becomes a significant performance bottleneck. ### the solution: kv cache **kv cache** is an optimization technique that stores the key and value tensors from previous decoding steps. When generating a new token, we only need to compute the k and v for the *current* token and append them to the cached values. The model then uses the full set of cached k and v tensors for the attention calculation. ### key benefits - **dramatic speed-up:** It reduces the computational complexity of the self-attention mechanism from O(N^2) per step to approximately O(N). This drastically cuts down inference latency, especially for long sequences. 
- **reduced computation:** It eliminates redundant calculations, saving significant computational resources and energy. - **enables real-time applications:** The performance gain makes it feasible to deploy large Transformer models for interactive and real-time tasks. ## 2. ncnn kv cache implementation ncnn introduces kv cache support directly into its `MultiHeadAttention` and `SDPA` layer. The implementation is designed to be efficient and flexible, handling both the dynamic cache of self-attention and the static k/v of cross-attention found in encoder-decoder architectures. ### self-attention vs. cross-attention cache logic The caching strategy is fundamentally different for self-attention and cross-attention layers within a decoder. #### self-attention (dynamic cache) - **purpose:** Allows the decoder to attend to previously generated tokens in its own sequence (e.g., the text being generated). - **cache Logic:** The cache is **dynamic** and grows with each generated token. In step `t`, the k and v for token `t` are computed and appended to the cache from step `t-1`. - **ncnn implementation:** The `MultiHeadAttention` and `SDPA` layers for self-attention are modified to accept two additional inputs (`cache_k_in`, `cache_v_in`) and produce two corresponding outputs (`cache_k_out`, `cache_v_out`). The `7=1` parameter enables this dynamic caching behavior inside the layer. #### cross-attention (static k/v) - **purpose:** Allows the decoder to attend to the output of the encoder (e.g., attending to audio features in speech recognition, or an input sentence in translation). - **cache Logic:** The k and v matrices are derived from the encoder's output, which is computed only **once** per input sequence. Therefore, the k and v for cross-attention are **static** and do not change during the decoding process. They are "cached" in the sense that they are pre-computed and reused in every decoding step. 
- **ncnn implementation:** The `MultiHeadAttention` and `SDPA` layers for cross-attention are also configured with `7=1` and cache I/O blobs. However, the implementation correctly identifies cross-attention (where the query blob is different from the key/value blobs) and reuses the `cache_k_in` and `cache_v_in` directly, without performing concatenation. This allows the static encoder k/v to be passed efficiently through the network. ## 3. ncnn kv cache memory layout The memory layout of the kv cache is a critical design choice for performance. ncnn uses different layouts for `MultiHeadAttention` and `SDPA` to optimize for their respective calculation patterns. ### `MultiHeadAttention` cache layout (Transposed) The `MultiHeadAttention` layer uses a **transposed layout** for its cache blobs. The primary reason for this is to **ensure that data for each attention head is contiguous in memory, which significantly boosts gemm performance.** * **input blobs (q, k, v):** These typically have a shape where height represents the sequence length. * `ncnn::Mat` dimensions: `(w = embed_dim, h = seq_len)` * **cache blobs (`k_cache`, `v_cache`):** These are stored in a **transposed** format. * `ncnn::Mat` dimensions: `(w = seq_len, h = embed_dim)` **the rationale:** 1. **slicing by Head:** During the attention calculation, the code slices the `k_cache` and `v_cache` matrices along their height to isolate the data for each head (e.g., using `row_range(head_index * embed_dim_per_head, embed_dim_per_head)`). 2. **memory contiguity:** Because `ncnn::Mat` uses a row-major memory layout, this slicing operation on the transposed cache blob results in a sub-matrix where all the data for a single head is perfectly contiguous. 3. **gemm efficiency:** Subsequent matrix multiplication operations (`q * k^T` and `Attention * v`) can then operate on these contiguous memory blocks. 
This maximizes CPU cache locality and the effectiveness of simd instructions, leading to a substantial increase in computational speed. If a non-transposed layout were used, the data for each head would be strided in memory, causing frequent cache misses and dramatically slowing down the performance-critical gemm calculations. Therefore, this transposed layout is a deliberate and crucial optimization for computation. ### `SDPA` cache layout (Standard) The `SDPA` layer uses the **standard ncnn Mat layout**, where the sequence length is represented by the height. * **input blobs (q, k, v):** `(w = embed_dim, h = seq_len, c = num_heads)` * **cache blobs (`k_cache`, `v_cache`):** `(w = embed_dim, h = seq_len, c = num_heads)` **the rationale:** The `SDPA` layer's internal implementation directly concatenates the cache blobs (`past_k`, `past_v`) with the current ones (`cur_k`, `cur_v`) along the height dimension (`seq_len`). This simpler approach avoids the need for a transposed layout while still being highly efficient, as the concatenation logic is handled inside the optimized C++ implementation. ## 4. converting models to support kv cache To enable kv cache, you must modify the model's `.param` file to add the necessary cache inputs and outputs to all `MultiHeadAttention` and `SDPA` layers in the decoder. ### step 1: export a sequence-length-1 model First, export your model from its original framework (e.g., PyTorch) using a sequence length of 1 for the decoder. This creates a graph optimized for single-token generation, which is the core of the autoregressive decoding loop. ### step 2: modify the .ncnn.param file After exporting, a script is needed to edit the generated `.ncnn.param` file to make it cache-aware. #### A. Adding kv cache to All MultiHeadAttention and SDPA Layers You must add cache inputs/outputs to **every** `MultiHeadAttention` / `SDPA` layer in the decoder. - **change `input_count` and `output_count`:** Increase both by 2. 
- **add blob names:** Append new, unique blob names for `cache_k_in`, `cache_v_in`, `cache_k_out`, and `cache_v_out`. - **enable cache behavior:** Add the parameter `7=1`. Here is a robust Python function that automates this process: ```python def add_kv_cache_to_ncnn_param(filename): """ Modifies an ncnn.param file to add a kv cache mechanism to all MultiHeadAttention and SDPA layers and overwrites the original file. This handles both self-attention and cross-attention layers. """ import os if not os.path.exists(filename): print(f"Error: The file '{filename}' was not found.") return with open(filename, 'r', encoding='utf-8') as f: lines = f.readlines() header_line_index = 1 # line 2, after magic number header_parts = lines[header_line_index].strip().split() original_layer_count = int(header_parts[0]) original_blob_count = int(header_parts[1]) attention_indices = [i for i, line in enumerate(lines) if line.strip().startswith("MultiHeadAttention") or line.strip().startswith("SDPA")] attention_count = len(attention_indices) if attention_count == 0: print("No 'MultiHeadAttention' or 'SDPA' layers found. 
The file will not be modified.") return # --- modify MultiHeadAttention and SDPA layers --- for i, line_index in enumerate(attention_indices): parts = lines[line_index].strip().split() layer_type, layer_name, input_count_str, output_count_str = parts[:4] input_count, output_count = int(input_count_str), int(output_count_str) blob_and_params = parts[4:] inputs = blob_and_params[:input_count] outputs = blob_and_params[input_count : input_count + output_count] params = blob_and_params[input_count + output_count:] # add cache I/O blobs and enable cache parameter inputs.extend([f"cache_k_in_{i}", f"cache_v_in_{i}"]) outputs.extend([f"cache_k_out_{i}", f"cache_v_out_{i}"]) params.append("7=1") new_line_parts = [ f"{layer_type:<24}", f"{layer_name:<24}", str(input_count + 2), str(output_count + 2), *inputs, *outputs, *params ] lines[line_index] = " ".join(new_line_parts) + "\n" # --- add a single input layer to provide all cache blobs --- new_layer_count = original_layer_count + 1 # each mha needs 2 new *input* blobs and produces 2 new *output* blobs. # the total number of unique blobs increases by 4 for each mha. new_blob_count = original_blob_count + (attention_count * 4) lines[header_line_index] = f"{new_layer_count} {new_blob_count}\n" # find where to insert the new input layer (after existing ones) insert_pos = header_line_index + 1 while insert_pos < len(lines) and lines[insert_pos].strip().startswith("Input"): insert_pos += 1 cache_blob_names = [name for i in range(attention_count) for name in (f"cache_k_in_{i}", f"cache_v_in_{i}")] input_layer_line = ( f"{'Input':<24} {'kv_cache_in':<24} 0 {len(cache_blob_names)} " f"{' '.join(cache_blob_names)}\n" ) lines.insert(insert_pos, input_layer_line) with open(filename, 'w', encoding='utf-8') as f: f.writelines(lines) print(f"Successfully added kv cache to {attention_count} MultiHeadAttention / SDPA layers.") # usage: # add_kv_cache_to_ncnn_param("your_model_decoder.ncnn.param") ``` #### B. 
Supporting Dynamic Sequence Length in Gemm Feed-forward networks (`Gemm` layers) that process the output of attention blocks must support dynamic sequence lengths, as the cache grows. To achieve this, change the parameter `7=1` (constant input shape) to `7=0` (dynamic input shape) for the relevant `Gemm` layers. ```python def update_gemm_params(param_file_path): """ Finds all 'Gemm' layers and changes parameter '7=1' to '7=0' to support dynamic input shapes. """ import re with open(param_file_path, 'r') as f: lines = f.readlines() new_lines = [] for line in lines: if line.strip().startswith('Gemm'): line = re.sub(r'(\b7=)1\b', r'\g<1>0', line) new_lines.append(line) with open(param_file_path, 'w') as f: f.writelines(new_lines) print(f"Updated Gemm layers in '{param_file_path}' to support dynamic inputs.") # usage: # update_gemm_params("your_model_decoder.ncnn.param") ``` ## 5. implementing kv cache inference logic Your C++ inference code must manage the cache blobs across decoding steps. ### step 1: identify cache blob indices After loading the network, identify the input and output blob indices for the cache. You can iterate through the mha layers and find the blobs you named in the conversion script. 
```cpp #include "net.h" #include #include struct kvcache_info { std::vector input_indices; std::vector output_indices; }; void find_mha_kvcache_blobs(const ncnn::Net& net, kvcache_info& info) { for (const ncnn::Layer* layer : net.layers()) { // cache-enabled mha layer has 3 outputs (out, cache_k_out, cache_v_out) instead of 1 if ((layer->typeindex == ncnn::LayerType::MultiHeadAttention || layer->typeindex == ncnn::LayerType::SDPA) && layer->tops.size() == 3) { // the script adds cache_k and cache_v as the last two inputs/outputs int input_count = layer->bottoms.size(); int output_count = layer->tops.size(); info.input_indices.push_back(layer->bottoms[input_count - 2]); // cache_k_in info.input_indices.push_back(layer->bottoms[input_count - 1]); // cache_v_in info.output_indices.push_back(layer->tops[output_count - 2]); // cache_k_out, i.e., tops[1] info.output_indices.push_back(layer->tops[output_count - 1]); // cache_v_out, i.e., tops[2] } } } ``` ### step 2: prefill and decode loop The inference process is split into two phases: "prefill" for the initial prompt and "decode" for subsequent single-token generation. - **prefill (`run_decoder_pre`):** - input: The entire initial sequence of token IDs - the kv cache is empty - run the decoder once - extract the output logits for the *last* token to predict the next token - extract the `out_cache_k` and `out_cache_v` blobs from all mha layers and store them - **decode (`run_decoder_step`):** - input: The single, most recently generated token ID - the kv cache blobs from the previous step are fed as input - run the decoder - extract the output logits to predict the next token - extract and store the updated kv cache blobs for the next step Here is a conceptual C++ implementation: ```cpp // assume 'decoder_net' is loaded and 'kvcache_info' is populated. 
// --- prefill step (processes a sequence of tokens) --- void run_decoder_pre(const std::vector& tokens, const ncnn::Mat& encoder_states, std::vector& out_kv_cache) { ncnn::Extractor ex = decoder_net.create_extractor(); ncnn::Mat input_embeds = prepare_input_embeds(tokens); // your embedding logic ex.input("in0", input_embeds); // use your input blob name ex.input("encoder_out", encoder_states); // use your encoder output blob name out_kv_cache.resize(kvcache_info.output_indices.size()); for (size_t i = 0; i < kvcache_info.output_indices.size(); i++) { ex.extract(kvcache_info.output_indices[i], out_kv_cache[i]); } ncnn::Mat all_logits; ex.extract("out0", all_logits); // Use your output blob name // ... process logits for the last token ... } // --- decode step (processes a single token) --- void run_decoder_step(int token, const ncnn::Mat& encoder_states, const std::vector& kv_cache, std::vector& out_kv_cache) { ncnn::Extractor ex = decoder_net.create_extractor(); ncnn::Mat input_embeds = prepare_input_embeds({token}); ex.input("in0", input_embeds); ex.input("encoder_out", encoder_states); // feed the existing cache for (size_t i = 0; i < kvcache_info.input_indices.size(); i++) { ex.input(kvcache_info.input_indices[i], kv_cache[i]); } // extract the updated cache out_kv_cache.resize(kvcache_info.output_indices.size()); for (size_t i = 0; i < kvcache_info.output_indices.size(); i++) { ex.extract(kvcache_info.output_indices[i], out_kv_cache[i]); } ncnn::Mat logits; ex.extract("out0", logits); // ... process logits to get the next token ... } // --- main inference loop --- void generate_sequence() { std::vector initial_tokens = { /* SOT and prompt tokens */ }; ncnn::Mat encoder_states = run_encoder(); // compute encoder output once // 1. prefill stage std::vector kv_cache; run_decoder_pre(initial_tokens, encoder_states, kv_cache); int next_token = get_next_token_from_prefill_logits(); // 2. 
autoregressive decoding loop while (next_token != EOT_TOKEN && sequence_length < MAX_LENGTH) { std::vector next_kv_cache; run_decoder_step(next_token, encoder_states, kv_cache, next_kv_cache); kv_cache = next_kv_cache; // update cache for the next iteration next_token = get_next_token_from_step_logits(); // append next_token to your generated sequence } } ``` This structured approach allows ncnn to perform highly efficient Transformer inference, correctly handling both dynamic self-attention and static cross-attention caches with an optimized memory layout. ================================================ FILE: docs/developer-guide/layer-feat-mask.md ================================================ # layer feature mask Each ncnn layer allows a special parameter pair `31=X` to control specific behavior. X is an unsigned integer with each bit contributing a feature mask. We usually use it to configure fine-grained behaviors for certain layers to maintain accuracy, reduce memory usage or optimize performance. |bit|value|mask|rationale| |---|---|---|---| |1<<0|1|no fp16 arithmetic|precision concern| |1<<1|2|no fp16 storage|precision concern| |1<<2|4|no bf16 storage|precision concern| |1<<3|8|no int8|debug dynamic quantized model| |1<<4|16|no vulkan|reduce overhead for cpu op - gpu split - cpu op| |1<<5|32|no sgemm|reduce some memory| |1<<6|64|no winograd|reduce some memory| |1<<7|128|no threading|force single thread| These bits can be OR-combined into one value to control multiple behaviors simultaneously. For example, `31=17` means disabling both vulkan and fp16 arithmetic. ## disable fp16 for certain layer to fix overflow ```ruby 7767517 3 3 Input input 0 1 input0 0=22 1=22 2=32 Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1 Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 ``` Typically, we use fp16 computation to improve inference speed.
However, since the weight value of `conv1` is very large, fp16 accumulation may cause numerical overflow, so fp16 needs to be disabled individually for `conv1`, while other layers continue to use fp16 mode. Add `31=3` to disable fp16 storage and arithmetic. ```ruby 7767517 3 3 Input input 0 1 input0 0=22 1=22 2=32 Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1 Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=3 ``` ## disable vulkan for certain layer to improve performance ```ruby 7767517 5 5 Input input 0 1 input0 0=22 1=22 2=32 Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1 SomeCPULayer c0 1 1 conv0 c0 0=32 ReLU relu0 1 1 c0 relu0 SomeCPULayer c1 1 1 relu0 c1 0=32 ``` Between the CPU layers, there is a simple calculation layer that supports vulkan. We can set `31=16` to force it to run on CPU. This can avoid the overhead of data upload, download and storage layout conversion between CPU and GPU. After all, CPU is fast enough for simple operations. ```ruby 7767517 5 5 Input input 0 1 input0 0=22 1=22 2=32 Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1 SomeCPULayer c0 1 1 conv0 c0 0=32 ReLU relu0 1 1 c0 relu0 31=16 SomeCPULayer c1 1 1 relu0 c1 0=32 ``` ## disable winograd for certain layer to reduce memory usage ```ruby 7767517 3 3 Input input 0 1 input0 0=22 1=22 2=32 Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1 Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 ``` The winograd technology uses more memory for the purpose of improving convolution performance, but the performance gain is not always realized. In some memory-constrained situations, or memory IO bottlenecks, we can disable the use of winograd on some layers in exchange for a smaller memory footprint. Add `31=64` to Convolution layer, which forces it to use implicit-gemm or tiled im2col-gemm implementation, reducing memory usage and sometimes improving vulkan performance.
```ruby 7767517 3 3 Input input 0 1 input0 0=22 1=22 2=32 Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1 Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=64 ``` ## disable threading for certain layer to improve performance ```ruby 7767517 4 4 Input input 0 1 input0 0=22 1=22 2=3 Convolution conv0 1 1 input0 conv0 0=16 1=3 6=432 HardSigmoid hs 1 1 conv0 hs0 Convolution conv1 1 1 hs0 conv1 0=16 1=3 6=2304 ``` The overhead of multi-thread dispatch and merging is too large for small tensors. Add `31=128` to HardSigmoid layer, which forces it to execute in a single thread, reducing power consumption and improving performance. ```ruby 7767517 4 4 Input input 0 1 input0 0=22 1=22 2=3 Convolution conv0 1 1 input0 conv0 0=16 1=3 6=432 HardSigmoid hs 1 1 conv0 hs0 31=128 Convolution conv1 1 1 hs0 conv1 0=16 1=3 6=2304 ``` ================================================ FILE: docs/developer-guide/layer-support-behavior.md ================================================ # Understanding `support_XYZ` Properties in ncnn's `Layer` Class This document is for developers implementing new layers in `ncnn`. It explains the `support_XYZ` boolean properties in the `ncnn::Layer` base class. Correctly setting these properties declares the capabilities of your layer to the `ncnn` inference engine. This allows the engine to apply specific optimizations, such as enabling SIMD, half-precision floating-point computation, or Vulkan GPU acceleration, to achieve optimal performance and memory efficiency. ## When to Set `support` Properties A layer can set its `support` properties in two ways: 1. **Statically in the constructor**: If the layer's capabilities are fixed, the simplest way is to set them in its constructor. 2. **Dynamically in `create_pipeline`**: If the layer's capabilities depend on parameters loaded from `load_param` or `load_model` (e.g., the data type of weights), you can set these properties dynamically within the `create_pipeline` method. 
--- ## Property Details Here is a detailed breakdown of each `support` property and what it means for your layer's implementation. ### `one_blob_only` * **Purpose**: Declares that the layer accepts only one input `blob` and produces only one output `blob`. * **Requirements if `true`**: You must implement the single-input, single-output version of the `forward` method: ```cpp virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; ``` * **Behavior**: When `true`, `ncnn` calls this overload. If `false` (default), the `std::vector` version of `forward` is called. ### `support_inplace` * **Purpose**: Declares that the layer supports in-place computation, meaning the input and output can share the same memory. This significantly reduces memory overhead. * **Requirements if `true`**: You must implement the `forward_inplace` method. Depending on whether `one_blob_only` is also enabled, implement the corresponding version: ```cpp // If one_blob_only is true virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; // If one_blob_only is false virtual int forward_inplace(std::vector& bottom_top_blobs, const Option& opt) const; ``` ### `support_vulkan` * **Purpose**: Declares that the layer has a Vulkan implementation for GPU-accelerated inference. * **Requirements if `true`**: * Implement `forward` / `forward_inplace` methods that accept `VkMat` for input and output. * Implement `upload_model` to transfer weight data to the GPU. * Implement `create_pipeline` and `destroy_pipeline` to manage Vulkan `Pipeline` objects and other GPU resources. ### `support_packing` (for CPU) * **Purpose**: Declares that the layer's **CPU implementation** can handle `Mat` data with a "packing" memory layout (i.e., `elempack > 1`). This is crucial for SIMD optimizations (e.g., processing 4 or 8 floats at once with NEON or AVX). 
* **Behavior if `true`**: * When the input `Mat` channel count is a multiple of the SIMD width, the `ncnn` engine ensures that the input `Mat` passed to `forward` / `forward_inplace` is packed (e.g., `elempack=4` or `elempack=8`). * Your implementation must correctly handle `Mat` data where `cstep` and `elempack` are not their default values. * **Behavior if `false`**: * The `ncnn` engine guarantees that the input `Mat` passed to your layer will always have `elempack=1`. The engine will automatically insert conversions if the preceding layer produced a packed output. * **Output**: Regardless of the property's value, your layer can output a `Mat` with any `elempack`. However, it is highly recommended to output a `Mat` with an adaptive `elempack` to avoid unnecessary conversions in subsequent layers. ### `support_any_packing` (for CPU) * **Purpose**: An extension of `support_packing`. It declares that the layer's **CPU implementation** is flexible enough to handle a `Mat` with **any** `elempack` value (`1`, `4`, `8`, etc.). * **Behavior if `true`**: * The `ncnn` engine can pass an input `Mat` with any packing layout to your `forward` method, without forcing a conversion to the hardware's "optimal" `elempack`. For example, on an AVX512 system where `elempack=16` is optimal, your layer can still accept `elempack=1`, `4`, or `8`. * This gives the engine more flexibility to avoid unnecessary packing/unpacking conversions between layers. * **Behavior if `false`**: If `false` (but `support_packing` is `true`), the engine will try to provide an input `Mat` with an optimal `elempack` for the target architecture. * **Output**: This property does not enforce any constraint on the output `Mat`, which can have any `elempack`. ### `support_vulkan_packing` (for Vulkan) * **Purpose**: This is the Vulkan equivalent of `support_packing`. It declares that the layer's **Vulkan implementation** can handle `VkMat` with `elempack=4`. 
* **Behavior if `true`**: When the input `VkMat` has a channel count that is a multiple of 4, the `ncnn` engine will provide a packed `VkMat` (with `elempack=4`) to your Vulkan `forward` methods. * **Behavior if `false`**: The engine will ensure the input `VkMat` has `elempack=1`. * **Note**: `support_packing` and `support_vulkan_packing` are independent. A layer can support packing on CPU but not on Vulkan, or vice-versa. ### `support_vulkan_any_packing` (for Vulkan) * **Purpose**: An extension of `support_vulkan_packing`. It declares that the layer's **Vulkan implementation** can handle a `VkMat` with **any** supported `elempack` value (e.g., `1`, `4`). * **Behavior if `true`**: * The `ncnn` engine can pass an input `VkMat` with any supported packing layout to your Vulkan `forward` method. This allows the engine to avoid unnecessary repacking operations on the GPU. * This is particularly useful for optimizing shader dispatch and memory access patterns. * **Behavior if `false`**: If `false` (but `support_vulkan_packing` is `true`), the engine will try to provide a `VkMat` with `elempack=4` if the channel count is a multiple of 4. * **Note**: This property is independent of its CPU counterpart, `support_any_packing`. ### `support_bf16_storage` * **Purpose**: Declares that the layer can process `bfloat16` data. * **Behavior if `true`**: * The `forward` method may receive an input `Mat` of type `bfloat16` (`elembits() == 16`) or `fp32`. * Inside your `forward` implementation, you must check `opt.use_bf16_storage` and `bottom_blob.elembits()` to determine whether to use a `bfloat16`-optimized code path. * **Behavior if `false`**: The `ncnn` engine ensures your layer will **not** receive a `bfloat16` `Mat`. * **Output**: Your layer can output either a `bfloat16` or `fp32` `Mat`. When `opt.use_bf16_storage` is active, outputting `bfloat16` is recommended to maintain precision and performance across the network. 
### `support_fp16_storage` * **Purpose**: Declares that the layer can process `float16` data for half-precision inference. * **Behavior if `true`**: * Similar to `support_bf16_storage`, the `forward` method may receive an `fp16` or `fp32` `Mat`. * Your implementation should check `opt.use_fp16_storage` and `bottom_blob.elembits()` to select the correct code path. * **Behavior if `false`**: The `ncnn` engine ensures your layer will **not** receive an `fp16` `Mat`. * **Output**: Your layer can output either a `fp16` or `fp32` `Mat`. When `opt.use_fp16_storage` is active, outputting an `fp16` `Mat` is recommended. ### `support_int8_storage` * **Purpose**: Declares that the layer supports `int8` quantized inference. * **Behavior if `true`**: * When `opt.use_int8_inference` is `true`, the `forward` method may receive an `int8` or `fp32` `Mat`. * **Important**: If the input is `fp32`, your `forward` implementation is responsible for dynamically quantizing it to `int8` before performing computations. * **Behavior if `false`**: The `ncnn` engine ensures your layer will **not** receive an `int8` `Mat`. * **Output**: The output can be `int8` or `fp32`, depending on your layer's design. --- ## Practical Implementation and Priorities ### Handling Multiple Precision Types A layer can set `support_fp16_storage` and `support_bf16_storage` to `true` simultaneously. The `ncnn` engine prioritizes these formats based on the `Option` flags. As seen in the `convert_layout` function in `src/net.cpp`, if `opt.use_bf16_storage` is true, the engine will prefer converting inputs to `bfloat16`. Otherwise, it falls back to `fp16` if `opt.use_fp16_storage` is true. The chosen `elempack` also depends on the precision. For instance, with SIMD, the priority might be: * FP16: `elempack=8` (if supported), then `elempack=4`, then `1`. * BF16: `elempack=4`, then `1`. Your `forward` implementation should reflect this by checking `elembits()` and `elempack` to dispatch to the correct kernel. 
### Code Example: `Clip_arm` The `Clip_arm` layer provides a great example of these concepts in practice. 1. **Declaring Support in the Constructor**: It declares support for packing and, conditionally, for fp16 and bf16 storage. ```cpp // From: src/layer/arm/clip_arm.cpp Clip_arm::Clip_arm() { #if __ARM_NEON support_packing = true; #if NCNN_ARM82 support_fp16_storage = cpu_support_arm_asimdhp(); #endif #endif // __ARM_NEON #if NCNN_BF16 support_bf16_storage = true; #endif } ``` 2. **Dispatching in `forward_inplace`**: The `forward_inplace` method acts as a dispatcher. It first checks the element size (`elembits`) and the corresponding `opt` flag to decide whether to call a specialized low-precision implementation (`fp16s` or `bf16s`). If neither is applicable, it defaults to the standard `fp32` implementation. ```cpp // From: src/layer/arm/clip_arm.cpp int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int elembits = bottom_top_blob.elembits(); #if NCNN_ARM82 if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) return forward_inplace_fp16s(bottom_top_blob, opt); #endif #if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); #endif // Default fp32 implementation follows... int w = bottom_top_blob.w; // ... } ``` ### An Incremental Development Workflow Adopting a gradual approach can simplify the development of a new layer: 1. **Implement the Core Algorithm**: Start with all `support_XYZ` properties set to `false`. Focus on getting the mathematical logic correct using standard `fp32` data and `elempack=1`. 2. **Add Packing Support**: Once the core logic is validated, set `support_packing = true`. Modify your code to handle `elempack > 1` and implement SIMD optimizations (e.g., using NEON intrinsics). 3. **Add Low-Precision Support**: Next, add support for `fp16`, `bf16`, or `int8`. 
Set the corresponding `support_*_storage` flags to `true` and add branches in your `forward` method to handle these data types based on the `opt` flags. 4. **Add Vulkan Support**: Finally, if GPU acceleration is desired, set `support_vulkan = true` and implement the Vulkan-specific methods. This incremental process allows you to tackle one challenge at a time, making it easier to develop a highly optimized and feature-rich layer. ================================================ FILE: docs/developer-guide/low-level-operation-api.md ================================================ # implement elementwise addition with/without broadcast using BinaryOp operation * input must be fp32 storage without packing * output is expected to be fp32 storage without packing ```cpp void binary_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& c) { ncnn::Option opt; opt.num_threads = 2; opt.use_fp16_storage = false; opt.use_packing_layout = false; ncnn::Layer* op = ncnn::create_layer("BinaryOp"); // set param ncnn::ParamDict pd; pd.set(0, 0);// op_type op->load_param(pd); op->create_pipeline(opt); // forward std::vector bottoms(2); bottoms[0] = a; bottoms[1] = b; std::vector tops(1); op->forward(bottoms, tops, opt); c = tops[0]; op->destroy_pipeline(opt); delete op; } ``` # implement 3x3 box blur on three channel image using ConvolutionDepthWise operation * input must be fp32 storage without packing * output is expected to be fp32 storage without packing ```cpp void convolution_3x3_boxblur_RGB(const ncnn::Mat& rgb, ncnn::Mat& out) { ncnn::Option opt; opt.num_threads = 2; opt.use_fp16_storage = false; opt.use_packing_layout = false; ncnn::Layer* op = ncnn::create_layer("ConvolutionDepthWise"); // set param ncnn::ParamDict pd; pd.set(0, 3);// num_output pd.set(1, 3);// kernel_w pd.set(5, 0);// bias_term pd.set(6, 3*3*3);// weight_data_size pd.set(7, 3);// group op->load_param(pd); // set weights ncnn::Mat weights[1]; weights[0].create(3*3*3);// weight_data for (int i=0; i<3*3*3; 
i++) { weights[0][i] = 1.f / 9; } op->load_model(ncnn::ModelBinFromMatArray(weights)); op->create_pipeline(opt); // forward op->forward(rgb, out, opt); op->destroy_pipeline(opt); delete op; } ``` # transpose Mat, chw to cwh * input must be fp32 storage with/without packing * output is expected to be fp32 storage packed ```cpp void transpose(const ncnn::Mat& in, ncnn::Mat& out) { ncnn::Option opt; opt.num_threads = 2; opt.use_fp16_storage = false; opt.use_packing_layout = true; ncnn::Layer* op = ncnn::create_layer("Permute"); // set param ncnn::ParamDict pd; pd.set(0, 1);// order_type op->load_param(pd); op->create_pipeline(opt); ncnn::Mat in_packed = in; { // resolve dst_elempack int dims = in.dims; int elemcount = 0; if (dims == 1) elemcount = in.elempack * in.w; if (dims == 2) elemcount = in.elempack * in.h; if (dims == 3) elemcount = in.elempack * in.c; int dst_elempack = 1; if (op->support_packing) { if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx())) dst_elempack = 8; else if (elemcount % 4 == 0) dst_elempack = 4; } if (in.elempack != dst_elempack) { convert_packing(in, in_packed, dst_elempack, opt); } } // forward op->forward(in_packed, out, opt); op->destroy_pipeline(opt); delete op; } ``` # apply instance normalization // x = (x - mean) / sqrt(var) * input can be fp32/fp16 storage with/without packing * output is expected to be fp16 storage packed when supported, or fp32 storage packed otherwise ```cpp void normalize(const ncnn::Mat& in, ncnn::Mat& out) { ncnn::Option opt; opt.num_threads = 2; opt.use_fp16_storage = true; opt.use_packing_layout = true; ncnn::Layer* op = ncnn::create_layer("InstanceNorm"); // set param ncnn::ParamDict pd; pd.set(0, in.c);// channels pd.set(1, 0.f);// eps op->load_param(pd); // set weights ncnn::Mat weights[2]; weights[0].create(in.c);// gamma_data weights[1].create(in.c);// beta_data weights[0].fill(1.f); weights[1].fill(0.f); op->load_model(ncnn::ModelBinFromMatArray(weights)); 
op->create_pipeline(opt); ncnn::Mat in_fp16 = in; if (in.elembits() == 32 && op->support_fp16_storage) { cast_float32_to_float16(in, in_fp16, opt); } if (in.elembits() == 16 && !op->support_fp16_storage) { cast_float16_to_float32(in, in_fp16, opt); } ncnn::Mat in_fp16_packed = in_fp16; { // resolve dst_elempack int dims = in_fp16.dims; int elemcount = 0; if (dims == 1) elemcount = in_fp16.elempack * in_fp16.w; if (dims == 2) elemcount = in_fp16.elempack * in_fp16.h; if (dims == 3) elemcount = in_fp16.elempack * in_fp16.c; int dst_elempack = 1; if (op->support_packing) { if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx())) dst_elempack = 8; else if (elemcount % 4 == 0) dst_elempack = 4; } if (in_fp16.elempack != dst_elempack) { convert_packing(in_fp16, in_fp16_packed, dst_elempack, opt); } } // forward op->forward(in_fp16_packed, out, opt); op->destroy_pipeline(opt); delete op; } ``` # cpu -> gpu -> forward -> gpu -> cpu ```cpp ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); ncnn::VkWeightAllocator* weight_vkallocator = new ncnn::VkWeightAllocator(vkdev); ncnn::VkWeightStagingAllocator* weight_staging_vkallocator = new ncnn::VkWeightStagingAllocator(vkdev); // create layer ncnn::Layer* convolution = ncnn::create_layer("Convolution"); convolution->vkdev = vkdev; // set option ncnn::Option opt; opt.num_threads = 4; opt.use_vulkan_compute = true; opt.blob_vkallocator = blob_vkallocator; opt.workspace_vkallocator = blob_vkallocator; opt.staging_vkallocator = staging_vkallocator; // load param { ncnn::ParamDict pd; pd.set(0, outch); pd.set(1, ksize); pd.set(6, outch*inch*ksize*ksize); pd.use_vulkan_compute = 1; convolution->load_param(pd); } // load model { ncnn::Mat weights[2]; weights[0] = random_mat(outch*inch*ksize*ksize); weights[1] = random_mat(outch); ncnn::ModelBinFromMatArray 
mb(weights); convolution->load_model(mb); } // create pipeline convolution->create_pipeline(opt); // upload model { ncnn::VkTransfer cmd(vkdev); ncnn::Option opt_upload = opt; opt_upload.blob_vkallocator = weight_vkallocator; opt_upload.workspace_vkallocator = weight_vkallocator; opt_upload.staging_vkallocator = weight_staging_vkallocator; convolution->upload_model(cmd, opt_upload); cmd.submit_and_wait(); } ncnn::Mat bottom = random_mat(w, h, inch); ncnn::Mat top; // forward { ncnn::VkCompute cmd(vkdev); ncnn::VkMat bottom_gpu; cmd.record_upload(bottom, bottom_gpu, opt); ncnn::VkMat top_gpu; convolution->forward(bottom_gpu, top_gpu, cmd, opt); cmd.record_download(top_gpu, top, opt); cmd.submit_and_wait(); } convolution->destroy_pipeline(opt); delete convolution; vkdev->reclaim_blob_allocator(blob_vkallocator); vkdev->reclaim_staging_allocator(staging_vkallocator); weight_vkallocator->clear(); weight_staging_vkallocator->clear(); delete weight_vkallocator; delete weight_staging_vkallocator; ``` ================================================ FILE: docs/developer-guide/ncnn-tips-and-tricks.zh.md ================================================ ### blob内存是隐含共享的 ncnn的blob最初直接使用opencv的cv::Mat,后发现blob最多只支持三维,因此实现了类似的Mat Mat的data每个通道内存16字节对齐,并且有原子的引用计数,a=b不复制数据,超级快 Mat支持直接引用外部的内存块,不复制数据,加快模型加载和输入输出 举个例子:split layer 将一个blob复制成n个,ncnn中实现为单纯的增加引用计数,没有任何数据复制 ### 只运算一部分并保留中间结果 ncnn的net在解决分支依赖时是自上而下深度优先的,因此当网络有多个分支时,运算只会在需要结果的那个分支中进行,节约时间 当多个分支有重合部分时,运算其中一个分支后会自动保留其余分支所需的中间结果,隐含共享,以便运算其余分支时利用 举个例子:某网络结构为 A -> B -> C1 + C2,向ncnn索要C1结果时,运算过程是 A -> B -> C1,同时B结果引用计数加1自动保留,后面还需要C2结果时,只运算C2就足够了 ### 开启轻模式省内存 每个layer都会产生blob,除了最后的结果和多分支中间结果,大部分blob都不值得保留,开启轻模式可以在运算后自动回收,省下内存 举个例子:某网络结构为 A -> B -> C,在轻模式下,向ncnn索要C结果时,A结果会在运算B时自动回收,而B结果会在运算C时自动回收,最后只保留C结果,后面再需要C结果会直接获得,满足绝大部分深度网络的使用方式 ### 网络和运算是分开的 ncnn的net是网络模型,实际使用的是extractor,也就是同个net可以有很多个运算实例,而且运算实例互不影响,中间结果保留在extractor内部,在多线程使用时共用网络的结构和参数数据,初始化网络模型和参数只需要一遍 举个例子:全局静态的net实例,初始化一次后,就能不停地生成extractor使用 ### openmp虽快但未必合适 
ncnn中几乎所有运算都能用上openmp多线程加速,而且性能很赞 不过系统有时候会突然慢一下,比如手机太热自动降频,界面操作等等,ncnn耗时也会偶尔抖动变长,在计算耗时稳定性比较重要的时候建议关闭openmp,或者设置下extractor线程数 举个例子:手机自拍时,用ncnn进行人脸实时定位,如果耗时突然涨一下就会感觉到掉帧,而稳定的帧率体验更好 ### NCNN_STDIO/NCNN_STRING禁用模型文件 ncnn支持加载自有的模型文件和模型内存,NCNN_STDIO控制是否需要支持加载模型文件,设成0能禁用这部分代码,从而减小库的体积,NCNN_STRING设成0能清除大部分可见的字符串和解析过程 模型内存加载时的参数数据是直接引用的,速度更快,通常在手机上使用这种方式 ### 削减 ncnn 内置的层实现 cmake的时候,加参数 -DWITH_LAYER_xxx=OFF 就可以完全不编译对应的内置层,这样可以进一步减小库的体积 ### 关于 ARM big.LITTLE 调度 调用set_cpu_powersave可以把ncnn运算线程控制在特定的cpu核心上,大核心速度快耗电多,小核心速度慢点但省电,大小一起用手机热得快 ================================================ FILE: docs/developer-guide/new-model-load-api.md ================================================ ## current model load api ### Cons #### long and awful code #### two functions #### deal float32 float16 quantized-u8 #### deal alignment size ```cpp #if NCNN_STDIO int Convolution::load_model(FILE* binfp) { int nread; union { struct { unsigned char f0; unsigned char f1; unsigned char f2; unsigned char f3; }; unsigned int tag; } flag_struct; nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp); if (nread != 1) { fprintf(stderr, "Convolution read flag_struct failed %d\n", nread); return -1; } unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3; weight_data.create(weight_data_size); if (weight_data.empty()) return -100; if (flag_struct.tag == 0x01306B47) { // half-precision weight data int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4); std::vector float16_weights; float16_weights.resize(align_weight_data_size); nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp); if (nread != 1) { fprintf(stderr, "Convolution read float16_weights failed %d\n", nread); return -1; } weight_data = Mat::from_float16(float16_weights.data(), weight_data_size); if (weight_data.empty()) return -100; } else if (flag != 0) { // quantized weight data float quantization_value[256]; nread = fread(quantization_value, 256 * sizeof(float), 1, 
binfp); if (nread != 1) { fprintf(stderr, "Convolution read quantization_value failed %d\n", nread); return -1; } int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4); std::vector index_array; index_array.resize(align_weight_data_size); nread = fread(index_array.data(), align_weight_data_size, 1, binfp); if (nread != 1) { fprintf(stderr, "Convolution read index_array failed %d\n", nread); return -1; } float* weight_data_ptr = weight_data; for (int i = 0; i < weight_data_size; i++) { weight_data_ptr[i] = quantization_value[ index_array[i] ]; } } else if (flag_struct.f0 == 0) { // raw weight data nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp); if (nread != 1) { fprintf(stderr, "Convolution read weight_data failed %d\n", nread); return -1; } } if (bias_term) { bias_data.create(num_output); if (bias_data.empty()) return -100; nread = fread(bias_data, num_output * sizeof(float), 1, binfp); if (nread != 1) { fprintf(stderr, "Convolution read bias_data failed %d\n", nread); return -1; } } return 0; } #endif // NCNN_STDIO int Convolution::load_model(const unsigned char*& mem) { union { struct { unsigned char f0; unsigned char f1; unsigned char f2; unsigned char f3; }; unsigned int tag; } flag_struct; memcpy(&flag_struct, mem, sizeof(flag_struct)); mem += sizeof(flag_struct); unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3; if (flag_struct.tag == 0x01306B47) { // half-precision weight data weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size); mem += alignSize(weight_data_size * sizeof(unsigned short), 4); if (weight_data.empty()) return -100; } else if (flag != 0) { // quantized weight data const float* quantization_value = (const float*)mem; mem += 256 * sizeof(float); const unsigned char* index_array = (const unsigned char*)mem; mem += alignSize(weight_data_size * sizeof(unsigned char), 4); weight_data.create(weight_data_size); if (weight_data.empty()) return 
-100; float* weight_data_ptr = weight_data; for (int i = 0; i < weight_data_size; i++) { weight_data_ptr[i] = quantization_value[ index_array[i] ]; } } else if (flag_struct.f0 == 0) { // raw weight data weight_data = Mat(weight_data_size, (float*)mem); mem += weight_data_size * sizeof(float); } if (bias_term) { bias_data = Mat(num_output, (float*)mem); mem += num_output * sizeof(float); } return 0; } ``` ## new model load api proposed ### Pros #### clean and simple api #### element type detection ```cpp int Convolution::load_model(const ModelBin& mb) { // auto detect element type weight_data = mb.load(weight_data_size, 0); if (weight_data.empty()) return -100; if (bias_term) { // certain type specified bias_data = mb.load(num_output, 1); if (bias_data.empty()) return -100; } return 0; } ``` ================================================ FILE: docs/developer-guide/new-param-load-api.md ================================================ ## current param load api ### Cons #### long and awful code #### three functions #### not extensible #### no default value #### no variable length array ``` MyLayer mylayer 1 1 in out 100 1.250000 ``` ``` binary 100 binary 1.250000 ``` ```cpp #if NCNN_STDIO #if NCNN_STRING int MyLayer::load_param(FILE* paramfp) { int nscan = fscanf(paramfp, "%d %f", &a, &b); if (nscan != 2) { fprintf(stderr, "MyLayer load_param failed %d\n", nscan); return -1; } return 0; } #endif // NCNN_STRING int MyLayer::load_param_bin(FILE* paramfp) { fread(&a, sizeof(int), 1, paramfp); fread(&b, sizeof(float), 1, paramfp); return 0; } #endif // NCNN_STDIO int MyLayer::load_param(const unsigned char*& mem) { a = *(int*)(mem); mem += 4; b = *(float*)(mem); mem += 4; return 0; } ``` ## new param load api proposed ### Pros #### clean and simple api #### default value #### extensible #### variable length array ``` 7767517 MyLayer mylayer 1 1 in out 0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0 ``` ``` binary 0xDD857600(magic) binary 0 binary 100 binary 1 binary 
1.250000 binary -23303 binary 5 binary 0.1 binary 0.2 binary 0.4 binary 0.8 binary 1.0 binary -233(EOP) ``` ```cpp int MyLayer::load_param(const ParamDict& pd) { // pd.get( param id (seq), default value ); a = pd.get(0, 100); b = pd.get(1, 1.25f); // get default value for c if not specified in param file c = pd.get(2, 0.001); // get array d = pd.get(3, Mat(len, array)); return 0; } ``` ================================================ FILE: docs/developer-guide/operation-param-weight-table.md ================================================ |operation|param id|param phase|default value|weight order| |:---:|:---:|:---:|:---:|:---:| |AbsVal||| |ArgMax|0|out_max_val|0| ||1|topk|1| |BatchNorm|0|channels|0|slope mean variance bias| ||1|eps|0.f| |Bias|0|bias_data_size|0| |BinaryOp|0|op_type|0| ||1|with_scalar|0| ||2|b|0.f| |BNLL||| |Cast|0|type_from|0| ||1|type_to|0| |Clip|0|min|-FLT_MAX| ||1|max|FLT_MAX| |Concat|0|axis|0| |Convolution|0|num_output|0|weight bias| ||1|kernel_w|0| ||2|dilation_w|1| ||3|stride_w|1| ||4|pad_left|0| ||5|bias_term|0| ||6|weight_data_size|0| ||8|int8_scale_term|0| ||9|activation_type|0| ||10|activation_params|[ ]| ||11|kernel_h|kernel_w| ||12|dilation_h|dilation_w| ||13|stride_h|stride_w| ||15|pad_right|pad_left| ||14|pad_top|pad_left| ||16|pad_bottom|pad_top| ||17|impl_type|0| ||18|pad_value|0.f| |ConvolutionDepthWise|0|num_output|0|weight bias| ||1|kernel_w|0| ||2|dilation_w|1| ||3|stride_w|1| ||4|pad_left|0| ||5|bias_term|0| ||6|weight_data_size|0| ||7|group|1| ||8|int8_scale_term|0| ||9|activation_type|0| ||10|activation_params|[ ]| ||11|kernel_h|kernel_w| ||12|dilation_h|dilation_w| ||13|stride_h|stride_w| ||15|pad_right|pad_left| ||14|pad_top|pad_left| ||16|pad_bottom|pad_top| ||18|pad_value|0.f| |Crop|0|woffset|0| ||1|hoffset|0| ||2|coffset|0| ||3|outw|0| ||4|outh|0| ||5|outc|0| ||6|woffset2|0| ||7|hoffset2|0| ||8|coffset2|0| ||9|starts|[ ]| ||10|ends|[ ]| ||11|axes|[ ]| |Deconvolution|0|num_output|0|weight bias| ||1|kernel_w|0| 
||2|dilation_w|1| ||3|stride_w|1| ||4|pad_left|0| ||5|bias_term|0| ||6|weight_data_size|0| ||9|activation_type|0| ||10|activation_params|[ ]| ||11|kernel_h|kernel_w| ||12|dilation_h|dilation_w| ||13|stride_h|stride_w| ||15|pad_right|pad_left| ||14|pad_top|pad_left| ||16|pad_bottom|pad_top| ||18|output_pad_right|0| ||19|output_pad_bottom|output_pad_right| ||20|output_w|0| ||21|output_h|output_w| |DeconvolutionDepthWise|0|num_output|0|weight bias| ||1|kernel_w|0| ||2|dilation_w|1| ||3|stride_w|1| ||4|pad_left|0| ||5|bias_term|0| ||6|weight_data_size|0| ||7|group|1| ||9|activation_type|0| ||10|activation_params|[ ]| ||11|kernel_h|kernel_w| ||12|dilation_h|dilation_w| ||13|stride_h|stride_w| ||15|pad_right|pad_left| ||14|pad_top|pad_left| ||16|pad_bottom|pad_top| ||18|output_pad_right|0| ||19|output_pad_bottom|output_pad_right| ||20|output_w|0| ||21|output_h|output_w| |Dequantize|0|scale|1.f|bias| ||1|bias_term|0| ||2|bias_data_size|0| |DetectionOutput|0|num_class|0| ||1|nms_threshold|0.05f| ||2|nms_top_k|300| ||3|keep_top_k|100| ||4|confidence_threshold|0.5f| ||5|variances[0]|0.1f| ||6|variances[1]|0.1f| ||7|variances[2]|0.2f| ||8|variances[3]|0.2f| |Dropout|0|scale|1.f| |Eltwise|0|op_type|0| ||1|coeffs|[ ]| |ELU|0|alpha|0.1f| |Embed|0|num_output|0|weight bias| ||1|input_dim|0| ||2|bias_term|0| ||3|weight_data_size|0| |Exp|0|base|-1.f| ||1|scale|1.f| ||2|shift|0.f| |ExpandDims|0|expand_w|0| ||1|expand_h|0| ||2|expand_c|0| ||3|axes|[ ]| |Flatten||| |HardSigmoid|0|alpha|0.2f|| ||1|beta|0.5f| |HardSwish|0|alpha|0.2f|| ||1|beta|0.5f| |InnerProduct|0|num_output|0|weight bias| ||1|bias_term|0| ||2|weight_data_size|0| ||8|int8_scale_term|0| ||9|activation_type|0| ||10|activation_params|[ ]| |Input|0|w|0| ||1|h|0| ||2|c|0| |InstanceNorm|0|channels|0|gamma bias| ||1|eps|0.001f| |Interp|0|resize_type|0| ||1|height_scale|1.f| ||2|width_scale|1.f| ||3|output_height|0| ||4|output_width|0| |Log|0|base|-1.f| ||1|scale|1.f| ||2|shift|0.f| |LRN|0|region_type|0| ||1|local_size|5| 
||2|alpha|1.f| ||3|beta|0.75f| ||4|bias|1.f| |LSTM|0|num_output|0| ||1|weight_data_size|1| ||2|direction|0| |MemoryData|0|w|0| ||1|h|0| ||2|c|0| |Mish||| |MVN|0|normalize_variance|0| ||1|across_channels|0| ||2|eps|0.0001f| |Noop||| |Normalize|0|across_spatial|0|scale| ||4|across_channel|0| ||1|channel_shared|0| ||2|eps|0.0001f| ||9|eps_mode|0| ||3|scale_data_size|0| |Packing|0|out_packing|1| ||1|use_padding|0| ||2|cast_type_from|0| ||3|cast_type_to|0| ||4|storage_type_from|0| ||5|storage_type_to|0| |Padding|0|top|0|per_channel_pad_data| ||1|bottom|0| ||2|left|0| ||3|right|0| ||4|type|0| ||5|value|0.f| ||6|per_channel_pad_data_size|0| ||7|front|0| ||8|behind|0| |Permute|0|order_type|0| |PixelShuffle|0|upscale_factor|1| |Pooling|0|pooling_type(0: max 1: avg)|0| ||1|kernel_w|0| ||11|kernel_h|kernel_w| ||2|stride_w|1| ||12|stride_h|stride_w| ||3|pad_left|0| ||14|pad_right|pad_left| ||13|pad_top|pad_left| ||15|pad_bottom|pad_top| ||4|global_pooling|0| ||5|pad_mode|0| |Power|0|power|1.f| ||1|scale|1.f| ||2|shift|0.f| |PReLU|0|num_slope|0|slope| |PriorBox|0|min_sizes|[ ]| ||1|max_sizes|[ ]| ||2|aspect_ratios|[ ]| ||3|variances[0]|0.f| ||4|variances[1]|0.f| ||5|variances[2]|0.f| ||6|variances[3]|0.f| ||7|flip|1| ||8|clip|0| ||9|image_width|0| ||10|image_height|0| ||11|step_width|-233.f| ||12|step_height|-233.f| ||13|offset|0.f| ||14|step_mmdetection|0| ||15|center_mmdetection|0| |Proposal|0|feat_stride|16| ||1|base_size|16| ||2|pre_nms_topN|6000| ||3|after_nms_topN|300| ||4|nms_thresh|0.7f| ||5|min_size|16| |PSROIPooling|0|pooled_width|7| ||1|pooled_height|7| ||2|spatial_scale|0.0625f| ||3|output_dim|0| |Quantize|0|scale|1.f| |Reduction|0|operation|0| ||1|dim|0| ||2|coeff|1.f| ||3|axes|[ ]| ||4|keepdims|0| |ReLU|0|slope|0.f| |Reorg|0|stride|0| |Requantize|0|scale_in|1.f|bias| ||1|scale_out|1.f| ||2|bias_term|0| ||3|bias_data_size|0| ||4|fusion_relu|0| |Reshape|0|w|-233| ||1|h|-233| ||2|c|-233| ||3|permute|0| |ROIAlign|0|pooled_width|0| ||1|pooled_height|0|
||2|spatial_scale|1.f| ||3|sampling_ratio|0| ||4|aligned|0| ||5|version|0| |ROIPooling|0|pooled_width|0| ||1|pooled_height|0| ||2|spatial_scale|1.f| |Scale|0|scale_data_size|0|scale bias| ||1|bias_term|0| |SELU|0|alpha|1.67326324f|| ||1|lambda|1.050700987f| |ShuffleChannel|0|group|1| |Sigmoid||| |Slice|0|slices|[ ]| ||1|axis|0| |Softmax|0|axis|0| |Split||| |SPP|0|pooling_type|0| ||1|pyramid_height|1| |Squeeze|0|squeeze_w|0| ||1|squeeze_h|0| ||2|squeeze_c|0| ||3|axes|[ ]| |StatisticsPooling|0|include_stddev|0| |Swish||| |TanH||| |Threshold|0|threshold|0.f| |Tile|0|dim|0| ||1|tiles|1| |UnaryOp|0|op_type|0| |YoloDetectionOutput|0|num_class|20| ||1|num_box|5| ||2|confidence_threshold|0.01f| ||3|nms_threshold|0.45f| ||4|biases|[]| |Yolov3DetectionOutput|0|num_class|20| ||1|num_box|5| ||2|confidence_threshold|0.01f| ||3|nms_threshold|0.45f| ||4|biases|[]| ||5|mask|[]| ||6|anchors_scale|[]| |RNN|0|num_output|0| ||1|weight_data_size|0| ||2|direction|0| |MultiHeadAttention|0|embed_dim|0| ||1|num_head|1| ||2|weight_data_size|0| ================================================ FILE: docs/developer-guide/operators.md ================================================ * [AbsVal](#absval) * [ArgMax](#argmax) * [BatchNorm](#batchnorm) * [Bias](#bias) * [BinaryOp](#binaryop) * [BNLL](#bnll) * [Cast](#cast) * [CELU](#celu) * [Clip](#clip) * [Concat](#concat) * [Convolution](#convolution) * [Convolution1D](#convolution1d) * [Convolution3D](#convolution3d) * [ConvolutionDepthWise](#convolutiondepthwise) * [ConvolutionDepthWise1D](#convolutiondepthwise1d) * [ConvolutionDepthWise3D](#convolutiondepthwise3d) * [CopyTo](#copyto) * [Crop](#crop) * [CumulativeSum](#cumulativesum) * [Deconvolution](#deconvolution) * [Deconvolution1D](#deconvolution1d) * [Deconvolution3D](#deconvolution3d) * [DeconvolutionDepthWise](#deconvolutiondepthwise) * [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) * [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) * [DeformableConv2D](#deformableconv2d) *
[Dequantize](#dequantize) * [Diag](#diag) * [Dropout](#dropout) * [Eltwise](#eltwise) * [ELU](#elu) * [Embed](#embed) * [Exp](#exp) * [ExpandDims](#expanddims) * [Flatten](#flatten) * [Flip](#flip) * [Fold](#fold) * [GELU](#gelu) * [GLU](#glu) * [Gemm](#gemm) * [GridSample](#gridsample) * [GroupNorm](#groupnorm) * [GRU](#gru) * [HardSigmoid](#hardsigmoid) * [HardSwish](#hardswish) * [InnerProduct](#innerproduct) * [Input](#input) * [InstanceNorm](#instancenorm) * [Interp](#interp) * [InverseSpectrogram](#inversespectrogram) * [LayerNorm](#layernorm) * [Log](#log) * [LRN](#lrn) * [LSTM](#lstm) * [MemoryData](#memorydata) * [Mish](#mish) * [MultiHeadAttention](#multiheadattention) * [MVN](#mvn) * [Noop](#noop) * [Normalize](#normalize) * [Packing](#packing) * [Padding](#padding) * [Permute](#permute) * [PixelShuffle](#pixelshuffle) * [Pooling](#pooling) * [Pooling1D](#pooling1d) * [Pooling3D](#pooling3d) * [Power](#power) * [PReLU](#prelu) * [Quantize](#quantize) * [Reduction](#reduction) * [ReLU](#relu) * [Reorg](#reorg) * [Requantize](#requantize) * [Reshape](#reshape) * [RMSNorm](#rmsnorm) * [RNN](#rnn) * [RotaryEmbed](#rotaryembed) * [Scale](#scale) * [SDPA](#sdpa) * [SELU](#selu) * [Shrink](#shrink) * [ShuffleChannel](#shufflechannel) * [Sigmoid](#sigmoid) * [Slice](#slice) * [Softmax](#softmax) * [Softplus](#softplus) * [Spectrogram](#spectrogram) * [Split](#split) * [Squeeze](#squeeze) * [Swish](#swish) * [TanH](#tanh) * [Threshold](#threshold) * [Tile](#tile) * [UnaryOp](#unaryop) * [Unfold](#unfold) # AbsVal ``` y = abs(x) ``` * one_blob_only * support_inplace # ArgMax ``` y = argmax(x, out_max_val, topk) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | out_max_val | int | 0 | | | 1 | topk | int | 1 | | # BatchNorm ``` y = (x - mean) / sqrt(var + eps) * slope + bias ``` * one_blob_only * support_inplace | param id | name | type | default | description | | 
--------- | ------------- | ----- | --------- | ----------------- | | 0 | channels | int | 0 | | | 1 | eps | float | 0.f | | | weight | type | shape | | ------------- | ----- | --------------------- | | slope_data | float | [channels] | | mean_data | float | [channels] | | var_data | float | [channels] | | bias_data | float | [channels] | # Bias ``` y = x + bias ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | bias_data_size| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | bias_data | float | [channels] | # BinaryOp This operation is used for binary computation, and the calculation rule depends on the [broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting). ``` C = binaryop(A, B) ``` if with_scalar = 1: - one_blob_only - support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | op_type | int | 0 | Operation type as follows | | 1 | with_scalar | int | 0 | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar | | 2 | b | float | 0.f | When B is a scalar, B = b | Operation type: - 0 = ADD - 1 = SUB - 2 = MUL - 3 = DIV - 4 = MAX - 5 = MIN - 6 = POW - 7 = RSUB - 8 = RDIV - 9 = RPOW - 10 = ATAN2 - 11 = RATAN2 # BNLL ``` y = x + log(1 + e^(-x)) , x > 0 y = log(1 + e^x), x <= 0 ``` * one_blob_only * support_inplace # Cast ``` y = cast(x) ``` * one_blob_only * support_packing | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | type_from | int | 0 | | | 1 | type_to | int | 0 | | Element type: - 0 = auto - 1 = float32 - 2 = float16 - 3 = int8 - 4 = bfloat16 # CELU ``` if x < 0 y = (exp(x / alpha) - 1.f) * alpha else y = x ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- |
----- | --------- | ----------------- | | 0 | alpha | float | 1.f | | # Clip ``` y = clamp(x, min, max) ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | min | float | -FLT_MAX | | | 1 | max | float | FLT_MAX | | # Concat ``` y = concat(x0, x1, x2, ...) by axis ``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | axis | int | 0 | | # Convolution ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 8 | int8_scale_term| int | 0 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | 18 | pad_value | float | 0.f | | | 19 | dynamic_weight| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | | bias_data | float | [num_output] | | weight_data_int8_scales| float | [num_output] | | bottom_blob_int8_scales| float | [1] | | top_blob_int8_scales| float | [1] | # Convolution1D ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | 
------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 15 | pad_right | int | pad_left | | | 18 | pad_value | float | 0.f | | | 19 | dynamic_weight| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16/int8 | [kernel_w, num_input, num_output] | | bias_data | float | [num_output] | # Convolution3D ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | 17 | pad_behind | int | pad_front | | | 18 | pad_value | float | 0.f | | | 21 | kernel_d | int | kernel_w | | | 22 | dilation_d | int | dilation_w | | | 23 | stride_d | int | stride_w | | | 24 | pad_front | int | pad_left | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | | bias_data | float | [num_output] | # ConvolutionDepthWise ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation, group) + bias y = 
activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 7 | group | int | 1 | | | 8 | int8_scale_term| int | 0 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | 18 | pad_value | float | 0.f | | | 19 | dynamic_weight| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | | bias_data | float | [num_output] | | weight_data_int8_scales| float | [group] | | bottom_blob_int8_scales| float | [1] | | top_blob_int8_scales| float | [1] | # ConvolutionDepthWise1D ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 7 | group | int | 1 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 15 | pad_right | int | pad_left | | | 18 | pad_value | float | 0.f | | | 19 | dynamic_weight| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | 
float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] | | bias_data | float | [num_output] | # ConvolutionDepthWise3D ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 7 | group | int | 1 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | 17 | pad_behind | int | pad_front | | | 18 | pad_value | float | 0.f | | | 21 | kernel_d | int | kernel_w | | | 22 | dilation_d | int | dilation_w | | | 23 | stride_d | int | stride_w | | | 24 | pad_front | int | pad_left | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | | bias_data | float | [num_output] | # CopyTo ``` self[offset] = src ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | woffset | int | 0 | | | 1 | hoffset | int | 0 | | | 13 | doffset | int | 0 | | | 2 | coffset | int | 0 | | | 9 | starts | array | [ ] | | | 11 | axes | array | [ ] | | # Crop ``` y = crop(x) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | woffset | int | 0 | | | 1 | hoffset | int | 0 | | | 13 | doffset 
| int | 0 | | | 2 | coffset | int | 0 | | | 3 | outw | int | 0 | | | 4 | outh | int | 0 | | | 14 | outd | int | 0 | | | 5 | outc | int | 0 | | | 6 | woffset2 | int | 0 | | | 7 | hoffset2 | int | 0 | | | 15 | doffset2 | int | 0 | | | 8 | coffset2 | int | 0 | | | 9 | starts | array | [ ] | | | 10 | ends | array | [ ] | | | 11 | axes | array | [ ] | | | 19 | starts_expr | str | "" | | | 20 | ends_expr | str | "" | | | 21 | axes_expr | str | "" | | # CumulativeSum If axis < 0, we use axis = x.dims + axis It implements https://pytorch.org/docs/stable/generated/torch.cumsum.html * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | axis | int | 0 | | # Deconvolution ``` x2 = deconv(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | 18 | output_pad_right| int | 0 | | | 19 | output_pad_bottom| int | output_pad_right | | | 20 | output_w | int | 0 | | | 21 | output_h | int | output_w | | | 28 | dynamic_weight| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16 | [kernel_w, kernel_h, num_input, num_output] | | bias_data | float | [num_output] | # Deconvolution1D ``` x2 = deconv1d(x, weight, 
kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 15 | pad_right | int | pad_left | | | 18 | output_pad_right| int | 0 | | | 20 | output_w | int | 0 | | | 28 | dynamic_weight| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16 | [kernel_w, num_input, num_output] | | bias_data | float | [num_output] | # Deconvolution3D ``` x2 = deconv3d(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | 17 | pad_behind | int | pad_front | | | 18 | output_pad_right| int | 0 | | | 19 | output_pad_bottom| int | output_pad_right | | | 20 | output_pad_behind| int | output_pad_right | | | 21 | kernel_d | int | kernel_w | | | 22 | dilation_d | int | dilation_w | | | 23 | stride_d | int | stride_w | | | 24 | pad_front | int | 
pad_left | | | 25 | output_w | int | 0 | | | 26 | output_h | int | output_w | | | 27 | output_d | int | output_w | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | | bias_data | float | [num_output] | # DeconvolutionDepthWise ``` x2 = deconv(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 7 | group | int | 1 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | 18 | output_pad_right| int | 0 | | | 19 | output_pad_bottom| int | output_pad_right | | | 20 | output_w | int | 0 | | | 21 | output_h | int | output_w | | | 28 | dynamic_weight| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | | bias_data | float | [num_output] | # DeconvolutionDepthWise1D ``` x2 = deconv1d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | 
| 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 7 | group | int | 1 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 15 | pad_right | int | pad_left | | | 18 | output_pad_right| int | 0 | | | 20 | output_w | int | 0 | | | 28 | dynamic_weight| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16 | [kernel_w, num_input / group, num_output / group, group] | | bias_data | float | [num_output] | # DeconvolutionDepthWise3D ``` x2 = deconv3d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 7 | group | int | 1 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | 17 | pad_behind | int | pad_front | | | 18 | output_pad_right| int | 0 | | | 19 | output_pad_bottom| int | output_pad_right | | | 20 | output_pad_behind| int | output_pad_right | | | 21 | kernel_d | int | kernel_w | | | 22 | dilation_d | int | dilation_w | | | 23 | stride_d | int | stride_w | | | 24 | pad_front | int | pad_left | | | 25 | output_w | int | 0 | | | 26 | output_h | int | output_w | | | 27 | output_d | int | output_w | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output 
/ group, group] | | bias_data | float | [num_output] | # DeformableConv2D ``` x2 = deformableconv2d(x, offset, mask, weight, kernel, stride, dilation) + bias y = activation(x2, act_type, act_params) ``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 5 | bias_term | int | 0 | | | 6 | weight_data_size| int | 0 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | | bias_data | float | [num_output] | # Dequantize ``` y = x * scale + bias ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | scale_data_size| int | 1 | | | 1 | bias_data_size| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | scale_data | float | [scale_data_size] | | bias_data | float | [bias_data_size] | # Diag ``` y = diag(x, diagonal) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | diagonal | int | 0 | | # Dropout ``` y = x * scale ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | scale | float | 1.f | | # Eltwise ``` y = elementwise_op(x0, x1, ...) 
``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | op_type | int | 0 | | | 1 | coeffs | array | [ ] | | Operation type: - 0 = PROD - 1 = SUM - 2 = MAX # ELU ``` if x < 0 y = (exp(x) - 1) * alpha else y = x ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | alpha | float | 0.1f | | # Embed ``` y = embedding(x) ``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | input_dim | int | 0 | | | 2 | bias_term | int | 0 | | | 3 | weight_data_size | int | 0 | | | 18 | int8_scale_term| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float | [weight_data_size] | | bias_data | float | [num_output] | | weight_data_int8_scales| float | [1] | # Exp ``` if base == -1 y = exp(shift + x * scale) else y = pow(base, (shift + x * scale)) ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | base | float | -1.f | | | 1 | scale | float | 1.f | | | 2 | shift | float | 0.f | | # ExpandDims * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 3 | axes | array | [ ] | | # Flatten Reshape blob to 1 dimension * one_blob_only # Flip * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | axes | array | [ ] | | # Fold ``` y = fold(x) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | dilation_w | int | 1 | | | 3 |
stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | | 20 | output_w | int | 0 | | | 21 | output_h | int | output_w | | # GELU ``` if fast_gelu == 1 y = 0.5 * x * (1 + tanh(0.79788452 * (x + 0.044715 * x * x * x))); else y = 0.5 * x * erfc(-0.70710678 * x) ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | fast_gelu | int | 0 | use approximation | # GLU If axis < 0, we use axis = x.dims + axis GLU(a,b)=a⊗σ(b) where a is the first half of the input matrix and b is the second half. axis specifies the dimension to split the input * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | axis | int | 0 | | # Gemm ``` a = transA ? transpose(x0) : x0 b = transB ?
transpose(x1) : x1 c = x2 y = (gemm(a, b) + c * beta) * alpha ``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | alpha | float | 1.f | | | 1 | beta | float | 1.f | | | 2 | transA | int | 0 | | | 3 | transB | int | 0 | | | 4 | constantA | int | 0 | | | 5 | constantB | int | 0 | | | 6 | constantC | int | 0 | | | 7 | constantM | int | 0 | | | 8 | constantN | int | 0 | | | 9 | constantK | int | 0 | | | 10 | constant_broadcast_type_C | int | 0 | | | 11 | output_N1M | int | 0 | | | 12 | output_elempack | int | 0 | | | 13 | output_elemtype | int | 0 | | | 14 | output_transpose | int| 0 | | | 18 | int8_scale_term | int | 0 | | | 20 | constant_TILE_M | int | 0 | | | 21 | constant_TILE_N | int | 0 | | | 22 | constant_TILE_K | int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | A_data | float/fp16/int8 | [M, K] or [K, M] | | B_data | float/fp16/int8 | [N, K] or [K, N] | | C_data | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] | | A_data_int8_scales| float | [M] | | B_data_int8_scales| float | [1] | # GridSample ``` Given an input and a flow-field grid, computes the output using input values and pixel locations from grid. For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, which are used to interpolate the output value output[:, h2, w2] This function is often used in conjunction with affine_grid() to build Spatial Transformer Networks.
``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | sample_type | int | 1 | | | 1 | padding_mode | int | 1 | | | 2 | align_corner | int | 0 | | | 3 | permute_fusion| int | 0 | fuse with permute | Sample type: - 1 = Nearest - 2 = Bilinear - 3 = Bicubic Padding mode: - 1 = zeros - 2 = border - 3 = reflection # GroupNorm ``` split x along channel axis into group x0, x1 ... l2 normalize for each group x0, x1 ... y = x * gamma + beta ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | group | int | 1 | | | 1 | channels | int | 0 | | | 2 | eps | float | 0.001f | x = x / sqrt(var + eps) | | 3 | affine | int | 1 | | | weight | type | shape | | ------------- | ----- | --------------------- | | gamma_data | float | [channels] | | beta_data | float | [channels] | # GRU Apply a single-layer GRU to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
``` y = gru(x) y0, hidden y1 = gru(x0, hidden x1) ``` * one_blob_only if bidirectional | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | hidden size of output | | 1 | weight_data_size| int | 0 | total size of weight matrix | | 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_xc_data| float/fp16/int8 | [input_size, num_output * 3, num_directions] | | bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | | weight_hc_data| float/fp16/int8 | [num_output, num_output * 3, num_directions] | Direction flag: - 0 = forward only - 1 = reverse only - 2 = bidirectional # HardSigmoid ``` y = clamp(x * alpha + beta, 0, 1) ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | alpha | float | 0.2f | | | 1 | beta | float | 0.5f | | # HardSwish ``` y = x * clamp(x * alpha + beta, 0, 1) ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | alpha | float | 0.2f | | | 1 | beta | float | 0.5f | | # InnerProduct ``` x2 = innerproduct(x, weight) + bias y = activation(x2, act_type, act_params) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | bias_term | int | 0 | | | 2 | weight_data_size| int | 0 | | | 8 | int8_scale_term| int | 0 | | | 9 | activation_type| int | 0 | | | 10 | activation_params| array | [ ] | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float/fp16/int8 | [num_input, num_output] | | bias_data | float | [num_output] | | weight_data_int8_scales| float | [num_output] | | 
bottom_blob_int8_scales| float | [1] | # Input ``` y = input ``` * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | w | int | 0 | | | 1 | h | int | 0 | | | 11 | d | int | 0 | | | 2 | c | int | 0 | | # InstanceNorm ``` split x along channel axis into instance x0, x1 ... l2 normalize for each channel instance x0, x1 ... y = x * gamma + beta ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | channels | int | 0 | | | 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | | 2 | affine | int | 1 | | | weight | type | shape | | ------------- | ----- | --------------------- | | gamma_data | float | [channels] | | beta_data | float | [channels] | # Interp ``` if dynamic_target_size == 0 y = resize(x) by fixed size or scale else y = resize(x0, size(x1)) ``` * one_blob_only if dynamic_target_size == 0 | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | resize_type | int | 0 | | | 1 | height_scale | float | 1.f | | | 2 | width_scale | float | 1.f | | | 3 | output_height | int | 0 | | | 4 | output_width | int | 0 | | | 5 | dynamic_target_size| int | 0 | | | 6 | align_corner | int | 0 | | | 9 | size_expr | str | "" | | Resize type: - 1 = Nearest - 2 = Bilinear - 3 = Bicubic # InverseSpectrogram ``` x1 = x as complex x1 = x1 * sqrt(norm) if normalized y = istft(x1) y1 = unpad(y) if center if returns == 0 return y1 as complex if returns == 1 return y1 real if returns == 2 return y1 imag ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | n_fft | int | 0 | | | 1 | returns | int | 1 | | | 2 | hoplen | int | n_fft / 4 | | | 3 | winlen | int | n_fft | | | 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | | 5 | 
center | int | 1 | | | 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | # LayerNorm ``` split x along outmost axis into part x0, x1 ... l2 normalize for each part x0, x1 ... y = x * gamma + beta by elementwise ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | affine_size | int | 0 | | | 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | | 2 | affine | int | 1 | | | weight | type | shape | | ------------- | ----- | --------------------- | | gamma_data | float | [affine_size] | | beta_data | float | [affine_size] | # Log ``` if base == -1 y = log(shift + x * scale) else y = log(shift + x * scale) / log(base) ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | base | float | -1.f | | | 1 | scale | float | 1.f | | | 2 | shift | float | 0.f | | # LRN ``` if region_type == ACROSS_CHANNELS square_sum = sum of channel window of local_size if region_type == WITHIN_CHANNEL square_sum = sum of spatial window of local_size y = x * pow(bias + alpha * square_sum / (local_size * local_size), -beta) ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | region_type | int | 0 | | | 1 | local_size | int | 5 | | | 2 | alpha | float | 1.f | | | 3 | beta | float | 0.75f | | | 4 | bias | float | 1.f | | Region type: - 0 = ACROSS_CHANNELS - 1 = WITHIN_CHANNEL # LSTM Apply a single-layer LSTM to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
``` y = lstm(x) y0, hidden y1, cell y2 = lstm(x0, hidden x1, cell x2) ``` * one_blob_only if bidirectional | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | output size of output | | 1 | weight_data_size| int | 0 | total size of IFOG weight matrix | | 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | | 3 | hidden_size | int | num_output| hidden size | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_xc_data| float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | | bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | | weight_hc_data| float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | | weight_hr_data| float/fp16/int8 | [hidden_size, num_output, num_directions] | Direction flag: - 0 = forward only - 1 = reverse only - 2 = bidirectional # MemoryData ``` y = data ``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | w | int | 0 | | | 1 | h | int | 0 | | | 11 | d | int | 0 | | | 2 | c | int | 0 | | | 21 | load_type | int | 1 | 1=fp32 | | weight | type | shape | | ------------- | ----- | --------------------- | | data | float | [w, h, d, c] | # Mish ``` y = x * tanh(log(exp(x) + 1)) ``` * one_blob_only * support_inplace # MultiHeadAttention ``` q_affine = affine(q) / (embed_dim / num_head) k_affine = affine(k) or reuse kv_cache part v_affine = affine(v) or reuse kv_cache part split q k v into num_head part q0, k0, v0, q1, k1, v1 ... 
for each num_head part qk = q * k qk = qk + attn_mask if attn_mask exists softmax(qk) qkv = qk * v merge qkv to out y = affine(out) ``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | embed_dim | int | 0 | | | 1 | num_heads | int | 1 | | | 2 | weight_data_size| int | 0 | qdim = weight_data_size / embed_dim | | 3 | kdim | int | embed_dim | | | 4 | vdim | int | embed_dim | | | 5 | attn_mask | int | 0 | | | 6 | scale | float | 1.f / sqrt(embed_dim / num_heads) | | | 7 | kv_cache | int | 0 | | | 18 | int8_scale_term | int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | q_weight_data | float/fp16/int8 | [embed_dim * qdim] | | q_bias_data | float | [embed_dim] | | k_weight_data | float/fp16/int8 | [embed_dim * kdim] | | k_bias_data | float | [embed_dim] | | v_weight_data | float/fp16/int8 | [embed_dim * vdim] | | v_bias_data | float | [embed_dim] | | out_weight_data| float/fp16/int8 | [qdim * embed_dim] | | out_bias_data | float | [qdim] | | q_weight_data_int8_scales| float | [embed_dim] | | k_weight_data_int8_scales| float | [embed_dim] | | v_weight_data_int8_scales| float | [embed_dim] | | out_weight_data_int8_scales| float | [1] | # MVN ``` if normalize_variance == 1 && across_channels == 1 y = (x - mean) / (sqrt(var) + eps) of whole blob if normalize_variance == 1 && across_channels == 0 y = (x - mean) / (sqrt(var) + eps) of each channel if normalize_variance == 0 && across_channels == 1 y = x - mean of whole blob if normalize_variance == 0 && across_channels == 0 y = x - mean of each channel ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | normalize_variance| int | 0 | | | 1 | across_channels| int | 0 | | | 2 | eps | float | 0.0001f | x = x / (sqrt(var) + eps) | # Noop ``` y = x ``` # Normalize ``` if across_spatial == 1 && across_channel == 1 x2 = 
normalize(x) of whole blob if across_spatial == 1 && across_channel == 0 x2 = normalize(x) of each channel if across_spatial == 0 && across_channel == 1 x2 = normalize(x) of each position y = x2 * scale ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | across_spatial| int | 0 | | | 1 | channel_shared| int | 0 | | | 2 | eps | float | 0.0001f | see eps mode | | 3 | scale_data_size| int | 0 | | | 4 | across_channel| int | 0 | | | 9 | eps_mode | int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | scale_data | float | [scale_data_size] | Eps Mode: - 0 = caffe/mxnet x = x / sqrt(var + eps) - 1 = pytorch x = x / max(sqrt(var), eps) - 2 = tensorflow x = x / sqrt(max(var, eps)) # Packing ``` y = wrap_packing(x) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | out_elempack | int | 1 | | | 1 | use_padding | int | 0 | | | 2 | cast_type_from| int | 0 | | | 3 | cast_type_to | int | 0 | | | 4 | storage_type_from| int | 0 | | | 5 | storage_type_to| int | 0 | | # Padding ``` y = pad(x, pads) ``` | param id | name | type | default | description | | --------- | ------------- | ---- | --------- | ----------------- | | 0 | top | int | 0 | | | 1 | bottom | int | 0 | | | 2 | left | int | 0 | | | 3 | right | int | 0 | | | 4 | type | int | 0 | | | 5 | value | float | 0 | | | 6 | per_channel_pad_data_size| int | 0 | | | 7 | front | int | 0 | | | 8 | behind | int | front | | | weight | type | shape | | ------------- | ----- | --------------------- | | per_channel_pad_data| float | [per_channel_pad_data_size] | Padding type: - 0 = CONSTANT - 1 = REPLICATE - 2 = REFLECT # Permute ``` y = reorder(x) ``` | param id | name | type | default | description | | --------- | ------------- | ---- | --------- | ----------------- | | 0 | order_type 
| int | 0 | | Order Type: - 0 = WH WHC WHDC - 1 = HW HWC HWDC - 2 = WCH WDHC - 3 = CWH DWHC - 4 = HCW HDWC - 5 = CHW DHWC - 6 = WHCD - 7 = HWCD - 8 = WCHD - 9 = CWHD - 10 = HCWD - 11 = CHWD - 12 = WDCH - 13 = DWCH - 14 = WCDH - 15 = CWDH - 16 = DCWH - 17 = CDWH - 18 = HDCW - 19 = DHCW - 20 = HCDW - 21 = CHDW - 22 = DCHW - 23 = CDHW # PixelShuffle ``` if mode == 0 y = depth_to_space(x) where x channel order is sw-sh-outc if mode == 1 y = depth_to_space(x) where x channel order is outc-sw-sh ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ---- | --------- | ----------------- | | 0 | upscale_factor| int | 1 | | | 1 | mode | int | 0 | | # Pooling ``` x2 = pad(x, pads) x3 = pooling(x2, kernel, stride) ``` | param id | name | type | default | description | | --------- | --------------| ---- | --------- | ----------------- | | 0 | pooling_type | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | stride_w | int | 1 | | | 3 | pad_left | int | 0 | | | 4 | global_pooling| int | 0 | | | 5 | pad_mode | int | 0 | | | 6 | avgpool_count_include_pad| int | 0 | | | 7 | adaptive_pooling| int | 0 | | | 8 | out_w | int | 0 | | | 11 | kernel_h | int | kernel_w | | | 12 | stride_h | int | stride_w | | | 13 | pad_top | int | pad_left | | | 14 | pad_right | int | pad_left | | | 15 | pad_bottom | int | pad_top | | | 18 | out_h | int | out_w | | Pooling type: - 0 = MAX - 1 = AVG Pad mode: - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling1D ``` x2 = pad(x, pads) x3 = pooling1d(x2, kernel, stride) ``` | param id | name | type | default | description | | --------- | --------------| ---- | --------- | ----------------- | | 0 | pooling_type | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | stride_w | int | 1 | | | 3 | pad_left | int | 0 | | | 4 | global_pooling| int | 0 | | | 5 | pad_mode | int | 0 | | | 6 | avgpool_count_include_pad| int | 0 | | | 7 | 
adaptive_pooling| int | 0 | | | 8 | out_w | int | 0 | | | 14 | pad_right | int | pad_left | | Pooling type: - 0 = MAX - 1 = AVG Pad mode: - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling3D ``` x2 = pad(x, pads) x3 = pooling3d(x2, kernel, stride) ``` | param id | name | type | default | description | | --------- | --------------| ---- | --------- | ----------------- | | 0 | pooling_type | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | stride_w | int | 1 | | | 3 | pad_left | int | 0 | | | 4 | global_pooling| int | 0 | | | 5 | pad_mode | int | 0 | | | 6 | avgpool_count_include_pad| int | 0 | | | 7 | adaptive_pooling| int | 0 | | | 8 | out_w | int | 0 | | | 11 | kernel_h | int | kernel_w | | | 12 | stride_h | int | stride_w | | | 13 | pad_top | int | pad_left | | | 14 | pad_right | int | pad_left | | | 15 | pad_bottom | int | pad_top | | | 16 | pad_behind | int | pad_front | | | 18 | out_h | int | out_w | | | 21 | kernel_d | int | kernel_w | | | 22 | stride_d | int | stride_w | | | 23 | pad_front | int | pad_left | | | 28 | out_d | int | out_w | | Pooling type: - 0 = MAX - 1 = AVG Pad mode: - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Power ``` y = pow((shift + x * scale), power) ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | power | float | 1.f | | | 1 | scale | float | 1.f | | | 2 | shift | float | 0.f | | # PReLU ``` if x < 0 y = x * slope else y = x ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_slope | int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | slope_data | float | [num_slope] | # Quantize ``` y = float2int8(x * 
scale) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | scale_data_size| int | 1 | | | weight | type | shape | | ------------- | ----- | --------------------- | | scale_data | float | [scale_data_size] | # Reduction ``` y = reduce_op(x * coeff) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | operation | int | 0 | | | 1 | reduce_all | int | 1 | | | 2 | coeff | float | 1.f | | | 3 | axes | array | [ ] | | | 4 | keepdims | int | 0 | | | 5 | fixbug0 | int | 0 | hack for bug fix, should be 1 | Operation type: - 0 = SUM - 1 = ASUM - 2 = SUMSQ - 3 = MEAN - 4 = MAX - 5 = MIN - 6 = PROD - 7 = L1 - 8 = L2 - 9 = LogSum - 10 = LogSumExp # ReLU ``` if x < 0 y = x * slope else y = x ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | slope | float | 0.f | | # Reorg ``` if mode == 0 y = space_to_depth(x) where x channel order is sw-sh-outc if mode == 1 y = space_to_depth(x) where x channel order is outc-sw-sh ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ---- | --------- | ----------------- | | 0 | stride | int | 1 | | | 1 | mode | int | 0 | | # Requantize ``` x2 = x * scale_in + bias x3 = activation(x2) y = float2int8(x3 * scale_out) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | scale_in_data_size| int | 1 | | | 1 | scale_out_data_size| int | 1 | | | 2 | bias_data_size| int | 0 | | | 3 | activation_type| int | 0 | | | 4 | activation_params| array | [ ] | | | weight | type | shape | | ------------- | ----- | --------------------- | | scale_in_data | float | [scale_in_data_size] | | scale_out_data| float | 
[scale_out_data_size] | | bias_data | float | [bias_data_size] | # Reshape ``` y = reshape(x) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | w | int | -233 | | | 1 | h | int | -233 | | | 11 | d | int | -233 | | | 2 | c | int | -233 | | | 6 | shape_expr | str | "" | | Reshape flag: - 0 = copy from bottom - -1 = remaining - -233 = drop this dim(default) # RMSNorm ``` split x along outmost axis into part x0, x1 ... root mean square normalize for each part x0, x1 ... y = x * gamma by elementwise ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | affine_size | int | 0 | | | 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | | 2 | affine | int | 1 | | | weight | type | shape | | ------------- | ----- | --------------------- | | gamma_data | float | [affine_size] | # RNN Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
``` y = rnn(x) y0, hidden y1 = rnn(x0, hidden x1) ``` * one_blob_only if bidirectional | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | hidden size of output | | 1 | weight_data_size| int | 0 | total size of weight matrix | | 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_xc_data| float/fp16/int8 | [input_size, num_output, num_directions] | | bias_c_data | float/fp16/int8 | [num_output, 1, num_directions] | | weight_hc_data| float/fp16/int8 | [num_output, num_output, num_directions] | Direction flag: - 0 = forward only - 1 = reverse only - 2 = bidirectional # RotaryEmbed Apply rotary positional embeddings with cos and sin cache ``` y1 = x1 * cos - x2 * sin y2 = x1 * sin + x2 * cos ``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | interleaved | int | 0 | | # Scale ``` if scale_data_size == -233 y = x0 * x1 else y = x * scale + bias ``` * one_blob_only if scale_data_size != -233 * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | scale_data_size| int | 0 | | | 1 | bias_term | int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | scale_data | float | [scale_data_size] | | bias_data | float | [scale_data_size] | # SDPA ``` scaled dot product attention for each num_head part qk = q * k qk = qk + attn_mask if attn_mask exists softmax(qk) qkv = qk * v ``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 5 | attn_mask | int | 0 | | | 6 | scale | float | 0.f | auto = 1.f / sqrt(embed_dim) | | 7 | kv_cache | int | 0 | | | 18 | int8_scale_term | int | 0 | | # SELU ``` if x < 0 y = (exp(x) - 1.f) 
* alpha * lambda else y = x * lambda ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | alpha | float | 1.67326324f| | | 1 | lambda | float | 1.050700987f| | # Shrink ``` if x < -lambd y = x + bias if x > lambd y = x - bias else y = x ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | bias | float | 0.0f | | | 1 | lambd | float | 0.5f | | # ShuffleChannel ``` if reverse == 0 y = shufflechannel(x) by group if reverse == 1 y = shufflechannel(x) by channel / group ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ---- | --------- | ----------------- | | 0 | group | int | 1 | | | 1 | reverse | int | 0 | | # Sigmoid ``` y = 1 / (1 + exp(-x)) ``` * one_blob_only * support_inplace # Slice ``` split x along axis into slices, each part slice size is based on slices array ``` | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | slices | array | [ ] | | | 1 | axis | int | 0 | | | 2 | indices | array | [ ] | | # Softmax ``` softmax(x, axis) ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | axis | int | 0 | | | 1 | fixbug0 | int | 0 | hack for bug fix, should be 1 | # Softplus ``` y = log(exp(x) + 1) ``` * one_blob_only * support_inplace # Spectrogram ``` x1 = pad(x) if center y = stft(x1) y = y / sqrt(norm) if normalized if power == 0 return y as real if power == 1 return magnitude if power == 2 return square of magnitude ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | n_fft | int | 0 | | | 1 | power | int | 0 | | | 2 | 
hoplen | int | n_fft / 4 | | | 3 | winlen | int | n_fft | | | 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | | 5 | center | int | 1 | | | 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT | | 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | | 8 | onesided | int | 1 | | # Split ``` y0, y1 ... = x ``` # Squeeze * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | squeeze_w | int | 0 | | | 1 | squeeze_h | int | 0 | | | 11 | squeeze_d | int | 0 | | | 2 | squeeze_c | int | 0 | | | 3 | axes | array | [ ] | | # Swish ``` y = x / (1 + exp(-x)) ``` * one_blob_only * support_inplace # TanH ``` y = tanh(x) ``` * one_blob_only * support_inplace # Threshold ``` if x > threshold y = 1 else y = 0 ``` * one_blob_only * support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | threshold | float | 0.f | | # Tile ``` y = repeat tiles along axis for x ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | axis | int | 0 | | | 1 | tiles | int | 1 | | | 2 | repeats | array | [ ] | | # UnaryOp ``` y = unaryop(x) ``` - one_blob_only - support_inplace | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | op_type | int | 0 | Operation type as follows | Operation type: - 0 = ABS - 1 = NEG - 2 = FLOOR - 3 = CEIL - 4 = SQUARE - 5 = SQRT - 6 = RSQ - 7 = EXP - 8 = LOG - 9 = SIN - 10 = COS - 11 = TAN - 12 = ASIN - 13 = ACOS - 14 = ATAN - 15 = RECIPROCAL - 16 = TANH - 17 = LOG10 - 18 = ROUND - 19 = TRUNC # Unfold ``` y = unfold(x) ``` * one_blob_only | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | | 0 | num_output | int | 0 | | | 1 | kernel_w | int | 0 | | | 2 | 
dilation_w | int | 1 | | | 3 | stride_w | int | 1 | | | 4 | pad_left | int | 0 | | | 11 | kernel_h | int | kernel_w | | | 12 | dilation_h | int | dilation_w | | | 13 | stride_h | int | stride_w | | | 14 | pad_top | int | pad_left | | | 15 | pad_right | int | pad_left | | | 16 | pad_bottom | int | pad_top | | ================================================ FILE: docs/developer-guide/param-and-model-file-structure.md ================================================ ## net.param ### example ``` 7767517 3 3 Input input 0 1 data 0=4 1=4 2=1 InnerProduct ip 1 1 data fc 0=10 1=1 2=80 Softmax softmax 1 1 fc prob 0=0 ``` ### overview ``` [magic] ``` * magic number : 7767517 ``` [layer count] [blob count] ``` * layer count : count of the layer line follows, should be exactly the count of all layer names * blob count : count of all blobs, usually greater than or equals to the layer count ### layer line ``` [layer type] [layer name] [input count] [output count] [input blobs] [output blobs] [layer specific params] ``` * layer type : type name, such as Convolution Softmax etc * layer name : name of this layer, must be unique among all layer names * input count : count of the blobs this layer needs as input * output count : count of the blobs this layer produces as output * input blobs : name list of all the input blob names, separated by space, must be unique among input blob names of all layers * output blobs : name list of all the output blob names, separated by space, must be unique among output blob names of all layers * layer specific params : key=value pair list, separated by space ### layer param ``` 0=1 1=2.5 -23303=2,2.0,3.0 ``` key index should be unique in each layer line, pair can be omitted if the default value used the meaning of existing param key index can be looked up at [operation-param-weight-table](operation-param-weight-table) * integer or float key : index 0 ~ 19 * integer value : int * float value : float * integer array or float array key : -23300 minus 
index 0 ~ 19 * integer array value : [array size],int,int,...,int * float array value : [array size],float,float,...,float In modern ncnn param file * array could be represented as `3=2.0,3.0` that is much more human friendly * string typed value: `4=hello` and the string is no longer than 255 ## net.bin ``` +---------+---------+---------+---------+---------+---------+ | weight1 | weight2 | weight3 | weight4 | ....... | weightN | +---------+---------+---------+---------+---------+---------+ ^ ^ ^ ^ 0x0 0x80 0x140 0x1C0 ``` the model binary is the concatenation of all weight data, each weight buffer is aligned by 32bit ### weight buffer ``` [flag] (optional) [raw data] [padding] (optional) ``` * flag : unsigned int, little-endian, indicating the weight storage type, 0 => float32, 0x01306B47 => float16, otherwise => quantized int8, may be omitted if the layer implementation forced the storage type explicitly * raw data : raw weight data, little-endian, float32 data or float16 data or quantized table and indexes depending on the storage type flag * padding : padding space for 32bit alignment, may be omitted if already aligned ================================================ FILE: docs/developer-guide/preload-practice.zh.md ================================================ ## 只是实践经验,没有理论,不一定正确 ``` prfm pldl1keep, [x0, #256] ``` * 放在 ld1 [x0] 前面 0~8 条指令 * #256 表示把 x0+256 的内容放进 L1 cache * ldp 也适用 * (经验)不写 offset 不如写个 #128 * (经验)pldl1strm 似乎没啥意思,也没 pldl1keep 快 * (经验)x0 ~ x0+256 的内容也会进来 * (经验)load 128bit 用 #128,256bit或更多用 #256 * (经验)避免 pld a,pld b,load a,load b 顺序,可能相互干扰 * (经验)提前太多会失效 * (经验)适合连续读 ``` prfm pldl2strm, [x0, #256] ``` * 放在 ld1 [x0] 前面 N 条指令,N 尽量大些 * #256 表示把 x0+256 的内容放进 L2 cache * ldp 也适用 * (经验)不写 offset 不如写个 #128 * (经验)pldl2strm 效果稍好于 pldl2keep * (经验)x0 ~ x0+256 的内容也会进来 * (经验)load 128bit 用 #128,256bit 用 #256 * (经验)读很多数据,用不同 offset 连续两次 pldl2strm * (经验)后面不要对同位置再 pldl1keep,会变慢 * (经验)适合提前准备要跳到很远的地方读,比如换 channel ================================================ 
FILE: docs/developer-guide/tensorflow-op-combination.md ================================================ ## batchnorm ``` Input A 0 1 A 0 0 0 MemoryData sub/y 0 1 sub/y 16 0 0 BinaryOp sub 2 1 A sub/y sub 1 MemoryData div/y 0 1 div/y 16 0 0 BinaryOp div 2 1 sub div/y div 3 MemoryData mul/y 0 1 mul/y 16 0 0 BinaryOp mul 2 1 div mul/y mul 2 MemoryData BiasAdd/bias 0 1 BiasAdd/bias 16 0 0 BinaryOp BiasAdd 2 1 mul BiasAdd/bias BiasAdd 0 ``` ## convolution ``` Input A 0 1 A 0 0 0 Convolution Conv2D 1 1 A Conv2D 10 3 1 1 0 0 270 MemoryData biases/read 0 1 biases/read 10 0 0 BinaryOp BiasAdd 2 1 Conv2D biases/read BiasAdd 0 ``` ## innerproduct ``` Input A 0 1 A 0 0 0 MemoryData biases/read 0 1 biases/read 10 0 0 InnerProduct MatMul 1 1 A MatMul 10 0 2560 BinaryOp conv6 2 1 MatMul biases/read conv6 0 ``` ## leakyrelu ``` Input A 0 1 A 0 0 0 Split splitncnn_0 1 2 A A_splitncnn_0 A_splitncnn_1 MemoryData mul_1/x 0 1 mul_1/x 0 0 0 BinaryOp mul_1 2 1 mul_1/x A_splitncnn_1 mul_1 2 BinaryOp leaky 2 1 mul_1 A_splitncnn_0 leaky 4 ``` ## prelu ``` Input A 0 1 A 0 0 0 Split splitncnn_0 1 2 A A_splitncnn_0 A_splitncnn_1 MemoryData prelu/alpha 0 1 prelu/alpha 10 0 0 ReLU prelu/Relu 1 1 A_splitncnn_1 prelu/Relu 0.000000 UnaryOp prelu/Neg 1 1 A_splitncnn_0 prelu/Neg 1 ReLU prelu/Relu_1 1 1 prelu/Neg prelu/Relu_1 0.000000 UnaryOp prelu/Neg_1 1 1 prelu/Relu_1 prelu/Neg_1 1 BinaryOp prelu/Mul 2 1 prelu/alpha prelu/Neg_1 prelu/Mul 2 BinaryOp prelu/add 2 1 prelu/Relu prelu/Mul prelu/add 0 ``` ## softmax ``` Input A 0 1 A 0 0 0 Split splitncnn_4 1 2 A A_splitncnn_0 A_splitncnn_1 Reduction Max 1 1 A_splitncnn_1 Max 4 -2 1.000000 BinaryOp sub 2 1 A_splitncnn_0 Max sub 1 UnaryOp Exp 1 1 sub Exp 7 Split splitncnn_5 1 2 Exp Exp_splitncnn_0 Exp_splitncnn_1 Reduction Sum 1 1 Exp_splitncnn_1 Sum 0 -2 1.000000 BinaryOp prob 2 1 Exp_splitncnn_0 Sum prob 3 ``` ================================================ FILE: docs/developer-guide/vulkan-driver-loader.md 
================================================ # ncnn vulkan driver loader ncnn turns on the ```NCNN_SIMPLEVK``` cmake option by default, when ```NCNN_VULKAN``` is enabled simplevk is ncnn's built-in vulkan loader. It provides vulkan function declarations and function entries that meet ncnn's needs. It allows the use and compilation of vulkan-related codes without relying on vulkan-sdk. It can dynamically load the vulkan runtime library at runtime or directly load the graphics card's vulkan driver. When distributing ncnn applications, it is not required that the target system has a vulkan driver. Usually you don't need to care about how simplevk loads the vulkan driver, because ncnn will automatically load and initialize when using vulkan related functions. It is sufficient to set the `Option` switch before loading the model. Typical code ```cpp ncnn::Net net; net.opt.use_vulkan_compute = true; net.load_param("model.param"); net.load_model("model.bin"); ``` Using the in-house vulkan loader instead of the standard libvulkan has the following benefits - Can compile ncnn vulkan code without installing vulkan-sdk - Can deploy and distribute applications without libvulkan linkage - Can load external vulkan driver instead of system driver - Can directly load android hal module - Can directly load graphics card driver files via NCNN_VULKAN_DRIVER env - Able to actively search for graphics card driver files in the system and load them - Can compile android libraries supporting vulkan under the platform of android-api<24 ## Create and manage gpu context ```cpp int create_gpu_instance(const char* driver_path = 0); void destroy_gpu_instance(); VkInstance get_gpu_instance(); ``` ## Loading order ``` If driver_path == 0 1a from env ```VK_ICD_FILENAMES``` 1b from env ```NCNN_VULKAN_DRIVER``` If driver_path != 0 1 from specified driver_path 2 from vulkan-1.dll / libvulkan.so / libvulkan.dylib in system 3 search driver by name nvoglv64.dll / amdvlk64.dll / 
libGLX_nvidia.so.0 .... and load it ``` ## Load from system vulkan library or graphics driver This is the default behavior and it should work on most systems sample usage ```cpp int ret = create_gpu_instance(); ``` Load from system-installed libvulkan #### Windows vulkan-1.dll #### Linux Android libvulkan.so #### macOS iOS and other APPLE platforms libvulkan.dylib If static moltenvk driver linked, should always succeed If failed, it will try to find graphics driver object and load it #### Windows for 64bit applications. search in ```%SystemRoot%\System32\DriverStore\FileRepository``` - nvoglv64.dll - amdvlk64.dll - igvk64.dll - qcvkarm64xum.dll for 32bit applications. search in ```%SystemRoot%\System32\DriverStore\FileRepository``` - nvoglv32.dll - amdvlk32.dll - igvk32.dll #### Linux `dlopen()` search for - libGLX_nvidia.so.0 - libvulkan_radeon.so - libvulkan_intel.so - libMaliVulkan.so.1 - libVK_IMG.so #### Android for 64bit applications - /vendor/lib64/hw/vulkan.adreno.so - /vendor/lib64/egl/libGLES_mali.so for 32bit applications - /vendor/lib/hw/vulkan.adreno.so - /vendor/lib/egl/libGLES_mali.so #### macOS iOS and other APPLE platforms `dlopen()` search for - libMoltenVK.dylib - libvulkan_kosmickrisp.dylib ## Load from driver_path for advanced developer sample usage ```cpp int ret = create_gpu_instance("libvulkan.so"); int ret = create_gpu_instance("/usr/lib64/libvulkan_radeon.so"); int ret = create_gpu_instance("/vendor/lib64/hw/vulkan.adreno.so"); int ret = create_gpu_instance("/data/local/tmp/vulkan.ad07XX.so"); ``` ## Load from env VK_ICD_FILENAMES for debug purpose sample usage ```sh export VK_ICD_FILENAMES=./vk_swiftshader_icd.json export VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/lvp_icd.x86_64.json export VK_ICD_FILENAMES=/etc/vulkan/icd.d/nvidia_icd.json ``` ## Load from env NCNN_VULKAN_DRIVER for debug purpose sample usage ```sh export NCNN_VULKAN_DRIVER=/data/local/tmp/vulkan.ad07XX.so ``` ================================================ FILE: 
docs/faq.en.md ================================================ # How to join the technical Community Groups with QQ ? - Open QQ -> click the group chat search-> search group number 637093648, enter the answer to the question: conv conv conv conv conv → join the group chat → ready to accept the Turing test(a joke) - Open QQ -> search Pocky group: 677104663 (lots experts), the answer to the question # How to watch the author's on live in Bilibili? - nihui:[水竹院落](https://live.bilibili.com/1264617) # Compilation - ## How to download the full source code? git clone --recursive https://github.com/Tencent/ncnn/ or download [ncnn-xxxxx-full-source.zip](https://github.com/Tencent/ncnn/releases) - ## How to cross-compile?How to set the cmake toolchain? See https://github.com/Tencent/ncnn/wiki/how-to-build - ## The submodules were not downloaded! Please update submodules with "git submodule update --init" and try again As above, download the full source code. Or follow the prompts to execute: git submodule update --init - ## Could NOT find Protobuf (missing: Protobuf_INCLUDE_DIR) sudo apt-get install libprotobuf-dev protobuf-compiler - ## Could NOT find CUDA (missing: CUDA_TOOLKIT_ROOT_DIR CUDA_INCLUDE_DIRS CUDA_CUDART_LIBRARY) https://github.com/Tencent/ncnn/issues/1873 - ## Could not find a package configuration file provided by "OpenCV" with any of the following names: OpenCVConfig.cmake opencv-config.cmake sudo apt-get install libopencv-dev or customized compile and install ,with set(OpenCV_DIR {the dir OpenCVConfig.cmake exist}) - ## Could not find a package configuration file provided by "ncnn" with any of the following names: ncnnConfig.cmake ncnn-config.cmake set(ncnn_DIR { the dir ncnnConfig.cmake exist}) - ## xxx.lib not found(be specified by system/compiler) undefined reference to __kmpc_for_static_init_4 __kmpc_for_static_fini __kmpc_fork_call ... Need to link openmp undefined reference to glslang::InitializeProcess() glslang::TShader::TShader(EShLanguage) ... 
need glslang.lib glslang-default-resource-limits.lib undefined reference to AAssetManager_fromJava AAssetManager_open AAsset_seek ... Add android to find_library and target_link_libraries find_package(ncnn) - ## undefined reference to typeinfo for ncnn::Layer opencv rtti -> opencv-mobile - ## undefined reference to __cpu_model upgrade compiler / libgcc_s libgcc - ## unrecognized command line option "-mavx2" upgrade gcc - ## Why is the compiled ncnn-android library so large? See https://github.com/Tencent/ncnn/wiki/build-for-android.zh and see How to trim smaller ncnn - ## ncnnoptimize and custom layer Run ncnnoptimize first and add the custom layer afterwards, because ncnnoptimize cannot handle saving custom layers. - ## rtti/exceptions Conflict The reason for the conflict is that the libraries used in the project are configured differently, so analyze whether you need to turn them on or off according to your actual situation. ncnn is ON by default, add the following two parameters when recompiling ncnn. - ON: -DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF - OFF: -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON - ## error: undefined symbol: ncnn::Extractor::extract(char const*, ncnn::Mat&) Possible scenarios. - Try upgrading the NDK version of Android Studio # How do I add the ncnn library to my project and how does the cmake method work? Compile ncnn, then run make install.
linux/windows should set/export ncnn_DIR points to the directory containing ncnnConfig.cmake under the install directory - ## android - ## ios - ## linux - ## windows - ## macos - ## arm linux # Convert model issues - ## caffe `./caffe2ncnn caffe.prototxt caffe.caffemodel ncnn.param ncnn.bin` - ## mxnet ` ./mxnet2ncnn mxnet-symbol.json mxnet.params ncnn.param ncnn.bin` - ## darknet [https://github.com/xiangweizeng/darknet2ncnn](https://github.com/xiangweizeng/darknet2ncnn) - ## pytorch - onnx [use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx) - ## tensorflow 1.x/2.x - keras [https://github.com/MarsTechHAN/keras2ncnn](https://github.com/MarsTechHAN/keras2ncnn) **[@MarsTechHAN](https://github.com/MarsTechHAN)** - ## tensorflow 2.x - mlir [Converting tensorflow2 models to ncnn via MLIR](https://zhuanlan.zhihu.com/p/152535430) **@[nihui](https://www.zhihu.com/people/nihui-2)** - ## netron [https://github.com/lutzroeder/netron](https://github.com/lutzroeder/netron) - ## How to generate a model with fixed shape? Input 0=w 1=h 2=c - ## why gpu can speedup - ## How to convert ncnnoptimize to fp16 model `ncnnoptimize model.param model.bin yolov5s-opt.param yolov5s-opt.bin 65536` - ## How to use ncnnoptimize checking the FLOPS / memory usage of your model - ## How to modify the model to support dynamics shape? Interp Reshape - ## How to convert a model into code embedded in a program? use ncnn2mem - ## How to encrypt the model? See https://zhuanlan.zhihu.com/p/268327784 - ## The ncnn model transferred under Linux, Windows/MacOS/Android/... Can I use it directly? Yes, for all platforms - ## How to remove post-processing and export onnx? Ref: Referring to an article by UP , step 3 is to remove the post-processing and then export the onnx, where removing the post-processing can be the result of removing the subsequent steps when testing within the project. - ## pytorch layers can't export to onnx? 
Way 1: ONNX_ATEN_FALLBACK Fully customizable op, first change to one that can export (e.g. concat slice), go to ncnn and then modify param Way 2: You can try this with PNNX, see the following article for a general description: 1. [Windows/Linux/macOS steps for compiling PNNX](https://zhuanlan.zhihu.com/p/431833958) 2. [Learn in 5 minutes! Converting TorchScript models to ncnn models with PNNX](https://zhuanlan.zhihu.com/p/427512763) # Using - ## vkEnumeratePhysicalDevices failed -3 - ## vkCreateInstance failed -9 Please upgrade your GPU driver if you meet this crash or error. Here are the driver download pages for some brands of GPU. [Intel](https://downloadcenter.intel.com/product/80939/Graphics-Drivers), [AMD](https://www.amd.com/en/support), [Nvidia](https://www.nvidia.com/Download/index.aspx) - ## ModuleNotFoundError: No module named 'ncnn.ncnn' python setup.py develop - ## fopen nanodet-m.param failed path should be working dir File not found or not readable. Make sure that XYZ.param/XYZ.bin is accessible. - ## find_blob_index_by_name data / output / ... failed layer name vs blob name param.bin use xxx.id.h enum - ## parse magic failed - ## param is too old, please regenerate The model may have problems Your model file is in the old format converted by an old caffe2ncnn tool. Checkout the latest ncnn code, build it and regenerate param and model binary files, and that should work. Make sure that your param file starts with the magic number 7767517. you may find more info on use-ncnn-with-alexnet When adding the softmax layer yourself, you need to add 1=1 - ## set_vulkan_compute failed, network use_vulkan_compute disabled Set net.opt.use_vulkan_compute = true before load_param / load_model; - ## How to execute multiple blob inputs, multiple blob outputs?
Call `ex.input()` and `ex.extract()` multiple times, as follows ``` ex.input("data1", in_1); ex.input("data2", in_2); ex.extract("output1", out_1); ex.extract("output2", out_2); ``` - ## Multiple executions of Extractor extract double the calculation? No - ## How to see the elapsed time for every layer? cmake -DNCNN_BENCHMARK=ON .. - ## How to convert a cv::Mat CV_8UC3 BGR image from_pixels to_pixels - ## How to convert float data to ncnn::Mat First of all, you need to manage the memory you request yourself, at this point ncnn::Mat will not automatically free up the float data you pass over to it ``` c++ std::vector<float> testData(60, 1.0); // use std::vector to manage memory requests and releases yourself ncnn::Mat in1 = ncnn::Mat(60, (void*)testData.data()).reshape(4, 5, 3); // just pass the pointer to the float data as a void*, and even specify the dimension (up says it's best to use reshape to solve the channel gap) float* a = new float[60]; // New a piece of memory yourself, you need to release it later ncnn::Mat in2 = ncnn::Mat(60, (void*)a).reshape(4, 5, 3).clone(); // use the same method as above, clone() to transfer data owner ``` ================================================ FILE: docs/faq.md ================================================ # 如何加入技术交流QQ群? - 打开QQ→点击群聊搜索→搜索群号637093648→输入问题答案:卷卷卷卷卷→进入群聊→准备接受图灵测试(bushi) - 前往QQ搜索Pocky群:677104663(超多大佬),问题答案:multi level intermediate representation # 如何看作者b站直播? - nihui的bilibili直播间:[水竹院落](https://live.bilibili.com/1264617) # 编译 - ## 怎样下载完整源码? git clone --recursive https://github.com/Tencent/ncnn/ 或者 下载 [ncnn-xxxxx-full-source.zip](https://github.com/Tencent/ncnn/releases) - ## 怎么交叉编译?cmake 工具链怎么设置啊? 参见 https://github.com/Tencent/ncnn/wiki/how-to-build - ## The submodules were not downloaded!
Please update submodules with "git submodule update --init" and try again 如上,下载完整源码。或者按提示执行: git submodule update --init - ## Could NOT find Protobuf (missing: Protobuf_INCLUDE_DIR) sudo apt-get install libprotobuf-dev protobuf-compiler - ## Could NOT find CUDA (missing: CUDA_TOOLKIT_ROOT_DIR CUDA_INCLUDE_DIRS CUDA_CUDART_LIBRARY) https://github.com/Tencent/ncnn/issues/1873 - ## Could not find a package configuration file provided by "OpenCV" with any of the following names: OpenCVConfig.cmake opencv-config.cmake sudo apt-get install libopencv-dev 或者自行编译安装,set(OpenCV_DIR {OpenCVConfig.cmake所在目录}) - ## Could not find a package configuration file provided by "ncnn" with any of the following names: ncnnConfig.cmake ncnn-config.cmake set(ncnn_DIR {ncnnConfig.cmake所在目录}) - ## 找不到库(需要根据系统/编译器指定) undefined reference to __kmpc_for_static_init_4 __kmpc_for_static_fini __kmpc_fork_call ... 需要链接openmp库 undefined reference to glslang::InitializeProcess() glslang::TShader::TShader(EShLanguage) ... 需要 glslang.lib glslang-default-resource-limits.lib undefined reference to AAssetManager_fromJava AAssetManager_open AAsset_seek ... find_library和target_link_libraries中增加 android find_package(ncnn) - ## undefined reference to typeinfo for ncnn::Layer opencv rtti -> opencv-mobile - ## undefined reference to __cpu_model 升级编译器 / libgcc_s libgcc - ## unrecognized command line option "-mavx2" 升级 gcc - ## 为啥自己编译的ncnn android库特别大? https://github.com/Tencent/ncnn/wiki/build-for-android.zh 以及见 如何裁剪更小的 ncnn 库 - ## ncnnoptimize和自定义层 先ncnnoptimize再增加自定义层,避免ncnnoptimize不能处理自定义层保存。 - ## rtti/exceptions冲突 产生原因是项目工程中使用的库配置不一样导致冲突,根据自己的实际情况分析是需要开启还是关闭。ncnn默认是ON,在重新编译ncnn时增加以下2个参数即可: - 开启:-DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF - 关闭:-DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON - ## error: undefined symbol: ncnn::Extractor::extract(char const*, ncnn::Mat&) 可能的情况: - 尝试升级 Android Studio 的 NDK 版本 - ## CMake 3.14.0 or higher is required.
You are running version 2.8.12.2 ```shell wget https://github.com/Kitware/CMake/releases/download/v3.18.2/cmake-3.18.2-Linux-x86_64.tar.gz tar zxvf cmake-3.18.2-Linux-x86_64.tar.gz mv cmake-3.18.2-Linux-x86_64 /opt/cmake-3.18.2 ln -sf /opt/cmake-3.18.2/bin/* /usr/bin/ ``` # 怎样添加ncnn库到项目中?cmake方式怎么用? 编译ncnn,make install。linux/windows set/export ncnn_DIR 指向 install目录下包含ncnnConfig.cmake 的目录 - ## android - ## ios - ## linux - ## windows - ## macos - ## arm linux # 转模型问题 - ## caffe `./caffe2ncnn caffe.prototxt caffe.caffemodel ncnn.param ncnn.bin` - ## mxnet ` ./mxnet2ncnn mxnet-symbol.json mxnet.params ncnn.param ncnn.bin` - ## darknet [https://github.com/xiangweizeng/darknet2ncnn](https://github.com/xiangweizeng/darknet2ncnn) - ## pytorch - onnx [use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx) - ## tensorflow 1.x/2.x - keras [https://github.com/MarsTechHAN/keras2ncnn](https://github.com/MarsTechHAN/keras2ncnn) **[@MarsTechHAN](https://github.com/MarsTechHAN)** - ## tensorflow 2.x - mlir [通过MLIR将tensorflow2模型转换到ncnn](https://zhuanlan.zhihu.com/p/152535430) **@[nihui](https://www.zhihu.com/people/nihui-2)** - ## netron [https://github.com/lutzroeder/netron](https://github.com/lutzroeder/netron) - ## 怎么生成有固定 shape 信息的模型? Input 0=w 1=h 2=c - ## why gpu能更快 - ## ncnnoptimize 怎么转成 fp16 模型 `ncnnoptimize model.param model.bin yolov5s-opt.param yolov5s-opt.bin 65536` - ## ncnnoptimize 怎样查看模型的 FLOPS / 内存占用情况 - ## 怎么修改模型支持动态 shape? Interp Reshape - ## 如何将模型转换为代码内嵌到程序里? ncnn2mem - ## 如何加密模型? https://zhuanlan.zhihu.com/p/268327784 - ## Linux下转的ncnn模型,Windows/MacOS/Android/.. 也能直接用吗? Yes,全平台通用 - ## 如何去掉后处理,再导出 onnx? 检测: 参考up的一篇文章,步骤三就是去掉后处理,再导出onnx,其中去掉后处理可以是项目内测试时去掉后续步骤的结果。 - ## pytorch 有的层导不出 onnx 怎么办? 方式一: ONNX_ATEN_FALLBACK 完全自定义的op,先改成能导出的(如 concat slice),转到 ncnn 后再修改 param 方式二: 可以使用PNNX来试试,参考以下文章大概说明: 1. [Windows/Linux/macOS 编译 PNNX 步骤](https://zhuanlan.zhihu.com/p/431833958) 2. 
[5分钟学会!用 PNNX 转换 TorchScript 模型到 ncnn 模型](https://zhuanlan.zhihu.com/p/427512763) # 使用 - ## vkEnumeratePhysicalDevices failed -3 - ## vkCreateInstance failed -9 出现此类问题请先更新GPU驱动。Please upgrade your GPU driver if you encounter this crash or error. 这里提供了一些品牌的GPU驱动下载网址.We have provided some drivers' download pages here. [Intel](https://downloadcenter.intel.com/product/80939/Graphics-Drivers),[AMD](https://www.amd.com/en/support),[Nvidia](https://www.nvidia.com/Download/index.aspx) - ## docker 环境里面 nvidia-smi 能看到显卡也能跑 cuda 却不能跑 vulkan 因为这个docker环境的nvidia驱动没有安装opengl/vulkan支持 首先运行 nvidia-smi 查看当前驱动版本 ``` NVIDIA-SMI 535.161.07 Driver Version: 535.161.07 CUDA Version: 12.2 ``` 然后去下载对应版本的NVIDIA驱动,安装用户态驱动文件,跳过内核部分 ``` wget https://us.download.nvidia.com/tesla/535.161.07/NVIDIA-Linux-x86_64-535.161.07.run chmod +x NVIDIA-Linux-x86_64-535.161.07.run ./NVIDIA-Linux-x86_64-535.161.07.run --silent --no-kernel-module ``` 安装时会报一些文件权限错误,不用管,安装完成后 vulkan 支持就可用了。最后安装 vulkaninfo 查看gpu信息 ``` dnf install vulkan-tools vulkaninfo ``` - ## ModuleNotFoundError: No module named 'ncnn.ncnn' python setup.py develop - ## fopen nanodet-m.param failed 文件路径 working dir File not found or not readable. Make sure that XYZ.param/XYZ.bin is accessible. - ## find_blob_index_by_name data / output / ... failed layer name vs blob name param.bin 应该用 xxx.id.h 的枚举 - ## parse magic failed - ## param is too old, please regenerate 模型本身有问题 Your model file is being the old format converted by an old caffe2ncnn tool. Checkout the latest ncnn code, build it and regenerate param and model binary files, and that should work. Make sure that your param file starts with the magic number 7767517. you may find more info on use-ncnn-with-alexnet When adding the softmax layer yourself, you need to add 1=1 - ## set_vulkan_compute failed, network use_vulkan_compute disabled 你应该在 load_param / load_model 之前设置 net.opt.use_vulkan_compute = true; - ## 多个blob输入,多个blob输出,怎么做? 
多次执行`ex.input()` 和 `ex.extract()` ``` ex.input("data1", in_1); ex.input("data2", in_2); ex.extract("output1", out_1); ex.extract("output2", out_2); ``` - ## Extractor extract 多次会重复计算吗? 不会 - ## 如何看每一层的耗时? cmake -DNCNN_BENCHMARK=ON .. - ## 如何转换 cv::Mat CV_8UC3 BGR 图片 from_pixels to_pixels - ## 如何转换 float 数据为 ncnn::Mat 首先,自己申请的内存需要自己管理,此时ncnn::Mat不会自动给你释放你传过来的float数据 ``` c++ std::vector testData(60, 1.0); // 利用std::vector自己管理内存的申请和释放 ncnn::Mat in1 = ncnn::Mat(60, (void*)testData.data()).reshape(4, 5, 3); // 把float数据的指针转成void*传过去即可,甚至还可以指定维度(up说最好使用reshape用来解决channel gap) float* a = new float[60]; // 自己new一块内存,后续需要自己释放 ncnn::Mat in2 = ncnn::Mat(60, (void*)a).reshape(4, 5, 3).clone(); // 使用方法和上面相同,clone() to transfer data owner ``` - ## 如何初始化 ncnn::Mat 为全 0 `mat.fill(0.f);` - ## 如何查看/获取版本号 cmake时会打印 c_api.h ncnn_version() 自己拼 1.0+yyyymmdd - ## 如何转换 yuv 数据 yuv420sp2rgb yuv420sp2rgb_nv12 **[@metarutaiga](https://github.com/metarutaiga/xxYUV)** - ## 如何 resize crop rotate 图片 [efficient roi resize rotate](https://github.com/Tencent/ncnn/wiki/efficient-roi-resize-rotate) - ## 如何人脸5点对齐 get_affine_transform warpaffine_bilinear_c3 ```c // 计算变换矩阵 并且求逆变换 int type = 0; // 0->区域外填充为v[0],v[1],v[2], -233->区域外不处理 unsigned int v = 0; float tm[6]; float tm_inv[6]; // 人脸区域在原图上的坐标和宽高 float src_x = target->det.rect.x / target->det.w * pIveImageU8C3->u32Width; float src_y = target->det.rect.y / target->det.h * pIveImageU8C3->u32Height; float src_w = target->det.rect.w / target->det.w * pIveImageU8C3->u32Width; float src_h = target->det.rect.h / target->det.h * pIveImageU8C3->u32Height; float point_src[10] = { src_x + src_w * target->attr.land[0][0], src_x + src_w * target->attr.land[0][1], src_x + src_w * target->attr.land[1][0], src_x + src_w * target->attr.land[1][1], src_x + src_w * target->attr.land[2][0], src_x + src_w * target->attr.land[2][1], src_x + src_w * target->attr.land[3][0], src_x + src_w * target->attr.land[3][1], src_x + src_w * target->attr.land[4][0], src_x + src_w * 
target->attr.land[4][1], }; float point_dst[10] = { // +8 是因为我们处理112*112的图 30.2946f + 8.0f, 51.6963f, 65.5318f + 8.0f, 51.5014f, 48.0252f + 8.0f, 71.7366f, 33.5493f + 8.0f, 92.3655f, 62.7299f + 8.0f, 92.2041f, }; // 第一种方式:先计算变换在求逆 AffineTrans::get_affine_transform(point_src, point_dst, 5, tm); AffineTrans::invert_affine_transform(tm, tm_inv); // 第二种方式:直接拿到求逆的结果 // AffineTrans::get_affine_transform(point_dst, point_src, 5, tm_inv); // rgb 分离的,所以要单独处理 for(int c = 0; c < 3; c++) { unsigned char* pSrc = malloc(xxx); unsigned char* pDst = malloc(xxx); ncnn::warpaffine_bilinear_c1(pSrc, SrcWidth, SrcHeight, SrcStride[c], pDst, DstWidth, DstHeight, DstStride[c], tm_inv, type, v); } // rgb packed则可以一次处理 ncnn::warpaffine_bilinear_c3(pSrc, SrcWidth, SrcHeight, SrcStride, pDst, DstWidth, DstHeight, DstStride, tm_inv, type, v); ``` - ## 如何获得中间层的blob输出 ncnn::Mat output; ex.extract("your_blob_name", output); - ## 为什么我使用GPU,但是GPU占用为0 windows 10 任务管理器 - 性能选项卡 - GPU - 选择其中一个视图左上角的下拉箭头切换到 Compute_0 / Compute_1 / Cuda 你还可以安装软件:GPU-Z - ## layer XYZ not exists or registered Your network contains some operations that are not implemented in ncnn. You may implement them as custom layer followed in how-to-implement-custom-layer-step-by-step. Or you could simply register them as no-op if you are sure those operations make no sense. ``` class Noop : public ncnn::Layer {}; DEFINE_LAYER_CREATOR(Noop) net.register_custom_layer("LinearRegressionOutput", Noop_layer_creator); net.register_custom_layer("MAERegressionOutput", Noop_layer_creator); ``` - ## network graph not ready You shall call Net::load_param() first, then Net::load_model(). This error may also happens when Net::load_param() failed, but not properly handled. For more information about the ncnn model load api, see ncnn-load-model - ## memory not 32-bit aligned at XYZ The pointer passed to Net::load_param() or Net::load_model() is not 32bit aligned. In practice, the head pointer of std::vector is not guaranteed to be 32bit aligned. 
you can store your binary buffer in ncnn::Mat structure, its internal memory is aligned. - ## crash on android with '__kmp_abort_process' This usually happens if you bundle multiple shared library with openmp linked It is actually an issue of the android ndk https://github.com/android/ndk/issues/1028 On old android ndk, modify the link flags as -Wl,-Bstatic -lomp -Wl,-Bdynamic For recent ndk >= 21 -fstatic-openmp - ## dlopen failed: library "libomp.so" not found Newer android ndk defaults to dynamic openmp runtime modify the link flags as -fstatic-openmp -fopenmp - ## crash when freeing a ncnn dynamic library(.dll/.so) built with openMP for optimal performance, the openmp threadpool spin waits for about a second prior to shutting down in case more work becomes available. If you unload a dynamic library that's in the process of spin-waiting, it will crash in the manner you see (most of the time). Just set OMP_WAIT_POLICY=passive in your environment, before calling loadlibrary. or Just wait a few seconds before calling freelibrary. You can also use the following method to set environment variables in your code: for msvc++: SetEnvironmentVariable(_T("OMP_WAIT_POLICY"), _T("passive")); for g++: setenv("OMP_WAIT_POLICY", "passive", 1) reference: https://stackoverflow.com/questions/34439956/vc-crash-when-freeing-a-dll-built-with-openmp # 跑出来的结果对不上 [ncnn-produce-wrong-result](https://github.com/Tencent/ncnn/wiki/FAQ-ncnn-produce-wrong-result) - ## 如何打印 ncnn::Mat 的值? ```C++ void pretty_print(const ncnn::Mat& m) { for (int q=0; q // Don't forget this void pretty_print(const ncnn::Mat& m) { for (int q=0; q normed_feats(m.c); for (int i=0; i(y); uchar* sp = normed_feats[i].ptr(y); for (int x=0; x注意这里的flag指的是fp32和fp16,其中0指的是fp32,1指的是fp16 使用方式二: - ./ncnnoptimize ncnn.param ncnn.bin new.param new.bin flag cutstartname cutendname
cutstartname:模型截取的起点
cutendname:模型截取的终点 - ## 如何使用量化工具? [Post Training Quantization Tools](https://github.com/Tencent/ncnn/tree/master/tools/quantize) - ## 如何设置线程数? opt.num_threads - ## 如何降低CPU占用率? net.opt.openmp_blocktime = 0; OMP_WAIT_POLICY=passive - ## 如何 batch inference? ``` int max_batch_size = vkdev->info.compute_queue_count; ncnn::Mat inputs[1000]; ncnn::Mat outputs[1000]; #pragma omp parallel for num_threads(max_batch_size) for (int i=0; i<1000; i++) { ncnn::Extractor ex = net1.create_extractor(); ex.input("data", inputs[i]); ex.extract("prob", outputs[i]); } ``` - ## partial graph inference 先 extract 分类,判断后,再 extract bbox - ## 如何启用 bf16s 加速? ``` net.opt.use_packing_layout = true; net.opt.use_bf16_storage = true; ``` [用bf16加速ncnn](https://zhuanlan.zhihu.com/p/112564372) **@[nihui](https://www.zhihu.com/people/nihui-2)** A53 - ## 如何裁剪更小的 ncnn 库? [build-minimal-library](https://github.com/Tencent/ncnn/wiki/build-minimal-library) - ## net.opt sgemm winograd fp16_storage 各是有什么作用? 对内存消耗的影响 - ## 如何解决显卡进入节能模式造成的一系列问题? nVidia显卡(Intel和AMD估计也有)会在它认为的所谓空闲模式下,自动进入 `节能模式`,显存和核心频率就都会降低。 简单来说就是如果你的计算任务是 `非连续的`,那么可能会让耗时看起来非常 `不均匀`,当期间有运算空闲间隔发生,显卡进入节能模式,则会在下一次冷启动时发生计算耗时远超正常耗时几倍的情况,如下日志所示: ```cpp //开始播放 Total: 162ms, Diff: 0ms, GLTex2Mat: 7ms, calc: 152ms, Mat2GLTex: 3ms Total: 43ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 35ms, Mat2GLTex: 2ms Total: 45ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 37ms, Mat2GLTex: 3ms Total: 40ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 32ms, Mat2GLTex: 4ms //暂停3秒 //继续播放 Total: 190ms, Diff: 0ms, GLTex2Mat: 9ms, calc: 177ms, Mat2GLTex: 3ms Total: 134ms, Diff: 0ms, GLTex2Mat: 5ms, calc: 110ms, Mat2GLTex: 18ms Total: 40ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 34ms, Mat2GLTex: 2ms Total: 42ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 36ms, Mat2GLTex: 2ms Total: 47ms, Diff: 0ms, GLTex2Mat: 5ms, calc: 38ms, Mat2GLTex: 3ms ... 
``` 在对时间不敏感的项目上,这个问题没什么大不了的,完全可以忽略,但是有些业务场景上必须精准推估下一帧及其未来几帧的从上传、计算到渲染的耗时情况,则这种现象将会给开发者打开些许困扰。 ### 3种解决方法 * 联系显卡厂商,让其更新驱动将你的应用加入到免节能模式的白名单。 * 优点:你什么都不用改。缺点:沟通困难,很可能显卡厂商根本不理你。 * [显卡控制面板] - [管理3D设置] - [电源管理模式],改成:[最高性能优先]。 * 优点:不用改代码。缺点:如果是部署端是小白用户,需要编写手册手把手教他。 * 可以空闲(暂停)时定期灌一些心跳计算包的任务进去(放1x1小图)让GPU维持在高性能状态。 * 优点:需要改代码。缺点:不低碳不环保。 # 白嫖项目 - ## nanodet # 其他 - ## up主用的什么系统/编辑器/开发环境? | 软件类型 | 软件名称 | | ------------| ----------- | | 系统 | Fedora | | 桌面环境 | KDE | | 编辑器 | Kate | | 画草图 | kolourpaint | | 画函数图像 | kmplot | | bilibili直播 | OBS | ================================================ FILE: docs/how-to-build/build-mlir2ncnn.md ================================================ # mlir2ncnn ## Compile **Clone LLVM** ```bash https://github.com/llvm/llvm-project.git git checkout -b mlir ``` Current working commit id is 74e6030bcbcc8e628f9a99a424342a0c656456f9: ```bash $ git log commit 74e6030bcbcc8e628f9a99a424342a0c656456f9 (HEAD -> main, origin/main, origin/HEAD) Author: Craig Topper Date: Thu Mar 4 22:30:38 2021 -0800 [TargetLowering] Use HandleSDNodes to prevent nodes from being deleted by recursive calls in getNegatedExpression. ``` It is determined by query lastest git commit date of `tools/mlir` directory. **Compile mlir** ```bash cd llvm-project mkdir build cd build cmake -G Ninja -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="mlir" -DLLVM_TARGETS_TO_BUILD="" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ ninja -j8 ninja install ``` **Compile mlir2ncnn** ```bash cd tools/mlir mkdir build cd build cmake .. 
-D LLVM_DIR= make ``` ## Usage **Export `.mlir`** See https://zhuanlan.zhihu.com/p/152535430 **Usage mlir2ncnn** ```bash ./mlir2ncnn pix2pix.mlir pix2pix.param pix2pix.bin ``` ================================================ FILE: docs/how-to-build/how-to-build.md ================================================ ### Git clone ncnn repo with submodule ``` git clone https://github.com/Tencent/ncnn.git cd ncnn git submodule update --init ``` - [Git clone ncnn repo with submodule](#git-clone-ncnn-repo-with-submodule) - [Build for Linux](#build-for-linux) - [Nvidia Jetson](#nvidia-jetson) - [Raspberry Pi](#raspberry-pi) - [POWER](#power) - [Intel oneAPI](#intel-oneapi) - [Cross compile: Riscv-gnu-toolchain](#cross-compile-riscv-gnu-toolchain) - [Verification](#verification) - [Build for Windows x64 using Visual Studio Community 2017](#build-for-windows-x64-using-visual-studio-community-2017) - [Build for Windows x64 using MinGW-w64](#build-for-windows-x64-using-mingw-w64) - [Build for Windows XP (x86)](#build-for-windows-xp-x86) - [Using MinGW-w64](#using-mingw-w64) - [Using Clang](#using-clang) - [Using Visual Studio (MSVC)](#using-visual-studio-msvc) - [Build for macOS](#build-for-macos) - [Build for ARM Cortex-A family with cross-compiling](#build-for-arm-cortex-a-family-with-cross-compiling) - [Build for Hisilicon platform with cross-compiling](#build-for-hisilicon-platform-with-cross-compiling) - [Build for AnyCloud platform with cross-compiling](#build-for-AnyCloud-platform-with-cross-compiling) - [Build for Android](#build-for-android) - [Build for iOS on macOS with xcode](#build-for-ios-on-macos-with-xcode) - [Build for WebAssembly](#build-for-webassembly) - [Build for AllWinner D1](#build-for-allwinner-d1) - [Build for Loongson 2K1000](#build-for-loongson-2k1000) - [Build for Termux on Android](#build-for-termux-on-android) - [Build for QNX](#build-for-qnx) - [Build for Nintendo 3DS Homebrew Launcher](#build-for-nintendo-3ds-homebrew-launcher) - [Build for 
HarmonyOS with cross-compiling](#build-for-harmonyos-with-cross-compiling) - [Build for ESP32 with cross-compiling](#build-for-esp32-with-cross-compiling) *** ### Build for Linux Install required build dependencies: * git * g++ * cmake * protocol buffer (protobuf) headers files and protobuf compiler * (optional) LLVM OpenMP header files # If building with Clang, and multithreaded CPU inference is desired * (optional) opencv # For building examples Generally if you have Intel, AMD or Nvidia GPU from last 10 years, Vulkan can be easily used. On some systems there are no Vulkan drivers easily available at the moment (October 2020), so you might need to disable use of Vulkan on them. This applies to Raspberry Pi 3 (but there is experimental open source Vulkan driver in the works, which is not ready yet). Nvidia Tegra series devices (like Nvidia Jetson) should support Vulkan. Ensure you have most recent software installed for best experience. On Debian, Ubuntu, or Raspberry Pi OS, you can install all required dependencies using: ```shell sudo apt install build-essential git cmake libprotobuf-dev protobuf-compiler libomp-dev libopencv-dev ``` On Redhat or Centos, you can install all required dependencies using: ```shell sudo yum install build-essential git cmake libprotobuf-dev protobuf-compiler libopencv-dev ``` To use Vulkan after building ncnn later, you will also need to have Vulkan driver for your GPU. For AMD and Intel GPUs these can be found in Mesa graphics driver, which usually is installed by default on all distros (i.e. `sudo apt install mesa-vulkan-drivers` on Debian/Ubuntu). For Nvidia GPUs the proprietary Nvidia driver must be downloaded and installed (some distros will allow easier installation in some way). After installing Vulkan driver, confirm Vulkan libraries and driver are working, by using `vulkaninfo` or `vulkaninfo | grep deviceType`, it should list GPU device type. 
If there is more than one GPU installed (including the case of integrated GPU and discrete GPU, commonly found in laptops), you might need to note the order of devices to use later on. #### Nvidia Jetson The Vulkan driver is a default component of the Linux For Tegra BSP release, check [the device list](https://developer.nvidia.com/embedded/vulkan). ```shell cd ncnn mkdir -p build cd build cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../toolchains/jetson.toolchain.cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON .. make -j$(nproc) ``` #### Raspberry Pi Vulkan drivers do exist, but are not mature. You are free to experiment at your own discretion, and report results and performance. ```shell cd ncnn mkdir -p build cd build cmake -DCMAKE_BUILD_TYPE=Release -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON .. make -j$(nproc) ``` You can add `-GNinja` to `cmake` above to use Ninja build system (invoke build using `ninja` or `cmake --build .`). For Raspberry Pi 3 on 32bit OS, add `-DCMAKE_TOOLCHAIN_FILE=../toolchains/pi3.toolchain.cmake` to cmake. You can also consider disabling Vulkan support as the Vulkan drivers for Raspberry Pi are still not mature, although it doesn't hurt to build the support in and simply not use it. #### POWER For POWER9 with Clang: ```shell cd ncnn mkdir -p build cd build cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../toolchains/power9le-linux-gnu-vsx.clang.toolchain.cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON .. make -j$(nproc) ``` To use GCC instead, use the `power9le-linux-gnu-vsx.toolchain.cmake` toolchain file. Note that according to benchmarks, Clang appears to produce noticeably faster CPU inference than GCC for POWER9 targets. For fastest inference, use Clang 18 or higher; earlier versions of Clang may have impaired inference speed due to [Bug 49864](https://github.com/llvm/llvm-project/issues/49864) and [Bug 64664](https://github.com/llvm/llvm-project/issues/64664).
For POWER8 instead of POWER9, use the `power8le-linux-gnu-vsx.clang.toolchain.cmake` or `power8le-linux-gnu-vsx.toolchain.cmake` toolchain file instead. POWER8 will be slower than POWER9. Note that the POWER toolchain files only support little-endian mode. #### Intel oneAPI Besides the prerequisites in this section, Intel oneAPI BaseKit and HPCKit should be installed. They are freely available from https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html and https://www.intel.com/content/www/us/en/developer/tools/oneapi/hpc-toolkit.html. Intel oneAPI offers two kinds of compilers, the classic `icc/icpc` and the LLVM based `icx/icpx`. To build with these compilers, add `CC=icc CXX=icpc` or `CC=icx CXX=icpx` before the `cmake` command. When compiling with `icc/icpc`, cmake will warn that `xop`, `avx512`, and `bf16` extensions are not supported by the compiler, while `icx/icpx` works well. Both of these compilers have been tested and passed the ncnn benchmark successfully. The results have been included in ncnn benchmark readme. Generally, `icx/icpx` are likely to show better performance than `icc/icpc` and the quantized models can benefit from the extensions `icx/icpx` supports. #### Cross compile: Riscv-gnu-toolchain Before compiling the whole project, the toolchain must be installed. [Reference: Riscv-gnu-toolchain build guide](https://github.com/riscv-collab/riscv-gnu-toolchain/blob/master/README.md) ```shell # configure with vector extension. ./configure --prefix=/opt/riscv --enable-multilib --with-arch=rv64gcv # configure without vector extension. ./configure --prefix=/opt/riscv --enable-multilib --with-arch=rv64gc # it takes quite a long time:( sudo make linux ``` Now you can build the project: ```shell mkdir build-riscv cd build-riscv cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_EXAMPLES=ON .. make -j$(nproc) # or `make -j2` if your cpu isn't powerful enough.
``` #### Verification Verify the build by running some examples: ```shell cd ../examples ../build/examples/squeezenet ../images/256-ncnn.png [0 AMD RADV FIJI (LLVM 10.0.1)] queueC=1[4] queueG=0[1] queueT=0[1] [0 AMD RADV FIJI (LLVM 10.0.1)] bugsbn1=0 buglbia=0 bugcopc=0 bugihfa=0 [0 AMD RADV FIJI (LLVM 10.0.1)] fp16p=1 fp16s=1 fp16a=0 int8s=1 int8a=1 532 = 0.163452 920 = 0.093140 716 = 0.061584 ``` You can also run benchmarks (the 4th argument is the GPU device index to use; refer to `vulkaninfo` if you have more than one GPU): ```shell cd ../benchmark ../build/benchmark/benchncnn 10 $(nproc) 0 0 [0 AMD RADV FIJI (LLVM 10.0.1)] queueC=1[4] queueG=0[1] queueT=0[1] [0 AMD RADV FIJI (LLVM 10.0.1)] bugsbn1=0 buglbia=0 bugcopc=0 bugihfa=0 [0 AMD RADV FIJI (LLVM 10.0.1)] fp16p=1 fp16s=1 fp16a=0 int8s=1 int8a=1 num_threads = 4 powersave = 0 gpu_device = 0 cooling_down = 1 squeezenet min = 4.68 max = 4.99 avg = 4.85 squeezenet_int8 min = 38.52 max = 66.90 avg = 48.52 ... ``` To run benchmarks on a CPU, set the 4th argument (the GPU device index) to `-1`. *** ### Build for Windows x64 using Visual Studio Community 2017 Download and install Visual Studio Community 2017 from https://visualstudio.microsoft.com/vs/community/ Start the command prompt: `Start → Programs → Visual Studio 2017 → Visual Studio Tools → x64 Native Tools Command Prompt for VS 2017` > You can also search `x64 Native Tools Command Prompt for VS 2017` directly. Download protobuf-3.11.2 from https://github.com/google/protobuf/archive/v3.11.2.zip Build protobuf library: ```shell cd <protobuf-root-dir> mkdir protobuf_build cd protobuf_build cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake cmake --build . --config Release -j 2 cmake --build . 
--config Release --target install ``` Build ncnn library (replace `<protobuf-root-dir>` with a proper path): ```shell cd <ncnn-root-dir> mkdir -p protobuf_build cd protobuf_build cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -Dprotobuf_DIR=<protobuf-root-dir>/protobuf_build/install/cmake -DNCNN_VULKAN=ON .. cmake --build . --config Release -j 2 cmake --build . --config Release --target install ``` Note: To speed up the compilation process on multi-core machines, configuring `cmake` to use `jom` or `ninja` using the `-G` flag is recommended. Note: For protobuf >=22.0 (taking v25.3 as an example): Build zlib: ```shell git clone -b v1.3.1 https://github.com/madler/zlib.git cd zlib mkdir build cd build cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install .. cmake --build . --config Release -j 2 cmake --build . --config Release --target install ``` Build protobuf library (replace `<zlib-root-dir>` with a proper path): ```shell git clone -b v25.3 https://github.com/protocolbuffers/protobuf.git cd protobuf git submodule update --init --recursive mkdir protobuf_build cd protobuf_build cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -DCMAKE_CXX_STANDARD=14 -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DZLIB_INCLUDE_DIR=<zlib-root-dir>\build\install\include -DZLIB_LIBRARY=<zlib-root-dir>\build\install\lib\zlib.lib -DABSL_PROPAGATE_CXX_STD=ON ../cmake cmake --build . --config Release -j 2 cmake --build . --config Release --target install ``` Build ncnn library (replace `<zlib-root-dir>` and `<protobuf-root-dir>` with a proper path): ```shell cd <ncnn-root-dir> mkdir -p build cd build cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -DCMAKE_PREFIX_PATH=<protobuf-root-dir>/protobuf_build\install\cmake -DZLIB_INCLUDE_DIR=<zlib-root-dir>\build\install\include -DZLIB_LIBRARY=<zlib-root-dir>\build\install\lib\zlib.lib -Dabsl_DIR=<protobuf-root-dir>/protobuf_build\install\lib\cmake\absl -Dutf8_range_DIR=<protobuf-root-dir>/protobuf_build\install\lib\cmake\utf8_range -DNCNN_VULKAN=ON .. cmake --build . --config Release -j 2 cmake --build . 
--config Release --target install ``` *** ### Build for Windows x64 using MinGW-w64 Download MinGW-w64 toolchain from [winlibs](https://winlibs.com/) or [w64devkit](https://github.com/skeeto/w64devkit), add `bin` folder to environment variables. Build ncnn library: ```shell cd mkdir build cd build cmake -DNCNN_VULKAN=ON -G "MinGW Makefiles" .. cmake --build . --config Release -j 4 cmake --build . --config Release --target install ``` *** ### Build for Windows XP (x86) > **Note:** Windows XP support is provided through collaborative contributions from [@Sugar-Baby](https://github.com/Sugar-Baby) and [@AtomAlpaca](https://github.com/AtomAlpaca). #### Using MinGW-w64 Download mingw toolchain targeting 32 bit from [sourceforge](https://jaist.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/8.1.0/threads-posix/dwarf/i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z), extract and add environment variable named `MINGW32_ROOT_PATH` valued by ``, and add `/bin` to `PATH`. ```shell mkdir build cd build cmake -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-mingw.toolchain.cmake" -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF .. -G "MinGW Makefiles" cmake --build . --config Release -j 4 cmake --build . --config Release --target install ``` #### Using Clang Clang requires libraries from mingw. Configure mingw toolchain targeting 32-bit as described in the [MinGW-w64 section](#using-mingw-w64). Install Clang 6.0 or later. ```shell mkdir build cd build cmake -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-clang.toolchain.cmake" -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF .. -G "MinGW Makefiles" cmake --build . --config Release -j 4 cmake --build . --config Release --target install ``` #### Using Visual Studio (MSVC) Install v141_xp toolset for Windows XP: 1. Bring up the Visual Studio installer (Tools → Get Tools and Features) 2. Select Desktop development with C++ 3. 
Select Windows XP support for C++ from the Summary section 4. Click Modify ```shell mkdir build cd build cmake -A WIN32 -G "Visual Studio 17 2022" -T v141_xp -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_WITH_STATIC_CRT=ON -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-msvc.toolchain.cmake" .. cmake --build . --config Release -j 4 cmake --build . --config Release --target install ``` **Note:** The MSVC toolchain uses the `v141_xp` platform toolset for Windows XP compatibility. Vulkan is disabled for XP compatibility, and advanced CPU features (AVX, AVX2, AVX512) are disabled to ensure compatibility with older processors. *** ### Build for macOS We've published ncnn to [brew](https://formulae.brew.sh/formula/ncnn#default) now, you can just use following method to install ncnn if you have the Xcode Command Line Tools installed. ```shell brew update brew install ncnn ``` Or if you want to compile and build ncnn locally, first install Xcode or Xcode Command Line Tools according to your needs. Then install `protobuf` and `libomp` via homebrew ```shell brew install protobuf libomp ``` Download and install Vulkan SDK from ```shell wget https://sdk.lunarg.com/sdk/download/1.3.280.1/mac/vulkansdk-macos-1.3.280.1.dmg -O vulkansdk-macos-1.3.280.1.dmg hdiutil attach vulkansdk-macos-1.3.280.1.dmg sudo /Volumes/vulkansdk-macos-1.3.280.1/InstallVulkan.app/Contents/MacOS/InstallVulkan --root `pwd`/vulkansdk-macos-1.3.280.1 --accept-licenses --default-answer --confirm-command install hdiutil detach /Volumes/vulkansdk-macos-1.3.280.1 # setup env export VULKAN_SDK=`pwd`/vulkansdk-macos-1.3.280.1/macOS ``` ```shell cd mkdir -p build cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=MAC -DARCHS="x86_64;arm64" \ -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.3.280.1/macOS/lib/libMoltenVK.dylib \ -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON .. cmake --build . -j 4 cmake --build . 
--target install ``` *Note: If you encounter `libomp` related errors during installation, you can also check our GitHub Actions at [here](https://github.com/Tencent/ncnn/blob/d91cccf/.github/workflows/macos-x64-gpu.yml#L50-L68) to install and use `openmp`.* *** ### Build for ARM Cortex-A family with cross-compiling Download ARM toolchain from https://developer.arm.com/open-source/gnu-toolchain/gnu-a/downloads ```shell export PATH=":${PATH}" ``` Alternatively install a cross-compiler provided by the distribution (i.e. on Debian / Ubuntu, you can do `sudo apt install g++-arm-linux-gnueabi g++-arm-linux-gnueabihf g++-aarch64-linux-gnu`). Depending on your needs build one or more of the below targets. AArch32 target with soft float (arm-linux-gnueabi) ```shell cd mkdir -p build-arm-linux-gnueabi cd build-arm-linux-gnueabi cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake .. make -j$(nproc) make install ``` AArch32 target with hard float (arm-linux-gnueabihf) ```shell cd mkdir -p build-arm-linux-gnueabihf cd build-arm-linux-gnueabihf cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake .. make -j$(nproc) make install ``` AArch64 GNU/Linux target (aarch64-linux-gnu) ```shell cd mkdir -p build-aarch64-linux-gnu cd build-aarch64-linux-gnu cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake .. make -j$(nproc) make install ``` *** ### Build for Hisilicon platform with cross-compiling Download and install Hisilicon SDK. The toolchain should be in `/opt/hisi-linux/x86-arm` new version of Hisilicon toolchain should be in `/opt/linux/x86-arm/` ```shell cd mkdir -p build cd build # Choose one cmake toolchain file depends on your target platform cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv300.toolchain.cmake .. cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv500.toolchain.cmake .. cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix100.toolchain.cmake .. 
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix200.toolchain.cmake .. cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix210.toolchain.cmake .. make -j$(nproc) make install ``` *** ### Build for AnyCloud platform with cross-compiling Download and install AnyCloud SDK. And load env to set toolchain can access in shell ```shell cd mkdir -p build cd build # Choose one cmake toolchain file depends on your target platform cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/anykav500.toolchain.cmake .. make -j$(nproc) make install ``` *** ### Build for Android You can use the pre-build ncnn-android-lib.zip from https://github.com/Tencent/ncnn/releases Download Android NDK from http://developer.android.com/ndk/downloads/index.html and install it, for example: ```shell unzip android-ndk-r21d-linux-x86_64.zip export ANDROID_NDK= ``` (optional) remove the hardcoded debug flag in Android NDK [android-ndk issue](https://github.com/android-ndk/ndk/issues/243) ``` # open $ANDROID_NDK/build/cmake/android.toolchain.cmake for ndk < r23 # or $ANDROID_NDK/build/cmake/android-legacy.toolchain.cmake for ndk >= r23 # delete "-g" line list(APPEND ANDROID_COMPILER_FLAGS -g -DANDROID ``` Build armv7 library ```shell cd mkdir -p build-android-armv7 cd build-android-armv7 cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON \ -DANDROID_PLATFORM=android-14 -DNCNN_VULKAN=ON .. # If you use cmake >= 3.21 and ndk-r23 # you need to add -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False option for working optimization flags make -j$(nproc) make install ``` Pick `build-android-armv7/install` folder for further JNI usage. Build aarch64 library: ```shell cd mkdir -p build-android-aarch64 cd build-android-aarch64 cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake"\ -DANDROID_ABI="arm64-v8a" \ -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON .. 
# If you use cmake >= 3.21 and ndk-r23 # you need to add -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False option for working optimization flags make -j$(nproc) make install ``` Pick `build-android-aarch64/install` folder for further JNI usage. *** ### Build for iOS on macOS with xcode You can use the pre-build ncnn.framework glslang.framework and openmp.framework from https://github.com/Tencent/ncnn/releases Install xcode You can replace ```-DENABLE_BITCODE=0``` to ```-DENABLE_BITCODE=1``` in the following cmake arguments if you want to build bitcode enabled libraries. Download and install openmp for multithreading inference feature on iPhoneOS ```shell wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz tar -xf openmp-11.0.0.src.tar.xz cd openmp-11.0.0.src # apply some compilation fix sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S mkdir -p build-ios cd build-ios cmake -DCMAKE_TOOLCHAIN_FILE=/toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ -DPLATFORM=OS64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DARCHS="arm64;arm64e" \ -DPERL_EXECUTABLE=/usr/local/bin/perl \ -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. cmake --build . -j 4 cmake --build . 
--target install # copy openmp library and header files to xcode toolchain sysroot # is usually /Applications/Xcode.app or /Applications/Xcode-beta.app depends on your Xcode version sudo cp install/include/* /Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include sudo cp install/lib/libomp.a /Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib ``` Download and install openmp for multithreading inference feature on iPhoneSimulator ```shell wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz tar -xf openmp-11.0.0.src.tar.xz cd openmp-11.0.0.src # apply some compilation fix sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S mkdir -p build-ios-sim cd build-ios-sim cmake -DCMAKE_TOOLCHAIN_FILE=/toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ -DPLATFORM=SIMULATORARM64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DARCHS="x86_64;arm64" \ -DPERL_EXECUTABLE=/usr/local/bin/perl \ -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. cmake --build . -j 4 cmake --build . 
--target install # copy openmp library and header files to xcode toolchain sysroot # is usually /Applications/Xcode.app or /Applications/Xcode-beta.app depends on your Xcode version sudo cp install/include/* /Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include sudo cp install/lib/libomp.a /Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib ``` Package openmp framework: ```shell cd mkdir -p openmp.framework/Versions/A/Headers mkdir -p openmp.framework/Versions/A/Resources ln -s A openmp.framework/Versions/Current ln -s Versions/Current/Headers openmp.framework/Headers ln -s Versions/Current/Resources openmp.framework/Resources ln -s Versions/Current/openmp openmp.framework/openmp lipo -create build-ios/install/lib/libomp.a build-ios-sim/install/lib/libomp.a -o openmp.framework/Versions/A/openmp cp -r build-ios/install/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' /Info.plist > openmp.framework/Versions/A/Resources/Info.plist ``` Download and install Vulkan SDK from https://vulkan.lunarg.com/sdk/home ```shell wget https://sdk.lunarg.com/sdk/download/1.2.189.0/mac/vulkansdk-macos-1.2.189.0.dmg?Human=true -O vulkansdk-macos-1.2.189.0.dmg hdiutil attach vulkansdk-macos-1.2.189.0.dmg sudo /Volumes/vulkansdk-macos-1.2.189.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root `pwd`/vulkansdk-macos-1.2.189.0 --accept-licenses --default-answer --confirm-command install hdiutil detach /Volumes/vulkansdk-macos-1.2.189.0 # setup env export VULKAN_SDK=`pwd`/vulkansdk-macos-1.2.189.0/macOS ``` Build library for iPhoneOS: ```shell cd git submodule update --init mkdir -p build-ios cd build-ios cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=OS64 -DARCHS="arm64;arm64e" \ -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" 
-DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j 4 cmake --build . --target install ``` Build library for iPhoneSimulator: ```shell cd mkdir -p build-ios-sim cd build-ios-sim cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=SIMULATORARM64 -DARCHS="x86_64;arm64" \ -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j 4 cmake --build . --target install ``` Package glslang framework for iPhoneOS: ```shell cd mkdir -p glslang.framework/Versions/A/Headers mkdir -p glslang.framework/Versions/A/Resources ln -s A glslang.framework/Versions/Current ln -s Versions/Current/Headers glslang.framework/Headers ln -s Versions/Current/Resources glslang.framework/Resources ln -s Versions/Current/glslang glslang.framework/glslang libtool -static build-ios/install/lib/libglslang.a build-ios/install/lib/libSPIRV.a -o build-ios/install/lib/libglslang_combined.a lipo -create build-ios/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang cp -r build/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist ``` Package ncnn framework for iPhoneOS: ```shell cd mkdir -p ncnn.framework/Versions/A/Headers mkdir -p ncnn.framework/Versions/A/Resources ln -s A 
ncnn.framework/Versions/Current ln -s Versions/Current/Headers ncnn.framework/Headers ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create build-ios/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn cp -r build-ios/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist ``` Pick `ncnn.framework` `glslang.framework` and `openmp.framework` folder for app development. *** ### Build for WebAssembly Install Emscripten ```shell git clone https://github.com/emscripten-core/emsdk.git cd emsdk ./emsdk install 3.1.28 ./emsdk activate 3.1.28 source emsdk_env.sh ``` Build without any extension for general compatibility: ```shell mkdir -p build cd build cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \ -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j 4 cmake --build . --target install ``` Build with WASM SIMD extension: ```shell mkdir -p build-simd cd build-simd cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \ -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j 4 cmake --build . 
--target install ``` Build with WASM Thread extension: ```shell mkdir -p build-threads cd build-threads cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \ -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j 4 cmake --build . --target install ``` Build with WASM SIMD and Thread extension: ```shell mkdir -p build-simd-threads cd build-simd-threads cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \ -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j 4 cmake --build . --target install ``` Pick `build-XYZ/install` folder for further usage. *** ### Build for AllWinner D1 Download c906 toolchain package from https://www.xrvm.cn/community/download?id=4453617141140230144 ```shell tar -xf Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.1.0-20250522.tar.gz export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.1.0 ``` Build ncnn with riscv-v vector and simpleocv enabled: ```shell mkdir -p build-c906 cd build-c906 cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v310.toolchain.cmake \ -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=OFF -DNCNN_XTHEADVECTOR=ON -DNCNN_ZFH=ON -DNCNN_ZVFH=OFF \ -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. cmake --build . -j 4 cmake --build . --target install ``` Pick `build-c906/install` folder for further usage. You can upload binary inside `build-c906/examples` folder and run on D1 board for testing. 
*** ### Build for Loongson 2K1000 For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd/fmsub/maddv/msubv bug. Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd``` and ```__msa_fmsub``` and apply changes as the following ```c // #define __msa_fmadd_w __builtin_msa_fmadd_w // #define __msa_fmadd_d __builtin_msa_fmadd_d // #define __msa_fmsub_w __builtin_msa_fmsub_w // #define __msa_fmsub_d __builtin_msa_fmsub_d #define __msa_fmadd_w(a, b, c) __builtin_msa_fmadd_w(c, b, a) #define __msa_fmadd_d(a, b, c) __builtin_msa_fmadd_d(c, b, a) #define __msa_fmsub_w(a, b, c) __builtin_msa_fmsub_w(c, b, a) #define __msa_fmsub_d(a, b, c) __builtin_msa_fmsub_d(c, b, a) ``` find ```__msa_maddv``` and ```__msa_msubv``` and apply changes as the following ```c // #define __msa_maddv_b __builtin_msa_maddv_b // #define __msa_maddv_h __builtin_msa_maddv_h // #define __msa_maddv_w __builtin_msa_maddv_w // #define __msa_maddv_d __builtin_msa_maddv_d // #define __msa_msubv_b __builtin_msa_msubv_b // #define __msa_msubv_h __builtin_msa_msubv_h // #define __msa_msubv_w __builtin_msa_msubv_w // #define __msa_msubv_d __builtin_msa_msubv_d #define __msa_maddv_b(a, b, c) __builtin_msa_maddv_b(c, b, a) #define __msa_maddv_h(a, b, c) __builtin_msa_maddv_h(c, b, a) #define __msa_maddv_w(a, b, c) __builtin_msa_maddv_w(c, b, a) #define __msa_maddv_d(a, b, c) __builtin_msa_maddv_d(c, b, a) #define __msa_msubv_b(a, b, c) __builtin_msa_msubv_b(c, b, a) #define __msa_msubv_h(a, b, c) __builtin_msa_msubv_h(c, b, a) #define __msa_msubv_w(a, b, c) __builtin_msa_msubv_w(c, b, a) #define __msa_msubv_d(a, b, c) __builtin_msa_msubv_d(c, b, a) ``` Build ncnn with mips msa and simpleocv enabled: ```shell mkdir -p build cd build cmake -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=ON -DNCNN_MMI=ON -DNCNN_SIMPLEOCV=ON .. cmake --build . -j 2 cmake --build . 
--target install ``` Pick `build/install` folder for further usage. You can run binary inside `build/examples` folder for testing. *** ### Build for Termux on Android Install the Termux app on your phone, and install Ubuntu in Termux. If you want to use ssh, just install openssh in Termux. ```shell pkg install proot-distro proot-distro install ubuntu ``` You can also list the systems that can be installed using `proot-distro list`. Once Ubuntu has been installed successfully, use `proot-distro login ubuntu` to log in to Ubuntu. Then build ncnn; there is no need to install any other dependencies. ```shell git clone https://github.com/Tencent/ncnn.git cd ncnn git submodule update --init mkdir -p build cd build cmake -DCMAKE_BUILD_TYPE=Release -DNCNN_BUILD_EXAMPLES=ON -DNCNN_PLATFORM_API=OFF -DNCNN_SIMPLEOCV=ON .. make -j$(nproc) ``` Then you can run a test > On my Pixel 3 XL using Qualcomm 845, `256-ncnn.png` can't be loaded ```shell cd ../examples ../build/examples/squeezenet ../images/128-ncnn.png ``` ### Build for QNX Request a license and download the SDP from QNX Software Center: https://www.qnx.com/products/everywhere/ . Set up the QNX environment by invoking the SDP's bundled script: on Windows, open cmd and run ```batch call C:\Users\zz\qnx800\qnxsdp-env.bat ``` on Linux, use /bin/bash and run ```shell source /home/zz/qnx800/qnxsdp-env.sh ``` If it gives the error `cannot find ld` on Linux, solve it by creating a link file: ```shell cd ${QNX_HOST}/usr/bin/ ln -s aarch64-unknown-nto-qnx7.1.0-ld ld ``` Build ncnn with cmake in the same shell: ```shell git clone https://github.com/Tencent/ncnn.git cd ncnn git submodule update --init mkdir -p build-qnx cd build-qnx cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-qnx.toolchain.cmake .. make -j$(nproc) make install ``` Pick `build-qnx/install` folder for further usage. 
### Build for Nintendo 3DS Homebrew Launcher Install DevkitPRO toolchains - If you are working on Windows, download the DevkitPro installer from [DevkitPro](https://devkitpro.org/wiki/Getting_Started). - If you are using Ubuntu, the official guidelines from DevkitPro might not work for you. Try using the lines below to install: ```shell sudo apt-get update sudo apt-get upgrade wget https://apt.devkitpro.org/install-devkitpro-pacman chmod +x ./install-devkitpro-pacman sudo ./install-devkitpro-pacman ``` ```shell export DEVKITPRO=/opt/devkitpro export DEVKITARM=/opt/devkitpro/devkitARM export DEVKITPPC=/opt/devkitpro/devkitPPC export PATH=/opt/devkitpro/tools/bin:$PATH source ~/.profile ``` ```shell sudo dkp-pacman -Sy sudo dkp-pacman -Syu sudo dkp-pacman -S 3ds-dev ``` Copy the toolchain files from [3DS-cmake](https://github.com/Xtansia/3ds-cmake) (DevkitArm3DS.cmake and the cmake folder) to NCNN's toolchains folder. ``` ├── toolchains │   ├── cmake │   │   ├── bin2s_header.h.in │   │   ├── FindCITRO3D.cmake │   │   ├── FindCTRULIB.cmake │   │   ├── FindFreetype.cmake │   │   ├── FindJPEG.cmake │   │   ├── FindPNG.cmake │   │   ├── FindSF2D.cmake │   │   ├── FindSFIL.cmake │   │   ├── FindSFTD.cmake │   │   ├── FindZLIB.cmake │   │   ├── LibFindMacros.cmake │   │   ├── Tools3DS.cmake │   │   ├── ToolsGBA.cmake │   │   └── try_add_imported_target.cmake │   ├── DevkitArm3DS.cmake ... ``` Build with: ```shell cd ncnn mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/DevkitArm3DS.cmake -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_VFPV4=OFF .. make -j4 make install ``` Modify the Makefile in Homebrew example to link and use NCNN in your 3DS Homebrew app. *** ### Build for HarmonyOS with cross-compiling Download and install HarmonyOS SDK. The sdk installation directory is `/opt/ohos-sdk/linux` ```shell cd <ncnn-root-dir> mkdir -p build cd build export HM_SDK=/opt/ohos-sdk/linux # Choose HarmonyOS sdk cmake toolchain file. 
# If you want to enable vulkan, set -DNCNN_VULKAN=ON # The HarmonyOS sdk does not support openmp, use ncnn simpleomp instead. # Cross-compiling with CMake must use the one provided by the HarmonyOS SDK; otherwise, it won't recognize parameters like OHOS_PLATFORM, leading to compilation errors. ${HM_SDK}/native/build-tools/cmake/bin/cmake -DOHOS_STL=c++_static -DOHOS_ARCH=arm64-v8a -DOHOS_PLATFORM=OHOS -DCMAKE_TOOLCHAIN_FILE=${HM_SDK}/native/build/cmake/ohos.toolchain.cmake -DNCNN_VULKAN=ON -DNCNN_SIMPLEOMP=ON .. make -j$(nproc) make install ``` *** ### Build for ESP32 with cross-compiling Download esp-idf sdk ```shell git clone https://github.com/espressif/esp-idf cd esp-idf git submodule update --init --recursive ``` Install esp-idf sdk and configure the environment ```shell ./install.sh source export.sh ``` And for Windows, you should use: ```bash install.bat # or `install.ps1` export.bat ``` Note: python>=3.8, cmake>=3.24.0 Build ncnn library: ```shell mkdir build-esp32 cd build-esp32 cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/esp32.toolchain.cmake -DCMAKE_BUILD_TYPE=Release .. make -j 4 make install ``` Note: Make sure to compile in esp-idf environment. The compiled ncnn library and headers can be put to the esp32 project to test. ================================================ FILE: docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md ================================================ ### caffemodel should be row-major `caffe2ncnn` tool assumes the caffemodel is row-major (produced by c++ caffe train command). The kernel 3x3 weights should be stored as ``` a b c d e f g h i ``` However, matlab caffe produced col-major caffemodel. You have to transpose all the kernel weights by yourself or re-training using c++ caffe train command. Besides, you may interest in https://github.com/conanhujinming/matcaffe2caffe ### check input is RGB or BGR If your caffemodel is trained using c++ caffe and opencv, then the input image should be BGR order. 
If your model is trained using matlab caffe or pytorch or mxnet or tensorflow, the input image would probably be in RGB order. The channel order can be changed on-the-fly through the proper pixel type enum ``` // construct RGB blob from rgb image ncnn::Mat in_rgb = ncnn::Mat::from_pixels(rgb_data, ncnn::Mat::PIXEL_RGB, w, h); // construct BGR blob from bgr image ncnn::Mat in_bgr = ncnn::Mat::from_pixels(bgr_data, ncnn::Mat::PIXEL_BGR, w, h); // construct BGR blob from rgb image ncnn::Mat in_bgr = ncnn::Mat::from_pixels(rgb_data, ncnn::Mat::PIXEL_RGB2BGR, w, h); // construct RGB blob from bgr image ncnn::Mat in_rgb = ncnn::Mat::from_pixels(bgr_data, ncnn::Mat::PIXEL_BGR2RGB, w, h); ``` ### image decoding JPEG (`.jpg`, `.jpeg`) is a lossy compression format, so people may get different pixel values for the same image at the same position. `.bmp` images are recommended instead. ### interpolation / resizing There are several image resizing methods, which may generate different results for the same input image. Even if we specify the same interpolation method, different frameworks/libraries and their various versions may also introduce differences. A good practice is to feed an image of the same size as the input layer expects, e.g. read a 224x224 bmp image when the input layer needs 224x224 size. ### Mat::from_pixels/from_pixels_resize assume that the pixel data is continuous You shall pass a continuous pixel buffer to the from_pixels family. If your image is an opencv submat from an image roi, call clone() to get a continuous one. 
``` cv::Mat image;// the image cv::Rect facerect;// the face rectangle cv::Mat faceimage = image(facerect).clone();// get a continuous sub image ncnn::Mat in = ncnn::Mat::from_pixels(faceimage.data, ncnn::Mat::PIXEL_BGR, faceimage.cols, faceimage.rows); ``` ### pre process Apply pre-processing according to your training configuration. Different models have different pre-processing configs; you may find the following transform config in the Data layer section ``` transform_param { mean_value: 103.94 mean_value: 116.78 mean_value: 123.68 scale: 0.017 } ``` Then the corresponding code for ncnn pre-processing is ```cpp const float mean_vals[3] = { 103.94f, 116.78f, 123.68f }; const float norm_vals[3] = { 0.017f, 0.017f, 0.017f }; in.substract_mean_normalize(mean_vals, norm_vals); ``` Mean files are not currently supported, so you have to pre-process the input data yourself (using opencv or something similar) ``` transform_param { mean_file: "imagenet_mean.binaryproto" } ``` For pytorch or mxnet-gluon ```python transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ``` Then the corresponding code for ncnn pre-processing is ```cpp // R' = (R / 255 - 0.485) / 0.229 = (R - 0.485 * 255) / 0.229 / 255 // G' = (G / 255 - 0.456) / 0.224 = (G - 0.456 * 255) / 0.224 / 255 // B' = (B / 255 - 0.406) / 0.225 = (B - 0.406 * 255) / 0.225 / 255 const float mean_vals[3] = {0.485f*255.f, 0.456f*255.f, 0.406f*255.f}; const float norm_vals[3] = {1/0.229f/255.f, 1/0.224f/255.f, 1/0.225f/255.f}; in.substract_mean_normalize(mean_vals, norm_vals); ``` ### use the desired blob The blob names for input and extract differ among models. For example, squeezenet v1.1 uses "data" as the input blob and "prob" as the output blob, while mobilenet-ssd uses "data" as the input blob and "detection_out" as the output blob. Some models may need multiple inputs or produce multiple outputs. 
```cpp ncnn::Extractor ex = net.create_extractor(); ex.input("data", in);// change "data" to yours ex.input("mask", mask);// change "mask" to yours ex.extract("output1", out1);// change "output1" to yours ex.extract("output2", out2);// change "output2" to yours ``` ### blob may have channel gap Each channel pointer is aligned by 128bit in ncnn Mat structure. blob may have gaps between channels if (width x height) can not divided exactly by 4 Prefer using ncnn::Mat::from_pixels or ncnn::Mat::from_pixels_resize for constructing input blob from image data If you do need a continuous blob buffer, reshape the output. ```cpp // out is the output blob extracted ncnn::Mat flattened_out = out.reshape(out.w * out.h * out.c); // plain array, C-H-W const float* outptr = flattened_out; ``` ### create new Extractor for each image The `ncnn::Extractor` object is stateful, if you reuse for different input, you will always get exact the same result cached inside. Always create new Extractor to process images in loop unless you do know how the stateful Extractor works. ```cpp for (int i=0; i is not guaranteed to be 32bit aligned. you can store your binary buffer in ncnn::Mat structure, its internal memory is aligned. 
### undefined reference to '__kmpc_XYZ_XYZ' use clang for building android shared library comment the following line in your Application.mk ``` NDK_TOOLCHAIN_VERSION := 4.9 ``` ### crash on android with '__kmp_abort_process' This usually happens if you bundle multiple shared library with openmp linked It is actually an issue of the android ndk https://github.com/android/ndk/issues/1028 On old android ndk, modify the link flags as ``` -Wl,-Bstatic -lomp -Wl,-Bdynamic ``` For recent ndk >= 21 ``` -fstatic-openmp ``` ### dlopen failed: library "libomp.so" not found Newer android ndk defaults to dynamic openmp runtime modify the link flags as ``` -fstatic-openmp -fopenmp ``` ### crash when freeing a ncnn dynamic library(*.dll/*.so) built with openMP for optimal performance, the openmp threadpool spin waits for about a second prior to shutting down in case more work becomes available. If you unload a dynamic library that's in the process of spin-waiting, it will crash in the manner you see (most of the time). Just set OMP_WAIT_POLICY=passive in your environment, before calling loadlibrary. or Just wait a few seconds before calling freelibrary. 
You can also use the following method to set environment variables in your code: for msvc++: ``` SetEnvironmentVariable(_T("OMP_WAIT_POLICY"), _T("passive")); ``` for g++: ``` setenv("OMP_WAIT_POLICY", "passive", 1) ``` reference: https://stackoverflow.com/questions/34439956/vc-crash-when-freeing-a-dll-built-with-openmp ================================================ FILE: docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md ================================================ ### how to enable ncnn vulkan capability follow [the build and install instruction](https://github.com/Tencent/ncnn/blob/master/docs/how-to-build/how-to-build.md) make sure you have installed vulkan sdk from [lunarg vulkan sdk website](https://vulkan.lunarg.com/sdk/home) Usually, you can enable the vulkan compute inference feature by adding only one line of code to your application. ```cpp // enable vulkan compute feature before loading ncnn::Net net; net.opt.use_vulkan_compute = 1; ``` ### does my graphics device support vulkan Some platforms have been tested and known working. In theory, if your platform support vulkan api, either 1.0 or 1.1, it shall work. * Y = known work * ? = shall work, not confirmed * / = not applied | |windows|linux|android|mac|ios| |---|---|---|---|---|---| |intel|Y|Y|?|?|/| |amd|Y|Y|/|?|/| |nvidia|Y|Y|?|/|/| |qcom|/|/|Y|/|/| |apple|/|/|/|Y|Y| |arm|/|?|Y|/|/| You can search [the vulkan database](https://vulkan.gpuinfo.org) to see if your device supports vulkan. Some old buggy drivers may produce wrong result, that are blacklisted in ncnn and treated as non-vulkan capable device. You could check if your device and driver have this issue with [my conformance test here](vulkan-conformance-test). Most of these systems are android with version lower than 8.1. ### why using vulkan over cuda/opencl/metal In the beginning, I had no GPGPU programming experience, and I had to learn one. 
vulkan is considered more portable and well supported by vendors and the cross-platform low-overhead graphics api. As a contrast, cuda is only available on nvidia device, metal is only available on macos and ios, while loading opencl library is banned in android 7.0+ and does not work on ios. ### I got errors like "vkCreateComputePipelines failed -1000012000" or random stalls or crashes Upgrade your vulkan driver. [intel https://downloadcenter.intel.com/product/80939/Graphics-Drivers](https://downloadcenter.intel.com/product/80939/Graphics-Drivers) [amd https://www.amd.com/en/support](https://www.amd.com/en/support) [nvidia https://www.nvidia.com/Download/index.aspx](https://www.nvidia.com/Download/index.aspx) ### how to use ncnn vulkan on android minimum android ndk version: android-ndk-r18b minimum sdk platform api version: android-24 link your jni project with libvulkan.so [The squeezencnn example](https://github.com/Tencent/ncnn/tree/master/examples/squeezencnn) have equipped gpu inference, you could take it as reference. ### how to use ncnn vulkan on ios setup vulkan sdk (https://vulkan.lunarg.com/sdk/home#mac) metal only works on real device with arm64 cpu (iPhone 5s and later) link your project with MoltenVK framework and Metal ### what about the layers without vulkan support These layers have vulkan support currently AbsVal, BatchNorm, BinaryOp, Cast, Clip, Concat, Convolution, ConvolutionDepthWise, Crop, Deconvolution, DeconvolutionDepthWise, Dropout, Eltwise, Flatten, HardSigmoid, InnerProduct, Interp, LRN, Packing, Padding, Permute, Pooling(pad SAME not supported), PReLU, PriorBox, ReLU, Reorg, Reshape, Scale, ShuffleChannel, Sigmoid, Softmax, TanH, UnaryOp For these layers without vulkan support, ncnn inference engine will automatically fallback to cpu path. Thus, it is usually not a serious issue if your network only has some special head layers like SSD or YOLO. All examples in ncnn are known working properly with vulkan enabled. 
### my model runs slower on gpu than cpu The current vulkan inference implementation is far from the preferred state. Many handful optimization techniques are planned, such as winograd convolution, operator fusion, fp16 storage and arithmetic etc. It is common that your model runs slower on gpu than cpu on arm devices like mobile phones, since we have quite good arm optimization in ncnn ;) ### vulkan device not found / extra high cpu utility while vulkan is enabled on nvidia gpu There are several reasons could lead to this outcome. First please check your driver status with `nvidia-smi`. If you have correctly installed your driver, you should see something like this: ```bash $ nvidia-smi Sat Mar 06 19:53:16 2021 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 451.48 Driver Version: 451.48 CUDA Version: 11.0 | |-------------------------------+----------------------+----------------------+ | GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | |===============================+======================+======================| | 0 GeForce GTX 1060 WDDM | 00000000:02:00.0 Off | N/A | | N/A 31C P8 5W / N/A | 90MiB / 6144MiB | 0% Default | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+ ``` If `nvidia-smi` crashes or cannot be found, please reinstall your graphics driver. If ncnn *is* utilizing the Tesla GPU, you can see your program in the `Processes` block at the bottom. 
In that case, it's likely some operators are not yet supported in Vulkan, and have fallen back to the CPU, thus leading to a low utilization of the GPU. If you *couldn't* find your process running, please check the active driver model, which can be found to the right of your device name. For Geforce and Titan GPUs, the default driver model is WDDM (Windows Display Driver Model), which supports both rendering graphics as well as computing. But for Tesla GPUs, without configuration, the driver model is defaulted to TCC ([Tesla Compute Cluster](https://docs.nvidia.com/gameworks/content/developertools/desktop/tesla_compute_cluster.htm)). NVIDIA's TCC driver does not support Vulkan, so you need to use the following command to set the driver model back to WDDM, to use Vulkan: ```bash $ nvidia-smi -g 0 -dm 0 ``` The number following `-g` is the GPU ID (which can be found to the left of your device name in `nvidia-smi` output); and `-dm` stands for driver model, 0 refers to WDDM and 1 means TCC. ================================================ FILE: docs/how-to-use-and-FAQ/build-minimal-library.md ================================================ For some reason, if you're not happy with the binary size of the ncnn library, then here is the cheatsheet that helps you to build a minimal ncnn :P ### disable c++ rtti and exceptions ``` cmake -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON .. ``` * Cannot use RTTI and Exceptions when ncnn functions are called. ### disable vulkan support ``` cmake -DNCNN_VULKAN=OFF .. ``` * Cannot use GPU acceleration. ### disable NCNN_STDIO ``` cmake -DNCNN_STDIO=OFF .. ``` * Cannot load model from files, but can load model from memory or by Android Assets. Read more [here](https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md#load-model). ### disable NCNN_STRING ``` cmake -DNCNN_STRING=OFF .. ``` * Cannot load human-readable param files with visible strings, but can load binary param.bin files.
Read more [here](https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md#strip-visible-string) * Cannot identify blobs by string name when calling `Extractor::input / extract`, but can identify them by enum value in `id.h`. Read more [here](https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md#input-and-output). ### disable NCNN_BF16 ``` cmake -DNCNN_BF16=OFF .. ``` * Cannot use bf16 storage type in inference. ### disable NCNN_INT8 ``` cmake -DNCNN_INT8=OFF .. ``` * Cannot use quantized int8 inference. ### drop pixel drawing functions ``` cmake -DNCNN_PIXEL_DRAWING=OFF .. ``` * Cannot use functions that draw basic shapes and text like `ncnn::draw_rectangle_xx / ncnn::draw_circle_xx / ncnn::draw_text_xx`, but functions like `Mat::from_pixels / from_pixels_resize` are still available. ### drop pixel rotate and affine functions ``` cmake -DNCNN_PIXEL_ROTATE=OFF -DNCNN_PIXEL_AFFINE=OFF .. ``` * Cannot use functions that do rotation and affine transformation like `ncnn::kanna_rotate_xx / ncnn::warpaffine_bilinear_xx`, but functions like `Mat::from_pixels / from_pixels_resize` are still available. ### drop pixel functions ``` cmake -DNCNN_PIXEL=OFF .. ``` * Cannot use functions converting between image pixels and Mat like `Mat::from_pixels / from_pixels_resize / to_pixels / to_pixels_resize`, and you need to create a Mat and fill in data by hand. ### disable openmp ``` cmake -DNCNN_OPENMP=OFF .. ``` * Cannot use openmp multi-threading acceleration. If you want to run a model in a single thread on your target machine, it is recommended to turn off the option. ### disable avx2 and arm82 optimized kernel ``` cmake -DNCNN_AVX2=OFF -DNCNN_ARM82=OFF .. ``` * Do not compile optimized kernels using avx2 / arm82 instruction set extensions. If your target machine does not support some of them, it is recommended to turn off the related options. ### disable runtime cpu instruction dispatch ``` cmake -DNCNN_RUNTIME_CPU=OFF ..
``` * Cannot check supported cpu instruction set extensions and use related optimized kernels at runtime. * If you know which instruction set extensions are supported on your target machine like avx2 / arm82, you can turn on related options like `-DNCNN_AVX2=ON / -DNCNN_ARM82=ON` by hand and then sse2 / arm8 version kernels will not be compiled. ### drop layers not used ``` cmake -DWITH_LAYER_absval=OFF -DWITH_LAYER_bnll=OFF .. ``` * If your model does not include some layers, taking absval / bnll as an example above, you can drop them. * Some key or dependency layers should not be dropped, like convolution / innerproduct, their dependency like padding / flatten, and activation like relu / clip. ### disable c++ stl ``` cmake -DNCNN_SIMPLESTL=ON .. ``` * The STL provided by the compiler is no longer depended on, and `simplestl` provided by ncnn is used as a replacement. Users also can only use `simplestl` when ncnn functions are called. * Usually with compiler parameters `-nodefaultlibs -fno-builtin -nostdinc++ -lc` * Need cmake parameters `cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_STL=system` to avoid STL conflict when compiling to Android. ### drop optimized kernel not used * Modify the source code under `ncnn/src/layer/arm/` to delete unnecessary optimized kernels or replace them with empty functions. * You can also drop layers and related optimized kernels by `-DWITH_LAYER_absval=OFF` as mentioned above. ### drop operators from BinaryOp UnaryOp * Modify `ncnn/src/layer/binaryop.cpp unaryop.cpp` and `ncnn/src/layer/arm/binaryop_arm.cpp unaryop_arm.cpp` by hand to delete unnecessary operators.
================================================ FILE: docs/how-to-use-and-FAQ/efficient-roi-resize-rotate.md ================================================ ### image roi crop + convert to ncnn::Mat ``` +--------------+ | y | /-------/ | x +-------+ | +-------+| | | roih |im_h => | roih | +-roiw--+ | +-roiw--+/ | | +-----im_w-----+ ``` ```cpp ncnn::Mat in = ncnn::Mat::from_pixels_roi(im.data, ncnn::Mat::PIXEL_RGB, im_w, im_h, x, y, roiw, roih); ``` For Android Application, it is : ```cpp ncnn::Mat in = ncnn::Mat::from_android_bitmap_roi(env, image, ncnn::Mat::PIXEL_RGBA2RGB, x, y, roiw, roih); ``` ### image roi crop + resize + convert to ncnn::Mat ``` +--------------+ | y | /----/ | x +-------+ | +----+| | | roih |im_h => | target_h | +-roiw--+ | | || | | +----+/ +-----im_w-----+ target_w ``` ```cpp ncnn::Mat in = ncnn::Mat::from_pixels_roi_resize(im.data, ncnn::Mat::PIXEL_RGB, im_w, im_h, x, y, roiw, roih, target_w, target_h); ``` For Android Application, it is : ```cpp ncnn::Mat in = ncnn::Mat::from_android_bitmap_roi_resize(env, image, ncnn::Mat::PIXEL_RGBA2RGB, x, y, roiw, roih, target_w, target_h); ``` ### ncnn::Mat export image + offset paste ``` +--------------+ /-------/ | y | +-------+| | x +-------+ | | h| => | | h |im_h +---w---+/ | +---w---+ | | | +-----im_w-----+ ``` ```cpp const unsigned char* data = im.data + (y * im_w + x) * 3; out.to_pixels(data, ncnn::Mat::PIXEL_RGB, im_w * 3); ``` ### ncnn::Mat export image + resize + roi paste ``` +--------------+ /----/ | y | +----+| | x +-------+ | | h| => | | roih|im_h | || | +-roiw--+ | +-w--+/ | | +-----im_w-----+ ``` ```cpp const unsigned char* data = im.data + (y * im_w + x) * 3; out.to_pixels_resize(data, ncnn::Mat::PIXEL_RGB, roiw, roih, im_w * 3); ``` ### image roi crop + resize ``` +--------------+ | y | | x +-------+ | +----+ | | roih|im_h => | target_h | +-roiw--+ | | | | | +----+ +-----im_w-----+ target_w ``` ```cpp const unsigned char* data = im.data + (y * im_w + x) * 3; 
ncnn::resize_bilinear_c3(data, roiw, roih, im_w * 3, outdata, target_w, target_h, target_w * 3); ``` ### image resize + offset paste ``` +--------------+ | y | +----+ | x +-------+ | | h => | | roih |im_h | | | +-roiw--+ | +-w--+ | | +-----im_w-----+ ``` ```cpp unsigned char* outdata = im.data + (y * im_w + x) * 3; ncnn::resize_bilinear_c3(data, w, h, w * 3, outdata, roiw, roih, im_w * 3); ``` ### image roi crop + resize + roi paste ``` +--------------+ +-----------------+ | y | | roiy | | x +-------+ | |roix----------+ | | | h |im_h => | | target_h|outim_h | +---w---+ | | | | | | | | +-target_w-+ | +-----im_w-----+ +-----outim_w-----+ ``` ```cpp const unsigned char* data = im.data + (y * im_w + x) * 3; unsigned char* outdata = outim.data + (roiy * outim_w + roix) * 3; ncnn::resize_bilinear_c3(data, w, h, im_w * 3, outdata, target_w, target_h, outim_w * 3); ``` ### image roi crop + rotate ``` +--------------+ | y | | x +-------+ | +---+ | | < < h |im_h => | ^ |w | +---w---+ | | ^ | | | +---+ +-----im_w-----+ h ``` ```cpp const unsigned char* data = im.data + (y * im_w + x) * 3; ncnn::kanna_rotate_c3(data, w, h, im_w * 3, outdata, h, w, h * 3, 6); ``` ### image rotate + offset paste ``` +--------------+ | y | +---+ | x +-------+ | | ^ |h => | | < < w |im_h | ^ | | +---h---+ | +---+ | | w +-----im_w-----+ ``` ```cpp unsigned char* outdata = im.data + (y * im_w + x) * 3; ncnn::kanna_rotate_c3(data, w, h, w * 3, outdata, h, w, im_w * 3, 7); ``` ### image roi crop + rotate + roi paste ``` +--------------+ +-----------------+ | y | | roiy | | x +-------+ | | roix +---+ | | | < < h |im_h => | | ^ w |outim_h | +---w---+ | | | ^ | | | | | +-h-+ | +-----im_w-----+ +-----outim_w-----+ ``` ```cpp const unsigned char* data = im.data + (y * im_w + x) * 3; unsigned char* outdata = outim.data + (roiy * outim_w + roix) * 3; ncnn::kanna_rotate_c3(data, w, h, im_w * 3, outdata, h, w, outim_w * 3, 6); ``` ================================================ FILE: 
docs/how-to-use-and-FAQ/ncnn-load-model.md ================================================ ### the comprehensive model loading api table |load from|alexnet.param|alexnet.param.bin|alexnet.bin| |---|---|---|---| |file path|load_param(const char*)|load_param_bin(const char*)|load_model(const char*)| |file path
(wchar_t for windows)|load_param(const wchar_t*)|load_param_bin(const wchar_t*)|load_model(const wchar_t*)| |file descriptor|load_param(FILE*)|load_param_bin(FILE*)|load_model(FILE*)| |file memory|load_param_mem(const char*)|load_param(const unsigned char*)|load_model(const unsigned char*)| |android asset|load_param(AAsset*)|load_param_bin(AAsset*)|load_model(AAsset*)| |android asset path|load_param(AAssetManager*, const char*)|load_param_bin(AAssetManager*, const char*)|load_model(AAssetManager*, const char*)| |custom IO reader|load_param(const DataReader&)|load_param_bin(const DataReader&)|load_model(const DataReader&)| ### points to note 1. Either of the following combination shall be enough for loading model * alexnet.param + alexnet.bin * alexnet.param.bin + alexnet.bin 2. Never modify Net opt member after loading 3. Most loading functions return 0 if success, except loading alexnet.param.bin and alexnet.bin from file memory, which returns the bytes consumed after loading * size_t Net::load_param(const unsigned char*) * size_t Net::load_model(const unsigned char*) 4. It is recommended to load model from Android asset directly to avoid copying them to sdcard on Android platform 5. The custom IO reader interface can be used to implement on-the-fly model decryption and loading ================================================ FILE: docs/how-to-use-and-FAQ/openmp-best-practice.md ================================================ ncnn openmp best practice ### CPU loadaverage is too high with ncnn. When inference the neural network with ncnn, the cpu occupancy is very high even all CPU cores occupancy close to 100%. If there are other threads or processes that require more cpu resources, the running speed of the program will drop severely. ### The root cause of high CPU usage 1. ncnn uses openmp API to speed up the inference compute. the thread count equals to the cpu core count. If the computing work need to run frequently, it must consume many cpu resources. 2. 
There is a thread pool managed by openmp, the pool size is equal to the cpu core count. (the max value is 15 if there are many more cpu cores?) Openmp needs to sync the thread when acquiring and returning threads to the pool. In order to improve efficiency, almost all omp implementations use spinlock synchronization (except for simpleomp). The default spin time of the spinlock is 200ms. So after a thread is scheduled, the thread needs to busy-wait up to 200ms. ### Why the CPU usage is still high even using vulkan GPU acceleration. 1. Openmp is also used when loading the param bin file, and this part runs on cpu. 2. The fp32 to fp16 conversion before the GPU memory upload and after the download is executed on the cpu, and this part of the logic also uses openmp. ### Solution ``` 1. Bind to the specific cpu core. ``` If you use a device with large and small core CPUs, it is recommended to bind large or small cores through ncnn::set_cpu_powersave(int). Note that Windows does not support binding cores. By the way, it's possible to have multiple threadpools using openmp. A new threadpool will be created for a new thread scope. Suppose your platform is 2 big cores + 4 little cores, and you want to execute model A on 2 big cores and model B on 4 little cores concurrently. Create two threads via std::thread or pthread ``` void thread_1() { ncnn::set_cpu_powersave(2); // bind to big cores netA.opt.num_threads = 2; } void thread_2() { ncnn::set_cpu_powersave(1); // bind to little cores netB.opt.num_threads = 4; } ``` ``` 2. Use fewer threads. ``` Set the number of threads to half of the cpu core count or less through ncnn::set_omp_num_threads(int) or by changing the net.opt.num_threads field. If you are coding with clang libomp, it's recommended that the number of threads does not exceed 8. If you use other omp libraries, it is recommended that the number of threads does not exceed 4. ``` 3. Reduce openmp spinlock blocktime.
``` You can modify openmp blocktime by calling the ncnn::set_kmp_blocktime(int) method or by modifying the net.opt.openmp_blocktime field. This argument is the spin time set by the ncnn API, and the default is 20ms. You can set a smaller value according to the situation, or directly change it to 0. Limitations: At present, only the libomp library of clang is implemented. Neither vcomp nor libgomp have corresponding interfaces. If it is not compiled with clang, this value is still 200ms by default. If you use vcomp or libgomp, you can use the environment variable OMP_WAIT_POLICY=PASSIVE to disable spin time. If you use simpleomp, there is no need to set this parameter. ``` 4. Limit the number of threads available in the openmp thread pool. ``` Even if the number of openmp threads is reduced, the CPU occupancy rate may still be high. This is more common on servers with particularly many CPU cores. This is because the waiting threads in the thread pool use a spinlock to busy-wait, which can be reduced by limiting the number of threads available in the thread pool. Generally, you can set the OMP_THREAD_LIMIT environment variable. simpleomp currently does not support this feature so it does not need to be set. Note that this environment variable is only valid if it is set before the program starts. ``` 5. Disable openmp completely ``` If there is only one cpu core, or if you use the vulkan gpu acceleration, it is recommended to disable openmp, just specify -DNCNN_OPENMP=OFF when compiling with cmake. ================================================ FILE: docs/how-to-use-and-FAQ/openmp-best-practice.zh.md ================================================ ncnn openmp 最佳实践 ### ncnn占用过多cpu资源 使用ncnn推理运算,cpu占用非常高甚至所有核心占用都接近100%。 如果还有其它线程或进程需要较多的cpu资源,运行速度下降严重。 ### cpu占用高的根本原因 1. ncnn使用openmp API控制多线程加速推理计算。默认情况下,线程数等于cpu内核数。如果推理需要高频率运行,必然占用大部分 cpu资源。 2.
openmp内部维护一个线程池,线程池最大可用线程数等于cpu内核数。(核心过多时最大限制是15?)获取和归还线程时需要同步。 为了提高效率,几乎所有omp实现都使用了自旋锁同步(simpleomp除外)。自旋锁默认的spin time是200ms。因此一个线程被调度后, 需要忙等待最多200ms。 ### 为什么使用vulkan加速后cpu占用依然很高。 1. 加载参数文件时也使用了openmp,这部分是在cpu上运行的。 2. 显存上传前和下载后的 fp32 fp16转换是在cpu上执行的,这部分逻辑也使用了openmp。 ### 解决方法 ``` 1. 绑核 ``` 如果使用有大小核cpu的设备,建议通过ncnn::set_cpu_powersave(int)绑定大核或小核,注意windows系统不支持绑核。顺便说一下,ncnn支持不同的模型运行在不同的核心。假设硬件平台有2个大核,4个小核,你想把netA运行在大核,netB运行在小核。 可以通过std::thread or pthread创建两个线程,运行如下代码: ``` void thread_1() { ncnn::set_cpu_powersave(2); // bind to big cores netA.opt.num_threads = 2; } void thread_2() { ncnn::set_cpu_powersave(1); // bind to little cores netB.opt.num_threads = 4; } ``` ``` 2. 使用更少的线程数。 ``` 通过ncnn::set_omp_num_threads(int)或者net.opt.num_threads字段设置线程数为cpu内核数的一半或更小。如果使用clang的libomp, 建议线程数不超过8,如果使用其它omp库,建议线程数不超过4。 ``` 3. 减小openmp blocktime。 ``` 可以修改ncnn::set_kmp_blocktime(int)或者修改net.opt.openmp_blocktime,这个参数是ncnn API设置的spin time,默认是20ms。 可以根据情况设置更小的值,或者直接改为0。 局限:目前只有clang的libomp库有实现,vcomp和libgomp都没有相应接口,如果不是使用clang编译的,这个值默认还是200ms。 如果使用vcomp或libgomp, 可以使用环境变量OMP_WAIT_POLICY=PASSIVE禁用spin time,如果使用simpleomp,不需要设置这个参数。 ``` 4. 限制openmp线程池可用线程数量。 ``` 即使减小了openmp线程数量,cpu占用率仍然可能会很高。这在cpu核心特别多的服务器上比较常见。这是因为线程池中的等待线程使用 自旋锁忙等待,可以通过限制线程池可用线程数量减轻这种影响。 一般可以通过设置OMP_THREAD_LIMIT环境变量。simpleomp目前不支持这一特性,不需要设置。注意这个环境变量仅在程序启动前设置才有效。 ``` 5. 完全禁用openmp ``` 如果只有一个cpu核心,或者使用vulkan加速,建议关闭openmp, cmake编译时指定-DNCNN_OPENMP=OFF即可。 ================================================ FILE: docs/how-to-use-and-FAQ/quantized-int8-inference.md ================================================ # Post Training Quantization Tools To support int8 model deployment on mobile devices,we provide the universal post training quantization tools which can convert the float32 model to int8 model. ## User Guide Example with mobilenet, just need three steps. ### 1. 
Optimize model NOTE: **If your model is converted via pnnx, skip this step.** ```shell ./ncnnoptimize mobilenet.param mobilenet.bin mobilenet-opt.param mobilenet-opt.bin 0 ``` ### 2. Create the calibration table file #### 2.1 From image We suggest using the verification dataset for calibration, which is more than 5000 images. Some imagenet sample images here https://github.com/nihui/imagenet-sample-images ```shell find images/ -type f > imagelist.txt ./ncnn2table mobilenet-opt.param mobilenet-opt.bin imagelist.txt mobilenet.table mean=[104,117,123] norm=[0.017,0.017,0.017] shape=[224,224,3] pixel=BGR thread=8 method=kl ``` * mean and norm are the values you passed to ```Mat::substract_mean_normalize()``` * shape is the blob shape of your model, [w,h] or [w,h,c] > * if w and h both are given, image will be resized to exactly that size. * if w and h both are zero or negative, image will not be resized. * if only h is zero or negative, image's width will be scaled to w, keeping aspect ratio. * if only w is zero or negative, image's height will be scaled to h * pixel is the pixel format of your model, image pixels will be converted to this type before ```Extractor::input()``` * thread is the CPU thread count that could be used for parallel inference * method is the post training quantization algorithm, kl and aciq are currently supported If your model has multiple input nodes, you can use multiple list files and other parameters ```shell ./ncnn2table mobilenet-opt.param mobilenet-opt.bin imagelist-bgr.txt,imagelist-depth.txt mobilenet.table mean=[104,117,123],[128] norm=[0.017,0.017,0.017],[0.0078125] shape=[224,224,3],[224,224,1] pixel=BGR,GRAY thread=8 method=kl ``` #### 2.2 From npy We suggest using the validation (development) set for calibration.
Use the same preprocessing as the training set to get the input vectors, in the case of batchsize=1, store each input vector as an npy file, n inputs correspond to n npy files, the actual stored vectors to remove the batch dimension. test net, shape is in NCHW format, but there's no `N`. ```txt in0, shape=[512] in1, shape=[2, 1, 64] in2, shape=[2, 1, 64] ``` filelist_in0.txt ```txt 0_in0.npy 1_in0.npy 2_in0.npy ... ``` filelist_in1.txt ```txt 0_in1.npy 1_in1.npy 2_in1.npy ... ``` filelist_in2.txt ```txt 0_in2.npy 1_in2.npy 2_in2.npy ... ``` ```shell ./ncnn2table test.param test.bin filelist_in0.txt,filelist_in1.txt,filelist_in2.txt test.table shape=[512],[64,1,2],[64,1,2] thread=8 method=kl type=1 ``` **Here shape is WHC, because the order of the arguments to `ncnn::Mat`.** ### 3. Quantize model ```shell ./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table ``` If you don’t need static quantization, ncnn supports RNN/LSTM/GRU dynamic quantization. In this case, you can omit the table file. 
```shell ./ncnn2int8 rnn-model.param rnn-model.bin rnn-model-int8.param rnn-model-int8.bin ``` ## use ncnn int8 inference the ncnn library would use int8 inference automatically, nothing changed in your code ```cpp ncnn::Net mobilenet; mobilenet.load_param("mobilenet-int8.param"); mobilenet.load_model("mobilenet-int8.bin"); ``` ## mixed precision inference Before quantize your model, comment the layer weight scale line in table file, then the layer will do the float32 inference ``` conv1_param_0 156.639840536 ``` ``` #conv1_param_0 156.639840536 ``` ================================================ FILE: docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md ================================================ We use alexnet as an example ### prepare caffe prototxt and model These files will usually generated when trained with caffe ``` train.prototxt deploy.prototxt snapshot_10000.caffemodel ``` deploy.prototxt and caffemodel file are enough for TEST phase alexnet deploy.prototxt can be downloaded here https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet alexnet caffemodel can be downloaded here http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel ### convert to ncnn model Convert old caffe prototxt and caffemodel to new ones using tools in caffe because the ncnn convert tool needs the new format ``` upgrade_net_proto_text [old prototxt] [new prototxt] upgrade_net_proto_binary [old caffemodel] [new caffemodel] ``` Use Input layer as input, set N dim as 1 since only one image can be processed each time ``` layer { name: "data" type: "Input" top: "data" input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } } } ``` Use caffe2ncnn tool to convert caffe model to ncnn model ``` caffe2ncnn deploy.prototxt bvlc_alexnet.caffemodel alexnet.param alexnet.bin ``` ### strip visible string It is already enough for deploying with param and bin file only, but there are visible strings in param file, it may not be suitable to distribute plain neural network information in 
your APP. You can use ncnn2mem tool to convert plain model file to binary representation. It will generate alexnet.param.bin and two static array code files. ``` ncnn2mem alexnet.param alexnet.bin alexnet.id.h alexnet.mem.h ``` ### load model Load param and bin file, the easy way ```cpp ncnn::Net net; net.load_param("alexnet.param"); net.load_model("alexnet.bin"); ``` Load binary param.bin and bin file, no visible strings included, suitable for bundled as APP resource ```cpp ncnn::Net net; net.load_param_bin("alexnet.param.bin"); net.load_model("alexnet.bin"); ``` Load network and model from external memory, no visible strings included, no external resource files bundled, the whole model is hardcoded in your program You may use this way to load from android asset resource ```cpp #include "alexnet.mem.h" ncnn::Net net; net.load_param(alexnet_param_bin); net.load_model(alexnet_bin); ``` You can choose either way to load model. Loading from external memory is zero-copy, which means you must keep your memory buffer during processing ### unload model ```cpp net.clear(); ``` ### input and output ncnn Mat is the data structure for input and output data Input image should be converted to Mat, and subtracted mean values and normalized when needed ```cpp #include "mat.h" unsigned char* rgbdata;// data pointer to RGB image pixels int w;// image width int h;// image height ncnn::Mat in = ncnn::Mat::from_pixels(rgbdata, ncnn::Mat::PIXEL_RGB, w, h); const float mean_vals[3] = {104.f, 117.f, 123.f}; in.substract_mean_normalize(mean_vals, 0); ``` Execute the network inference and retrieve the result ```cpp #include "net.h" ncnn::Mat in;// input blob as above ncnn::Mat out; ncnn::Extractor ex = net.create_extractor(); ex.input("data", in); ex.extract("prob", out); ``` If you load model with binary param.bin file, you should use the enum value in alexnet.id.h file instead of the blob name ```cpp #include "net.h" #include "alexnet.id.h" ncnn::Mat in;// input blob as above ncnn::Mat 
out; ncnn::Extractor ex = net.create_extractor(); ex.input(alexnet_param_id::BLOB_data, in); ex.extract(alexnet_param_id::BLOB_prob, out); ``` Read the data in the output Mat. Iterate data to get all classification scores. ```cpp ncnn::Mat out_flatterned = out.reshape(out.w * out.h * out.c); std::vector scores; scores.resize(out_flatterned.w); for (int j=0; j $ cat alexnet.param.bin alexnet.bin > alexnet-all.bin ```cpp #include "net.h" FILE* fp = fopen("alexnet-all.bin", "rb"); net.load_param_bin(fp); net.load_model(fp); fclose(fp); ``` ================================================ FILE: docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.zh.md ================================================ 首先,非常感谢大家对 ncnn 组件的关注 为了方便大家使用 ncnn 组件,up主特意写了这篇使用指北,以烂大街的 alexnet 作为例子 ### 准备caffe网络和模型 caffe 的网络和模型通常是搞深度学习的研究者训练出来的,一般来说训练完会有 ``` train.prototxt deploy.prototxt snapshot_10000.caffemodel ``` 部署的时候只需要 TEST 过程,所以有 deploy.prototxt 和 caffemodel 就足够了 alexnet 的 deploy.prototxt 可以在这里下载 https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet alexnet 的 caffemodel 可以在这里下载 http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel ### 转换ncnn网络和模型 caffe 自带了工具可以把老版本的 caffe 网络和模型转换为新版(ncnn的工具只认识新版 ``` upgrade_net_proto_text [老prototxt] [新prototxt] upgrade_net_proto_binary [老caffemodel] [新caffemodel] ``` 输入层改用 Input,因为每次只需要做一个图片,所以第一个 dim 设为 1 ``` layer { name: "data" type: "Input" top: "data" input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } } } ``` 使用 caffe2ncnn 工具转换为 ncnn 的网络描述和模型 ``` caffe2ncnn deploy.prototxt bvlc_alexnet.caffemodel alexnet.param alexnet.bin ``` ### 去除可见字符串 有 param 和 bin 文件其实已经可以用了,但是 param 描述文件是明文的,如果放在 APP 分发出去容易被窥探到网络结构(说得好像不明文就看不到一样 使用 ncnn2mem 工具转换为二进制描述文件和内存模型,生成 alexnet.param.bin 和两个静态数组的代码文件 ``` ncnn2mem alexnet.param alexnet.bin alexnet.id.h alexnet.mem.h ``` ### 加载模型 直接加载 param 和 bin,适合快速验证效果使用 ```cpp ncnn::Net net; net.load_param("alexnet.param"); net.load_model("alexnet.bin"); ``` 加载二进制的 param.bin 和 bin,没有可见字符串,适合 APP 分发模型资源 ```cpp ncnn::Net 
net; net.load_param_bin("alexnet.param.bin"); net.load_model("alexnet.bin"); ``` 从内存引用加载网络和模型,没有可见字符串,模型数据全在代码里头,没有任何外部文件 另外,android apk 打包的资源文件读出来也是内存块 ```cpp #include "alexnet.mem.h" ncnn::Net net; net.load_param(alexnet_param_bin); net.load_model(alexnet_bin); ``` 以上三种都可以加载模型,其中内存引用方式加载是 zero-copy 的,所以使用 net 模型的来源内存块必须存在 ### 卸载模型 ```cpp net.clear(); ``` ### 输入和输出 ncnn 用自己的数据结构 Mat 来存放输入和输出数据 输入图像的数据要转换为 Mat,依需要减去均值和乘系数 ```cpp #include "mat.h" unsigned char* rgbdata;// data pointer to RGB image pixels int w;// image width int h;// image height ncnn::Mat in = ncnn::Mat::from_pixels(rgbdata, ncnn::Mat::PIXEL_RGB, w, h); const float mean_vals[3] = {104.f, 117.f, 123.f}; in.substract_mean_normalize(mean_vals, 0); ``` 执行前向网络,获得计算结果 ```cpp #include "net.h" ncnn::Mat in;// input blob as above ncnn::Mat out; ncnn::Extractor ex = net.create_extractor(); ex.input("data", in); ex.extract("prob", out); ``` 如果是二进制的 param.bin 方式,没有可见字符串,利用 alexnet.id.h 的枚举来代替 blob 的名字 ```cpp #include "net.h" #include "alexnet.id.h" ncnn::Mat in;// input blob as above ncnn::Mat out; ncnn::Extractor ex = net.create_extractor(); ex.input(alexnet_param_id::BLOB_data, in); ex.extract(alexnet_param_id::BLOB_prob, out); ``` 获取 Mat 中的输出数据,Mat 内部的数据通常是三维的,c / h / w,遍历所有获得全部分类的分数 ```cpp ncnn::Mat out_flatterned = out.reshape(out.w * out.h * out.c); std::vector scores; scores.resize(out_flatterned.w); for (int j=0; j $ cat alexnet.param.bin alexnet.bin > alexnet-all.bin ```cpp #include "net.h" FILE* fp = fopen("alexnet-all.bin", "rb"); net.load_param_bin(fp); net.load_model(fp); fclose(fp); ``` ================================================ FILE: docs/how-to-use-and-FAQ/use-ncnn-with-opencv.md ================================================ ### opencv to ncnn * cv::Mat CV_8UC3 -> ncnn::Mat 3 channel + swap RGB/BGR ```cpp // cv::Mat a(h, w, CV_8UC3); ncnn::Mat in = ncnn::Mat::from_pixels(a.data, ncnn::Mat::PIXEL_BGR2RGB, a.cols, a.rows); ``` * cv::Mat CV_8UC3 -> ncnn::Mat 3 channel + keep RGB/BGR order 
```cpp // cv::Mat a(h, w, CV_8UC3); ncnn::Mat in = ncnn::Mat::from_pixels(a.data, ncnn::Mat::PIXEL_RGB, a.cols, a.rows); ``` * cv::Mat CV_8UC3 -> ncnn::Mat 1 channel + do RGB2GRAY/BGR2GRAY ```cpp // cv::Mat rgb(h, w, CV_8UC3); ncnn::Mat inrgb = ncnn::Mat::from_pixels(rgb.data, ncnn::Mat::PIXEL_RGB2GRAY, rgb.cols, rgb.rows); // cv::Mat bgr(h, w, CV_8UC3); ncnn::Mat inbgr = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2GRAY, bgr.cols, bgr.rows); ``` * cv::Mat CV_8UC1 -> ncnn::Mat 1 channel ```cpp // cv::Mat a(h, w, CV_8UC1); ncnn::Mat in = ncnn::Mat::from_pixels(a.data, ncnn::Mat::PIXEL_GRAY, a.cols, a.rows); ``` * cv::Mat CV_32FC1 -> ncnn::Mat 1 channel * **You could construct ncnn::Mat and fill data into it directly to avoid data copy** ```cpp // cv::Mat a(h, w, CV_32FC1); ncnn::Mat in(a.cols, a.rows, 1, (void*)a.data); in = in.clone(); ``` * cv::Mat CV_32FC3 -> ncnn::Mat 3 channel * **You could construct ncnn::Mat and fill data into it directly to avoid data copy** ```cpp // cv::Mat a(h, w, CV_32FC3); ncnn::Mat in_pack3(a.cols, a.rows, 1, (void*)a.data, (size_t)4u * 3, 3); ncnn::Mat in; ncnn::convert_packing(in_pack3, in, 1); ``` * std::vector < cv::Mat > + CV_32FC1 -> ncnn::Mat multiple channels * **You could construct ncnn::Mat and fill data into it directly to avoid data copy** ```cpp // std::vector a(channels, cv::Mat(h, w, CV_32FC1)); int channels = a.size(); ncnn::Mat in(a[0].cols, a[0].rows, channels); for (int p=0; p cv::Mat CV_8UC3 + swap RGB/BGR * **You may need to call in.substract_mean_normalize() first to scale values from 0..1 to 0..255** ```cpp // ncnn::Mat in(w, h, 3); cv::Mat a(in.h, in.w, CV_8UC3); in.to_pixels(a.data, ncnn::Mat::PIXEL_BGR2RGB); ``` * ncnn::Mat 3 channel -> cv::Mat CV_8UC3 + keep RGB/BGR order * **You may need to call in.substract_mean_normalize() first to scale values from 0..1 to 0..255** ```cpp // ncnn::Mat in(w, h, 3); cv::Mat a(in.h, in.w, CV_8UC3); in.to_pixels(a.data, ncnn::Mat::PIXEL_RGB); ``` * ncnn::Mat 1 
channel -> cv::Mat CV_8UC1 * **You may need to call in.substract_mean_normalize() first to scale values from 0..1 to 0..255** ```cpp // ncnn::Mat in(w, h, 1); cv::Mat a(in.h, in.w, CV_8UC1); in.to_pixels(a.data, ncnn::Mat::PIXEL_GRAY); ``` * ncnn::Mat 1 channel -> cv::Mat CV_32FC1 * **You could consume or manipulate ncnn::Mat data directly to avoid data copy** ```cpp // ncnn::Mat in; cv::Mat a(in.h, in.w, CV_32FC1); memcpy((uchar*)a.data, in.data, in.w * in.h * sizeof(float)); ``` * ncnn::Mat 3 channel -> cv::Mat CV_32FC3 * **You could consume or manipulate ncnn::Mat data directly to avoid data copy** ```cpp // ncnn::Mat in(w, h, 3); ncnn::Mat in_pack3; ncnn::convert_packing(in, in_pack3, 3); cv::Mat a(in.h, in.w, CV_32FC3); memcpy((uchar*)a.data, in_pack3.data, in.w * in.h * 3 * sizeof(float)); ``` * ncnn::Mat multiple channels -> std::vector < cv::Mat > + CV_32FC1 * **You could consume or manipulate ncnn::Mat data directly to avoid data copy** ```cpp // ncnn::Mat in(w, h, channels); std::vector a(in.c); for (int p=0; p/lib/cmake/ncnn" CACHE PATH "Directory that contains ncnnConfig.cmake") find_package(ncnn REQUIRED) target_link_libraries(my_target ncnn) ``` After this, both the header file search path ("including directories") and library paths are configured automatically, including vulkan related dependencies. Note: you have to change `` to your machine's directory, it is the directory that contains `ncnnConfig.cmake`. For the prebuilt ncnn release packages, ncnnConfig is located in: - for `ncnn-YYYYMMDD-windows-vs2019`, it is `lib/cmake/ncnn` - for `ncnn-YYYYMMDD-android-vulkan`, it is `${ANDROID_ABI}/lib/cmake/ncnn` (`${ANDROID_ABI}` is defined in NDK's cmake toolchain file) - other prebuilt release packages are with similar condition **manually specify** You may also manually specify ncnn library path and including directory. Note that if you use ncnn with vulkan, it is also required to specify vulkan related dependencies. 
For example, on Visual Studio debug mode with vulkan required, the lib paths are: ``` E:\github\ncnn\build\vs2019-x64\install\lib\ncnnd.lib E:\github\ncnn\build\vs2019-x64\install\lib\glslangd.lib ``` And for its release mode, lib paths are: ``` E:\github\ncnn\build\vs2019-x64\install\lib\ncnn.lib E:\github\ncnn\build\vs2019-x64\install\lib\glslang.lib ``` ================================================ FILE: docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md ================================================ # A Guide to Converting pytorch / onnx Models to ncnn This guide is designed to help pytorch and onnx users use the new-generation model conversion tool, **pnnx**, to efficiently and reliably convert models to the ncnn format for high-performance inference on the edge. This document is written and revised based on the **official pnnx documentation**. * pnnx project: https://github.com/pnnx/pnnx * ncnn project: https://github.com/Tencent/ncnn * supported pytorch operators: https://github.com/Tencent/ncnn/tree/master/tools/pnnx#supported-pytorch-operator-status * supported onnx operators: https://github.com/Tencent/ncnn/tree/master/tools/pnnx#supported-onnx-operator-status --- ## Why is pnnx Highly Recommended? Regardless of which framework you come from, pnnx offers significant advantages over traditional tools (like `onnx2ncnn`): * **Forget the Hassles of onnx**: The traditional `pytorch -> onnx -> ncnn` pipeline often fails due to onnx operator compatibility issues and dynamic shape problems. pnnx can convert directly from pytorch, completely bypassing the unstable intermediate step of onnx. * **Core Framework Support**: pnnx focuses on supporting **pytorch** and **onnx**, providing you with a unified and consistent conversion experience. * **More Stable and Powerful**: pnnx can handle a wider range of modern operators and complex model architectures, generating cleaner and more accurate ncnn graphs. 
* **Active and Continuous Development**: pnnx is under active development, constantly adding support for the latest operators and features from both source frameworks and the ncnn engine. * **Richer Graph Information**: pnnx preserves the original model's structural information during the conversion process, which is highly beneficial for model analysis and subsequent optimization. --- ## Workflow 1: Guide for pytorch Users (Recommended) For pytorch users, converting directly from a pytorch model is the most stable and efficient path. ### Method A: Direct Conversion in Python with `pnnx.export` (Most Recommended) This is the simplest and most recommended workflow, allowing you to complete the model conversion with a single command without leaving your Python environment. #### 1. Install pnnx First, install the pnnx Python package. This command installs both the `pnnx` Python library and the `pnnx` command-line tool. ```bash pip3 install pnnx ``` #### 2. Call `pnnx.export` in Your Python Script Calling the `pnnx.export` function will generate both a TorchScript (`.pt`) file and the `.param` and `.bin` files required by ncnn. **Complete Code Example:** ```python import torch import torch.nn as nn import pnnx # 1. Define or load your pytorch model class MyModel(nn.Module): def __init__(self): super(MyModel, self).__init__() self.conv1 = nn.Conv2d(3, 16, 3, 1, 1) self.relu = nn.ReLU() self.fc = nn.Linear(16 * 224 * 224, 10) def forward(self, x): x = self.conv1(x) x = self.relu(x) x = x.view(x.size(0), -1) x = self.fc(x) return x # 2. Instantiate the model and set it to evaluation mode model = MyModel() model.eval() # 3. Create a dummy input tensor with the correct input shape input_tensor = torch.rand(1, 3, 224, 224) # 4. 
Call pnnx.export to export the model pnnx.export(model, "my_model.pt", (input_tensor,)) print("Conversion complete!") print("Please check for the generated my_model.pt, my_model.ncnn.param, and my_model.ncnn.bin files.") ``` ### Method B: Using the Command-Line Tool (Alternative) #### 1. Get the pnnx Command-Line Tool If you have already run `pip install pnnx`, the `pnnx` command is available, and you can proceed to the next step. For non-Python environments or users who prefer a standalone executable, you can manually download the latest binary from the [pnnx Releases page](https://github.com/pnnx/pnnx/releases). #### 2. Export to TorchScript (Skip if you already have a .pt file) ```python import torch # ... (model definition from above) model = MyModel() model.eval() input_tensor = torch.rand(1, 3, 224, 224) traced_script_module = torch.jit.trace(model, input_tensor) traced_script_module.save("my_model.pt") ``` #### 3. Run the pnnx Command for Conversion Run the following command in your terminal. ```bash # Syntax: pnnx pnnx my_model.pt ``` --- ## Workflow 2: Guide for onnx Users For users who already have an `.onnx` file, please use pnnx for conversion. ### 1. Get the pnnx Command-Line Tool * **Method 1 (Recommended):** If you have Python in your environment, install it directly via pip. ```bash pip3 install pnnx ``` The `pnnx` command will be automatically added to your system's path. * **Method 2 (Alternative):** For non-Python environments or to use a standalone program, you can download the latest executable from the [pnnx Releases page](https://github.com/pnnx/pnnx/releases). ### 2. Run the Command-Line Conversion Open a terminal, navigate to the directory containing your model file, and run the following command. **Basic Command Example:** ```bash # Syntax: pnnx pnnx my_model.onnx ``` After the command executes successfully, you will get the `my_model.ncnn.param` and `my_model.ncnn.bin` files, which can be directly loaded and used in your ncnn project. 
================================================ FILE: docs/how-to-use-and-FAQ/use-ncnnoptimize-to-optimize-model.md ================================================ the typical usage ``` ncnnoptimize mobilenet.param mobilenet.bin mobilenet-opt.param mobilenet-opt.bin 65536 ``` operator fusion * batchnorm - scale * convolution - batchnorm * convolutiondepthwise - batchnorm * deconvolution - batchnorm * deconvolutiondepthwise - batchnorm * innerproduct - batchnorm * convolution - relu * convolutiondepthwise - relu * deconvolution - relu * deconvolutiondepthwise - relu * innerproduct - relu eliminate noop operator * innerproduct - dropout * flatten after global pooling prefer better operator * replace convolution with innerproduct after global pooling ================================================ FILE: docs/how-to-use-and-FAQ/vulkan-notes.md ================================================ ## supported platform * Y = known work * ? = shall work, not confirmed * / = not applied | |windows|linux|android|mac|ios| |---|---|---|---|---|---| |intel|Y|Y|Y|Y|/| |amd|Y|Y|/|Y|/| |nvidia|Y|Y|?|/|/| |qcom|/|/|Y|/|/| |apple|/|/|/|Y|Y| |arm|/|Y|Y|/|/| ## enable vulkan compute support ``` $ cmake -DNCNN_VULKAN=ON .. ``` ## enable vulkan compute inference ```cpp ncnn::Net net; net.opt.use_vulkan_compute = 1; ``` ## proper allocator usage ```cpp ncnn::VkAllocator* blob_vkallocator = vkdev.acquire_blob_allocator(); ncnn::VkAllocator* staging_vkallocator = vkdev.acquire_staging_allocator(); net.opt.blob_vkallocator = blob_vkallocator; net.opt.workspace_vkallocator = blob_vkallocator; net.opt.staging_vkallocator = staging_vkallocator; // ....
// after inference vkdev.reclaim_blob_allocator(blob_vkallocator); vkdev.reclaim_staging_allocator(staging_vkallocator); ``` ## select gpu device ```cpp // get gpu count int gpu_count = ncnn::get_gpu_count(); // set specified vulkan device before loading param and model net.set_vulkan_device(0); // use device-0 net.set_vulkan_device(1); // use device-1 // or set opt.vulkan_device_index field before loading param and model net.opt.vulkan_device_index = 0; // use device-0 net.opt.vulkan_device_index = 1; // use device-1 ``` ## zero-copy on unified memory device ```cpp ncnn::VkMat blob_gpu; ncnn::Mat mapped = blob_gpu.mapped(); // use mapped.data directly ``` ## hybrid cpu/gpu inference ```cpp ncnn::Net net_cpu; ncnn::Net net_gpu; net_cpu.opt.use_vulkan_compute = false; net_gpu.opt.use_vulkan_compute = true; net_cpu.load_param(); net_cpu.load_model(); net_gpu.load_param(); net_gpu.load_model(); ncnn::Extractor ex_cpu = net_cpu.create_extractor(); ncnn::Extractor ex_gpu = net_gpu.create_extractor(); #pragma omp parallel sections { #pragma omp section { ex_cpu.input(); ex_cpu.extract(); } #pragma omp section { ex_gpu.input(); ex_gpu.extract(); } } ``` ## zero-copy gpu inference chaining ```cpp ncnn::Extractor ex1 = net1.create_extractor(); ncnn::Extractor ex2 = net2.create_extractor(); ncnn::VkCompute cmd(&vkdev); ncnn::VkMat conv1; ncnn::VkMat conv2; ncnn::VkMat conv3; ex1.input("conv1", conv1); ex1.extract("conv2", conv2, cmd); ex2.input("conv2", conv2); ex2.extract("conv3", conv3, cmd); cmd.submit_and_wait(); ``` ## batch inference ```cpp int max_batch_size = vkdev->info.compute_queue_count(); ncnn::Mat inputs[1000]; ncnn::Mat outputs[1000]; #pragma omp parallel for num_threads(max_batch_size) for (int i=0; i<1000; i++) { ncnn::Extractor ex = net1.create_extractor(); ex.input("data", inputs[i]); ex.extract("prob", outputs[i]); } ``` ## control storage and arithmetic precision disable all lower-precision optimizations, get full fp32 precision ```cpp ncnn::Net net; 
net.opt.use_fp16_packed = false; net.opt.use_fp16_storage = false; net.opt.use_fp16_arithmetic = false; net.opt.use_int8_storage = false; net.opt.use_int8_arithmetic = false; ``` ## debugging tips ```cpp #define ENABLE_VALIDATION_LAYER 1 // modify to 1 in gpu.cpp ``` ## add vulkan compute support to layer 1. add vulkan shader in src/layer/shader/ 2. upload model weight data in Layer::upload_model() 3. setup pipeline in Layer::create_pipeline() 4. destroy pipeline in Layer::destroy_pipeline() 5. record command in Layer::forward() ## add optimized shader path 1. add vulkan shader in src/layer/shader/ named XXX_abc.comp 2. create pipeline with "XXX_abc" 3. record command using XXX_abc pipeline ## low-level op api 1. create layer 2. load param and load model 3. upload model 4. create pipeline 5. new command 6. record 7. submit and wait ================================================ FILE: examples/CMakeLists.txt ================================================ macro(ncnn_add_example name) add_executable(${name} ${name}.cpp) if(OpenCV_FOUND) target_include_directories(${name} PRIVATE ${OpenCV_INCLUDE_DIRS}) target_link_libraries(${name} PRIVATE ncnn ${OpenCV_LIBS}) elseif(NCNN_SIMPLEOCV) target_compile_definitions(${name} PUBLIC USE_NCNN_SIMPLEOCV) target_link_libraries(${name} PRIVATE ncnn) endif() # add test to a virtual project group set_property(TARGET ${name} PROPERTY FOLDER "examples") endmacro() if(NCNN_PIXEL) if(NOT NCNN_SIMPLEOCV) find_package(OpenCV QUIET COMPONENTS opencv_world) # for opencv 2.4 on ubuntu 16.04, there is no opencv_world but OpenCV_FOUND will be TRUE if("${OpenCV_LIBS}" STREQUAL "") set(OpenCV_FOUND FALSE) endif() if(NOT OpenCV_FOUND) find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs videoio) endif() if(NOT OpenCV_FOUND) find_package(OpenCV QUIET COMPONENTS core highgui imgproc) endif() endif() if(OpenCV_FOUND OR NCNN_SIMPLEOCV) if(OpenCV_FOUND) message(STATUS "OpenCV library: ${OpenCV_INSTALL_PATH}") message(STATUS " 
version: ${OpenCV_VERSION}") message(STATUS " libraries: ${OpenCV_LIBS}") message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}") if(${OpenCV_VERSION_MAJOR} GREATER 3) set(CMAKE_CXX_STANDARD 11) endif() endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) include_directories(${CMAKE_CURRENT_BINARY_DIR}/../src) ncnn_add_example(arcface) ncnn_add_example(squeezenet) ncnn_add_example(squeezenet_c_api) ncnn_add_example(fasterrcnn) ncnn_add_example(rfcn) ncnn_add_example(yolov2) ncnn_add_example(yolov3) ncnn_add_example(yolov5) ncnn_add_example(yolov5_pnnx) ncnn_add_example(yolov7_pnnx) ncnn_add_example(yolov7) ncnn_add_example(yolov8) ncnn_add_example(yolov8_seg) ncnn_add_example(yolov8_pose) ncnn_add_example(yolov8_cls) ncnn_add_example(yolox) ncnn_add_example(yolo11) ncnn_add_example(yolo11_seg) ncnn_add_example(yolo11_pose) ncnn_add_example(yolo11_cls) ncnn_add_example(yoloworld) ncnn_add_example(mobilenetv2ssdlite) ncnn_add_example(mobilenetssd) ncnn_add_example(squeezenetssd) ncnn_add_example(shufflenetv2) ncnn_add_example(peleenetssd_seg) ncnn_add_example(simplepose) ncnn_add_example(retinaface) ncnn_add_example(yolact) ncnn_add_example(nanodet) ncnn_add_example(nanodetplus_pnnx) ncnn_add_example(scrfd) ncnn_add_example(scrfd_crowdhuman) ncnn_add_example(piper) ncnn_add_example(whisper) if(OpenCV_FOUND) ncnn_add_example(yolov4) ncnn_add_example(yolov8_obb) ncnn_add_example(yolo11_obb) ncnn_add_example(rvm) ncnn_add_example(p2pnet) ncnn_add_example(ppocrv5) endif() else() message(WARNING "OpenCV not found and NCNN_SIMPLEOCV disabled, examples won't be built") endif() else() message(WARNING "NCNN_PIXEL not enabled, examples won't be built") endif() ================================================ FILE: examples/arcface.cpp ================================================ // Copyright 2025 heabeounMKTO // SPDX-License-Identifier: BSD-3-Clause /* ncnn example using yolo-face and arcface to extract embeddings from a face * * * the arcface model is converted 
from * https://github.com/onnx/models/tree/main/validated/vision/body_analysis/arcface * 1. first simplify the arcface.onnx using onnxsim * 2. then convert it using ncnn's onnx exporter onnx2ncnn * using pnnx to convert would cause -nan output! * * the yolov8-face model is converted from * https://github.com/derronqi/yolov8-face * * * you can find the models preconverted at * https://drive.google.com/drive/folders/1P0RDzj9V7FHEL8w_-yqls5RHeVpO-2PS?usp=sharing * * */ #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include #include "layer.h" #include "net.h" #include "mat.h" #ifndef ARCFACE_EXAMPLE_YOLO_INFER_SIZE #define ARCFACE_EXAMPLE_YOLO_INFER_SIZE 320 #endif /* Axis-aligned detection box: corners (x1, y1) and (x2, y2), a confidence score and an integer class label. */ struct Bbox { float x1, y1, x2, y2, confidence; int label; Bbox() : x1(0.0f), y1(0.0f), x2(0.0f), y2(0.0f), confidence(0.0f), label(0) { } /* label_name is accepted but not stored by this constructor. */ Bbox(float x1, float y1, float x2, float y2, float confidence, int label = 0, std::string label_name = "") : x1(x1), y1(y1), x2(x2), y2(y2), confidence(confidence), label(label) { } /* Maps the box back to original_image coordinates: subtracts the padding, divides by scale_factor, then clamps x to [0, cols] and y to [0, rows]. Updates this box in place and also returns a copy of it. */ Bbox apply_image_scale(const cv::Mat& original_image, const float scale_factor, const int pad_w, const int pad_h) { int img_w = original_image.cols; int img_h = original_image.rows; x1 = (x1 - pad_w) / scale_factor; y1 = (y1 - pad_h) / scale_factor; x2 = (x2 - pad_w) / scale_factor; y2 = (y2 - pad_h) / scale_factor; // clamp x1 = std::max(0.0f, std::min(x1, (float)img_w)); y1 = std::max(0.0f, std::min(y1, (float)img_h)); x2 = std::max(0.0f, std::min(x2, (float)img_w)); y2 = std::max(0.0f, std::min(y2, (float)img_h)); return Bbox(x1, y1, x2, y2, confidence, label); } /* Returns classes[label]; the index is not range-checked. */ std::string get_label_name(const std::vector& classes) { return classes[this->label]; } /* Box area, (x2 - x1) * (y2 - y1). */ float area() const { float width = x2 - x1; float height = y2 - y1; return width * height; } /* Returns a deep copy (clone) of the image region covered by the box, or an empty cv::Mat when the box has no positive width or height. */ cv::Mat crop_bbox(const cv::Mat& originalImage) const { // Calculate width and height int bbox_width = static_cast(x2 - x1); int bbox_height = 
static_cast(y2 - y1); // Ensure valid dimensions if (bbox_width <= 0 || bbox_height <= 0) { fprintf(stderr, "Invalid bounding box dimensions\n"); return cv::Mat(); } // Ensure coordinates are within image bounds int x1_int = static_cast(x1); int y1_int = static_cast(y1); int x2_int = static_cast(x2); int y2_int = static_cast(y2); // Clamp to image bounds x1_int = std::max(0, x1_int); y1_int = std::max(0, y1_int); x2_int = std::min(originalImage.cols, x2_int); y2_int = std::min(originalImage.rows, y2_int); /* The size check above runs before clamping. A box that lies outside the image ends up with a zero or negative extent here, which would make an invalid ROI, so reject it the same way as a degenerate box. */ if (x2_int <= x1_int || y2_int <= y1_int) { fprintf(stderr, "Invalid bounding box dimensions\n"); return cv::Mat(); } // Create ROI and return cropped image cv::Rect roi(x1_int, y1_int, x2_int - x1_int, y2_int - y1_int); return originalImage(roi).clone(); } cv::Rect_ get_rect() const { int x1_int = static_cast(x1); int y1_int = static_cast(y1); int width = static_cast(x2 - x1); int height = static_cast(y2 - y1); // Ensure valid dimensions if (width <= 0 || height <= 0) { return cv::Rect(0, 0, 0, 0); // Return invalid rect } return cv::Rect(x1_int, y1_int, width, height); } }; static void print_bbox(Bbox& bbox) { printf("Bbox(x1=%.2f, y1=%.2f, x2=%.2f, y2=%.2f, conf=%.4f, label=%d)\n", bbox.x1, bbox.y1, bbox.x2, bbox.y2, bbox.confidence, bbox.label); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].confidence; while (i <= j) { while (faceobjects[i].confidence > p) i++; while (faceobjects[j].confidence < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } // #pragma omp parallel sections { // #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } // #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } float calculate_iou(const Bbox& box1, const Bbox& box2) { float x1 = std::max(box1.x1, box2.x1); float y1 = 
std::max(box1.y1, box2.y1); float x2 = std::min(box1.x2, box2.x2); float y2 = std::min(box1.y2, box2.y2); if (x2 <= x1 || y2 <= y1) { return 0.0f; // no intersect } float intersection_area = (x2 - x1) * (y2 - y1); float box1_area = (box1.x2 - box1.x1) * (box1.y2 - box1.y1); float box2_area = (box2.x2 - box2.x1) * (box2.y2 - box2.y1); float union_area = box1_area + box2_area - intersection_area; return intersection_area / union_area; } /* Greedy non-maximum suppression. Boxes are visited in the order given (the caller sorts them by descending confidence first); a box is kept unless its IoU with an already kept box exceeds iou_thresh. Boxes with different labels never suppress each other unless class_agnostic is true. Returns the indices of the kept boxes in bbox. */ static std::vector non_maximum_supression(const std::vector& bbox, float iou_thresh, bool class_agnostic = false) { std::vector picked; const int n = bbox.size(); if (n == 0) return picked; for (int i = 0; i < n; i++) { const Bbox& a = bbox[i]; bool keep = true; for (int j : picked) { const Bbox& b = bbox[j]; // Enhanced class comparison logic using labels if (!class_agnostic) { if (a.label != b.label) { continue; // Different classes, don't suppress } } float iou = calculate_iou(a, b); if (iou > iou_thresh) { keep = false; break; } } if (keep) { picked.push_back(i); } } return picked; } static std::vector scale_wh(float w0, float h0, float w1, float h1) { float r = std::min(w1 / w0, h1 / h0); std::vector _scale_factor(3); _scale_factor[0] = r; _scale_factor[1] = (float)std::round(w0 * r); _scale_factor[2] = (float)std::round(h0 * r); return _scale_factor; } struct ImagePreProcessResults { ncnn::Mat result; float img_scale, pad_w, pad_h; ImagePreProcessResults(ncnn::Mat result, float img_scale, float pad_w, float pad_h) : result(result), img_scale(img_scale), pad_w(pad_w), pad_h(pad_h) { } }; struct DetectionResult { std::vector bboxes; std::vector > keypoints; }; static ImagePreProcessResults preprocess_yolo_kpts(cv::Mat& input_image, int infer_size) noexcept { float mean_vals[] = {0.f, 0.f, 0.f}; float norm_vals[] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; int img_w = input_image.cols; int img_h = input_image.rows; float scale_factor, new_w, new_h; 
std::vector _scale_factor = scale_wh(img_w, img_h, (float)infer_size, (float)infer_size); scale_factor = _scale_factor[0]; new_w = _scale_factor[1]; new_h = _scale_factor[2]; ncnn::Mat in = ncnn::Mat::from_pixels_resize(input_image.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, new_w, new_h); // padding calculation int pad_w = (infer_size - new_w) / 2; int pad_h = (infer_size - new_h) / 2; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, pad_h, infer_size - new_h - pad_h, pad_w, infer_size - new_w - pad_w, ncnn::BORDER_CONSTANT, 114.f); in_pad.substract_mean_normalize(mean_vals, norm_vals); return ImagePreProcessResults(in_pad, scale_factor, pad_w, pad_h); } /* Parses boxes and 5 face keypoints from the detector output, maps them back to original_image coordinates and applies NMS. Each candidate row is read as [x, y, w, h, class_scores..., kp1_x, kp1_y, kp1_conf, kp2_x, kp2_y, kp2_conf, ...], where (x, y) is the box centre and each kp*_conf is a logit that is passed through a sigmoid. */ static DetectionResult parse_yolo_keypoints_results(ncnn::Mat& result, cv::Mat& original_image, ImagePreProcessResults& preproc_img, float confidence_threshold, float iou_threshold, std::vector class_names) { /* transpose the network output: output(j, i) = result.row(i)[j], so each row of output is one candidate */ cv::Mat output((int)result.w, (int)result.h, CV_32FC1); for (int i = 0; i < output.cols; i++) { for (int j = 0; j < output.rows; j++) { output.ptr(j)[i] = result.row(i)[j]; } } std::vector detections; std::vector > all_keypoints; int num_classes = class_names.size(); int kp_stride = 3; int num_keypoints = 5; for (int i = 0; i < output.rows; i++) { const float* row_ptr = output.ptr(i); const float* bboxes_ptr = row_ptr; const float* classes_ptr = row_ptr + 4; const float* max_s_ptr = std::max_element(classes_ptr, classes_ptr + num_classes); float score = *max_s_ptr; int class_id = max_s_ptr - classes_ptr; if (score >= confidence_threshold) { float x = bboxes_ptr[0]; float y = bboxes_ptr[1]; float w = bboxes_ptr[2]; float h = bboxes_ptr[3]; float x1 = x - w / 2.0f; float y1 = y - h / 2.0f; float x2 = x + w / 2.0f; float y2 = y + h / 2.0f; if (x2 > x1 && y2 > y1) { Bbox bbox = Bbox(x1, y1, x2, y2, score, class_id) 
.apply_image_scale(original_image, preproc_img.img_scale, preproc_img.pad_w, preproc_img.pad_h); // Parse exactly 5 keypoints for this face model std::vector face_keypoints; face_keypoints.reserve(15); const float* kp_ptr = row_ptr + 4 + num_classes; float scale = 1.0f / preproc_img.img_scale; for (int k = 0; k < num_keypoints; k++) { float kp_x = kp_ptr[k * kp_stride]; float kp_y = kp_ptr[k * kp_stride + 1]; float kp_conf_raw = kp_ptr[k * kp_stride + 2]; // Apply sigmoid to convert logit to probability float kp_conf = 1.0f / (1.0f + expf(-kp_conf_raw)); // Scale keypoints to original kp_x = (kp_x - preproc_img.pad_w) * scale; kp_y = (kp_y - preproc_img.pad_h) * scale; face_keypoints.push_back(kp_x); face_keypoints.push_back(kp_y); face_keypoints.push_back(kp_conf); } detections.push_back(bbox); all_keypoints.push_back(face_keypoints); } } } /* nms: rank the candidates by descending confidence through an index permutation instead of sorting detections in place. all_keypoints is filled in parse order, so an in-place sort of detections alone would pair each kept box with the keypoints of a different candidate. order[k] is the parse-order index of the k-th best box. */ std::vector<int> order(detections.size()); for (size_t i = 0; i < order.size(); i++) { order[i] = (int)i; } std::stable_sort(order.begin(), order.end(), [&detections](int a, int b) { return detections[a].confidence > detections[b].confidence; }); std::vector<Bbox> sorted_detections; sorted_detections.reserve(order.size()); for (size_t i = 0; i < order.size(); i++) { sorted_detections.push_back(detections[order[i]]); } std::vector<int> picked = non_maximum_supression(sorted_detections, iou_threshold, false); DetectionResult res; for (size_t i = 0; i < picked.size(); i++) { int idx = picked[i]; res.bboxes.push_back(sorted_detections[idx]); /* map the sorted position back to the row the keypoints came from */ res.keypoints.push_back(all_keypoints[order[idx]]); } return res; } static inline float get_similarity(std::vector f1, std::vector f2) { float sim = 0.0; for (size_t i = 0; i < f1.size(); i++) { sim += f1[i] * f2[i]; } return sim; } // these are converted from here // https://github.com/deepinsight/insightface/blob/master/python-package/insightface/utils/face_align.py static int estimate_norm(float* transform_matrix, const float* lmk, int image_size = 112) { float ARCFACE_DST[] { 38.2946f, 51.6963f, // left eye 73.5318f, 51.5014f, // right eye 56.0252f, 71.7366f, // nose 41.5493f, 92.3655f, // left mouth 70.7299f, 92.2041f // right mouth }; if (image_size % 112 != 0 && image_size % 128 != 0) { return -1; } float ratio, diff_x; if (image_size % 112 == 0) { ratio = static_cast(image_size) / 112.0f; diff_x = 0.0f; } else { ratio = static_cast(image_size) / 128.0f; diff_x = 8.0f * ratio; } float 
src_points[10]; /* lmk holds 3 floats per landmark; only the first two of each triple are used as the point coordinates */ for (int i = 0; i < 5; i++) { src_points[i * 2] = lmk[i * 3]; src_points[i * 2 + 1] = lmk[i * 3 + 1]; } float dst_points[10]; /* scale the reference landmark positions to image_size (shifted in x by diff_x) */ for (int i = 0; i < 5; i++) { dst_points[i * 2] = ARCFACE_DST[i * 2] * ratio + diff_x; dst_points[i * 2 + 1] = ARCFACE_DST[i * 2 + 1] * ratio; } ncnn::get_affine_transform(dst_points, src_points, 5, transform_matrix); return 0; } /* Warps input into an image_size x image_size CV_8UC3 face crop using the transform computed by estimate_norm from the landmarks in lmk. Returns 0 on success, or the non-zero status of estimate_norm (output is then left untouched). */ static int norm_crop(cv::Mat& output, const cv::Mat& input, const float* lmk, int image_size = 112) { float transform_matrix[6]; int status = estimate_norm(transform_matrix, lmk, image_size); if (status != 0) { return status; } output = cv::Mat(image_size, image_size, CV_8UC3); ncnn::warpaffine_bilinear_c3(input.data, input.cols, input.rows, output.data, image_size, image_size, transform_matrix); return 0; } /* Divides feature by its Euclidean norm in place; an empty or all-zero vector is left unchanged. */ void normalize_arcface(std::vector& feature) { if (feature.empty()) return; float sum = 0; for (auto it = feature.begin(); it != feature.end(); it++) sum += (float)*it * (float)*it; sum = sqrt(sum); if (sum == 0.0f) return; for (auto it = feature.begin(); it != feature.end(); it++) *it /= sum; } /* Loads yolov8-face.param / yolov8-face.bin, runs the detector on a copy of rgb and stores the boxes and keypoints in result. Returns 0 on success, the load status when loading fails, or -1 when no face is found. */ static int get_face(const cv::Mat& rgb, DetectionResult& result) { int status = 0; ncnn::Net yoloface; yoloface.opt.use_vulkan_compute = true; status = yoloface.load_param("yolov8-face.param"); if (status != 0) { fprintf(stderr, "couldn't load params"); return status; } status = yoloface.load_model("yolov8-face.bin"); if (status != 0) { fprintf(stderr, "couldn't load model"); return status; } /* preprocess_yolo_kpts takes a non-const reference, so work on a copy */ cv::Mat input_image = rgb.clone(); ImagePreProcessResults preproc_img = preprocess_yolo_kpts(input_image, ARCFACE_EXAMPLE_YOLO_INFER_SIZE); ncnn::Extractor ex = yoloface.create_extractor(); ex.input("in0", preproc_img.result); ncnn::Mat out; ex.extract("out0", out); std::vector class_names = {"face"}; /* 0.5 is the confidence threshold, 0.4 the IoU threshold */ result = parse_yolo_keypoints_results(out, input_image, preproc_img, 0.5, 0.4, class_names); if (result.bboxes.size() < 1) { fprintf(stderr, "no faces are found!"); return -1; } return 0; } static int 
get_embedding(const cv::Mat& rgb, std::vector& result) { ncnn::Net arcface; arcface.opt.use_vulkan_compute = true; int status = arcface.load_param("arcfaceresnet.param"); if (status != 0) { fprintf(stderr, "couldn't load arcface params"); return status; } status = arcface.load_model("arcfaceresnet.bin"); if (status != 0) { fprintf(stderr, "couldn't load arcface model"); return status; } if (rgb.empty() || rgb.type() != CV_8UC3) { fprintf(stderr, "invalid input image!"); return -1; } /* * the arcface model provided in the link has builtin normalization layers, * no need to run substract_mean_normalize * * reference from .param BinaryOp _minusscalar0 2 1 data scalar_op2 _minusscalar0 0=1 BinaryOp _mulscalar0 2 1 _minusscalar0 scalar_op3 _mulscalar0 0=2 * */ ncnn::Mat in = ncnn::Mat::from_pixels_resize( rgb.data, ncnn::Mat::PIXEL_BGR2RGB, rgb.cols, rgb.rows, 112, 112); ncnn::Extractor ex = arcface.create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("fc1", out); const float* ptr = (const float*)out.data; for (int i = 0; i < 512; i++) { result[i] = ptr[i]; } normalize_arcface(result); return 0; } int main(int argc, char** argv) { if (argc != 3) { fprintf(stderr, "Usage: %s \n", argv[0]); return -1; } const char* face1_path = argv[1]; const char* face2_path = argv[2]; int status = 0; cv::Mat face_img1 = cv::imread(face1_path); cv::Mat face_img2 = cv::imread(face2_path); if (face_img1.empty()) { fprintf(stderr, "Failed to load image: %s\n", face1_path); return -1; } if (face_img2.empty()) { fprintf(stderr, "Failed to load image: %s\n", face2_path); return -1; } cv::Mat input_embed1, input_embed2; DetectionResult res1, res2; std::vector embedding1(512), embedding2(512); status = get_face(face_img1, res1); if (status != 0) { fprintf(stderr, "get face failed for %s!\n", face1_path); return -1; } fprintf(stdout, "found faces in face1: %d\n", (int)res1.bboxes.size()); for (size_t i = 0; i < res1.bboxes.size(); i++) { print_bbox(res1.bboxes[i]); } status = 
get_face(face_img2, res2); if (status != 0) { fprintf(stderr, "get face failed for %s!\n", face2_path); return -1; } fprintf(stdout, "found faces in face2: %d\n", (int)res2.bboxes.size()); for (size_t i = 0; i < res2.bboxes.size(); i++) { print_bbox(res2.bboxes[i]); } status = norm_crop(input_embed1, face_img1, res1.keypoints[0].data()); status = get_embedding(input_embed1, embedding1); if (status != 0) { fprintf(stderr, "get embedding failed for %s!\n", face1_path); return -1; } status = norm_crop(input_embed2, face_img2, res2.keypoints[0].data()); if (status != 0) { fprintf(stderr, "norm_crop failed for face2!\n"); return -1; } status = get_embedding(input_embed2, embedding2); if (status != 0) { fprintf(stderr, "get embedding failed for face2!\n"); return -1; } if (status != 0) { fprintf(stderr, "get embedding failed for %s!\n", face2_path); return -1; } float similarity = get_similarity(embedding1, embedding2); fprintf(stdout, "Similarity: %f\n", similarity); } ================================================ FILE: examples/fasterrcnn.cpp ================================================ // Copyright 2018 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #include #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include struct Object { cv::Rect_ rect; int label; float prob; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } 
static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static int detect_fasterrcnn(const cv::Mat& bgr, std::vector& objects) { ncnn::Net fasterrcnn; fasterrcnn.opt.use_vulkan_compute = true; // original pretrained model from https://github.com/rbgirshick/py-faster-rcnn // py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt/faster_rcnn_test.pt // https://dl.dropboxusercontent.com/s/o6ii098bu51d139/faster_rcnn_models.tgz?dl=0 // ZF_faster_rcnn_final.caffemodel // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (fasterrcnn.load_param("ZF_faster_rcnn_final.param")) exit(-1); if (fasterrcnn.load_model("ZF_faster_rcnn_final.bin")) exit(-1); // hyper parameters taken from // py-faster-rcnn/lib/fast_rcnn/config.py // py-faster-rcnn/lib/fast_rcnn/test.py const int target_size = 600; // __C.TEST.SCALES const int max_per_image = 100; const float confidence_thresh = 0.05f; const float nms_threshold = 0.3f; // __C.TEST.NMS // scale to target detect size int w = bgr.cols; int h = bgr.rows; float scale = 1.f; if (w < h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h 
= target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, w, h); const float mean_vals[3] = {102.9801f, 115.9465f, 122.7717f}; in.substract_mean_normalize(mean_vals, 0); ncnn::Mat im_info(3); im_info[0] = h; im_info[1] = w; im_info[2] = scale; // step1, extract feature and all rois ncnn::Extractor ex1 = fasterrcnn.create_extractor(); ex1.input("data", in); ex1.input("im_info", im_info); ncnn::Mat conv5_relu5; // feature ncnn::Mat rois; // all rois ex1.extract("conv5_relu5", conv5_relu5); ex1.extract("rois", rois); // step2, extract bbox and score for each roi std::vector > class_candidates; for (int i = 0; i < rois.c; i++) { ncnn::Extractor ex2 = fasterrcnn.create_extractor(); ncnn::Mat roi = rois.channel(i); // get single roi ex2.input("conv5_relu5", conv5_relu5); ex2.input("rois", roi); ncnn::Mat bbox_pred; ncnn::Mat cls_prob; ex2.extract("bbox_pred", bbox_pred); ex2.extract("cls_prob", cls_prob); int num_class = cls_prob.w; class_candidates.resize(num_class); // find class id with highest score int label = 0; float score = 0.f; for (int i = 0; i < num_class; i++) { float class_score = cls_prob[i]; if (class_score > score) { label = i; score = class_score; } } // ignore background or low score if (label == 0 || score <= confidence_thresh) continue; // fprintf(stderr, "%d = %f\n", label, score); // unscale to image size float x1 = roi[0] / scale; float y1 = roi[1] / scale; float x2 = roi[2] / scale; float y2 = roi[3] / scale; float pb_w = x2 - x1 + 1; float pb_h = y2 - y1 + 1; // apply bbox regression float dx = bbox_pred[label * 4]; float dy = bbox_pred[label * 4 + 1]; float dw = bbox_pred[label * 4 + 2]; float dh = bbox_pred[label * 4 + 3]; float cx = x1 + pb_w * 0.5f; float cy = y1 + pb_h * 0.5f; float obj_cx = cx + pb_w * dx; float obj_cy = cy + pb_h * dy; float obj_w = pb_w * exp(dw); float obj_h = pb_h * exp(dh); float obj_x1 = obj_cx - obj_w * 0.5f; float obj_y1 = obj_cy - obj_h * 0.5f; 
float obj_x2 = obj_cx + obj_w * 0.5f; float obj_y2 = obj_cy + obj_h * 0.5f; // clip obj_x1 = std::max(std::min(obj_x1, (float)(bgr.cols - 1)), 0.f); obj_y1 = std::max(std::min(obj_y1, (float)(bgr.rows - 1)), 0.f); obj_x2 = std::max(std::min(obj_x2, (float)(bgr.cols - 1)), 0.f); obj_y2 = std::max(std::min(obj_y2, (float)(bgr.rows - 1)), 0.f); // append object Object obj; obj.rect = cv::Rect_(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1); obj.label = label; obj.prob = score; class_candidates[label].push_back(obj); } // post process objects.clear(); for (int i = 0; i < (int)class_candidates.size(); i++) { std::vector& candidates = class_candidates[i]; qsort_descent_inplace(candidates); std::vector picked; nms_sorted_bboxes(candidates, picked, nms_threshold); for (int j = 0; j < (int)picked.size(); j++) { int z = picked[j]; objects.push_back(candidates[z]); } } qsort_descent_inplace(objects); if (max_per_image > 0 && max_per_image < objects.size()) { objects.resize(max_per_image); } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; 
cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_fasterrcnn(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/mobilenetssd.cpp ================================================ // Copyright 2017 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static int detect_mobilenet(const cv::Mat& bgr, std::vector& objects) { ncnn::Net mobilenet; mobilenet.opt.use_vulkan_compute = true; // model is converted from https://github.com/chuanqi305/MobileNet-SSD // and can be downloaded from https://drive.google.com/open?id=0ByaKLD9QaPtucWk0Y0dha1VVY0U // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (mobilenet.load_param("mobilenet_ssd_voc_ncnn.param")) exit(-1); if (mobilenet.load_model("mobilenet_ssd_voc_ncnn.bin")) exit(-1); const int target_size = 300; int img_w = bgr.cols; int img_h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size); const float mean_vals[3] = {127.5f, 127.5f, 127.5f}; const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5}; in.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = mobilenet.create_extractor(); ex.input("data", in); 
ncnn::Mat out; ex.extract("detection_out", out); // printf("%d %d %d\n", out.w, out.h, out.c); objects.clear(); for (int i = 0; i < out.h; i++) { const float* values = out.row(i); Object object; object.label = values[0]; object.prob = values[1]; object.rect.x = values[2] * img_w; object.rect.y = values[3] * img_h; object.rect.width = values[4] * img_w - object.rect.x; object.rect.height = values[5] * img_h - object.rect.y; objects.push_back(object); } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", 
imagepath); return -1; } std::vector objects; detect_mobilenet(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/mobilenetv2ssdlite.cpp ================================================ // Copyright 2018 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include class Noop : public ncnn::Layer { }; DEFINE_LAYER_CREATOR(Noop) struct Object { cv::Rect_ rect; int label; float prob; }; static int detect_mobilenetv2(const cv::Mat& bgr, std::vector& objects) { ncnn::Net mobilenetv2; mobilenetv2.opt.use_vulkan_compute = true; mobilenetv2.register_custom_layer("Silence", Noop_layer_creator); // original pretrained model from https://github.com/chuanqi305/MobileNetv2-SSDLite // https://github.com/chuanqi305/MobileNetv2-SSDLite/blob/master/ssdlite/voc/deploy.prototxt // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (mobilenetv2.load_param("mobilenetv2_ssdlite_voc.param")) exit(-1); if (mobilenetv2.load_model("mobilenetv2_ssdlite_voc.bin")) exit(-1); const int target_size = 300; int img_w = bgr.cols; int img_h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size); const float mean_vals[3] = {127.5f, 127.5f, 127.5f}; const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5}; in.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = mobilenetv2.create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("detection_out", out); // printf("%d %d %d\n", out.w, out.h, out.c); objects.clear(); for (int i = 0; i < out.h; i++) { const float* values = out.row(i); Object object; object.label = values[0]; object.prob = values[1]; object.rect.x = values[2] * img_w; object.rect.y = values[3] * img_h; object.rect.width = values[4] * img_w - object.rect.x; object.rect.height 
= values[5] * img_h - object.rect.y; objects.push_back(object); } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_mobilenetv2(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/mobilenetv3ssdlite.cpp ================================================ // Copyright 2018 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #include "platform.h" #if defined(USE_NCNN_SIMPLEOCV) #include 
"simpleocv.h" #else #include #include #include #endif #include #include #if NCNN_VULKAN #include "gpu.h" #endif // NCNN_VULKAN template const T& clamp(const T& v, const T& lo, const T& hi) { assert(!(hi < lo)); return v < lo ? lo : hi < v ? hi : v; } struct Object { cv::Rect_ rect; int label; float prob; }; static int detect_mobilenetv3(const cv::Mat& bgr, std::vector& objects) { ncnn::Net mobilenetv3; #if NCNN_VULKAN mobilenetv3.opt.use_vulkan_compute = true; #endif // NCNN_VULKAN // converted ncnn model from https://github.com/ujsyehao/mobilenetv3-ssd if (mobilenetv3.load_param("./mobilenetv3_ssdlite_voc.param")) exit(-1); if (mobilenetv3.load_model("./mobilenetv3_ssdlite_voc.bin")) exit(-1); const int target_size = 300; int img_w = bgr.cols; int img_h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, bgr.cols, bgr.rows, target_size, target_size); const float mean_vals[3] = {123.675f, 116.28f, 103.53f}; const float norm_vals[3] = {1.0f, 1.0f, 1.0f}; in.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = mobilenetv3.create_extractor(); ex.input("input", in); ncnn::Mat out; ex.extract("detection_out", out); // printf("%d %d %d\n", out.w, out.h, out.c); objects.clear(); for (int i = 0; i < out.h; i++) { const float* values = out.row(i); Object object; object.label = values[0]; object.prob = values[1]; // filter out cross-boundary float x1 = clamp(values[2] * target_size, 0.f, float(target_size - 1)) / target_size * img_w; float y1 = clamp(values[3] * target_size, 0.f, float(target_size - 1)) / target_size * img_h; float x2 = clamp(values[4] * target_size, 0.f, float(target_size - 1)) / target_size * img_w; float y2 = clamp(values[5] * target_size, 0.f, float(target_size - 1)) / target_size * img_h; object.rect.x = x1; object.rect.y = y1; object.rect.width = x2 - x1; object.rect.height = y2 - y1; objects.push_back(object); } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) 
{ static const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { if (objects[i].prob > 0.6) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_mobilenetv3(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/nanodet.cpp ================================================ // Copyright 2020 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include #include struct Object { cv::Rect_ rect; int label; float prob; }; 
static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static void generate_proposals(const ncnn::Mat& cls_pred, const ncnn::Mat& dis_pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int num_grid = cls_pred.h; int num_grid_x; int num_grid_y; if (in_pad.w > in_pad.h) { num_grid_x = in_pad.w / stride; num_grid_y = num_grid / num_grid_x; } else { num_grid_y = in_pad.h / stride; num_grid_x = num_grid / num_grid_y; } const int 
num_class = cls_pred.w; const int reg_max_1 = dis_pred.w / 4; for (int i = 0; i < num_grid_y; i++) { for (int j = 0; j < num_grid_x; j++) { const int idx = i * num_grid_x + j; const float* scores = cls_pred.row(idx); // find label with max score int label = -1; float score = -FLT_MAX; for (int k = 0; k < num_class; k++) { if (scores[k] > score) { label = k; score = scores[k]; } } if (score >= prob_threshold) { ncnn::Mat bbox_pred(reg_max_1, 4, (void*)dis_pred.row(idx)); { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; pd.set(0, 1); // axis pd.set(1, 1); softmax->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; softmax->create_pipeline(opt); softmax->forward_inplace(bbox_pred, opt); softmax->destroy_pipeline(opt); delete softmax; } float pred_ltrb[4]; for (int k = 0; k < 4; k++) { float dis = 0.f; const float* dis_after_sm = bbox_pred.row(k); for (int l = 0; l < reg_max_1; l++) { dis += l * dis_after_sm[l]; } pred_ltrb[k] = dis * stride; } float pb_cx = (j + 0.5f) * stride; float pb_cy = (i + 0.5f) * stride; float x0 = pb_cx - pred_ltrb[0]; float y0 = pb_cy - pred_ltrb[1]; float x1 = pb_cx + pred_ltrb[2]; float y1 = pb_cy + pred_ltrb[3]; Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = label; obj.prob = score; objects.push_back(obj); } } } } static int detect_nanodet(const cv::Mat& bgr, std::vector& objects) { ncnn::Net nanodet; nanodet.opt.use_vulkan_compute = true; // nanodet.opt.use_bf16_storage = true; // original pretrained model from https://github.com/RangiLyu/nanodet // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (nanodet.load_param("nanodet_m.param")) exit(-1); if (nanodet.load_model("nanodet_m.bin")) exit(-1); int width = bgr.cols; int height = bgr.rows; const int target_size = 320; const float prob_threshold = 0.4f; const float nms_threshold = 0.5f; // pad to multiple of 32 int w = width; int h = 
height; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, width, height, w, h); // pad to target_size rectangle int wpad = (w + 31) / 32 * 32 - w; int hpad = (h + 31) / 32 * 32 - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f); const float mean_vals[3] = {103.53f, 116.28f, 123.675f}; const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f}; in_pad.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = nanodet.create_extractor(); ex.input("input.1", in_pad); std::vector proposals; // stride 8 { ncnn::Mat cls_pred; ncnn::Mat dis_pred; ex.extract("792", cls_pred); ex.extract("795", dis_pred); std::vector objects8; generate_proposals(cls_pred, dis_pred, 8, in_pad, prob_threshold, objects8); proposals.insert(proposals.end(), objects8.begin(), objects8.end()); } // stride 16 { ncnn::Mat cls_pred; ncnn::Mat dis_pred; ex.extract("814", cls_pred); ex.extract("817", dis_pred); std::vector objects16; generate_proposals(cls_pred, dis_pred, 16, in_pad, prob_threshold, objects16); proposals.insert(proposals.end(), objects16.begin(), objects16.end()); } // stride 32 { ncnn::Mat cls_pred; ncnn::Mat dis_pred; ex.extract("836", cls_pred); ex.extract("839", dis_pred); std::vector objects32; generate_proposals(cls_pred, dis_pred, 32, in_pad, prob_threshold, objects32); proposals.insert(proposals.end(), objects32.begin(), objects32.end()); } // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded 
float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(width - 1)), 0.f); y0 = std::max(std::min(y0, (float)(height - 1)), 0.f); x1 = std::max(std::min(x1, (float)(width - 1)), 0.f); y1 = std::max(std::min(y1, (float)(height - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size 
label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_nanodet(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/nanodetplus_pnnx.cpp ================================================ // Copyright 2020 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void 
qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return 1.0f / (1.0f + exp(-x)); } static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int num_grid = pred.h; int num_grid_x = pred.w; int num_grid_y = pred.h; const int num_class = 80; // number of classes. 
80 for COCO const int reg_max_1 = (pred.c - num_class) / 4; for (int i = 0; i < num_grid_y; i++) { for (int j = 0; j < num_grid_x; j++) { // find label with max score int label = -1; float score = -FLT_MAX; for (int k = 0; k < num_class; k++) { float s = pred.channel(k).row(i)[j]; if (s > score) { label = k; score = s; } } score = sigmoid(score); if (score >= prob_threshold) { ncnn::Mat bbox_pred(reg_max_1, 4); for (int k = 0; k < reg_max_1 * 4; k++) { bbox_pred[k] = pred.channel(num_class + k).row(i)[j]; } { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; pd.set(0, 1); // axis pd.set(1, 1); softmax->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; softmax->create_pipeline(opt); softmax->forward_inplace(bbox_pred, opt); softmax->destroy_pipeline(opt); delete softmax; } float pred_ltrb[4]; for (int k = 0; k < 4; k++) { float dis = 0.f; const float* dis_after_sm = bbox_pred.row(k); for (int l = 0; l < reg_max_1; l++) { dis += l * dis_after_sm[l]; } pred_ltrb[k] = dis * stride; } float pb_cx = j * stride; float pb_cy = i * stride; float x0 = pb_cx - pred_ltrb[0]; float y0 = pb_cy - pred_ltrb[1]; float x1 = pb_cx + pred_ltrb[2]; float y1 = pb_cy + pred_ltrb[3]; Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = label; obj.prob = score; objects.push_back(obj); } } } } static int detect_nanodet(const cv::Mat& bgr, std::vector& objects) { ncnn::Net nanodet; nanodet.opt.use_vulkan_compute = true; // nanodet.opt.use_bf16_storage = true; // original pretrained model from https://github.com/RangiLyu/nanodet // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models // nanodet.load_param("nanodet-plus-m_320.torchscript.ncnn.param"); // nanodet.load_model("nanodet-plus-m_320.torchscript.ncnn.bin"); if (nanodet.load_param("nanodet-plus-m_416.torchscript.ncnn.param")) exit(-1); if (nanodet.load_model("nanodet-plus-m_416.torchscript.ncnn.bin")) 
exit(-1); int width = bgr.cols; int height = bgr.rows; // const int target_size = 320; const int target_size = 416; const float prob_threshold = 0.4f; const float nms_threshold = 0.5f; // pad to multiple of 32 int w = width; int h = height; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, width, height, w, h); // pad to target_size rectangle int wpad = (w + 31) / 32 * 32 - w; int hpad = (h + 31) / 32 * 32 - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f); const float mean_vals[3] = {103.53f, 116.28f, 123.675f}; const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f}; in_pad.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = nanodet.create_extractor(); ex.input("in0", in_pad); std::vector proposals; // stride 8 { ncnn::Mat pred; ex.extract("231", pred); std::vector objects8; generate_proposals(pred, 8, in_pad, prob_threshold, objects8); proposals.insert(proposals.end(), objects8.begin(), objects8.end()); } // stride 16 { ncnn::Mat pred; ex.extract("228", pred); std::vector objects16; generate_proposals(pred, 16, in_pad, prob_threshold, objects16); proposals.insert(proposals.end(), objects16.begin(), objects16.end()); } // stride 32 { ncnn::Mat pred; ex.extract("225", pred); std::vector objects32; generate_proposals(pred, 32, in_pad, prob_threshold, objects32); proposals.insert(proposals.end(), objects32.begin(), objects32.end()); } // stride 64 { ncnn::Mat pred; ex.extract("222", pred); std::vector objects64; generate_proposals(pred, 64, in_pad, prob_threshold, objects64); proposals.insert(proposals.end(), objects64.begin(), objects64.end()); } // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with 
nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(width - 1)), 0.f); y0 = std::max(std::min(y0, (float)(height - 1)), 0.f); x1 = std::max(std::min(x1, (float)(width - 1)), 0.f); y1 = std::max(std::min(y1, (float)(height - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", 
obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_nanodet(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/p2pnet.cpp ================================================ // Copyright 2021 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include #include struct CrowdPoint { cv::Point pt; float prob; }; static void shift(int w, int h, int stride, std::vector anchor_points, std::vector& shifted_anchor_points) { std::vector x_, y_; for (int i = 0; i < w; i++) { float x = (i + 0.5) * stride; x_.push_back(x); } for (int i = 0; i < h; i++) { float y = (i + 0.5) * stride; y_.push_back(y); } std::vector shift_x((size_t)w * h, 0), shift_y((size_t)w * h, 0); for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { shift_x[i * w + j] = x_[j]; } } for (int i = 0; i 
< h; i++) { for (int j = 0; j < w; j++) { shift_y[i * w + j] = y_[i]; } } std::vector shifts((size_t)w * h * 2, 0); for (int i = 0; i < w * h; i++) { shifts[i * 2] = shift_x[i]; shifts[i * 2 + 1] = shift_y[i]; } shifted_anchor_points.resize((size_t)2 * w * h * anchor_points.size() / 2, 0); for (int i = 0; i < w * h; i++) { for (int j = 0; j < anchor_points.size() / 2; j++) { float x = anchor_points[j * 2] + shifts[i * 2]; float y = anchor_points[j * 2 + 1] + shifts[i * 2 + 1]; shifted_anchor_points[i * anchor_points.size() / 2 * 2 + j * 2] = x; shifted_anchor_points[i * anchor_points.size() / 2 * 2 + j * 2 + 1] = y; } } } static void generate_anchor_points(int stride, int row, int line, std::vector& anchor_points) { float row_step = (float)stride / row; float line_step = (float)stride / line; std::vector x_, y_; for (int i = 1; i < line + 1; i++) { float x = (i - 0.5) * line_step - stride / 2; x_.push_back(x); } for (int i = 1; i < row + 1; i++) { float y = (i - 0.5) * row_step - stride / 2; y_.push_back(y); } std::vector shift_x((size_t)row * line, 0), shift_y((size_t)row * line, 0); for (int i = 0; i < row; i++) { for (int j = 0; j < line; j++) { shift_x[i * line + j] = x_[j]; } } for (int i = 0; i < row; i++) { for (int j = 0; j < line; j++) { shift_y[i * line + j] = y_[i]; } } anchor_points.resize((size_t)row * line * 2, 0); for (int i = 0; i < row * line; i++) { float x = shift_x[i]; float y = shift_y[i]; anchor_points[i * 2] = x; anchor_points[i * 2 + 1] = y; } } static void generate_anchor_points(int img_w, int img_h, std::vector pyramid_levels, int row, int line, std::vector& all_anchor_points) { std::vector > image_shapes; std::vector strides; for (int i = 0; i < pyramid_levels.size(); i++) { int new_h = std::floor((img_h + std::pow(2, pyramid_levels[i]) - 1) / std::pow(2, pyramid_levels[i])); int new_w = std::floor((img_w + std::pow(2, pyramid_levels[i]) - 1) / std::pow(2, pyramid_levels[i])); image_shapes.push_back(std::make_pair(new_w, new_h)); 
strides.push_back(std::pow(2, pyramid_levels[i])); } all_anchor_points.clear(); for (int i = 0; i < pyramid_levels.size(); i++) { std::vector anchor_points; generate_anchor_points(std::pow(2, pyramid_levels[i]), row, line, anchor_points); std::vector shifted_anchor_points; shift(image_shapes[i].first, image_shapes[i].second, strides[i], anchor_points, shifted_anchor_points); all_anchor_points.insert(all_anchor_points.end(), shifted_anchor_points.begin(), shifted_anchor_points.end()); } } static int detect_crowd(const cv::Mat& bgr, std::vector& crowd_points) { ncnn::Option opt; opt.num_threads = 4; opt.use_vulkan_compute = false; opt.use_bf16_storage = false; ncnn::Net net; net.opt = opt; // model is converted from // https://github.com/TencentYoutuResearch/CrowdCounting-P2PNet // the ncnn model https://pan.baidu.com/s/1O1CBgvY6yJkrK8Npxx3VMg pwd: ezhx if (net.load_param("p2pnet.param")) exit(-1); if (net.load_model("p2pnet.bin")) exit(-1); int width = bgr.cols; int height = bgr.rows; int new_width = width / 128 * 128; int new_height = height / 128 * 128; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, new_width, new_height); std::vector pyramid_levels(1, 3); std::vector all_anchor_points; generate_anchor_points(in.w, in.h, pyramid_levels, 2, 2, all_anchor_points); ncnn::Mat anchor_points = ncnn::Mat(2, all_anchor_points.size() / 2, all_anchor_points.data()); ncnn::Extractor ex = net.create_extractor(); const float mean_vals1[3] = {123.675f, 116.28f, 103.53f}; const float norm_vals1[3] = {0.01712475f, 0.0175f, 0.01742919f}; in.substract_mean_normalize(mean_vals1, norm_vals1); ex.input("input", in); ex.input("anchor", anchor_points); ncnn::Mat score, points; ex.extract("pred_scores", score); ex.extract("pred_points", points); for (int i = 0; i < points.h; i++) { float* score_data = score.row(i); float* points_data = points.row(i); CrowdPoint cp; int x = points_data[0] / new_width * width; int y = points_data[1] / 
new_height * height; cp.pt = cv::Point(x, y); cp.prob = score_data[1]; crowd_points.push_back(cp); } return 0; } static void draw_result(const cv::Mat& bgr, const std::vector& crowd_points) { cv::Mat image = bgr.clone(); const float threshold = 0.5f; for (int i = 0; i < crowd_points.size(); i++) { if (crowd_points[i].prob > threshold) { cv::circle(image, crowd_points[i].pt, 4, cv::Scalar(0, 0, 255), -1, 8, 0); } } cv::imshow("image", image); cv::waitKey(); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat bgr = cv::imread(imagepath, 1); if (bgr.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector crowd_points; detect_crowd(bgr, crowd_points); draw_result(bgr, crowd_points); return 0; } ================================================ FILE: examples/peleenetssd_seg.cpp ================================================ // Copyright 2017 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static int detect_peleenet(const cv::Mat& bgr, std::vector& objects, ncnn::Mat& resized) { ncnn::Net peleenet; peleenet.opt.use_vulkan_compute = true; // model is converted from https://github.com/eric612/MobileNet-YOLO // and can be downloaded from https://drive.google.com/open?id=1Wt6jKv13sBRMHgrGAJYlOlRF-o80pC0g // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (peleenet.load_param("pelee.param")) exit(-1); if (peleenet.load_model("pelee.bin")) exit(-1); const int target_size = 304; int img_w = bgr.cols; int img_h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size); const float mean_vals[3] = {103.9f, 116.7f, 123.6f}; const float norm_vals[3] = 
{0.017f, 0.017f, 0.017f}; in.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = peleenet.create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("detection_out", out); // printf("%d %d %d\n", out.w, out.h, out.c); objects.clear(); for (int i = 0; i < out.h; i++) { const float* values = out.row(i); Object object; object.label = values[0]; object.prob = values[1]; object.rect.x = values[2] * img_w; object.rect.y = values[3] * img_h; object.rect.width = values[4] * img_w - object.rect.x; object.rect.height = values[5] * img_h - object.rect.y; objects.push_back(object); } ncnn::Mat seg_out; ex.extract("sigmoid", seg_out); resize_bilinear(seg_out, resized, img_w, img_h); //resize_bicubic(seg_out,resized,img_w,img_h); // sharpness return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects, ncnn::Mat map) { static const char* class_names[] = {"background", "person", "rider", "car", "bus", "truck", "bike", "motor", "traffic light", "traffic sign", "train" }; cv::Mat image = bgr.clone(); const int color[] = {128, 255, 128, 244, 35, 232}; const int color_count = sizeof(color) / sizeof(int); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), 
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } int width = map.w; int height = map.h; int size = map.c; int img_index2 = 0; float threshold = 0.45; const float* ptr2 = map; for (int i = 0; i < height; i++) { unsigned char* ptr1 = image.ptr(i); int img_index1 = 0; for (int j = 0; j < width; j++) { float maxima = threshold; int index = -1; for (int c = 0; c < size; c++) { //const float* ptr3 = map.channel(c); const float* ptr3 = ptr2 + c * width * height; if (ptr3[img_index2] > maxima) { maxima = ptr3[img_index2]; index = c; } } if (index > -1) { int color_index = (index)*3; if (color_index < color_count) { int b = color[color_index]; int g = color[color_index + 1]; int r = color[color_index + 2]; ptr1[img_index1] = b / 2 + ptr1[img_index1] / 2; ptr1[img_index1 + 1] = g / 2 + ptr1[img_index1 + 1] / 2; ptr1[img_index1 + 2] = r / 2 + ptr1[img_index1 + 2] / 2; } } img_index1 += 3; img_index2++; } } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; ncnn::Mat seg_out; detect_peleenet(m, objects, seg_out); draw_objects(m, objects, seg_out); return 0; } ================================================ FILE: examples/piper.cpp ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // convert piper checkpoints to ncnn models // 1. checkout https://github.com/OHF-Voice/piper1-gpl (113931937cf235fc881afd1ca4be209bc6919bc7) // 2. apply patch piper1-gpl.patch from https://github.com/nihui/ncnn-android-piper // 3. setup piper with // python3 -m venv .venv // source .venv/bin/activate // python3 -m pip install -e .[train] // 4. 
download piper checkpoint file (*.ckpt) from https://huggingface.co/datasets/rhasspy/piper-checkpoints // 5. install pnnx via pip install -U pnnx // 6. obtain export_ncnn.py script from https://github.com/nihui/ncnn-android-piper // python export_ncnn.py en.ckpt // convert word list to simple phonemizer dict // 1. prepare word list from https://github.com/Alexir/CMUdict // 2. for each word, get phonemes via command "./espeak-ng -q -v en-us --ipa word" // 3. obtain config.json file from https://huggingface.co/datasets/rhasspy/piper-checkpoints // 4. replace phonemes with ids according to phoneme_id_map in config.json // 5. write dict binary // word1 \0x00 ids1 \0xff word2 \0x00 ids2 \0xff ..... #include "layer.h" #include "mat.h" #include "net.h" #include #include #include #include class relative_embeddings_k_module : public ncnn::Layer { public: relative_embeddings_k_module() { one_blob_only = true; } virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const { const int window_size = 4; const int wsize = bottom_blob.w; const int len = bottom_blob.h; const int num_heads = bottom_blob.c; top_blob.create(len, len, num_heads); top_blob.fill(0.f); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < num_heads; q++) { const ncnn::Mat x0 = bottom_blob.channel(q); ncnn::Mat out0 = top_blob.channel(q); for (int i = 0; i < len; i++) { const float* xptr = x0.row(i) + std::max(0, window_size - i); float* outptr = out0.row(i) + std::max(i - window_size, 0); const int wsize2 = std::min(len, i - window_size + wsize) - std::max(i - window_size, 0); for (int j = 0; j < wsize2; j++) { *outptr++ = *xptr++; } } } return 0; } }; DEFINE_LAYER_CREATOR(relative_embeddings_k_module) class relative_embeddings_v_module : public ncnn::Layer { public: relative_embeddings_v_module() { one_blob_only = true; } virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const { const int 
window_size = 4; const int wsize = window_size * 2 + 1; const int len = bottom_blob.h; const int num_heads = bottom_blob.c; top_blob.create(wsize, len, num_heads); top_blob.fill(0.f); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < num_heads; q++) { const ncnn::Mat x0 = bottom_blob.channel(q); ncnn::Mat out0 = top_blob.channel(q); for (int i = 0; i < len; i++) { const float* xptr = x0.row(i) + std::max(i - window_size, 0); float* outptr = out0.row(i) + std::max(0, window_size - i); const int wsize2 = std::min(len, i - window_size + wsize) - std::max(i - window_size, 0); for (int j = 0; j < wsize2; j++) { *outptr++ = *xptr++; } } } return 0; } }; DEFINE_LAYER_CREATOR(relative_embeddings_v_module) class piecewise_rational_quadratic_transform_module : public ncnn::Layer { public: piecewise_rational_quadratic_transform_module() { one_blob_only = false; } virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const ncnn::Option& opt) const { const ncnn::Mat& h = bottom_blobs[0]; const ncnn::Mat& x1 = bottom_blobs[1]; ncnn::Mat& outputs = top_blobs[0]; const int num_bins = 10; const int filter_channels = 192; const bool reverse = true; const float tail_bound = 5.0f; const float DEFAULT_MIN_BIN_WIDTH = 1e-3f; const float DEFAULT_MIN_BIN_HEIGHT = 1e-3f; const float DEFAULT_MIN_DERIVATIVE = 1e-3f; const int batch_size = x1.w; const int h_params_per_item = 2 * num_bins + (num_bins - 1); // 29 outputs = x1.clone(); float* out_ptr = outputs; #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < batch_size; ++i) { const float current_x = ((const float*)x1)[i]; const float* h_data = h.row(i); if (current_x < -tail_bound || current_x > tail_bound) { continue; } std::vector unnormalized_widths(num_bins); std::vector unnormalized_heights(num_bins); std::vector unnormalized_derivatives(num_bins + 1); const float inv_sqrt_filter_channels = 1.0f / sqrtf(filter_channels); for (int j = 0; j < num_bins; ++j) { 
unnormalized_widths[j] = h_data[j] * inv_sqrt_filter_channels; } for (int j = 0; j < num_bins; ++j) { unnormalized_heights[j] = h_data[num_bins + j] * inv_sqrt_filter_channels; } for (int j = 0; j < num_bins - 1; ++j) { unnormalized_derivatives[j + 1] = h_data[2 * num_bins + j]; } const float constant = logf(expf(1.f - DEFAULT_MIN_DERIVATIVE) - 1.f); unnormalized_derivatives[0] = constant; unnormalized_derivatives[num_bins] = constant; const float left = -tail_bound, right = tail_bound; const float bottom = -tail_bound, top = tail_bound; // Softmax + Affine std::vector widths(num_bins); float w_max = -INFINITY; for (float val : unnormalized_widths) w_max = std::max(w_max, val); float w_sum = 0.f; for (int j = 0; j < num_bins; ++j) { widths[j] = expf(unnormalized_widths[j] - w_max); w_sum += widths[j]; } for (int j = 0; j < num_bins; ++j) { widths[j] = DEFAULT_MIN_BIN_WIDTH + (1.f - DEFAULT_MIN_BIN_WIDTH * num_bins) * (widths[j] / w_sum); } // cumwidths std::vector cumwidths(num_bins + 1); cumwidths[0] = left; float current_w_sum = 0.f; for (int j = 0; j < num_bins - 1; ++j) { current_w_sum += widths[j]; cumwidths[j + 1] = left + (right - left) * current_w_sum; } cumwidths[num_bins] = right; // heights std::vector heights(num_bins); float h_max = -INFINITY; for (float val : unnormalized_heights) h_max = std::max(h_max, val); float h_sum = 0.f; for (int j = 0; j < num_bins; ++j) { heights[j] = expf(unnormalized_heights[j] - h_max); h_sum += heights[j]; } for (int j = 0; j < num_bins; ++j) { heights[j] = DEFAULT_MIN_BIN_HEIGHT + (1.f - DEFAULT_MIN_BIN_HEIGHT * num_bins) * (heights[j] / h_sum); } // cumheights std::vector cumheights(num_bins + 1); cumheights[0] = bottom; float current_h_sum = 0.f; for (int j = 0; j < num_bins - 1; ++j) { current_h_sum += heights[j]; cumheights[j + 1] = bottom + (top - bottom) * current_h_sum; } cumheights[num_bins] = top; // Softplus std::vector derivatives(num_bins + 1); for (int j = 0; j < num_bins + 1; ++j) { float x = 
unnormalized_derivatives[j]; derivatives[j] = DEFAULT_MIN_DERIVATIVE + (x > 0 ? x + logf(1.f + expf(-x)) : logf(1.f + expf(x))); } // bin_idx int bin_idx = 0; if (reverse) { auto it = std::upper_bound(cumheights.begin(), cumheights.end(), current_x); bin_idx = std::distance(cumheights.begin(), it) - 1; } else { auto it = std::upper_bound(cumwidths.begin(), cumwidths.end(), current_x); bin_idx = std::distance(cumwidths.begin(), it) - 1; } bin_idx = std::max(0, std::min(bin_idx, num_bins - 1)); // collect coeffs const float input_cumwidths = cumwidths[bin_idx]; const float input_bin_widths = cumwidths[bin_idx + 1] - cumwidths[bin_idx]; const float input_cumheights = cumheights[bin_idx]; const float input_heights = cumheights[bin_idx + 1] - cumheights[bin_idx]; const float input_derivatives = derivatives[bin_idx]; const float input_derivatives_plus_one = derivatives[bin_idx + 1]; const float delta = input_heights / input_bin_widths; // apply transform if (reverse) { float a = (current_x - input_cumheights) * (input_derivatives + input_derivatives_plus_one - 2 * delta) + input_heights * (delta - input_derivatives); float b = input_heights * input_derivatives - (current_x - input_cumheights) * (input_derivatives + input_derivatives_plus_one - 2 * delta); float c = -delta * (current_x - input_cumheights); float discriminant = b * b - 4 * a * c; discriminant = std::max(0.f, discriminant); float root = (2 * c) / (-b - sqrtf(discriminant)); out_ptr[i] = root * input_bin_widths + input_cumwidths; } else { float theta = (current_x - input_cumwidths) / input_bin_widths; float theta_one_minus_theta = theta * (1 - theta); float numerator = input_heights * (delta * theta * theta + input_derivatives * theta_one_minus_theta); float denominator = delta + ((input_derivatives + input_derivatives_plus_one - 2 * delta) * theta_one_minus_theta); out_ptr[i] = input_cumheights + numerator / denominator; } } return 0; } }; DEFINE_LAYER_CREATOR(piecewise_rational_quadratic_transform_module) 
static bool is_word_eos(const char* word) { const char c = word[0]; return c == ',' || c == '.' || c == ';' || c == '?' || c == '!'; } static void find_word_id(const std::map >& dict, const char* word, const unsigned char*& ids) { ids = 0; unsigned char first_char = toupper(word[0]); if (dict.find(first_char) == dict.end()) return; const std::vector& wordlist = dict.at(first_char); for (size_t i = 0; i < wordlist.size(); i++) { if (strcasecmp(wordlist[i], word) == 0) { // hit ids = (const unsigned char*)(wordlist[i] + strlen(wordlist[i]) + 1); return; } } } static void simple_phonemize(const char* text, std::vector& sequence_ids) { // this is a very simple g2p function, it works for english only // load dict buffer std::vector dictbinbuf; { FILE* fp = fopen("en-word_id.bin", "rb"); if (!fp) return; fseek(fp, 0, SEEK_END); size_t len = ftell(fp); rewind(fp); dictbinbuf.resize(len); fread(dictbinbuf.data(), 1, len, fp); fclose(fp); } // build dict std::map > dict; { const unsigned char* p = dictbinbuf.data(); const char* word = (const char*)p; for (size_t i = 0; i < dictbinbuf.size(); i++) { if (dictbinbuf[i] == 0xff) { unsigned int first_char = toupper(word[0]); dict[first_char].push_back(word); word = (const char*)(p + i + 1); } } } // phonemize mainpart { const int ID_PAD = 0; // interleaved const int ID_BOS = 1; // beginning of sentence const int ID_EOS = 2; // end of sentence const int ID_SPACE = 3; // space bool last_char_is_control = false; bool sentence_begin = true; bool sentence_end = true; char word[256]; const char* p = text; while (*p) { if (sentence_end && !last_char_is_control) { sequence_ids.push_back(ID_BOS); sequence_ids.push_back(ID_PAD); sentence_end = false; } if (sentence_begin || last_char_is_control) { // the very first word } else { // space id sequence_ids.push_back(ID_SPACE); sequence_ids.push_back(ID_PAD); } if (isalnum((unsigned char)*p)) { char* pword = word; // alpha or number *pword++ = *p++; // consume word int wordlen = 1; while 
(isalnum((unsigned char)*p) && wordlen < 233) { *pword++ = *p++; wordlen++; } *pword = '\0'; if (is_word_eos(word)) { if (!sentence_end) sequence_ids.push_back(ID_EOS); sentence_end = true; last_char_is_control = false; sentence_begin = false; continue; } const unsigned char* ids = 0; find_word_id(dict, word, ids); if (ids) { const unsigned char* pids = ids; while (*pids != 0xff) { sequence_ids.push_back(*pids); sequence_ids.push_back(ID_PAD); pids++; } } else { // no such word, spell alphabet one by one char tmp[2] = {'\0', '\0'}; for (size_t i = 0; i < strlen(word); i++) { tmp[0] = word[i]; find_word_id(dict, tmp, ids); if (ids) { const unsigned char* pids = ids; while (*pids != 0xff) { sequence_ids.push_back(*pids); sequence_ids.push_back(ID_PAD); pids++; } if (i + 1 != strlen(word)) { sequence_ids.push_back(ID_SPACE); sequence_ids.push_back(ID_PAD); } } else { fprintf(stderr, "word char %c not recognized\n", word[i]); } } } last_char_is_control = false; sentence_begin = false; continue; } else { // skip control character p++; last_char_is_control = true; } } if (!sentence_end) sequence_ids.push_back(ID_EOS); } } static void path_attention(const ncnn::Mat& logw, const ncnn::Mat& m_p, const ncnn::Mat& logs_p, float noise_scale, float length_scale, ncnn::Mat& z_p) { const int x_lengths = logw.w; // assert m_p.h == logs_p.h const int depth = m_p.h; std::vector w_ceil(x_lengths); int y_lengths = 0; for (int i = 0; i < x_lengths; i++) { w_ceil[i] = (int)ceilf(expf(logw[i]) * length_scale); y_lengths += w_ceil[i]; } z_p.create(y_lengths, depth); for (int i = 0; i < depth; i++) { const float* m_p_ptr = m_p.row(i); const float* logs_p_ptr = logs_p.row(i); float* ptr = z_p.row(i); for (int j = 0; j < x_lengths; j++) { const float m = m_p_ptr[j]; const float nl = expf(logs_p_ptr[j]) * noise_scale; const int duration = w_ceil[j]; for (int k = 0; k < duration; k++) { ptr[k] = m + (rand() / (float)RAND_MAX) * nl; } ptr += duration; } } } static int tts_piper(const char* 
text, int speaker_id, std::vector& pcm) { // zh models could be found at // https://github.com/nihui/ncnn-android-piper/tree/master/app/src/main/assets // hyper parameters from https://huggingface.co/datasets/rhasspy/piper-checkpoints/blob/main/en/en_US/libritts_r/medium/config.json const float noise_scale = 0.333f; const float length_scale = 1.f; const float noise_scale_w = 0.333f; // phonemize ncnn::Mat sequence; { std::vector sequence_ids; simple_phonemize(text, sequence_ids); const int sequence_length = (int)sequence_ids.size(); sequence.create(sequence_length); memcpy(sequence, sequence_ids.data(), sequence_length * sizeof(int)); } // enc_p ncnn::Mat x; ncnn::Mat m_p; ncnn::Mat logs_p; { ncnn::Net enc_p; enc_p.opt.use_vulkan_compute = true; enc_p.register_custom_layer("piper.train.vits.attentions.relative_embeddings_k_module", relative_embeddings_k_module_layer_creator); enc_p.register_custom_layer("piper.train.vits.attentions.relative_embeddings_v_module", relative_embeddings_v_module_layer_creator); enc_p.load_param("en_enc_p.ncnn.param"); enc_p.load_model("en_enc_p.ncnn.bin"); ncnn::Extractor ex = enc_p.create_extractor(); ex.input("in0", sequence); ex.extract("out0", x); ex.extract("out1", m_p); ex.extract("out2", logs_p); } // emb_g ncnn::Mat g; { ncnn::Net emb_g; emb_g.opt.use_vulkan_compute = true; emb_g.load_param("en_emb_g.ncnn.param"); emb_g.load_model("en_emb_g.ncnn.bin"); ncnn::Mat speaker_id_mat(1); { int* p = speaker_id_mat; p[0] = speaker_id; } ncnn::Extractor ex = emb_g.create_extractor(); ex.input("in0", speaker_id_mat); ex.extract("out0", g); g = g.reshape(1, g.w); } // dp ncnn::Mat logw; { ncnn::Net dp; dp.opt.use_vulkan_compute = true; dp.register_custom_layer("piper.train.vits.modules.piecewise_rational_quadratic_transform_module", piecewise_rational_quadratic_transform_module_layer_creator); dp.load_param("en_dp.ncnn.param"); dp.load_model("en_dp.ncnn.bin"); ncnn::Mat noise(x.w, 2); for (int i = 0; i < noise.w * noise.h; i++) { noise[i] = 
rand() / (float)RAND_MAX * noise_scale_w; } ncnn::Extractor ex = dp.create_extractor(); ex.input("in0", x); ex.input("in1", noise); ex.input("in2", g); ex.extract("out0", logw); } // path attention ncnn::Mat z_p; { path_attention(logw, m_p, logs_p, noise_scale, length_scale, z_p); } // flow ncnn::Mat z; { ncnn::Net flow; flow.opt.use_vulkan_compute = true; flow.load_param("en_flow.ncnn.param"); flow.load_model("en_flow.ncnn.bin"); ncnn::Extractor ex = flow.create_extractor(); ex.input("in0", z_p); ex.input("in1", g); ex.extract("out0", z); } // dec ncnn::Mat o; { ncnn::Net dec; dec.opt.use_vulkan_compute = true; dec.load_param("en_dec.ncnn.param"); dec.load_model("en_dec.ncnn.bin"); ncnn::Extractor ex = dec.create_extractor(); ex.input("in0", z); ex.input("in1", g); ex.extract("out0", o); } // normalize and clip { float volume = 1.f; float absmax = 0.f; for (int i = 0; i < o.w; i++) { absmax = std::max(absmax, fabs(o[i])); } if (absmax > 1e-8) { for (int i = 0; i < o.w; i++) { float v = o[i] / absmax * volume; v = std::min(std::max(v, -1.f), 1.f); o[i] = v; } } } // 16bit pcm { pcm.resize(o.w); for (int i = 0; i < o.w; i++) { pcm[i] = (short)(o[i] * 32767); } } return 0; } static void save_pcm_to_wav(const char* path, const short* pcm, int num_samples, int sample_rate) { FILE* f = fopen(path, "wb"); if (!f) return; // write wav header { int16_t num_channels = 1; int16_t bits_per_sample = 16; int32_t byte_rate = sample_rate * num_channels * bits_per_sample / 8; int16_t block_align = num_channels * bits_per_sample / 8; int32_t data_chunk_size = num_samples * num_channels * bits_per_sample / 8; int32_t chunk_size = 36 + data_chunk_size; // RIFF header fwrite("RIFF", 1, 4, f); fwrite(&chunk_size, 4, 1, f); fwrite("WAVE", 1, 4, f); // fmt subchunk fwrite("fmt ", 1, 4, f); int32_t subchunk1_size = 16; int16_t audio_format = 1; // PCM fwrite(&subchunk1_size, 4, 1, f); fwrite(&audio_format, 2, 1, f); fwrite(&num_channels, 2, 1, f); fwrite(&sample_rate, 4, 1, f); 
fwrite(&byte_rate, 4, 1, f); fwrite(&block_align, 2, 1, f); fwrite(&bits_per_sample, 2, 1, f); // data subchunk fwrite("data", 1, 4, f); fwrite(&data_chunk_size, 4, 1, f); } fwrite(pcm, sizeof(short), num_samples, f); fclose(f); } int main(int argc, char** argv) { if (argc != 4) { fprintf(stderr, "Usage: %s [sentences] [speaker id 0~903] [out path]\n", argv[0]); fprintf(stderr, " %s \"Hello World\" 0 out.wav\n", argv[0]); fprintf(stderr, " %s \"Happy New Year\" 123 out.wav\n", argv[0]); return 0; } const char* text = argv[1]; const int speaker_id = atoi(argv[2]); const char* outpath = argv[3]; std::vector pcm; tts_piper(text, speaker_id, pcm); // "sample_rate": 22050 save_pcm_to_wav(outpath, pcm.data(), pcm.size(), 22050); return 0; } ================================================ FILE: examples/ppocrv5.cpp ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // pip install paddlepaddle==3.0.0 // pip install paddleocr==3.0.0 // paddlex --install paddle2onnx // paddleocr ocr -i test.png // paddlex --paddle2onnx --paddle_model_dir ~/.paddlex/official_models/PP-OCRv5_mobile_det --onnx_model_dir PP-OCRv5_mobile_det // paddlex --paddle2onnx --paddle_model_dir ~/.paddlex/official_models/PP-OCRv5_mobile_rec --onnx_model_dir PP-OCRv5_mobile_rec // pnnx PP-OCRv5_mobile_det.onnx inputshape=[1,3,320,320] inputshape2=[1,3,256,256] // pnnx PP-OCRv5_mobile_rec.onnx inputshape=[1,3,48,160] inputshape2=[1,3,48,256] // pnnx PP-OCRv5_server_det.onnx inputshape=[1,3,320,320] inputshape2=[1,3,256,256] fp16=0 // pnnx PP-OCRv5_server_rec.onnx inputshape=[1,3,48,160] inputshape2=[1,3,48,256] fp16=0 #include "layer.h" #include "net.h" #include #include #include #include #include #include #include "ppocrv5_dict.h" struct Character { int id; float prob; }; struct Object { cv::RotatedRect rrect; int orientation; float prob; std::vector text; }; static double contour_score(const cv::Mat& binary, const std::vector& contour) { 
cv::Rect rect = cv::boundingRect(contour); if (rect.x < 0) rect.x = 0; if (rect.y < 0) rect.y = 0; if (rect.x + rect.width > binary.cols) rect.width = binary.cols - rect.x; if (rect.y + rect.height > binary.rows) rect.height = binary.rows - rect.y; cv::Mat binROI = binary(rect); cv::Mat mask = cv::Mat::zeros(rect.height, rect.width, CV_8U); std::vector roiContour; for (size_t i = 0; i < contour.size(); i++) { cv::Point pt = cv::Point(contour[i].x - rect.x, contour[i].y - rect.y); roiContour.push_back(pt); } std::vector > roiContours = {roiContour}; cv::fillPoly(mask, roiContours, cv::Scalar(255)); double score = cv::mean(binROI, mask).val[0]; return score / 255.f; } static cv::Mat get_rotate_crop_image(const cv::Mat& bgr, const Object& object) { const int orientation = object.orientation; const float rw = object.rrect.size.width; const float rh = object.rrect.size.height; const int target_height = 48; const float target_width = rh * target_height / rw; // warpperspective shall be used to rotate the image // but actually they are all rectangles, so warpaffine is almost enough :P cv::Mat dst; cv::Point2f corners[4]; object.rrect.points(corners); if (orientation == 0) { // horizontal text // corner points order // 0--------1 // | |rw -> as angle=90 // 3--------2 // rh std::vector src_pts(3); src_pts[0] = corners[0]; src_pts[1] = corners[1]; src_pts[2] = corners[3]; std::vector dst_pts(3); dst_pts[0] = cv::Point2f(0, 0); dst_pts[1] = cv::Point2f(target_width, 0); dst_pts[2] = cv::Point2f(0, target_height); cv::Mat tm = cv::getAffineTransform(src_pts, dst_pts); cv::warpAffine(bgr, dst, tm, cv::Size(target_width, target_height), cv::INTER_LINEAR, cv::BORDER_REPLICATE); } else { // vertial text // corner points order // 1----2 // | | // | | // | |rh -> as angle=0 // | | // | | // 0----3 // rw std::vector src_pts(3); src_pts[0] = corners[2]; src_pts[1] = corners[3]; src_pts[2] = corners[1]; std::vector dst_pts(3); dst_pts[0] = cv::Point2f(0, 0); dst_pts[1] = 
cv::Point2f(target_width, 0); dst_pts[2] = cv::Point2f(0, target_height); cv::Mat tm = cv::getAffineTransform(src_pts, dst_pts); cv::warpAffine(bgr, dst, tm, cv::Size(target_width, target_height), cv::INTER_LINEAR, cv::BORDER_REPLICATE); } return dst; } class PPOCRv5 { public: void init(); void detect(const cv::Mat& bgr, std::vector& objects); void recognize(const cv::Mat& bgr, Object& object); protected: ncnn::Net ppocrv5_det; ncnn::Net ppocrv5_rec; }; void PPOCRv5::init() { // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models // https://github.com/nihui/ncnn-android-ppocrv5/tree/master/app/src/main/assets ppocrv5_det.opt.use_vulkan_compute = true; // ppocrv5_det.opt.use_bf16_storage = true; // fp16 must be disabled for server model // ppocrv5_det.opt.use_fp16_packed = false; // ppocrv5_det.opt.use_fp16_storage = false; ppocrv5_det.load_param("PP_OCRv5_mobile_det.ncnn.param"); ppocrv5_det.load_model("PP_OCRv5_mobile_det.ncnn.bin"); // ppocrv5_det.load_param("PP_OCRv5_server_det.ncnn.param"); // ppocrv5_det.load_model("PP_OCRv5_server_det.ncnn.bin"); ppocrv5_rec.opt.use_vulkan_compute = true; // ppocrv5_rec.opt.use_bf16_storage = true; // fp16 must be disabled for server model // ppocrv5_rec.opt.use_fp16_packed = false; // ppocrv5_rec.opt.use_fp16_storage = false; ppocrv5_rec.load_param("PP_OCRv5_mobile_rec.ncnn.param"); ppocrv5_rec.load_model("PP_OCRv5_mobile_rec.ncnn.bin"); // ppocrv5_rec.load_param("PP_OCRv5_server_rec.ncnn.param"); // ppocrv5_rec.load_model("PP_OCRv5_server_rec.ncnn.bin"); } void PPOCRv5::detect(const cv::Mat& bgr, std::vector& objects) { const int target_size = 960; int img_w = bgr.cols; int img_h = bgr.rows; const int target_stride = 32; // letterbox pad to multiple of target_stride int w = img_w; int h = img_h; float scale = 1.f; if (std::max(w, h) > target_size) { if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } } 
ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h); int wpad = (w + target_stride - 1) / target_stride * target_stride - w; int hpad = (h + target_stride - 1) / target_stride * target_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f}; const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f}; in_pad.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = ppocrv5_det.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); const float denorm_vals[1] = {255.f}; out.substract_mean_normalize(0, denorm_vals); cv::Mat pred(out.h, out.w, CV_8UC1); out.to_pixels(pred.data, ncnn::Mat::PIXEL_GRAY); // threshold binary cv::Mat bitmap; const float threshold = 0.3f; cv::threshold(pred, bitmap, threshold * 255, 255, cv::THRESH_BINARY); // boxes from bitmap { // should use dbnet post process, but I think unclip process is difficult to write // so simply implement expansion. 
This may lose detection accuracy // original implementation can be referenced // https://github.com/MhLiao/DB/blob/master/structure/representers/seg_detector_representer.py const float box_thresh = 0.6f; const float enlarge_ratio = 1.95f; const float min_size = 3 * scale; const int max_candidates = 1000; std::vector > contours; std::vector hierarchy; cv::findContours(bitmap, contours, hierarchy, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE); contours.resize(std::min(contours.size(), (size_t)max_candidates)); for (size_t i = 0; i < contours.size(); i++) { const std::vector& contour = contours[i]; if (contour.size() <= 2) continue; double score = contour_score(pred, contour); if (score < box_thresh) continue; cv::RotatedRect rrect = cv::minAreaRect(contour); float rrect_maxwh = std::max(rrect.size.width, rrect.size.height); if (rrect_maxwh < min_size) continue; int orientation = 0; if (rrect.angle >= -30 && rrect.angle <= 30 && rrect.size.height > rrect.size.width * 2.7) { // vertical text orientation = 1; } if ((rrect.angle <= -60 || rrect.angle >= 60) && rrect.size.width > rrect.size.height * 2.7) { // vertical text orientation = 1; } if (rrect.angle < -30) { // make orientation from -90 ~ -30 to 90 ~ 150 rrect.angle += 180; } if (orientation == 0 && rrect.angle < 30) { // make it horizontal rrect.angle += 90; std::swap(rrect.size.width, rrect.size.height); } if (orientation == 1 && rrect.angle >= 60) { // make it vertical rrect.angle -= 90; std::swap(rrect.size.width, rrect.size.height); } // enlarge rrect.size.height += rrect.size.width * (enlarge_ratio - 1); rrect.size.width *= enlarge_ratio; // adjust offset to original unpadded rrect.center.x = (rrect.center.x - (wpad / 2)) / scale; rrect.center.y = (rrect.center.y - (hpad / 2)) / scale; rrect.size.width = (rrect.size.width) / scale; rrect.size.height = (rrect.size.height) / scale; Object obj; obj.rrect = rrect; obj.orientation = orientation; obj.prob = score; objects.push_back(obj); } } } void 
PPOCRv5::recognize(const cv::Mat& bgr, Object& object) { cv::Mat roi = get_rotate_crop_image(bgr, object); ncnn::Mat in = ncnn::Mat::from_pixels(roi.data, ncnn::Mat::PIXEL_BGR, roi.cols, roi.rows); // ~/.paddlex/official_models/PP-OCRv5_mobile_rec/inference.yml const float mean_vals[3] = {127.5, 127.5, 127.5}; const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5}; in.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = ppocrv5_rec.create_extractor(); ex.input("in0", in); ncnn::Mat out; ex.extract("out0", out); // 18385 x len int last_token = 0; for (int i = 0; i < out.h; i++) { const float* p = out.row(i); int index = 0; float max_score = -9999.f; for (int j = 0; j < out.w; j++) { float score = *p++; if (score > max_score) { max_score = score; index = j; } } if (last_token == index) // CTC rule, if index is same as last one, they will be merged into one token continue; last_token = index; if (index <= 0) continue; Character ch; ch.id = index - 1; ch.prob = max_score; object.text.push_back(ch); } } static int detect_ppocrv5(const cv::Mat& bgr, std::vector& objects) { PPOCRv5 ppocrv5; ppocrv5.init(); ppocrv5.detect(bgr, objects); for (size_t i = 0; i < objects.size(); i++) { ppocrv5.recognize(bgr, objects[i]); } return 0; } static int draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const cv::Scalar colors[] = { cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 17]; fprintf(stderr, "%s %.5f at %.2f 
%.2f %.2f x %.2f @ %.2f = ", obj.orientation == 0 ? "H" : "V", obj.prob, obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle); cv::Point2f corners[4]; obj.rrect.points(corners); cv::line(image, corners[0], corners[1], color); cv::line(image, corners[1], corners[2], color); cv::line(image, corners[2], corners[3], color); cv::line(image, corners[3], corners[0], color); std::string text; for (size_t j = 0; j < objects[i].text.size(); j++) { const Character& ch = objects[i].text[j]; if (ch.id >= character_dict_size) continue; text += character_dict[ch.id]; } fprintf(stderr, "%s\n", text.c_str()); } fprintf(stderr, "opencv putText can not draw non-latin characters, you may see question marks instead\n"); fprintf(stderr, "see opencv-mobile for drawing non-latin characters\n"); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 17]; std::string text; for (size_t j = 0; j < objects[i].text.size(); j++) { const Character& ch = objects[i].text[j]; if (ch.id >= character_dict_size) { if (!text.empty() && text.back() != ' ') { text += " "; } continue; } if (obj.orientation == 0) { text += character_dict[ch.id]; } else { text += character_dict[ch.id]; if (j + 1 < objects[i].text.size()) text += "\n"; } } int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rrect.center.x - label_size.width / 2; int y = obj.rrect.center.y - label_size.height / 2 - baseLine; if (y < 0) y = 0; if (y + label_size.height > image.rows) y = image.rows - label_size.height; if (x < 0) x = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); if (obj.orientation == 0) { cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, 
cv::Scalar(0, 0, 0)); } else { cv::putText(image, text, cv::Point(x, y + label_size.width), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } } cv::imshow("image", image); cv::waitKey(0); return 0; } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_ppocrv5(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/ppocrv5_dict.h ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause static const char* character_dict[] = { " ", "一", "乙", "二", "十", "丁", "厂", "七", "卜", "八", "人", "入", "儿", "匕", "几", "九", "刁", "了", "刀", "力", "乃", "又", "三", "干", "于", "亏", "工", "土", "士", "才", "下", "寸", "大", "丈", "与", "万", "上", "小", "口", "山", "巾", "千", "乞", "川", "亿", "个", "夕", "久", "么", "勺", "凡", "丸", "及", "广", "亡", "门", "丫", "义", "之", "尸", "己", "已", "巳", "弓", "子", "卫", "也", "女", "刃", "飞", "习", "叉", "马", "乡", "丰", "王", "开", "井", "天", "夫", "元", "无", "云", "专", "丐", "扎", "艺", "木", "五", "支", "厅", "不", "犬", "太", "区", "历", "歹", "友", "尤", "匹", "车", "巨", "牙", "屯", "戈", "比", "互", "切", "瓦", "止", "少", "曰", "日", "中", "贝", "冈", "内", "水", "见", "午", "牛", "手", "气", "毛", "壬", "升", "夭", "长", "仁", "什", "片", "仆", "化", "仇", "币", "仍", "仅", "斤", "爪", "反", "介", "父", "从", "仑", "今", "凶", "分", "乏", "公", "仓", "月", "氏", "勿", "欠", "风", "丹", "匀", "乌", "勾", "凤", "六", "文", "亢", "方", "火", "为", "斗", "忆", "计", "订", "户", "认", "冗", "讥", "心", "尺", "引", "丑", "巴", "孔", "队", "办", "以", "允", "予", "邓", "劝", "双", "书", "幻", "玉", "刊", "未", "末", "示", "击", "打", "巧", "正", "扑", "卉", "扒", "功", "扔", "去", "甘", "世", "艾", "古", "节", "本", "术", "可", "丙", "左", "厉", "石", "右", "布", "夯", "戊", "龙", "平", "灭", "轧", "东", "卡", "北", "占", "凸", "卢", "业", "旧", "帅", "归", "旦", "目", "且", "叶", "甲", 
"申", "叮", "电", "号", "田", "由", "只", "叭", "史", "央", "兄", "叽", "叼", "叫", "叩", "叨", "另", "叹", "冉", "皿", "凹", "囚", "四", "生", "矢", "失", "乍", "禾", "丘", "付", "仗", "代", "仙", "们", "仪", "白", "仔", "他", "斥", "瓜", "乎", "丛", "令", "用", "甩", "印", "尔", "乐", "句", "匆", "册", "卯", "犯", "外", "处", "冬", "鸟", "务", "包", "饥", "主", "市", "立", "冯", "玄", "闪", "兰", "半", "汁", "汇", "头", "汉", "宁", "穴", "它", "讨", "写", "让", "礼", "训", "议", "必", "讯", "记", "永", "司", "尼", "民", "弗", "弘", "出", "辽", "奶", "奴", "召", "加", "皮", "边", "孕", "发", "圣", "对", "台", "矛", "纠", "母", "幼", "丝", "邦", "式", "迂", "刑", "戎", "动", "扛", "寺", "吉", "扣", "考", "托", "老", "巩", "圾", "执", "扩", "扫", "地", "场", "扬", "耳", "芋", "共", "芒", "亚", "芝", "朽", "朴", "机", "权", "过", "臣", "吏", "再", "协", "西", "压", "厌", "戌", "在", "百", "有", "存", "而", "页", "匠", "夸", "夺", "灰", "达", "列", "死", "成", "夹", "夷", "轨", "邪", "尧", "划", "迈", "毕", "至", "此", "贞", "师", "尘", "尖", "劣", "光", "当", "早", "吁", "吐", "吓", "虫", "曲", "团", "吕", "同", "吊", "吃", "因", "吸", "吗", "吆", "屿", "屹", "岁", "帆", "回", "岂", "则", "刚", "网", "肉", "年", "朱", "先", "丢", "廷", "舌", "竹", "迁", "乔", "迄", "伟", "传", "乒", "乓", "休", "伍", "伏", "优", "臼", "伐", "延", "仲", "件", "任", "伤", "价", "伦", "份", "华", "仰", "仿", "伙", "伪", "自", "伊", "血", "向", "似", "后", "行", "舟", "全", "会", "杀", "合", "兆", "企", "众", "爷", "伞", "创", "肌", "肋", "朵", "杂", "危", "旬", "旨", "旭", "负", "匈", "名", "各", "多", "争", "色", "壮", "冲", "妆", "冰", "庄", "庆", "亦", "刘", "齐", "交", "衣", "次", "产", "决", "亥", "充", "妄", "闭", "问", "闯", "羊", "并", "关", "米", "灯", "州", "汗", "污", "江", "汛", "池", "汝", "汤", "忙", "兴", "宇", "守", "宅", "字", "安", "讲", "讳", "军", "讶", "许", "讹", "论", "讼", "农", "讽", "设", "访", "诀", "寻", "那", "迅", "尽", "导", "异", "弛", "孙", "阵", "阳", "收", "阶", "阴", "防", "奸", "如", "妇", "妃", "好", "她", "妈", "戏", "羽", "观", "欢", "买", "红", "驮", "纤", "驯", "约", "级", "纪", "驰", "纫", "巡", "寿", "弄", "麦", "玖", "玛", "形", "进", "戒", "吞", "远", "违", "韧", "运", "扶", "抚", "坛", "技", "坏", "抠", "扰", "扼", "拒", "找", "批", "址", "扯", "走", "抄", "贡", "汞", "坝", "攻", "赤", "折", "抓", "扳", "抡", "扮", "抢", "孝", "坎", 
"均", "抑", "抛", "投", "坟", "坑", "抗", "坊", "抖", "护", "壳", "志", "块", "扭", "声", "把", "报", "拟", "却", "抒", "劫", "芙", "芜", "苇", "芽", "花", "芹", "芥", "芬", "苍", "芳", "严", "芦", "芯", "劳", "克", "芭", "苏", "杆", "杠", "杜", "材", "村", "杖", "杏", "杉", "巫", "极", "李", "杨", "求", "甫", "匣", "更", "束", "吾", "豆", "两", "酉", "丽", "医", "辰", "励", "否", "还", "尬", "歼", "来", "连", "轩", "步", "卤", "坚", "肖", "旱", "盯", "呈", "时", "吴", "助", "县", "里", "呆", "吱", "吠", "呕", "园", "旷", "围", "呀", "吨", "足", "邮", "男", "困", "吵", "串", "员", "呐", "听", "吟", "吩", "呛", "吻", "吹", "呜", "吭", "吧", "邑", "吼", "囤", "别", "吮", "岖", "岗", "帐", "财", "针", "钉", "牡", "告", "我", "乱", "利", "秃", "秀", "私", "每", "兵", "估", "体", "何", "佐", "佑", "但", "伸", "佃", "作", "伯", "伶", "佣", "低", "你", "住", "位", "伴", "身", "皂", "伺", "佛", "囱", "近", "彻", "役", "返", "余", "希", "坐", "谷", "妥", "含", "邻", "岔", "肝", "肛", "肚", "肘", "肠", "龟", "甸", "免", "狂", "犹", "狈", "角", "删", "条", "彤", "卵", "灸", "岛", "刨", "迎", "饭", "饮", "系", "言", "冻", "状", "亩", "况", "床", "库", "庇", "疗", "吝", "应", "这", "冷", "庐", "序", "辛", "弃", "冶", "忘", "闰", "闲", "间", "闷", "判", "兑", "灶", "灿", "灼", "弟", "汪", "沐", "沛", "汰", "沥", "沙", "汽", "沃", "沦", "汹", "泛", "沧", "没", "沟", "沪", "沈", "沉", "沁", "怀", "忧", "忱", "快", "完", "宋", "宏", "牢", "究", "穷", "灾", "良", "证", "启", "评", "补", "初", "社", "祀", "识", "诈", "诉", "罕", "诊", "词", "译", "君", "灵", "即", "层", "屁", "尿", "尾", "迟", "局", "改", "张", "忌", "际", "陆", "阿", "陈", "阻", "附", "坠", "妓", "妙", "妖", "姊", "妨", "妒", "努", "忍", "劲", "矣", "鸡", "纬", "驱", "纯", "纱", "纲", "纳", "驳", "纵", "纷", "纸", "纹", "纺", "驴", "纽", "奉", "玩", "环", "武", "青", "责", "现", "玫", "表", "规", "抹", "卦", "坷", "坯", "拓", "拢", "拔", "坪", "拣", "坦", "担", "坤", "押", "抽", "拐", "拖", "者", "拍", "顶", "拆", "拎", "拥", "抵", "拘", "势", "抱", "拄", "垃", "拉", "拦", "幸", "拌", "拧", "拂", "拙", "招", "坡", "披", "拨", "择", "抬", "拇", "拗", "其", "取", "茉", "苦", "昔", "苛", "若", "茂", "苹", "苗", "英", "苟", "苑", "苞", "范", "直", "茁", "茄", "茎", "苔", "茅", "枉", "林", "枝", "杯", "枢", "柜", "枚", "析", "板", "松", "枪", "枫", "构", "杭", "杰", "述", "枕", "丧", "或", "画", "卧", "事", "刺", 
"枣", "雨", "卖", "郁", "矾", "矿", "码", "厕", "奈", "奔", "奇", "奋", "态", "欧", "殴", "垄", "妻", "轰", "顷", "转", "斩", "轮", "软", "到", "非", "叔", "歧", "肯", "齿", "些", "卓", "虎", "虏", "肾", "贤", "尚", "旺", "具", "味", "果", "昆", "国", "哎", "咕", "昌", "呵", "畅", "明", "易", "咙", "昂", "迪", "典", "固", "忠", "呻", "咒", "咋", "咐", "呼", "鸣", "咏", "呢", "咄", "咖", "岸", "岩", "帖", "罗", "帜", "帕", "岭", "凯", "败", "账", "贩", "贬", "购", "贮", "图", "钓", "制", "知", "迭", "氛", "垂", "牧", "物", "乖", "刮", "秆", "和", "季", "委", "秉", "佳", "侍", "岳", "供", "使", "例", "侠", "侥", "版", "侄", "侦", "侣", "侧", "凭", "侨", "佩", "货", "侈", "依", "卑", "的", "迫", "质", "欣", "征", "往", "爬", "彼", "径", "所", "舍", "金", "刹", "命", "肴", "斧", "爸", "采", "觅", "受", "乳", "贪", "念", "贫", "忿", "肤", "肺", "肢", "肿", "胀", "朋", "股", "肮", "肪", "肥", "服", "胁", "周", "昏", "鱼", "兔", "狐", "忽", "狗", "狞", "备", "饰", "饱", "饲", "变", "京", "享", "庞", "店", "夜", "庙", "府", "底", "疟", "疙", "疚", "剂", "卒", "郊", "庚", "废", "净", "盲", "放", "刻", "育", "氓", "闸", "闹", "郑", "券", "卷", "单", "炬", "炒", "炊", "炕", "炎", "炉", "沫", "浅", "法", "泄", "沽", "河", "沾", "泪", "沮", "油", "泊", "沿", "泡", "注", "泣", "泞", "泻", "泌", "泳", "泥", "沸", "沼", "波", "泼", "泽", "治", "怔", "怯", "怖", "性", "怕", "怜", "怪", "怡", "学", "宝", "宗", "定", "宠", "宜", "审", "宙", "官", "空", "帘", "宛", "实", "试", "郎", "诗", "肩", "房", "诚", "衬", "衫", "视", "祈", "话", "诞", "诡", "询", "该", "详", "建", "肃", "录", "隶", "帚", "屉", "居", "届", "刷", "屈", "弧", "弥", "弦", "承", "孟", "陋", "陌", "孤", "陕", "降", "函", "限", "妹", "姑", "姐", "姓", "妮", "始", "姆", "迢", "驾", "叁", "参", "艰", "线", "练", "组", "绅", "细", "驶", "织", "驹", "终", "驻", "绊", "驼", "绍", "绎", "经", "贯", "契", "贰", "奏", "春", "帮", "玷", "珍", "玲", "玻", "毒", "型", "拭", "挂", "封", "持", "拷", "拱", "项", "垮", "挎", "城", "挟", "挠", "政", "赴", "赵", "挡", "拽", "哉", "挺", "括", "垢", "拴", "拾", "挑", "垛", "指", "垫", "挣", "挤", "拼", "挖", "按", "挥", "挪", "拯", "某", "甚", "荆", "茸", "革", "茬", "荐", "巷", "带", "草", "茧", "茵", "茶", "荒", "茫", "荡", "荣", "荤", "荧", "故", "胡", "荫", "荔", "南", "药", "标", "栈", "柑", "枯", "柄", "栋", "相", "查", "柏", "栅", "柳", "柱", "柿", "栏", "柠", "树", "勃", 
"要", "柬", "咸", "威", "歪", "研", "砖", "厘", "厚", "砌", "砂", "泵", "砚", "砍", "面", "耐", "耍", "牵", "鸥", "残", "殃", "轴", "轻", "鸦", "皆", "韭", "背", "战", "点", "虐", "临", "览", "竖", "省", "削", "尝", "昧", "盹", "是", "盼", "眨", "哇", "哄", "哑", "显", "冒", "映", "星", "昨", "咧", "昭", "畏", "趴", "胃", "贵", "界", "虹", "虾", "蚁", "思", "蚂", "虽", "品", "咽", "骂", "勋", "哗", "咱", "响", "哈", "哆", "咬", "咳", "咪", "哪", "哟", "炭", "峡", "罚", "贱", "贴", "贻", "骨", "幽", "钙", "钝", "钞", "钟", "钢", "钠", "钥", "钦", "钧", "钩", "钮", "卸", "缸", "拜", "看", "矩", "毡", "氢", "怎", "牲", "选", "适", "秒", "香", "种", "秋", "科", "重", "复", "竿", "段", "便", "俩", "贷", "顺", "修", "俏", "保", "促", "俄", "俐", "侮", "俭", "俗", "俘", "信", "皇", "泉", "鬼", "侵", "禹", "侯", "追", "俊", "盾", "待", "徊", "衍", "律", "很", "须", "叙", "剑", "逃", "食", "盆", "胚", "胧", "胆", "胜", "胞", "胖", "脉", "胎", "勉", "狭", "狮", "独", "狰", "狡", "狱", "狠", "贸", "怨", "急", "饵", "饶", "蚀", "饺", "饼", "峦", "弯", "将", "奖", "哀", "亭", "亮", "度", "迹", "庭", "疮", "疯", "疫", "疤", "咨", "姿", "亲", "音", "帝", "施", "闺", "闻", "闽", "阀", "阁", "差", "养", "美", "姜", "叛", "送", "类", "迷", "籽", "娄", "前", "首", "逆", "兹", "总", "炼", "炸", "烁", "炮", "炫", "烂", "剃", "洼", "洁", "洪", "洒", "柒", "浇", "浊", "洞", "测", "洗", "活", "派", "洽", "染", "洛", "浏", "济", "洋", "洲", "浑", "浓", "津", "恃", "恒", "恢", "恍", "恬", "恤", "恰", "恼", "恨", "举", "觉", "宣", "宦", "室", "宫", "宪", "突", "穿", "窃", "客", "诫", "冠", "诬", "语", "扁", "袄", "祖", "神", "祝", "祠", "误", "诱", "诲", "说", "诵", "垦", "退", "既", "屋", "昼", "屏", "屎", "费", "陡", "逊", "眉", "孩", "陨", "除", "险", "院", "娃", "姥", "姨", "姻", "娇", "姚", "娜", "怒", "架", "贺", "盈", "勇", "怠", "癸", "蚤", "柔", "垒", "绑", "绒", "结", "绕", "骄", "绘", "给", "绚", "骆", "络", "绝", "绞", "骇", "统", "耕", "耘", "耗", "耙", "艳", "泰", "秦", "珠", "班", "素", "匿", "蚕", "顽", "盏", "匪", "捞", "栽", "捕", "埂", "捂", "振", "载", "赶", "起", "盐", "捎", "捍", "捏", "埋", "捉", "捆", "捐", "损", "袁", "捌", "都", "哲", "逝", "捡", "挫", "换", "挽", "挚", "热", "恐", "捣", "壶", "捅", "埃", "挨", "耻", "耿", "耽", "聂", "恭", "莽", "莱", "莲", "莫", "莉", "荷", "获", "晋", "恶", "莹", "莺", "真", "框", "梆", "桂", "桔", "栖", "档", "桐", "株", 
"桥", "桦", "栓", "桃", "格", "桩", "校", "核", "样", "根", "索", "哥", "速", "逗", "栗", "贾", "酌", "配", "翅", "辱", "唇", "夏", "砸", "砰", "砾", "础", "破", "原", "套", "逐", "烈", "殊", "殉", "顾", "轿", "较", "顿", "毙", "致", "柴", "桌", "虑", "监", "紧", "党", "逞", "晒", "眠", "晓", "哮", "唠", "鸭", "晃", "哺", "晌", "剔", "晕", "蚌", "畔", "蚣", "蚊", "蚪", "蚓", "哨", "哩", "圃", "哭", "哦", "恩", "鸯", "唤", "唁", "哼", "唧", "啊", "唉", "唆", "罢", "峭", "峨", "峰", "圆", "峻", "贼", "贿", "赂", "赃", "钱", "钳", "钻", "钾", "铁", "铃", "铅", "缺", "氧", "氨", "特", "牺", "造", "乘", "敌", "秤", "租", "积", "秧", "秩", "称", "秘", "透", "笔", "笑", "笋", "债", "借", "值", "倚", "俺", "倾", "倒", "倘", "俱", "倡", "候", "赁", "俯", "倍", "倦", "健", "臭", "射", "躬", "息", "倔", "徒", "徐", "殷", "舰", "舱", "般", "航", "途", "拿", "耸", "爹", "舀", "爱", "豺", "豹", "颁", "颂", "翁", "胰", "脆", "脂", "胸", "胳", "脏", "脐", "胶", "脑", "脓", "逛", "狸", "狼", "卿", "逢", "鸵", "留", "鸳", "皱", "饿", "馁", "凌", "凄", "恋", "桨", "浆", "衰", "衷", "高", "郭", "席", "准", "座", "症", "病", "疾", "斋", "疹", "疼", "疲", "脊", "效", "离", "紊", "唐", "瓷", "资", "凉", "站", "剖", "竞", "部", "旁", "旅", "畜", "阅", "羞", "羔", "瓶", "拳", "粉", "料", "益", "兼", "烤", "烘", "烦", "烧", "烛", "烟", "烙", "递", "涛", "浙", "涝", "浦", "酒", "涉", "消", "涡", "浩", "海", "涂", "浴", "浮", "涣", "涤", "流", "润", "涧", "涕", "浪", "浸", "涨", "烫", "涩", "涌", "悖", "悟", "悄", "悍", "悔", "悯", "悦", "害", "宽", "家", "宵", "宴", "宾", "窍", "窄", "容", "宰", "案", "请", "朗", "诸", "诺", "读", "扇", "诽", "袜", "袖", "袍", "被", "祥", "课", "冥", "谁", "调", "冤", "谅", "谆", "谈", "谊", "剥", "恳", "展", "剧", "屑", "弱", "陵", "祟", "陶", "陷", "陪", "娱", "娟", "恕", "娥", "娘", "通", "能", "难", "预", "桑", "绢", "绣", "验", "继", "骏", "球", "琐", "理", "琉", "琅", "捧", "堵", "措", "描", "域", "捺", "掩", "捷", "排", "焉", "掉", "捶", "赦", "堆", "推", "埠", "掀", "授", "捻", "教", "掏", "掐", "掠", "掂", "培", "接", "掷", "控", "探", "据", "掘", "掺", "职", "基", "聆", "勘", "聊", "娶", "著", "菱", "勒", "黄", "菲", "萌", "萝", "菌", "萎", "菜", "萄", "菊", "菩", "萍", "菠", "萤", "营", "乾", "萧", "萨", "菇", "械", "彬", "梦", "婪", "梗", "梧", "梢", "梅", "检", "梳", "梯", "桶", "梭", "救", "曹", "副", "票", "酝", "酗", "厢", "戚", "硅", 
"硕", "奢", "盔", "爽", "聋", "袭", "盛", "匾", "雪", "辅", "辆", "颅", "虚", "彪", "雀", "堂", "常", "眶", "匙", "晨", "睁", "眯", "眼", "悬", "野", "啪", "啦", "曼", "晦", "晚", "啄", "啡", "距", "趾", "啃", "跃", "略", "蚯", "蛀", "蛇", "唬", "累", "鄂", "唱", "患", "啰", "唾", "唯", "啤", "啥", "啸", "崖", "崎", "崭", "逻", "崔", "帷", "崩", "崇", "崛", "婴", "圈", "铐", "铛", "铝", "铜", "铭", "铲", "银", "矫", "甜", "秸", "梨", "犁", "秽", "移", "笨", "笼", "笛", "笙", "符", "第", "敏", "做", "袋", "悠", "偿", "偶", "偎", "偷", "您", "售", "停", "偏", "躯", "兜", "假", "衅", "徘", "徙", "得", "衔", "盘", "舶", "船", "舵", "斜", "盒", "鸽", "敛", "悉", "欲", "彩", "领", "脚", "脖", "脯", "豚", "脸", "脱", "象", "够", "逸", "猜", "猪", "猎", "猫", "凰", "猖", "猛", "祭", "馅", "馆", "凑", "减", "毫", "烹", "庶", "麻", "庵", "痊", "痒", "痕", "廊", "康", "庸", "鹿", "盗", "章", "竟", "商", "族", "旋", "望", "率", "阎", "阐", "着", "羚", "盖", "眷", "粘", "粗", "粒", "断", "剪", "兽", "焊", "焕", "清", "添", "鸿", "淋", "涯", "淹", "渠", "渐", "淑", "淌", "混", "淮", "淆", "渊", "淫", "渔", "淘", "淳", "液", "淤", "淡", "淀", "深", "涮", "涵", "婆", "梁", "渗", "情", "惜", "惭", "悼", "惧", "惕", "惟", "惊", "惦", "悴", "惋", "惨", "惯", "寇", "寅", "寄", "寂", "宿", "窒", "窑", "密", "谋", "谍", "谎", "谐", "袱", "祷", "祸", "谓", "谚", "谜", "逮", "敢", "尉", "屠", "弹", "隋", "堕", "随", "蛋", "隅", "隆", "隐", "婚", "婶", "婉", "颇", "颈", "绩", "绪", "续", "骑", "绰", "绳", "维", "绵", "绷", "绸", "综", "绽", "绿", "缀", "巢", "琴", "琳", "琢", "琼", "斑", "替", "揍", "款", "堪", "塔", "搭", "堰", "揩", "越", "趁", "趋", "超", "揽", "堤", "提", "博", "揭", "喜", "彭", "揣", "插", "揪", "搜", "煮", "援", "搀", "裁", "搁", "搓", "搂", "搅", "壹", "握", "搔", "揉", "斯", "期", "欺", "联", "葫", "散", "惹", "葬", "募", "葛", "董", "葡", "敬", "葱", "蒋", "蒂", "落", "韩", "朝", "辜", "葵", "棒", "棱", "棋", "椰", "植", "森", "焚", "椅", "椒", "棵", "棍", "椎", "棉", "棚", "棕", "棺", "榔", "椭", "惠", "惑", "逼", "粟", "棘", "酣", "酥", "厨", "厦", "硬", "硝", "确", "硫", "雁", "殖", "裂", "雄", "颊", "雳", "暂", "雅", "翘", "辈", "悲", "紫", "凿", "辉", "敞", "棠", "赏", "掌", "晴", "睐", "暑", "最", "晰", "量", "鼎", "喷", "喳", "晶", "喇", "遇", "喊", "遏", "晾", "景", "畴", "践", "跋", "跌", "跑", "跛", "遗", "蛙", "蛛", "蜓", "蜒", "蛤", "喝", "鹃", 
"喂", "喘", "喉", "喻", "啼", "喧", "嵌", "幅", "帽", "赋", "赌", "赎", "赐", "赔", "黑", "铸", "铺", "链", "销", "锁", "锄", "锅", "锈", "锋", "锌", "锐", "甥", "掰", "短", "智", "氮", "毯", "氯", "鹅", "剩", "稍", "程", "稀", "税", "筐", "等", "筑", "策", "筛", "筒", "筏", "答", "筋", "筝", "傲", "傅", "牌", "堡", "集", "焦", "傍", "储", "皓", "皖", "粤", "奥", "街", "惩", "御", "循", "艇", "舒", "逾", "番", "释", "禽", "腊", "脾", "腋", "腔", "腕", "鲁", "猩", "猬", "猾", "猴", "惫", "然", "馈", "馋", "装", "蛮", "就", "敦", "斌", "痘", "痢", "痪", "痛", "童", "竣", "阔", "善", "翔", "羡", "普", "粪", "尊", "奠", "道", "遂", "曾", "焰", "港", "滞", "湖", "湘", "渣", "渤", "渺", "湿", "温", "渴", "溃", "溅", "滑", "湃", "渝", "湾", "渡", "游", "滋", "渲", "溉", "愤", "慌", "惰", "愕", "愣", "惶", "愧", "愉", "慨", "割", "寒", "富", "寓", "窜", "窝", "窖", "窗", "窘", "遍", "雇", "裕", "裤", "裙", "禅", "禄", "谢", "谣", "谤", "谦", "犀", "属", "屡", "强", "粥", "疏", "隔", "隙", "隘", "媒", "絮", "嫂", "媚", "婿", "登", "缅", "缆", "缉", "缎", "缓", "缔", "缕", "骗", "编", "骚", "缘", "瑟", "鹉", "瑞", "瑰", "瑙", "魂", "肆", "摄", "摸", "填", "搏", "塌", "鼓", "摆", "携", "搬", "摇", "搞", "塘", "摊", "聘", "斟", "蒜", "勤", "靴", "靶", "鹊", "蓝", "墓", "幕", "蓬", "蓄", "蒲", "蓉", "蒙", "蒸", "献", "椿", "禁", "楚", "楷", "榄", "想", "槐", "榆", "楼", "概", "赖", "酪", "酬", "感", "碍", "碘", "碑", "碎", "碰", "碗", "碌", "尴", "雷", "零", "雾", "雹", "辐", "辑", "输", "督", "频", "龄", "鉴", "睛", "睹", "睦", "瞄", "睫", "睡", "睬", "嗜", "鄙", "嗦", "愚", "暖", "盟", "歇", "暗", "暇", "照", "畸", "跨", "跷", "跳", "跺", "跪", "路", "跤", "跟", "遣", "蜈", "蜗", "蛾", "蜂", "蜕", "嗅", "嗡", "嗓", "署", "置", "罪", "罩", "蜀", "幌", "错", "锚", "锡", "锣", "锤", "锥", "锦", "键", "锯", "锰", "矮", "辞", "稚", "稠", "颓", "愁", "筹", "签", "简", "筷", "毁", "舅", "鼠", "催", "傻", "像", "躲", "魁", "衙", "微", "愈", "遥", "腻", "腰", "腥", "腮", "腹", "腺", "鹏", "腾", "腿", "鲍", "猿", "颖", "触", "解", "煞", "雏", "馍", "馏", "酱", "禀", "痹", "廓", "痴", "痰", "廉", "靖", "新", "韵", "意", "誊", "粮", "数", "煎", "塑", "慈", "煤", "煌", "满", "漠", "滇", "源", "滤", "滥", "滔", "溪", "溜", "漓", "滚", "溢", "溯", "滨", "溶", "溺", "粱", "滩", "慎", "誉", "塞", "寞", "窥", "窟", "寝", "谨", "褂", "裸", "福", "谬", "群", "殿", "辟", "障", "媳", "嫉", 
"嫌", "嫁", "叠", "缚", "缝", "缠", "缤", "剿", "静", "碧", "璃", "赘", "熬", "墙", "墟", "嘉", "摧", "赫", "截", "誓", "境", "摘", "摔", "撇", "聚", "慕", "暮", "摹", "蔓", "蔑", "蔡", "蔗", "蔽", "蔼", "熙", "蔚", "兢", "模", "槛", "榴", "榜", "榨", "榕", "歌", "遭", "酵", "酷", "酿", "酸", "碟", "碱", "碳", "磁", "愿", "需", "辖", "辗", "雌", "裳", "颗", "瞅", "墅", "嗽", "踊", "蜻", "蜡", "蝇", "蜘", "蝉", "嘛", "嘀", "赚", "锹", "锻", "镀", "舞", "舔", "稳", "熏", "箕", "算", "箩", "管", "箫", "舆", "僚", "僧", "鼻", "魄", "魅", "貌", "膜", "膊", "膀", "鲜", "疑", "孵", "馒", "裹", "敲", "豪", "膏", "遮", "腐", "瘩", "瘟", "瘦", "辣", "彰", "竭", "端", "旗", "精", "粹", "歉", "弊", "熄", "熔", "煽", "潇", "漆", "漱", "漂", "漫", "滴", "漾", "演", "漏", "慢", "慷", "寨", "赛", "寡", "察", "蜜", "寥", "谭", "肇", "褐", "褪", "谱", "隧", "嫩", "翠", "熊", "凳", "骡", "缩", "慧", "撵", "撕", "撒", "撩", "趣", "趟", "撑", "撮", "撬", "播", "擒", "墩", "撞", "撤", "增", "撰", "聪", "鞋", "鞍", "蕉", "蕊", "蔬", "蕴", "横", "槽", "樱", "橡", "樟", "橄", "敷", "豌", "飘", "醋", "醇", "醉", "磕", "磊", "磅", "碾", "震", "霄", "霉", "瞒", "题", "暴", "瞎", "嘻", "嘶", "嘲", "嘹", "影", "踢", "踏", "踩", "踪", "蝶", "蝴", "蝠", "蝎", "蝌", "蝗", "蝙", "嘿", "嘱", "幢", "墨", "镇", "镐", "镑", "靠", "稽", "稻", "黎", "稿", "稼", "箱", "篓", "箭", "篇", "僵", "躺", "僻", "德", "艘", "膝", "膛", "鲤", "鲫", "熟", "摩", "褒", "瘪", "瘤", "瘫", "凛", "颜", "毅", "糊", "遵", "憋", "潜", "澎", "潮", "潭", "鲨", "澳", "潘", "澈", "澜", "澄", "懂", "憔", "懊", "憎", "额", "翩", "褥", "谴", "鹤", "憨", "慰", "劈", "履", "豫", "缭", "撼", "擂", "操", "擅", "燕", "蕾", "薯", "薛", "薇", "擎", "薪", "薄", "颠", "翰", "噩", "橱", "橙", "橘", "整", "融", "瓢", "醒", "霍", "霎", "辙", "冀", "餐", "嘴", "踱", "蹄", "蹂", "蟆", "螃", "器", "噪", "鹦", "赠", "默", "黔", "镜", "赞", "穆", "篮", "篡", "篷", "篱", "儒", "邀", "衡", "膨", "雕", "鲸", "磨", "瘾", "瘸", "凝", "辨", "辩", "糙", "糖", "糕", "燃", "濒", "澡", "激", "懒", "憾", "懈", "窿", "壁", "避", "缰", "缴", "戴", "擦", "藉", "鞠", "藏", "藐", "檬", "檐", "檀", "礁", "磷", "霜", "霞", "瞭", "瞧", "瞬", "瞳", "瞩", "瞪", "曙", "蹋", "蹈", "螺", "蟋", "蟀", "嚎", "赡", "穗", "魏", "簧", "簇", "繁", "徽", "爵", "朦", "臊", "鳄", "癌", "辫", "赢", "糟", "糠", "燥", "懦", "豁", "臀", "臂", "翼", "骤", "藕", "鞭", "藤", "覆", 
"瞻", "蹦", "嚣", "镰", "翻", "鳍", "鹰", "瀑", "襟", "璧", "戳", "孽", "警", "蘑", "藻", "攀", "曝", "蹲", "蹭", "蹬", "巅", "簸", "簿", "蟹", "颤", "靡", "癣", "瓣", "羹", "鳖", "爆", "疆", "鬓", "壤", "馨", "耀", "躁", "蠕", "嚼", "嚷", "巍", "籍", "鳞", "魔", "糯", "灌", "譬", "蠢", "霸", "露", "霹", "躏", "黯", "髓", "赣", "囊", "镶", "瓤", "罐", "矗", "乂", "乜", "兀", "弋", "孑", "孓", "幺", "亓", "韦", "廿", "丏", "卅", "仄", "厄", "仃", "仉", "仂", "兮", "刈", "爻", "卞", "闩", "讣", "尹", "夬", "爿", "毋", "邗", "邛", "艽", "艿", "札", "叵", "匝", "丕", "匜", "劢", "卟", "叱", "叻", "仨", "仕", "仟", "仡", "仫", "仞", "卮", "氐", "犰", "刍", "邝", "邙", "汀", "讦", "讧", "讪", "讫", "尻", "阡", "尕", "弁", "驭", "匡", "耒", "玎", "玑", "邢", "圩", "圬", "圭", "扦", "圪", "圳", "圹", "扪", "圮", "圯", "芊", "芍", "芄", "芨", "芑", "芎", "芗", "亘", "厍", "夼", "戍", "尥", "乩", "旯", "曳", "岌", "屺", "凼", "囡", "钇", "缶", "氘", "氖", "牝", "伎", "伛", "伢", "佤", "仵", "伥", "伧", "伉", "伫", "囟", "汆", "刖", "夙", "旮", "刎", "犷", "犸", "舛", "凫", "邬", "饧", "汕", "汔", "汐", "汲", "汜", "汊", "忖", "忏", "讴", "讵", "祁", "讷", "聿", "艮", "厾", "阱", "阮", "阪", "丞", "妁", "牟", "纡", "纣", "纥", "纨", "玕", "玙", "抟", "抔", "圻", "坂", "坍", "坞", "抃", "抉", "㧐", "芫", "邯", "芸", "芾", "苈", "苣", "芷", "芮", "苋", "芼", "苌", "苁", "芩", "芪", "芡", "芟", "苄", "苎", "苡", "杌", "杓", "杞", "杈", "忑", "孛", "邴", "邳", "矶", "奁", "豕", "忒", "欤", "轫", "迓", "邶", "忐", "卣", "邺", "旰", "呋", "呒", "呓", "呔", "呖", "呃", "旸", "吡", "町", "虬", "呗", "吽", "吣", "吲", "帏", "岐", "岈", "岘", "岑", "岚", "兕", "囵", "囫", "钊", "钋", "钌", "迕", "氙", "氚", "牤", "佞", "邱", "攸", "佚", "佝", "佟", "佗", "伽", "彷", "佘", "佥", "孚", "豸", "坌", "肟", "邸", "奂", "劬", "狄", "狁", "鸠", "邹", "饨", "饩", "饪", "饫", "饬", "亨", "庑", "庋", "疔", "疖", "肓", "闱", "闳", "闵", "羌", "炀", "沣", "沅", "沔", "沤", "沌", "沏", "沚", "汩", "汨", "沂", "汾", "沨", "汴", "汶", "沆", "沩", "泐", "怃", "怄", "忡", "忤", "忾", "怅", "忻", "忪", "怆", "忭", "忸", "诂", "诃", "诅", "诋", "诌", "诏", "诒", "孜", "陇", "陀", "陂", "陉", "妍", "妩", "妪", "妣", "妊", "妗", "妫", "妞", "姒", "妤", "邵", "劭", "刭", "甬", "邰", "纭", "纰", "纴", "纶", "纾", "玮", "玡", "玭", "玠", "玢", "玥", "玦", "盂", "忝", "匦", "坩", "抨", "拤", "坫", "拈", 
"垆", "抻", "劼", "拃", "拊", "坼", "坻", "㧟", "坨", "坭", "抿", "坳", "耶", "苷", "苯", "苤", "茏", "苫", "苜", "苴", "苒", "苘", "茌", "苻", "苓", "茚", "茆", "茑", "茓", "茔", "茕", "茀", "苕", "枥", "枇", "杪", "杳", "枧", "杵", "枨", "枞", "枋", "杻", "杷", "杼", "矸", "砀", "刳", "奄", "瓯", "殁", "郏", "轭", "郅", "鸢", "盱", "昊", "昙", "杲", "昃", "咂", "呸", "昕", "昀", "旻", "昉", "炅", "咔", "畀", "虮", "咀", "呷", "黾", "呱", "呤", "咚", "咆", "咛", "呶", "呣", "呦", "咝", "岢", "岿", "岬", "岫", "帙", "岣", "峁", "刿", "迥", "岷", "剀", "帔", "峄", "沓", "囹", "罔", "钍", "钎", "钏", "钒", "钕", "钗", "邾", "迮", "牦", "竺", "迤", "佶", "佬", "佰", "侑", "侉", "臾", "岱", "侗", "侃", "侏", "侩", "佻", "佾", "侪", "佼", "佯", "侬", "帛", "阜", "侔", "徂", "刽", "郄", "怂", "籴", "瓮", "戗", "肼", "䏝", "肽", "肱", "肫", "剁", "迩", "郇", "狙", "狎", "狍", "狒", "咎", "炙", "枭", "饯", "饴", "冽", "冼", "庖", "疠", "疝", "疡", "兖", "妾", "劾", "炜", "𬉼", "炖", "炘", "炝", "炔", "泔", "沭", "泷", "泸", "泱", "泅", "泗", "泠", "泺", "泖", "泫", "泮", "沱", "泯", "泓", "泾", "怙", "怵", "怦", "怛", "怏", "怍", "㤘", "怩", "怫", "怿", "宕", "穹", "宓", "诓", "诔", "诖", "诘", "戾", "诙", "戽", "郓", "衩", "祆", "祎", "祉", "祇", "诛", "诜", "诟", "诠", "诣", "诤", "诧", "诨", "诩", "戕", "孢", "亟", "陔", "妲", "妯", "姗", "帑", "弩", "孥", "驽", "虱", "迦", "迨", "绀", "绁", "绂", "驷", "驸", "绉", "绌", "驿", "骀", "甾", "珏", "珐", "珂", "珑", "玳", "珀", "顸", "珉", "珈", "拮", "垭", "挝", "垣", "挞", "垤", "赳", "贲", "垱", "垌", "郝", "垧", "垓", "挦", "垠", "茜", "荚", "荑", "贳", "荜", "莒", "茼", "茴", "茱", "莛", "荞", "茯", "荏", "荇", "荃", "荟", "荀", "茗", "荠", "茭", "茨", "垩", "荥", "荦", "荨", "荩", "剋", "荪", "茹", "荬", "荮", "柰", "栉", "柯", "柘", "栊", "柩", "枰", "栌", "柙", "枵", "柚", "枳", "柞", "柝", "栀", "柢", "栎", "枸", "柈", "柁", "枷", "柽", "剌", "酊", "郦", "甭", "砗", "砘", "砒", "斫", "砭", "砜", "奎", "耷", "虺", "殂", "殇", "殄", "殆", "轱", "轲", "轳", "轶", "轸", "虿", "毖", "觇", "尜", "哐", "眄", "眍", "𠳐", "郢", "眇", "眊", "眈", "禺", "哂", "咴", "曷", "昴", "昱", "昵", "咦", "哓", "哔", "畎", "毗", "呲", "胄", "畋", "畈", "虼", "虻", "盅", "咣", "哕", "剐", "郧", "咻", "囿", "咿", "哌", "哙", "哚", "咯", "咩", "咤", "哝", "哏", "哞", "峙", "峣", "罘", "帧", "峒", "峤", "峋", "峥", "贶", "钚", "钛", 
"钡", "钣", "钤", "钨", "钫", "钯", "氡", "氟", "牯", "郜", "秕", "秭", "竽", "笈", "笃", "俦", "俨", "俅", "俪", "叟", "垡", "牮", "俣", "俚", "皈", "俑", "俟", "逅", "徇", "徉", "舢", "俞", "郗", "俎", "郤", "爰", "郛", "瓴", "胨", "胪", "胛", "胂", "胙", "胍", "胗", "胝", "朐", "胫", "鸨", "匍", "狨", "狯", "飑", "狩", "狲", "訇", "逄", "昝", "饷", "饸", "饹", "胤", "孪", "娈", "弈", "奕", "庥", "疬", "疣", "疥", "疭", "庠", "竑", "彦", "飒", "闼", "闾", "闿", "阂", "羑", "迸", "籼", "酋", "炳", "炻", "炽", "炯", "烀", "炷", "烃", "洱", "洹", "洧", "洌", "浃", "洇", "洄", "洙", "涎", "洎", "洫", "浍", "洮", "洵", "浒", "浔", "浕", "洳", "恸", "恓", "恹", "恫", "恺", "恻", "恂", "恪", "恽", "宥", "扃", "衲", "衽", "衿", "袂", "祛", "祜", "祓", "祚", "诮", "祗", "祢", "诰", "诳", "鸩", "昶", "郡", "咫", "弭", "牁", "胥", "陛", "陟", "娅", "姮", "娆", "姝", "姣", "姘", "姹", "怼", "羿", "炱", "矜", "绔", "骁", "骅", "绗", "绛", "骈", "耖", "挈", "珥", "珙", "顼", "珰", "珩", "珧", "珣", "珞", "琤", "珲", "敖", "恚", "埔", "埕", "埘", "埙", "埚", "挹", "耆", "耄", "埒", "捋", "贽", "垸", "捃", "盍", "荸", "莆", "莳", "莴", "莪", "莠", "莓", "莜", "莅", "荼", "莩", "荽", "莸", "荻", "莘", "莎", "莞", "莨", "渇", "鸪", "莼", "栲", "栳", "郴", "桓", "桡", "桎", "桢", "桤", "梃", "栝", "桕", "桁", "桧", "桅", "栟", "桉", "栩", "逑", "逋", "彧", "鬲", "豇", "酐", "逦", "厝", "孬", "砝", "砹", "砺", "砧", "砷", "砟", "砼", "砥", "砣", "剞", "砻", "轼", "轾", "辂", "鸫", "趸", "龀", "鸬", "虔", "逍", "眬", "唛", "晟", "眩", "眙", "哧", "哽", "唔", "晁", "晏", "鸮", "趵", "趿", "畛", "蚨", "蚜", "蚍", "蚋", "蚬", "蚝", "蚧", "唢", "圄", "唣", "唏", "盎", "唑", "崂", "崃", "罡", "罟", "峪", "觊", "赅", "钰", "钲", "钴", "钵", "钹", "钺", "钽", "钼", "钿", "铀", "铂", "铄", "铆", "铈", "铉", "铊", "铋", "铌", "铍", "䥽", "铎", "氩", "氤", "氦", "毪", "舐", "秣", "秫", "盉", "笄", "笕", "笊", "笏", "笆", "俸", "倩", "俵", "偌", "俳", "俶", "倬", "倏", "恁", "倭", "倪", "俾", "倜", "隼", "隽", "倌", "倥", "臬", "皋", "郫", "倨", "衄", "颀", "徕", "舫", "釜", "奚", "衾", "胯", "胱", "胴", "胭", "脍", "胼", "朕", "脒", "胺", "鸱", "玺", "鸲", "狷", "猁", "狳", "猃", "狺", "逖", "桀", "袅", "饽", "凇", "栾", "挛", "亳", "疳", "疴", "疸", "疽", "痈", "疱", "痂", "痉", "衮", "凋", "颃", "恣", "旆", "旄", "旃", "阃", "阄", "訚", "阆", "恙", "粑", "朔", "郸", "烜", "烨", "烩", 
"烊", "剡", "郯", "烬", "涑", "浯", "涞", "涟", "娑", "涅", "涠", "浞", "涓", "浥", "涔", "浜", "浠", "浣", "浚", "悚", "悭", "悝", "悒", "悌", "悛", "宸", "窈", "剜", "诹", "冢", "诼", "袒", "袢", "祯", "诿", "谀", "谂", "谄", "谇", "屐", "屙", "陬", "勐", "奘", "牂", "蚩", "陲", "姬", "娠", "娌", "娉", "娲", "娩", "娴", "娣", "娓", "婀", "畚", "逡", "绠", "骊", "绡", "骋", "绥", "绦", "绨", "骎", "邕", "鸶", "彗", "耜", "焘", "舂", "琏", "琇", "麸", "揶", "埴", "埯", "捯", "掳", "掴", "埸", "埵", "赧", "埤", "捭", "逵", "埝", "堋", "堍", "掬", "鸷", "掖", "捽", "掊", "堉", "掸", "捩", "掮", "悫", "埭", "埽", "掇", "掼", "聃", "菁", "萁", "菘", "堇", "萘", "萋", "菽", "菖", "萜", "萸", "萑", "棻", "菔", "菟", "萏", "萃", "菏", "菹", "菪", "菅", "菀", "萦", "菰", "菡", "梵", "梿", "梏", "觋", "桴", "桷", "梓", "棁", "桫", "棂", "啬", "郾", "匮", "敕", "豉", "鄄", "酞", "酚", "戛", "硎", "硭", "硒", "硖", "硗", "硐", "硇", "硌", "鸸", "瓠", "匏", "厩", "龚", "殒", "殓", "殍", "赉", "雩", "辄", "堑", "眭", "眦", "啧", "晡", "晤", "眺", "眵", "眸", "圊", "喏", "喵", "啉", "勖", "晞", "唵", "晗", "冕", "啭", "畦", "趺", "啮", "跄", "蚶", "蛄", "蛎", "蛆", "蚰", "蛊", "圉", "蚱", "蛉", "蛏", "蚴", "啁", "啕", "唿", "啐", "唼", "唷", "啖", "啵", "啶", "啷", "唳", "唰", "啜", "帻", "崚", "崦", "帼", "崮", "崤", "崆", "赇", "赈", "赊", "铑", "铒", "铗", "铙", "铟", "铠", "铡", "铢", "铣", "铤", "铧", "铨", "铩", "铪", "铫", "铬", "铮", "铯", "铰", "铱", "铳", "铵", "铷", "氪", "牾", "鸹", "秾", "逶", "笺", "筇", "笸", "笪", "笮", "笠", "笥", "笤", "笳", "笾", "笞", "偾", "偃", "偕", "偈", "傀", "偬", "偻", "皑", "皎", "鸻", "徜", "舸", "舻", "舴", "舷", "龛", "翎", "脬", "脘", "脲", "匐", "猗", "猡", "猞", "猝", "斛", "猕", "馗", "馃", "馄", "鸾", "孰", "庹", "庾", "痔", "痍", "疵", "翊", "旌", "旎", "袤", "阇", "阈", "阉", "阊", "阋", "阍", "阏", "羟", "粝", "粕", "敝", "焐", "烯", "焓", "烽", "焖", "烷", "焗", "渍", "渚", "淇", "淅", "淞", "渎", "涿", "淖", "挲", "淠", "涸", "渑", "淦", "淝", "淬", "涪", "淙", "涫", "渌", "淄", "惬", "悻", "悱", "惝", "惘", "悸", "惆", "惚", "惇", "惮", "窕", "谌", "谏", "扈", "皲", "谑", "裆", "袷", "裉", "谒", "谔", "谕", "谖", "谗", "谙", "谛", "谝", "逯", "郿", "隈", "粜", "隍", "隗", "婧", "婊", "婕", "娼", "婢", "婵", "胬", "袈", "翌", "恿", "欸", "绫", "骐", "绮", "绯", "绱", "骒", "绲", "骓", "绶", "绺", "绻", "绾", "骖", 
"缁", "耠", "琫", "琵", "琶", "琪", "瑛", "琦", "琥", "琨", "靓", "琰", "琮", "琯", "琬", "琛", "琚", "辇", "鼋", "揳", "堞", "搽", "揸", "揠", "堙", "趄", "揖", "颉", "塄", "揿", "耋", "揄", "蛩", "蛰", "塆", "摒", "揆", "掾", "聒", "葑", "葚", "靰", "靸", "葳", "葺", "葸", "萼", "葆", "葩", "葶", "蒌", "萱", "戟", "葭", "楮", "棼", "椟", "棹", "椤", "棰", "赍", "椋", "椁", "椪", "棣", "椐", "鹁", "覃", "酤", "酢", "酡", "鹂", "厥", "殚", "殛", "雯", "雱", "辊", "辋", "椠", "辍", "辎", "斐", "睄", "睑", "睇", "睃", "戢", "喋", "嗒", "喃", "喱", "喹", "晷", "喈", "跖", "跗", "跞", "跚", "跎", "跏", "跆", "蛱", "蛲", "蛭", "蛳", "蛐", "蛔", "蛞", "蛴", "蛟", "蛘", "喁", "喟", "啾", "嗖", "喑", "嗟", "喽", "嗞", "喀", "喔", "喙", "嵘", "嵖", "崴", "遄", "詈", "嵎", "崽", "嵬", "嵛", "嵯", "嵝", "嵫", "幄", "嵋", "赕", "铻", "铼", "铿", "锃", "锂", "锆", "锇", "锉", "锏", "锑", "锒", "锔", "锕", "掣", "矬", "氰", "毳", "毽", "犊", "犄", "犋", "鹄", "犍", "嵇", "黍", "稃", "稂", "筚", "筵", "筌", "傣", "傈", "舄", "牍", "傥", "傧", "遑", "傩", "遁", "徨", "媭", "畲", "弑", "颌", "翕", "釉", "鹆", "舜", "貂", "腈", "腌", "腓", "腆", "腴", "腑", "腚", "腱", "鱿", "鲀", "鲂", "颍", "猢", "猹", "猥", "飓", "觞", "觚", "猱", "颎", "飧", "馇", "馊", "亵", "脔", "裒", "痣", "痨", "痦", "痞", "痤", "痫", "痧", "赓", "竦", "瓿", "啻", "颏", "鹇", "阑", "阒", "阕", "粞", "遒", "孳", "焯", "焜", "焙", "焱", "鹈", "湛", "渫", "湮", "湎", "湜", "渭", "湍", "湫", "溲", "湟", "溆", "湲", "湔", "湉", "渥", "湄", "滁", "愠", "惺", "愦", "惴", "愀", "愎", "愔", "喾", "寐", "谟", "扉", "裢", "裎", "裥", "祾", "祺", "谠", "幂", "谡", "谥", "谧", "遐", "孱", "弼", "巽", "骘", "媪", "媛", "婷", "巯", "翚", "皴", "婺", "骛", "缂", "缃", "缄", "彘", "缇", "缈", "缌", "缑", "缒", "缗", "飨", "耢", "瑚", "瑁", "瑜", "瑗", "瑄", "瑕", "遨", "骜", "韫", "髡", "塬", "鄢", "趔", "趑", "摅", "摁", "蜇", "搋", "搪", "搐", "搛", "搠", "摈", "彀", "毂", "搦", "搡", "蓁", "戡", "蓍", "鄞", "靳", "蓐", "蓦", "鹋", "蒽", "蓓", "蓖", "蓊", "蒯", "蓟", "蓑", "蒿", "蒺", "蓠", "蒟", "蒡", "蒹", "蒴", "蒗", "蓥", "颐", "楔", "楠", "楂", "楝", "楫", "楸", "椴", "槌", "楯", "皙", "榈", "槎", "榉", "楦", "楣", "楹", "椽", "裘", "剽", "甄", "酮", "酰", "酯", "酩", "蜃", "碛", "碓", "硼", "碉", "碚", "碇", "碜", "鹌", "辏", "龃", "龅", "訾", "粲", "虞", "睚", "嗪", "韪", "嗷", "嗉", "睨", "睢", "雎", "睥", 
"嘟", "嗑", "嗫", "嗬", "嗔", "嗝", "戥", "嗄", "煦", "暄", "遢", "暌", "跬", "跶", "跸", "跐", "跣", "跹", "跻", "蛸", "蜊", "蜍", "蜉", "蜣", "畹", "蛹", "嗣", "嗯", "嗥", "嗲", "嗳", "嗌", "嗍", "嗨", "嗐", "嗤", "嗵", "罨", "嵊", "嵩", "嵴", "骰", "锗", "锛", "锜", "锝", "锞", "锟", "锢", "锨", "锩", "锭", "锱", "雉", "氲", "犏", "歃", "稞", "稗", "稔", "筠", "筢", "筮", "筲", "筱", "牒", "煲", "敫", "徭", "愆", "艄", "觎", "毹", "貊", "貅", "貉", "颔", "腠", "腩", "腼", "腭", "腧", "塍", "媵", "詹", "鲅", "鲆", "鲇", "鲈", "稣", "鲋", "鲐", "肄", "鹐", "飕", "觥", "遛", "馐", "鹑", "亶", "瘃", "痱", "痼", "痿", "瘐", "瘁", "瘆", "麂", "裔", "歆", "旒", "雍", "阖", "阗", "阙", "羧", "豢", "粳", "猷", "煳", "煜", "煨", "煅", "煊", "煸", "煺", "滟", "溱", "溘", "漭", "滢", "溥", "溧", "溽", "裟", "溻", "溷", "滗", "滫", "溴", "滏", "滃", "滦", "溏", "滂", "滓", "溟", "滪", "愫", "慑", "慊", "鲎", "骞", "窦", "窠", "窣", "裱", "褚", "裨", "裾", "裰", "禊", "谩", "谪", "媾", "嫫", "媲", "嫒", "嫔", "媸", "缙", "缜", "缛", "辔", "骝", "缟", "缡", "缢", "缣", "骟", "耥", "璈", "瑶", "瑭", "獒", "觏", "慝", "嫠", "韬", "叆", "髦", "摽", "墁", "撂", "摞", "撄", "翥", "踅", "摭", "墉", "墒", "榖", "綦", "蔫", "蔷", "靺", "靼", "鞅", "靿", "甍", "蔸", "蔟", "蔺", "戬", "蕖", "蔻", "蓿", "斡", "鹕", "蓼", "榛", "榧", "榻", "榫", "榭", "槔", "榱", "槁", "槟", "槠", "榷", "僰", "酽", "酶", "酹", "厮", "碡", "碴", "碣", "碲", "磋", "臧", "豨", "殡", "霆", "霁", "辕", "蜚", "裴", "翡", "龇", "龈", "睿", "䁖", "睽", "嘞", "嘈", "嘌", "嘁", "嘎", "暧", "暝", "踌", "踉", "蜞", "蜥", "蜮", "蝈", "蜴", "蜱", "蜩", "蜷", "蜿", "螂", "蜢", "嘘", "嘡", "鹗", "嘣", "嘤", "嘚", "嗾", "嘧", "罴", "罱", "幔", "嶂", "幛", "赙", "罂", "骷", "骶", "鹘", "锲", "锴", "锶", "锷", "锸", "锵", "镁", "镂", "犒", "箐", "箦", "箧", "箍", "箸", "箬", "箅", "箪", "箔", "箜", "箢", "箓", "毓", "僖", "儆", "僳", "僭", "劁", "僮", "魃", "魆", "睾", "艋", "鄱", "膈", "膑", "鲑", "鲔", "鲚", "鲛", "鲟", "獐", "觫", "雒", "夤", "馑", "銮", "塾", "麽", "瘌", "瘊", "瘘", "瘙", "廖", "韶", "旖", "膂", "阚", "鄯", "鲞", "粿", "粼", "粽", "糁", "槊", "鹚", "熘", "熥", "潢", "漕", "滹", "漯", "漶", "潋", "潴", "漪", "漉", "漳", "漩", "澉", "潍", "慵", "搴", "窨", "寤", "綮", "谮", "褡", "褙", "褓", "褛", "褊", "谯", "谰", "谲", "暨", "屣", "鹛", "嫣", "嫱", "嫖", "嫦", "嫚", "嫘", "嫡", "鼐", "翟", "瞀", 
"鹜", "骠", "缥", "缦", "缧", "缨", "骢", "缪", "缫", "耦", "耧", "瑾", "璜", "璀", "璎", "璁", "璋", "璇", "奭", "髯", "髫", "撷", "撅", "赭", "撸", "鋆", "撙", "撺", "墀", "聩", "觐", "鞑", "蕙", "鞒", "蕈", "蕨", "蕤", "蕞", "蕺", "瞢", "蕃", "蕲", "赜", "槿", "樯", "槭", "樗", "樘", "樊", "槲", "醌", "醅", "靥", "魇", "餍", "磔", "磙", "霈", "辘", "龉", "龊", "觑", "瞌", "瞋", "瞑", "嘭", "噎", "噶", "颙", "暹", "噘", "踔", "踝", "踟", "踒", "踬", "踮", "踯", "踺", "踞", "蝽", "蝾", "蝻", "蝰", "蝮", "螋", "蝓", "蝣", "蝼", "噗", "嘬", "颚", "噍", "噢", "噙", "噜", "噌", "噔", "颛", "幞", "幡", "嶙", "嶝", "骺", "骼", "骸", "镊", "镉", "镌", "镍", "镏", "镒", "镓", "镔", "稷", "箴", "篑", "篁", "篌", "篆", "牖", "儋", "徵", "磐", "虢", "鹞", "膘", "滕", "鲠", "鲡", "鲢", "鲣", "鲥", "鲧", "鲩", "獗", "獠", "觯", "馓", "馔", "麾", "廛", "瘛", "瘼", "瘢", "瘠", "齑", "羯", "羰", "𥻗", "遴", "糌", "糍", "糅", "熜", "熵", "熠", "澍", "澌", "潸", "潦", "潲", "鋈", "潟", "潼", "潺", "憬", "憧", "寮", "窳", "谳", "褴", "褟", "褫", "谵", "熨", "屦", "嬉", "勰", "戮", "蝥", "缬", "缮", "缯", "骣", "畿", "耩", "耨", "耪", "璞", "璟", "靛", "璠", "璘", "聱", "螯", "髻", "髭", "髹", "擀", "熹", "甏", "擞", "縠", "磬", "颞", "蕻", "鞘", "颟", "薤", "薨", "檠", "薏", "薮", "薜", "薅", "樾", "橛", "橇", "樵", "檎", "橹", "樽", "樨", "橼", "墼", "橐", "翮", "醛", "醐", "醍", "醚", "磲", "赝", "飙", "殪", "霖", "霏", "霓", "錾", "辚", "臻", "遽", "氅", "瞟", "瞠", "瞰", "嚄", "嚆", "噤", "暾", "蹀", "踹", "踵", "踽", "蹉", "蹁", "螨", "蟒", "螈", "螅", "螭", "螠", "螟", "噱", "噬", "噫", "噻", "噼", "罹", "圜", "䦃", "镖", "镗", "镘", "镚", "镛", "镝", "镞", "镠", "氇", "氆", "憩", "穑", "篝", "篥", "篦", "篪", "篙", "盥", "劓", "翱", "魉", "魈", "徼", "歙", "膳", "膦", "膙", "鲮", "鲱", "鲲", "鲳", "鲴", "鲵", "鲷", "鲻", "獴", "獭", "獬", "邂", "鹧", "廨", "赟", "瘰", "廪", "瘿", "瘵", "瘴", "癃", "瘳", "斓", "麇", "麈", "嬴", "壅", "羲", "糗", "瞥", "甑", "燎", "燠", "燔", "燧", "濑", "濉", "潞", "澧", "澹", "澥", "澶", "濂", "褰", "寰", "窸", "褶", "禧", "嬖", "犟", "隰", "嬗", "颡", "缱", "缲", "缳", "璨", "璩", "璐", "璪", "螫", "擤", "壕", "觳", "罄", "擢", "薹", "鞡", "鞬", "薷", "薰", "藓", "藁", "檄", "檩", "懋", "醢", "翳", "礅", "磴", "鹩", "龋", "龌", "豳", "壑", "黻", "嚏", "嚅", "蹑", "蹒", "蹊", "蟥", "螬", "螵", "疃", "螳", "蟑", "嚓", "羁", "罽", "罾", 
"嶷", "黜", "黝", "髁", "髀", "镡", "镢", "镣", "镦", "镧", "镩", "镪", "镫", "罅", "黏", "簌", "篾", "篼", "簖", "簋", "鼢", "黛", "儡", "鹪", "鼾", "皤", "魍", "龠", "繇", "貘", "邈", "貔", "臌", "膻", "臆", "臃", "鲼", "鲽", "鳀", "鳃", "鳅", "鳇", "鳊", "螽", "燮", "鹫", "襄", "糜", "縻", "膺", "癍", "麋", "懑", "濡", "濮", "濞", "濠", "濯", "蹇", "謇", "邃", "襁", "檗", "擘", "孺", "隳", "嬷", "蟊", "鹬", "鍪", "鏊", "鳌", "鬈", "鬃", "瞽", "鞯", "鞨", "鞫", "鞧", "鞣", "藜", "藠", "藩", "醪", "蹙", "礓", "燹", "餮", "瞿", "曛", "颢", "曜", "躇", "蹚", "鹭", "蟛", "蟪", "蟠", "蟮", "鹮", "黠", "黟", "髅", "髂", "镬", "镭", "镯", "馥", "簟", "簪", "鼬", "雠", "艟", "鳎", "鳏", "鳐", "癞", "癔", "癜", "癖", "糨", "蹩", "鎏", "懵", "彝", "邋", "鬏", "攉", "攒", "鞲", "鞴", "藿", "蘧", "蘅", "麓", "醮", "醯", "酃", "霪", "霭", "霨", "黼", "嚯", "蹰", "蹶", "蹽", "蹼", "蹴", "蹾", "蹿", "蠖", "蠓", "蟾", "蠊", "黢", "髋", "髌", "镲", "籀", "籁", "齁", "魑", "艨", "鳓", "鳔", "鳕", "鳗", "鳙", "麒", "鏖", "羸", "㸆", "瀚", "瀣", "瀛", "襦", "谶", "襞", "骥", "缵", "瓒", "攘", "蘩", "蘖", "醴", "霰", "酆", "矍", "曦", "躅", "鼍", "巉", "黩", "黥", "黪", "镳", "镴", "黧", "纂", "璺", "鼯", "臜", "鳜", "鳝", "鳟", "獾", "孀", "骧", "瓘", "鼙", "醺", "礴", "颦", "曩", "鳢", "癫", "麝", "夔", "爝", "灏", "禳", "鐾", "羼", "蠡", "耱", "懿", "蘸", "鹳", "霾", "氍", "饕", "躐", "髑", "镵", "穰", "饔", "鬻", "鬟", "趱", "攫", "攥", "颧", "躜", "鼹", "癯", "麟", "蠲", "蠹", "躞", "衢", "鑫", "灞", "襻", "纛", "鬣", "攮", "囔", "馕", "戆", "爨", "齉", "亍", "尢", "彳", "卬", "殳", "𠙶", "毌", "邘", "戋", "圢", "氕", "伋", "仝", "冮", "氿", "汈", "氾", "忉", "宄", "讱", "扞", "圲", "圫", "芏", "芃", "朳", "朸", "𨙸", "邨", "吒", "吖", "屼", "屾", "辿", "钆", "仳", "伣", "伈", "癿", "甪", "邠", "犴", "冱", "邡", "闫", "汋", "䜣", "讻", "孖", "纩", "玒", "玓", "玘", "玚", "刬", "坜", "坉", "扽", "坋", "扺", "㧑", "毐", "芰", "芣", "苊", "苉", "芘", "芴", "芠", "芤", "杕", "杙", "杄", "杧", "杩", "尪", "尨", "轪", "坒", "芈", "旴", "旵", "呙", "㕮", "岍", "岠", "岜", "呇", "冏", "觃", "岙", "伾", "㑇", "伭", "佖", "伲", "佁", "飏", "狃", "闶", "汧", "汫", "𣲘", "𣲗", "沄", "沘", "汭", "㳇", "沇", "忮", "忳", "忺", "祃", "诇", "邲", "诎", "诐", "屃", "岊", "阽", "䢺", "阼", "妧", "妘", "𨚕", "纮", "驲", "纻", "纼", "玤", "玞", "玱", "玟", "邽", "邿", "坥", "坰", "坬", "坽", 
"弆", "耵", "䢼", "𦭜", "茋", "苧", "苾", "苠", "枅", "㭎", "枘", "枍", "矼", "矻", "匼", "旿", "昇", "昄", "昒", "昈", "咉", "咇", "咍", "岵", "岽", "岨", "岞", "峂", "㟃", "囷", "钐", "钔", "钖", "牥", "佴", "垈", "侁", "侹", "佸", "佺", "隹", "㑊", "侂", "佽", "侘", "郈", "舠", "郐", "郃", "攽", "肭", "肸", "肷", "狉", "狝", "饳", "忞", "於", "炌", "炆", "泙", "沺", "泂", "泜", "泃", "泇", "怊", "峃", "穸", "祋", "祊", "鸤", "弢", "弨", "陑", "陎", "卺", "乸", "妭", "姈", "迳", "叕", "驵", "䌹", "驺", "绋", "绐", "砉", "耔", "㛃", "玶", "珇", "珅", "珋", "玹", "珌", "玿", "韨", "垚", "垯", "垙", "垲", "埏", "垍", "耇", "垎", "垴", "垟", "垞", "挓", "垵", "垏", "拶", "荖", "荁", "荙", "荛", "茈", "茽", "荄", "茺", "荓", "茳", "𦰡", "茛", "荭", "㭕", "柷", "柃", "柊", "枹", "栐", "柖", "郚", "剅", "䴓", "迺", "厖", "砆", "砑", "砄", "耏", "奓", "䶮", "轵", "轷", "轹", "轺", "昺", "昽", "盷", "咡", "咺", "昳", "昣", "哒", "昤", "昫", "昡", "咥", "昪", "虷", "虸", "哃", "峘", "耑", "峛", "峗", "峧", "帡", "钘", "钜", "钪", "钬", "钭", "矧", "秬", "俫", "舁", "俜", "俙", "俍", "垕", "衎", "舣", "弇", "侴", "鸧", "䏡", "胠", "𦙶", "胈", "胩", "胣", "朏", "飐", "訄", "饻", "庤", "疢", "炣", "炟", "㶲", "洭", "洘", "洓", "洿", "㳚", "泚", "浈", "浉", "洸", "洑", "洢", "洈", "洚", "洺", "洨", "浐", "㳘", "洴", "洣", "恔", "宬", "窀", "扂", "袆", "祏", "祐", "祕", "叚", "陧", "陞", "娀", "姞", "姱", "姤", "姶", "姽", "枲", "绖", "骃", "彖", "骉", "恝", "珪", "珛", "珹", "琊", "玼", "珖", "珽", "珦", "珫", "珒", "珢", "珕", "珝", "埗", "垾", "垺", "埆", "垿", "埌", "埇", "莰", "茝", "鄀", "莶", "莝", "䓖", "莙", "栻", "桠", "桄", "梠", "栴", "梴", "栒", "酎", "酏", "砵", "砠", "砫", "砬", "硁", "恧", "翃", "郪", "𨐈", "辀", "辁", "剕", "赀", "哢", "晅", "晊", "唝", "哳", "哱", "冔", "晔", "晐", "晖", "畖", "蚄", "蚆", "帱", "崁", "峿", "崄", "帨", "崀", "赆", "钷", "眚", "甡", "笫", "倻", "倴", "脩", "倮", "倕", "倞", "倓", "倧", "衃", "虒", "舭", "舯", "舥", "瓞", "鬯", "鸰", "脎", "朓", "胲", "虓", "鱽", "狴", "峱", "狻", "眢", "勍", "痄", "疰", "痃", "竘", "羖", "羓", "桊", "敉", "烠", "烔", "烶", "烻", "涍", "浡", "浭", "浬", "涄", "涢", "涐", "浰", "浟", "浛", "浼", "浲", "涘", "悈", "悃", "悢", "宧", "窅", "窊", "窎", "扅", "扆", "袪", "袗", "袯", "祧", "隺", "堲", "疍", "𨺙", "陴", "烝", "砮", "㛚", "哿", "翀", "翂", "剟", "绤", "骍", "䂮", "琎", "珸", "珵", 
"琄", "琈", "琀", "珺", "掭", "堎", "堐", "埼", "掎", "埫", "堌", "晢", "掞", "埪", "壸", "㙍", "聍", "菝", "萚", "菥", "莿", "䓫", "勚", "䓬", "萆", "菂", "菍", "菼", "萣", "䓨", "菉", "䓛", "梼", "梽", "桲", "梾", "桯", "梣", "梌", "桹", "敔", "厣", "硔", "硙", "硚", "硊", "硍", "勔", "䴕", "龁", "逴", "唪", "啫", "翈", "㫰", "晙", "畤", "趼", "跂", "蛃", "蚲", "蚺", "啴", "䎃", "崧", "崟", "崞", "崒", "崌", "崡", "铏", "铕", "铖", "铘", "铚", "铞", "铥", "铴", "牻", "牿", "稆", "笱", "笯", "偰", "偡", "鸺", "偭", "偲", "偁", "㿠", "鄅", "偓", "徛", "衒", "舳", "舲", "鸼", "悆", "鄃", "瓻", "䝙", "脶", "脞", "脟", "䏲", "鱾", "猇", "猊", "猄", "觖", "𠅤", "庱", "庼", "庳", "痓", "䴔", "竫", "堃", "阌", "羝", "羕", "焆", "烺", "焌", "淏", "淟", "淜", "淴", "淯", "湴", "涴", "㥄", "惛", "惔", "悰", "惙", "寁", "逭", "袼", "裈", "祲", "谞", "艴", "弸", "弶", "隃", "婞", "娵", "婼", "媖", "婳", "婍", "婌", "婫", "婤", "婘", "婠", "绹", "骕", "絜", "珷", "琲", "琡", "琟", "琔", "琭", "堾", "堼", "揕", "㙘", "堧", "喆", "堨", "塅", "堠", "絷", "𡎚", "葜", "惎", "萳", "葙", "靬", "葴", "蒇", "蒈", "鄚", "蒉", "蓇", "萩", "蒐", "葰", "葎", "鄑", "蒎", "葖", "蒄", "萹", "棤", "棽", "棫", "椓", "椑", "鹀", "椆", "棓", "棬", "棪", "椀", "楗", "甦", "酦", "觌", "奡", "皕", "硪", "欹", "詟", "辌", "棐", "龂", "黹", "牚", "睎", "晫", "晪", "晱", "𧿹", "蛑", "畯", "斝", "喤", "崶", "嵁", "崾", "嵅", "崿", "嵚", "翙", "圌", "圐", "赑", "淼", "赒", "铹", "铽", "𨱇", "锊", "锍", "锎", "锓", "犇", "颋", "稌", "筀", "筘", "筜", "筥", "筅", "傃", "傉", "翛", "傒", "傕", "舾", "畬", "脿", "腘", "䐃", "腙", "腒", "鲃", "猰", "猯", "㺄", "馉", "鄗", "廋", "廆", "鄌", "粢", "遆", "旐", "焞", "欻", "𣸣", "溚", "溁", "湝", "渰", "湓", "㴔", "渟", "溠", "渼", "溇", "湣", "湑", "溞", "愐", "愃", "敩", "甯", "棨", "扊", "裣", "祼", "婻", "媆", "媞", "㛹", "媓", "媂", "媄", "毵", "矞", "缊", "缐", "骙", "瑃", "瑓", "瑅", "瑆", "䴖", "瑖", "瑝", "瑔", "瑀", "𤧛", "瑳", "瑂", "嶅", "瑑", "遘", "髢", "塥", "堽", "赪", "摛", "塝", "搒", "搌", "蒱", "蒨", "蓏", "蔀", "蓢", "蓂", "蒻", "蓣", "椹", "楪", "榃", "榅", "楒", "楞", "楩", "榇", "椸", "楙", "歅", "碃", "碏", "碈", "䃅", "硿", "鄠", "辒", "龆", "觜", "䣘", "暕", "鹍", "㬊", "暅", "跱", "蜐", "蜎", "嵲", "赗", "骱", "锖", "锘", "锳", "锧", "锪", "锫", "锬", "稑", "稙", "䅟", "筻", "筼", "筶", "筦", "筤", "傺", "鹎", "僇", "艅", "艉", 
"谼", "貆", "腽", "腨", "腯", "鲉", "鲊", "鲌", "䲟", "鲏", "雊", "猺", "飔", "觟", "𦝼", "馌", "裛", "廒", "瘀", "瘅", "鄘", "鹒", "鄜", "麀", "鄣", "阘", "煁", "煃", "煴", "煋", "煟", "煓", "滠", "溍", "溹", "滆", "滉", "溦", "溵", "漷", "滧", "滘", "滍", "愭", "慥", "慆", "塱", "裼", "禋", "禔", "禘", "禒", "谫", "鹔", "愍", "嫄", "媱", "戤", "戣", "缞", "耤", "瑧", "瑨", "瑱", "瑷", "瑢", "斠", "摏", "墕", "墈", "墐", "墘", "摴", "銎", "𡐓", "墚", "撖", "靽", "鞁", "蔌", "蔈", "蓰", "蔹", "蔊", "嘏", "榰", "榑", "槚", "𣗋", "槜", "榍", "疐", "酺", "酾", "酲", "酴", "碶", "䃎", "碨", "𥔲", "碹", "碥", "劂", "䴗", "夥", "瞍", "鹖", "㬎", "跽", "蜾", "幖", "嶍", "圙", "𨱏", "锺", "锼", "锽", "锾", "锿", "镃", "镄", "镅", "馝", "鹙", "箨", "箖", "劄", "僬", "僦", "僔", "僎", "槃", "㙦", "鲒", "鲕", "鲖", "鲗", "鲘", "鲙", "𩽾", "夐", "獍", "飗", "凘", "廑", "廙", "瘗", "瘥", "瘕", "鲝", "鄫", "熇", "漹", "漖", "潆", "漤", "潩", "漼", "漴", "㽏", "漈", "漋", "漻", "慬", "窬", "窭", "㮾", "褕", "禛", "禚", "隩", "嫕", "嫭", "嫜", "嫪", "㻬", "麹", "璆", "漦", "叇", "墣", "墦", "墡", "劐", "薁", "蕰", "蔃", "鼒", "槱", "鹝", "磏", "磉", "殣", "慭", "霅", "暵", "暲", "暶", "踦", "踣", "䗖", "蝘", "蝲", "蝤", "噇", "噂", "噀", "罶", "嶲", "嶓", "㠇", "嶟", "嶒", "镆", "镈", "镋", "镎", "镕", "稹", "儇", "皞", "皛", "䴘", "艎", "艏", "鹟", "𩾃", "鲦", "鲪", "鲬", "橥", "觭", "鹠", "鹡", "糇", "糈", "翦", "鹢", "鹣", "熛", "潖", "潵", "㵐", "澂", "澛", "瑬", "潽", "潾", "潏", "憭", "憕", "戭", "褯", "禤", "嫽", "遹", "璥", "璲", "璒", "憙", "擐", "鄹", "薳", "鞔", "黇", "蕗", "薢", "蕹", "橞", "橑", "橦", "醑", "觱", "磡", "𥕢", "磜", "豮", "鹾", "虤", "暿", "曌", "曈", "㬚", "蹅", "踶", "䗛", "螗", "疁", "㠓", "幪", "嶦", "𨱑", "馞", "穄", "篚", "篯", "簉", "鼽", "衠", "盦", "螣", "縢", "鲭", "鲯", "鲰", "鲺", "鲹", "亸", "癀", "瘭", "羱", "糒", "燋", "熻", "燊", "燚", "燏", "濩", "濋", "澪", "澽", "澴", "澭", "澼", "憷", "憺", "懔", "黉", "嬛", "鹨", "翯", "璱", "𤩽", "璬", "璮", "髽", "擿", "薿", "薸", "檑", "櫆", "檞", "醨", "繄", "磹", "磻", "瞫", "瞵", "蹐", "蟏", "㘎", "镤", "镥", "镨", "𨱔", "矰", "穙", "穜", "穟", "簕", "簃", "簏", "儦", "魋", "斶", "艚", "谿", "䲠", "鲾", "鲿", "鳁", "鳂", "鳈", "鳉", "獯", "䗪", "馘", "襕", "襚", "螱", "甓", "嬬", "嬥", "𦈡", "瓀", "釐", "鬶", "爇", "鞳", "鞮", "藟", "藦", "藨", "鹲", "檫", "黡", "礞", "礌", "𥖨", "蹢", 
"蹜", "蟫", "䗴", "嚚", "髃", "镮", "镱", "酂", "馧", "簠", "簝", "簰", "鼫", "鼩", "皦", "臑", "䲢", "鳑", "鳒", "鹱", "鹯", "癗", "𦒍", "旞", "翷", "冁", "䎖", "瀔", "瀍", "瀌", "襜", "䴙", "嚭", "㰀", "鬷", "醭", "蹯", "蠋", "翾", "鳘", "儳", "儴", "鼗", "𩾌", "鳚", "鳛", "麑", "麖", "蠃", "彟", "嬿", "鬒", "蘘", "欂", "醵", "颥", "甗", "𨟠", "巇", "酅", "髎", "犨", "𨭉", "㸌", "爔", "瀱", "瀹", "瀼", "瀵", "襫", "孅", "骦", "耰", "𤫉", "瓖", "鬘", "趯", "罍", "鼱", "鳠", "鳡", "鳣", "爟", "爚", "灈", "韂", "糵", "蘼", "礵", "鹴", "躔", "皭", "龢", "鳤", "亹", "籥", "鼷", "玃", "醾", "齇", "觿", "蠼", "𬣙", "𬇕", "𬣞", "𬘓", "𫭟", "𫭢", "𫇭", "𫐄", "𫵷", "𬇙", "𬣡", "𫸩", "𫘜", "𬘘", "𫘝", "𬨂", "𬀩", "𬀪", "𬬩", "𫍣", "𬣳", "𬩽", "𬮿", "𬯀", "𫰛", "𬳵", "𬳶", "𫠊", "𬍛", "鿍", "𬜬", "𪾢", "𪨰", "𫓧", "𬬮", "𬬱", "𬬭", "𬘡", "𬳽", "𬘩", "𫄧", "𪟝", "𬍤", "𫭼", "𬜯", "𬂩", "𫠆", "𬌗", "𫑡", "𪨶", "𬬸", "𬬻", "𬬹", "𬬿", "𬭁", "𫢸", "𫗧", "𬊈", "𬒈", "𬳿", "𫄨", "𬘫", "𫮃", "鿎", "𬱖", "𬟽", "𫓯", "𫟹", "𫟼", "𬇹", "𬍡", "𬤇", "𫍯", "𬤊", "𫍲", "𬯎", "𬘬", "𬘭", "𬴂", "𫘦", "𫟅", "𬘯", "𫘧", "𪣻", "𬃊", "𬷕", "𫐐", "𬹼", "𫶇", "𫖮", "鿏", "𬭊", "𫓶", "𬭎", "𫖯", "𬱟", "𫛭", "𫷷", "𬮱", "𬊤", "𬴃", "𫘨", "𬪩", "𬒔", "𬨎", "𫐓", "𫫇", "𫓹", "𬭚", "𬭛", "𬕂", "𬶋", "𬶍", "𫔶", "𫌀", "𫖳", "𫘪", "𫘬", "𫞩", "𪤗", "𬸘", "𬒗", "𫚖", "𬭤", "𫚕", "𬶐", "𬶏", "𬸚", "𬤝", "𬙂", "𬭩", "𬸣", "𫍽", "𬴊", "𬞟", "𫟦", "𬺈", "𫠜", "𪩘", "𬭬", "𬭯", "𫗴", "𬸦", "𫄷", "𬭳", "𬭶", "𫔍", "𬭸", "𬭼", "𫔎", "𬸪", "𬶟", "𬶠", "𬶨", "𫄸", "𬟁", "𬙊", "𬶭", "𬶮", "𬙋", "𬺓", "𫚭", "廠", "蔔", "兒", "幾", "幹", "虧", "纔", "與", "萬", "韆", "億", "個", "廣", "門", "義", "衛", "飛", "習", "馬", "鄉", "豐", "開", "無", "雲", "專", "藝", "廳", "區", "歷", "曆", "車", "貝", "岡", "見", "氣", "長", "僕", "幣", "僅", "從", "侖", "倉", "風", "烏", "鳳", "爲", "鬥", "憶", "計", "訂", "認", "譏", "醜", "隊", "辦", "鄧", "勸", "雙", "書", "擊", "撲", "節", "術", "厲", "龍", "滅", "軋", "東", "盧", "業", "舊", "帥", "歸", "葉", "電", "號", "衹", "隻", "嘰", "嘆", "們", "儀", "叢", "爾", "樂", "處", "鼕", "鳥", "務", "飢", "饑", "馮", "閃", "蘭", "匯", "彙", "頭", "漢", "寧", "討", "寫", "讓", "禮", "訓", "議", "訊", "記", "齣", "遼", "邊", "發", "髮", "聖", "對", "臺", "颱", "檯", "糾", "絲", "動", "鞏", "執", "擴", "掃", "場", "揚", "亞", "樸", "機", "權", "過", "協", "壓", "厭", "頁", "誇", 
"奪", "達", "夾", "軌", "堯", "劃", "邁", "畢", "貞", "師", "塵", "當", "噹", "籲", "嚇", "蟲", "麯", "團", "糰", "嗎", "嶼", "歲", "迴", "豈", "則", "剛", "網", "硃", "遷", "喬", "偉", "傳", "優", "傷", "價", "倫", "華", "僞", "嚮", "後", "會", "殺", "閤", "衆", "爺", "傘", "創", "雜", "負", "壯", "衝", "妝", "莊", "慶", "劉", "齊", "産", "閉", "問", "闖", "關", "燈", "湯", "興", "講", "諱", "軍", "訝", "許", "訛", "論", "訟", "農", "諷", "設", "訪", "訣", "尋", "盡", "儘", "導", "孫", "陣", "陽", "階", "陰", "婦", "媽", "戲", "觀", "歡", "買", "紅", "馱", "纖", "縴", "馴", "約", "級", "紀", "馳", "紉", "壽", "麥", "瑪", "進", "遠", "違", "韌", "運", "撫", "壇", "罎", "壞", "摳", "擾", "貢", "垻", "壩", "摺", "掄", "搶", "墳", "護", "殻", "塊", "聲", "報", "擬", "蕪", "葦", "蒼", "嚴", "蘆", "勞", "蘇", "囌", "極", "楊", "兩", "麗", "醫", "勵", "還", "殲", "來", "連", "軒", "鹵", "滷", "堅", "時", "縣", "裏", "嘔", "園", "曠", "圍", "噸", "郵", "睏", "員", "聽", "嗆", "嗚", "彆", "嶇", "崗", "帳", "財", "針", "釘", "亂", "體", "傭", "徹", "餘", "穀", "鄰", "腸", "龜", "猶", "狽", "條", "島", "飯", "飲", "係", "繫", "凍", "狀", "畝", "庫", "療", "應", "這", "廬", "閏", "閑", "間", "悶", "竈", "燦", "瀝", "淪", "滄", "溝", "滬", "瀋", "懷", "憂", "窮", "證", "啓", "評", "補", "識", "詐", "訴", "診", "詞", "譯", "靈", "層", "遲", "張", "際", "陸", "陳", "墜", "勁", "鷄", "緯", "驅", "純", "紗", "綱", "納", "駁", "縱", "紛", "紙", "紋", "紡", "驢", "紐", "環", "責", "現", "錶", "規", "攏", "揀", "擔", "頂", "擁", "勢", "攔", "擰", "撥", "擇", "蘋", "範", "莖", "樞", "櫃", "闆", "鬆", "槍", "楓", "構", "喪", "畫", "棗", "賣", "鬱", "礬", "礦", "碼", "厠", "奮", "態", "歐", "毆", "壟", "轟", "頃", "轉", "斬", "輪", "軟", "齒", "虜", "腎", "賢", "國", "暢", "嚨", "鳴", "羅", "幟", "嶺", "凱", "敗", "賬", "販", "貶", "購", "貯", "圖", "釣", "製", "颳", "俠", "僥", "偵", "側", "憑", "僑", "貨", "質", "徑", "捨", "覓", "貪", "貧", "膚", "腫", "脹", "骯", "脅", "魚", "獰", "備", "飾", "飽", "飼", "變", "龐", "廟", "瘧", "劑", "廢", "閘", "鬧", "鄭", "捲", "單", "爐", "淺", "濘", "瀉", "潑", "澤", "憐", "學", "寶", "寵", "審", "簾", "實", "試", "詩", "誠", "襯", "視", "話", "誕", "詭", "詢", "該", "詳", "肅", "録", "隸", "彌", "瀰", "陝", "駕", "參", "艱", "綫", "練", "組", "紳", "細", "駛", "織", "駒", "終", "駐", "絆", "駝", "紹", "繹", "經", "貫", "貳", "幫", "項", 
"挾", "撓", "趙", "擋", "墊", "擠", "揮", "薦", "帶", "繭", "蕩", "榮", "葷", "熒", "鬍", "蔭", "藥", "標", "棧", "棟", "欄", "檸", "樹", "鹹", "磚", "硯", "麵", "牽", "鷗", "殘", "軸", "輕", "鴉", "戰", "點", "臨", "覽", "竪", "嘗", "啞", "顯", "貴", "蝦", "蟻", "螞", "雖", "駡", "勛", "嘩", "響", "喲", "峽", "罰", "賤", "貼", "貽", "鈣", "鈍", "鈔", "鍾", "鐘", "鋼", "鈉", "鑰", "欽", "鈞", "鈎", "鈕", "氈", "氫", "選", "適", "種", "鞦", "復", "複", "倆", "貸", "順", "儉", "須", "鬚", "劍", "朧", "膽", "勝", "狹", "獅", "獨", "獄", "貿", "餌", "饒", "蝕", "餃", "餅", "巒", "彎", "將", "奬", "瘡", "瘋", "親", "閨", "聞", "閩", "閥", "閣", "養", "薑", "類", "婁", "總", "煉", "爍", "爛", "窪", "潔", "灑", "澆", "濁", "測", "瀏", "濟", "渾", "濃", "惱", "舉", "覺", "憲", "竊", "誡", "誣", "語", "襖", "誤", "誘", "誨", "説", "誦", "墾", "晝", "費", "遜", "隕", "險", "嬌", "賀", "壘", "綁", "絨", "結", "繞", "驕", "繪", "給", "絢", "駱", "絡", "絶", "絞", "駭", "統", "艷", "蠶", "頑", "盞", "撈", "載", "趕", "鹽", "損", "撿", "摯", "剝", "熱", "搗", "壺", "聶", "萊", "蓮", "獲", "穫", "惡", "噁", "瑩", "鶯", "檔", "橋", "樺", "樁", "樣", "賈", "礫", "礎", "顧", "轎", "較", "頓", "斃", "緻", "慮", "監", "緊", "黨", "曬", "曉", "嘮", "鴨", "暈", "鴦", "罷", "圓", "賊", "賄", "賂", "贜", "錢", "鉗", "鑽", "鉀", "鐵", "鈴", "鉛", "犧", "敵", "積", "稱", "筆", "債", "傾", "賃", "艦", "艙", "聳", "愛", "頒", "頌", "臟", "髒", "臍", "膠", "腦", "膿", "鴕", "鴛", "皺", "餓", "餒", "戀", "槳", "漿", "準", "癥", "齋", "離", "資", "競", "閲", "煩", "燒", "燭", "遞", "濤", "澇", "渦", "塗", "滌", "潤", "澗", "漲", "燙", "澀", "憫", "寬", "傢", "賓", "竅", "請", "諸", "諾", "讀", "誹", "襪", "課", "誰", "調", "諒", "諄", "談", "誼", "懇", "劇", "難", "預", "絹", "綉", "驗", "繼", "駿", "瑣", "擲", "據", "摻", "職", "蘿", "螢", "營", "蕭", "薩", "夢", "檢", "醖", "碩", "聾", "襲", "輔", "輛", "顱", "懸", "躍", "纍", "囉", "嘯", "嶄", "邏", "嬰", "銬", "鐺", "鋁", "銅", "銘", "鏟", "銀", "矯", "穢", "籠", "償", "軀", "釁", "銜", "盤", "鴿", "斂", "領", "臉", "獵", "餡", "館", "癢", "鏇", "閻", "闡", "蓋", "斷", "獸", "鴻", "漸", "淵", "漁", "澱", "滲", "慚", "懼", "驚", "慘", "慣", "謀", "諜", "謊", "諧", "禱", "禍", "謂", "諺", "謎", "彈", "墮", "隨", "隱", "嬸", "頗", "頸", "績", "緒", "續", "騎", "綽", "繩", "維", "綿", "綳", "綢", "綜", "綻", "緑", "綴", "瓊", "趨", "攬", 
"攙", "擱", "摟", "攪", "聯", "蔣", "韓", "橢", "確", "頰", "靂", "暫", "翹", "輩", "鑿", "輝", "賞", "睞", "噴", "疇", "踐", "遺", "鵑", "賦", "賭", "贖", "賜", "賠", "鑄", "鋪", "鏈", "銷", "鎖", "鋤", "鍋", "銹", "鋒", "鋅", "鋭", "鵝", "築", "篩", "儲", "懲", "禦", "釋", "臘", "魯", "憊", "饋", "饞", "裝", "蠻", "闊", "糞", "滯", "濕", "潰", "濺", "灣", "憤", "竄", "窩", "褲", "禪", "謝", "謡", "謗", "謙", "屬", "屢", "緬", "纜", "緝", "緞", "緩", "締", "縷", "騙", "編", "騷", "緣", "鵡", "攝", "擺", "襬", "攤", "鵲", "藍", "濛", "懞", "矇", "獻", "欖", "樓", "賴", "礙", "尷", "霧", "輻", "輯", "輸", "頻", "齡", "鑒", "蹺", "蝸", "錯", "錨", "錫", "鑼", "錘", "錐", "錦", "鍵", "鋸", "錳", "辭", "頽", "籌", "簽", "籤", "簡", "膩", "鵬", "騰", "鮑", "穎", "觸", "雛", "饃", "餾", "醬", "謄", "糧", "數", "滿", "濾", "濫", "灕", "濱", "灘", "譽", "窺", "寢", "謹", "謬", "闢", "縛", "縫", "纏", "繽", "贅", "墻", "衊", "藹", "檻", "釀", "願", "轄", "輾", "顆", "踴", "蠟", "蠅", "蟬", "賺", "鍬", "鍛", "鍍", "穩", "籮", "簫", "輿", "鮮", "饅", "瀟", "賽", "譚", "譜", "騾", "縮", "攆", "聰", "藴", "櫻", "飄", "黴", "瞞", "題", "囑", "鎮", "鎬", "鎊", "簍", "鯉", "鯽", "癟", "癱", "顔", "鯊", "瀾", "額", "譴", "鶴", "繚", "顛", "轍", "鸚", "贈", "鏡", "贊", "籃", "籬", "鯨", "癮", "辯", "瀕", "懶", "繮", "繳", "矚", "贍", "鰐", "辮", "贏", "驟", "囂", "鐮", "鰭", "鷹", "巔", "顫", "癬", "鱉", "鬢", "鱗", "躪", "贛", "鑲", "韋", "閂", "訃", "勱", "芻", "鄺", "訐", "訌", "訕", "訖", "馭", "璣", "壙", "捫", "薌", "厙", "釔", "傴", "倀", "傖", "獷", "獁", "鳬", "鄔", "餳", "懺", "謳", "詎", "訥", "紆", "紂", "紇", "紈", "璵", "摶", "塢", "㩳", "蕓", "藶", "莧", "萇", "蓯", "磯", "奩", "歟", "軔", "鄴", "嘸", "囈", "嚦", "暘", "唄", "幃", "峴", "嵐", "圇", "釗", "釙", "釕", "僉", "鳩", "鄒", "飩", "餼", "飪", "飫", "飭", "廡", "癤", "闈", "閎", "閔", "煬", "灃", "漚", "渢", "潙", "憮", "慪", "愾", "悵", "愴", "詁", "訶", "詛", "詆", "謅", "詔", "詒", "隴", "陘", "嫵", "嫗", "嬀", "剄", "紜", "紕", "紝", "綸", "紓", "瑋", "匭", "壚", "擓", "蘢", "蔦", "塋", "煢", "櫪", "梘", "棖", "樅", "碭", "甌", "郟", "軛", "鳶", "曇", "蟣", "黽", "嚀", "噝", "巋", "劌", "剴", "嶧", "釷", "釺", "釧", "釩", "釹", "釵", "儈", "儕", "儂", "劊", "慫", "糴", "戧", "膞", "邇", "梟", "餞", "飴", "癘", "瘍", "煒", "熰", "熗", "瀧", "瀘", "濼", "涇", "㥮", "懌", "誆", "誄", "詿", "詰", "詼", 
"鄆", "禕", "誅", "詵", "詬", "詮", "詣", "諍", "詫", "諢", "詡", "駑", "紺", "紲", "紱", "駟", "駙", "縐", "絀", "驛", "駘", "瓏", "頇", "埡", "撾", "撻", "賁", "壋", "撏", "莢", "貰", "蓽", "蕎", "薈", "薺", "堊", "滎", "犖", "蕁", "藎", "蓀", "蕒", "葤", "櫛", "櫳", "櫨", "櫟", "檉", "酈", "硨", "碸", "殤", "軲", "軻", "轤", "軼", "軫", "蠆", "覘", "瞘", "嘵", "嗶", "噦", "剮", "鄖", "噲", "噥", "嶢", "幀", "嶠", "貺", "鈈", "鈦", "鋇", "鈑", "鈐", "鎢", "鈁", "鈀", "篤", "儔", "儼", "儷", "腖", "臚", "脛", "鴇", "獪", "颮", "猻", "餉", "餄", "餎", "孿", "孌", "癧", "瘲", "颯", "闥", "閭", "闓", "閡", "熾", "烴", "浹", "澮", "滸", "潯", "濜", "慟", "懨", "愷", "惻", "惲", "誚", "禰", "誥", "誑", "鴆", "婭", "嬈", "懟", "絝", "驍", "驊", "絎", "絳", "駢", "頊", "璫", "琿", "塒", "塤", "堝", "贄", "蒔", "萵", "蕕", "鴣", "蒓", "橈", "楨", "榿", "檜", "邐", "礪", "礱", "軾", "輊", "輅", "鶇", "躉", "齔", "鸕", "矓", "嘜", "鴞", "蜆", "嗩", "嶗", "崍", "覬", "賅", "鈺", "鉦", "鈷", "鉢", "鈸", "鉞", "鉭", "鉬", "鈿", "鈾", "鉑", "鑠", "鉚", "鈰", "鉉", "鉈", "鉍", "鈮", "鈹", "鏺", "鐸", "氬", "筧", "頎", "徠", "膾", "鴟", "璽", "鴝", "獫", "裊", "餑", "欒", "攣", "癰", "痙", "頏", "閫", "鬮", "誾", "閬", "鄲", "燁", "燴", "燼", "淶", "漣", "潿", "慳", "諏", "諑", "禎", "諉", "諛", "諗", "諂", "誶", "媧", "嫻", "綆", "驪", "綃", "騁", "綏", "縧", "綈", "駸", "鷥", "燾", "璉", "麩", "擄", "摑", "鷙", "撣", "慤", "摜", "縈", "槤", "覡", "欞", "嗇", "匱", "硤", "磽", "鴯", "龔", "殞", "殮", "賚", "輒", "塹", "嘖", "囀", "嚙", "蹌", "蠣", "蠱", "蟶", "幘", "幗", "賕", "賑", "賒", "銠", "鉺", "鋏", "鐃", "銦", "鎧", "鍘", "銖", "銑", "鋌", "鏵", "銓", "鎩", "鉿", "銚", "鉻", "錚", "銫", "鉸", "銥", "銃", "銨", "銣", "鴰", "穠", "箋", "籩", "僨", "僂", "皚", "鴴", "艫", "龕", "玀", "獼", "餜", "餛", "鸞", "闍", "閾", "閹", "閶", "鬩", "閽", "閼", "羥", "糲", "燜", "漬", "瀆", "澠", "愜", "憚", "諶", "諫", "皸", "謔", "襠", "謁", "諤", "諭", "諼", "讒", "諳", "諦", "諞", "糶", "嬋", "綾", "騏", "綺", "緋", "緔", "騍", "緄", "騅", "綬", "綹", "綣", "綰", "驂", "緇", "靚", "輦", "黿", "頡", "撳", "蟄", "壪", "蔞", "櫝", "欏", "賫", "鵓", "鸝", "殫", "輥", "輞", "槧", "輟", "輜", "瞼", "躒", "蛺", "蟯", "螄", "蠐", "嘍", "嶸", "嶁", "賧", "鋙", "錸", "鏗", "鋥", "鋰", "鋯", "鋨", "銼", "鐧", "銻", "鋃", "鋦", "錒", "犢", "鵠", "篳", "牘", "儻", "儐", "儺", "嬃", "頜", 
"鵒", "魷", "魨", "魴", "潁", "颶", "觴", "熲", "餷", "餿", "褻", "臠", "癆", "癇", "賡", "頦", "鷳", "闌", "闃", "闋", "鵜", "憒", "嚳", "謨", "褳", "襇", "讜", "謖", "謚", "謐", "騭", "巰", "翬", "騖", "緙", "緗", "緘", "緹", "緲", "緦", "緱", "縋", "緡", "饗", "耮", "驁", "韞", "攄", "擯", "轂", "驀", "鶓", "薊", "蘺", "鎣", "頤", "櫚", "櫸", "磧", "磣", "鵪", "輳", "齟", "齙", "韙", "囁", "躂", "蹕", "躚", "躋", "噯", "鍺", "錛", "錡", "鍀", "錁", "錕", "錮", "鍁", "錈", "錠", "錙", "覦", "頷", "鮁", "鮃", "鮎", "鱸", "穌", "鮒", "鮐", "鵮", "颼", "饈", "鶉", "瘮", "闔", "闐", "闕", "灧", "瀅", "潷", "灤", "澦", "懾", "鱟", "騫", "竇", "謾", "謫", "嬡", "嬪", "縉", "縝", "縟", "轡", "騮", "縞", "縭", "縊", "縑", "騸", "覯", "韜", "靉", "攖", "薔", "藺", "鶘", "檳", "櫧", "釅", "殯", "霽", "轅", "齜", "齦", "瞜", "曖", "躊", "蟈", "鶚", "嚶", "羆", "賻", "罌", "鶻", "鍥", "鍇", "鍶", "鍔", "鍤", "鏘", "鎂", "鏤", "簀", "篋", "簞", "籙", "臏", "鮭", "鮪", "鱭", "鮫", "鱘", "饉", "鑾", "瘻", "闞", "鮝", "糝", "鷀", "瀲", "濰", "譖", "褸", "譙", "讕", "譎", "鶥", "嬙", "鶩", "驃", "縹", "縵", "縲", "纓", "驄", "繆", "繅", "耬", "瓔", "擷", "擼", "攛", "聵", "覲", "韃", "鞽", "蘄", "賾", "檣", "靨", "魘", "饜", "轆", "齬", "齪", "覷", "顒", "躓", "躑", "蠑", "螻", "顎", "嚕", "顓", "鑷", "鎘", "鎸", "鎳", "鎦", "鎰", "鎵", "鑌", "簣", "鷂", "鯁", "鱺", "鰱", "鰹", "鰣", "鯀", "鯇", "觶", "饊", "饌", "齏", "讞", "襤", "譫", "屨", "纈", "繕", "繒", "驏", "擻", "顳", "顢", "藪", "櫓", "櫞", "贋", "飆", "鏨", "轔", "蟎", "鐯", "鏢", "鏜", "鏝", "鏰", "鏞", "鏑", "鏃", "鏐", "氌", "穡", "魎", "鯪", "鯡", "鯤", "鯧", "鯝", "鯢", "鯛", "鯔", "獺", "鷓", "贇", "癭", "斕", "瀨", "顙", "繾", "繰", "繯", "蘚", "鷯", "齲", "齷", "躡", "蹣", "羈", "鐔", "鐝", "鐐", "鐓", "鑭", "鑹", "鏹", "鐙", "籪", "鷦", "鱝", "鰈", "鯷", "鰓", "鰍", "鰉", "鯿", "鷲", "懣", "鷸", "鰲", "韉", "顥", "鷺", "䴉", "髏", "鑊", "鐳", "鐲", "讎", "鰨", "鰥", "鰩", "癩", "攢", "靄", "躥", "髖", "髕", "鑔", "籟", "鰳", "鰾", "鱈", "鰻", "鱅", "讖", "驥", "纘", "瓚", "鼉", "黷", "黲", "鑣", "鑞", "臢", "鱖", "鱔", "鱒", "驤", "顰", "鱧", "癲", "灝", "鸛", "鑱", "趲", "顴", "躦", "饢", "戇", "戔", "訏", "訒", "釓", "俔", "閆", "澫", "訢", "訩", "詝", "紃", "纊", "瑒", "剗", "塸", "壢", "埨", "撝", "蔿", "榪", "軑", "軏", "咼", "㠣", "覎", "㑳", "颺", "閌", "潕", "湋", "澐", "浿", "諓", "禡", "詗", 
"詘", "詖", "屓", "彄", "紘", "馹", "馼", "紵", "紞", "駃", "紖", "瑲", "薴", "棡", "軝", "暐", "晛", "崬", "釴", "釤", "鍆", "鍚", "鄶", "獮", "飿", "嶨", "詷", "詪", "鄩", "鳲", "隑", "隮", "娙", "逕", "駓", "駔", "駉", "絅", "騶", "䮄", "紼", "紿", "瓅", "韍", "墶", "塏", "薘", "蕘", "蔄", "葒", "鳾", "龑", "軹", "軤", "轢", "軺", "睍", "曨", "噠", "鈃", "鈇", "鉅", "鋹", "釿", "錀", "鈧", "鈥", "鈄", "倈", "艤", "鶬", "颭", "餏", "湞", "溮", "滻", "褘", "絰", "駰", "絪", "駪", "綎", "綖", "驫", "勣", "璕", "𡑍", "䓣", "薟", "藭", "椏", "梜", "頍", "硜", "輄", "輈", "輇", "貲", "嗊", "曄", "暉", "鄳", "幬", "輋", "嶮", "贐", "鉥", "鉕", "鑪", "鉮", "鉊", "鉧", "僤", "鴒", "魛", "餗", "燖", "溳", "礐", "窵", "襏", "駼", "絺", "綌", "騂", "綄", "璡", "墠", "壼", "聹", "蘀", "勩", "罃", "檮", "棶", "厴", "䃮", "磑", "礄", "鴷", "齕", "頔", "廼", "凢", "亾", "枒", "屍", "匃", "匄", "紥", "紮", "疋", "殀", "讐", "觔", "兇", "宂", "㕥", "㠯", "栞", "佈", "佔", "呌", "敂", "冄", "坵", "僊", "怱", "悤", "冊", "夘", "戼", "牠", "妳", "嬭", "摃", "釦", "攷", "託", "衺", "衕", "弔", "喫", "囙", "㠶", "颿", "秊", "倣", "髣", "佀", "朶", "氷", "決", "併", "並", "竝", "汙", "汚", "異", "姦", "廵", "挵", "衖", "搤", "阯", "撦", "埳", "阬", "誌", "㕁", "卻", "刦", "刧", "刼", "芲", "蘤", "桿", "槓", "荳", "獃", "唫", "脗", "皁", "彿", "髴", "疘", "刪", "鉋", "鑤", "況", "牀", "恡", "棄", "洶", "汎", "災", "烖", "菑", "禩", "侷", "跼", "坿", "玅", "姉", "妬", "翫", "搨", "柺", "拕", "牴", "觝", "倖", "抝", "盃", "桮", "傑", "逩", "肎", "菓", "崐", "崑", "呪", "虖", "嘑", "謼", "詠", "㟁", "嵒", "巗", "巖", "雰", "稈", "咊", "嶽", "妷", "姪", "廹", "徃", "餚", "採", "寀", "唸", "週", "昬", "兎", "兔", "亯", "亱", "䘚", "淨", "劵", "匟", "㳒", "灋", "洩", "霑", "淚", "註", "恠", "箒", "屆", "絃", "圅", "旾", "珎", "掛", "垜", "艸", "茘", "査", "栢", "柵", "栁", "桺", "柹", "韮", "揹", "昰", "閧", "鬨", "冐", "暎", "嚥", "倃", "𠴰", "偺", "喒", "齩", "欬", "榘", "㑺", "儁", "敍", "敘", "肧", "脈", "䘑", "衇", "跡", "蹟", "砲", "礮", "薙", "鬀", "恆", "怳", "卹", "䘏", "賉", "婣", "畊", "揑", "綑", "輓", "恥", "躭", "晉", "棲", "覈", "慄", "翄", "脣", "槕", "㨪", "螡", "蟁", "㤙", "陗", "峩", "峯", "乗", "椉", "咲", "筍", "俛", "頫", "勌", "䠶", "躳", "慇", "拏", "㧱", "挐", "脃", "胷", "肐", "貍", "㽞", "畱", "淒", "悽", "蓆", "効", "傚", "涼", "缾", "菸", "煙", "淛", "湧", 
"誖", "猂", "醼", "讌", "㝠", "寃", "孃", "桒", "毬", "瑠", "璢", "瑯", "㨗", "搥", "搯", "蔆", "惏", "楳", "槑", "捄", "廂", "慽", "慼", "瞇", "埜", "畧", "虵", "稭", "棃", "犂", "迻", "媮", "兠", "舩", "慾", "綵", "腳", "𩓐", "夠", "豬", "貓", "湊", "減", "庻", "蔴", "菴", "朢", "睠", "觕", "麤", "釬", "銲", "痳", "殽", "婬", "滛", "湻", "㴱", "樑", "顇", "㝛", "窰", "窯", "琹", "欵", "墖", "趂", "隄", "愽", "揷", "揫", "煑", "朞", "㪚", "塟", "蔥", "蔕", "稜", "棊", "碁", "椶", "偪", "㕑", "廚", "廈", "鴈", "冣", "㝡", "晳", "鼃", "餧", "餵", "嗁", "諠", "㡌", "賸", "筴", "筞", "筩", "栰", "暠", "皜", "踰", "蝟", "㪟", "燄", "遊", "媿", "嘅", "庽", "窓", "牎", "牕", "窻", "徧", "僱", "帬", "裠", "強", "彊", "疎", "壻", "瓌", "䰟", "皷", "擕", "㩗", "㩦", "攜", "懃", "鞾", "幙", "㮣", "酧", "詶", "醻", "掽", "踫", "㼝", "盌", "磟", "覩", "倸", "㬉", "煗", "煖", "晻", "闇", "炤", "跥", "䗬", "蠭", "寘", "辠", "稺", "穉", "燬", "譭", "瘉", "癒", "顋", "骽", "猨", "蝯", "稟", "痺", "癡", "亷", "㢘", "韻", "泝", "遡", "昚", "躶", "臝", "羣", "㬪", "曡", "疊", "勦", "琍", "瓈", "𤋮", "熈", "牓", "搾", "謌", "堿", "鹻", "鹼", "矁", "燻", "髈", "𤺥", "辢", "旂", "𡚁", "潄", "砦", "詧", "嫰", "櫈", "撐", "墪", "譔", "鞵", "鞌", "蕋", "橤", "蘂", "醕", "譆", "跴", "蹤", "蜨", "蠍", "稾", "殭", "惪", "厀", "襃", "癅", "䊀", "餬", "潛", "癄", "顦", "鷰", "藷", "櫥", "螎", "蹏", "蟇", "譟", "簒", "彫", "琱", "鵰", "餹", "餻", "簷", "粦", "燐", "緐", "幑", "蹧", "粇", "穅", "臋", "籐", "繙", "飜", "孼", "蠏", "燿", "蝡", "稬", "穤", "惷", "覇", "鑵", "戹", "阨", "剳", "帀", "巵", "亙", "佇", "竚", "穽", "岅", "虯", "𦍑", "羗", "啎", "姙", "㘭", "袟", "袠", "逈", "㒺", "犛", "氂", "偘", "甕", "罋", "冺", "姍", "蝨", "琺", "瑇", "尅", "梔", "斮", "斲", "斵", "暱", "毘", "蝱", "吚", "哶", "峝", "粃", "竢", "狥", "秈", "烱", "㳄", "袵", "盇", "涖", "蒞", "碪", "蠔", "唕", "倐", "儵", "雋", "皐", "臯", "衂", "䶊", "臙", "獧", "痾", "皰", "湼", "澣", "濬", "塚", "襢", "娿", "勅", "勑", "戞", "廐", "廄", "眥", "覜", "勗", "啗", "噉", "傯", "挱", "㥫", "惥", "慂", "陻", "蕚", "萲", "蕿", "蘐", "藼", "櫂", "箠", "槨", "啑", "蹠", "蚘", "痐", "蛕", "蜖", "瘖", "遯", "醃", "飱", "冪", "簑", "枏", "柟", "檝", "楥", "矴", "椗", "嘷", "獋", "粺", "䈰", "諐", "齶", "堘", "疿", "雝", "秔", "稉", "槀", "搉", "廝", "叡", "嘠", "蜋", "筯", "篛", "麞", "糉", "緥", "璿", "髥", "臕", "餈", 
"剹", "橜", "罇", "蜺", "矙", "憇", "翺", "饍", "瞖", "羴", "羶", "爕", "繦", "騌", "鬉", "騣", "蔾", "䠀", "簮", "躕", "蹵", "䝔", "貛", "鼴", "麐", "塡", "あ", "い", "う", "え", "お", "か", "き", "く", "け", "こ", "さ", "し", "す", "せ", "そ", "た", "ち", "つ", "て", "と", "な", "に", "ぬ", "ね", "の", "は", "ひ", "ふ", "へ", "ほ", "ま", "み", "む", "め", "も", "や", "ゆ", "よ", "ら", "り", "る", "れ", "ろ", "わ", "を", "ん", "が", "ぎ", "ぐ", "げ", "ご", "ざ", "じ", "ず", "ぜ", "ぞ", "だ", "ぢ", "づ", "で", "ど", "ば", "び", "ぶ", "べ", "ぼ", "ぱ", "ぴ", "ぷ", "ぺ", "ぽ", "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "っ", "ゃ", "ゅ", "ょ", "ゎ", "ゕ", "ゖ", "ア", "イ", "ウ", "エ", "オ", "カ", "キ", "ク", "ケ", "コ", "サ", "シ", "ス", "セ", "ソ", "タ", "チ", "ツ", "テ", "ト", "ナ", "ニ", "ヌ", "ネ", "ノ", "ハ", "ヒ", "フ", "ヘ", "ホ", "マ", "ミ", "ム", "メ", "モ", "ヤ", "ユ", "ヨ", "ラ", "リ", "ル", "レ", "ロ", "ワ", "ヲ", "ン", "ガ", "ギ", "グ", "ゲ", "ゴ", "ザ", "ジ", "ズ", "ゼ", "ゾ", "ダ", "ヂ", "ヅ", "デ", "ド", "バ", "ビ", "ブ", "ベ", "ボ", "パ", "ピ", "プ", "ペ", "ポ", "ァ", "ィ", "ゥ", "ェ", "ォ", "ッ", "ャ", "ュ", "ョ", "ヮ", "ヵ", "ヶ", "ヷ", "ヸ", "ヹ", "ヺ", "・", "ー", "ヽ", "ヾ", "ヿ", "ア", "イ", "ウ", "エ", "オ", "カ", "キ", "ク", "ケ", "コ", "サ", "シ", "ス", "セ", "ソ", "タ", "チ", "ツ", "テ", "ト", "ナ", "ニ", "ヌ", "ネ", "ノ", "ハ", "ヒ", "フ", "ヘ", "ホ", "マ", "ミ", "ム", "メ", "モ", "ヤ", "ユ", "ヨ", "ラ", "リ", "ル", "レ", "ロ", "ワ", "ヲ", "ン", "゙", "゚", "ァ", "ィ", "ゥ", "ェ", "ォ", "ッ", "ャ", "ュ", "ョ", "円", "気", "糸", "絵", "楽", "帰", "戸", "広", "黒", "図", "線", "読", "売", "歩", "毎", "亜", "悪", "圧", "扱", "囲", "為", "壱", "隠", "栄", "営", "駅", "塩", "縁", "艶", "応", "桜", "穏", "仮", "価", "箇", "ゑ", "ゝ", "ゞ", "ヰ", "ヴ", "㈱", "両", "丼", "丿", "亀", "仏", "伝", "侶", "俤", "値", "倶", "倹", "偐", "偽", "働", "儛", "兌", "児", "冑", "冨", "凞", "処", "凪", "別", "剣", "剤", "剰", "劔", "労", "勧", "勲", "匁", "匂", "匲", "卍", "単", "厳", "収", "呂", "呉", "呑", "呰", "唖", "喚", "喩", "喰", "噛", "噺", "嚢", "囃", "団", "圀", "圏", "堀", "堺", "塀", "塁", "塙", "増", "墺", "壊", "壌", "壷", "変", "奨", "姫", "娯", "嫐", "嬢", "嬾", "孁", "宍", "実", "宮", "寔", "寛", "対", "専", "尭", "峠", "崋", "嶋", "巀", "巌", "巣", "巻", "帯", "幇", "庁", "廃", "廻", "弉", "弌", "弐", "弖", "弾", "従", "徳", "徴", "忯", "恵", "悩", 
"惣", "懐", "懽", "戦", "戯", "戻", "払", "抜", "択", "拝", "拠", "拡", "拵", "挙", "挿", "捗", "捜", "掟", "掲", "掻", "揃", "換", "揺", "摂", "撃", "撹", "斉", "斎", "旛", "旡", "晧", "晩", "暁", "暦", "曽", "杁", "杢", "杣", "杮", "枓", "枠", "枡", "柾", "栂", "栃", "桝", "桟", "桾", "梛", "梱", "梲", "梶", "椙", "検", "椥", "楕", "楡", "楢", "榊", "榎", "槇", "様", "槙", "槻", "樋", "権", "樫", "橿", "檥", "欅", "歎", "歓", "歯", "歳", "歴", "毀", "沖", "沢", "浄", "涙", "済", "渉", "渋", "渓", "渕", "満", "滝", "漑", "潅", "澁", "瀞", "瀬", "焔", "焼", "煇", "煕", "煥", "燗", "爼", "犠", "狛", "猟", "獏", "獣", "珊", "瑤", "甞", "畑", "畠", "畳", "畷", "畺", "痩", "癪", "発", "県", "眞", "砕", "碕", "礒", "禖", "禿", "稲", "穂", "穣", "竃", "竜", "竴", "笹", "筈", "筬", "筰", "箆", "箏", "箙", "篠", "篭", "簺", "籾", "粂", "粋", "粛", "粧", "糺", "紬", "絁", "経", "絖", "絣", "絽", "継", "続", "綟", "総", "縄", "縅", "縒", "縦", "繊", "繋", "繍", "繝", "繧", "纐", "纒", "罠", "罧", "罵", "羂", "羇", "羨", "聟", "聡", "聨", "聴", "脇", "脳", "膣", "膵", "臈", "臓", "臥", "舎", "舖", "舗", "舘", "芿", "苅", "茲", "荊", "荘", "莬", "莵", "菫", "萠", "蔵", "薗", "薫", "薬", "薭", "蘊", "蛍", "蝋", "蝿", "蟷", "衞", "衵", "袙", "袞", "袰", "袴", "袿", "裃", "裡", "裲", "褄", "褌", "襴", "襷", "覗", "覚", "覧", "観", "訳", "証", "諌", "諚", "諟", "諡", "諮", "譛", "譲", "讃", "豅", "豊", "豎", "賎", "賛", "贔", "躙", "躰", "転", "軽", "輌", "辥", "辺", "辻", "込", "逓", "遅", "遙", "邉", "郷", "酔", "醗", "醤", "醸", "釈", "鉄", "鉇", "鉤", "鉱", "鉾", "銈", "銕", "銭", "鋲", "鋳", "鋺", "錆", "錍", "錣", "錬", "錵", "鍑", "鍮", "鍼", "鎌", "鎗", "鎚", "鎹", "鐇", "鐚", "鐡", "鑁", "鑑", "鑚", "鑢", "閇", "関", "閦", "闘", "陥", "険", "隣", "隷", "雑", "雫", "霊", "靜", "靫", "靭", "靱", "鞄", "鞆", "頚", "頬", "頴", "頼", "顕", "顗", "餝", "饂", "駄", "駆", "駈", "騒", "験", "騨", "髄", "髙", "髪", "髷", "鯖", "鯰", "鯱", "鰒", "鰯", "鰰", "鳰", "鴎", "鴫", "鵄", "鵞", "鵺", "鶏", "鹸", "麁", "麺", "麿", "黌", "黙", "鼈", "齢", "龗", "縯", "蟅", "坖", "祂", "鼂", "鱚", "蛻", "屌", "呾", "煔", "吶", "扥", "蚖", "銂", "尃", "夋", "鵼", "徬", "寳", "彡", "舨", "湳", "麼", "鍈", "崈", "鱣", "盺", "拺", "瑥", "茷", "焻", "奀", "驎", "鱰", "砢", "痟", "廱", "僜", "瘺", "鱊", "擥", "嶰", "淓", "跅", "浵", "媗", "璦", "煠", "檊", "媃", "峅", "躄", "鉟", "塽", "蟴", 
"鯮", "弍", "烒", "鵵", "妑", "孋", "蚡", "恊", "輭", "廞", "產", "曅", "盜", "騤", "囪", "鱀", "茇", "葊", "逹", "狓", "崢", "趖", "凃", "羙", "鮸", "昞", "楿", "渽", "圗", "麪", "屇", "鍉", "葝", "沯", "爭", "幵", "筭", "寊", "銋", "貮", "鎭", "熺", "昜", "鍱", "墬", "愒", "磺", "嚈", "稘", "珮", "釆", "殑", "鍩", "䲁", "蕷", "鐿", "僡", "佹", "輶", "冴", "襶", "賔", "猙", "辧", "絛", "磾", "韁", "螔", "譳", "礑", "鋱", "魩", "嚗", "棆", "牆", "敟", "柶", "瓛", "魣", "巎", "轘", "襌", "枼", "鸌", "逺", "錏", "縡", "帢", "騄", "媼", "埅", "鄤", "萐", "祙", "旼", "詥", "鶲", "燉", "卲", "銱", "庲", "伱", "氽", "嵿", "挻", "煵", "窋", "鐤", "鮊", "鱬", "鰧", "嬤", "譞", "諲", "脭", "悳", "崘", "阭", "內", "袾", "冚", "壐", "咗", "礠", "孮", "痲", "埈", "肹", "鰮", "鮓", "濊", "塜", "凜", "蒢", "噰", "桼", "峍", "焴", "鶒", "鋮", "綠", "鶹", "熿", "毴", "咟", "嘥", "睺", "繡", "郎", "瘞", "鉶", "蔎", "秠", "緤", "蝀", "躝", "蟜", "繃", "囮", "墫", "乭", "胊", "濙", "瘓", "榣", "鑛", "鐫", "嶴", "甹", "坮", "銾", "蒭", "睜", "俋", "餠", "榢", "蓳", "盋", "堷", "鍏", "苝", "巛", "蚵", "暏", "熤", "嬨", "墎", "鏽", "戶", "菺", "膮", "熖", "睪", "栜", "捱", "榗", "鍷", "曧", "犽", "韑", "袓", "䖝", "焄", "喦", "髲", "疌", "㴪", "侊", "貐", "蕅", "禠", "蕑", "囯", "暊", "儞", "佋", "柎", "㐱", "鰤", "苳", "鱥", "謤", "遶", "眀", "鑀", "羋", "顏", "陜", "銩", "黶", "苼", "蒤", "棛", "儫", "咁", "抦", "衚", "棩", "焿", "脫", "麅", "玏", "埧", "淸", "黁", "淽", "彠", "鮨", "沜", "糀", "厓", "楧", "嶌", "簹", "檵", "鱇", "嶬", "廸", "卽", "樀", "贌", "酼", "籛", "沒", "晸", "諪", "蕡", "妏", "鄋", "蒍", "奧", "抇", "蓨", "薆", "鱷", "巘", "䝉", "亰", "寈", "槩", "誒", "麴", "蕟", "溎", "蘗", "榦", "斿", "暟", "炲", "拚", "娖", "繖", "橚", "寜", "爀", "饟", "悅", "鯏", "彜", "眾", "葯", "嬝", "埮", "獇", "馛", "溙", "瀦", "熼", "硓", "鈢", "樆", "輬", "鰜", "蔘", "渙", "澔", "嗮", "旉", "籜", "媊", "燘", "儚", "頹", "缽", "俽", "逨", "鱓", "郞", "歊", "杴", "珡", "杋", "醁", "鰏", "鵾", "鐽", "鮋", "巶", "荅", "薾", "囓", "蹻", "獎", "禑", "鎓", "榲", "僴", "綞", "尓", "敭", "曔", "褔", "鬅", "亊", "鏦", "蓘", "裬", "鱲", "薡", "鰗", "箑", "鬪", "縂", "璸", "甙", "茮", "辵", "岻", "覿", "滈", "鯶", "鑂", "囶", "舺", "溋", "拋", "菾", "敾", "虨", "綝", "蝍", "醂", "禨", "賹", "廧", "絕", "槗", "徫", "鎔", "曮", "蠂", "捒", "堈", "莕", "蓪", "敎", "禃", "櫱", "綧", "瀶", 
"逌", "浤", "碻", "刄", "逤", "剏", "氹", "菈", "娫", "蜛", "嵗", "糎", "螶", "譓", "鏳", "嵙", "瑊", "隲", "檨", "緈", "畵", "砯", "簗", "彅", "鰺", "騋", "窶", "嚒", "嵻", "尙", "頵", "槰", "虉", "醞", "巂", "彔", "偊", "畇", "鱨", "妸", "塲", "畐", "鈫", "錟", "磪", "摠", "彥", "璙", "囝", "寗", "耎", "鮡", "蘓", "弅", "焃", "飥", "戙", "塰", "儱", "槺", "噏", "魟", "禵", "佧", "咘", "盪", "瑈", "鉲", "睭", "鏌", "鼇", "郋", "魮", "朖", "滽", "渃", "滙", "熯", "醿", "鎅", "褀", "鬬", "巄", "螥", "眜", "釚", "柉", "壎", "峇", "姸", "唭", "鮜", "鈖", "嫈", "壄", "洤", "黃", "伕", "堦", "嶔", "鮰", "鞞", "漎", "鉓", "鮗", "壴", "阝", "妀", "矽", "獢", "倗", "銪", "鴓", "橒", "凈", "哖", "屚", "偍", "瑺", "媯", "淍", "驌", "椇", "赬", "薐", "糹", "碽", "濲", "釭", "晭", "纕", "寖", "閞", "歿", "呎", "鶆", "屄", "櫿", "犎", "旲", "㙟", "龎", "翜", "螾", "說", "衜", "泆", "軎", "鵂", "荎", "嚧", "硂", "桖", "褭", "筊", "鰷", "秳", "戩", "轀", "鬹", "飬", "卋", "暸", "狦", "搢", "娋", "鏴", "溫", "毉", "淰", "謩", "餺", "鵙", "鳽", "鮀", "狶", "氻", "轝", "妺", "袛", "蓭", "梂", "娛", "牼", "稅", "兿", "玾", "煚", "僩", "鶿", "鬄", "崠", "鉆", "鯓", "蚢", "庀", "鵟", "坣", "殼", "悞", "熅", "敻", "鍠", "曶", "愼", "搳", "姃", "砳", "槼", "臞", "韾", "靑", "鸊", "薲", "虛", "蠄", "啟", "鶺", "苺", "滾", "褞", "仺", "胇", "憻", "郳", "烉", "驩", "冇", "枖", "夌", "搵", "匸", "盨", "櫾", "霤", "麊", "貒", "噓", "嗢", "笩", "晈", "冂", "銳", "毿", "慜", "囧", "閜", "娸", "庢", "壆", "馯", "桱", "兗", "葃", "侅", "煐", "鐦", "藸", "鷎", "嵰", "逎", "弒", "匋", "鐭", "廔", "砩", "孆", "灴", "伷", "兪", "鴗", "澯", "幚", "旙", "勻", "礽", "婑", "鱮", "娍", "銶", "吳", "鍟", "仼", "鳧", "彞", "娽", "昛", "鰼", "剎", "佉", "鉏", "偸", "鰆", "讙", "橪", "啱", "岀", "孻", "釪", "乹", "鈳", "漇", "檦", "埻", "祿", "爌", "禇", "鱵", "㸃", "梉", "燝", "霙", "炁", "飮", "蠙", "勷", "鵎", "儥", "鐠", "唻", "廰", "嚿", "嵕", "墱", "紑", "搖", "瘜", "皝", "鸑", "瀁", "粵", "撚", "巑", "梀", "啯", "眛", "諴", "夊", "僙", "鍝", "裖", "鮣", "凬", "飡", "灊", "橓", "嫳", "筳", "咑", "粍", "瓑", "璌", "伃", "閰", "傜", "黐", "謢", "驒", "橫", "蛯", "寕", "蠵", "瞓", "旳", "翏", "硏", "寯", "韡", "楤", "鰃", "朿", "侞", "鵯", "愨", "祹", "厔", "丌", "盩", "謏", "魕", "啣", "閱", "曺", "枛", "罉", "卐", "樻", "鷉", "鯒", "鋡", "磱", "枱", "攴", "蠷", "穈", "嚟", "檽", "趐", "奐", "鋐", 
"檇", "薀", "峼", "咭", "訔", "韠", "鑴", "鸐", "唃", "捦", "鸜", "誴", "罳", "璄", "暃", "夀", "賨", "鞥", "鈊", "灡", "鮍", "懮", "籣", "昐", "陁", "襾", "鮠", "鈏", "囍", "婯", "艔", "貭", "䰾", "姁", "禼", "堖", "鋶", "仛", "鏷", "謜", "鑅", "忬", "蘶", "謠", "觙", "奫", "狟", "泩", "桙", "飈", "垰", "啍", "嚞", "鯕", "蒧", "榞", "徸", "璹", "揔", "欉", "魞", "菶", "玧", "鳯", "廍", "侚", "岰", "岧", "鋕", "凵", "彣", "崱", "媜", "倢", "鵐", "砋", "鷚", "鱠", "鮻", "繻", "摵", "贓", "磵", "錻", "痠", "粩", "胅", "奣", "塨", "瀠", "鸘", "啚", "娳", "霶", "壔", "峚", "甂", "廁", "覌", "鰂", "猳", "鱻", "盫", "裿", "杬", "歛", "澋", "蘞", "嵜", "尐", "旽", "鉌", "鎛", "豿", "凖", "榤", "禓", "龝", "悧", "鷟", "鮟", "吋", "喢", "岪", "吥", "漵", "頠", "豔", "巿", "鑨", "醣", "熳", "懍", "湥", "檡", "韺", "戱", "緖", "鐈", "凉", "緃", "鮹", "媐", "爯", "巆", "褍", "鐬", "昍", "扙", "鍳", "芛", "蟳", "嬅", "糬", "吔", "塭", "譿", "冧", "鏓", "嶪", "嗹", "椵", "姀", "閿", "褧", "錞", "玆", "笘", "篔", "萡", "鶡", "螐", "鮄", "鰟", "脷", "啲", "杤", "蓚", "尗", "娎", "殟", "淥", "蝚", "蓧", "彐", "嚤", "銍", "囒", "坶", "淩", "鶼", "鱂", "喼", "燫", "肏", "姵", "廌", "禟", "籝", "迵", "嵨", "堮", "蟌", "憍", "廕", "蜑", "緁", "唘", "竩", "崙", "璚", "粄", "栨", "罈", "梫", "貤", "藔", "蜯", "訁", "斖", "煶", "馦", "妠", "閟", "疕", "夆", "鎪", "膥", "澻", "嘢", "嚐", "靁", "鎻", "鰛", "穵", "烋", "縕", "褎", "疒", "壠", "溼", "圂", "咅", "鯭", "鯙", "磘", "玨", "珤", "朊", "蚼", "濶", "薞", "嚩", "丟", "嫺", "鯻", "椲", "鰕", "刂", "蠘", "踎", "瀴", "琁", "鰶", "瑴", "肜", "㐂", "欥", "媺", "竻", "讚", "𣇉", "裵", "緜", "廩", "齧", "叄", "俌", "厰", "滀", "錄", "鷫", "鯗", "攞", "姌", "蔝", "幷", "縤", "屻", "鯃", "雞", "纁", "嫲", "嵮", "屭", "嶃", "跩", "鋗", "蕢", "篊", "俬", "淎", "暻", "鏻", "憓", "玗", "溈", "笭", "糢", "勳", "閒", "沍", "咾", "鉷", "蘵", "俁", "崵", "毸", "苪", "掙", "鴡", "萭", "俴", "屜", "蒾", "艹", "剷", "慍", "朮", "枴", "氳", "猓", "甽", "箝", "譁", "贗", "迆", "鈽", "鍊", "鍰", "鏍", "靦", "餽", "丮", "丱", "仜", "仩", "伬", "伔", "仱", "伀", "伻", "佢", "佒", "侀", "侇", "佷", "佌", "佪", "侐", "侜", "俓", "侲", "俉", "侻", "侳", "俇", "倅", "倇", "倰", "倛", "倳", "倷", "俷", "倠", "偯", "偞", "偠", "偋", "偝", "偛", "偢", "偅", "偟", "偩", "偫", "傛", "傔", "傞", "傋", "傌", "傎", "傝", "偨", "傂", "傽", "傿", "僆", "傮", "僄", 
"僈", "傰", "僁", "傱", "僋", "僗", "僛", "僪", "僝", "僓", "僿", "儃", "儰", "僸", "僶", "僾", "儌", "僽", "儜", "儓", "儗", "儑", "儢", "儤", "儠", "儸", "儹", "儽", "冓", "冘", "冞", "凊", "凅", "凔", "刌", "刉", "刓", "刜", "刞", "刵", "刲", "剆", "刱", "剉", "剚", "剒", "剫", "剭", "剬", "剺", "剸", "剻", "剼", "劀", "劋", "劖", "劘", "劗", "劙", "劦", "勴", "匊", "匢", "匰", "匴", "匷", "匽", "卌", "卼", "厎", "厒", "厗", "厞", "厜", "厤", "厬", "厹", "吰", "吷", "吪", "呿", "咈", "呫", "呺", "呥", "呬", "呴", "茍", "咷", "咮", "咶", "哅", "咠", "咢", "唦", "唗", "唒", "哤", "唚", "唈", "哫", "唅", "唴", "啢", "唶", "啒", "啅", "唌", "唲", "喨", "喥", "喭", "噅", "喓", "喣", "啽", "喌", "嗃", "嗛", "嗋", "嗀", "喿", "喍", "嗏", "嗕", "嗈", "嘕", "嘒", "嗼", "嘐", "嘓", "嘂", "嗺", "嘝", "嘄", "嗿", "噈", "噊", "噆", "噚", "嘳", "嘽", "嘾", "噮", "噳", "噣", "噭", "噞", "嚌", "嚍", "嚃", "嚘", "嚜", "嚫", "嚪", "嚬", "嚲", "嚵", "嚽", "嚾", "囆", "囅", "囋", "囗", "圁", "圞", "圠", "坁", "坅", "坲", "坱", "垀", "坴", "垗", "垝", "垔", "垘", "垽", "垼", "埢", "埶", "堩", "堣", "塈", "堥", "塓", "塉", "塯", "塕", "塼", "墆", "塿", "塴", "墋", "塺", "墝", "墯", "壈", "墽", "壖", "壝", "壛", "壾", "壿", "夃", "夎", "夒", "夗", "奅", "奊", "奰", "奲", "奼", "妦", "妎", "妢", "妐", "妵", "姏", "姎", "㚷", "姡", "姺", "姼", "娭", "婐", "婟", "婥", "婓", "婗", "媔", "媟", "媢", "婸", "媦", "媥", "媬", "媕", "娷", "嫇", "嫋", "媰", "媻", "嫮", "嫥", "嫢", "嫛", "嫿", "嫴", "嫷", "嫶", "嬎", "嬓", "嬐", "嬲", "嬽", "孈", "屘", "孲", "孷", "宎", "宨", "寪", "寍", "寋", "寑", "寙", "寠", "寱", "尌", "尒", "尟", "尰", "尳", "屖", "屔", "屝", "屧", "屩", "屮", "屴", "岏", "岋", "岉", "岒", "岮", "岤", "岯", "岟", "岝", "峐", "峌", "峞", "峉", "峊", "峬", "峮", "峷", "崝", "崨", "崥", "崏", "崰", "崣", "崷", "嵃", "嵑", "崳", "崺", "嵂", "嵱", "嵣", "嵥", "嵞", "嶀", "嵽", "嶆", "嵺", "嵷", "嶊", "嶉", "嶈", "嵾", "嶕", "嶜", "嶡", "嶚", "嶞", "嶱", "嶩", "嶵", "嶭", "巃", "巏", "巕", "巟", "巹", "帊", "帗", "帟", "帣", "帠", "帤", "帩", "帾", "帴", "幏", "幎", "幓", "幩", "幝", "幠", "幧", "幨", "幦", "幭", "幰", "庂", "庉", "庌", "庈", "庰", "庛", "庣", "庨", "庮", "庪", "庬", "庴", "廅", "廇", "廘", "廗", "廎", "廜", "緳", "廦", "廥", "廮", "廯", "蠯", "廾", "弚", "弝", "弣", "弤", "弮", "弳", "彃", "彉", "彋", "彏", "彯", "彴", "彸", "彾", "徦", "徥", "徯", "徲", 
"徾", "徿", "忀", "忁", "忔", "忕", "忨", "忣", "忷", "忥", "怭", "怲", "怋", "怴", "怗", "怚", "怞", "怬", "怢", "怐", "怮", "怓", "怷", "怹", "恲", "恞", "恅", "恇", "恉", "恛", "恌", "恀", "恟", "悀", "悁", "悕", "悗", "悇", "悊", "悐", "悾", "悺", "惓", "惤", "惈", "悷", "惉", "悹", "惌", "惢", "惄", "愊", "愖", "愅", "惵", "愓", "惸", "惼", "惾", "慉", "慅", "愶", "愲", "愮", "愯", "愬", "慁", "慞", "慱", "慒", "慓", "慲", "憀", "慴", "慔", "慺", "慛", "憃", "慹", "憱", "憰", "憢", "憉", "憛", "憯", "憟", "憪", "憡", "憝", "憖", "懅", "憴", "懆", "懁", "憿", "憸", "憵", "憼", "懧", "懠", "懥", "懤", "懘", "懭", "懱", "懪", "懰", "懫", "懻", "戁", "戃", "戄", "戉", "戠", "酨", "戺", "扐", "扜", "扤", "扡", "扢", "抆", "抌", "抎", "抏", "扻", "抭", "抴", "拑", "抾", "抪", "抶", "抮", "挍", "挋", "挃", "拫", "拹", "挏", "挌", "拸", "挀", "拲", "捖", "挬", "挶", "揤", "捊", "挼", "挩", "捁", "挴", "捘", "捔", "捥", "掝", "掗", "掫", "掯", "捵", "掜", "捼", "掤", "掔", "掱", "揎", "揥", "揨", "揯", "揊", "揲", "揵", "摡", "揟", "揝", "揜", "揘", "揅", "揱", "搆", "搟", "搕", "搘", "搹", "搷", "搣", "搰", "搊", "搚", "摀", "搧", "搫", "摍", "摝", "摲", "摦", "摎", "摋", "摓", "摐", "摿", "摮", "摰", "撢", "撠", "撗", "撜", "撋", "撊", "撌", "撟", "擗", "擖", "擏", "擉", "撽", "擩", "擣", "擫", "擭", "擨", "擽", "擸", "攇", "攐", "攍", "攌", "攗", "攕", "攓", "攡", "攠", "攦", "攩", "攭", "攲", "攳", "敁", "敊", "敆", "敓", "敧", "敪", "敤", "敜", "敯", "敳", "敶", "敺", "敹", "敿", "斁", "斀", "斄", "斒", "斔", "斞", "斨", "斪", "斻", "旍", "旓", "旚", "旝", "旟", "昲", "昦", "昢", "晇", "晥", "晜", "晼", "晬", "暀", "暆", "暍", "暋", "暡", "暰", "暩", "曀", "曊", "曋", "曏", "曒", "曚", "曣", "曭", "朁", "朅", "朄", "朒", "朘", "朣", "朾", "朹", "朻", "朼", "杅", "杇", "杝", "杗", "枎", "杶", "枆", "枌", "柲", "枺", "枻", "柸", "柀", "柅", "柫", "柤", "柍", "柮", "柣", "柂", "柧", "栚", "桋", "桏", "栱", "栵", "栫", "栭", "栯", "栘", "栔", "梡", "梇", "梐", "桭", "梮", "楖", "梬", "梩", "桵", "梒", "椌", "椄", "棜", "棷", "棳", "棌", "椈", "楰", "棯", "椔", "棸", "楟", "楎", "楱", "楅", "楺", "楈", "楛", "楉", "楬", "椳", "楀", "楄", "楶", "楘", "榶", "槉", "榠", "榬", "榼", "榙", "榩", "榾", "榯", "槄", "榽", "榹", "槥", "槸", "樕", "樠", "槬", "槢", "樛", "樝", "槾", "樧", "槮", "樔", "槷", "橀", "樴", "橉", "橧", "樲", "橨", "橝", "橭", "橶", "樿", "橁", 
"檍", "檖", "檁", "檟", "橾", "檛", "檓", "檕", "檃", "櫅", "檹", "櫡", "櫠", "櫌", "櫑", "櫙", "櫋", "櫜", "櫐", "櫫", "櫬", "櫰", "櫹", "櫺", "櫼", "欃", "欋", "欈", "欐", "欑", "欘", "欨", "欴", "欯", "欭", "欱", "欶", "欳", "欷", "欿", "歂", "歈", "歍", "歋", "歕", "歔", "歜", "歠", "歭", "歾", "肂", "殈", "殏", "殔", "殗", "殙", "殠", "殥", "殢", "殦", "殧", "殰", "殶", "毃", "毄", "毈", "毇", "毊", "毚", "毞", "毦", "毤", "毨", "毣", "毰", "毲", "毻", "毼", "毾", "氁", "氀", "氄", "氠", "氶", "汃", "汒", "汏", "汍", "汸", "沋", "汱", "汯", "沕", "汦", "汳", "泬", "沶", "沬", "泧", "沷", "泭", "泲", "泒", "沴", "洟", "洊", "洀", "浺", "浶", "洍", "涒", "浘", "浢", "涊", "涆", "浧", "涗", "涳", "涬", "淢", "涷", "淔", "渀", "淈", "涾", "淊", "涽", "淭", "湆", "湇", "湅", "湢", "渿", "湁", "渜", "渳", "湀", "渻", "渮", "湨", "湡", "渱", "渨", "湠", "湱", "湩", "渹", "溛", "滖", "溓", "溔", "滒", "溰", "溾", "滜", "滵", "滱", "漃", "漥", "漮", "潎", "漙", "漧", "漘", "漒", "滭", "漊", "潳", "滮", "潀", "漰", "潃", "漅", "濆", "澒", "澅", "潚", "潠", "澖", "潶", "潬", "潒", "潐", "潗", "澓", "潝", "濇", "濎", "濈", "濄", "澞", "澨", "瀄", "濌", "澩", "濴", "濔", "濣", "濭", "濧", "濦", "瀇", "瀎", "濿", "瀀", "濻", "瀙", "瀖", "瀫", "瀡", "瀢", "瀩", "瀯", "瀷", "灂", "瀸", "瀿", "瀺", "灄", "灉", "灖", "灗", "灛", "灟", "灨", "灩", "灪", "炾", "炰", "烓", "烑", "缹", "焍", "烰", "焠", "焮", "焣", "煆", "煣", "煝", "熐", "熉", "熀", "熂", "熚", "燅", "燂", "熸", "燀", "燡", "爁", "爊", "爂", "爓", "爞", "爢", "爣", "牄", "牉", "牋", "牏", "牣", "牬", "牰", "牸", "牷", "犈", "犉", "犆", "犅", "犌", "犑", "犐", "犗", "犕", "犓", "犘", "犚", "犝", "犞", "犥", "犦", "犤", "犣", "犩", "犪", "犮", "犵", "犿", "狆", "狖", "狋", "狘", "狜", "狔", "狚", "狌", "狑", "狊", "狤", "狫", "狪", "狣", "猀", "狾", "猑", "猘", "猈", "狿", "猏", "猋", "猒", "猧", "猲", "猭", "猦", "猣", "猵", "猼", "獂", "獀", "獊", "獑", "獌", "獘", "獞", "獟", "獝", "獛", "獡", "獩", "獦", "獥", "獳", "獶", "獽", "獿", "玂", "玁", "玈", "玊", "玔", "珓", "珶", "琖", "瑵", "璊", "瑽", "璅", "瑿", "璗", "瓁", "瓋", "瓝", "瓟", "瓡", "瓥", "瓨", "瓬", "瓵", "瓾", "瓽", "甀", "甃", "甈", "甋", "甐", "甒", "甔", "甖", "甝", "甮", "甿", "畟", "畣", "畽", "疀", "疧", "痁", "疻", "痀", "痎", "痏", "痋", "痌", "痑", "痚", "痡", "痝", "痗", "痯", "瘏", "痷", "痸", "痻", "瘈", "瘑", "瘝", "瘣", "瘯", 
"瘱", "瘽", "癈", "癉", "癙", "癐", "癓", "癠", "癵", "癹", "皊", "皏", "皫", "皯", "皵", "皻", "皽", "皾", "盄", "盓", "盝", "盬", "盭", "盳", "眃", "眅", "盻", "眝", "眐", "眓", "眒", "眣", "眑", "眕", "眹", "眱", "眲", "眴", "眳", "眽", "睆", "睅", "睊", "睋", "睌", "睕", "睟", "睒", "睖", "睩", "睧", "睔", "瞁", "睼", "瞂", "睮", "睯", "瞏", "瞉", "瞚", "瞝", "瞡", "瞛", "瞲", "瞷", "瞶", "瞴", "矂", "矉", "矊", "矌", "矎", "矏", "矐", "矔", "矕", "矘", "矠", "矱", "矲", "矹", "矺", "砅", "砐", "砏", "砎", "砨", "硈", "硉", "硠", "硥", "硱", "硰", "硩", "碔", "碄", "碅", "碆", "硾", "碫", "碞", "磍", "磌", "磎", "磈", "磃", "磝", "磩", "磥", "磞", "磛", "磳", "磼", "磿", "礔", "礉", "礝", "礛", "礜", "礥", "礣", "礧", "礨", "礭", "礿", "祌", "祅", "祔", "祒", "祑", "祤", "祩", "祪", "祣", "祫", "祡", "祴", "祳", "禂", "禗", "禜", "禫", "禭", "禬", "禴", "禷", "禸", "歶", "秅", "秏", "秖", "秎", "秮", "秪", "秺", "秶", "稊", "稒", "稫", "穊", "稰", "稯", "穋", "穛", "穖", "穧", "穨", "穮", "穬", "穭", "穱", "穾", "窆", "窉", "窌", "窏", "窔", "窐", "窙", "窢", "窞", "窫", "窲", "窴", "窱", "窾", "竀", "竁", "竷", "笐", "笓", "笅", "笵", "笻", "笴", "笰", "笢", "笝", "笲", "筄", "筡", "箈", "箊", "箌", "箛", "箎", "箘", "箄", "箷", "箾", "篎", "箯", "箹", "篞", "篣", "篧", "篕", "篨", "篹", "簅", "篲", "篿", "篻", "簎", "篴", "簂", "簁", "篸", "篽", "簜", "簩", "簙", "簭", "簦", "簨", "簢", "簥", "簳", "簼", "簬", "簻", "籉", "籈", "籊", "籔", "籗", "籧", "籦", "籯", "籺", "籸", "籹", "粊", "粔", "粻", "糔", "糪", "糱", "糷", "紎", "紟", "紒", "紽", "紸", "紶", "紩", "絇", "紾", "絘", "絯", "絓", "絧", "絏", "絭", "絫", "綀", "綍", "絿", "綅", "絻", "絼", "綔", "綷", "緂", "綪", "緀", "緅", "緎", "緆", "緌", "綯", "綼", "緷", "緛", "緪", "緧", "縃", "緺", "緶", "緰", "縗", "縌", "縓", "縎", "縜", "縚", "縏", "縼", "繂", "縳", "顈", "繈", "縸", "縪", "繉", "繀", "縩", "緵", "縰", "縿", "縶", "繜", "繐", "繣", "繘", "繢", "繟", "繑", "繠", "繶", "繵", "繸", "繷", "繺", "繲", "繴", "纀", "纇", "纋", "纆", "纑", "纗", "纚", "缿", "罊", "罏", "罜", "罞", "罝", "罛", "罣", "罥", "罦", "罭", "罫", "罬", "罻", "罼", "罺", "罿", "羃", "羉", "羍", "羒", "羜", "羛", "羢", "羠", "羦", "羬", "羭", "羵", "羳", "羷", "羺", "羾", "翋", "翍", "翐", "翑", "翇", "翢", "翣", "翭", "翪", "翨", "翴", "翲", "翽", "翿", "耟", "耞", "耡", "耴", "耾", "耹", "聇", "聈", "聑", "聏", 
"聝", "肕", "肙", "肒", "肣", "肵", "胘", "胑", "胐", "胕", "胉", "胏", "胹", "胵", "脁", "胻", "脀", "胾", "胔", "脰", "脥", "脤", "脙", "脡", "脕", "脧", "腃", "腏", "腄", "腇", "脽", "腍", "腤", "腷", "腜", "腛", "腢", "腲", "朡", "腞", "腶", "膉", "膆", "膃", "膇", "膍", "膌", "膋", "膟", "膕", "膢", "膱", "膹", "膫", "膰", "膬", "膴", "膲", "臇", "膷", "臄", "臅", "臒", "臐", "臗", "臛", "臡", "臦", "臩", "臮", "臲", "臷", "臸", "臿", "舋", "舑", "舕", "舝", "舡", "舼", "舽", "艀", "艂", "艓", "艒", "艐", "艑", "艕", "艛", "艵", "艼", "芀", "芐", "芅", "芓", "芔", "苀", "芚", "芵", "芧", "芞", "芺", "苙", "苨", "苖", "苬", "苲", "苵", "苶", "茙", "茥", "茿", "茦", "茢", "荂", "茪", "荍", "茖", "茤", "茠", "茩", "茻", "莐", "莣", "莍", "荺", "莤", "荴", "莏", "莁", "荵", "莔", "莃", "莌", "莋", "荾", "莥", "菨", "萒", "菧", "菤", "菆", "菣", "菿", "菋", "菎", "菵", "萉", "菞", "菳", "菕", "蓱", "萿", "葹", "葥", "葀", "葧", "萰", "葍", "葽", "蔇", "葞", "萷", "萺", "萴", "葅", "菙", "葋", "萯", "葂", "葟", "葌", "蓎", "蒬", "蒮", "蒫", "蒪", "蒚", "蒝", "蓌", "蒛", "蒩", "蒘", "蒶", "蒠", "蔤", "蔏", "蔩", "蔉", "蔍", "蔧", "蔜", "蓻", "蓺", "蓴", "蔪", "蓲", "蓷", "蓫", "蔒", "蓩", "蔖", "蓾", "蔨", "蔮", "蔂", "蓶", "蔱", "蓹", "蔠", "蔰", "蕫", "蕍", "蕀", "蕆", "蕄", "蕇", "蕣", "蕛", "蕱", "蕵", "蕮", "蕧", "蕠", "蕦", "蕝", "薃", "薧", "薕", "薠", "薋", "薣", "薚", "蕼", "薉", "蕸", "薎", "薖", "薍", "薝", "薂", "藆", "藀", "藃", "藂", "薵", "薽", "藇", "藄", "藋", "藈", "藅", "薱", "薶", "藒", "藫", "藱", "藙", "藡", "藚", "藗", "藲", "藬", "藘", "藣", "藑", "藰", "蘁", "藾", "蘛", "蘉", "蘌", "蘪", "蘦", "蘟", "蘣", "蘜", "蘙", "蘮", "蘡", "蘠", "蘥", "蘴", "蘳", "蘬", "虀", "蘹", "蘱", "蘻", "蘾", "虃", "虆", "虇", "虈", "虌", "虋", "虙", "虡", "虣", "虩", "虪", "虰", "虭", "虴", "蚑", "蚞", "蚇", "蚗", "蚚", "蚅", "蚥", "蚙", "蚿", "蚷", "蛂", "蛁", "蛅", "蛈", "蚹", "蚳", "蚸", "蛌", "蚻", "蛢", "蛦", "蛓", "蛣", "蛚", "蛪", "蛝", "蛫", "蛜", "蛬", "蛗", "蜄", "蛷", "蜌", "蛖", "蛵", "蜁", "蛶", "蜳", "蝫", "蜙", "蝃", "蜬", "蝁", "蝆", "蜠", "蜲", "蜪", "蜭", "蜼", "蜵", "蝂", "蜦", "蜧", "蜸", "蜤", "蜰", "蝖", "蝷", "蟡", "蝳", "蝔", "蝛", "蝒", "蝑", "蝞", "蝭", "蝪", "蝐", "蝝", "蝬", "蝺", "蝜", "螛", "螏", "螓", "螒", "螁", "螖", "螘", "蝹", "螇", "螑", "螝", "螜", "螚", "螪", "螰", "螹", "螼", "螮", "蟉", "蟃", "蟂", "螷", "螴", "螿", 
"螸", "蟞", "蟧", "蟦", "蟢", "蟟", "蟤", "蟔", "蟓", "蟭", "蟘", "螤", "蟗", "蟙", "蠁", "蟨", "蠀", "蟺", "蠉", "蠌", "蟼", "蠈", "蟿", "蠗", "蠩", "蠝", "蠛", "蠠", "蠤", "蠜", "蠫", "蠬", "蠨", "蠦", "蠪", "蠥", "蠰", "蠮", "蠳", "蠸", "蠾", "蠽", "蠿", "衁", "衈", "衋", "衧", "衪", "衭", "衶", "袀", "衱", "衯", "袃", "袉", "袕", "袨", "袚", "袑", "袡", "袘", "袧", "袬", "袌", "袺", "裗", "袹", "袸", "裀", "袶", "袽", "袲", "裋", "裍", "裞", "裚", "裷", "裧", "裺", "裮", "裶", "裯", "裻", "褁", "褅", "褋", "褗", "褆", "褖", "褑", "褦", "褮", "褱", "褢", "褩", "褵", "褼", "褾", "襒", "褷", "襂", "褽", "襓", "襋", "襆", "襐", "襛", "襗", "襡", "襘", "襝", "襣", "襭", "襩", "襮", "襳", "襹", "襺", "覂", "覅", "覕", "覛", "覝", "覢", "覤", "覣", "覭", "覮", "覶", "觓", "觤", "觡", "觠", "觢", "觩", "觰", "觬", "觲", "觷", "觺", "觻", "觼", "觾", "訑", "訰", "訧", "訬", "訞", "詍", "訹", "詙", "詀", "詄", "詅", "訿", "誂", "詻", "誃", "誫", "誙", "誋", "諆", "誸", "諔", "諕", "誻", "諀", "諅", "諵", "諝", "諰", "諈", "謞", "謘", "謑", "謋", "謒", "謕", "謍", "謈", "謪", "謧", "謣", "謰", "謵", "譇", "謯", "謱", "謥", "謷", "謦", "譐", "譈", "譊", "譀", "譋", "譕", "譑", "譠", "譪", "譝", "譨", "譣", "譥", "譹", "譸", "譅", "譺", "譻", "譾", "讄", "讂", "讆", "讋", "讔", "讘", "讟", "谹", "谻", "谽", "谾", "豃", "豋", "豍", "豏", "豗", "豜", "豝", "豟", "豥", "豤", "豦", "豭", "豰", "豲", "豱", "豯", "豵", "豷", "豶", "豻", "豽", "貁", "貀", "貄", "貏", "貑", "貕", "貙", "貗", "貜", "貣", "貾", "賌", "賥", "賟", "賙", "賵", "賮", "贆", "贕", "贙", "赨", "赩", "赮", "赸", "趀", "趌", "趎", "趏", "趍", "趓", "趠", "趜", "趡", "趥", "趧", "趬", "趪", "趭", "趫", "趮", "趷", "趹", "跘", "跓", "跍", "跇", "跜", "跕", "跙", "跈", "跰", "跠", "跮", "跦", "跢", "跧", "跲", "跫", "踂", "跿", "踍", "踃", "踇", "踆", "跾", "踠", "踥", "踤", "踡", "踕", "踛", "踖", "踑", "踙", "踧", "踘", "踓", "踳", "踾", "踸", "踼", "蹎", "蹍", "蹓", "蹗", "蹖", "蹞", "蹥", "蹛", "蹡", "蹝", "蹔", "蹸", "蹳", "蹪", "躆", "躈", "躖", "躗", "躟", "躠", "躤", "躣", "躩", "躨", "躽", "軓", "軘", "軞", "軯", "軷", "軦", "軮", "軥", "軵", "軧", "軨", "軶", "軱", "軬", "輆", "軿", "輁", "輀", "輂", "輐", "輑", "輤", "輘", "輚", "輠", "輣", "輖", "輗", "輮", "輵", "輲", "輹", "輷", "輴", "轃", "轇", "轈", "轒", "轑", "轏", "轐", "轓", "轙", "轖", "轗", "轕", "轚", "轞", "轛", "轠", "辴", "迉", 
"迒", "迋", "迍", "迖", "迣", "迡", "迾", "迿", "逜", "逿", "遝", "遳", "遰", "遻", "邆", "邅", "遾", "邍", "邔", "邟", "邥", "邞", "邧", "郱", "郕", "郖", "郠", "郙", "郣", "郥", "郘", "郰", "郲", "郔", "鄬", "郼", "鄈", "郹", "郻", "鄁", "鄇", "郺", "鄐", "鄍", "鄏", "鄎", "鄟", "鄝", "鄡", "鄛", "鄨", "鄪", "鄦", "鄮", "鄵", "鄸", "鄻", "鄾", "酀", "酁", "酄", "酇", "酖", "酘", "酓", "酟", "酳", "醆", "醊", "醓", "醙", "醟", "醥", "醧", "醰", "醱", "醷", "醲", "醳", "醹", "醽", "釂", "釃", "釢", "釱", "釳", "釸", "鈚", "鈌", "鈒", "釽", "鈆", "鉒", "鉠", "鉯", "鈶", "鉼", "銤", "銛", "銔", "鉹", "銗", "鋄", "鋀", "鋟", "鋘", "鋩", "鋝", "鋂", "鋊", "錧", "錼", "錭", "錎", "鋋", "鎡", "鎃", "鎯", "鍖", "鍜", "鍐", "鍭", "鍌", "鎒", "鎷", "鎝", "鎉", "鎎", "鎞", "鏏", "鏂", "鏚", "鏬", "鏙", "鐋", "鐏", "鏾", "鐕", "鐨", "鐍", "鐀", "鐎", "鐖", "鐻", "鐶", "鑐", "鑋", "鑕", "鑮", "鑯", "钂", "钀", "钁", "钃", "镺", "镻", "镼", "镽", "閈", "閍", "閺", "閵", "闀", "闉", "闅", "閷", "闒", "闑", "闚", "闛", "闠", "闟", "闤", "阞", "阢", "阤", "阠", "阰", "阹", "阸", "阺", "陏", "陓", "陊", "陼", "陭", "陫", "隇", "陾", "隉", "隒", "隓", "隞", "隤", "隿", "雂", "雈", "雓", "雔", "雗", "雚", "雟", "雘", "雺", "雽", "雿", "霂", "霋", "霒", "霐", "霠", "霣", "霢", "霩", "霫", "霬", "霮", "霵", "霿", "靆", "靃", "靪", "靮", "靷", "靲", "靾", "鞃", "鞀", "鞂", "靻", "鞊", "鞎", "鞈", "鞙", "鞗", "鞚", "鞜", "鞤", "鞪", "鞷", "鞶", "鞹", "鞻", "鞿", "韄", "韅", "韇", "韎", "韐", "韏", "韕", "韔", "韗", "韝", "韟", "韣", "韥", "韰", "韱", "韹", "韽", "頄", "頖", "頞", "頝", "頩", "頨", "頯", "頲", "顁", "顄", "顊", "顉", "顅", "顐", "顑", "顜", "顝", "顠", "顣", "顟", "顤", "顪", "顩", "顲", "颬", "颲", "颸", "颽", "颻", "颾", "飁", "飂", "飉", "飋", "飌", "飣", "飶", "餂", "餀", "飺", "餔", "餖", "餕", "餤", "餟", "餥", "餫", "餪", "餲", "餯", "餭", "餱", "餰", "饁", "饇", "饐", "饎", "饙", "饘", "饛", "饡", "馣", "馲", "馰", "馵", "馻", "馺", "駂", "馽", "駜", "駍", "駏", "駎", "駖", "駮", "駬", "駥", "駤", "駣", "駩", "駺", "駴", "駷", "駹", "駶", "駻", "駽", "駾", "騃", "騉", "騑", "騊", "騇", "騚", "騕", "騥", "騝", "騛", "騢", "騠", "騧", "騞", "騜", "騵", "騲", "騴", "騱", "騬", "騪", "騩", "騹", "騽", "驆", "騺", "驓", "驔", "驈", "驉", "驖", "驞", "驠", "驦", "驨", "骭", "骫", "骹", "骿", "骴", "骾", "髇", "髊", "髆", "髍", "髐", "髟", "髧", "髬", "髳", "髶", 
"髺", "髾", "鬁", "髼", "鬋", "鬊", "鬎", "鬌", "鬐", "鬕", "鬗", "鬖", "鬙", "鬞", "鬠", "鬤", "鬫", "鬳", "鬵", "鬺", "鬾", "鬿", "魊", "魌", "魖", "魠", "魡", "魧", "魱", "魦", "魶", "魵", "鮅", "鮇", "魼", "魾", "魻", "鮂", "鮚", "鮞", "鮛", "鮦", "鮥", "鮤", "鮆", "鯆", "鮿", "鮵", "鯈", "鯫", "鯠", "鯞", "鯦", "鯬", "鰌", "鰋", "鰅", "鯸", "鰫", "鰝", "鰬", "鱆", "鰿", "鱄", "鱁", "鰴", "鱐", "鱍", "鱋", "鱕", "鱦", "鱢", "鱞", "鱴", "鱳", "鱹", "鳦", "鳪", "鳭", "鳱", "鳵", "鳼", "鳺", "鳿", "鳷", "鴀", "鳹", "鳻", "鴅", "鴃", "鴥", "鴠", "鴔", "鴩", "鴘", "鴢", "鴐", "鴳", "鵁", "鵧", "鴶", "鴮", "鴱", "鴸", "鵅", "鵃", "鴾", "鵀", "鴽", "鵏", "鵊", "鵛", "鵋", "鵖", "鵌", "鵗", "鵔", "鵷", "鶁", "鶊", "鶄", "鶈", "鵱", "鶀", "鵸", "鶋", "鶌", "鵽", "鵫", "鵴", "鵩", "鶅", "鵳", "鵻", "鶂", "鵹", "鶟", "鶙", "鶤", "鶝", "鶐", "鶛", "鶠", "鶔", "鶜", "鶪", "鶗", "鶢", "鶨", "鶞", "鶣", "鶖", "鶷", "鶶", "鷁", "鷇", "鷊", "鷏", "鶾", "鷅", "鷃", "鶵", "鷈", "鶱", "鶭", "鷛", "鷒", "鷞", "鷋", "鷐", "鷜", "鷑", "鷩", "鷘", "鷖", "鷵", "鷕", "鷻", "鷷", "鷣", "鷤", "鷶", "鷡", "鷮", "鷢", "鸂", "鷾", "鸇", "鸃", "鸆", "鸅", "鸀", "鸁", "鸉", "鷿", "鷽", "鸄", "鸋", "鸍", "鸏", "鸒", "鸔", "鸓", "鸗", "鸙", "鹺", "麃", "麆", "麉", "麎", "麌", "麔", "麙", "麛", "麚", "麜", "麠", "麡", "麧", "麮", "麰", "麶", "麷", "黀", "黂", "黈", "黓", "黕", "黖", "黚", "黤", "黫", "黮", "黭", "黰", "黳", "黵", "黺", "鼁", "鼀", "鼆", "鼊", "鼏", "鼖", "鼛", "鼘", "鼜", "鼤", "鼣", "鼥", "鼪", "鼨", "鼭", "鼰", "鼮", "鼵", "鼳", "鼲", "鼸", "鼶", "齀", "齂", "齃", "齌", "齍", "齎", "齖", "齗", "齘", "齛", "齠", "齞", "齝", "齥", "齤", "齫", "齱", "齰", "齮", "齯", "齴", "齵", "齸", "齻", "齺", "齹", "齾", "龒", "龤", "堔", "礂", "蒏", "蒆", "兙", "兛", "兞", "兝", "兡", "兣", "嗧", "瓩", "忼", "擡", "氊", "穇", "擧", "譌", "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "<", "=", ">", "?", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "]", "_", "`", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "©", "°", "²", "´", "½", 
"Á", "Ä", "Å", "Ç", "È", "É", "Í", "Ó", "Ö", "×", "Ü", "ß", "à", "á", "â", "ã", "ä", "å", "æ", "ç", "è", "é", "ê", "ë", "í", "ð", "ñ", "ò", "ó", "ô", "õ", "ö", "ø", "ú", "û", "ü", "ý", "ā", "ă", "ą", "ć", "Č", "č", "đ", "ē", "ė", "ę", "ğ", "ī", "ı", "Ł", "ł", "ń", "ň", "ō", "ř", "Ş", "ş", "Š", "š", "ţ", "ū", "ż", "Ž", "ž", "Ș", "ș", "ț", "Δ", "α", "λ", "μ", "φ", "Г", "О", "а", "в", "л", "о", "р", "с", "т", "я", "ồ", "—", "―", "’", "“", "”", "…", "℃", "→", "∇", "−", "■", "☆", "、", "。", "々", "〆", "〈", "〉", "「", "」", "『", "』", "〔", "〕", "〜", "!", "#", "%", "&", "(", ")", "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "=", "?", "@", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "R", "S", "T", "U", "V", "W", "X", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "~", "・", "ǎ", "ǒ", "ě", "ǐ", "ì", "ǔ", "ù", "ǖ", "ǘ", "ǚ", "ǜ", "【", "】", "《", "》", "‥", "{", "}", "\\", "|", "@", "^", "~", "÷", "∕", "∙", "⋅", "·", "⊕", "⊖", "⊗", "⊘", "⊙", "±", "∓", "∩", "∪", "□", "⊎", "⊓", "⊔", "≠", "≈", "≡", "≤", "≥", "≪", "≫", "≲", "≳", "≶", "≷", "≺", "≻", "≼", "≽", "∈", "∉", "⊂", "⊃", "⊆", "⊇", "⊄", "⊅", "∅", "∖", "∁", "∆", "∧", "∨", "¬", "⊻", "⊼", "⊽", "←", "↔", "⇒", "⇐", "⇔", "∀", "∃", "∄", "∴", "∵", "∝", "∞", "⊥", "∟", "∠", "∡", "∢", "′", "″", "∥", "⊾", "⊿", "∂", "∫", "∬", "∭", "∮", "∯", "∰", "∑", "∏", "√", "∛", "∜", "∱", "∲", "∳", "∶", "∷", "∼", "®", "≄", "≅", "≃", "≦", "≧", "⊈", "⊉", "⊢", "⊤", "⊨", "⊧", "℉", "Ω", "℧", "Å", "⌀", "ℏ", "⅀", "⍺", "⍵", "¢", "€", "£", "¥", "¥", "₿", "↑", "↓", "↕", "↖", "↗", "↘", "↙", "↺", "↻", "↼", "↽", "↾", "↿", "⇀", "⇁", "⇂", "⇃", "⇋", "⇌", "ª", "º", "⁰", "¹", "³", "⁴", "⁵", "⁶", "⁷", "⁸", "⁹", "⁺", "⁻", "⁼", "⁽", "⁾", "ⁿ", "₀", "₁", "₂", "₃", "₄", "₅", "₆", "₇", "₈", "₉", "₊", "₋", "₌", "₍", "₎", "Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ", "ⅰ", "ⅱ", "ⅲ", "ⅳ", "ⅴ", "ⅵ", "ⅶ", "ⅷ", 
"ⅸ", "ⅹ", "ⅺ", "ⅻ", "☰", "☱", "☲", "☳", "☴", "☵", "☶", "☷", "♀", "♂", "♳", "♴", "♵", "♶", "♷", "♸", "♹", "♺", "♩", "♪", "♫", "♬", "⚪", "⚫", "⚬", "✶", "✷", "✸", "➀", "➁", "➂", "➃", "➄", "➅", "➆", "➇", "➈", "➉", "➊", "➋", "➌", "➍", "➎", "➏", "➐", "➑", "➒", "➓", "⏀", "⏁", "⏂", "⏃", "⏄", "⏅", "⏆", "⏇", "⏈", "⏉", "⏊", "⏋", "⏌", "⏚", "⏴", "⏵", "⏶", "⏷", "⏸", "⏹", "⏺", "⏻", "⏼", "Α", "Β", "Γ", "Ε", "Ζ", "Η", "Θ", "Ι", "Κ", "Λ", "Μ", "Ν", "Ξ", "Ο", "Π", "Ρ", "Σ", "Τ", "Υ", "Φ", "Χ", "Ψ", "β", "γ", "δ", "ε", "ζ", "η", "θ", "ι", "κ", "ν", "ξ", "ο", "π", "ρ", "σ", "τ", "υ", "χ", "ψ", "ω", "ϐ", "ϑ", "ϒ", "ϕ", "█", "ϖ", "ϰ", "ϱ", "ϴ", "ϵ", "ϝ", "Ϟ", "ϟ", "Ϡ", "ϡ", "Ϣ", "ϣ", "Ϥ", "ϥ", "Ϧ", "ϧ", "Ϩ", "ϩ", "Ϫ", "ϫ", "Ϭ", "ϭ", "Ϯ", "ϯ", "∸", "∹", "∺", "∻", "∽", "∾", "∿", "≀", "≁", "≂", "≆", "≇", "≉", "≊", "≋", "≌", "≍", "≎", "≏", "≐", "≑", "≒", "≓", "≔", "≕", "≖", "≗", "≘", "≙", "≚", "≛", "≜", "≝", "≞", "≟", "≢", "≣", "≨", "≩", "≬", "≭", "≮", "≯", "≰", "≱", "≴", "≵", "≸", "≹", "≾", "≿", "⊀", "⊁", "⊊", "⊋", "⊌", "⊍", "⊏", "⊐", "⊑", "⊒", "⊚", "⊛", "⊜", "⊝", "⊞", "⊟", "⊠", "⊡", "⊣", "⊦", "⊩", "⊪", "⊫", "⊬", "⊭", "⊮", "⊯", "⊰", "⊱", "⊲", "⊳", "⊴", "⊵", "⊶", "⊷", "⊸", "⊹", "⊺", "ℎ", "℘", "ℜ", "ℑ", "ℵ", "ℶ", "ℷ", "ℸ", "⌬", "⌭", "⌮", "⌯", "⎔", "¤", "₠", "₡", "₢", "₣", "₤", "₥", "₦", "₧", "₨", "₩", "₪", "₫", "₭", "₮", "₯", "₰", "₱", "₲", "₳", "₴", "₵", "₶", "₷", "₸", "₹", "₺", "₻", "₼", "₽", "₾", "↚", "↛", "↜", "↝", "↞", "↟", "↠", "↡", "↢", "↣", "↤", "↥", "↦", "↧", "↨", "↩", "↪", "↫", "↬", "↭", "↮", "↯", "↰", "↱", "↲", "↳", "↴", "↵", "↶", "↷", "↸", "↹", "⇄", "⇅", "⇆", "⇇", "⇈", "⇉", "⇊", "⇍", "⇎", "⇏", "⇑", "⇓", "⇕", "⇖", "⇗", "⇘", "⇙", "⇚", "⇛", "⇜", "⇝", "⇞", "⇟", "⇠", "⇡", "⇢", "⇣", "⇤", "⇥", "⇦", "⇧", "⇨", "⇩", "⇪", "⇫", "⇬", "⇭", "⇮", "⇯", "⇰", "⇱", "⇲", "⇳", "⇴", "⇵", "⇶", "⇷", "⇸", "⇹", "⇺", "⇻", "⇼", "⇽", "⇾", "⇿", "ↀ", "ↁ", "ↂ", "☀", "☁", "☂", "☃", "☄", "★", "☇", "☈", "☉", "☊", "☋", "☌", "☍", "☎", "☏", "☐", "☑", "☒", "☓", "☔", "☕", "☖", "☗", "☘", "☙", "☚", "☛", "☜", "☝", "☞", "☟", 
"☠", "☡", "☢", "☣", "☤", "☥", "☦", "☧", "☨", "☩", "☪", "☫", "☬", "☭", "☮", "☯", "☸", "☹", "☺", "☻", "☼", "☽", "☾", "☿", "♁", "♃", "♄", "♅", "♆", "♇", "♔", "♕", "♖", "♗", "♘", "♙", "♚", "♛", "♜", "♝", "♞", "♟", "♠", "♡", "♢", "♣", "♤", "♥", "♦", "♧", "♨", "♭", "♮", "♯", "♰", "♱", "♲", "♻", "♼", "♽", "♾", "⚀", "⚁", "⚂", "⚃", "⚄", "⚅", "⚆", "⚇", "⚈", "⚉", "⚊", "⚋", "⚌", "⚍", "⚎", "⚏", "⚐", "⚑", "⚒", "⚓", "⚔", "⚕", "⚖", "⚗", "⚘", "⚙", "⚚", "⚛", "⚜", "⚝", "⚞", "⚟", "⚠", "⚡", "⚢", "⚣", "⚤", "⚥", "⚦", "⚧", "⚨", "⚩", "⚭", "⚮", "⚯", "⚰", "⚱", "⚲", "⚳", "⚴", "⚵", "⚶", "⚷", "⚸", "⚹", "⚺", "⚻", "⚼", "⚿", "⛀", "⛁", "⛂", "⛃", "⛆", "⛇", "⛈", "⛉", "⛊", "⛋", "⛌", "⛍", "⛏", "⛐", "⛑", "⛒", "⛓", "⛕", "⛖", "⛗", "⛘", "⛙", "⛚", "⛛", "⛜", "⛝", "⛞", "⛠", "⛡", "⛢", "⛣", "⛤", "⛥", "⛦", "⛧", "⛨", "⛩", "⛪", "⛫", "⛬", "⛭", "⛮", "⛯", "⛶", "⛾", "⛿", "✆", "✇", "✈", "✉", "✌", "✍", "✎", "✏", "✐", "✑", "✒", "✓", "✔", "✕", "✙", "✚", "✛", "✜", "✝", "✞", "✟", "✠", "✡", "✢", "✣", "✤", "✥", "✦", "✧", "✩", "✪", "✫", "✬", "✭", "✮", "✯", "✰", "✱", "✲", "✳", "✴", "✵", "✹", "✺", "✻", "✼", "✽", "✾", "✿", "❀", "❁", "❂", "❃", "❄", "❅", "❆", "❇", "❈", "❉", "❊", "❋", "❍", "❏", "❐", "❑", "❒", "❖", "❘", "❙", "❚", "❛", "❜", "❝", "❞", "❡", "❢", "❣", "❤", "❥", "❦", "❧", "❨", "❩", "❪", "❫", "❬", "❭", "❮", "❯", "❰", "❱", "❲", "❳", "❴", "❵", "❶", "❷", "❸", "❹", "❺", "❻", "❼", "❽", "❾", "❿", "①", "②", "③", "④", "⑤", "⑥", "⑦", "⑧", "⑨", "⑩", "➔", "➕", "➖", "➗", "➘", "➙", "➚", "➛", "➜", "➝", "➞", "➟", "➠", "➡", "➢", "➣", "➤", "➥", "➦", "➧", "➨", "➩", "➪", "➫", "➬", "➭", "➮", "➯", "➰", "➱", "➲", "➳", "➴", "➵", "➶", "➷", "➸", "➹", "➺", "➻", "➼", "➽", "➾", "➿", "⌘", "⌥", "⌃", "⎋", "⌫", "⌦", "⏏", "⌤", "⌧", "⌨", "⎆", "⎇", "⎈", "⎉", "⎊", "⎌", "⎍", "⎎", "⎏", "⎐", "⎑", "⎒", "⎓", "⎕", "⎖", "⎗", "⎘", "⎙", "⎚", "⎛", "⎜", "⎝", "⎞", "⎟", "⎠", "⎡", "⎢", "⎣", "⎤", "⎥", "⎦", "⎧", "⎨", "⎩", "⎪", "⎫", "⎬", "⎭", "⎮", "⎯", "⎰", "⎱", "⎲", "⎳", "⎴", "⎵", "⎶", "⎷", "⎸", "⎹", "⎺", "⎻", "⎼", "⎽", "⎾", "⎿", "⏍", "⏎", "⏐", "⏑", "⏒", "⏓", "⏔", "⏕", "⏖", 
"⏗", "⏘", "⏙", "⏛", "⏜", "⏝", "⏞", "⏟", "⏠", "⏡", "⏢", "⏣", "⏤", "⏥", "⏦", "⏧", "⏨", "⏭", "⏮", "⏯", "⏱", "⏲", "▲", "▽", "◐", "⏽", "⏾", "⏿", "ɐ", "ɑ", "ɒ", "ɓ", "ɔ", "ɕ", "ɖ", "ɗ", "ɘ", "ə", "ɚ", "ɛ", "ɜ", "ɝ", "ɞ", "ɟ", "ɠ", "ɡ", "ɢ", "ɣ", "ɤ", "ɥ", "ɦ", "ɧ", "ɨ", "ɩ", "ɪ", "ɫ", "ɬ", "ɭ", "ɮ", "ɯ", "ɰ", "ɱ", "ɲ", "ɳ", "ɴ", "ɵ", "ɶ", "ɷ", "ɸ", "ɹ", "ɺ", "ɻ", "ɼ", "ɽ", "ɾ", "ɿ", "ʀ", "ʁ", "ʂ", "ʃ", "ʄ", "ʅ", "ʆ", "ʇ", "ʈ", "ʉ", "ʊ", "ʋ", "ʌ", "ʍ", "ʎ", "ʏ", "ʐ", "ʑ", "ʒ", "ʓ", "ʔ", "ʕ", "ʖ", "ʗ", "ʘ", "ʙ", "ʚ", "ʛ", "ʜ", "ʝ", "ʞ", "ʟ", "ʠ", "ʡ", "ʢ", "ʣ", "ʤ", "ʥ", "ʦ", "ʧ", "ʨ", "ʩ", "ʪ", "ʫ", "ʬ", "ʭ", "ʮ", "ʯ", "━", "Ǝ", "Ã", "●", "▶", "|", "𝑢", "〖", "〗", "︽", "–", "﹥", "𝜓", "•", "∋", "ƒ", "०", "✘", "Е", "◉", "〒", "𝒱", "𝜆", "⟹", "﹪", "◊", "╆", "오", "˂", "〉", "𝝎", "▪", "△", "▁", "◼", "〇", "▷", "▬", "𝒮", "†", "ₒ", "⼁", "〵", "⭐", "╳", "⟶", "으", "⬆", "Ạ", "◀", "", "▫", "丄", "︾", "◥", "‖", "𝜌", "ⅼ", "▼", "⁎", "﹏", "😁", "😂", "😃", "😄", "😅", "😆", "😉", "😊", "😋", "😌", "😍", "😏", "😒", "😓", "😔", "😖", "😘", "😚", "😜", "😝", "😞", "😠", "😡", "😢", "😣", "😤", "😥", "😨", "😩", "😪", "😫", "😭", "😰", "😱", "😲", "😳", "😵", "😷", "😸", "😹", "😺", "😻", "😼", "😽", "😾", "😿", "🙀", "🙅", "🙆", "🙇", "🙈", "🙉", "🙊", "🙋", "🙌", "🙍", "🙎", "🙏", "✂", "✅", "✊", "✋", "✖", "✨", "❌", "❎", "❓", "❔", "❕", "❗", "🚀", "🚃", "🚄", "🚅", "🚇", "🚉", "🚌", "🚏", "🚑", "🚒", "🚓", "🚕", "🚗", "🚙", "🚚", "🚢", "🚤", "🚥", "🚧", "🚨", "🚩", "🚪", "🚫", "🚬", "🚭", "🚲", "🚶", "🚹", "🚺", "🚻", "🚼", "🚽", "🚾", "🛀", "Ⓜ", "🅰", "🅱", "🅾", "🅿", "🆎", "🆑", "🆒", "🆓", "🆔", "🆕", "🆖", "🆗", "🆘", "🆙", "🆚", "🇩🇪", "🇬🇧", "🇨🇳", "🇯🇵", "🇫🇷", "🇰🇷", "🇪🇸", "🇮🇹", "🇷🇺", "🇺🇸", "🈁", "ℹ", "⌚", "⌛", "⏩", "⏪", "⏫", "⏬", "⏰", "⏳", "◻", "◽", "◾", "♈", "♉", "♊", "♋", "♌", "♍", "♎", "♏", "♐", "♑", "♒", "♓", "♿", "⚽", "⚾", "⛄", "⛅", "⛎", "⛔", "⛲", "⛳", "⛵", "⛺", "⛽", "⤴", "⤵", "⬅", "⬇", "⬛", "⬜", "⭕", "〰", "〽", "㊗", "㊙", "🀄", "🃏", "🌀", "🌁", "🌂", "🌃", "🌄", "🌅", "🌆", "🌇", "🌈", "🌉", "🌊", "🌋", "🌌", "🌏", "🌑", "🌓", "🌔", "🌕", "🌙", "🌛", "🌟", "🌠", "🌰", "🌱", "🌴", "🌵", "🌷", "🌸", "🌹", "🌺", "🌻", "🌼", "🌽", 
"🌾", "🌿", "🍀", "🍁", "🍂", "🍃", "🍄", "🍅", "🍆", "🍇", "🍈", "🍉", "🍊", "🍌", "🍍", "🍎", "🍏", "🍑", "🍒", "🍓", "🍔", "🍕", "🍖", "🍗", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍞", "🍟", "🍠", "🍡", "🍢", "🍣", "🍤", "🍥", "🍦", "🍧", "🍨", "🍩", "🍪", "🍫", "🍬", "🍭", "🍮", "🍯", "🍰", "🍱", "🍲", "🍳", "🍴", "🍵", "🍶", "🍷", "🍸", "🍹", "🍺", "🍻", "🎀", "🎁", "🎂", "🎃", "🎄", "🎅", "🎆", "🎇", "🎈", "🎉", "🎊", "🎋", "🎌", "🎍", "🎎", "🎏", "🎐", "🎑", "🎒", "🎓", "🎠", "🎡", "🎢", "🎣", "🎤", "🎥", "🎦", "🎧", "🎨", "🎩", "🎪", "🎫", "🎬", "🎭", "🎮", "🎯", "🎰", "🎱", "🎲", "🎳", "🎴", "🎵", "🎶", "🎷", "🎸", "🎹", "🎺", "🎻", "🎼", "🎽", "🎾", "🎿", "🏀", "🏁", "🏂", "🏃", "🏄", "🏆", "🏈", "🏊", "🏠", "🏡", "🏢", "🏣", "🏥", "🏦", "🏧", "🏨", "🏩", "🏪", "🏫", "🏬", "🏭", "🏮", "🏯", "🏰", "🐌", "🐍", "🐎", "🐑", "🐒", "🐔", "🐗", "🐘", "🐙", "🐚", "🐛", "🐜", "🐝", "🐞", "🐟", "🐠", "🐡", "🐢", "🐣", "🐤", "🐥", "🐦", "🐧", "🐨", "🐩", "🐫", "🐬", "🐭", "🐮", "🐯", "🐰", "🐱", "🐲", "🐳", "🐴", "🐵", "🐶", "🐷", "🐸", "🐹", "🐺", "🐻", "🐼", "🐽", "🐾", "👀", "👂", "👃", "👄", "👅", "👆", "👇", "👈", "👉", "👊", "👋", "👌", "👍", "👎", "👏", "👐", "👑", "👒", "👓", "👔", "👕", "👖", "👗", "👘", "👙", "👚", "👛", "👜", "👝", "👞", "👟", "👠", "👡", "👢", "👣", "👤", "👦", "👧", "👨", "👩", "👪", "👫", "👮", "👯", "👰", "👱", "👲", "👳", "👴", "👵", "👶", "👷", "👸", "👹", "👺", "👻", "👼", "👽", "👾", "👿", "💀", "💁", "💂", "💃", "💄", "💅", "💆", "💇", "💈", "💉", "💊", "💋", "💌", "💍", "💎", "💏", "💐", "💑", "💒", "💓", "💔", "💕", "💖", "💗", "💘", "💙", "💚", "💛", "💜", "💝", "💞", "💟", "💠", "💡", "💢", "💣", "💤", "💥", "💦", "💧", "💨", "💩", "💪", "💫", "💬", "💮", "💯", "💰", "💲", "💳", "💴", "💵", "💸", "💹", "💺", "💻", "💼", "💽", "💾", "💿", "📀", "📁", "📂", "📃", "📄", "📅", "📆", "📇", "📈", "📉", "📊", "📋", "📌", "📍", "📎", "📏", "📐", "📑", "📒", "📓", "📔", "📕", "📖", "📗", "📘", "📙", "📚", "📛", "📜", "📝", "📞", "📟", "📠", "📡", "📢", "📣", "📤", "📥", "📦", "📧", "📨", "📩", "📪", "📫", "📮", "📰", "📱", "📲", "📳", "📴", "📶", "📷", "📹", "📺", "📻", "📼", "🔃", "🔊", "🔋", "🔌", "🔍", "🔎", "🔏", "🔐", "🔑", "🔒", "🔓", "🔔", "🔖", "🔗", "🔘", "🔙", "🔚", "🔛", "🔜", "🔝", "🔞", "🔟", "🔠", "🔡", "🔢", "🔣", "🔤", "🔥", "🔦", "🔧", "🔨", "🔩", "🔪", "🔫", "🔮", "🔯", "🔰", "🔱", "🔲", "🔳", "🔴", "🔵", "🔶", 
"🔷", "🔸", "🔹", "🔺", "🔻", "🔼", "🔽", "🕐", "🕑", "🕒", "🕓", "🕔", "🕕", "🕖", "🕗", "🕘", "🕙", "🕚", "🕛", "🗻", "🗼", "🗽", "🗾", "🗿", "😀", "😇", "😈", "😎", "😐", "😑", "😕", "😗", "😙", "😛", "😟", "😦", "😧", "😬", "😮", "😯", "😴", "😶", "🚁", "🚂", "🚆", "🚈", "🚊", "🚍", "🚎", "🚐", "🚔", "🚖", "🚘", "🚛", "🚜", "🚝", "🚞", "🚟", "🚠", "🚡", "🚣", "🚦", "🚮", "🚯", "🚰", "🚱", "🚳", "🚴", "🚵", "🚷", "🚸", "🚿", "🛁", "🛂", "🛃", "🛄", "🛅", "🌍", "🌎", "🌐", "🌒", "🌖", "🌗", "🌘", "🌚", "🌜", "🌝", "🌞", "🌲", "🌳", "🍋", "🍐", "🍼", "🏇", "🏉", "🏤", "🐀", "🐁", "🐂", "🐃", "🐄", "🐅", "🐆", "🐇", "🐈", "🐉", "🐊", "🐋", "🐏", "🐐", "🐓", "🐕", "🐖", "🐪", "👥", "👬", "👭", "💭", "💶", "💷", "📬", "📭", "📯", "📵", "🔀", "🔁", "🔂", "🔄", "🔅", "🔆", "🔇", "🔉", "🔕", "🔬", "🔭", "🕜", "🕝", "🕞", "🕟", "🕠", "🕡", "🕢", "🕣", "🕤", "🕥", "🕦", "🕧" }; static const int character_dict_size = sizeof(character_dict) / sizeof(const char*); ================================================ FILE: examples/retinaface.cpp ================================================ // Copyright 2019 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include struct FaceObject { cv::Rect_ rect; cv::Point2f landmark[5]; float prob; }; static inline float intersection_area(const FaceObject& a, const FaceObject& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; 
qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const FaceObject& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const FaceObject& b = faceobjects[picked[j]]; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } // copy from src/layer/proposal.cpp static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales) { int num_ratio = ratios.w; int num_scale = scales.w; ncnn::Mat anchors; anchors.create(4, num_ratio * num_scale); const float cx = base_size * 0.5f; const float cy = base_size * 0.5f; for (int i = 0; i < num_ratio; i++) { float ar = ratios[i]; int r_w = round(base_size / sqrt(ar)); int r_h = round(r_w * ar); //round(base_size * sqrt(ar)); for (int j = 0; j < num_scale; j++) { float scale = scales[j]; float rs_w = r_w * scale; float rs_h = r_h * scale; float* anchor = anchors.row(i * num_scale + j); anchor[0] = cx - rs_w * 0.5f; anchor[1] = cy - rs_h * 0.5f; anchor[2] = cx + rs_w * 0.5f; anchor[3] = cy + rs_h * 0.5f; } } return anchors; } static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, const ncnn::Mat& landmark_blob, float prob_threshold, std::vector& faceobjects) { int w = score_blob.w; int h = score_blob.h; // generate face proposal from bbox deltas and shifted anchors const int num_anchors = anchors.h; for (int q = 0; q < num_anchors; q++) { const float* anchor = anchors.row(q); const ncnn::Mat score = 
score_blob.channel(q + num_anchors); const ncnn::Mat bbox = bbox_blob.channel_range(q * 4, 4); const ncnn::Mat landmark = landmark_blob.channel_range(q * 10, 10); // shifted anchor float anchor_y = anchor[1]; float anchor_w = anchor[2] - anchor[0]; float anchor_h = anchor[3] - anchor[1]; for (int i = 0; i < h; i++) { float anchor_x = anchor[0]; for (int j = 0; j < w; j++) { int index = i * w + j; float prob = score[index]; if (prob >= prob_threshold) { // apply center size float dx = bbox.channel(0)[index]; float dy = bbox.channel(1)[index]; float dw = bbox.channel(2)[index]; float dh = bbox.channel(3)[index]; float cx = anchor_x + anchor_w * 0.5f; float cy = anchor_y + anchor_h * 0.5f; float pb_cx = cx + anchor_w * dx; float pb_cy = cy + anchor_h * dy; float pb_w = anchor_w * exp(dw); float pb_h = anchor_h * exp(dh); float x0 = pb_cx - pb_w * 0.5f; float y0 = pb_cy - pb_h * 0.5f; float x1 = pb_cx + pb_w * 0.5f; float y1 = pb_cy + pb_h * 0.5f; FaceObject obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0 + 1; obj.rect.height = y1 - y0 + 1; obj.landmark[0].x = cx + (anchor_w + 1) * landmark.channel(0)[index]; obj.landmark[0].y = cy + (anchor_h + 1) * landmark.channel(1)[index]; obj.landmark[1].x = cx + (anchor_w + 1) * landmark.channel(2)[index]; obj.landmark[1].y = cy + (anchor_h + 1) * landmark.channel(3)[index]; obj.landmark[2].x = cx + (anchor_w + 1) * landmark.channel(4)[index]; obj.landmark[2].y = cy + (anchor_h + 1) * landmark.channel(5)[index]; obj.landmark[3].x = cx + (anchor_w + 1) * landmark.channel(6)[index]; obj.landmark[3].y = cy + (anchor_h + 1) * landmark.channel(7)[index]; obj.landmark[4].x = cx + (anchor_w + 1) * landmark.channel(8)[index]; obj.landmark[4].y = cy + (anchor_h + 1) * landmark.channel(9)[index]; obj.prob = prob; faceobjects.push_back(obj); } anchor_x += feat_stride; } anchor_y += feat_stride; } } } static int detect_retinaface(const cv::Mat& bgr, std::vector& faceobjects) { ncnn::Net retinaface; 
retinaface.opt.use_vulkan_compute = true; // model is converted from // https://github.com/deepinsight/insightface/tree/master/RetinaFace#retinaface-pretrained-models // https://github.com/deepinsight/insightface/issues/669 // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models // retinaface.load_param("retinaface-R50.param"); // retinaface.load_model("retinaface-R50.bin"); if (retinaface.load_param("mnet.25-opt.param")) exit(-1); if (retinaface.load_model("mnet.25-opt.bin")) exit(-1); const float prob_threshold = 0.8f; const float nms_threshold = 0.4f; int img_w = bgr.cols; int img_h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h); ncnn::Extractor ex = retinaface.create_extractor(); ex.input("data", in); std::vector faceproposals; // stride 32 { ncnn::Mat score_blob, bbox_blob, landmark_blob; ex.extract("face_rpn_cls_prob_reshape_stride32", score_blob); ex.extract("face_rpn_bbox_pred_stride32", bbox_blob); ex.extract("face_rpn_landmark_pred_stride32", landmark_blob); const int base_size = 16; const int feat_stride = 32; ncnn::Mat ratios(1); ratios[0] = 1.f; ncnn::Mat scales(2); scales[0] = 32.f; scales[1] = 16.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects32; generate_proposals(anchors, feat_stride, score_blob, bbox_blob, landmark_blob, prob_threshold, faceobjects32); faceproposals.insert(faceproposals.end(), faceobjects32.begin(), faceobjects32.end()); } // stride 16 { ncnn::Mat score_blob, bbox_blob, landmark_blob; ex.extract("face_rpn_cls_prob_reshape_stride16", score_blob); ex.extract("face_rpn_bbox_pred_stride16", bbox_blob); ex.extract("face_rpn_landmark_pred_stride16", landmark_blob); const int base_size = 16; const int feat_stride = 16; ncnn::Mat ratios(1); ratios[0] = 1.f; ncnn::Mat scales(2); scales[0] = 8.f; scales[1] = 4.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects16; 
generate_proposals(anchors, feat_stride, score_blob, bbox_blob, landmark_blob, prob_threshold, faceobjects16); faceproposals.insert(faceproposals.end(), faceobjects16.begin(), faceobjects16.end()); } // stride 8 { ncnn::Mat score_blob, bbox_blob, landmark_blob; ex.extract("face_rpn_cls_prob_reshape_stride8", score_blob); ex.extract("face_rpn_bbox_pred_stride8", bbox_blob); ex.extract("face_rpn_landmark_pred_stride8", landmark_blob); const int base_size = 16; const int feat_stride = 8; ncnn::Mat ratios(1); ratios[0] = 1.f; ncnn::Mat scales(2); scales[0] = 2.f; scales[1] = 1.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects8; generate_proposals(anchors, feat_stride, score_blob, bbox_blob, landmark_blob, prob_threshold, faceobjects8); faceproposals.insert(faceproposals.end(), faceobjects8.begin(), faceobjects8.end()); } // sort all proposals by score from highest to lowest qsort_descent_inplace(faceproposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(faceproposals, picked, nms_threshold); int face_count = picked.size(); faceobjects.resize(face_count); for (int i = 0; i < face_count; i++) { faceobjects[i] = faceproposals[picked[i]]; // clip to image size float x0 = faceobjects[i].rect.x; float y0 = faceobjects[i].rect.y; float x1 = x0 + faceobjects[i].rect.width; float y1 = y0 + faceobjects[i].rect.height; x0 = std::max(std::min(x0, (float)img_w - 1), 0.f); y0 = std::max(std::min(y0, (float)img_h - 1), 0.f); x1 = std::max(std::min(x1, (float)img_w - 1), 0.f); y1 = std::max(std::min(y1, (float)img_h - 1), 0.f); faceobjects[i].rect.x = x0; faceobjects[i].rect.y = y0; faceobjects[i].rect.width = x1 - x0; faceobjects[i].rect.height = y1 - y0; } return 0; } static void draw_faceobjects(const cv::Mat& bgr, const std::vector& faceobjects) { cv::Mat image = bgr.clone(); for (size_t i = 0; i < faceobjects.size(); i++) { const FaceObject& obj = faceobjects[i]; fprintf(stderr, "%.5f at %.2f %.2f %.2f x 
%.2f\n", obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(0, 255, 0)); cv::circle(image, obj.landmark[0], 2, cv::Scalar(0, 255, 255), -1); cv::circle(image, obj.landmark[1], 2, cv::Scalar(0, 255, 255), -1); cv::circle(image, obj.landmark[2], 2, cv::Scalar(0, 255, 255), -1); cv::circle(image, obj.landmark[3], 2, cv::Scalar(0, 255, 255), -1); cv::circle(image, obj.landmark[4], 2, cv::Scalar(0, 255, 255), -1); char text[256]; sprintf(text, "%.1f%%", obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector faceobjects; detect_retinaface(m, faceobjects); draw_faceobjects(m, faceobjects); return 0; } ================================================ FILE: examples/rfcn.cpp ================================================ // Copyright 2018 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #include #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include struct Object { cv::Rect_ rect; int label; float prob; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void 
qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static int detect_rfcn(const cv::Mat& bgr, std::vector& objects) { ncnn::Net rfcn; rfcn.opt.use_vulkan_compute = true; // original pretrained model from https://github.com/YuwenXiong/py-R-FCN // https://github.com/YuwenXiong/py-R-FCN/blob/master/models/pascal_voc/ResNet-50/rfcn_end2end/test_agnostic.prototxt // https://1drv.ms/u/s!AoN7vygOjLIQqUWHpY67oaC7mopf // resnet50_rfcn_final.caffemodel if (rfcn.load_param("rfcn_end2end.param")) exit(-1); if (rfcn.load_model("rfcn_end2end.bin")) exit(-1); const int target_size = 224; const int max_per_image = 100; const float confidence_thresh = 0.6f; // CONF_THRESH 
const float nms_threshold = 0.3f; // NMS_THRESH // scale to target detect size int w = bgr.cols; int h = bgr.rows; float scale = 1.f; if (w < h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, w, h); const float mean_vals[3] = {102.9801f, 115.9465f, 122.7717f}; in.substract_mean_normalize(mean_vals, 0); ncnn::Mat im_info(3); im_info[0] = h; im_info[1] = w; im_info[2] = scale; // step1, extract feature and all rois ncnn::Extractor ex1 = rfcn.create_extractor(); ex1.input("data", in); ex1.input("im_info", im_info); ncnn::Mat rfcn_cls; ncnn::Mat rfcn_bbox; ncnn::Mat rois; // all rois ex1.extract("rfcn_cls", rfcn_cls); ex1.extract("rfcn_bbox", rfcn_bbox); ex1.extract("rois", rois); // step2, extract bbox and score for each roi std::vector > class_candidates; for (int i = 0; i < rois.c; i++) { ncnn::Extractor ex2 = rfcn.create_extractor(); ncnn::Mat roi = rois.channel(i); // get single roi ex2.input("rfcn_cls", rfcn_cls); ex2.input("rfcn_bbox", rfcn_bbox); ex2.input("rois", roi); ncnn::Mat bbox_pred; ncnn::Mat cls_prob; ex2.extract("bbox_pred", bbox_pred); ex2.extract("cls_prob", cls_prob); int num_class = cls_prob.w; class_candidates.resize(num_class); // find class id with highest score int label = 0; float score = 0.f; for (int i = 0; i < num_class; i++) { float class_score = cls_prob[i]; if (class_score > score) { label = i; score = class_score; } } // ignore background or low score if (label == 0 || score <= confidence_thresh) continue; // fprintf(stderr, "%d = %f\n", label, score); // unscale to image size float x1 = roi[0] / scale; float y1 = roi[1] / scale; float x2 = roi[2] / scale; float y2 = roi[3] / scale; float pb_w = x2 - x1 + 1; float pb_h = y2 - y1 + 1; // apply bbox regression float dx = bbox_pred[4]; float dy = bbox_pred[4 + 1]; float dw = bbox_pred[4 + 2]; float dh 
= bbox_pred[4 + 3]; float cx = x1 + pb_w * 0.5f; float cy = y1 + pb_h * 0.5f; float obj_cx = cx + pb_w * dx; float obj_cy = cy + pb_h * dy; float obj_w = pb_w * exp(dw); float obj_h = pb_h * exp(dh); float obj_x1 = obj_cx - obj_w * 0.5f; float obj_y1 = obj_cy - obj_h * 0.5f; float obj_x2 = obj_cx + obj_w * 0.5f; float obj_y2 = obj_cy + obj_h * 0.5f; // clip obj_x1 = std::max(std::min(obj_x1, (float)(bgr.cols - 1)), 0.f); obj_y1 = std::max(std::min(obj_y1, (float)(bgr.rows - 1)), 0.f); obj_x2 = std::max(std::min(obj_x2, (float)(bgr.cols - 1)), 0.f); obj_y2 = std::max(std::min(obj_y2, (float)(bgr.rows - 1)), 0.f); // append object Object obj; obj.rect = cv::Rect_(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1); obj.label = label; obj.prob = score; class_candidates[label].push_back(obj); } // post process objects.clear(); for (int i = 0; i < (int)class_candidates.size(); i++) { std::vector& candidates = class_candidates[i]; qsort_descent_inplace(candidates); std::vector picked; nms_sorted_bboxes(candidates, picked, nms_threshold); for (int j = 0; j < (int)picked.size(); j++) { int z = picked[j]; objects.push_back(candidates[z]); } } qsort_descent_inplace(objects); if (max_per_image > 0 && max_per_image < objects.size()) { objects.resize(max_per_image); } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); 
int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_rfcn(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/rvm.cpp ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // ncnn model exported from https://github.com/PeterL1n/RobustVideoMatting // // import torch // from torch import nn // from model import MattingNetwork // from model.fast_guided_filter import FastGuidedFilterRefiner // from model.deep_guided_filter import DeepGuidedFilterRefiner // // class Model(nn.Module): // def __init__(self): // super().__init__() // // self.rvm = MattingNetwork('mobilenetv3').eval() // self.rvm.load_state_dict(torch.load('rvm_mobilenetv3.pth')) // // self.refiner_deep = DeepGuidedFilterRefiner() // self.refiner_fast = FastGuidedFilterRefiner() // // def forward_first_frame(self, src): // return self.rvm(src) // // def forward(self, src, src_sm, r1, r2, r3, r4): // // f1, f2, f3, f4 = self.rvm.backbone(src_sm) // f4 = self.rvm.aspp(f4) // hid, *rec = self.rvm.decoder(src_sm, f1, f2, f3, f4, r1, r2, r3, r4) // // # downsample // fgr_residual, pha = 
self.rvm.project_mat(hid).split([3, 1], dim=-3) // fgr = fgr_residual + src_sm // // # downsample + refiner_deep // fgr_residual_deep, pha_deep = self.refiner_deep(src, src_sm, fgr_residual, pha, hid) // fgr_deep = fgr_residual_deep + src // // # downsample + refiner_fast // fgr_residual_fast, pha_fast = self.refiner_fast(src, src_sm, fgr_residual, pha, hid) // fgr_fast = fgr_residual_fast + src // // # downsample + segmentation // seg = self.rvm.project_seg(hid) // // return fgr, pha, fgr_deep, pha_deep, fgr_fast, pha_fast, seg, *rec // // import pnnx // // model = Model().eval() // // x = torch.rand(1, 3, 512, 512) // x2 = torch.rand(1, 3, 256, 256) // x2_hr = torch.rand(1, 3, 1024, 1024) // // # generate feats via forward_first_frame, with different shapes // fgr, pha, r1, r2, r3, r4 = model.forward_first_frame(x) // fgr2, pha2, r12, r22, r32, r42 = model.forward_first_frame(x2) // // # export with dynamic shape // pnnx.export(model, "rvm_mobilenetv3.pt", (x, x, r1, r2, r3, r4), (x2_hr, x2, r12, r22, r32, r42)) // // and then fix refiner_fast fp16 overflow issue in ncnn.param via appending 31=1 layer feat mask // // BinaryOp div_58 2 1 401 399 402 0=3 31=1 // #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif static int detect_rvm(const cv::Mat& bgr, cv::Mat& fgr, cv::Mat& pha, cv::Mat& seg) { ncnn::Net rvm; rvm.opt.use_vulkan_compute = true; // https://github.com/nihui/ncnn-android-rvm/tree/master/app/src/main/assets // you shall also change r1,r2,r3,r4 shape below when model changed if (rvm.load_param("rvm_mobilenetv3.ncnn.param")) exit(-1); if (rvm.load_model("rvm_mobilenetv3.ncnn.bin")) exit(-1); // if (rvm.load_param("rvm_resnet50.ncnn.param")) // exit(-1); // if (rvm.load_model("rvm_resnet50.ncnn.bin")) // exit(-1); const int w = bgr.cols; const int h = bgr.rows; const int target_size = 512; const int max_stride = 16; bool refine_deep = true; // bool refine_fast = true; const float norm_vals[3] = 
{1 / 255.f, 1 / 255.f, 1 / 255.f};

    // in_pad is the full-resolution input, in_small_pad the (optionally)
    // downsampled one; wpad/hpad are the total padding added to in_pad.
    ncnn::Mat in_pad;
    ncnn::Mat in_small_pad;
    int wpad = 0;
    int hpad = 0;

    // only images whose longer side exceeds target_size are downsampled
    bool downsample = std::max(w, h) > target_size;
    if (downsample)
    {
        // letterbox pad to multiple of max_stride
        int w2 = w;
        int h2 = h;
        float scale = 1.f;
        if (w > h)
        {
            scale = (float)target_size / w;
            w2 = target_size;
            h2 = h2 * scale;
        }
        else
        {
            scale = (float)target_size / h;
            h2 = target_size;
            w2 = w2 * scale;
        }

        ncnn::Mat in_small = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, w, h, w2, h2);

        // letterbox pad to target_size rectangle
        int w2pad = (w2 + max_stride - 1) / max_stride * max_stride - w2;
        int h2pad = (h2 + max_stride - 1) / max_stride * max_stride - h2;
        ncnn::copy_make_border(in_small, in_small_pad, h2pad / 2, h2pad - h2pad / 2, w2pad / 2, w2pad - w2pad / 2, ncnn::BORDER_CONSTANT, 114.f);

        in_small_pad.substract_mean_normalize(0, norm_vals);

        // size of the full-resolution input that corresponds to the padded
        // small input; only the shorter side receives padding.
        // NOTE(review): h3/w3 come from a truncating float division, so
        // hpad/wpad could presumably end up at -1 for some sizes - confirm.
        int w3 = w;
        int h3 = h;
        if (w > h)
        {
            w3 = w;
            h3 = in_small_pad.h / scale;
            wpad = 0;
            hpad = h3 - h;
        }
        else
        {
            h3 = h;
            w3 = in_small_pad.w / scale;
            wpad = w3 - w;
            hpad = 0;
        }

        ncnn::Mat in = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, w, h);

        ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

        in_pad.substract_mean_normalize(0, norm_vals);
    }
    else
    {
        ncnn::Mat in = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, w, h);

        // letterbox pad to target_size rectangle
        wpad = (w + max_stride - 1) / max_stride * max_stride - w;
        hpad = (h + max_stride - 1) / max_stride * max_stride - h;
        ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

        in_pad.substract_mean_normalize(0, norm_vals);

        // no downsampling: both network inputs share the same data
        in_small_pad = in_pad;
    }

    // recurrent feature inputs, sized relative to the small input
    // rvm_mobilenetv3
    ncnn::Mat r1(in_small_pad.w / 2, in_small_pad.h / 2, 16);
    ncnn::Mat r2(in_small_pad.w / 4, in_small_pad.h / 4, 20);
    ncnn::Mat r3(in_small_pad.w / 8, in_small_pad.h / 8, 40);
    ncnn::Mat r4(in_small_pad.w / 16, in_small_pad.h / 16, 64);
    // rvm_resnet50
    // ncnn::Mat r1(in_small_pad.w / 2, in_small_pad.h / 2, 16);
    // ncnn::Mat r2(in_small_pad.w / 4, in_small_pad.h / 4, 32);
    // ncnn::Mat r3(in_small_pad.w / 8, in_small_pad.h / 8, 64);
    // ncnn::Mat r4(in_small_pad.w / 16, in_small_pad.h / 16, 128);

    // zero-filled features are fed for this single frame
    r1.fill(0.f);
    r2.fill(0.f);
    r3.fill(0.f);
    r4.fill(0.f);

    ncnn::Extractor ex = rvm.create_extractor();

    ex.input("in0", in_pad);
    ex.input("in1", in_small_pad);
    ex.input("in2", r1);
    ex.input("in3", r2);
    ex.input("in4", r3);
    ex.input("in5", r4);

    // pick the foreground/alpha output pair matching the chosen path
    ncnn::Mat out_fgr;
    ncnn::Mat out_pha;
    if (downsample)
    {
        if (refine_deep)
        {
            // downsample + refine deep
            ex.extract("out2", out_fgr);
            ex.extract("out3", out_pha);
        }
        else // if (refine_fast)
        {
            // downsample + refine fast
            ex.extract("out4", out_fgr);
            ex.extract("out5", out_pha);
        }
    }
    else
    {
        // no downsample
        ex.extract("out0", out_fgr);
        ex.extract("out1", out_pha);
    }

    ncnn::Mat out_seg;

    // segmentation
    ex.extract("out6", out_seg);

    // feats
    ex.extract("out7", r1);
    ex.extract("out8", r2);
    ex.extract("out9", r3);
    ex.extract("out10", r4);

    // scale outputs by 255 before converting them to 8-bit pixels
    const float denorm_vals[3] = {255.f, 255.f, 255.f};

    out_fgr.substract_mean_normalize(0, denorm_vals);
    fgr.create(out_fgr.h, out_fgr.w, CV_8UC3);
    out_fgr.to_pixels(fgr.data, ncnn::Mat::PIXEL_RGB2BGR);

    out_pha.substract_mean_normalize(0, denorm_vals);
    pha.create(out_pha.h, out_pha.w, CV_8UC1);
    out_pha.to_pixels(pha.data, ncnn::Mat::PIXEL_GRAY);

    // the segmentation output is resized to the padded full-resolution size
    out_seg.substract_mean_normalize(0, denorm_vals);
    seg.create(in_pad.h, in_pad.w, CV_8UC1);
    out_seg.to_pixels_resize(seg.data, ncnn::Mat::PIXEL_GRAY, in_pad.w, in_pad.h);

    // cut letterbox pad
    fgr = fgr(cv::Rect(wpad / 2, hpad / 2, w, h));
    pha = pha(cv::Rect(wpad / 2, hpad / 2, w, h));
    seg = seg(cv::Rect(wpad / 2, hpad / 2, w, h));

    return 0;
}

// Show two composites over a fixed (155, 255, 120) background color: the
// predicted foreground blended by the alpha matte, and the original image
// blended by the segmentation mask.
static void draw_objects(const cv::Mat& bgr, const cv::Mat& fgr, const cv::Mat& pha, const cv::Mat& seg)
{
    const int w = bgr.cols;
    const int h = bgr.rows;

    // composite
    cv::Mat comp(h, w, CV_8UC3);
    for (int y = 0; y < h; y++)
    {
        const uchar* pf = fgr.ptr(y);
        const uchar* pa =
pha.ptr(y); uchar* p = comp.ptr(y); for (int x = 0; x < w; x++) { const float alpha = pa[0] / 255.f; p[0] = cv::saturate_cast(pf[0] * alpha + (1 - alpha) * 155); p[1] = cv::saturate_cast(pf[1] * alpha + (1 - alpha) * 255); p[2] = cv::saturate_cast(pf[2] * alpha + (1 - alpha) * 120); pf += 3; pa += 1; p += 3; } } // composite seg cv::Mat comp_seg(h, w, CV_8UC3); for (int y = 0; y < h; y++) { const uchar* pb = bgr.ptr(y); const uchar* ps = seg.ptr(y); uchar* p = comp_seg.ptr(y); for (int x = 0; x < w; x++) { const float alpha = ps[0] / 255.f; p[0] = cv::saturate_cast(pb[0] * alpha + (1 - alpha) * 155); p[1] = cv::saturate_cast(pb[1] * alpha + (1 - alpha) * 255); p[2] = cv::saturate_cast(pb[2] * alpha + (1 - alpha) * 120); pb += 3; ps += 1; p += 3; } } cv::imshow("comp", comp); cv::imshow("comp_seg", comp_seg); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } cv::Mat fgr; cv::Mat pha; cv::Mat seg; detect_rvm(m, fgr, pha, seg); draw_objects(m, fgr, pha, seg); return 0; } ================================================ FILE: examples/scrfd.cpp ================================================ // Copyright 2021 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include struct FaceObject { cv::Rect_ rect; float prob; }; static inline float intersection_area(const FaceObject& a, const FaceObject& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { 
// swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const FaceObject& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const FaceObject& b = faceobjects[picked[j]]; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } // insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py gen_single_level_base_anchors() static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales) { int num_ratio = ratios.w; int num_scale = scales.w; ncnn::Mat anchors; anchors.create(4, num_ratio * num_scale); const float cx = 0; const float cy = 0; for (int i = 0; i < num_ratio; i++) { float ar = ratios[i]; int r_w = round(base_size / sqrt(ar)); int r_h = round(r_w * ar); //round(base_size * sqrt(ar)); for (int j = 0; j < num_scale; j++) { float scale = scales[j]; float rs_w = r_w * scale; float rs_h = r_h * scale; float* anchor = anchors.row(i * num_scale + j); anchor[0] = cx - rs_w * 0.5f; anchor[1] = cy - rs_h * 0.5f; anchor[2] = cx + rs_w * 0.5f; anchor[3] = cy + rs_h * 0.5f; } } return anchors; } static void generate_proposals(const ncnn::Mat& anchors, int 
feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, float prob_threshold, std::vector& faceobjects) { int w = score_blob.w; int h = score_blob.h; // generate face proposal from bbox deltas and shifted anchors const int num_anchors = anchors.h; for (int q = 0; q < num_anchors; q++) { const float* anchor = anchors.row(q); const ncnn::Mat score = score_blob.channel(q); const ncnn::Mat bbox = bbox_blob.channel_range(q * 4, 4); // shifted anchor float anchor_y = anchor[1]; float anchor_w = anchor[2] - anchor[0]; float anchor_h = anchor[3] - anchor[1]; for (int i = 0; i < h; i++) { float anchor_x = anchor[0]; for (int j = 0; j < w; j++) { int index = i * w + j; float prob = score[index]; if (prob >= prob_threshold) { // insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py _get_bboxes_single() float dx = bbox.channel(0)[index] * feat_stride; float dy = bbox.channel(1)[index] * feat_stride; float dw = bbox.channel(2)[index] * feat_stride; float dh = bbox.channel(3)[index] * feat_stride; // insightface/detection/scrfd/mmdet/core/bbox/transforms.py distance2bbox() float cx = anchor_x + anchor_w * 0.5f; float cy = anchor_y + anchor_h * 0.5f; float x0 = cx - dx; float y0 = cy - dy; float x1 = cx + dw; float y1 = cy + dh; FaceObject obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0 + 1; obj.rect.height = y1 - y0 + 1; obj.prob = prob; faceobjects.push_back(obj); } anchor_x += feat_stride; } anchor_y += feat_stride; } } } static int detect_scrfd(const cv::Mat& bgr, std::vector& faceobjects) { ncnn::Net scrfd; scrfd.opt.use_vulkan_compute = true; // model is converted from // https://github.com/deepinsight/insightface/tree/master/detection/scrfd // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (scrfd.load_param("scrfd_500m-opt2.param")) exit(-1); if (scrfd.load_model("scrfd_500m-opt2.bin")) exit(-1); int width = bgr.cols; int height = bgr.rows; // insightface/detection/scrfd/configs/scrfd/scrfd_500m.py const 
int target_size = 640; const float prob_threshold = 0.3f; const float nms_threshold = 0.45f; // pad to multiple of 32 int w = width; int h = height; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, w, h); // pad to target_size rectangle int wpad = (w + 31) / 32 * 32 - w; int hpad = (h + 31) / 32 * 32 - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f); const float mean_vals[3] = {127.5f, 127.5f, 127.5f}; const float norm_vals[3] = {1 / 128.f, 1 / 128.f, 1 / 128.f}; in_pad.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = scrfd.create_extractor(); ex.input("input.1", in_pad); std::vector faceproposals; // stride 32 { ncnn::Mat score_blob, bbox_blob; ex.extract("412", score_blob); ex.extract("415", bbox_blob); const int base_size = 16; const int feat_stride = 8; ncnn::Mat ratios(1); ratios[0] = 1.f; ncnn::Mat scales(2); scales[0] = 1.f; scales[1] = 2.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects32; generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects32); faceproposals.insert(faceproposals.end(), faceobjects32.begin(), faceobjects32.end()); } // stride 16 { ncnn::Mat score_blob, bbox_blob; ex.extract("474", score_blob); ex.extract("477", bbox_blob); const int base_size = 64; const int feat_stride = 16; ncnn::Mat ratios(1); ratios[0] = 1.f; ncnn::Mat scales(2); scales[0] = 1.f; scales[1] = 2.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects16; generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects16); faceproposals.insert(faceproposals.end(), faceobjects16.begin(), faceobjects16.end()); } 
// stride 8 { ncnn::Mat score_blob, bbox_blob; ex.extract("536", score_blob); ex.extract("539", bbox_blob); const int base_size = 256; const int feat_stride = 32; ncnn::Mat ratios(1); ratios[0] = 1.f; ncnn::Mat scales(2); scales[0] = 1.f; scales[1] = 2.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects8; generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects8); faceproposals.insert(faceproposals.end(), faceobjects8.begin(), faceobjects8.end()); } // sort all proposals by score from highest to lowest qsort_descent_inplace(faceproposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(faceproposals, picked, nms_threshold); int face_count = picked.size(); faceobjects.resize(face_count); for (int i = 0; i < face_count; i++) { faceobjects[i] = faceproposals[picked[i]]; // adjust offset to original unpadded float x0 = (faceobjects[i].rect.x - (wpad / 2)) / scale; float y0 = (faceobjects[i].rect.y - (hpad / 2)) / scale; float x1 = (faceobjects[i].rect.x + faceobjects[i].rect.width - (wpad / 2)) / scale; float y1 = (faceobjects[i].rect.y + faceobjects[i].rect.height - (hpad / 2)) / scale; x0 = std::max(std::min(x0, (float)width - 1), 0.f); y0 = std::max(std::min(y0, (float)height - 1), 0.f); x1 = std::max(std::min(x1, (float)width - 1), 0.f); y1 = std::max(std::min(y1, (float)height - 1), 0.f); faceobjects[i].rect.x = x0; faceobjects[i].rect.y = y0; faceobjects[i].rect.width = x1 - x0; faceobjects[i].rect.height = y1 - y0; } return 0; } static void draw_faceobjects(const cv::Mat& bgr, const std::vector& faceobjects) { cv::Mat image = bgr.clone(); for (size_t i = 0; i < faceobjects.size(); i++) { const FaceObject& obj = faceobjects[i]; fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(0, 255, 0)); char text[256]; sprintf(text, "%.1f%%", obj.prob * 100); int baseLine 
= 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector faceobjects; detect_scrfd(m, faceobjects); draw_faceobjects(m, faceobjects); return 0; } ================================================ FILE: examples/scrfd_crowdhuman.cpp ================================================ // Copyright 2021 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include struct FaceObject { cv::Rect_ rect; float prob; }; static inline float intersection_area(const FaceObject& a, const FaceObject& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void 
qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const FaceObject& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const FaceObject& b = faceobjects[picked[j]]; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } // insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py gen_single_level_base_anchors() static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales) { int num_ratio = ratios.w; int num_scale = scales.w; ncnn::Mat anchors; anchors.create(4, num_ratio * num_scale); const float cx = 0; const float cy = 0; for (int i = 0; i < num_ratio; i++) { float ar = ratios[i]; int r_w = round(base_size / sqrt(ar)); int r_h = round(r_w * ar); //round(base_size * sqrt(ar)); for (int j = 0; j < num_scale; j++) { float scale = scales[j]; float rs_w = r_w * scale; float rs_h = r_h * scale; float* anchor = anchors.row(i * num_scale + j); anchor[0] = cx - rs_w * 0.5f; anchor[1] = cy - rs_h * 0.5f; anchor[2] = cx + rs_w * 0.5f; anchor[3] = cy + rs_h * 0.5f; } } return anchors; } static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, float prob_threshold, std::vector& faceobjects) { int w = score_blob.w; int h = score_blob.h; // generate face proposal from bbox deltas and shifted anchors const int num_anchors = anchors.h; for (int q = 0; q < 
num_anchors; q++) { const float* anchor = anchors.row(q); const ncnn::Mat score = score_blob.channel(q); const ncnn::Mat bbox = bbox_blob.channel_range(q * 4, 4); // shifted anchor float anchor_y = anchor[1]; float anchor_w = anchor[2] - anchor[0]; float anchor_h = anchor[3] - anchor[1]; for (int i = 0; i < h; i++) { float anchor_x = anchor[0]; for (int j = 0; j < w; j++) { int index = i * w + j; float prob = score[index]; if (prob >= prob_threshold) { // insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py _get_bboxes_single() float dx = bbox.channel(0)[index] * feat_stride; float dy = bbox.channel(1)[index] * feat_stride; float dw = bbox.channel(2)[index] * feat_stride; float dh = bbox.channel(3)[index] * feat_stride; // insightface/detection/scrfd/mmdet/core/bbox/transforms.py distance2bbox() float cx = anchor_x + anchor_w * 0.5f; float cy = anchor_y + anchor_h * 0.5f; float x0 = cx - dx; float y0 = cy - dy; float x1 = cx + dw; float y1 = cy + dh; FaceObject obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0 + 1; obj.rect.height = y1 - y0 + 1; obj.prob = prob; faceobjects.push_back(obj); } anchor_x += feat_stride; } anchor_y += feat_stride; } } } static int detect_scrfd(const cv::Mat& bgr, std::vector& faceobjects) { ncnn::Net scrfd; scrfd.opt.use_vulkan_compute = true; // Insight face does not provided a trained scrfd_crowdhuman model // but I have one for detecing cat face, you can have a try here: // https://drive.google.com/file/d/1JogkKa0f_09HkENbCnXy9hRYxm35wKTn if (scrfd.load_param("scrfd_crowdhuman.param")) exit(-1); if (scrfd.load_model("scrfd_crowdhuman.bin")) exit(-1); int width = bgr.cols; int height = bgr.rows; const int target_size = 640; const float prob_threshold = 0.3f; const float nms_threshold = 0.45f; // pad to multiple of 32 int w = width; int h = height; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * 
scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, w, h); // pad to target_size rectangle int wpad = (w + 31) / 32 * 32 - w; int hpad = (h + 31) / 32 * 32 - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f); const float mean_vals[3] = {127.5f, 127.5f, 127.5f}; const float norm_vals[3] = {1 / 128.f, 1 / 128.f, 1 / 128.f}; in_pad.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = scrfd.create_extractor(); ex.input("input.1", in_pad); std::vector faceproposals; // stride 8 { ncnn::Mat score_blob, bbox_blob; ex.extract("490", score_blob); ex.extract("493", bbox_blob); const int base_size = 8; const int feat_stride = 8; ncnn::Mat ratios(1); ratios[0] = 2.f; ncnn::Mat scales(1); scales[0] = 3.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects32; generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects32); faceproposals.insert(faceproposals.end(), faceobjects32.begin(), faceobjects32.end()); } // stride 16 { ncnn::Mat score_blob, bbox_blob; ex.extract("510", score_blob); ex.extract("513", bbox_blob); const int base_size = 16; const int feat_stride = 16; ncnn::Mat ratios(1); ratios[0] = 2.f; ncnn::Mat scales(1); scales[0] = 3.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects16; generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects16); faceproposals.insert(faceproposals.end(), faceobjects16.begin(), faceobjects16.end()); } // stride 32 { ncnn::Mat score_blob, bbox_blob; ex.extract("530", score_blob); ex.extract("533", bbox_blob); const int base_size = 32; const int feat_stride = 32; ncnn::Mat ratios(1); ratios[0] = 2.f; ncnn::Mat scales(1); scales[0] = 3.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects8; 
generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects8); faceproposals.insert(faceproposals.end(), faceobjects8.begin(), faceobjects8.end()); } // stride 64 { ncnn::Mat score_blob, bbox_blob, kps_blob; ex.extract("550", score_blob); ex.extract("553", bbox_blob); const int base_size = 64; const int feat_stride = 64; ncnn::Mat ratios(1); ratios[0] = 2.f; ncnn::Mat scales(1); scales[0] = 3.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects8; generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects8); faceproposals.insert(faceproposals.end(), faceobjects8.begin(), faceobjects8.end()); } // stride 128 { ncnn::Mat score_blob, bbox_blob, kps_blob; ex.extract("570", score_blob); ex.extract("573", bbox_blob); const int base_size = 128; const int feat_stride = 128; ncnn::Mat ratios(1); ratios[0] = 2.f; ncnn::Mat scales(1); scales[0] = 3.f; ncnn::Mat anchors = generate_anchors(base_size, ratios, scales); std::vector faceobjects8; generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects8); faceproposals.insert(faceproposals.end(), faceobjects8.begin(), faceobjects8.end()); } // sort all proposals by score from highest to lowest qsort_descent_inplace(faceproposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(faceproposals, picked, nms_threshold); int face_count = picked.size(); faceobjects.resize(face_count); for (int i = 0; i < face_count; i++) { faceobjects[i] = faceproposals[picked[i]]; // adjust offset to original unpadded float x0 = (faceobjects[i].rect.x - (wpad / 2)) / scale; float y0 = (faceobjects[i].rect.y - (hpad / 2)) / scale; float x1 = (faceobjects[i].rect.x + faceobjects[i].rect.width - (wpad / 2)) / scale; float y1 = (faceobjects[i].rect.y + faceobjects[i].rect.height - (hpad / 2)) / scale; x0 = std::max(std::min(x0, (float)width - 1), 0.f); y0 = std::max(std::min(y0, (float)height - 1), 
0.f); x1 = std::max(std::min(x1, (float)width - 1), 0.f); y1 = std::max(std::min(y1, (float)height - 1), 0.f); faceobjects[i].rect.x = x0; faceobjects[i].rect.y = y0; faceobjects[i].rect.width = x1 - x0; faceobjects[i].rect.height = y1 - y0; } return 0; } static void draw_faceobjects(const cv::Mat& bgr, const std::vector& faceobjects) { cv::Mat image = bgr.clone(); for (size_t i = 0; i < faceobjects.size(); i++) { const FaceObject& obj = faceobjects[i]; fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(0, 255, 0)); char text[256]; sprintf(text, "%.1f%%", obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector faceobjects; detect_scrfd(m, faceobjects); draw_faceobjects(m, faceobjects); return 0; } ================================================ FILE: examples/shufflenetv2.cpp ================================================ // Copyright 2018 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #include #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #endif #include #include static int detect_shufflenetv2(const 
cv::Mat& bgr, std::vector& cls_scores) { ncnn::Net shufflenetv2; shufflenetv2.opt.use_vulkan_compute = true; // https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe // models can be downloaded from https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe/releases if (shufflenetv2.load_param("shufflenet_v2_x0.5.param")) exit(-1); if (shufflenetv2.load_model("shufflenet_v2_x0.5.bin")) exit(-1); ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 224, 224); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = shufflenetv2.create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("fc", out); // manually call softmax on the fc output // convert result into probability // skip if your model already has softmax operation { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; softmax->load_param(pd); softmax->forward_inplace(out, shufflenetv2.opt); delete softmax; } out = out.reshape(out.w * out.h * out.c); cls_scores.resize(out.w); for (int j = 0; j < out.w; j++) { cls_scores[j] = out[j]; } return 0; } static int print_topk(const std::vector& cls_scores, int topk) { // partial sort topk with index int size = cls_scores.size(); std::vector > vec; vec.resize(size); for (int i = 0; i < size; i++) { vec[i] = std::make_pair(cls_scores[i], i); } std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater >()); // print topk and score for (int i = 0; i < topk; i++) { float score = vec[i].first; int index = vec[i].second; fprintf(stderr, "%d = %f\n", index, score); } return 0; } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector cls_scores; detect_shufflenetv2(m, cls_scores); 
print_topk(cls_scores, 3); return 0; } ================================================ FILE: examples/simplepose.cpp ================================================ // Copyright 2019 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #include #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include struct KeyPoint { cv::Point2f p; float prob; }; static int detect_posenet(const cv::Mat& bgr, std::vector& keypoints) { ncnn::Net posenet; posenet.opt.use_vulkan_compute = true; // the simple baseline human pose estimation from gluon-cv // https://gluon-cv.mxnet.io/build/examples_pose/demo_simple_pose.html // mxnet model exported via // pose_net.hybridize() // pose_net.export('pose') // then mxnet2ncnn // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (posenet.load_param("pose.param")) exit(-1); if (posenet.load_model("pose.bin")) exit(-1); int w = bgr.cols; int h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, w, h, 192, 256); // transforms.ToTensor(), // transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), // R' = (R / 255 - 0.485) / 0.229 = (R - 0.485 * 255) / 0.229 / 255 // G' = (G / 255 - 0.456) / 0.224 = (G - 0.456 * 255) / 0.224 / 255 // B' = (B / 255 - 0.406) / 0.225 = (B - 0.406 * 255) / 0.225 / 255 const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f}; const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f}; in.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = posenet.create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("conv3_fwd", out); // resolve point from heatmap keypoints.clear(); for (int p = 0; p < out.c; p++) { const ncnn::Mat m = out.channel(p); float max_prob = 0.f; int max_x = 0; int max_y = 0; for (int y = 0; y < out.h; y++) { const float* ptr = m.row(y); for (int x = 0; x < out.w; x++) { float prob = 
ptr[x]; if (prob > max_prob) { max_prob = prob; max_x = x; max_y = y; } } } KeyPoint keypoint; keypoint.p = cv::Point2f(max_x * w / (float)out.w, max_y * h / (float)out.h); keypoint.prob = max_prob; keypoints.push_back(keypoint); } return 0; } static void draw_pose(const cv::Mat& bgr, const std::vector& keypoints) { cv::Mat image = bgr.clone(); // draw bone static const int joint_pairs[16][2] = { {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16} }; for (int i = 0; i < 16; i++) { const KeyPoint& p1 = keypoints[joint_pairs[i][0]]; const KeyPoint& p2 = keypoints[joint_pairs[i][1]]; if (p1.prob < 0.2f || p2.prob < 0.2f) continue; cv::line(image, p1.p, p2.p, cv::Scalar(255, 0, 0), 2); } // draw joint for (size_t i = 0; i < keypoints.size(); i++) { const KeyPoint& keypoint = keypoints[i]; fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob); if (keypoint.prob < 0.2f) continue; cv::circle(image, keypoint.p, 3, cv::Scalar(0, 255, 0), -1); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector keypoints; detect_posenet(m, keypoints); draw_pose(m, keypoints); return 0; } ================================================ FILE: examples/squeezencnn/README.md ================================================ The squeezenet android example project has been moved to https://github.com/nihui/ncnn-android-squeezenet ================================================ FILE: examples/squeezenet.cpp ================================================ // Copyright 2017 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #include #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else 
#include #include #endif #include #include static int detect_squeezenet(const cv::Mat& bgr, std::vector& cls_scores) { ncnn::Net squeezenet; squeezenet.opt.use_vulkan_compute = true; // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (squeezenet.load_param("squeezenet_v1.1.param")) exit(-1); if (squeezenet.load_model("squeezenet_v1.1.bin")) exit(-1); ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 227, 227); const float mean_vals[3] = {104.f, 117.f, 123.f}; in.substract_mean_normalize(mean_vals, 0); ncnn::Extractor ex = squeezenet.create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("prob", out); cls_scores.resize(out.w); for (int j = 0; j < out.w; j++) { cls_scores[j] = out[j]; } return 0; } static int print_topk(const std::vector& cls_scores, int topk) { // partial sort topk with index int size = cls_scores.size(); std::vector > vec; vec.resize(size); for (int i = 0; i < size; i++) { vec[i] = std::make_pair(cls_scores[i], i); } std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater >()); // print topk and score for (int i = 0; i < topk; i++) { float score = vec[i].first; int index = vec[i].second; fprintf(stderr, "%d = %f\n", index, score); } return 0; } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector cls_scores; detect_squeezenet(m, cls_scores); print_topk(cls_scores, 3); return 0; } ================================================ FILE: examples/squeezenet_c_api.cpp ================================================ // Copyright 2020 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "c_api.h" #include #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #endif #include #include static int 
detect_squeezenet(const cv::Mat& bgr, std::vector& cls_scores) { ncnn_net_t squeezenet = ncnn_net_create(); ncnn_option_t opt = ncnn_option_create(); ncnn_option_set_use_vulkan_compute(opt, 1); ncnn_net_set_option(squeezenet, opt); // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (ncnn_net_load_param(squeezenet, "squeezenet_v1.1.param")) exit(-1); if (ncnn_net_load_model(squeezenet, "squeezenet_v1.1.bin")) exit(-1); ncnn_mat_t in = ncnn_mat_from_pixels_resize(bgr.data, NCNN_MAT_PIXEL_BGR, bgr.cols, bgr.rows, bgr.cols * 3, 227, 227, NULL); const float mean_vals[3] = {104.f, 117.f, 123.f}; ncnn_mat_substract_mean_normalize(in, mean_vals, 0); ncnn_extractor_t ex = ncnn_extractor_create(squeezenet); ncnn_extractor_input(ex, "data", in); ncnn_mat_t out; ncnn_extractor_extract(ex, "prob", &out); const int out_w = ncnn_mat_get_w(out); const float* out_data = (const float*)ncnn_mat_get_data(out); cls_scores.resize(out_w); for (int j = 0; j < out_w; j++) { cls_scores[j] = out_data[j]; } ncnn_mat_destroy(in); ncnn_mat_destroy(out); ncnn_extractor_destroy(ex); ncnn_option_destroy(opt); ncnn_net_destroy(squeezenet); return 0; } static int print_topk(const std::vector& cls_scores, int topk) { // partial sort topk with index int size = cls_scores.size(); std::vector > vec; vec.resize(size); for (int i = 0; i < size; i++) { vec[i] = std::make_pair(cls_scores[i], i); } std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater >()); // print topk and score for (int i = 0; i < topk; i++) { float score = vec[i].first; int index = vec[i].second; fprintf(stderr, "%d = %f\n", index, score); } return 0; } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector cls_scores; detect_squeezenet(m, cls_scores); 
print_topk(cls_scores, 3); return 0; } ================================================ FILE: examples/squeezenet_v1.1.param ================================================ 7767517 75 83 Input data 0 1 data 0=227 1=227 2=3 Convolution conv1 1 1 data conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 ReLU relu_conv1 1 1 conv1 conv1_relu_conv1 0=0.000000 Pooling pool1 1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0 Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=0.000000 Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 0=0.000000 Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 0=0.000000 Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0 Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 0=0.000000 Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 0=0.000000 Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 
0=64 1=3 2=1 3=1 4=1 5=1 6=9216 ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 0=0.000000 Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0 Pooling pool3 1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0 Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=0.000000 Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 0=0.000000 Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 0=0.000000 Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0 Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 0=0.000000 Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 0=0.000000 Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 
fire5/expand3x3_fire5/relu_expand3x3 0=0.000000 Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0 Pooling pool5 1 1 fire5/concat pool5 0=0 1=3 2=2 3=0 4=0 Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=0.000000 Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 0=0.000000 Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 0=0.000000 Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0 Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 0=0.000000 Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 0=0.000000 Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 0=0.000000 Concat fire7/concat 2 1 
fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0 Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 0=0.000000 Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 0=0.000000 Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 0=0.000000 Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0 Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 0=0.000000 Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 0=0.000000 Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 0=0.000000 Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0 Dropout drop9 1 1 fire9/concat fire9/concat_drop9 
Convolution conv10 1 1 fire9/concat_drop9 conv10 0=1000 1=1 2=1 3=1 4=1 5=1 6=512000 ReLU relu_conv10 1 1 conv10 conv10_relu_conv10 0=0.000000 Pooling pool10 1 1 conv10_relu_conv10 pool10 0=1 1=0 2=1 3=0 4=1 Softmax prob 1 1 pool10 prob 0=0 ================================================ FILE: examples/squeezenet_v1.1.prototxt ================================================ name: "squeezenet_v1.1_deploy" layer { name: "data" type: "Input" top: "data" input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" convolution_param { num_output: 64 kernel_size: 3 stride: 2 } } layer { name: "relu_conv1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "pool1" type: "Pooling" bottom: "conv1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "fire2/squeeze1x1" type: "Convolution" bottom: "pool1" top: "fire2/squeeze1x1" convolution_param { num_output: 16 kernel_size: 1 } } layer { name: "fire2/relu_squeeze1x1" type: "ReLU" bottom: "fire2/squeeze1x1" top: "fire2/squeeze1x1" } layer { name: "fire2/expand1x1" type: "Convolution" bottom: "fire2/squeeze1x1" top: "fire2/expand1x1" convolution_param { num_output: 64 kernel_size: 1 } } layer { name: "fire2/relu_expand1x1" type: "ReLU" bottom: "fire2/expand1x1" top: "fire2/expand1x1" } layer { name: "fire2/expand3x3" type: "Convolution" bottom: "fire2/squeeze1x1" top: "fire2/expand3x3" convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "fire2/relu_expand3x3" type: "ReLU" bottom: "fire2/expand3x3" top: "fire2/expand3x3" } layer { name: "fire2/concat" type: "Concat" bottom: "fire2/expand1x1" bottom: "fire2/expand3x3" top: "fire2/concat" } layer { name: "fire3/squeeze1x1" type: "Convolution" bottom: "fire2/concat" top: "fire3/squeeze1x1" convolution_param { num_output: 16 kernel_size: 1 } } layer { name: "fire3/relu_squeeze1x1" type: "ReLU" bottom: "fire3/squeeze1x1" top: "fire3/squeeze1x1" } layer 
{ name: "fire3/expand1x1" type: "Convolution" bottom: "fire3/squeeze1x1" top: "fire3/expand1x1" convolution_param { num_output: 64 kernel_size: 1 } } layer { name: "fire3/relu_expand1x1" type: "ReLU" bottom: "fire3/expand1x1" top: "fire3/expand1x1" } layer { name: "fire3/expand3x3" type: "Convolution" bottom: "fire3/squeeze1x1" top: "fire3/expand3x3" convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "fire3/relu_expand3x3" type: "ReLU" bottom: "fire3/expand3x3" top: "fire3/expand3x3" } layer { name: "fire3/concat" type: "Concat" bottom: "fire3/expand1x1" bottom: "fire3/expand3x3" top: "fire3/concat" } layer { name: "pool3" type: "Pooling" bottom: "fire3/concat" top: "pool3" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "fire4/squeeze1x1" type: "Convolution" bottom: "pool3" top: "fire4/squeeze1x1" convolution_param { num_output: 32 kernel_size: 1 } } layer { name: "fire4/relu_squeeze1x1" type: "ReLU" bottom: "fire4/squeeze1x1" top: "fire4/squeeze1x1" } layer { name: "fire4/expand1x1" type: "Convolution" bottom: "fire4/squeeze1x1" top: "fire4/expand1x1" convolution_param { num_output: 128 kernel_size: 1 } } layer { name: "fire4/relu_expand1x1" type: "ReLU" bottom: "fire4/expand1x1" top: "fire4/expand1x1" } layer { name: "fire4/expand3x3" type: "Convolution" bottom: "fire4/squeeze1x1" top: "fire4/expand3x3" convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "fire4/relu_expand3x3" type: "ReLU" bottom: "fire4/expand3x3" top: "fire4/expand3x3" } layer { name: "fire4/concat" type: "Concat" bottom: "fire4/expand1x1" bottom: "fire4/expand3x3" top: "fire4/concat" } layer { name: "fire5/squeeze1x1" type: "Convolution" bottom: "fire4/concat" top: "fire5/squeeze1x1" convolution_param { num_output: 32 kernel_size: 1 } } layer { name: "fire5/relu_squeeze1x1" type: "ReLU" bottom: "fire5/squeeze1x1" top: "fire5/squeeze1x1" } layer { name: "fire5/expand1x1" type: "Convolution" bottom: "fire5/squeeze1x1" top: 
"fire5/expand1x1" convolution_param { num_output: 128 kernel_size: 1 } } layer { name: "fire5/relu_expand1x1" type: "ReLU" bottom: "fire5/expand1x1" top: "fire5/expand1x1" } layer { name: "fire5/expand3x3" type: "Convolution" bottom: "fire5/squeeze1x1" top: "fire5/expand3x3" convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "fire5/relu_expand3x3" type: "ReLU" bottom: "fire5/expand3x3" top: "fire5/expand3x3" } layer { name: "fire5/concat" type: "Concat" bottom: "fire5/expand1x1" bottom: "fire5/expand3x3" top: "fire5/concat" } layer { name: "pool5" type: "Pooling" bottom: "fire5/concat" top: "pool5" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "fire6/squeeze1x1" type: "Convolution" bottom: "pool5" top: "fire6/squeeze1x1" convolution_param { num_output: 48 kernel_size: 1 } } layer { name: "fire6/relu_squeeze1x1" type: "ReLU" bottom: "fire6/squeeze1x1" top: "fire6/squeeze1x1" } layer { name: "fire6/expand1x1" type: "Convolution" bottom: "fire6/squeeze1x1" top: "fire6/expand1x1" convolution_param { num_output: 192 kernel_size: 1 } } layer { name: "fire6/relu_expand1x1" type: "ReLU" bottom: "fire6/expand1x1" top: "fire6/expand1x1" } layer { name: "fire6/expand3x3" type: "Convolution" bottom: "fire6/squeeze1x1" top: "fire6/expand3x3" convolution_param { num_output: 192 pad: 1 kernel_size: 3 } } layer { name: "fire6/relu_expand3x3" type: "ReLU" bottom: "fire6/expand3x3" top: "fire6/expand3x3" } layer { name: "fire6/concat" type: "Concat" bottom: "fire6/expand1x1" bottom: "fire6/expand3x3" top: "fire6/concat" } layer { name: "fire7/squeeze1x1" type: "Convolution" bottom: "fire6/concat" top: "fire7/squeeze1x1" convolution_param { num_output: 48 kernel_size: 1 } } layer { name: "fire7/relu_squeeze1x1" type: "ReLU" bottom: "fire7/squeeze1x1" top: "fire7/squeeze1x1" } layer { name: "fire7/expand1x1" type: "Convolution" bottom: "fire7/squeeze1x1" top: "fire7/expand1x1" convolution_param { num_output: 192 kernel_size: 1 } } layer 
{ name: "fire7/relu_expand1x1" type: "ReLU" bottom: "fire7/expand1x1" top: "fire7/expand1x1" } layer { name: "fire7/expand3x3" type: "Convolution" bottom: "fire7/squeeze1x1" top: "fire7/expand3x3" convolution_param { num_output: 192 pad: 1 kernel_size: 3 } } layer { name: "fire7/relu_expand3x3" type: "ReLU" bottom: "fire7/expand3x3" top: "fire7/expand3x3" } layer { name: "fire7/concat" type: "Concat" bottom: "fire7/expand1x1" bottom: "fire7/expand3x3" top: "fire7/concat" } layer { name: "fire8/squeeze1x1" type: "Convolution" bottom: "fire7/concat" top: "fire8/squeeze1x1" convolution_param { num_output: 64 kernel_size: 1 } } layer { name: "fire8/relu_squeeze1x1" type: "ReLU" bottom: "fire8/squeeze1x1" top: "fire8/squeeze1x1" } layer { name: "fire8/expand1x1" type: "Convolution" bottom: "fire8/squeeze1x1" top: "fire8/expand1x1" convolution_param { num_output: 256 kernel_size: 1 } } layer { name: "fire8/relu_expand1x1" type: "ReLU" bottom: "fire8/expand1x1" top: "fire8/expand1x1" } layer { name: "fire8/expand3x3" type: "Convolution" bottom: "fire8/squeeze1x1" top: "fire8/expand3x3" convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "fire8/relu_expand3x3" type: "ReLU" bottom: "fire8/expand3x3" top: "fire8/expand3x3" } layer { name: "fire8/concat" type: "Concat" bottom: "fire8/expand1x1" bottom: "fire8/expand3x3" top: "fire8/concat" } layer { name: "fire9/squeeze1x1" type: "Convolution" bottom: "fire8/concat" top: "fire9/squeeze1x1" convolution_param { num_output: 64 kernel_size: 1 } } layer { name: "fire9/relu_squeeze1x1" type: "ReLU" bottom: "fire9/squeeze1x1" top: "fire9/squeeze1x1" } layer { name: "fire9/expand1x1" type: "Convolution" bottom: "fire9/squeeze1x1" top: "fire9/expand1x1" convolution_param { num_output: 256 kernel_size: 1 } } layer { name: "fire9/relu_expand1x1" type: "ReLU" bottom: "fire9/expand1x1" top: "fire9/expand1x1" } layer { name: "fire9/expand3x3" type: "Convolution" bottom: "fire9/squeeze1x1" top: "fire9/expand3x3" 
convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "fire9/relu_expand3x3" type: "ReLU" bottom: "fire9/expand3x3" top: "fire9/expand3x3" } layer { name: "fire9/concat" type: "Concat" bottom: "fire9/expand1x1" bottom: "fire9/expand3x3" top: "fire9/concat" } layer { name: "drop9" type: "Dropout" bottom: "fire9/concat" top: "fire9/concat" dropout_param { dropout_ratio: 0.5 } } layer { name: "conv10" type: "Convolution" bottom: "fire9/concat" top: "conv10" convolution_param { num_output: 1000 pad: 1 kernel_size: 1 } } layer { name: "relu_conv10" type: "ReLU" bottom: "conv10" top: "conv10" } layer { name: "pool10" type: "Pooling" bottom: "conv10" top: "pool10" pooling_param { pool: AVE global_pooling: true } } layer { name: "prob" type: "Softmax" bottom: "pool10" top: "prob" } ================================================ FILE: examples/squeezenetssd.cpp ================================================ // Copyright 2017 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static int detect_squeezenet(const cv::Mat& bgr, std::vector& objects) { ncnn::Net squeezenet; squeezenet.opt.use_vulkan_compute = true; // original pretrained model from https://github.com/chuanqi305/SqueezeNet-SSD // squeezenet_ssd_voc_deploy.prototxt // https://drive.google.com/open?id=0B3gersZ2cHIxdGpyZlZnbEQ5Snc // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (squeezenet.load_param("squeezenet_ssd_voc.param")) exit(-1); if (squeezenet.load_model("squeezenet_ssd_voc.bin")) exit(-1); const int target_size = 300; int img_w = bgr.cols; int img_h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size); const float mean_vals[3] = {104.f, 117.f, 123.f}; in.substract_mean_normalize(mean_vals, 
0); ncnn::Extractor ex = squeezenet.create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("detection_out", out); // printf("%d %d %d\n", out.w, out.h, out.c); objects.clear(); for (int i = 0; i < out.h; i++) { const float* values = out.row(i); Object object; object.label = values[0]; object.prob = values[1]; object.rect.x = values[2] * img_w; object.rect.y = values[3] * img_h; object.rect.width = values[4] * img_w - object.rect.x; object.rect.height = values[5] * img_h - object.rect.y; objects.push_back(object); } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = 
cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_squeezenet(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/synset_words.txt ================================================ n01440764 tench, Tinca tinca n01443537 goldfish, Carassius auratus n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias n01491361 tiger shark, Galeocerdo cuvieri n01494475 hammerhead, hammerhead shark n01496331 electric ray, crampfish, numbfish, torpedo n01498041 stingray n01514668 cock n01514859 hen n01518878 ostrich, Struthio camelus n01530575 brambling, Fringilla montifringilla n01531178 goldfinch, Carduelis carduelis n01532829 house finch, linnet, Carpodacus mexicans n01534433 junco, snowbird n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea n01558993 robin, American robin, Turdus migratorius n01560419 bulbul n01580077 jay n01582220 magpie n01592084 chickadee n01601694 water ouzel, dipper n01608432 kite n01614925 bald eagle, American eagle, Haliaeetus leucocephalus n01616318 vulture n01622779 great grey owl, great gray owl, Strix nebulosa n01629819 European fire salamander, Salamandra salamandra n01630670 common newt, Triturus vulgaris n01631663 eft n01632458 spotted salamander, Ambystoma maculatum n01632777 axolotl, mud puppy, Ambystoma mexicanum n01641577 bullfrog, Rana catesbeiana n01644373 tree frog, tree-frog n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui n01664065 loggerhead, loggerhead turtle, Caretta caretta n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea n01667114 mud turtle n01667778 terrapin n01669191 box turtle, box tortoise n01675722 banded gecko n01677366 common iguana, iguana, Iguana iguana n01682714 American chameleon, anole, Anolis carolinensis n01685808 whiptail, whiptail lizard n01687978 agama 
n01688243 frilled lizard, Chlamydosaurus kingi n01689811 alligator lizard n01692333 Gila monster, Heloderma suspectum n01693334 green lizard, Lacerta viridis n01694178 African chameleon, Chamaeleo chamaeleon n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis n01697457 African crocodile, Nile crocodile, Crocodylus niloticus n01698640 American alligator, Alligator mississipiensis n01704323 triceratops n01728572 thunder snake, worm snake, Carphophis amoenus n01728920 ringneck snake, ring-necked snake, ring snake n01729322 hognose snake, puff adder, sand viper n01729977 green snake, grass snake n01734418 king snake, kingsnake n01735189 garter snake, grass snake n01737021 water snake n01739381 vine snake n01740131 night snake, Hypsiglena torquata n01742172 boa constrictor, Constrictor constrictor n01744401 rock python, rock snake, Python sebae n01748264 Indian cobra, Naja naja n01749939 green mamba n01751748 sea snake n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus n01756291 sidewinder, horned rattlesnake, Crotalus cerastes n01768244 trilobite n01770081 harvestman, daddy longlegs, Phalangium opilio n01770393 scorpion n01773157 black and gold garden spider, Argiope aurantia n01773549 barn spider, Araneus cavaticus n01773797 garden spider, Aranea diademata n01774384 black widow, Latrodectus mactans n01774750 tarantula n01775062 wolf spider, hunting spider n01776313 tick n01784675 centipede n01795545 black grouse n01796340 ptarmigan n01797886 ruffed grouse, partridge, Bonasa umbellus n01798484 prairie chicken, prairie grouse, prairie fowl n01806143 peacock n01806567 quail n01807496 partridge n01817953 African grey, African gray, Psittacus erithacus n01818515 macaw n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita n01820546 lorikeet n01824575 coucal n01828970 bee eater n01829413 hornbill n01833805 hummingbird n01843065 
jacamar n01843383 toucan n01847000 drake n01855032 red-breasted merganser, Mergus serrator n01855672 goose n01860187 black swan, Cygnus atratus n01871265 tusker n01872401 echidna, spiny anteater, anteater n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus n01877812 wallaby, brush kangaroo n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus n01883070 wombat n01910747 jellyfish n01914609 sea anemone, anemone n01917289 brain coral n01924916 flatworm, platyhelminth n01930112 nematode, nematode worm, roundworm n01943899 conch n01944390 snail n01945685 slug n01950731 sea slug, nudibranch n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore n01968897 chambered nautilus, pearly nautilus, nautilus n01978287 Dungeness crab, Cancer magister n01978455 rock crab, Cancer irroratus n01980166 fiddler crab n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica n01983481 American lobster, Northern lobster, Maine lobster, Homarus americans n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish n01985128 crayfish, crawfish, crawdad, crawdaddy n01986214 hermit crab n01990800 isopod n02002556 white stork, Ciconia ciconia n02002724 black stork, Ciconia nigra n02006656 spoonbill n02007558 flamingo n02009229 little blue heron, Egretta caerulea n02009912 American egret, great white heron, Egretta albus n02011460 bittern n02012849 crane n02013706 limpkin, Aramus pictus n02017213 European gallinule, Porphyrio porphyrio n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana n02018795 bustard n02025239 ruddy turnstone, Arenaria interpres n02027492 red-backed sandpiper, dunlin, Erolia alpina n02028035 redshank, Tringa totanus n02033041 dowitcher n02037110 oystercatcher, oyster catcher n02051845 pelican n02056570 king penguin, Aptenodytes patagonica n02058221 albatross, mollymawk n02066245 grey whale, gray whale, devilfish, 
Eschrichtius gibbosus, Eschrichtius robustus n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca n02074367 dugong, Dugong dugon n02077923 sea lion n02085620 Chihuahua n02085782 Japanese spaniel n02085936 Maltese dog, Maltese terrier, Maltese n02086079 Pekinese, Pekingese, Peke n02086240 Shih-Tzu n02086646 Blenheim spaniel n02086910 papillon n02087046 toy terrier n02087394 Rhodesian ridgeback n02088094 Afghan hound, Afghan n02088238 basset, basset hound n02088364 beagle n02088466 bloodhound, sleuthhound n02088632 bluetick n02089078 black-and-tan coonhound n02089867 Walker hound, Walker foxhound n02089973 English foxhound n02090379 redbone n02090622 borzoi, Russian wolfhound n02090721 Irish wolfhound n02091032 Italian greyhound n02091134 whippet n02091244 Ibizan hound, Ibizan Podenco n02091467 Norwegian elkhound, elkhound n02091635 otterhound, otter hound n02091831 Saluki, gazelle hound n02092002 Scottish deerhound, deerhound n02092339 Weimaraner n02093256 Staffordshire bullterrier, Staffordshire bull terrier n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier n02093647 Bedlington terrier n02093754 Border terrier n02093859 Kerry blue terrier n02093991 Irish terrier n02094114 Norfolk terrier n02094258 Norwich terrier n02094433 Yorkshire terrier n02095314 wire-haired fox terrier n02095570 Lakeland terrier n02095889 Sealyham terrier, Sealyham n02096051 Airedale, Airedale terrier n02096177 cairn, cairn terrier n02096294 Australian terrier n02096437 Dandie Dinmont, Dandie Dinmont terrier n02096585 Boston bull, Boston terrier n02097047 miniature schnauzer n02097130 giant schnauzer n02097209 standard schnauzer n02097298 Scotch terrier, Scottish terrier, Scottie n02097474 Tibetan terrier, chrysanthemum dog n02097658 silky terrier, Sydney silky n02098105 soft-coated wheaten terrier n02098286 West Highland white terrier n02098413 Lhasa, Lhasa apso n02099267 flat-coated retriever n02099429 curly-coated 
retriever n02099601 golden retriever n02099712 Labrador retriever n02099849 Chesapeake Bay retriever n02100236 German short-haired pointer n02100583 vizsla, Hungarian pointer n02100735 English setter n02100877 Irish setter, red setter n02101006 Gordon setter n02101388 Brittany spaniel n02101556 clumber, clumber spaniel n02102040 English springer, English springer spaniel n02102177 Welsh springer spaniel n02102318 cocker spaniel, English cocker spaniel, cocker n02102480 Sussex spaniel n02102973 Irish water spaniel n02104029 kuvasz n02104365 schipperke n02105056 groenendael n02105162 malinois n02105251 briard n02105412 kelpie n02105505 komondor n02105641 Old English sheepdog, bobtail n02105855 Shetland sheepdog, Shetland sheep dog, Shetland n02106030 collie n02106166 Border collie n02106382 Bouvier des Flandres, Bouviers des Flandres n02106550 Rottweiler n02106662 German shepherd, German shepherd dog, German police dog, alsatian n02107142 Doberman, Doberman pinscher n02107312 miniature pinscher n02107574 Greater Swiss Mountain dog n02107683 Bernese mountain dog n02107908 Appenzeller n02108000 EntleBucher n02108089 boxer n02108422 bull mastiff n02108551 Tibetan mastiff n02108915 French bulldog n02109047 Great Dane n02109525 Saint Bernard, St Bernard n02109961 Eskimo dog, husky n02110063 malamute, malemute, Alaskan malamute n02110185 Siberian husky n02110341 dalmatian, coach dog, carriage dog n02110627 affenpinscher, monkey pinscher, monkey dog n02110806 basenji n02110958 pug, pug-dog n02111129 Leonberg n02111277 Newfoundland, Newfoundland dog n02111500 Great Pyrenees n02111889 Samoyed, Samoyede n02112018 Pomeranian n02112137 chow, chow chow n02112350 keeshond n02112706 Brabancon griffon n02113023 Pembroke, Pembroke Welsh corgi n02113186 Cardigan, Cardigan Welsh corgi n02113624 toy poodle n02113712 miniature poodle n02113799 standard poodle n02113978 Mexican hairless n02114367 timber wolf, grey wolf, gray wolf, Canis lupus n02114548 white wolf, Arctic wolf, Canis lupus 
tundrarum n02114712 red wolf, maned wolf, Canis rufus, Canis niger n02114855 coyote, prairie wolf, brush wolf, Canis latrans n02115641 dingo, warrigal, warragal, Canis dingo n02115913 dhole, Cuon alpinus n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus n02117135 hyena, hyaena n02119022 red fox, Vulpes vulpes n02119789 kit fox, Vulpes macrotis n02120079 Arctic fox, white fox, Alopex lagopus n02120505 grey fox, gray fox, Urocyon cinereoargenteus n02123045 tabby, tabby cat n02123159 tiger cat n02123394 Persian cat n02123597 Siamese cat, Siamese n02124075 Egyptian cat n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor n02127052 lynx, catamount n02128385 leopard, Panthera pardus n02128757 snow leopard, ounce, Panthera uncia n02128925 jaguar, panther, Panthera onca, Felis onca n02129165 lion, king of beasts, Panthera leo n02129604 tiger, Panthera tigris n02130308 cheetah, chetah, Acinonyx jubatus n02132136 brown bear, bruin, Ursus arctos n02133161 American black bear, black bear, Ursus americans, Euarctos americans n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus n02134418 sloth bear, Melursus ursinus, Ursus ursinus n02137549 mongoose n02138441 meerkat, mierkat n02165105 tiger beetle n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle n02167151 ground beetle, carabid beetle n02168699 long-horned beetle, longicorn, longicorn beetle n02169497 leaf beetle, chrysomelid n02172182 dung beetle n02174001 rhinoceros beetle n02177972 weevil n02190166 fly n02206856 bee n02219486 ant, emmet, pismire n02226429 grasshopper, hopper n02229544 cricket n02231487 walking stick, walkingstick, stick insect n02233338 cockroach, roach n02236044 mantis, mantid n02256656 cicada, cicala n02259212 leafhopper n02264363 lacewing, lacewing fly n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk n02268853 damselfly n02276258 
admiral n02277742 ringlet, ringlet butterfly n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus n02280649 cabbage butterfly n02281406 sulphur butterfly, sulfur butterfly n02281787 lycaenid, lycaenid butterfly n02317335 starfish, sea star n02319095 sea urchin n02321529 sea cucumber, holothurian n02325366 wood rabbit, cottontail, cottontail rabbit n02326432 hare n02328150 Angora, Angora rabbit n02342885 hamster n02346627 porcupine, hedgehog n02356798 fox squirrel, eastern fox squirrel, Sciurus niger n02361337 marmot n02363005 beaver n02364673 guinea pig, Cavia cobaya n02389026 sorrel n02391049 zebra n02395406 hog, pig, grunter, squealer, Sus scrofa n02396427 wild boar, boar, Sus scrofa n02397096 warthog n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius n02403003 ox n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis n02410509 bison n02412080 ram, tup n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis n02417914 ibex, Capra ibex n02422106 hartebeest n02422699 impala, Aepyceros melampus n02423022 gazelle n02437312 Arabian camel, dromedary, Camelus dromedarius n02437616 llama n02441942 weasel n02442845 mink n02443114 polecat, fitch, foulmart, foumart, Mustela putorius n02443484 black-footed ferret, ferret, Mustela nigripes n02444819 otter n02445715 skunk, polecat, wood pussy n02447366 badger n02454379 armadillo n02457408 three-toed sloth, ai, Bradypus tridactylus n02480495 orangutan, orang, orangutang, Pongo pygmaeus n02480855 gorilla, Gorilla gorilla n02481823 chimpanzee, chimp, Pan troglodytes n02483362 gibbon, Hylobates lar n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus n02484975 guenon, guenon monkey n02486261 patas, hussar monkey, Erythrocebus patas n02486410 baboon n02487347 macaque n02488291 langur n02488702 colobus, colobus monkey n02489166 proboscis monkey, Nasalis larvatus n02490219 marmoset n02492035 capuchin, ringtail, Cebus 
capucinus n02492660 howler monkey, howler n02493509 titi, titi monkey n02493793 spider monkey, Ateles geoffroyi n02494079 squirrel monkey, Saimiri sciureus n02497673 Madagascar cat, ring-tailed lemur, Lemur catta n02500267 indri, indris, Indri indri, Indri brevicaudatus n02504013 Indian elephant, Elephas maximus n02504458 African elephant, Loxodonta africana n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca n02514041 barracouta, snoek n02526121 eel n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch n02606052 rock beauty, Holocanthus tricolor n02607072 anemone fish n02640242 sturgeon n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus n02643566 lionfish n02655020 puffer, pufferfish, blowfish, globefish n02666196 abacus n02667093 abaya n02669723 academic gown, academic robe, judge's robe n02672831 accordion, piano accordion, squeeze box n02676566 acoustic guitar n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier n02690373 airliner n02692877 airship, dirigible n02699494 altar n02701002 ambulance n02704792 amphibian, amphibious vehicle n02708093 analog clock n02727426 apiary, bee house n02730930 apron n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin n02749479 assault rifle, assault gun n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack n02776631 bakery, bakeshop, bakehouse n02777292 balance beam, beam n02782093 balloon n02783161 ballpoint, ballpoint pen, ballpen, Biro n02786058 Band Aid n02787622 banjo n02788148 bannister, banister, balustrade, balusters, handrail n02790996 barbell n02791124 barber chair n02791270 barbershop n02793495 barn n02794156 barometer n02795169 barrel, cask n02797295 barrow, garden cart, lawn cart, wheelbarrow n02799071 baseball n02802426 basketball n02804414 bassinet n02804610 bassoon n02807133 
bathing cap, swimming cap n02808304 bath towel n02808440 bathtub, bathing tub, bath, tub n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon n02814860 beacon, lighthouse, beacon light, pharos n02815834 beaker n02817516 bearskin, busby, shako n02823428 beer bottle n02823750 beer glass n02825657 bell cote, bell cot n02834397 bib n02835271 bicycle-built-for-two, tandem bicycle, tandem n02837789 bikini, two-piece n02840245 binder, ring-binder n02841315 binoculars, field glasses, opera glasses n02843684 birdhouse n02859443 boathouse n02860847 bobsled, bobsleigh, bob n02865351 bolo tie, bolo, bola tie, bola n02869837 bonnet, poke bonnet n02870880 bookcase n02871525 bookshop, bookstore, bookstall n02877765 bottlecap n02879718 bow n02883205 bow tie, bow-tie, bowtie n02892201 brass, memorial tablet, plaque n02892767 brassiere, bra, bandeau n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty n02895154 breastplate, aegis, egis n02906734 broom n02909870 bucket, pail n02910353 buckle n02916936 bulletproof vest n02917067 bullet train, bullet n02927161 butcher shop, meat market n02930766 cab, hack, taxi, taxicab n02939185 caldron, cauldron n02948072 candle, taper, wax light n02950826 cannon n02951358 canoe n02951585 can opener, tin opener n02963159 cardigan n02965783 car mirror n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig n02966687 carpenter's kit, tool kit n02971356 carton n02974003 car wheel n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM n02978881 cassette n02979186 cassette player n02980441 castle n02981792 catamaran n02988304 CD player n02992211 cello, violoncello n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone n02999410 chain n03000134 chainlink fence n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour n03000684 chain saw, chainsaw n03014705 chest 
n03016953 chiffonier, commode n03017168 chime, bell, gong n03018349 china cabinet, china closet n03026506 Christmas stocking n03028079 church, church building n03032252 cinema, movie theater, movie theatre, movie house, picture palace n03041632 cleaver, meat cleaver, chopper n03042490 cliff dwelling n03045698 cloak n03047690 clog, geta, patten, sabot n03062245 cocktail shaker n03063599 coffee mug n03063689 coffeepot n03065424 coil, spiral, volute, whorl, helix n03075370 combination lock n03085013 computer keyboard, keypad n03089624 confectionery, confectionary, candy store n03095699 container ship, containership, container vessel n03100240 convertible n03109150 corkscrew, bottle screw n03110669 cornet, horn, trumpet, trump n03124043 cowboy boot n03124170 cowboy hat, ten-gallon hat n03125729 cradle n03126707 crane n03127747 crash helmet n03127925 crate n03131574 crib, cot n03133878 Crock Pot n03134739 croquet ball n03141823 crutch n03146219 cuirass n03160309 dam, dike, dyke n03179701 desk n03180011 desktop computer n03187595 dial telephone, dial phone n03188531 diaper, nappy, napkin n03196217 digital clock n03197337 digital watch n03201208 dining table, board n03207743 dishrag, dishcloth n03207941 dishwasher, dish washer, dishwashing machine n03208938 disk brake, disc brake n03216828 dock, dockage, docking facility n03218198 dogsled, dog sled, dog sleigh n03220513 dome n03223299 doormat, welcome mat n03240683 drilling platform, offshore rig n03249569 drum, membranophone, tympan n03250847 drumstick n03255030 dumbbell n03259280 Dutch oven n03271574 electric fan, blower n03272010 electric guitar n03272562 electric locomotive n03290653 entertainment center n03291819 envelope n03297495 espresso maker n03314780 face powder n03325584 feather boa, boa n03337140 file, file cabinet, filing cabinet n03344393 fireboat n03345487 fire engine, fire truck n03347037 fire screen, fireguard n03355925 flagpole, flagstaff n03372029 flute, transverse flute n03376595 folding chair 
n03379051 football helmet n03384352 forklift n03388043 fountain n03388183 fountain pen n03388549 four-poster n03393912 freight car n03394916 French horn, horn n03400231 frying pan, frypan, skillet n03404251 fur coat n03417042 garbage truck, dustcart n03424325 gasmask, respirator, gas helmet n03425413 gas pump, gasoline pump, petrol pump, island dispenser n03443371 goblet n03444034 go-kart n03445777 golf ball n03445924 golfcart, golf cart n03447447 gondola n03447721 gong, tam-tam n03450230 gown n03452741 grand piano, grand n03457902 greenhouse, nursery, glasshouse n03459775 grille, radiator grille n03461385 grocery store, grocery, food market, market n03467068 guillotine n03476684 hair slide n03476991 hair spray n03478589 half track n03481172 hammer n03482405 hamper n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier n03485407 hand-held computer, hand-held microcomputer n03485794 handkerchief, hankie, hanky, hankey n03492542 hard disc, hard disk, fixed disk n03494278 harmonica, mouth organ, harp, mouth harp n03495258 harp n03496892 harvester, reaper n03498962 hatchet n03527444 holster n03529860 home theater, home theatre n03530642 honeycomb n03532672 hook, claw n03534580 hoopskirt, crinoline n03535780 horizontal bar, high bar n03538406 horse cart, horse-cart n03544143 hourglass n03584254 iPod n03584829 iron, smoothing iron n03590841 jack-o'-lantern n03594734 jean, blue jean, denim n03594945 jeep, landrover n03595614 jersey, T-shirt, tee shirt n03598930 jigsaw puzzle n03599486 jinrikisha, ricksha, rickshaw n03602883 joystick n03617480 kimono n03623198 knee pad n03627232 knot n03630383 lab coat, laboratory coat n03633091 ladle n03637318 lampshade, lamp shade n03642806 laptop, laptop computer n03649909 lawn mower, mower n03657121 lens cap, lens cover n03658185 letter opener, paper knife, paperknife n03661043 library n03662601 lifeboat n03666591 lighter, light, igniter, ignitor n03670208 limousine, limo n03673027 liner, ocean liner n03676483 lipstick, 
lip rouge n03680355 Loafer n03690938 lotion n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system n03692522 loupe, jeweler's loupe n03697007 lumbermill, sawmill n03706229 magnetic compass n03709823 mailbag, postbag n03710193 mailbox, letter box n03710637 maillot n03710721 maillot, tank suit n03717622 manhole cover n03720891 maraca n03721384 marimba, xylophone n03724870 mask n03729826 matchstick n03733131 maypole n03733281 maze, labyrinth n03733805 measuring cup n03742115 medicine chest, medicine cabinet n03743016 megalith, megalithic structure n03759954 microphone, mike n03761084 microwave, microwave oven n03763968 military uniform n03764736 milk can n03769881 minibus n03770439 miniskirt, mini n03770679 minivan n03773504 missile n03775071 mitten n03775546 mixing bowl n03776460 mobile home, manufactured home n03777568 Model T n03777754 modem n03781244 monastery n03782006 monitor n03785016 moped n03786901 mortar n03787032 mortarboard n03788195 mosque n03788365 mosquito net n03791053 motor scooter, scooter n03792782 mountain bike, all-terrain bike, off-roader n03792972 mountain tent n03793489 mouse, computer mouse n03794056 mousetrap n03796401 moving van n03803284 muzzle n03804744 nail n03814639 neck brace n03814906 necklace n03825788 nipple n03832673 notebook, notebook computer n03837869 obelisk n03838899 oboe, hautboy, hautbois n03840681 ocarina, sweet potato n03841143 odometer, hodometer, mileometer, milometer n03843555 oil filter n03854065 organ, pipe organ n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO n03866082 overskirt n03868242 oxcart n03868863 oxygen mask n03871628 packet n03873416 paddle, boat paddle n03874293 paddlewheel, paddle wheel n03874599 padlock n03876231 paintbrush n03877472 pajama, pyjama, pj's, jammies n03877845 palace n03884397 panpipe, pandean pipe, syrinx n03887697 paper towel n03888257 parachute, chute n03888605 parallel bars, bars n03891251 park bench n03891332 parking meter n03895866 passenger car, 
coach, carriage n03899768 patio, terrace n03902125 pay-phone, pay-station n03903868 pedestal, plinth, footstall n03908618 pencil box, pencil case n03908714 pencil sharpener n03916031 perfume, essence n03920288 Petri dish n03924679 photocopier n03929660 pick, plectrum, plectron n03929855 pickelhaube n03930313 picket fence, paling n03930630 pickup, pickup truck n03933933 pier n03935335 piggy bank, penny bank n03937543 pill bottle n03938244 pillow n03942813 ping-pong ball n03944341 pinwheel n03947888 pirate, pirate ship n03950228 pitcher, ewer n03954731 plane, carpenter's plane, woodworking plane n03956157 planetarium n03958227 plastic bag n03961711 plate rack n03967562 plow, plough n03970156 plunger, plumber's helper n03976467 Polaroid camera, Polaroid Land camera n03976657 pole n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria n03980874 poncho n03982430 pool table, billiard table, snooker table n03983396 pop bottle, soda bottle n03991062 pot, flowerpot n03992509 potter's wheel n03995372 power drill n03998194 prayer rug, prayer mat n04004767 printer n04005630 prison, prison house n04008634 projectile, missile n04009552 projector n04019541 puck, hockey puck n04023962 punching bag, punch bag, punching ball, punchball n04026417 purse n04033901 quill, quill pen n04033995 quilt, comforter, comfort, puff n04037443 racer, race car, racing car n04039381 racket, racquet n04040759 radiator n04041544 radio, wireless n04044716 radio telescope, radio reflector n04049303 rain barrel n04065272 recreational vehicle, RV, R.V. 
n04067472 reel n04069434 reflex camera n04070727 refrigerator, icebox n04074963 remote control, remote n04081281 restaurant, eating house, eating place, eatery n04086273 revolver, six-gun, six-shooter n04090263 rifle n04099969 rocking chair, rocker n04111531 rotisserie n04116512 rubber eraser, rubber, pencil eraser n04118538 rugby ball n04118776 rule, ruler n04120489 running shoe n04125021 safe n04127249 safety pin n04131690 saltshaker, salt shaker n04133789 sandal n04136333 sarong n04141076 sax, saxophone n04141327 scabbard n04141975 scale, weighing machine n04146614 school bus n04147183 schooner n04149813 scoreboard n04152593 screen, CRT screen n04153751 screw n04154565 screwdriver n04162706 seat belt, seatbelt n04179913 sewing machine n04192698 shield, buckler n04200800 shoe shop, shoe-shop, shoe store n04201297 shoji n04204238 shopping basket n04204347 shopping cart n04208210 shovel n04209133 shower cap n04209239 shower curtain n04228054 ski n04229816 ski mask n04235860 sleeping bag n04238763 slide rule, slipstick n04239074 sliding door n04243546 slot, one-armed bandit n04251144 snorkel n04252077 snowmobile n04252225 snowplow, snowplough n04254120 soap dispenser n04254680 soccer ball n04254777 sock n04258138 solar dish, solar collector, solar furnace n04259630 sombrero n04263257 soup bowl n04264628 space bar n04265275 space heater n04266014 space shuttle n04270147 spatula n04273569 speedboat n04275548 spider web, spider's web n04277352 spindle n04285008 sports car, sport car n04286575 spotlight, spot n04296562 stage n04310018 steam locomotive n04311004 steel arch bridge n04311174 steel drum n04317175 stethoscope n04325704 stole n04326547 stone wall n04328186 stopwatch, stop watch n04330267 stove n04332243 strainer n04335435 streetcar, tram, tramcar, trolley, trolley car n04336792 stretcher n04344873 studio couch, day bed n04346328 stupa, tope n04347754 submarine, pigboat, sub, U-boat n04350905 suit, suit of clothes n04355338 sundial n04355933 sunglass n04356056 
sunglasses, dark glasses, shades n04357314 sunscreen, sunblock, sun blocker n04366367 suspension bridge n04367480 swab, swob, mop n04370456 sweatshirt n04371430 swimming trunks, bathing trunks n04371774 swing n04372370 switch, electric switch, electrical switch n04376876 syringe n04380533 table lamp n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle n04392985 tape player n04398044 teapot n04399382 teddy, teddy bear n04404412 television, television system n04409515 tennis ball n04417672 thatch, thatched roof n04418357 theater curtain, theatre curtain n04423845 thimble n04428191 thresher, thrasher, threshing machine n04429376 throne n04435653 tile roof n04442312 toaster n04443257 tobacco shop, tobacconist shop, tobacconist n04447861 toilet seat n04456115 torch n04458633 totem pole n04461696 tow truck, tow car, wrecker n04462240 toyshop n04465501 tractor n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi n04476259 tray n04479046 trench coat n04482393 tricycle, trike, velocipede n04483307 trimaran n04485082 tripod n04486054 triumphal arch n04487081 trolleybus, trolley coach, trackless trolley n04487394 trombone n04493381 tub, vat n04501370 turnstile n04505470 typewriter keyboard n04507155 umbrella n04509417 unicycle, monocycle n04515003 upright, upright piano n04517823 vacuum, vacuum cleaner n04522168 vase n04523525 vault n04525038 velvet n04525305 vending machine n04532106 vestment n04532670 viaduct n04536866 violin, fiddle n04540053 volleyball n04542943 waffle iron n04548280 wall clock n04548362 wallet, billfold, notecase, pocketbook n04550184 wardrobe, closet, press n04552348 warplane, military plane n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin n04554684 washer, automatic washer, washing machine n04557648 water bottle n04560804 water jug n04562935 water tower n04579145 whiskey jug n04579432 whistle n04584207 wig n04589890 window screen n04590129 window shade n04591157 Windsor tie 
n04591713 wine bottle n04592741 wing n04596742 wok n04597913 wooden spoon n04599235 wool, woolen, woollen n04604644 worm fence, snake fence, snake-rail fence, Virginia fence n04606251 wreck n04612504 yawl n04613696 yurt n06359193 web site, website, internet site, site n06596364 comic book n06785654 crossword puzzle, crossword n06794110 street sign n06874185 traffic light, traffic signal, stoplight n07248320 book jacket, dust cover, dust jacket, dust wrapper n07565083 menu n07579787 plate n07583066 guacamole n07584110 consomme n07590611 hot pot, hotpot n07613480 trifle n07614500 ice cream, icecream n07615774 ice lolly, lolly, lollipop, popsicle n07684084 French loaf n07693725 bagel, beigel n07695742 pretzel n07697313 cheeseburger n07697537 hotdog, hot dog, red hot n07711569 mashed potato n07714571 head cabbage n07714990 broccoli n07715103 cauliflower n07716358 zucchini, courgette n07716906 spaghetti squash n07717410 acorn squash n07717556 butternut squash n07718472 cucumber, cuke n07718747 artichoke, globe artichoke n07720875 bell pepper n07730033 cardoon n07734744 mushroom n07742313 Granny Smith n07745940 strawberry n07747607 orange n07749582 lemon n07753113 fig n07753275 pineapple, ananas n07753592 banana n07754684 jackfruit, jak, jack n07760859 custard apple n07768694 pomegranate n07802026 hay n07831146 carbonara n07836838 chocolate sauce, chocolate syrup n07860988 dough n07871810 meat loaf, meatloaf n07873807 pizza, pizza pie n07875152 potpie n07880968 burrito n07892512 red wine n07920052 espresso n07930864 cup n07932039 eggnog n09193705 alp n09229709 bubble n09246464 cliff, drop, drop-off n09256479 coral reef n09288635 geyser n09332890 lakeside, lakeshore n09399592 promontory, headland, head, foreland n09421951 sandbar, sand bar n09428293 seashore, coast, seacoast, sea-coast n09468604 valley, vale n09472597 volcano n09835506 ballplayer, baseball player n10148035 groom, bridegroom n10565667 scuba diver n11879895 rapeseed n11939491 daisy n12057211 yellow lady's 
slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum n12144580 corn n12267677 acorn n12620546 hip, rose hip, rosehip n12768682 buckeye, horse chestnut, conker n12985857 coral fungus n12998815 agaric n13037406 gyromitra n13040303 stinkhorn, carrion fungus n13044778 earthstar n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa n13054560 bolete n13133613 ear, spike, capitulum n15075141 toilet tissue, toilet paper, bathroom tissue ================================================ FILE: examples/whisper.cpp ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // whisper speech recognition implemented with ncnn library // convert openai-whisper checkpoints to ncnn models // 1. install pnnx via pip install -U pnnx // 2. obtain export_ncnn.py script from https://github.com/nihui/ncnn-android-whisper // 3. edit export_ncnn.py for changing the models among tiny/base/small/medium/large-v3-turbo // 4. make sure you have good internet connection // python export_ncnn.py // convert vocab.json to simple whisper_vocab.txt // 1. obtain vocab.json file from https://huggingface.co/openai/whisper-tiny/blob/main/vocab.json // 2. 
convert json dict into plain list, save to whisper_vocab.txt // NOTE large-v3-turbo has special token ids from others, one more language(yue) and does not support translation #include "net.h" #include "layer.h" #include "layer_type.h" #include #include #include #include #include #include #include // https://huggingface.co/openai/whisper-tiny/blob/main/tokenizer_config.json static const int token_endoftext = 50257; static const int token_startoftranscript = 50258; static const int token_lang_first = 50259; static const int token_lang_last = 50357; static const int token_lang_count = token_lang_last - token_lang_first + 1; // clang-format off // *INDENT-OFF* static const char* token_langs[] = { "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su" }; // *INDENT-ON* // clang-format on static const int token_translate = 50358; static const int token_transcribe = 50359; static const int token_startoflm = 50360; static const int token_startofprev = 50361; static const int token_nocaptions = 50362; static const int token_notimestamps = 50363; static const int token_timestamp_first = 50364; static const int token_timestamp_last = 51864; // https://huggingface.co/openai/whisper-large-v3-turbo/blob/main/tokenizer_config.json // static const int token_endoftext = 50257; // static const int token_startoftranscript = 50258; // static const int token_lang_first = 50259; // static const int token_lang_last = 50357; // static const int 
token_lang_count = token_lang_last - token_lang_first + 1; // // clang-format off // // *INDENT-OFF* // static const char* token_langs[] = { // "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", // "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", // "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", // "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", // "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", // "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", // "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su", "yue" // }; // // *INDENT-ON* // // clang-format on // static const int token_translate = 50359; // static const int token_transcribe = 50360; // static const int token_startoflm = 50361; // static const int token_startofprev = 50362; // static const int token_nospeech = 50363; // static const int token_notimestamps = 50364; // static const int token_timestamp_first = 50365; // static const int token_timestamp_last = 51865; // tokenizer for handling text tokens class Tokenizer { public: std::vector reverse_vocab; uint8_t byte_decoder[512]; // unicode code point to byte value // generate byte decoder for tokenization void generate_byte_decoder() { // initialize array to 0 memset(byte_decoder, 0, 512 * sizeof(uint8_t)); // define function to check if char is in "printable" range auto is_printable = [](int b) { return (b >= '!' && b <= '~') // '!' 
to '~' || (b >= 161 && b <= 172) // '¡' to '¬' || (b >= 174 && b <= 255); // '®' to 'ÿ' }; // handle "printable" characters // for these chars, key and value are the same for (int b = 0; b < 256; ++b) { if (is_printable(b)) { byte_decoder[b] = static_cast(b); } } // handle remaining characters // for these chars, key starts from 256 and increments int n = 0; for (int b = 0; b < 256; ++b) { if (!is_printable(b)) { byte_decoder[256 + n] = static_cast(b); n++; } } } // convert utf-8 string to code points std::vector utf8_to_codepoints(const std::string& s) const { std::vector codepoints; for (size_t i = 0; i < s.length();) { uint32_t cp = 0; int len = 0; unsigned char c = s[i]; if (c < 0x80) // 1-byte { cp = c; len = 1; } else if ((c & 0xE0) == 0xC0) // 2-byte { cp = ((s[i] & 0x1F) << 6) | (s[i + 1] & 0x3F); len = 2; } else if ((c & 0xF0) == 0xE0) // 3-byte { cp = ((s[i] & 0x0F) << 12) | ((s[i + 1] & 0x3F) << 6) | (s[i + 2] & 0x3F); len = 3; } else if ((c & 0xF8) == 0xF0) // 4-byte { cp = ((s[i] & 0x07) << 18) | ((s[i + 1] & 0x3F) << 12) | ((s[i + 2] & 0x3F) << 6) | (s[i + 3] & 0x3F); len = 4; } else { // invalid utf-8 start byte, skip i++; continue; } codepoints.push_back(cp); i += len; } return codepoints; } bool load(const char* vocab_path) { // generate decoder when loading generate_byte_decoder(); { FILE* fp = fopen(vocab_path, "rb"); if (!fp) { fprintf(stderr, "fopen %s failed\n", vocab_path); return false; } char line[256]; while (!feof(fp)) { char* s = fgets(line, 255, fp); if (!s) break; int vocab_len = strlen(line); if (vocab_len > 1) { // drop the tail newline vocab_len -= 1; } reverse_vocab.push_back(std::string(line, vocab_len)); } fclose(fp); } return true; } // decode token ids to text std::string decode(const std::vector& tokens) const { std::string outstring; bool in_timestamp = false; // step 1: concatenate token ids to a string with special unicode characters std::string text_buffer; for (int token_id : tokens) { if (token_id < token_endoftext) { 
text_buffer += reverse_vocab[token_id]; continue; } // handle timestamp tokens if (token_id >= token_timestamp_first && token_id <= token_timestamp_last) { int timestamp = (token_id - token_timestamp_first) * 2; char tmp[256]; sprintf(tmp, " [%d.%02d] ", timestamp / 100, timestamp % 100); if (in_timestamp) { // step 2: translate the special string back to original byte stream std::vector codepoints = utf8_to_codepoints(text_buffer); std::vector byte_sequence; for (uint32_t cp : codepoints) { byte_sequence.push_back(byte_decoder[cp]); } std::string s(byte_sequence.begin(), byte_sequence.end()); text_buffer.clear(); outstring += s; outstring += tmp; outstring += "\n"; in_timestamp = false; } else { outstring += tmp; in_timestamp = true; } } // ignore functional/special tokens } if (!text_buffer.empty()) { // step 2: translate the special string back to original byte stream std::vector codepoints = utf8_to_codepoints(text_buffer); std::vector byte_sequence; for (uint32_t cp : codepoints) { byte_sequence.push_back(byte_decoder[cp]); } std::string s(byte_sequence.begin(), byte_sequence.end()); outstring += s; } return outstring; } }; // result class for beam search class Result { public: std::vector ids; float score; std::vector kvcache; }; // main whisper implementation class class Whisper { public: int load(); int detect_lang(const std::vector& samples, std::string& lang) const; int transcribe(const std::vector& samples, const char* lang, std::string& text) const; protected: int extract_fbank_feature(const std::vector& samples, ncnn::Mat& input_features) const; int run_encoder(const ncnn::Mat& input_features, ncnn::Mat& encoder_states) const; int run_decoder_prefill(const std::vector& tokens, const ncnn::Mat& encoder_states, ncnn::Mat& last_logits, std::vector& out_kvcache) const; int run_decoder_step(const std::vector& tokens, const ncnn::Mat& encoder_states, ncnn::Mat& last_logits, const std::vector& kvcache, std::vector& out_kvcache) const; protected: ncnn::Net 
fbank; ncnn::Net encoder; ncnn::Net embed_token; ncnn::Net embed_position; ncnn::Net decoder; ncnn::Net proj_out; Tokenizer tokenizer; protected: std::vector kv_cache_indexes; std::vector out_kv_cache_indexes; }; int Whisper::load() { // whisper models could be found at // https://github.com/nihui/ncnn-android-whisper/releases // https://github.com/nihui/ncnn-android-whisper/tree/master/app/src/main/assets fbank.opt.use_vulkan_compute = true; fbank.opt.use_fp16_packed = false; fbank.opt.use_fp16_storage = false; fbank.opt.use_fp16_arithmetic = false; encoder.opt.use_vulkan_compute = true; encoder.opt.use_fp16_packed = false; encoder.opt.use_fp16_storage = false; encoder.opt.use_fp16_arithmetic = false; decoder.opt.use_vulkan_compute = true; decoder.opt.use_fp16_packed = false; decoder.opt.use_fp16_storage = false; decoder.opt.use_fp16_arithmetic = false; proj_out.opt.use_vulkan_compute = true; proj_out.opt.use_fp16_packed = false; proj_out.opt.use_fp16_storage = false; proj_out.opt.use_fp16_arithmetic = false; fbank.load_param("whisper_tiny_fbank.ncnn.param"); fbank.load_model("whisper_tiny_fbank.ncnn.bin"); encoder.load_param("whisper_tiny_encoder.ncnn.param"); encoder.load_model("whisper_tiny_encoder.ncnn.bin"); embed_token.load_param("whisper_tiny_embed_token.ncnn.param"); embed_token.load_model("whisper_tiny_embed_token.ncnn.bin"); embed_position.load_param("whisper_tiny_embed_position.ncnn.param"); embed_position.load_model("whisper_tiny_embed_position.ncnn.bin"); decoder.load_param("whisper_tiny_decoder.ncnn.param"); decoder.load_model("whisper_tiny_decoder.ncnn.bin"); proj_out.load_param("whisper_tiny_proj_out.ncnn.param"); proj_out.load_model("whisper_tiny_proj_out.ncnn.bin"); // fbank.load_param("whisper_large_v3_turbo_fbank.ncnn.param"); // fbank.load_model("whisper_large_v3_turbo_fbank.ncnn.bin"); // // encoder.load_param("whisper_large_v3_turbo_encoder.ncnn.param"); // encoder.load_model("whisper_large_v3_turbo_encoder.ncnn.bin"); // // 
embed_token.load_param("whisper_large_v3_turbo_embed_token.ncnn.param"); // embed_token.load_model("whisper_large_v3_turbo_embed_token.ncnn.bin"); // // embed_position.load_param("whisper_large_v3_turbo_embed_position.ncnn.param"); // embed_position.load_model("whisper_large_v3_turbo_embed_position.ncnn.bin"); // // decoder.load_param("whisper_large_v3_turbo_decoder.ncnn.param"); // decoder.load_model("whisper_large_v3_turbo_decoder.ncnn.bin"); // // proj_out.load_param("whisper_large_v3_turbo_proj_out.ncnn.param"); // proj_out.load_model("whisper_large_v3_turbo_proj_out.ncnn.bin"); tokenizer.load("whisper_vocab.txt"); // resolve kv cache blob indexes for (size_t i = 0; i < decoder.layers().size(); i++) { const ncnn::Layer* mha = decoder.layers()[i]; if (mha->typeindex != ncnn::LayerType::MultiHeadAttention) continue; const size_t input_count = mha->bottoms.size(); const size_t output_count = mha->tops.size(); if (output_count == 3) { kv_cache_indexes.push_back(mha->bottoms[input_count - 2]); kv_cache_indexes.push_back(mha->bottoms[input_count - 1]); out_kv_cache_indexes.push_back(mha->tops[output_count - 2]); out_kv_cache_indexes.push_back(mha->tops[output_count - 1]); } } return 0; } // apply log_softmax in-place static void log_softmax_inplace(ncnn::Mat& m) { ncnn::Option opt; opt.use_packing_layout = false; opt.use_fp16_storage = false; { ncnn::Layer* softmax = ncnn::create_layer_cpu("Softmax"); ncnn::ParamDict pd; pd.set(0, 0); // axis softmax->load_param(pd); softmax->forward_inplace(m, opt); delete softmax; } { ncnn::Layer* log = ncnn::create_layer_cpu("UnaryOp"); ncnn::ParamDict pd; pd.set(0, 8); // log log->load_param(pd); log->forward_inplace(m, opt); delete log; } } int Whisper::detect_lang(const std::vector& samples, std::string& lang) const { std::vector ids(1); ids[0] = token_startoftranscript; ncnn::Mat input_features; extract_fbank_feature(samples, input_features); ncnn::Mat encoder_states; run_encoder(input_features, encoder_states); ncnn::Mat 
logits;
    std::vector out_kvcache;
    run_decoder_prefill(ids, encoder_states, logits, out_kvcache);

    // find the lang token with highest prob
    // we are only interested in lang part and no_speech
    int lang_id = token_lang_first;
    float max_prob = logits[token_lang_first];
    for (int i = token_lang_first; i <= token_lang_last; i++)
    {
        float prob = logits[i];
        if (prob > max_prob)
        {
            max_prob = prob;
            lang_id = i;
        }
    }

    lang = token_langs[lang_id - token_lang_first];

    return 0;
}

// Transcribe samples with beam search.
//   samples  16-bit pcm audio, at most the first 480000 samples are used
//   lang     language name, must match one entry of token_langs
//   text     receives the decoded text of the best finished hypothesis;
//            left untouched when no hypothesis reached token_endoftext
// Returns 0 on success (including "no result"), -1 when lang is missing or unknown
// or when one of the networks produced no output.
int Whisper::transcribe(const std::vector<short>& samples, const char* lang, std::string& text) const
{
    if (!lang)
    {
        fprintf(stderr, "language not specified\n");
        return -1;
    }

    // find lang token id by lang string
    int token_lang = -1;
    for (int i = 0; i < token_lang_count; i++)
    {
        if (strcmp(token_langs[i], lang) == 0)
        {
            token_lang = token_lang_first + i;
            break;
        }
    }

    if (token_lang == -1)
    {
        fprintf(stderr, "language %s not supported\n", lang);
        return -1;
    }

    // initialize with prompt tokens
    std::vector<int> ids(4);
    ids[0] = token_startoftranscript;
    ids[1] = token_lang;
    ids[2] = token_transcribe;
    ids[3] = token_notimestamps;

    ncnn::Mat input_features;
    if (extract_fbank_feature(samples, input_features) != 0)
        return -1;

    ncnn::Mat encoder_states;
    if (run_encoder(input_features, encoder_states) != 0 || encoder_states.empty())
        return -1;

    const int beam_size = 5;
    const int max_candidates = 5;

    // the whisper text decoder has 448 positions, a hypothesis must never grow past that
    // otherwise the position embedding lookup in run_decoder_step runs out of range
    const size_t max_token_count = 448;

    std::vector<Result> finished_beams;

    std::vector<Result> beams(1);
    beams[0].ids = ids;
    beams[0].score = 0.f;

    int step = 0;

    // beam search loop
    for (;;)
    {
        std::vector<Result> candidates;

        for (size_t i = 0; i < beams.size(); i++)
        {
            const Result& beam = beams[i];

            ncnn::Mat logits;
            std::vector<ncnn::Mat> out_kvcache;

            // the first step decodes the whole prompt, later steps only the newest token
            if (step == 0)
            {
                run_decoder_prefill(beam.ids, encoder_states, logits, out_kvcache);
            }
            else
            {
                run_decoder_step(beam.ids, encoder_states, logits, beam.kvcache, out_kvcache);
            }

            if (logits.empty())
            {
                fprintf(stderr, "decoder produced no logits\n");
                return -1;
            }

            log_softmax_inplace(logits);

            // get topk candidates, never more than the vocabulary offers
            const int topk = std::min(5, logits.w);

            std::vector<std::pair<float, int> > vec(logits.w);
            for (int j = 0; j < logits.w; j++)
            {
                vec[j] = std::make_pair(logits[j], j);
            }

            std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater<std::pair<float, int> >());

            for (int j = 0; j < topk; j++)
            {
                int next_id = vec[j].second;
                float next_id_score = vec[j].first;

                Result candidate;
                candidate.ids = beam.ids;
                candidate.ids.push_back(next_id);
                candidate.score = beam.score + next_id_score;
                candidate.kvcache = out_kvcache;

                candidates.push_back(candidate);
            }
        }

        // sort candidates by score
        std::sort(candidates.begin(), candidates.end(), [](const Result& a, const Result& b) {
            return a.score > b.score;
        });

        // hypotheses ending with token_endoftext are done, the others stay alive
        beams.clear();
        for (size_t i = 0; i < candidates.size(); i++)
        {
            const Result& candidate = candidates[i];
            if (candidate.ids.back() == token_endoftext)
            {
                finished_beams.push_back(candidate);
            }
            else
            {
                beams.push_back(candidate);
            }
        }

        if (beams.size() > (size_t)beam_size)
        {
            beams.resize(beam_size);
        }

        step++;

        if (beams.empty())
        {
            break;
        }

        if (finished_beams.size() >= (size_t)max_candidates)
        {
            break;
        }

        // all live hypotheses have the same length, stop before the decoder context is exceeded
        if (beams[0].ids.size() >= max_token_count)
        {
            break;
        }
    }

    if (finished_beams.empty())
    {
        // no results
        return 0;
    }

    // find the best result based on average score
    int max_avg_score_index = 0;
    float max_avg_score = -FLT_MAX;
    for (size_t i = 0; i < finished_beams.size(); i++)
    {
        const Result& result = finished_beams[i];

        float avg_score = result.score / result.ids.size();
        if (avg_score > max_avg_score)
        {
            max_avg_score_index = (int)i;
            max_avg_score = avg_score;
        }
    }

    const Result& best_result = finished_beams[max_avg_score_index];

    text = tokenizer.decode(best_result.ids);

    return 0;
}

// Convert pcm samples into the encoder input.
// The samples are scaled by 1/32768, zero-padded to 480000 values, passed through
// the fbank net and the last output frame is dropped.
// Input longer than 480000 samples is truncated.
// Returns 0 on success, -1 when the fbank net produced no usable output.
int Whisper::extract_fbank_feature(const std::vector<short>& samples, ncnn::Mat& input_features) const
{
    const int max_samples = 480000;

    // never write past the fixed-size waveform buffer
    const int samples_size = (int)std::min(samples.size(), (size_t)max_samples);

    // pad to 480000, normalize samples to -1~1
    ncnn::Mat waveform(max_samples);
    waveform.fill(0.f);
    for (int i = 0; i < samples_size; i++)
    {
        waveform[i] = samples[i] / 32768.0f;
    }

    ncnn::Extractor ex = fbank.create_extractor();

    ex.input("in0", waveform);

    ex.extract("out0", input_features);

    if (input_features.empty() || input_features.w < 2)
    {
        fprintf(stderr, "fbank produced no features\n");
        return -1;
    }

    // drop the last frame
    {
        ncnn::Mat input_features_3k(input_features.w - 1, input_features.h);
        for (int i = 0; i < input_features.h; i++)
        {
            memcpy(input_features_3k.row(i), input_features.row(i), (input_features.w - 1) * sizeof(float));
        }
        input_features = input_features_3k;
    }

    return 0;
}

int
Whisper::run_encoder(const ncnn::Mat& input_features, ncnn::Mat& encoder_states) const
{
    // feed input_features to blob "in0" of the encoder net and read blob "out0"
    // the return values of input/extract are not checked, the function always returns 0
    ncnn::Extractor ex = encoder.create_extractor();

    ex.input("in0", input_features);

    ex.extract("out0", encoder_states);

    return 0;
}

// Decode the complete token sequence in one pass.
//   tokens          token ids to decode, copied as int
//   encoder_states  encoder output, fed to decoder blob "in1"
//   last_logits     receives the proj_out output for the last token, reshaped to 1-dim
//   out_kvcache     receives one blob per entry of out_kv_cache_indexes
// Always returns 0.
// NOTE(review): an empty tokens vector makes dst_seqlen - 1 negative below — callers in this file never pass one
int Whisper::run_decoder_prefill(const std::vector& tokens, const ncnn::Mat& encoder_states, ncnn::Mat& last_logits, std::vector& out_kvcache) const
{
    const int dst_seqlen = tokens.size();

    // token embedding
    ncnn::Mat token_embeds;
    {
        // the token ids are stored as raw ints inside the Mat
        ncnn::Mat input_tokens(dst_seqlen);
        int* p = input_tokens;
        memcpy(p, tokens.data(), tokens.size() * sizeof(int));

        ncnn::Extractor ex = embed_token.create_extractor();
        ex.input("in0", input_tokens);
        ex.extract("out0", token_embeds);
    }

    // position embedding
    ncnn::Mat position_embeds;
    {
        // positions are simply 0 .. dst_seqlen-1, again stored as raw ints
        ncnn::Mat input_positions(dst_seqlen);
        int* p = input_positions;
        for (int i = 0; i < dst_seqlen; i++)
        {
            p[i] = i;
        }

        ncnn::Extractor ex = embed_position.create_extractor();
        ex.input("in0", input_positions);
        ex.extract("out0", position_embeds);
    }

    // input embedding = token + position
    ncnn::Mat input_embeds;
    {
        // element-wise sum over total() elements
        input_embeds.create_like(token_embeds);
        for (int i = 0; i < input_embeds.total(); i++)
        {
            input_embeds[i] = token_embeds[i] + position_embeds[i];
        }
    }

    // create attention mask (causal mask)
    // row i keeps 0 for columns <= i and gets -INFINITY for every later column
    ncnn::Mat attention_mask(dst_seqlen, dst_seqlen);
    attention_mask.fill(0.f);
    for (int i = 0; i < dst_seqlen; i++)
    {
        for (int j = i + 1; j < dst_seqlen; j++)
        {
            attention_mask.row(i)[j] = -INFINITY;
        }
    }

    ncnn::Mat output_states;
    {
        ncnn::Extractor ex = decoder.create_extractor();
        ex.input("in0", input_embeds);
        ex.input("in1", encoder_states);
        ex.input("in2", attention_mask);

        // fetch the kv cache blobs resolved in load(), they are extracted before "out0"
        // NOTE(review): the third extract argument is 1 here — confirm its meaning against ncnn::Extractor::extract
        out_kvcache.resize(out_kv_cache_indexes.size());
        for (size_t i = 0; i < out_kv_cache_indexes.size(); i++)
        {
            ex.extract(out_kv_cache_indexes[i], out_kvcache[i], 1);
        }

        ex.extract("out0", output_states);
    }

    // get last token's state for next token prediction
    ncnn::Mat last_state = output_states.row_range(dst_seqlen - 1, 1).clone();
    {
        ncnn::Extractor ex = proj_out.create_extractor();
        ex.input("in0", last_state);
        ex.extract("out0", last_logits);
    }

    // flatten to a 1-dim Mat of last_logits.w values
    last_logits = last_logits.reshape(last_logits.w);

    return 0;
}

// Decode only the newest token, reusing the kv cache of the previous call.
//   tokens          full token sequence so far, only tokens.back() and tokens.size() are used
//   encoder_states  encoder output, fed to decoder blob "in1"
//   last_logits     receives the proj_out output for the token, reshaped to 1-dim
//   kvcache         blobs fed to the decoder inputs listed in kv_cache_indexes
//   out_kvcache     receives one blob per entry of out_kv_cache_indexes
// Always returns 0.
int Whisper::run_decoder_step(const std::vector& tokens, const ncnn::Mat& encoder_states, ncnn::Mat& last_logits, const std::vector& kvcache, std::vector& out_kvcache) const
{
    const int token_id = tokens.back();

    const int dst_seqlen = 1;

    // token embedding
    ncnn::Mat token_embeds;
    {
        ncnn::Mat input_tokens(dst_seqlen);
        ((int*)input_tokens)[0] = token_id;

        ncnn::Extractor ex = embed_token.create_extractor();
        ex.input("in0", input_tokens);
        ex.extract("out0", token_embeds);
    }

    // position embedding
    ncnn::Mat position_embeds;
    {
        // the position of the newest token is its index in the full sequence
        ncnn::Mat input_positions(dst_seqlen);
        ((int*)input_positions)[0] = tokens.size() - 1;

        ncnn::Extractor ex = embed_position.create_extractor();
        ex.input("in0", input_positions);
        ex.extract("out0", position_embeds);
    }

    // input embedding = token + position
    ncnn::Mat input_embeds;
    {
        input_embeds.create_like(token_embeds);
        for (int i = 0; i < input_embeds.total(); i++)
        {
            input_embeds[i] = token_embeds[i] + position_embeds[i];
        }
    }

    // single token doesn't need attention mask
    // a 1x1 all-zero mask is still passed because the decoder takes "in2"
    ncnn::Mat attention_mask(dst_seqlen, dst_seqlen);
    attention_mask.fill(0.f);

    ncnn::Mat output_states;
    {
        ncnn::Extractor ex = decoder.create_extractor();
        ex.input("in0", input_embeds);
        ex.input("in1", encoder_states);
        ex.input("in2", attention_mask);

        // pass in kv cache from previous steps
        // kvcache is indexed in parallel with kv_cache_indexes
        for (size_t i = 0; i < kv_cache_indexes.size(); i++)
        {
            ex.input(kv_cache_indexes[i], kvcache[i]);
        }

        // extract updated kv cache
        out_kvcache.resize(out_kv_cache_indexes.size());
        for (size_t i = 0; i < out_kv_cache_indexes.size(); i++)
        {
            ex.extract(out_kv_cache_indexes[i], out_kvcache[i], 1);
        }

        ex.extract("out0", output_states);
    }

    // get last token's state for prediction
    ncnn::Mat last_state = output_states.row_range(dst_seqlen - 1, 1).clone();
    {
        ncnn::Extractor ex = proj_out.create_extractor();
        ex.input("in0", last_state);
        ex.extract("out0", last_logits);
    }

    last_logits =
last_logits.reshape(last_logits.w); return 0; } static int load_wav_samples(const char* wavpath, std::vector& samples) { FILE* fp = fopen(wavpath, "rb"); if (!fp) { fprintf(stderr, "open %s failed\n", wavpath); return -1; } // https://stackoverflow.com/questions/1537964/visual-c-equivalent-of-gccs-attribute-packed #ifdef _MSC_VER #define PACK(__Declaration__) __pragma(pack(push, 1)) __Declaration__ __pragma(pack(pop)) #else #define PACK(__Declaration__) __Declaration__ __attribute__((__packed__)) #endif PACK(struct wav_header { char riff[4]; uint32_t chunk_size; char wave[4]; char fmt[4]; uint32_t subchunk1_size; uint16_t audio_format; uint16_t num_channels; uint32_t sample_rate; uint32_t byte_rate; uint16_t block_align; uint16_t bits_per_sample; char data[4]; uint32_t data_size; }); wav_header header; if (fread(&header, sizeof(wav_header), 1, fp) != 1) { fprintf(stderr, "failed to read wav header from %s\n", wavpath); fclose(fp); return -1; } if (memcmp(header.riff, "RIFF", 4) != 0 || memcmp(header.wave, "WAVE", 4) != 0 || memcmp(header.fmt, "fmt ", 4) != 0 || memcmp(header.data, "data", 4) != 0) { fprintf(stderr, "%s is not a valid wav file\n", wavpath); fclose(fp); return -1; } if (header.subchunk1_size != 16 || header.audio_format != 1 || header.num_channels != 1 || header.sample_rate != 16000 || header.bits_per_sample != 16) { fprintf(stderr, "%s is not pcm s16le 16k wav\n", wavpath); fprintf(stderr, "ffmpeg -i input.xxx -vn -c:a pcm_s16le -ac 1 -ar 16000 -fflags bitexact output.wav\n"); fclose(fp); return -1; } fseek(fp, 0, SEEK_END); long len = ftell(fp); samples.resize((len - sizeof(wav_header)) / sizeof(short)); rewind(fp); fseek(fp, sizeof(wav_header), SEEK_SET); fread(samples.data(), 1, len - sizeof(wav_header), fp); fclose(fp); return 0; } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [wavpath]\n", argv[0]); return -1; } const char* wavpath = argv[1]; std::vector samples; int ret = load_wav_samples(wavpath, samples); if 
(ret != 0)
    {
        fprintf(stderr, "load wav failed\n");
        return -1;
    }

    // 480000 samples is the size of the padded waveform built in Whisper::extract_fbank_feature
    if (samples.size() > 480000)
    {
        fprintf(stderr, "audio duration too long, truncate to 30s\n");
        samples.resize(480000);
    }

    // NOTE(review): the return values of load, detect_lang and transcribe are not checked here
    Whisper whisper;
    whisper.load();

    // detect language first
    std::string lang;
    whisper.detect_lang(samples, lang);

    fprintf(stderr, "lang = %s\n", lang.c_str());

    // transcribe audio to text
    std::string text;
    whisper.transcribe(samples, lang.c_str(), text);

    fprintf(stderr, "text = %s\n", text.c_str());

    return 0;
}

================================================
FILE: examples/yolact.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

// NOTE(review): the header names of the bare #include lines below are missing in this view of the file
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include
#include
#include
#endif
#include
#include

// one detected instance
struct Object
{
    // bounding box in pixels of the input image
    cv::Rect_ rect;
    // class index, 0 is skipped as background by detect_yolact
    int label;
    // score of the winning class
    float prob;
    // per-object mask coefficients, one per maskmaps channel
    std::vector maskdata;
    // full-image binary mask (255 inside the object), filled by detect_yolact
    cv::Mat mask;
};

// area of the overlap of the two boxes
static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_ inter = a.rect & b.rect;
    return inter.area();
}

// sort objects[left..right] (both inclusive) by prob from highest to lowest
// the two halves are handed to separate omp sections
static void qsort_descent_inplace(std::vector& objects, int left, int right)
{
    int i = left;
    int j = right;
    // pivot is the score of the middle element
    float p = objects[(left + right) / 2].prob;

    while (i <= j)
    {
        // skip elements that already sit on the correct side of the pivot
        while (objects[i].prob > p)
            i++;

        while (objects[j].prob < p)
            j--;

        if (i <= j)
        {
            // swap
            std::swap(objects[i], objects[j]);

            i++;
            j--;
        }
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < j) qsort_descent_inplace(objects, left, j);
        }
        #pragma omp section
        {
            if (i < right) qsort_descent_inplace(objects, i, right);
        }
    }
}

// sort the whole vector by prob from highest to lowest, an empty vector is left alone
static void qsort_descent_inplace(std::vector& objects)
{
    if (objects.empty())
        return;

    qsort_descent_inplace(objects, 0, objects.size() - 1);
}

// Non-maximum suppression.
//   faceobjects    candidate boxes, visited in the given order
//   picked         receives the indexes of the boxes that are kept
//   nms_threshold  a box is dropped when its IoU with an already kept box exceeds this value
//   agnostic       when false only boxes with the same label suppress each other
static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    // box areas are computed once up front
    std::vector areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const
Object& a = faceobjects[i];

        int keep = 1;
        // compare against every box that was kept so far
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

// Run yolact on a bgr image and fill objects with boxes, labels, scores and masks.
// The model files yolact.param / yolact.bin are loaded from the current directory,
// the process exits with -1 when one of them cannot be loaded.
// Always returns 0.
static int detect_yolact(const cv::Mat& bgr, std::vector& objects)
{
    ncnn::Net yolact;

    yolact.opt.use_vulkan_compute = true;

    // original model converted from https://github.com/dbolya/yolact
    // yolact_resnet50_54_800000.pth
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    if (yolact.load_param("yolact.param"))
        exit(-1);
    if (yolact.load_model("yolact.bin"))
        exit(-1);

    // the image is resized to target_size x target_size without keeping the aspect ratio
    const int target_size = 550;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, target_size, target_size);

    const float mean_vals[3] = {123.68f, 116.78f, 103.94f};
    const float norm_vals[3] = {1.0 / 58.40f, 1.0 / 57.12f, 1.0 / 57.38f};
    in.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = yolact.create_extractor();

    ex.input("input.1", in);

    ncnn::Mat maskmaps;
    ncnn::Mat location;
    ncnn::Mat mask;
    ncnn::Mat confidence;

    ex.extract("619", maskmaps); // 138x138 x 32

    ex.extract("816", location);   // 4 x 19248
    ex.extract("818", mask);       // maskdim 32 x 19248
    ex.extract("820", confidence); // 81 x 19248

    int num_class = confidence.w;
    int num_priors = confidence.h;

    // make priorbox
    // one (cx, cy, w, h) row per prior, values are relative to the 550 pixel input
    ncnn::Mat priorbox(4, num_priors);
    {
        // feature map sizes and anchor scales of the five prediction levels
        const int conv_ws[5] = {69, 35, 18, 9, 5};
        const int conv_hs[5] = {69, 35, 18, 9, 5};

        const float aspect_ratios[3] = {1.f, 0.5f, 2.f};
        const float scales[5] = {24.f, 48.f, 96.f, 192.f, 384.f};

        float* pb = priorbox;

        for (int p = 0; p < 5; p++)
        {
            int conv_w = conv_ws[p];
            int conv_h = conv_hs[p];

            float scale = scales[p];

            for (int i = 0; i < conv_h; i++)
            {
                for (int j = 0; j < conv_w; j++)
                {
                    // +0.5 because priors are in center-size notation
                    float cx = (j + 0.5f) / conv_w;
                    float cy = (i + 0.5f) / conv_h;

                    for (int k = 0; k < 3; k++)
                    {
                        float ar = aspect_ratios[k];

                        ar = sqrt(ar);

                        float w = scale * ar / 550;
                        float h = scale / ar / 550;

                        // This is for backward compatibility with a bug where I made everything square by accident
                        // cfg.backbone.use_square_anchors:
                        h = w;

                        pb[0] = cx;
                        pb[1] = cy;
                        pb[2] = w;
                        pb[3] = h;

                        pb += 4;
                    }
                }
            }
        }
    }

    const float confidence_thresh = 0.05f;
    const float nms_threshold = 0.5f;
    const int keep_top_k = 200;

    // candidates are collected per class so that nms runs class by class
    std::vector > class_candidates;
    class_candidates.resize(num_class);

    for (int i = 0; i < num_priors; i++)
    {
        const float* conf = confidence.row(i);
        const float* loc = location.row(i);
        const float* pb = priorbox.row(i);
        const float* maskdata = mask.row(i);

        // find class id with highest score
        // start from 1 to skip background
        int label = 0;
        float score = 0.f;
        for (int j = 1; j < num_class; j++)
        {
            float class_score = conf[j];
            if (class_score > score)
            {
                label = j;
                score = class_score;
            }
        }

        // ignore background or low score
        if (label == 0 || score <= confidence_thresh)
            continue;

        // CENTER_SIZE
        // decode the regression output against the prior: offsets scaled by var[0..1], sizes by exp(var[2..3] * loc)
        float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};

        float pb_cx = pb[0];
        float pb_cy = pb[1];
        float pb_w = pb[2];
        float pb_h = pb[3];

        float bbox_cx = var[0] * loc[0] * pb_w + pb_cx;
        float bbox_cy = var[1] * loc[1] * pb_h + pb_cy;
        float bbox_w = (float)(exp(var[2] * loc[2]) * pb_w);
        float bbox_h = (float)(exp(var[3] * loc[3]) * pb_h);

        float obj_x1 = bbox_cx - bbox_w * 0.5f;
        float obj_y1 = bbox_cy - bbox_h * 0.5f;
        float obj_x2 = bbox_cx + bbox_w * 0.5f;
        float obj_y2 = bbox_cy + bbox_h * 0.5f;

        // clip
        // relative coordinates are scaled to image pixels and limited to the image
        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f);

        // append object
        Object obj;
        obj.rect = cv::Rect_(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
        obj.label = label;
        obj.prob = score;
        obj.maskdata = std::vector(maskdata, maskdata + mask.w);

        class_candidates[label].push_back(obj);
    }

    // sort and suppress each class on its own, then merge the survivors
    objects.clear();
    for (int i = 0; i < (int)class_candidates.size(); i++)
    {
        std::vector& candidates = class_candidates[i];

        qsort_descent_inplace(candidates);

        std::vector picked;
        nms_sorted_bboxes(candidates, picked, nms_threshold);

        for (int j = 0; j < (int)picked.size(); j++)
        {
            int z = picked[j];
            objects.push_back(candidates[z]);
        }
    }

    qsort_descent_inplace(objects);

    // keep_top_k
    if (keep_top_k < (int)objects.size())
    {
        objects.resize(keep_top_k);
    }

    // generate mask
    for (int i = 0; i < (int)objects.size(); i++)
    {
        Object& obj = objects[i];

        // linear combination of the mask maps weighted by the object's coefficients
        cv::Mat mask(maskmaps.h, maskmaps.w, CV_32FC1);
        {
            mask = cv::Scalar(0.f);

            for (int p = 0; p < maskmaps.c; p++)
            {
                const float* maskmap = maskmaps.channel(p);
                float coeff = obj.maskdata[p];
                float* mp = (float*)mask.data;

                // mask += m * coeff
                for (int j = 0; j < maskmaps.w * maskmaps.h; j++)
                {
                    mp[j] += maskmap[j] * coeff;
                }
            }
        }

        // bring the mask to the size of the input image
        cv::Mat mask2;
        cv::resize(mask, mask2, cv::Size(img_w, img_h));

        // crop obj box and binarize
        obj.mask = cv::Mat(img_h, img_w, CV_8UC1);
        {
            obj.mask = cv::Scalar(0);

            for (int y = 0; y < img_h; y++)
            {
                // rows outside the box stay 0
                if (y < obj.rect.y || y > obj.rect.y + obj.rect.height)
                    continue;

                const float* mp2 = mask2.ptr(y);
                uchar* bmp = obj.mask.ptr(y);

                for (int x = 0; x < img_w; x++)
                {
                    // columns outside the box stay 0
                    if (x < obj.rect.x || x > obj.rect.x + obj.rect.width)
                        continue;

                    bmp[x] = mp2[x] > 0.5f ?
255 : 0;
                }
            }
        }
    }

    return 0;
}

// Draw boxes, labels and blended masks of objects on a copy of bgr,
// print every drawn object to stderr, write result.png and show the image.
// Objects with prob below 0.15 are skipped.
static void draw_objects(const cv::Mat& bgr, const std::vector& objects)
{
    // indexed by Object::label, index 0 is the background class
    static const char* class_names[] = {"background",
                                        "person", "bicycle", "car", "motorcycle", "airplane", "bus",
                                        "train", "truck", "boat", "traffic light", "fire hydrant",
                                        "stop sign", "parking meter", "bench", "bird", "cat", "dog",
                                        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
                                        "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
                                        "skis", "snowboard", "sports ball", "kite", "baseball bat",
                                        "baseball glove", "skateboard", "surfboard", "tennis racket",
                                        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
                                        "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
                                        "hot dog", "pizza", "donut", "cake", "chair", "couch",
                                        "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
                                        "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
                                        "toaster", "sink", "refrigerator", "book", "clock", "vase",
                                        "scissors", "teddy bear", "hair drier", "toothbrush"
                                       };

    // palette, picked by drawing order (color_index) and not by class
    static const unsigned char colors[81][3] = {
        {56, 0, 255},
        {226, 255, 0},
        {0, 94, 255},
        {0, 37, 255},
        {0, 255, 94},
        {255, 226, 0},
        {0, 18, 255},
        {255, 151, 0},
        {170, 0, 255},
        {0, 255, 56},
        {255, 0, 75},
        {0, 75, 255},
        {0, 255, 169},
        {255, 0, 207},
        {75, 255, 0},
        {207, 0, 255},
        {37, 0, 255},
        {0, 207, 255},
        {94, 0, 255},
        {0, 255, 113},
        {255, 18, 0},
        {255, 0, 56},
        {18, 0, 255},
        {0, 255, 226},
        {170, 255, 0},
        {255, 0, 245},
        {151, 255, 0},
        {132, 255, 0},
        {75, 0, 255},
        {151, 0, 255},
        {0, 151, 255},
        {132, 0, 255},
        {0, 255, 245},
        {255, 132, 0},
        {226, 0, 255},
        {255, 37, 0},
        {207, 255, 0},
        {0, 255, 207},
        {94, 255, 0},
        {0, 226, 255},
        {56, 255, 0},
        {255, 94, 0},
        {255, 113, 0},
        {0, 132, 255},
        {255, 0, 132},
        {255, 170, 0},
        {255, 0, 188},
        {113, 255, 0},
        {245, 0, 255},
        {113, 0, 255},
        {255, 188, 0},
        {0, 113, 255},
        {255, 0, 0},
        {0, 56, 255},
        {255, 0, 113},
        {0, 255, 188},
        {255, 0, 94},
        {255, 0, 18},
        {18, 255, 0},
        {0, 255, 132},
        {0, 188, 255},
        {0, 245, 255},
        {0, 169, 255},
        {37, 255, 0},
        {255, 0, 151},
        {188, 0, 255},
        {0, 255, 37},
        {0, 255, 0},
        {255, 0, 170},
        {255, 0, 37},
        {255, 75, 0},
        {0, 0, 255},
        {255, 207, 0},
        {255, 0, 226},
        {255, 245, 0},
        {188, 255, 0},
        {0, 255, 18},
        {0, 255, 75},
        {0, 255, 151},
        {255, 56, 0},
        {245, 255, 0}
    };

    cv::Mat image = bgr.clone();

    int color_index = 0;

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        if (obj.prob < 0.15)
            continue;

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        // the palette wraps around after 81 drawn objects
        const unsigned char* color = colors[color_index % 81];
        color_index++;

        cv::rectangle(image, obj.rect, cv::Scalar(color[0], color[1], color[2]));

        char text[256];
        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the label above the box, pulled back inside the image at the top and right edge
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // white background for the label text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));

        // draw mask
        // every pixel set to 255 in the object mask is blended half and half with the object color
        for (int y = 0; y < image.rows; y++)
        {
            const uchar* mp = obj.mask.ptr(y);
            uchar* p = image.ptr(y);
            for (int x = 0; x < image.cols; x++)
            {
                if (mp[x] == 255)
                {
                    p[0] = cv::saturate_cast(p[0] * 0.5 + color[0] * 0.5);
                    p[1] = cv::saturate_cast(p[1] * 0.5 + color[1] * 0.5);
                    p[2] = cv::saturate_cast(p[2] * 0.5 + color[2] * 0.5);
                }
                // advance by one 3-channel pixel
                p += 3;
            }
        }
    }

    cv::imwrite("result.png", image);
    cv::imshow("image", image);
    cv::waitKey(0);
}

// usage: program [imagepath]
// returns -1 on a wrong argument count or when the image cannot be read, 0 otherwise
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector objects;
    detect_yolact(m, objects);

    draw_objects(m, objects);

    return 0;
}

================================================
FILE: examples/yolo11.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolo11 torchscript
//      yolo export model=yolo11n.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolo11n.torchscript
// 4. modify yolo11n_pnnx.py for dynamic shape inference
//      A. modify reshape to support dynamic image sizes
//      B. permute tensor before concat and adjust concat axis
//      C. drop post-process part
//      before:
//          v_235 = v_204.view(1, 144, 6400)
//          v_236 = v_219.view(1, 144, 1600)
//          v_237 = v_234.view(1, 144, 400)
//          v_238 = torch.cat((v_235, v_236, v_237), dim=2)
//          ...
//      after:
//          v_235 = v_204.view(1, 144, -1).transpose(1, 2)
//          v_236 = v_219.view(1, 144, -1).transpose(1, 2)
//          v_237 = v_234.view(1, 144, -1).transpose(1, 2)
//          v_238 = torch.cat((v_235, v_236, v_237), dim=1)
//          return v_238
//      D.
modify area attention for dynamic shape inference // before: // v_95 = self.model_10_m_0_attn_qkv_conv(v_94) // v_96 = v_95.view(1, 2, 128, 400) // v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64)) // v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1) // v_101 = torch.matmul(input=v_100, other=v_98) // v_102 = (v_101 * 0.176777) // v_103 = F.softmax(input=v_102, dim=-1) // v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1) // v_105 = torch.matmul(input=v_99, other=v_104) // v_106 = v_105.view(1, 128, 20, 20) // v_107 = v_99.reshape(1, 128, 20, 20) // v_108 = self.model_10_m_0_attn_pe_conv(v_107) // v_109 = (v_106 + v_108) // v_110 = self.model_10_m_0_attn_proj_conv(v_109) // after: // v_95 = self.model_10_m_0_attn_qkv_conv(v_94) // v_96 = v_95.view(1, 2, 128, -1) // v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64)) // v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1) // v_101 = torch.matmul(input=v_100, other=v_98) // v_102 = (v_101 * 0.176777) // v_103 = F.softmax(input=v_102, dim=-1) // v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1) // v_105 = torch.matmul(input=v_99, other=v_104) // v_106 = v_105.view(1, 128, v_95.size(2), v_95.size(3)) // v_107 = v_99.reshape(1, 128, v_95.size(2), v_95.size(3)) // v_108 = self.model_10_m_0_attn_pe_conv(v_107) // v_109 = (v_106 + v_108) // v_110 = self.model_10_m_0_attn_proj_conv(v_109) // 5. re-export yolo11 torchscript // python3 -c 'import yolo11n_pnnx; yolo11n_pnnx.export_torchscript()' // 6. convert new torchscript with dynamic shape // pnnx yolo11n_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320] // 7. 
now you get ncnn model files
//      mv yolo11n_pnnx.py.ncnn.param yolo11n.ncnn.param
//      mv yolo11n_pnnx.py.ncnn.bin yolo11n.ncnn.bin

// the out blob would be a 2-dim tensor with w=144 h=8400
//
//        | bbox-reg 16 x 4       | per-class scores(80) |
//        +-----+-----+-----+-----+----------------------+
//        | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......|
//   all /|     |     |     |     |           .          |
//  boxes |  .. |  .. |  .. |  .. |0.0 0.9 0.0 0.0 ......|
//  (8400)|     |     |     |     |           .          |
//       \|     |     |     |     |           .          |
//        +-----+-----+-----+-----+----------------------+
//

#include "layer.h"
#include "net.h"

// NOTE(review): the header names of the bare #include lines below are missing in this view of the file
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include
#include
#include
#endif
#include
#include
#include

// one detected box
struct Object
{
    // bounding box, in padded-input pixels until detect_yolo11 maps it back to the image
    cv::Rect_ rect;
    // index of the best-scoring class
    int label;
    // sigmoid of the best class score
    float prob;
};

// area of the overlap of the two boxes
static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_ inter = a.rect & b.rect;
    return inter.area();
}

// sort objects[left..right] (both inclusive) by prob from highest to lowest
static void qsort_descent_inplace(std::vector& objects, int left, int right)
{
    int i = left;
    int j = right;
    // pivot is the score of the middle element
    float p = objects[(left + right) / 2].prob;

    while (i <= j)
    {
        // skip elements that already sit on the correct side of the pivot
        while (objects[i].prob > p)
            i++;

        while (objects[j].prob < p)
            j--;

        if (i <= j)
        {
            // swap
            std::swap(objects[i], objects[j]);

            i++;
            j--;
        }
    }

    // the omp directives are disabled here, both halves are sorted one after the other
    // #pragma omp parallel sections
    {
        // #pragma omp section
        {
            if (left < j) qsort_descent_inplace(objects, left, j);
        }
        // #pragma omp section
        {
            if (i < right) qsort_descent_inplace(objects, i, right);
        }
    }
}

// sort the whole vector by prob from highest to lowest, an empty vector is left alone
static void qsort_descent_inplace(std::vector& objects)
{
    if (objects.empty())
        return;

    qsort_descent_inplace(objects, 0, objects.size() - 1);
}

// Non-maximum suppression.
//   objects        candidate boxes, visited in the given order
//   picked         receives the indexes of the boxes that are kept
//   nms_threshold  a box is dropped when its IoU with an already kept box exceeds this value
//   agnostic       when false only boxes with the same label suppress each other
static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    // box areas are computed once up front
    std::vector areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        int keep = 1;
        // compare against every box that was kept so far
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = objects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection
over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int reg_max_1 = 16; const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO for (int y = 0; y < num_grid_y; y++) { for (int x = 0; x < num_grid_x; x++) { const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); // find label with max score int label = -1; float score = -FLT_MAX; { const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class); for (int k = 0; k < num_class; k++) { float s = pred_score[k]; if (s > score) { label = k; score = s; } } score = sigmoid(score); } if (score >= prob_threshold) { ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4); { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; pd.set(0, 1); // axis pd.set(1, 1); softmax->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; softmax->create_pipeline(opt); softmax->forward_inplace(pred_bbox, opt); softmax->destroy_pipeline(opt); delete softmax; } float pred_ltrb[4]; for (int k = 0; k < 4; k++) { float dis = 0.f; const float* dis_after_sm = pred_bbox.row(k); for (int l = 0; l < reg_max_1; l++) { dis += l * dis_after_sm[l]; } pred_ltrb[k] = dis * stride; } float pb_cx = (x + 0.5f) * stride; float pb_cy = (y + 0.5f) * stride; float x0 = pb_cx - pred_ltrb[0]; float y0 = pb_cy - pred_ltrb[1]; float x1 = pb_cx + pred_ltrb[2]; float y1 = pb_cy + pred_ltrb[3]; Object obj; obj.rect.x = x0; obj.rect.y = 
y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = label; obj.prob = score; objects.push_back(obj); } } } } static void generate_proposals(const ncnn::Mat& pred, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; int pred_row_offset = 0; for (size_t i = 0; i < strides.size(); i++) { const int stride = strides[i]; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int num_grid = num_grid_x * num_grid_y; generate_proposals(pred.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects); pred_row_offset += num_grid; } } static int detect_yolo11(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolo11; yolo11.opt.use_vulkan_compute = true; // yolo11.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets yolo11.load_param("yolo11n.ncnn.param"); yolo11.load_model("yolo11n.ncnn.bin"); // yolo11.load_param("yolo11s.ncnn.param"); // yolo11.load_model("yolo11s.ncnn.bin"); // yolo11.load_param("yolo11m.ncnn.param"); // yolo11.load_model("yolo11m.ncnn.bin"); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; int img_w = bgr.cols; int img_h = bgr.rows; // ultralytics/cfg/models/v8/yolo11.yaml std::vector strides(3); strides[0] = 8; strides[1] = 16; strides[2] = 32; const int max_stride = 32; // letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = (w + max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; 
ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolo11.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); std::vector proposals; generate_proposals(out, strides, in_pad, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", 
"bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; static cv::Scalar colors[] = { cv::Scalar(244, 67, 54), cv::Scalar(233, 30, 99), cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 19]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, color); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* 
imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolo11(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolo11_cls.cpp ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. export yolo11-cls torchscript // yolo export model=yolo11n-cls.pt format=torchscript // 3. convert torchscript with static shape // pnnx yolo11n-cls.torchscript // 4. now you get ncnn model files // yolo11n_cls.ncnn.param // yolo11n_cls.ncnn.bin #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct Object { int label; float prob; }; static void get_topk(const ncnn::Mat& cls_scores, int topk, std::vector& objects) { // partial sort topk with index int size = cls_scores.w; std::vector > vec; vec.resize(size); for (int i = 0; i < size; i++) { vec[i] = std::make_pair(cls_scores[i], i); } std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater >()); objects.resize(topk); for (int i = 0; i < topk; i++) { objects[i].label = vec[i].second; objects[i].prob = vec[i].first; } } static int detect_yolo11_cls(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolo11; yolo11.opt.use_vulkan_compute = true; // yolo11.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets yolo11.load_param("yolo11n_cls.ncnn.param"); yolo11.load_model("yolo11n_cls.ncnn.bin"); // yolo11.load_param("yolo11s_cls.ncnn.param"); // yolo11.load_model("yolo11s_cls.ncnn.bin"); // yolo11.load_param("yolo11m_cls.ncnn.param"); // yolo11.load_model("yolo11m_cls.ncnn.bin"); const int target_size = 224; const int topk = 5; int img_w = bgr.cols; int img_h = bgr.rows; // 
letterbox pad int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = target_size - w; int hpad = target_size - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolo11.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); // return top-5 get_topk(out, topk, objects); return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "tench", "goldfish", "great white shark", "tiger shark", "hammerhead", "electric ray", "stingray", "cock", "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", "indigo bunting", "robin", "bulbul", "jay", "magpie", "chickadee", "water ouzel", "kite", "bald eagle", "vulture", "great grey owl", "European fire salamander", "common newt", "eft", "spotted salamander", "axolotl", "bullfrog", "tree frog", "tailed frog", "loggerhead", "leatherback turtle", "mud turtle", "terrapin", "box turtle", "banded gecko", "common iguana", "American chameleon", "whiptail", "agama", "frilled lizard", "alligator lizard", "Gila monster", "green lizard", "African chameleon", "Komodo dragon", "African crocodile", "American alligator", "triceratops", "thunder snake", "ringneck snake", "hognose snake", "green snake", "king snake", "garter snake", "water snake", "vine snake", "night snake", "boa constrictor", "rock python", "Indian cobra", "green mamba", "sea snake", "horned viper", "diamondback", "sidewinder", "trilobite", "harvestman", "scorpion", 
"black and gold garden spider", "barn spider", "garden spider", "black widow", "tarantula", "wolf spider", "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse", "prairie chicken", "peacock", "quail", "partridge", "African grey", "macaw", "sulphur-crested cockatoo", "lorikeet", "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "drake", "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", "koala", "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", "snail", "slug", "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", "fiddler crab", "king crab", "American lobster", "spiny lobster", "crayfish", "hermit crab", "isopod", "white stork", "black stork", "spoonbill", "flamingo", "little blue heron", "American egret", "bittern", "crane (bird)", "limpkin", "European gallinule", "American coot", "bustard", "ruddy turnstone", "red-backed sandpiper", "redshank", "dowitcher", "oystercatcher", "pelican", "king penguin", "albatross", "grey whale", "killer whale", "dugong", "sea lion", "Chihuahua", "Japanese spaniel", "Maltese dog", "Pekinese", "Shih-Tzu", "Blenheim spaniel", "papillon", "toy terrier", "Rhodesian ridgeback", "Afghan hound", "basset", "beagle", "bloodhound", "bluetick", "black-and-tan coonhound", "Walker hound", "English foxhound", "redbone", "borzoi", "Irish wolfhound", "Italian greyhound", "whippet", "Ibizan hound", "Norwegian elkhound", "otterhound", "Saluki", "Scottish deerhound", "Weimaraner", "Staffordshire bullterrier", "American Staffordshire terrier", "Bedlington terrier", "Border terrier", "Kerry blue terrier", "Irish terrier", "Norfolk terrier", "Norwich terrier", "Yorkshire terrier", "wire-haired fox terrier", "Lakeland terrier", "Sealyham terrier", "Airedale", "cairn", "Australian terrier", "Dandie Dinmont", "Boston bull", "miniature schnauzer", "giant schnauzer", "standard schnauzer", "Scotch terrier", "Tibetan terrier", 
"silky terrier", "soft-coated wheaten terrier", "West Highland white terrier", "Lhasa", "flat-coated retriever", "curly-coated retriever", "golden retriever", "Labrador retriever", "Chesapeake Bay retriever", "German short-haired pointer", "vizsla", "English setter", "Irish setter", "Gordon setter", "Brittany spaniel", "clumber", "English springer", "Welsh springer spaniel", "cocker spaniel", "Sussex spaniel", "Irish water spaniel", "kuvasz", "schipperke", "groenendael", "malinois", "briard", "kelpie", "komondor", "Old English sheepdog", "Shetland sheepdog", "collie", "Border collie", "Bouvier des Flandres", "Rottweiler", "German shepherd", "Doberman", "miniature pinscher", "Greater Swiss Mountain dog", "Bernese mountain dog", "Appenzeller", "EntleBucher", "boxer", "bull mastiff", "Tibetan mastiff", "French bulldog", "Great Dane", "Saint Bernard", "Eskimo dog", "malamute", "Siberian husky", "dalmatian", "affenpinscher", "basenji", "pug", "Leonberg", "Newfoundland", "Great Pyrenees", "Samoyed", "Pomeranian", "chow", "keeshond", "Brabancon griffon", "Pembroke", "Cardigan", "toy poodle", "miniature poodle", "standard poodle", "Mexican hairless", "timber wolf", "white wolf", "red wolf", "coyote", "dingo", "dhole", "African hunting dog", "hyena", "red fox", "kit fox", "Arctic fox", "grey fox", "tabby", "tiger cat", "Persian cat", "Siamese cat", "Egyptian cat", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", "cheetah", "brown bear", "American black bear", "ice bear", "sloth bear", "mongoose", "meerkat", "tiger beetle", "ladybug", "ground beetle", "long-horned beetle", "leaf beetle", "dung beetle", "rhinoceros beetle", "weevil", "fly", "bee", "ant", "grasshopper", "cricket", "walking stick", "cockroach", "mantis", "cicada", "leafhopper", "lacewing", "dragonfly", "damselfly", "admiral", "ringlet", "monarch", "cabbage butterfly", "sulphur butterfly", "lycaenid", "starfish", "sea urchin", "sea cucumber", "wood rabbit", "hare", "Angora", "hamster", 
"porcupine", "fox squirrel", "marmot", "beaver", "guinea pig", "sorrel", "zebra", "hog", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo", "bison", "ram", "bighorn", "ibex", "hartebeest", "impala", "gazelle", "Arabian camel", "llama", "weasel", "mink", "polecat", "black-footed ferret", "otter", "skunk", "badger", "armadillo", "three-toed sloth", "orangutan", "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas", "baboon", "macaque", "langur", "colobus", "proboscis monkey", "marmoset", "capuchin", "howler monkey", "titi", "spider monkey", "squirrel monkey", "Madagascar cat", "indri", "Indian elephant", "African elephant", "lesser panda", "giant panda", "barracouta", "eel", "coho", "rock beauty", "anemone fish", "sturgeon", "gar", "lionfish", "puffer", "abacus", "abaya", "academic gown", "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance", "amphibian", "analog clock", "apiary", "apron", "ashcan", "assault rifle", "backpack", "bakery", "balance beam", "balloon", "ballpoint", "Band Aid", "banjo", "bannister", "barbell", "barber chair", "barbershop", "barn", "barometer", "barrel", "barrow", "baseball", "basketball", "bassinet", "bassoon", "bathing cap", "bath towel", "bathtub", "beach wagon", "beacon", "beaker", "bearskin", "beer bottle", "beer glass", "bell cote", "bib", "bicycle-built-for-two", "bikini", "binder", "binoculars", "birdhouse", "boathouse", "bobsled", "bolo tie", "bonnet", "bookcase", "bookshop", "bottlecap", "bow", "bow tie", "brass", "brassiere", "breakwater", "breastplate", "broom", "bucket", "buckle", "bulletproof vest", "bullet train", "butcher shop", "cab", "caldron", "candle", "cannon", "canoe", "can opener", "cardigan", "car mirror", "carousel", "carpenter's kit", "carton", "car wheel", "cash machine", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello", "cellular telephone", "chain", "chainlink fence", "chain mail", "chain saw", "chest", "chiffonier", 
"chime", "china cabinet", "Christmas stocking", "church", "cinema", "cleaver", "cliff dwelling", "cloak", "clog", "cocktail shaker", "coffee mug", "coffeepot", "coil", "combination lock", "computer keyboard", "confectionery", "container ship", "convertible", "corkscrew", "cornet", "cowboy boot", "cowboy hat", "cradle", "crane (machine)", "crash helmet", "crate", "crib", "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "dial telephone", "diaper", "digital clock", "digital watch", "dining table", "dishrag", "dishwasher", "disk brake", "dock", "dogsled", "dome", "doormat", "drilling platform", "drum", "drumstick", "dumbbell", "Dutch oven", "electric fan", "electric guitar", "electric locomotive", "entertainment center", "envelope", "espresso maker", "face powder", "feather boa", "file", "fireboat", "fire engine", "fire screen", "flagpole", "flute", "folding chair", "football helmet", "forklift", "fountain", "fountain pen", "four-poster", "freight car", "French horn", "frying pan", "fur coat", "garbage truck", "gasmask", "gas pump", "goblet", "go-kart", "golf ball", "golfcart", "gondola", "gong", "gown", "grand piano", "greenhouse", "grille", "grocery store", "guillotine", "hair slide", "hair spray", "half track", "hammer", "hamper", "hand blower", "hand-held computer", "handkerchief", "hard disc", "harmonica", "harp", "harvester", "hatchet", "holster", "home theater", "honeycomb", "hook", "hoopskirt", "horizontal bar", "horse cart", "hourglass", "iPod", "iron", "jack-o'-lantern", "jean", "jeep", "jersey", "jigsaw puzzle", "jinrikisha", "joystick", "kimono", "knee pad", "knot", "lab coat", "ladle", "lampshade", "laptop", "lawn mower", "lens cap", "letter opener", "library", "lifeboat", "lighter", "limousine", "liner", "lipstick", "Loafer", "lotion", "loudspeaker", "loupe", "lumbermill", "magnetic compass", "mailbag", "mailbox", "maillot (tights)", "maillot (tank suit)", "manhole cover", "maraca", "marimba", "mask", "matchstick", 
"maypole", "maze", "measuring cup", "medicine chest", "megalith", "microphone", "microwave", "military uniform", "milk can", "minibus", "miniskirt", "minivan", "missile", "mitten", "mixing bowl", "mobile home", "Model T", "modem", "monastery", "monitor", "moped", "mortar", "mortarboard", "mosque", "mosquito net", "motor scooter", "mountain bike", "mountain tent", "mouse", "mousetrap", "moving van", "muzzle", "nail", "neck brace", "necklace", "nipple", "notebook", "obelisk", "oboe", "ocarina", "odometer", "oil filter", "organ", "oscilloscope", "overskirt", "oxcart", "oxygen mask", "packet", "paddle", "paddlewheel", "padlock", "paintbrush", "pajama", "palace", "panpipe", "paper towel", "parachute", "parallel bars", "park bench", "parking meter", "passenger car", "patio", "pay-phone", "pedestal", "pencil box", "pencil sharpener", "perfume", "Petri dish", "photocopier", "pick", "pickelhaube", "picket fence", "pickup", "pier", "piggy bank", "pill bottle", "pillow", "ping-pong ball", "pinwheel", "pirate", "pitcher", "plane", "planetarium", "plastic bag", "plate rack", "plow", "plunger", "Polaroid camera", "pole", "police van", "poncho", "pool table", "pop bottle", "pot", "potter's wheel", "power drill", "prayer rug", "printer", "prison", "projectile", "projector", "puck", "punching bag", "purse", "quill", "quilt", "racer", "racket", "radiator", "radio", "radio telescope", "rain barrel", "recreational vehicle", "reel", "reflex camera", "refrigerator", "remote control", "restaurant", "revolver", "rifle", "rocking chair", "rotisserie", "rubber eraser", "rugby ball", "rule", "running shoe", "safe", "safety pin", "saltshaker", "sandal", "sarong", "sax", "scabbard", "scale", "school bus", "schooner", "scoreboard", "screen", "screw", "screwdriver", "seat belt", "sewing machine", "shield", "shoe shop", "shoji", "shopping basket", "shopping cart", "shovel", "shower cap", "shower curtain", "ski", "ski mask", "sleeping bag", "slide rule", "sliding door", "slot", "snorkel", 
"snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", "solar dish", "sombrero", "soup bowl", "space bar", "space heater", "space shuttle", "spatula", "speedboat", "spider web", "spindle", "sports car", "spotlight", "stage", "steam locomotive", "steel arch bridge", "steel drum", "stethoscope", "stole", "stone wall", "stopwatch", "stove", "strainer", "streetcar", "stretcher", "studio couch", "stupa", "submarine", "suit", "sundial", "sunglass", "sunglasses", "sunscreen", "suspension bridge", "swab", "sweatshirt", "swimming trunks", "swing", "switch", "syringe", "table lamp", "tank", "tape player", "teapot", "teddy", "television", "tennis ball", "thatch", "theater curtain", "thimble", "thresher", "throne", "tile roof", "toaster", "tobacco shop", "toilet seat", "torch", "totem pole", "tow truck", "toyshop", "tractor", "trailer truck", "tray", "trench coat", "tricycle", "trimaran", "tripod", "triumphal arch", "trolleybus", "trombone", "tub", "turnstile", "typewriter keyboard", "umbrella", "unicycle", "upright", "vacuum", "vase", "vault", "velvet", "vending machine", "vestment", "viaduct", "violin", "volleyball", "waffle iron", "wall clock", "wallet", "wardrobe", "warplane", "washbasin", "washer", "water bottle", "water jug", "water tower", "whiskey jug", "whistle", "wig", "window screen", "window shade", "Windsor tie", "wine bottle", "wing", "wok", "wooden spoon", "wool", "worm fence", "wreck", "yawl", "yurt", "web site", "comic book", "crossword puzzle", "street sign", "traffic light", "book jacket", "menu", "plate", "guacamole", "consomme", "hot pot", "trifle", "ice cream", "ice lolly", "French loaf", "bagel", "pretzel", "cheeseburger", "hotdog", "mashed potato", "head cabbage", "broccoli", "cauliflower", "zucchini", "spaghetti squash", "acorn squash", "butternut squash", "cucumber", "artichoke", "bell pepper", "cardoon", "mushroom", "Granny Smith", "strawberry", "orange", "lemon", "fig", "pineapple", "banana", "jackfruit", "custard apple", "pomegranate", 
"hay", "carbonara", "chocolate sauce", "dough", "meat loaf", "pizza", "potpie", "burrito", "red wine", "espresso", "cup", "eggnog", "alp", "bubble", "cliff", "coral reef", "geyser", "lakeside", "promontory", "sandbar", "seashore", "valley", "volcano", "ballplayer", "groom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn", "hip", "buckeye", "coral fungus", "agaric", "gyromitra", "stinkhorn", "earthstar", "hen-of-the-woods", "bolete", "ear", "toilet tissue" }; cv::Mat image = bgr.clone(); int y_offset = 0; for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f\n", obj.label, obj.prob); char text[256]; sprintf(text, "%4.1f%% %s", obj.prob * 100, class_names[obj.label]); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = 0; int y = y_offset; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); y_offset += label_size.height; } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolo11_cls(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolo11_obb.cpp ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. export yolo11-obb torchscript // yolo export model=yolo11n-obb.pt format=torchscript // 3. 
convert torchscript with static shape // pnnx yolo11n-obb.torchscript // 4. modify yolo11n_obb_pnnx.py for dynamic shape inference // A. modify reshape to support dynamic image sizes // B. permute tensor before concat and adjust concat axis // C. drop post-process part // before: // v_195 = v_194.view(1, 1, 16384) // v_201 = v_200.view(1, 1, 4096) // v_207 = v_206.view(1, 1, 1024) // v_208 = torch.cat((v_195, v_201, v_207), dim=2) // ... // v_256 = v_225.view(1, 79, 16384) // v_257 = v_240.view(1, 79, 4096) // v_258 = v_255.view(1, 79, 1024) // v_259 = torch.cat((v_256, v_257, v_258), dim=2) // ... // after: // v_195 = v_194.view(1, 1, -1).transpose(1, 2) // v_201 = v_200.view(1, 1, -1).transpose(1, 2) // v_207 = v_206.view(1, 1, -1).transpose(1, 2) // v_208 = torch.cat((v_195, v_201, v_207), dim=1) // ... // v_256 = v_225.view(1, 79, -1).transpose(1, 2) // v_257 = v_240.view(1, 79, -1).transpose(1, 2) // v_258 = v_255.view(1, 79, -1).transpose(1, 2) // v_259 = torch.cat((v_256, v_257, v_258), dim=1) // return v_259, v_208 // D. 
modify area attention for dynamic shape inference // before: // v_95 = self.model_10_m_0_attn_qkv_conv(v_94) // v_96 = v_95.view(1, 2, 128, 1024) // v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64)) // v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1) // v_101 = torch.matmul(input=v_100, other=v_98) // v_102 = (v_101 * 0.176777) // v_103 = F.softmax(input=v_102, dim=-1) // v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1) // v_105 = torch.matmul(input=v_99, other=v_104) // v_106 = v_105.view(1, 128, 32, 32) // v_107 = v_99.reshape(1, 128, 32, 32) // v_108 = self.model_10_m_0_attn_pe_conv(v_107) // v_109 = (v_106 + v_108) // v_110 = self.model_10_m_0_attn_proj_conv(v_109) // after: // v_95 = self.model_10_m_0_attn_qkv_conv(v_94) // v_96 = v_95.view(1, 2, 128, -1) // v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64)) // v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1) // v_101 = torch.matmul(input=v_100, other=v_98) // v_102 = (v_101 * 0.176777) // v_103 = F.softmax(input=v_102, dim=-1) // v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1) // v_105 = torch.matmul(input=v_99, other=v_104) // v_106 = v_105.view(1, 128, v_95.size(2), v_95.size(3)) // v_107 = v_99.reshape(1, 128, v_95.size(2), v_95.size(3)) // v_108 = self.model_10_m_0_attn_pe_conv(v_107) // v_109 = (v_106 + v_108) // v_110 = self.model_10_m_0_attn_proj_conv(v_109) // 5. re-export yolo11-obb torchscript // python3 -c 'import yolo11n_obb_pnnx; yolo11n_obb_pnnx.export_torchscript()' // 6. convert new torchscript with dynamic shape // pnnx yolo11n_obb_pnnx.py.pt inputshape=[1,3,1024,1024] inputshape2=[1,3,512,512] // 7. 
now you get ncnn model files // mv yolo11n_obb_pnnx.py.ncnn.param yolo11n_obb.ncnn.param // mv yolo11n_obb_pnnx.py.ncnn.bin yolo11n_obb.ncnn.bin // the out blob would be a 2-dim tensor with w=79 h=21504 // // | bbox-reg 16 x 4 |score(15)| // +-----+-----+-----+-----+---------+ // | dx0 | dy0 | dx1 | dy1 | 0.1 ... | // all /| | | | | ... | // boxes | .. | .. | .. | .. | 0.0 ... | // (21504)| | | | | . ... | // \| | | | | . ... | // +-----+-----+-----+-----+---------+ // // the out blob would be a 2-dim tensor with w=1 h=21504 // // | degree(1)| // +----------+ // | 0.1 | // all /| | // boxes | 0.0 | // (21504)| . | // \| . | // +----------+ // #include "layer.h" #include "net.h" #include #include #include #include #include #include #include struct Object { cv::RotatedRect rrect; int label; float prob; }; static inline float intersection_area(const Object& a, const Object& b) { std::vector intersection; cv::rotatedRectangleIntersection(a.rrect, b.rrect, intersection); if (intersection.empty()) return 0.f; return cv::contourArea(intersection); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } // #pragma omp parallel sections { // #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } // #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = objects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = objects[i].rrect.size.area(); } for (int i = 0; i < 
// Logistic function, maps any real x into [0, 1].
static inline float sigmoid(float x)
{
    const float decay = expf(-x);
    const float denom = 1.0f + decay;
    return 1.0f / denom;
}
float pb_cx = (x + 0.5f) * stride; float pb_cy = (y + 0.5f) * stride; const float angle = sigmoid(pred_angle.row(y * num_grid_x + x)[0]) - 0.25f; const float angle_rad = angle * 3.14159265358979323846f; const float angle_degree = angle * 180.f; float cos = cosf(angle_rad); float sin = sinf(angle_rad); float xx = (pred_ltrb[2] - pred_ltrb[0]) * 0.5f; float yy = (pred_ltrb[3] - pred_ltrb[1]) * 0.5f; float xr = xx * cos - yy * sin; float yr = xx * sin + yy * cos; const float cx = pb_cx + xr; const float cy = pb_cy + yr; const float ww = pred_ltrb[2] + pred_ltrb[0]; const float hh = pred_ltrb[3] + pred_ltrb[1]; Object obj; obj.rrect = cv::RotatedRect(cv::Point2f(cx, cy), cv::Size_(ww, hh), angle_degree); obj.label = label; obj.prob = score; objects.push_back(obj); } } } } static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; int pred_row_offset = 0; for (size_t i = 0; i < strides.size(); i++) { const int stride = strides[i]; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int num_grid = num_grid_x * num_grid_y; generate_proposals(pred.row_range(pred_row_offset, num_grid), pred_angle.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects); pred_row_offset += num_grid; } } static int detect_yolo11_obb(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolo11; yolo11.opt.use_vulkan_compute = true; // yolo11.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets yolo11.load_param("yolo11n_obb.ncnn.param"); yolo11.load_model("yolo11n_obb.ncnn.bin"); // yolo11.load_param("yolo11s_obb.ncnn.param"); // yolo11.load_model("yolo11s_obb.ncnn.bin"); // yolo11.load_param("yolo11m_obb.ncnn.param"); // yolo11.load_model("yolo11m_obb.ncnn.bin"); const int target_size = 1024; const float prob_threshold = 
0.25f; const float nms_threshold = 0.45f; int img_w = bgr.cols; int img_h = bgr.rows; // ultralytics/cfg/models/v8/yolo11.yaml std::vector strides(3); strides[0] = 8; strides[1] = 16; strides[2] = 32; const int max_stride = 32; // letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = (w + max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolo11.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); ncnn::Mat out_angle; ex.extract("out1", out_angle); std::vector proposals; generate_proposals(out, out_angle, strides, in_pad, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); if (count == 0) return 0; objects.resize(count); for (int i = 0; i < count; i++) { Object obj = proposals[picked[i]]; // adjust offset to original unpadded obj.rrect.center.x = (obj.rrect.center.x - (wpad / 2)) / scale; obj.rrect.center.y = (obj.rrect.center.y - (hpad / 2)) / scale; obj.rrect.size.width = (obj.rrect.size.width) / scale; obj.rrect.size.height = (obj.rrect.size.height) / scale; objects[i] = obj; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* 
class_names[] = { "plane", "ship", "storage tank", "baseball diamond", "tennis court", "basketball court", "ground track field", "harbor", "bridge", "large vehicle", "small vehicle", "helicopter", "roundabout", "soccer ball field", "swimming pool" }; static const cv::Scalar colors[] = { cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[obj.label]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f @ %.2f\n", obj.label, obj.prob, obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle); cv::Point2f corners[4]; obj.rrect.points(corners); cv::line(image, corners[0], corners[1], color); cv::line(image, corners[1], corners[2], color); cv::line(image, corners[2], corners[3], color); cv::line(image, corners[3], corners[0], color); } for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[obj.label]; char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rrect.center.x - label_size.width / 2; int y = obj.rrect.center.y - label_size.height / 2 - baseLine; if (y < 0) y = 0; if (y + label_size.height > image.rows) y = image.rows - label_size.height; if (x < 0) x = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, 
label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolo11_obb(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolo11_pose.cpp ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. export yolo11-pose torchscript // yolo export model=yolo11n-pose.pt format=torchscript // 3. convert torchscript with static shape // pnnx yolo11n-pose.torchscript // 4. modify yolo11n_pose_pnnx.py for dynamic shape inference // A. modify reshape to support dynamic image sizes // B. permute tensor before concat and adjust concat axis // C. drop post-process part // before: // v_195 = v_194.view(1, 51, 6400) // v_201 = v_200.view(1, 51, 1600) // v_207 = v_206.view(1, 51, 400) // v_208 = torch.cat((v_195, v_201, v_207), dim=-1) // ... // v_254 = v_223.view(1, 65, 6400) // v_255 = v_238.view(1, 65, 1600) // v_256 = v_253.view(1, 65, 400) // v_257 = torch.cat((v_254, v_255, v_256), dim=2) // ... // after: // v_195 = v_194.view(1, 51, -1).transpose(1, 2) // v_201 = v_200.view(1, 51, -1).transpose(1, 2) // v_207 = v_206.view(1, 51, -1).transpose(1, 2) // v_208 = torch.cat((v_195, v_201, v_207), dim=1) // ... // v_254 = v_223.view(1, 65, -1).transpose(1, 2) // v_255 = v_238.view(1, 65, -1).transpose(1, 2) // v_256 = v_253.view(1, 65, -1).transpose(1, 2) // v_257 = torch.cat((v_254, v_255, v_256), dim=1) // return v_257, v_208 // D. 
modify area attention for dynamic shape inference // before: // v_95 = self.model_10_m_0_attn_qkv_conv(v_94) // v_96 = v_95.view(1, 2, 128, 400) // v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64)) // v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1) // v_101 = torch.matmul(input=v_100, other=v_98) // v_102 = (v_101 * 0.176777) // v_103 = F.softmax(input=v_102, dim=-1) // v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1) // v_105 = torch.matmul(input=v_99, other=v_104) // v_106 = v_105.view(1, 128, 20, 20) // v_107 = v_99.reshape(1, 128, 20, 20) // v_108 = self.model_10_m_0_attn_pe_conv(v_107) // v_109 = (v_106 + v_108) // v_110 = self.model_10_m_0_attn_proj_conv(v_109) // after: // v_95 = self.model_10_m_0_attn_qkv_conv(v_94) // v_96 = v_95.view(1, 2, 128, -1) // v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64)) // v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1) // v_101 = torch.matmul(input=v_100, other=v_98) // v_102 = (v_101 * 0.176777) // v_103 = F.softmax(input=v_102, dim=-1) // v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1) // v_105 = torch.matmul(input=v_99, other=v_104) // v_106 = v_105.view(1, 128, v_95.size(2), v_95.size(3)) // v_107 = v_99.reshape(1, 128, v_95.size(2), v_95.size(3)) // v_108 = self.model_10_m_0_attn_pe_conv(v_107) // v_109 = (v_106 + v_108) // v_110 = self.model_10_m_0_attn_proj_conv(v_109) // 5. re-export yolo11-pose torchscript // python3 -c 'import yolo11n_pose_pnnx; yolo11n_pose_pnnx.export_torchscript()' // 6. convert new torchscript with dynamic shape // pnnx yolo11n_pose_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320] // 7. 
now you get ncnn model files // mv yolo11n_pose_pnnx.py.ncnn.param yolo11n_pose.ncnn.param // mv yolo11n_pose_pnnx.py.ncnn.bin yolo11n_pose.ncnn.bin // the out blob would be a 2-dim tensor with w=65 h=8400 // // | bbox-reg 16 x 4 |score(1)| // +-----+-----+-----+-----+--------+ // | dx0 | dy0 | dx1 | dy1 | 0.1 | // all /| | | | | | // boxes | .. | .. | .. | .. | 0.0 | // (8400)| | | | | . | // \| | | | | . | // +-----+-----+-----+-----+--------+ // // // | pose (51) | // +-----------+ // |0.1........| // all /| | // boxes |0.0........| // (8400)| . | // \| . | // +-----------+ // #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct KeyPoint { cv::Point2f p; float prob; }; struct Object { cv::Rect_ rect; int label; float prob; std::vector keypoints; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } // #pragma omp parallel sections { // #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } // #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = objects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = objects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = objects[i]; int keep = 1; for 
(int j = 0; j < (int)picked.size(); j++) { const Object& b = objects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int reg_max_1 = 16; const int num_points = pred_points.w / 3; for (int y = 0; y < num_grid_y; y++) { for (int x = 0; x < num_grid_x; x++) { const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); const ncnn::Mat pred_points_grid = pred_points.row_range(y * num_grid_x + x, 1).reshape(3, num_points); // find label with max score int label = 0; float score = sigmoid(pred_grid[reg_max_1 * 4]); if (score >= prob_threshold) { ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone(); { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; pd.set(0, 1); // axis pd.set(1, 1); softmax->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; softmax->create_pipeline(opt); softmax->forward_inplace(pred_bbox, opt); softmax->destroy_pipeline(opt); delete softmax; } float pred_ltrb[4]; for (int k = 0; k < 4; k++) { float dis = 0.f; const float* dis_after_sm = pred_bbox.row(k); for (int l = 0; l < reg_max_1; l++) { dis += l * dis_after_sm[l]; } pred_ltrb[k] = dis * stride; } float pb_cx = (x + 0.5f) * stride; float pb_cy = (y + 0.5f) * stride; float x0 = pb_cx - pred_ltrb[0]; float y0 = pb_cy - pred_ltrb[1]; float x1 = pb_cx + pred_ltrb[2]; float y1 
= pb_cy + pred_ltrb[3]; std::vector keypoints; for (int k = 0; k < num_points; k++) { KeyPoint keypoint; keypoint.p.x = (x + pred_points_grid.row(k)[0] * 2) * stride; keypoint.p.y = (y + pred_points_grid.row(k)[1] * 2) * stride; keypoint.prob = sigmoid(pred_points_grid.row(k)[2]); keypoints.push_back(keypoint); } Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = label; obj.prob = score; obj.keypoints = keypoints; objects.push_back(obj); } } } } static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; int pred_row_offset = 0; for (size_t i = 0; i < strides.size(); i++) { const int stride = strides[i]; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int num_grid = num_grid_x * num_grid_y; generate_proposals(pred.row_range(pred_row_offset, num_grid), pred_points.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects); pred_row_offset += num_grid; } } static int detect_yolo11_pose(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolo11; yolo11.opt.use_vulkan_compute = true; // yolo11.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets yolo11.load_param("yolo11n_pose.ncnn.param"); yolo11.load_model("yolo11n_pose.ncnn.bin"); // yolo11.load_param("yolo11s_pose.ncnn.param"); // yolo11.load_model("yolo11s_pose.ncnn.bin"); // yolo11.load_param("yolo11m_pose.ncnn.param"); // yolo11.load_model("yolo11m_pose.ncnn.bin"); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; const float mask_threshold = 0.5f; int img_w = bgr.cols; int img_h = bgr.rows; // ultralytics/cfg/models/v8/yolo11.yaml std::vector strides(3); strides[0] = 8; strides[1] = 16; strides[2] = 32; const int max_stride = 32; 
// letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = (w + max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolo11.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); ncnn::Mat out_points; ex.extract("out1", out_points); std::vector proposals; generate_proposals(out, out_points, strides, in_pad, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); if (count == 0) return 0; const int num_points = out_points.w / 3; objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; for (int j = 0; j < num_points; j++) { objects[i].keypoints[j].p.x = (objects[i].keypoints[j].p.x - (wpad / 2)) / scale; objects[i].keypoints[j].p.y = (objects[i].keypoints[j].p.y - (hpad / 2)) / scale; } // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = 
std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = {"person"}; static const cv::Scalar colors[] = { cv::Scalar(244, 67, 54), cv::Scalar(233, 30, 99), cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 19]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); // draw bone static const int joint_pairs[16][2] = { {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16} }; static const cv::Scalar bone_colors[] = { cv::Scalar(0, 255, 0), cv::Scalar(0, 255, 0), cv::Scalar(0, 255, 0), cv::Scalar(0, 255, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 51, 255), cv::Scalar(255, 51, 255), cv::Scalar(255, 51, 255), cv::Scalar(51, 153, 255), cv::Scalar(51, 153, 255), cv::Scalar(51, 153, 255), cv::Scalar(51, 153, 255), }; for (int j = 0; j < 16; j++) { const KeyPoint& p1 = obj.keypoints[joint_pairs[j][0]]; const KeyPoint& p2 = obj.keypoints[joint_pairs[j][1]]; if 
(p1.prob < 0.2f || p2.prob < 0.2f) continue; cv::line(image, p1.p, p2.p, bone_colors[j], 2); } // draw joint for (size_t j = 0; j < obj.keypoints.size(); j++) { const KeyPoint& keypoint = obj.keypoints[j]; fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob); if (keypoint.prob < 0.2f) continue; cv::circle(image, keypoint.p, 3, color, -1); } cv::rectangle(image, obj.rect, color); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolo11_pose(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolo11_seg.cpp ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. export yolo11-seg torchscript // yolo export model=yolo11n-seg.pt format=torchscript // 3. convert torchscript with static shape // pnnx yolo11n-seg.torchscript // 4. modify yolo11n_seg_pnnx.py for dynamic shape inference // A. modify reshape to support dynamic image sizes // B. 
permute tensor before concat and adjust concat axis // C. drop post-process part // before: // v_202 = v_201.view(1, 32, 6400) // v_208 = v_207.view(1, 32, 1600) // v_214 = v_213.view(1, 32, 400) // v_215 = torch.cat((v_202, v_208, v_214), dim=2) // ... // v_261 = v_230.view(1, 144, 6400) // v_262 = v_245.view(1, 144, 1600) // v_263 = v_260.view(1, 144, 400) // v_264 = torch.cat((v_261, v_262, v_263), dim=2) // ... // v_285 = (v_284, v_196, ) // return v_285 // after: // v_202 = v_201.view(1, 32, -1).transpose(1, 2) // v_208 = v_207.view(1, 32, -1).transpose(1, 2) // v_214 = v_213.view(1, 32, -1).transpose(1, 2) // v_215 = torch.cat((v_202, v_208, v_214), dim=1) // ... // v_261 = v_230.view(1, 144, -1).transpose(1, 2) // v_262 = v_245.view(1, 144, -1).transpose(1, 2) // v_263 = v_260.view(1, 144, -1).transpose(1, 2) // v_264 = torch.cat((v_261, v_262, v_263), dim=1) // return v_264, v_215, v_196 // D. modify area attention for dynamic shape inference // before: // v_95 = self.model_10_m_0_attn_qkv_conv(v_94) // v_96 = v_95.view(1, 2, 128, 400) // v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64)) // v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1) // v_101 = torch.matmul(input=v_100, other=v_98) // v_102 = (v_101 * 0.176777) // v_103 = F.softmax(input=v_102, dim=-1) // v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1) // v_105 = torch.matmul(input=v_99, other=v_104) // v_106 = v_105.view(1, 128, 20, 20) // v_107 = v_99.reshape(1, 128, 20, 20) // v_108 = self.model_10_m_0_attn_pe_conv(v_107) // v_109 = (v_106 + v_108) // v_110 = self.model_10_m_0_attn_proj_conv(v_109) // after: // v_95 = self.model_10_m_0_attn_qkv_conv(v_94) // v_96 = v_95.view(1, 2, 128, -1) // v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64)) // v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1) // v_101 = torch.matmul(input=v_100, other=v_98) // v_102 = (v_101 * 0.176777) // v_103 = F.softmax(input=v_102, dim=-1) 
// v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1) // v_105 = torch.matmul(input=v_99, other=v_104) // v_106 = v_105.view(1, 128, v_95.size(2), v_95.size(3)) // v_107 = v_99.reshape(1, 128, v_95.size(2), v_95.size(3)) // v_108 = self.model_10_m_0_attn_pe_conv(v_107) // v_109 = (v_106 + v_108) // v_110 = self.model_10_m_0_attn_proj_conv(v_109) // 5. re-export yolo11-seg torchscript // python3 -c 'import yolo11n_seg_pnnx; yolo11n_seg_pnnx.export_torchscript()' // 6. convert new torchscript with dynamic shape // pnnx yolo11n_seg_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320] // 7. now you get ncnn model files // mv yolo11n_seg_pnnx.py.ncnn.param yolo11n_seg.ncnn.param // mv yolo11n_seg_pnnx.py.ncnn.bin yolo11n_seg.ncnn.bin // the out blob would be a 2-dim tensor with w=176 h=8400 // // | bbox-reg 16 x 4 | per-class scores(80) | // +-----+-----+-----+-----+----------------------+ // | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......| // all /| | | | | . | // boxes | .. | .. | .. | .. |0.0 0.9 0.0 0.0 ......| // (8400)| | | | | . | // \| | | | | . | // +-----+-----+-----+-----+----------------------+ // // // | mask (32) | // +-----------+ // |0.1........| // all /| | // boxes |0.0........| // (8400)| . | // \| . 
| // +-----------+ // #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct Object { cv::Rect_ rect; int label; float prob; int gindex; cv::Mat mask; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } // #pragma omp parallel sections { // #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } // #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = objects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = objects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = objects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = objects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const 
int w = in_pad.w; const int h = in_pad.h; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int reg_max_1 = 16; const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO for (int y = 0; y < num_grid_y; y++) { for (int x = 0; x < num_grid_x; x++) { const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); // find label with max score int label = -1; float score = -FLT_MAX; { const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class); for (int k = 0; k < num_class; k++) { float s = pred_score[k]; if (s > score) { label = k; score = s; } } score = sigmoid(score); } if (score >= prob_threshold) { ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone(); { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; pd.set(0, 1); // axis pd.set(1, 1); softmax->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; softmax->create_pipeline(opt); softmax->forward_inplace(pred_bbox, opt); softmax->destroy_pipeline(opt); delete softmax; } float pred_ltrb[4]; for (int k = 0; k < 4; k++) { float dis = 0.f; const float* dis_after_sm = pred_bbox.row(k); for (int l = 0; l < reg_max_1; l++) { dis += l * dis_after_sm[l]; } pred_ltrb[k] = dis * stride; } float pb_cx = (x + 0.5f) * stride; float pb_cy = (y + 0.5f) * stride; float x0 = pb_cx - pred_ltrb[0]; float y0 = pb_cy - pred_ltrb[1]; float x1 = pb_cx + pred_ltrb[2]; float y1 = pb_cy + pred_ltrb[3]; Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = label; obj.prob = score; obj.gindex = y * num_grid_x + x; objects.push_back(obj); } } } } static void generate_proposals(const ncnn::Mat& pred, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; int pred_row_offset = 0; for (size_t i = 0; i < strides.size(); i++) { const int stride = 
strides[i]; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int num_grid = num_grid_x * num_grid_y; std::vector objects_stride; generate_proposals(pred.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects_stride); for (size_t j = 0; j < objects_stride.size(); j++) { Object obj = objects_stride[j]; obj.gindex += pred_row_offset; objects.push_back(obj); } pred_row_offset += num_grid; } } static int detect_yolo11_seg(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolo11; yolo11.opt.use_vulkan_compute = true; // yolo11.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets yolo11.load_param("yolo11n_seg.ncnn.param"); yolo11.load_model("yolo11n_seg.ncnn.bin"); // yolo11.load_param("yolo11s_seg.ncnn.param"); // yolo11.load_model("yolo11s_seg.ncnn.bin"); // yolo11.load_param("yolo11m_seg.ncnn.param"); // yolo11.load_model("yolo11m_seg.ncnn.bin"); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; const float mask_threshold = 0.5f; int img_w = bgr.cols; int img_h = bgr.rows; // ultralytics/cfg/models/v8/yolo11.yaml std::vector strides(3); strides[0] = 8; strides[1] = 16; strides[2] = 32; const int max_stride = 32; // letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = (w + max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 
255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolo11.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); std::vector proposals; generate_proposals(out, strides, in_pad, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); if (count == 0) return 0; ncnn::Mat mask_feat; ex.extract("out1", mask_feat); ncnn::Mat mask_protos; ex.extract("out2", mask_protos); ncnn::Mat objects_mask_feat(mask_feat.w, 1, count); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; // pick mask feat memcpy(objects_mask_feat.channel(i), mask_feat.row(objects[i].gindex), mask_feat.w * sizeof(float)); } // process mask ncnn::Mat objects_mask; { ncnn::Layer* gemm = ncnn::create_layer("Gemm"); ncnn::ParamDict pd; pd.set(6, 1); // constantC pd.set(7, count); // constantM pd.set(8, mask_protos.w * mask_protos.h); // constantN pd.set(9, mask_feat.w); // constantK pd.set(10, -1); // constant_broadcast_type_C pd.set(11, 1); // output_N1M gemm->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; gemm->create_pipeline(opt); std::vector gemm_inputs(2); 
gemm_inputs[0] = objects_mask_feat; gemm_inputs[1] = mask_protos.reshape(mask_protos.w * mask_protos.h, 1, mask_protos.c); std::vector gemm_outputs(1); gemm->forward(gemm_inputs, gemm_outputs, opt); objects_mask = gemm_outputs[0].reshape(mask_protos.w, mask_protos.h, count); gemm->destroy_pipeline(opt); delete gemm; } { ncnn::Layer* sigmoid = ncnn::create_layer("Sigmoid"); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; sigmoid->create_pipeline(opt); sigmoid->forward_inplace(objects_mask, opt); sigmoid->destroy_pipeline(opt); delete sigmoid; } // resize mask map { ncnn::Mat objects_mask_resized; ncnn::resize_bilinear(objects_mask, objects_mask_resized, in_pad.w / scale, in_pad.h / scale); objects_mask = objects_mask_resized; } // create per-object mask for (int i = 0; i < count; i++) { Object& obj = objects[i]; const ncnn::Mat mm = objects_mask.channel(i); obj.mask = cv::Mat((int)obj.rect.height, (int)obj.rect.width, CV_8UC1); // adjust offset to original unpadded and clip inside object box for (int y = 0; y < (int)obj.rect.height; y++) { const float* pmm = mm.row((int)(hpad / 2 / scale + obj.rect.y + y)) + (int)(wpad / 2 / scale + obj.rect.x); uchar* pmask = obj.mask.ptr(y); for (int x = 0; x < (int)obj.rect.width; x++) { pmask[x] = pmm[x] > mask_threshold ? 
1 : 0; } } } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; static cv::Scalar colors[] = { cv::Scalar(244, 67, 54), cv::Scalar(233, 30, 99), cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 19]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); for (int y = 0; y < (int)obj.rect.height; y++) { const uchar* maskptr = obj.mask.ptr(y); uchar* bgrptr = image.ptr((int)obj.rect.y + y) + (int)obj.rect.x * 3; for (int x = 0; x < 
(int)obj.rect.width; x++) { if (maskptr[x]) { bgrptr[0] = bgrptr[0] * 0.5 + color[0] * 0.5; bgrptr[1] = bgrptr[1] * 0.5 + color[1] * 0.5; bgrptr[2] = bgrptr[2] * 0.5 + color[2] * 0.5; } bgrptr += 3; } } cv::rectangle(image, obj.rect, color); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolo11_seg(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolov2.cpp ================================================ // Copyright 2018 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static int detect_yolov2(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov2; yolov2.opt.use_vulkan_compute = true; // original pretrained model from https://github.com/eric612/MobileNet-YOLO // https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy.prototxt // 
https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy_iter_80000.caffemodel // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (yolov2.load_param("mobilenet_yolo.param")) exit(-1); if (yolov2.load_model("mobilenet_yolo.bin")) exit(-1); const int target_size = 416; int img_w = bgr.cols; int img_h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size); // the Caffe-YOLOv2-Windows style // X' = X * scale - mean const float mean_vals[3] = {1.0f, 1.0f, 1.0f}; const float norm_vals[3] = {0.007843f, 0.007843f, 0.007843f}; in.substract_mean_normalize(0, norm_vals); in.substract_mean_normalize(mean_vals, 0); ncnn::Extractor ex = yolov2.create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("detection_out", out); // printf("%d %d %d\n", out.w, out.h, out.c); objects.clear(); for (int i = 0; i < out.h; i++) { const float* values = out.row(i); Object object; object.label = values[0]; object.prob = values[1]; object.rect.x = values[2] * img_w; object.rect.y = values[3] * img_h; object.rect.width = values[4] * img_w - object.rect.x; object.rect.height = values[5] * img_h - object.rect.y; objects.push_back(object); } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int 
baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov2(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolov3.cpp ================================================ // Copyright 2018 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static int detect_yolov3(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov3; yolov3.opt.use_vulkan_compute = true; // original pretrained model from https://github.com/eric612/MobileNet-YOLO // param : https://drive.google.com/open?id=1V9oKHP6G6XvXZqhZbzNKL6FI_clRWdC- // bin : https://drive.google.com/open?id=1DBcuFCr-856z3FRQznWL_S5h-Aj3RawA // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (yolov3.load_param("mobilenetv2_yolov3.param")) exit(-1); if (yolov3.load_model("mobilenetv2_yolov3.bin")) exit(-1); const int target_size = 352; int img_w = bgr.cols; int img_h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, 
ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size); const float mean_vals[3] = {127.5f, 127.5f, 127.5f}; const float norm_vals[3] = {0.007843f, 0.007843f, 0.007843f}; in.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = yolov3.create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("detection_out", out); // printf("%d %d %d\n", out.w, out.h, out.c); objects.clear(); for (int i = 0; i < out.h; i++) { const float* values = out.row(i); Object object; object.label = values[0]; object.prob = values[1]; object.rect.x = values[2] * img_w; object.rect.y = values[3] * img_h; object.rect.width = values[4] * img_w - object.rect.x; object.rect.height = values[5] * img_h - object.rect.y; objects.push_back(object); } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); 
} cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov3(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolov4.cpp ================================================ // Copyright 2020 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "net.h" #include #include #include #if CV_MAJOR_VERSION >= 3 #include #endif #include #include #define NCNN_PROFILING #define YOLOV4_TINY //Using yolov4_tiny, if undef, using original yolov4 #ifdef NCNN_PROFILING #include "benchmark.h" #endif struct Object { cv::Rect_ rect; int label; float prob; }; static int init_yolov4(ncnn::Net* yolov4, int* target_size) { /* --> Set the params you need for the ncnn inference <-- */ yolov4->opt.num_threads = 4; //You need to compile with libgomp for multi thread support yolov4->opt.use_vulkan_compute = true; //You need to compile with libvulkan for gpu support /* --> End of setting params <-- */ int ret = 0; // original pretrained model from https://github.com/AlexeyAB/darknet // the ncnn model https://drive.google.com/drive/folders/1YzILvh0SKQPS_lrb33dmGNq7aVTKPWS0?usp=sharing // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models #ifdef YOLOV4_TINY const char* yolov4_param = "yolov4-tiny-opt.param"; const char* yolov4_model = "yolov4-tiny-opt.bin"; *target_size = 416; #else const char* yolov4_param = "yolov4-opt.param"; const char* yolov4_model = "yolov4-opt.bin"; *target_size = 608; #endif if (yolov4->load_param(yolov4_param)) exit(-1); if (yolov4->load_model(yolov4_model)) exit(-1); return 0; } static int detect_yolov4(const cv::Mat& bgr, std::vector& objects, int target_size, ncnn::Net* yolov4) { int img_w = 
bgr.cols; int img_h = bgr.rows; ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, bgr.cols, bgr.rows, target_size, target_size); const float mean_vals[3] = {0, 0, 0}; const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = yolov4->create_extractor(); ex.input("data", in); ncnn::Mat out; ex.extract("output", out); objects.clear(); for (int i = 0; i < out.h; i++) { const float* values = out.row(i); Object object; object.label = values[0]; object.prob = values[1]; object.rect.x = values[2] * img_w; object.rect.y = values[3] * img_h; object.rect.width = values[4] * img_w - object.rect.x; object.rect.height = values[5] * img_h - object.rect.y; objects.push_back(object); } return 0; } static int draw_objects(const cv::Mat& bgr, const std::vector& objects, int is_streaming) { static const char* class_names[] = {"background", "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, 
obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); if (is_streaming) { cv::waitKey(1); } else { cv::waitKey(0); } return 0; } int main(int argc, char** argv) { cv::Mat frame; std::vector objects; cv::VideoCapture cap; ncnn::Net yolov4; const char* devicepath; int target_size = 0; int is_streaming = 0; if (argc < 2) { fprintf(stderr, "Usage: %s [v4l input device or image]\n", argv[0]); return -1; } devicepath = argv[1]; #ifdef NCNN_PROFILING double t_load_start = ncnn::get_current_time(); #endif int ret = init_yolov4(&yolov4, &target_size); //We load model and param first! 
if (ret != 0) { fprintf(stderr, "Failed to load model or param, error %d", ret); return -1; } #ifdef NCNN_PROFILING double t_load_end = ncnn::get_current_time(); fprintf(stdout, "NCNN Init time %.02lfms\n", t_load_end - t_load_start); #endif if (strstr(devicepath, "/dev/video") == NULL) { frame = cv::imread(argv[1], 1); if (frame.empty()) { fprintf(stderr, "Failed to read image %s.\n", argv[1]); return -1; } } else { cap.open(devicepath); if (!cap.isOpened()) { fprintf(stderr, "Failed to open %s", devicepath); return -1; } cap >> frame; if (frame.empty()) { fprintf(stderr, "Failed to read from device %s.\n", devicepath); return -1; } is_streaming = 1; } while (1) { if (is_streaming) { #ifdef NCNN_PROFILING double t_capture_start = ncnn::get_current_time(); #endif cap >> frame; #ifdef NCNN_PROFILING double t_capture_end = ncnn::get_current_time(); fprintf(stdout, "NCNN OpenCV capture time %.02lfms\n", t_capture_end - t_capture_start); #endif if (frame.empty()) { fprintf(stderr, "OpenCV Failed to Capture from device %s\n", devicepath); return -1; } } #ifdef NCNN_PROFILING double t_detect_start = ncnn::get_current_time(); #endif detect_yolov4(frame, objects, target_size, &yolov4); //Create an extractor and run detection #ifdef NCNN_PROFILING double t_detect_end = ncnn::get_current_time(); fprintf(stdout, "NCNN detection time %.02lfms\n", t_detect_end - t_detect_start); #endif #ifdef NCNN_PROFILING double t_draw_start = ncnn::get_current_time(); #endif draw_objects(frame, objects, is_streaming); //Draw detection results on opencv image #ifdef NCNN_PROFILING double t_draw_end = ncnn::get_current_time(); fprintf(stdout, "NCNN OpenCV draw result time %.02lfms\n", t_draw_end - t_draw_start); #endif if (!is_streaming) { //If it is a still image, exit! 
return 0; } } return 0; } ================================================ FILE: examples/yolov5.cpp ================================================ // Copyright 2020 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include //#define YOLOV5_V60 1 //YOLOv5 v6.0 #define YOLOV5_V62 1 //YOLOv5 v6.2 export onnx model method https://github.com/shaoshengsong/yolov5_62_export_ncnn #if YOLOV5_V60 || YOLOV5_V62 #define MAX_STRIDE 64 #else #define MAX_STRIDE 32 class YoloV5Focus : public ncnn::Layer { public: YoloV5Focus() { one_blob_only = true; } virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const { int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; int outw = w / 2; int outh = h / 2; int outc = channels * 4; top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator); if (top_blob.empty()) return -100; #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outc; p++) { const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2); float* outptr = top_blob.channel(p); for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) { *outptr = *ptr; outptr += 1; ptr += 2; } ptr += w; } } return 0; } }; DEFINE_LAYER_CREATOR(YoloV5Focus) #endif //YOLOV5_V60 YOLOV5_V62 struct Object { cv::Rect_ rect; int label; float prob; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel 
sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return static_cast(1.f / (1.f + exp(-x))); } static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector& objects) { const int num_grid = feat_blob.h; int num_grid_x; int num_grid_y; if (in_pad.w > in_pad.h) { num_grid_x = in_pad.w / stride; num_grid_y = num_grid / num_grid_x; } else { num_grid_y = in_pad.h / stride; num_grid_x = num_grid / num_grid_y; } const int num_class = feat_blob.w - 5; const int num_anchors = anchors.w / 2; for (int q = 0; q < num_anchors; q++) { const float anchor_w = anchors[q * 2]; const float anchor_h = anchors[q * 2 + 1]; const ncnn::Mat feat = feat_blob.channel(q); for (int i = 0; i < num_grid_y; i++) { for (int j = 0; j < num_grid_x; j++) { const float* featptr = feat.row(i * num_grid_x + j); float box_confidence = 
sigmoid(featptr[4]); if (box_confidence >= prob_threshold) { // find class index with max class score int class_index = 0; float class_score = -FLT_MAX; for (int k = 0; k < num_class; k++) { float score = featptr[5 + k]; if (score > class_score) { class_index = k; class_score = score; } } float confidence = box_confidence * sigmoid(class_score); if (confidence >= prob_threshold) { // yolov5/models/yolo.py Detect forward // y = x[i].sigmoid() // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh float dx = sigmoid(featptr[0]); float dy = sigmoid(featptr[1]); float dw = sigmoid(featptr[2]); float dh = sigmoid(featptr[3]); float pb_cx = (dx * 2.f - 0.5f + j) * stride; float pb_cy = (dy * 2.f - 0.5f + i) * stride; float pb_w = pow(dw * 2.f, 2) * anchor_w; float pb_h = pow(dh * 2.f, 2) * anchor_h; float x0 = pb_cx - pb_w * 0.5f; float y0 = pb_cy - pb_h * 0.5f; float x1 = pb_cx + pb_w * 0.5f; float y1 = pb_cy + pb_h * 0.5f; Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = confidence; objects.push_back(obj); } } } } } } static int detect_yolov5(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov5; yolov5.opt.use_vulkan_compute = true; // yolov5.opt.use_bf16_storage = true; // original pretrained model from https://github.com/ultralytics/yolov5 // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models #if YOLOV5_V62 if (yolov5.load_param("yolov5s_6.2.param")) exit(-1); if (yolov5.load_model("yolov5s_6.2.bin")) exit(-1); #elif YOLOV5_V60 if (yolov5.load_param("yolov5s_6.0.param")) exit(-1); if (yolov5.load_model("yolov5s_6.0.bin")) exit(-1); #else yolov5.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator); if (yolov5.load_param("yolov5s.param")) exit(-1); if (yolov5.load_model("yolov5s.bin")) exit(-1); #endif const int target_size = 640; const 
float prob_threshold = 0.25f; const float nms_threshold = 0.45f; int img_w = bgr.cols; int img_h = bgr.rows; // letterbox pad to multiple of MAX_STRIDE int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // pad to target_size rectangle // yolov5/utils/datasets.py letterbox int wpad = (w + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w; int hpad = (h + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolov5.create_extractor(); ex.input("images", in_pad); std::vector proposals; // anchor setting from yolov5/models/yolov5s.yaml // stride 8 { ncnn::Mat out; ex.extract("output", out); ncnn::Mat anchors(6); anchors[0] = 10.f; anchors[1] = 13.f; anchors[2] = 16.f; anchors[3] = 30.f; anchors[4] = 33.f; anchors[5] = 23.f; std::vector objects8; generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8); proposals.insert(proposals.end(), objects8.begin(), objects8.end()); } // stride 16 { ncnn::Mat out; #if YOLOV5_V62 ex.extract("353", out); #elif YOLOV5_V60 ex.extract("376", out); #else ex.extract("781", out); #endif ncnn::Mat anchors(6); anchors[0] = 30.f; anchors[1] = 61.f; anchors[2] = 62.f; anchors[3] = 45.f; anchors[4] = 59.f; anchors[5] = 119.f; std::vector objects16; generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16); proposals.insert(proposals.end(), objects16.begin(), objects16.end()); } // stride 32 { ncnn::Mat out; #if YOLOV5_V62 ex.extract("367", out); #elif YOLOV5_V60 ex.extract("401", out); #else ex.extract("801", out); 
#endif ncnn::Mat anchors(6); anchors[0] = 116.f; anchors[1] = 90.f; anchors[2] = 156.f; anchors[3] = 198.f; anchors[4] = 373.f; anchors[5] = 326.f; std::vector objects32; generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32); proposals.insert(proposals.end(), objects32.begin(), objects32.end()); } // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", 
"couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov5(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolov5_pnnx.cpp ================================================ // Copyright 2022 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static inline float 
intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return static_cast(1.f / (1.f + exp(-x))); } static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector& objects) { const int num_grid_x = feat_blob.w; const int num_grid_y = feat_blob.h; const int num_anchors = anchors.w / 2; const int num_class = feat_blob.c / num_anchors - 5; const int feat_offset = 
num_class + 5; for (int q = 0; q < num_anchors; q++) { const float anchor_w = anchors[q * 2]; const float anchor_h = anchors[q * 2 + 1]; for (int i = 0; i < num_grid_y; i++) { for (int j = 0; j < num_grid_x; j++) { // find class index with max class score int class_index = 0; float class_score = -FLT_MAX; for (int k = 0; k < num_class; k++) { float score = feat_blob.channel(q * feat_offset + 5 + k).row(i)[j]; if (score > class_score) { class_index = k; class_score = score; } } float box_score = feat_blob.channel(q * feat_offset + 4).row(i)[j]; float confidence = sigmoid(box_score) * sigmoid(class_score); if (confidence >= prob_threshold) { // yolov5/models/yolo.py Detect forward // y = x[i].sigmoid() // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh float dx = sigmoid(feat_blob.channel(q * feat_offset + 0).row(i)[j]); float dy = sigmoid(feat_blob.channel(q * feat_offset + 1).row(i)[j]); float dw = sigmoid(feat_blob.channel(q * feat_offset + 2).row(i)[j]); float dh = sigmoid(feat_blob.channel(q * feat_offset + 3).row(i)[j]); float pb_cx = (dx * 2.f - 0.5f + j) * stride; float pb_cy = (dy * 2.f - 0.5f + i) * stride; float pb_w = pow(dw * 2.f, 2) * anchor_w; float pb_h = pow(dh * 2.f, 2) * anchor_h; float x0 = pb_cx - pb_w * 0.5f; float y0 = pb_cy - pb_h * 0.5f; float x1 = pb_cx + pb_w * 0.5f; float y1 = pb_cy + pb_h * 0.5f; Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = confidence; objects.push_back(obj); } } } } } static int detect_yolov5(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov5; yolov5.opt.use_vulkan_compute = true; // yolov5.opt.use_bf16_storage = true; // original pretrained model from https://github.com/ultralytics/yolov5 // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models if (yolov5.load_param("yolov5s.ncnn.param")) 
exit(-1); if (yolov5.load_model("yolov5s.ncnn.bin")) exit(-1); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; int img_w = bgr.cols; int img_h = bgr.rows; // yolov5/models/common.py DetectMultiBackend const int max_stride = 64; // letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // pad to target_size rectangle // yolov5/utils/datasets.py letterbox int wpad = (w + max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolov5.create_extractor(); ex.input("in0", in_pad); std::vector proposals; // anchor setting from yolov5/models/yolov5s.yaml // stride 8 { ncnn::Mat out; ex.extract("out0", out); ncnn::Mat anchors(6); anchors[0] = 10.f; anchors[1] = 13.f; anchors[2] = 16.f; anchors[3] = 30.f; anchors[4] = 33.f; anchors[5] = 23.f; std::vector objects8; generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8); proposals.insert(proposals.end(), objects8.begin(), objects8.end()); } // stride 16 { ncnn::Mat out; ex.extract("out1", out); ncnn::Mat anchors(6); anchors[0] = 30.f; anchors[1] = 61.f; anchors[2] = 62.f; anchors[3] = 45.f; anchors[4] = 59.f; anchors[5] = 119.f; std::vector objects16; generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16); proposals.insert(proposals.end(), objects16.begin(), objects16.end()); } // stride 32 { ncnn::Mat out; ex.extract("out2", out); ncnn::Mat 
anchors(6); anchors[0] = 116.f; anchors[1] = 90.f; anchors[2] = 156.f; anchors[3] = 198.f; anchors[4] = 373.f; anchors[5] = 326.f; std::vector objects32; generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32); proposals.insert(proposals.end(), objects32.begin(), objects32.end()); } // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted 
plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov5(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolov7.cpp ================================================ // Copyright 2020 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include #define MAX_STRIDE 32 struct Object { cv::Rect_ rect; int label; float prob; }; static inline float 
intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return static_cast(1.f / (1.f + exp(-x))); } static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector& objects) { const int num_grid = feat_blob.h; int num_grid_x; int num_grid_y; if (in_pad.w > in_pad.h) { num_grid_x = in_pad.w / stride; num_grid_y = num_grid / num_grid_x; } else { num_grid_y = in_pad.h / stride; num_grid_x = num_grid / num_grid_y; } 
const int num_class = feat_blob.w - 5; const int num_anchors = anchors.w / 2; for (int q = 0; q < num_anchors; q++) { const float anchor_w = anchors[q * 2]; const float anchor_h = anchors[q * 2 + 1]; const ncnn::Mat feat = feat_blob.channel(q); for (int i = 0; i < num_grid_y; i++) { for (int j = 0; j < num_grid_x; j++) { const float* featptr = feat.row(i * num_grid_x + j); float box_confidence = sigmoid(featptr[4]); if (box_confidence >= prob_threshold) { // find class index with max class score int class_index = 0; float class_score = -FLT_MAX; for (int k = 0; k < num_class; k++) { float score = featptr[5 + k]; if (score > class_score) { class_index = k; class_score = score; } } float confidence = box_confidence * sigmoid(class_score); if (confidence >= prob_threshold) { float dx = sigmoid(featptr[0]); float dy = sigmoid(featptr[1]); float dw = sigmoid(featptr[2]); float dh = sigmoid(featptr[3]); float pb_cx = (dx * 2.f - 0.5f + j) * stride; float pb_cy = (dy * 2.f - 0.5f + i) * stride; float pb_w = pow(dw * 2.f, 2) * anchor_w; float pb_h = pow(dh * 2.f, 2) * anchor_h; float x0 = pb_cx - pb_w * 0.5f; float y0 = pb_cy - pb_h * 0.5f; float x1 = pb_cx + pb_w * 0.5f; float y1 = pb_cy + pb_h * 0.5f; Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = confidence; objects.push_back(obj); } } } } } } static int detect_yolov7(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov7; yolov7.opt.use_vulkan_compute = true; // yolov7.opt.use_bf16_storage = true; // original pretrained model from https://github.com/WongKinYiu/yolov7 // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models yolov7.load_param("yolov7-tiny.param"); yolov7.load_model("yolov7-tiny.bin"); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; int img_w = bgr.cols; int img_h = bgr.rows; // letterbox pad to multiple of MAX_STRIDE int w = img_w; int h = 
img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); int wpad = (w + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w; int hpad = (h + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolov7.create_extractor(); ex.input("images", in_pad); std::vector proposals; // stride 8 { ncnn::Mat out; ex.extract("output", out); ncnn::Mat anchors(6); anchors[0] = 12.f; anchors[1] = 16.f; anchors[2] = 19.f; anchors[3] = 36.f; anchors[4] = 40.f; anchors[5] = 28.f; std::vector objects8; generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8); proposals.insert(proposals.end(), objects8.begin(), objects8.end()); } // stride 16 { ncnn::Mat out; ex.extract("288", out); ncnn::Mat anchors(6); anchors[0] = 36.f; anchors[1] = 75.f; anchors[2] = 76.f; anchors[3] = 55.f; anchors[4] = 72.f; anchors[5] = 146.f; std::vector objects16; generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16); proposals.insert(proposals.end(), objects16.begin(), objects16.end()); } // stride 32 { ncnn::Mat out; ex.extract("302", out); ncnn::Mat anchors(6); anchors[0] = 142.f; anchors[1] = 110.f; anchors[2] = 192.f; anchors[3] = 243.f; anchors[4] = 459.f; anchors[5] = 401.f; std::vector objects32; generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32); proposals.insert(proposals.end(), objects32.begin(), objects32.end()); } // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; 
nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; static const unsigned char colors[19][3] = { {54, 67, 244}, {99, 30, 233}, {176, 39, 156}, {183, 58, 103}, {181, 81, 63}, {243, 150, 33}, {244, 169, 3}, {212, 188, 0}, {136, 150, 0}, {80, 175, 76}, 
{74, 195, 139}, {57, 220, 205}, {59, 235, 255}, {7, 193, 255}, {0, 152, 255}, {34, 87, 255}, {72, 85, 121}, {158, 158, 158}, {139, 125, 96} }; int color_index = 0; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const unsigned char* color = colors[color_index % 19]; color_index++; cv::Scalar cc(color[0], color[1], color[2]); fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cc, 2); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cc, -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov7(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolov7_pnnx.cpp ================================================ // Copyright 2022 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static inline float 
intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return static_cast(1.f / (1.f + exp(-x))); } static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector& objects) { const int num_grid_x = feat_blob.w; const int num_grid_y = feat_blob.h; const int num_anchors = anchors.w / 2; const int num_class = 80; for (int q = 0; q < num_anchors; q++) { const float 
anchor_w = anchors[q * 2]; const float anchor_h = anchors[q * 2 + 1]; for (int i = 0; i < num_grid_y; i++) { for (int j = 0; j < num_grid_x; j++) { // find class index with max class score int class_index = 0; float class_score = -FLT_MAX; for (int k = 0; k < num_class; k++) { float score = feat_blob.channel(q * 85 + 5 + k).row(i)[j]; if (score > class_score) { class_index = k; class_score = score; } } float box_score = feat_blob.channel(q * 85 + 4).row(i)[j]; float confidence = sigmoid(box_score) * sigmoid(class_score); if (confidence >= prob_threshold) { // yolov5/models/yolo.py Detect forward // y = x[i].sigmoid() // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh float dx = sigmoid(feat_blob.channel(q * 85 + 0).row(i)[j]); float dy = sigmoid(feat_blob.channel(q * 85 + 1).row(i)[j]); float dw = sigmoid(feat_blob.channel(q * 85 + 2).row(i)[j]); float dh = sigmoid(feat_blob.channel(q * 85 + 3).row(i)[j]); float pb_cx = (dx * 2.f - 0.5f + j) * stride; float pb_cy = (dy * 2.f - 0.5f + i) * stride; float pb_w = pow(dw * 2.f, 2) * anchor_w; float pb_h = pow(dh * 2.f, 2) * anchor_h; float x0 = pb_cx - pb_w * 0.5f; float y0 = pb_cy - pb_h * 0.5f; float x1 = pb_cx + pb_w * 0.5f; float y1 = pb_cy + pb_h * 0.5f; Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = class_index; obj.prob = confidence; objects.push_back(obj); } } } } } static int detect_yolov7(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov7; yolov7.opt.use_vulkan_compute = true; // yolov7.opt.use_bf16_storage = true; // git clone https://github.com/WongKinYiu/yolov7 // cd yolov7 // wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt // python models/export.py --weights yolov7.pt // pnnx yolov7.torchscript.pt inputshape=[1,3,640,640] inputshape=[1,3,320,320] yolov7.load_param("yolov7.param"); 
yolov7.load_model("yolov7.bin"); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; int img_w = bgr.cols; int img_h = bgr.rows; // yolov5/models/common.py DetectMultiBackend const int max_stride = 64; // letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // pad to target_size rectangle // yolov5/utils/datasets.py letterbox int wpad = (w + max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolov7.create_extractor(); ex.input("in0", in_pad); std::vector proposals; // anchor setting from yolov5/models/yolov5s.yaml // stride 8 { ncnn::Mat out; ex.extract("out0", out); ncnn::Mat anchors(6); anchors[0] = 12.f; anchors[1] = 16.f; anchors[2] = 19.f; anchors[3] = 36.f; anchors[4] = 40.f; anchors[5] = 28.f; std::vector objects8; generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8); proposals.insert(proposals.end(), objects8.begin(), objects8.end()); } // stride 16 { ncnn::Mat out; ex.extract("out1", out); ncnn::Mat anchors(6); anchors[0] = 36.f; anchors[1] = 75.f; anchors[2] = 76.f; anchors[3] = 55.f; anchors[4] = 72.f; anchors[5] = 146.f; std::vector objects16; generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16); proposals.insert(proposals.end(), objects16.begin(), objects16.end()); } // stride 32 { ncnn::Mat out; ex.extract("out2", out); ncnn::Mat anchors(6); anchors[0] = 142.f; 
anchors[1] = 110.f; anchors[2] = 192.f; anchors[3] = 243.f; anchors[4] = 459.f; anchors[5] = 401.f; std::vector objects32; generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32); proposals.insert(proposals.end(), objects32.begin(), objects32.end()); } // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", 
"toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov7(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolov8.cpp ================================================ // Copyright 2024 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. export yolov8 torchscript // yolo export model=yolov8n.pt format=torchscript // 3. convert torchscript with static shape // pnnx yolov8n.torchscript // 4. modify yolov8n_pnnx.py for dynamic shape inference // A. 
modify reshape to support dynamic image sizes // B. permute tensor before concat and adjust concat axis // C. drop post-process part // before: // v_165 = v_142.view(1, 144, 6400) // v_166 = v_153.view(1, 144, 1600) // v_167 = v_164.view(1, 144, 400) // v_168 = torch.cat((v_165, v_166, v_167), dim=2) // ... // after: // v_165 = v_142.view(1, 144, -1).transpose(1, 2) // v_166 = v_153.view(1, 144, -1).transpose(1, 2) // v_167 = v_164.view(1, 144, -1).transpose(1, 2) // v_168 = torch.cat((v_165, v_166, v_167), dim=1) // return v_168 // 5. re-export yolov8 torchscript // python3 -c 'import yolov8n_pnnx; yolov8n_pnnx.export_torchscript()' // 6. convert new torchscript with dynamic shape // pnnx yolov8n_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320] // 7. now you get ncnn model files // mv yolov8n_pnnx.py.ncnn.param yolov8n.ncnn.param // mv yolov8n_pnnx.py.ncnn.bin yolov8n.ncnn.bin // the out blob would be a 2-dim tensor with w=144 h=8400 // // | bbox-reg 16 x 4 | per-class scores(80) | // +-----+-----+-----+-----+----------------------+ // | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......| // all /| | | | | . | // boxes | .. | .. | .. | .. |0.0 0.9 0.0 0.0 ......| // (8400)| | | | | . | // \| | | | | . 
| // +-----+-----+-----+-----+----------------------+ // #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } // #pragma omp parallel sections { // #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } // #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = objects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = objects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = objects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = objects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { 
const int w = in_pad.w; const int h = in_pad.h; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int reg_max_1 = 16; const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO for (int y = 0; y < num_grid_y; y++) { for (int x = 0; x < num_grid_x; x++) { const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); // find label with max score int label = -1; float score = -FLT_MAX; { const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class); for (int k = 0; k < num_class; k++) { float s = pred_score[k]; if (s > score) { label = k; score = s; } } score = sigmoid(score); } if (score >= prob_threshold) { ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4); { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; pd.set(0, 1); // axis pd.set(1, 1); softmax->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; softmax->create_pipeline(opt); softmax->forward_inplace(pred_bbox, opt); softmax->destroy_pipeline(opt); delete softmax; } float pred_ltrb[4]; for (int k = 0; k < 4; k++) { float dis = 0.f; const float* dis_after_sm = pred_bbox.row(k); for (int l = 0; l < reg_max_1; l++) { dis += l * dis_after_sm[l]; } pred_ltrb[k] = dis * stride; } float pb_cx = (x + 0.5f) * stride; float pb_cy = (y + 0.5f) * stride; float x0 = pb_cx - pred_ltrb[0]; float y0 = pb_cy - pred_ltrb[1]; float x1 = pb_cx + pred_ltrb[2]; float y1 = pb_cy + pred_ltrb[3]; Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = label; obj.prob = score; objects.push_back(obj); } } } } static void generate_proposals(const ncnn::Mat& pred, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; int pred_row_offset = 0; for (size_t i = 0; i < strides.size(); i++) { const int stride = strides[i]; const int num_grid_x = w / 
stride; const int num_grid_y = h / stride; const int num_grid = num_grid_x * num_grid_y; generate_proposals(pred.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects); pred_row_offset += num_grid; } } static int detect_yolov8(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov8; yolov8.opt.use_vulkan_compute = true; // yolov8.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets yolov8.load_param("yolov8n.ncnn.param"); yolov8.load_model("yolov8n.ncnn.bin"); // yolov8.load_param("yolov8s.ncnn.param"); // yolov8.load_model("yolov8s.ncnn.bin"); // yolov8.load_param("yolov8m.ncnn.param"); // yolov8.load_model("yolov8m.ncnn.bin"); // if you use oiv7 models, you shall call draw_objects_oiv() instead // yolov8.load_param("yolov8n_oiv7.ncnn.param"); // yolov8.load_model("yolov8n_oiv7.ncnn.bin"); // yolov8.load_param("yolov8s_oiv7.ncnn.param"); // yolov8.load_model("yolov8s_oiv7.ncnn.bin"); // yolov8.load_param("yolov8m_oiv7.ncnn.param"); // yolov8.load_model("yolov8m_oiv7.ncnn.bin"); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; int img_w = bgr.cols; int img_h = bgr.rows; // ultralytics/cfg/models/v8/yolov8.yaml std::vector strides(3); strides[0] = 8; strides[1] = 16; strides[2] = 32; const int max_stride = 32; // letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = (w + max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, 
ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolov8.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); std::vector proposals; generate_proposals(out, strides, in_pad, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects_coco(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", 
"pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; static cv::Scalar colors[] = { cv::Scalar(244, 67, 54), cv::Scalar(233, 30, 99), cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 19]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, color); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } static void draw_objects_oiv(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "Accordion", "Adhesive tape", "Aircraft", "Airplane", "Alarm clock", "Alpaca", "Ambulance", 
"Animal", "Ant", "Antelope", "Apple", "Armadillo", "Artichoke", "Auto part", "Axe", "Backpack", "Bagel", "Baked goods", "Balance beam", "Ball", "Balloon", "Banana", "Band-aid", "Banjo", "Barge", "Barrel", "Baseball bat", "Baseball glove", "Bat (Animal)", "Bathroom accessory", "Bathroom cabinet", "Bathtub", "Beaker", "Bear", "Bed", "Bee", "Beehive", "Beer", "Beetle", "Bell pepper", "Belt", "Bench", "Bicycle", "Bicycle helmet", "Bicycle wheel", "Bidet", "Billboard", "Billiard table", "Binoculars", "Bird", "Blender", "Blue jay", "Boat", "Bomb", "Book", "Bookcase", "Boot", "Bottle", "Bottle opener", "Bow and arrow", "Bowl", "Bowling equipment", "Box", "Boy", "Brassiere", "Bread", "Briefcase", "Broccoli", "Bronze sculpture", "Brown bear", "Building", "Bull", "Burrito", "Bus", "Bust", "Butterfly", "Cabbage", "Cabinetry", "Cake", "Cake stand", "Calculator", "Camel", "Camera", "Can opener", "Canary", "Candle", "Candy", "Cannon", "Canoe", "Cantaloupe", "Car", "Carnivore", "Carrot", "Cart", "Cassette deck", "Castle", "Cat", "Cat furniture", "Caterpillar", "Cattle", "Ceiling fan", "Cello", "Centipede", "Chainsaw", "Chair", "Cheese", "Cheetah", "Chest of drawers", "Chicken", "Chime", "Chisel", "Chopsticks", "Christmas tree", "Clock", "Closet", "Clothing", "Coat", "Cocktail", "Cocktail shaker", "Coconut", "Coffee", "Coffee cup", "Coffee table", "Coffeemaker", "Coin", "Common fig", "Common sunflower", "Computer keyboard", "Computer monitor", "Computer mouse", "Container", "Convenience store", "Cookie", "Cooking spray", "Corded phone", "Cosmetics", "Couch", "Countertop", "Cowboy hat", "Crab", "Cream", "Cricket ball", "Crocodile", "Croissant", "Crown", "Crutch", "Cucumber", "Cupboard", "Curtain", "Cutting board", "Dagger", "Dairy Product", "Deer", "Desk", "Dessert", "Diaper", "Dice", "Digital clock", "Dinosaur", "Dishwasher", "Dog", "Dog bed", "Doll", "Dolphin", "Door", "Door handle", "Doughnut", "Dragonfly", "Drawer", "Dress", "Drill (Tool)", "Drink", "Drinking straw", "Drum", 
"Duck", "Dumbbell", "Eagle", "Earrings", "Egg (Food)", "Elephant", "Envelope", "Eraser", "Face powder", "Facial tissue holder", "Falcon", "Fashion accessory", "Fast food", "Fax", "Fedora", "Filing cabinet", "Fire hydrant", "Fireplace", "Fish", "Flag", "Flashlight", "Flower", "Flowerpot", "Flute", "Flying disc", "Food", "Food processor", "Football", "Football helmet", "Footwear", "Fork", "Fountain", "Fox", "French fries", "French horn", "Frog", "Fruit", "Frying pan", "Furniture", "Garden Asparagus", "Gas stove", "Giraffe", "Girl", "Glasses", "Glove", "Goat", "Goggles", "Goldfish", "Golf ball", "Golf cart", "Gondola", "Goose", "Grape", "Grapefruit", "Grinder", "Guacamole", "Guitar", "Hair dryer", "Hair spray", "Hamburger", "Hammer", "Hamster", "Hand dryer", "Handbag", "Handgun", "Harbor seal", "Harmonica", "Harp", "Harpsichord", "Hat", "Headphones", "Heater", "Hedgehog", "Helicopter", "Helmet", "High heels", "Hiking equipment", "Hippopotamus", "Home appliance", "Honeycomb", "Horizontal bar", "Horse", "Hot dog", "House", "Houseplant", "Human arm", "Human beard", "Human body", "Human ear", "Human eye", "Human face", "Human foot", "Human hair", "Human hand", "Human head", "Human leg", "Human mouth", "Human nose", "Humidifier", "Ice cream", "Indoor rower", "Infant bed", "Insect", "Invertebrate", "Ipod", "Isopod", "Jacket", "Jacuzzi", "Jaguar (Animal)", "Jeans", "Jellyfish", "Jet ski", "Jug", "Juice", "Kangaroo", "Kettle", "Kitchen & dining room table", "Kitchen appliance", "Kitchen knife", "Kitchen utensil", "Kitchenware", "Kite", "Knife", "Koala", "Ladder", "Ladle", "Ladybug", "Lamp", "Land vehicle", "Lantern", "Laptop", "Lavender (Plant)", "Lemon", "Leopard", "Light bulb", "Light switch", "Lighthouse", "Lily", "Limousine", "Lion", "Lipstick", "Lizard", "Lobster", "Loveseat", "Luggage and bags", "Lynx", "Magpie", "Mammal", "Man", "Mango", "Maple", "Maracas", "Marine invertebrates", "Marine mammal", "Measuring cup", "Mechanical fan", "Medical equipment", "Microphone", 
"Microwave oven", "Milk", "Miniskirt", "Mirror", "Missile", "Mixer", "Mixing bowl", "Mobile phone", "Monkey", "Moths and butterflies", "Motorcycle", "Mouse", "Muffin", "Mug", "Mule", "Mushroom", "Musical instrument", "Musical keyboard", "Nail (Construction)", "Necklace", "Nightstand", "Oboe", "Office building", "Office supplies", "Orange", "Organ (Musical Instrument)", "Ostrich", "Otter", "Oven", "Owl", "Oyster", "Paddle", "Palm tree", "Pancake", "Panda", "Paper cutter", "Paper towel", "Parachute", "Parking meter", "Parrot", "Pasta", "Pastry", "Peach", "Pear", "Pen", "Pencil case", "Pencil sharpener", "Penguin", "Perfume", "Person", "Personal care", "Personal flotation device", "Piano", "Picnic basket", "Picture frame", "Pig", "Pillow", "Pineapple", "Pitcher (Container)", "Pizza", "Pizza cutter", "Plant", "Plastic bag", "Plate", "Platter", "Plumbing fixture", "Polar bear", "Pomegranate", "Popcorn", "Porch", "Porcupine", "Poster", "Potato", "Power plugs and sockets", "Pressure cooker", "Pretzel", "Printer", "Pumpkin", "Punching bag", "Rabbit", "Raccoon", "Racket", "Radish", "Ratchet (Device)", "Raven", "Rays and skates", "Red panda", "Refrigerator", "Remote control", "Reptile", "Rhinoceros", "Rifle", "Ring binder", "Rocket", "Roller skates", "Rose", "Rugby ball", "Ruler", "Salad", "Salt and pepper shakers", "Sandal", "Sandwich", "Saucer", "Saxophone", "Scale", "Scarf", "Scissors", "Scoreboard", "Scorpion", "Screwdriver", "Sculpture", "Sea lion", "Sea turtle", "Seafood", "Seahorse", "Seat belt", "Segway", "Serving tray", "Sewing machine", "Shark", "Sheep", "Shelf", "Shellfish", "Shirt", "Shorts", "Shotgun", "Shower", "Shrimp", "Sink", "Skateboard", "Ski", "Skirt", "Skull", "Skunk", "Skyscraper", "Slow cooker", "Snack", "Snail", "Snake", "Snowboard", "Snowman", "Snowmobile", "Snowplow", "Soap dispenser", "Sock", "Sofa bed", "Sombrero", "Sparrow", "Spatula", "Spice rack", "Spider", "Spoon", "Sports equipment", "Sports uniform", "Squash (Plant)", "Squid", "Squirrel", 
"Stairs", "Stapler", "Starfish", "Stationary bicycle", "Stethoscope", "Stool", "Stop sign", "Strawberry", "Street light", "Stretcher", "Studio couch", "Submarine", "Submarine sandwich", "Suit", "Suitcase", "Sun hat", "Sunglasses", "Surfboard", "Sushi", "Swan", "Swim cap", "Swimming pool", "Swimwear", "Sword", "Syringe", "Table", "Table tennis racket", "Tablet computer", "Tableware", "Taco", "Tank", "Tap", "Tart", "Taxi", "Tea", "Teapot", "Teddy bear", "Telephone", "Television", "Tennis ball", "Tennis racket", "Tent", "Tiara", "Tick", "Tie", "Tiger", "Tin can", "Tire", "Toaster", "Toilet", "Toilet paper", "Tomato", "Tool", "Toothbrush", "Torch", "Tortoise", "Towel", "Tower", "Toy", "Traffic light", "Traffic sign", "Train", "Training bench", "Treadmill", "Tree", "Tree house", "Tripod", "Trombone", "Trousers", "Truck", "Trumpet", "Turkey", "Turtle", "Umbrella", "Unicycle", "Van", "Vase", "Vegetable", "Vehicle", "Vehicle registration plate", "Violin", "Volleyball (Ball)", "Waffle", "Waffle iron", "Wall clock", "Wardrobe", "Washing machine", "Waste container", "Watch", "Watercraft", "Watermelon", "Weapon", "Whale", "Wheel", "Wheelchair", "Whisk", "Whiteboard", "Willow", "Window", "Window blind", "Wine", "Wine glass", "Wine rack", "Winter melon", "Wok", "Woman", "Wood-burning stove", "Woodpecker", "Worm", "Wrench", "Zebra", "Zucchini" }; static cv::Scalar colors[] = { cv::Scalar(244, 67, 54), cv::Scalar(233, 30, 99), cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& 
color = colors[i % 19]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, color); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov8(m, objects); draw_objects_coco(m, objects); // draw_objects_oiv(m, objects); return 0; } ================================================ FILE: examples/yolov8_cls.cpp ================================================ // Copyright 2024 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. export yolov8-cls torchscript // yolo export model=yolov8n-cls.pt format=torchscript // 3. convert torchscript with static shape // pnnx yolov8n-cls.torchscript // 4. 
now you get ncnn model files // yolov8n_cls.ncnn.param // yolov8n_cls.ncnn.bin #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct Object { int label; float prob; }; static void get_topk(const ncnn::Mat& cls_scores, int topk, std::vector& objects) { // partial sort topk with index int size = cls_scores.w; std::vector > vec; vec.resize(size); for (int i = 0; i < size; i++) { vec[i] = std::make_pair(cls_scores[i], i); } std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater >()); objects.resize(topk); for (int i = 0; i < topk; i++) { objects[i].label = vec[i].second; objects[i].prob = vec[i].first; } } static int detect_yolov8_cls(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov8; yolov8.opt.use_vulkan_compute = true; // yolov8.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets yolov8.load_param("yolov8n_cls.ncnn.param"); yolov8.load_model("yolov8n_cls.ncnn.bin"); // yolov8.load_param("yolov8s_cls.ncnn.param"); // yolov8.load_model("yolov8s_cls.ncnn.bin"); // yolov8.load_param("yolov8m_cls.ncnn.param"); // yolov8.load_model("yolov8m_cls.ncnn.bin"); const int target_size = 224; const int topk = 5; int img_w = bgr.cols; int img_h = bgr.rows; // letterbox pad int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = target_size - w; int hpad = target_size - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, 
norm_vals); ncnn::Extractor ex = yolov8.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); // return top-5 get_topk(out, topk, objects); return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "tench", "goldfish", "great white shark", "tiger shark", "hammerhead", "electric ray", "stingray", "cock", "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", "indigo bunting", "robin", "bulbul", "jay", "magpie", "chickadee", "water ouzel", "kite", "bald eagle", "vulture", "great grey owl", "European fire salamander", "common newt", "eft", "spotted salamander", "axolotl", "bullfrog", "tree frog", "tailed frog", "loggerhead", "leatherback turtle", "mud turtle", "terrapin", "box turtle", "banded gecko", "common iguana", "American chameleon", "whiptail", "agama", "frilled lizard", "alligator lizard", "Gila monster", "green lizard", "African chameleon", "Komodo dragon", "African crocodile", "American alligator", "triceratops", "thunder snake", "ringneck snake", "hognose snake", "green snake", "king snake", "garter snake", "water snake", "vine snake", "night snake", "boa constrictor", "rock python", "Indian cobra", "green mamba", "sea snake", "horned viper", "diamondback", "sidewinder", "trilobite", "harvestman", "scorpion", "black and gold garden spider", "barn spider", "garden spider", "black widow", "tarantula", "wolf spider", "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse", "prairie chicken", "peacock", "quail", "partridge", "African grey", "macaw", "sulphur-crested cockatoo", "lorikeet", "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "drake", "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", "koala", "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", "snail", "slug", "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", 
"fiddler crab", "king crab", "American lobster", "spiny lobster", "crayfish", "hermit crab", "isopod", "white stork", "black stork", "spoonbill", "flamingo", "little blue heron", "American egret", "bittern", "crane (bird)", "limpkin", "European gallinule", "American coot", "bustard", "ruddy turnstone", "red-backed sandpiper", "redshank", "dowitcher", "oystercatcher", "pelican", "king penguin", "albatross", "grey whale", "killer whale", "dugong", "sea lion", "Chihuahua", "Japanese spaniel", "Maltese dog", "Pekinese", "Shih-Tzu", "Blenheim spaniel", "papillon", "toy terrier", "Rhodesian ridgeback", "Afghan hound", "basset", "beagle", "bloodhound", "bluetick", "black-and-tan coonhound", "Walker hound", "English foxhound", "redbone", "borzoi", "Irish wolfhound", "Italian greyhound", "whippet", "Ibizan hound", "Norwegian elkhound", "otterhound", "Saluki", "Scottish deerhound", "Weimaraner", "Staffordshire bullterrier", "American Staffordshire terrier", "Bedlington terrier", "Border terrier", "Kerry blue terrier", "Irish terrier", "Norfolk terrier", "Norwich terrier", "Yorkshire terrier", "wire-haired fox terrier", "Lakeland terrier", "Sealyham terrier", "Airedale", "cairn", "Australian terrier", "Dandie Dinmont", "Boston bull", "miniature schnauzer", "giant schnauzer", "standard schnauzer", "Scotch terrier", "Tibetan terrier", "silky terrier", "soft-coated wheaten terrier", "West Highland white terrier", "Lhasa", "flat-coated retriever", "curly-coated retriever", "golden retriever", "Labrador retriever", "Chesapeake Bay retriever", "German short-haired pointer", "vizsla", "English setter", "Irish setter", "Gordon setter", "Brittany spaniel", "clumber", "English springer", "Welsh springer spaniel", "cocker spaniel", "Sussex spaniel", "Irish water spaniel", "kuvasz", "schipperke", "groenendael", "malinois", "briard", "kelpie", "komondor", "Old English sheepdog", "Shetland sheepdog", "collie", "Border collie", "Bouvier des Flandres", "Rottweiler", "German shepherd", 
"Doberman", "miniature pinscher", "Greater Swiss Mountain dog", "Bernese mountain dog", "Appenzeller", "EntleBucher", "boxer", "bull mastiff", "Tibetan mastiff", "French bulldog", "Great Dane", "Saint Bernard", "Eskimo dog", "malamute", "Siberian husky", "dalmatian", "affenpinscher", "basenji", "pug", "Leonberg", "Newfoundland", "Great Pyrenees", "Samoyed", "Pomeranian", "chow", "keeshond", "Brabancon griffon", "Pembroke", "Cardigan", "toy poodle", "miniature poodle", "standard poodle", "Mexican hairless", "timber wolf", "white wolf", "red wolf", "coyote", "dingo", "dhole", "African hunting dog", "hyena", "red fox", "kit fox", "Arctic fox", "grey fox", "tabby", "tiger cat", "Persian cat", "Siamese cat", "Egyptian cat", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", "cheetah", "brown bear", "American black bear", "ice bear", "sloth bear", "mongoose", "meerkat", "tiger beetle", "ladybug", "ground beetle", "long-horned beetle", "leaf beetle", "dung beetle", "rhinoceros beetle", "weevil", "fly", "bee", "ant", "grasshopper", "cricket", "walking stick", "cockroach", "mantis", "cicada", "leafhopper", "lacewing", "dragonfly", "damselfly", "admiral", "ringlet", "monarch", "cabbage butterfly", "sulphur butterfly", "lycaenid", "starfish", "sea urchin", "sea cucumber", "wood rabbit", "hare", "Angora", "hamster", "porcupine", "fox squirrel", "marmot", "beaver", "guinea pig", "sorrel", "zebra", "hog", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo", "bison", "ram", "bighorn", "ibex", "hartebeest", "impala", "gazelle", "Arabian camel", "llama", "weasel", "mink", "polecat", "black-footed ferret", "otter", "skunk", "badger", "armadillo", "three-toed sloth", "orangutan", "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas", "baboon", "macaque", "langur", "colobus", "proboscis monkey", "marmoset", "capuchin", "howler monkey", "titi", "spider monkey", "squirrel monkey", "Madagascar cat", "indri", "Indian elephant", "African elephant", 
"lesser panda", "giant panda", "barracouta", "eel", "coho", "rock beauty", "anemone fish", "sturgeon", "gar", "lionfish", "puffer", "abacus", "abaya", "academic gown", "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance", "amphibian", "analog clock", "apiary", "apron", "ashcan", "assault rifle", "backpack", "bakery", "balance beam", "balloon", "ballpoint", "Band Aid", "banjo", "bannister", "barbell", "barber chair", "barbershop", "barn", "barometer", "barrel", "barrow", "baseball", "basketball", "bassinet", "bassoon", "bathing cap", "bath towel", "bathtub", "beach wagon", "beacon", "beaker", "bearskin", "beer bottle", "beer glass", "bell cote", "bib", "bicycle-built-for-two", "bikini", "binder", "binoculars", "birdhouse", "boathouse", "bobsled", "bolo tie", "bonnet", "bookcase", "bookshop", "bottlecap", "bow", "bow tie", "brass", "brassiere", "breakwater", "breastplate", "broom", "bucket", "buckle", "bulletproof vest", "bullet train", "butcher shop", "cab", "caldron", "candle", "cannon", "canoe", "can opener", "cardigan", "car mirror", "carousel", "carpenter's kit", "carton", "car wheel", "cash machine", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello", "cellular telephone", "chain", "chainlink fence", "chain mail", "chain saw", "chest", "chiffonier", "chime", "china cabinet", "Christmas stocking", "church", "cinema", "cleaver", "cliff dwelling", "cloak", "clog", "cocktail shaker", "coffee mug", "coffeepot", "coil", "combination lock", "computer keyboard", "confectionery", "container ship", "convertible", "corkscrew", "cornet", "cowboy boot", "cowboy hat", "cradle", "crane (machine)", "crash helmet", "crate", "crib", "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "dial telephone", "diaper", "digital clock", "digital watch", "dining table", "dishrag", "dishwasher", "disk brake", "dock", "dogsled", "dome", "doormat", "drilling platform", "drum", "drumstick", 
"dumbbell", "Dutch oven", "electric fan", "electric guitar", "electric locomotive", "entertainment center", "envelope", "espresso maker", "face powder", "feather boa", "file", "fireboat", "fire engine", "fire screen", "flagpole", "flute", "folding chair", "football helmet", "forklift", "fountain", "fountain pen", "four-poster", "freight car", "French horn", "frying pan", "fur coat", "garbage truck", "gasmask", "gas pump", "goblet", "go-kart", "golf ball", "golfcart", "gondola", "gong", "gown", "grand piano", "greenhouse", "grille", "grocery store", "guillotine", "hair slide", "hair spray", "half track", "hammer", "hamper", "hand blower", "hand-held computer", "handkerchief", "hard disc", "harmonica", "harp", "harvester", "hatchet", "holster", "home theater", "honeycomb", "hook", "hoopskirt", "horizontal bar", "horse cart", "hourglass", "iPod", "iron", "jack-o'-lantern", "jean", "jeep", "jersey", "jigsaw puzzle", "jinrikisha", "joystick", "kimono", "knee pad", "knot", "lab coat", "ladle", "lampshade", "laptop", "lawn mower", "lens cap", "letter opener", "library", "lifeboat", "lighter", "limousine", "liner", "lipstick", "Loafer", "lotion", "loudspeaker", "loupe", "lumbermill", "magnetic compass", "mailbag", "mailbox", "maillot (tights)", "maillot (tank suit)", "manhole cover", "maraca", "marimba", "mask", "matchstick", "maypole", "maze", "measuring cup", "medicine chest", "megalith", "microphone", "microwave", "military uniform", "milk can", "minibus", "miniskirt", "minivan", "missile", "mitten", "mixing bowl", "mobile home", "Model T", "modem", "monastery", "monitor", "moped", "mortar", "mortarboard", "mosque", "mosquito net", "motor scooter", "mountain bike", "mountain tent", "mouse", "mousetrap", "moving van", "muzzle", "nail", "neck brace", "necklace", "nipple", "notebook", "obelisk", "oboe", "ocarina", "odometer", "oil filter", "organ", "oscilloscope", "overskirt", "oxcart", "oxygen mask", "packet", "paddle", "paddlewheel", "padlock", "paintbrush", "pajama", 
"palace", "panpipe", "paper towel", "parachute", "parallel bars", "park bench", "parking meter", "passenger car", "patio", "pay-phone", "pedestal", "pencil box", "pencil sharpener", "perfume", "Petri dish", "photocopier", "pick", "pickelhaube", "picket fence", "pickup", "pier", "piggy bank", "pill bottle", "pillow", "ping-pong ball", "pinwheel", "pirate", "pitcher", "plane", "planetarium", "plastic bag", "plate rack", "plow", "plunger", "Polaroid camera", "pole", "police van", "poncho", "pool table", "pop bottle", "pot", "potter's wheel", "power drill", "prayer rug", "printer", "prison", "projectile", "projector", "puck", "punching bag", "purse", "quill", "quilt", "racer", "racket", "radiator", "radio", "radio telescope", "rain barrel", "recreational vehicle", "reel", "reflex camera", "refrigerator", "remote control", "restaurant", "revolver", "rifle", "rocking chair", "rotisserie", "rubber eraser", "rugby ball", "rule", "running shoe", "safe", "safety pin", "saltshaker", "sandal", "sarong", "sax", "scabbard", "scale", "school bus", "schooner", "scoreboard", "screen", "screw", "screwdriver", "seat belt", "sewing machine", "shield", "shoe shop", "shoji", "shopping basket", "shopping cart", "shovel", "shower cap", "shower curtain", "ski", "ski mask", "sleeping bag", "slide rule", "sliding door", "slot", "snorkel", "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", "solar dish", "sombrero", "soup bowl", "space bar", "space heater", "space shuttle", "spatula", "speedboat", "spider web", "spindle", "sports car", "spotlight", "stage", "steam locomotive", "steel arch bridge", "steel drum", "stethoscope", "stole", "stone wall", "stopwatch", "stove", "strainer", "streetcar", "stretcher", "studio couch", "stupa", "submarine", "suit", "sundial", "sunglass", "sunglasses", "sunscreen", "suspension bridge", "swab", "sweatshirt", "swimming trunks", "swing", "switch", "syringe", "table lamp", "tank", "tape player", "teapot", "teddy", "television", "tennis ball", 
"thatch", "theater curtain", "thimble", "thresher", "throne", "tile roof", "toaster", "tobacco shop", "toilet seat", "torch", "totem pole", "tow truck", "toyshop", "tractor", "trailer truck", "tray", "trench coat", "tricycle", "trimaran", "tripod", "triumphal arch", "trolleybus", "trombone", "tub", "turnstile", "typewriter keyboard", "umbrella", "unicycle", "upright", "vacuum", "vase", "vault", "velvet", "vending machine", "vestment", "viaduct", "violin", "volleyball", "waffle iron", "wall clock", "wallet", "wardrobe", "warplane", "washbasin", "washer", "water bottle", "water jug", "water tower", "whiskey jug", "whistle", "wig", "window screen", "window shade", "Windsor tie", "wine bottle", "wing", "wok", "wooden spoon", "wool", "worm fence", "wreck", "yawl", "yurt", "web site", "comic book", "crossword puzzle", "street sign", "traffic light", "book jacket", "menu", "plate", "guacamole", "consomme", "hot pot", "trifle", "ice cream", "ice lolly", "French loaf", "bagel", "pretzel", "cheeseburger", "hotdog", "mashed potato", "head cabbage", "broccoli", "cauliflower", "zucchini", "spaghetti squash", "acorn squash", "butternut squash", "cucumber", "artichoke", "bell pepper", "cardoon", "mushroom", "Granny Smith", "strawberry", "orange", "lemon", "fig", "pineapple", "banana", "jackfruit", "custard apple", "pomegranate", "hay", "carbonara", "chocolate sauce", "dough", "meat loaf", "pizza", "potpie", "burrito", "red wine", "espresso", "cup", "eggnog", "alp", "bubble", "cliff", "coral reef", "geyser", "lakeside", "promontory", "sandbar", "seashore", "valley", "volcano", "ballplayer", "groom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn", "hip", "buckeye", "coral fungus", "agaric", "gyromitra", "stinkhorn", "earthstar", "hen-of-the-woods", "bolete", "ear", "toilet tissue" }; cv::Mat image = bgr.clone(); int y_offset = 0; for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f\n", obj.label, 
obj.prob); char text[256]; sprintf(text, "%4.1f%% %s", obj.prob * 100, class_names[obj.label]); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = 0; int y = y_offset; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); y_offset += label_size.height; } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov8_cls(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolov8_obb.cpp ================================================ // Copyright 2024 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. export yolov8-obb torchscript // yolo export model=yolov8n-obb.pt format=torchscript // 3. convert torchscript with static shape // pnnx yolov8n-obb.torchscript // 4. modify yolov8n_obb_pnnx.py for dynamic shape inference // A. modify reshape to support dynamic image sizes // B. permute tensor before concat and adjust concat axis // C. drop post-process part // before: // v_137 = v_136.view(1, 1, 16384) // v_143 = v_142.view(1, 1, 4096) // v_149 = v_148.view(1, 1, 1024) // v_150 = torch.cat((v_137, v_143, v_149), dim=2) // ... // v_186 = v_163.view(1, 79, 16384) // v_187 = v_174.view(1, 79, 4096) // v_188 = v_185.view(1, 79, 1024) // v_189 = torch.cat((v_186, v_187, v_188), dim=2) // ... 
// after: // v_137 = v_136.view(1, 1, -1).transpose(1, 2) // v_143 = v_142.view(1, 1, -1).transpose(1, 2) // v_149 = v_148.view(1, 1, -1).transpose(1, 2) // v_150 = torch.cat((v_137, v_143, v_149), dim=1) // ... // v_186 = v_163.view(1, 79, -1).transpose(1, 2) // v_187 = v_174.view(1, 79, -1).transpose(1, 2) // v_188 = v_185.view(1, 79, -1).transpose(1, 2) // v_189 = torch.cat((v_186, v_187, v_188), dim=1) // return v_189, v_150 // 5. re-export yolov8-obb torchscript // python3 -c 'import yolov8n_obb_pnnx; yolov8n_obb_pnnx.export_torchscript()' // 6. convert new torchscript with dynamic shape // pnnx yolov8n_obb_pnnx.py.pt inputshape=[1,3,1024,1024] inputshape2=[1,3,512,512] // 7. now you get ncnn model files // mv yolov8n_obb_pnnx.py.ncnn.param yolov8n_obb.ncnn.param // mv yolov8n_obb_pnnx.py.ncnn.bin yolov8n_obb.ncnn.bin // the out blob would be a 2-dim tensor with w=79 h=21504 // // | bbox-reg 16 x 4 |score(15)| // +-----+-----+-----+-----+---------+ // | dx0 | dy0 | dx1 | dy1 | 0.1 ... | // all /| | | | | ... | // boxes | .. | .. | .. | .. | 0.0 ... | // (21504)| | | | | . ... | // \| | | | | . ... | // +-----+-----+-----+-----+---------+ // // the out blob would be a 2-dim tensor with w=1 h=21504 // // | degree(1)| // +----------+ // | 0.1 | // all /| | // boxes | 0.0 | // (21504)| . | // \| . 
| // +----------+ // #include "layer.h" #include "net.h" #include #include #include #include #include #include #include struct Object { cv::RotatedRect rrect; int label; float prob; }; static inline float intersection_area(const Object& a, const Object& b) { std::vector intersection; cv::rotatedRectangleIntersection(a.rrect, b.rrect, intersection); if (intersection.empty()) return 0.f; return cv::contourArea(intersection); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } // #pragma omp parallel sections { // #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } // #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = objects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = objects[i].rrect.size.area(); } for (int i = 0; i < n; i++) { const Object& a = objects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = objects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area; if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, int stride, const 
ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int reg_max_1 = 16; const int num_class = pred.w - reg_max_1 * 4; // number of classes. 15 for DOTAv1 for (int y = 0; y < num_grid_y; y++) { for (int x = 0; x < num_grid_x; x++) { const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); // find label with max score int label = -1; float score = -FLT_MAX; { const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class); for (int k = 0; k < num_class; k++) { float s = pred_score[k]; if (s > score) { label = k; score = s; } } score = sigmoid(score); } if (score >= prob_threshold) { ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone(); { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; pd.set(0, 1); // axis pd.set(1, 1); softmax->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; softmax->create_pipeline(opt); softmax->forward_inplace(pred_bbox, opt); softmax->destroy_pipeline(opt); delete softmax; } float pred_ltrb[4]; for (int k = 0; k < 4; k++) { float dis = 0.f; const float* dis_after_sm = pred_bbox.row(k); for (int l = 0; l < reg_max_1; l++) { dis += l * dis_after_sm[l]; } pred_ltrb[k] = dis * stride; } float pb_cx = (x + 0.5f) * stride; float pb_cy = (y + 0.5f) * stride; const float angle = sigmoid(pred_angle.row(y * num_grid_x + x)[0]) - 0.25f; const float angle_rad = angle * 3.14159265358979323846f; const float angle_degree = angle * 180.f; float cos = cosf(angle_rad); float sin = sinf(angle_rad); float xx = (pred_ltrb[2] - pred_ltrb[0]) * 0.5f; float yy = (pred_ltrb[3] - pred_ltrb[1]) * 0.5f; float xr = xx * cos - yy * sin; float yr = xx * sin + yy * cos; const float cx = pb_cx + xr; const float cy = pb_cy + yr; const float ww = pred_ltrb[2] + pred_ltrb[0]; const float hh = pred_ltrb[3] + pred_ltrb[1]; Object obj; 
obj.rrect = cv::RotatedRect(cv::Point2f(cx, cy), cv::Size_(ww, hh), angle_degree); obj.label = label; obj.prob = score; objects.push_back(obj); } } } } static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; int pred_row_offset = 0; for (size_t i = 0; i < strides.size(); i++) { const int stride = strides[i]; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int num_grid = num_grid_x * num_grid_y; generate_proposals(pred.row_range(pred_row_offset, num_grid), pred_angle.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects); pred_row_offset += num_grid; } } static int detect_yolov8_obb(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov8; yolov8.opt.use_vulkan_compute = true; // yolov8.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets yolov8.load_param("yolov8n_obb.ncnn.param"); yolov8.load_model("yolov8n_obb.ncnn.bin"); // yolov8.load_param("yolov8s_obb.ncnn.param"); // yolov8.load_model("yolov8s_obb.ncnn.bin"); // yolov8.load_param("yolov8m_obb.ncnn.param"); // yolov8.load_model("yolov8m_obb.ncnn.bin"); const int target_size = 1024; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; int img_w = bgr.cols; int img_h = bgr.rows; // ultralytics/cfg/models/v8/yolov8.yaml std::vector strides(3); strides[0] = 8; strides[1] = 16; strides[2] = 32; const int max_stride = 32; // letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = (w 
+ max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolov8.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); ncnn::Mat out_angle; ex.extract("out1", out_angle); std::vector proposals; generate_proposals(out, out_angle, strides, in_pad, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); if (count == 0) return 0; objects.resize(count); for (int i = 0; i < count; i++) { Object obj = proposals[picked[i]]; // adjust offset to original unpadded obj.rrect.center.x = (obj.rrect.center.x - (wpad / 2)) / scale; obj.rrect.center.y = (obj.rrect.center.y - (hpad / 2)) / scale; obj.rrect.size.width = (obj.rrect.size.width) / scale; obj.rrect.size.height = (obj.rrect.size.height) / scale; objects[i] = obj; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "plane", "ship", "storage tank", "baseball diamond", "tennis court", "basketball court", "ground track field", "harbor", "bridge", "large vehicle", "small vehicle", "helicopter", "roundabout", "soccer ball field", "swimming pool" }; static const cv::Scalar colors[] = { cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), 
cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[obj.label]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f @ %.2f\n", obj.label, obj.prob, obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle); cv::Point2f corners[4]; obj.rrect.points(corners); cv::line(image, corners[0], corners[1], color); cv::line(image, corners[1], corners[2], color); cv::line(image, corners[2], corners[3], color); cv::line(image, corners[3], corners[0], color); } for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[obj.label]; char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rrect.center.x - label_size.width / 2; int y = obj.rrect.center.y - label_size.height / 2 - baseLine; if (y < 0) y = 0; if (y + label_size.height > image.rows) y = image.rows - label_size.height; if (x < 0) x = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov8_obb(m, objects); draw_objects(m, objects); return 0; } 
================================================ FILE: examples/yolov8_pose.cpp ================================================ // Copyright 2024 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. export yolov8-pose torchscript // yolo export model=yolov8n-pose.pt format=torchscript // 3. convert torchscript with static shape // pnnx yolov8n-pose.torchscript // 4. modify yolov8n_pose_pnnx.py for dynamic shape inference // A. modify reshape to support dynamic image sizes // B. permute tensor before concat and adjust concat axis // C. drop post-process part // before: // v_137 = v_136.view(1, 51, 6400) // v_143 = v_142.view(1, 51, 1600) // v_149 = v_148.view(1, 51, 400) // v_150 = torch.cat((v_137, v_143, v_149), dim=-1) // ... // v_184 = v_161.view(1, 65, 6400) // v_185 = v_172.view(1, 65, 1600) // v_186 = v_183.view(1, 65, 400) // v_187 = torch.cat((v_184, v_185, v_186), dim=2) // ... // after: // v_137 = v_136.view(1, 51, -1).transpose(1, 2) // v_143 = v_142.view(1, 51, -1).transpose(1, 2) // v_149 = v_148.view(1, 51, -1).transpose(1, 2) // v_150 = torch.cat((v_137, v_143, v_149), dim=1) // ... // v_184 = v_161.view(1, 65, -1).transpose(1, 2) // v_185 = v_172.view(1, 65, -1).transpose(1, 2) // v_186 = v_183.view(1, 65, -1).transpose(1, 2) // v_187 = torch.cat((v_184, v_185, v_186), dim=1) // return v_187, v_150 // 5. re-export yolov8-pose torchscript // python3 -c 'import yolov8n_pose_pnnx; yolov8n_pose_pnnx.export_torchscript()' // 6. convert new torchscript with dynamic shape // pnnx yolov8n_pose_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320] // 7. 
now you get ncnn model files // mv yolov8n_pose_pnnx.py.ncnn.param yolov8n_pose.ncnn.param // mv yolov8n_pose_pnnx.py.ncnn.bin yolov8n_pose.ncnn.bin // the out blob would be a 2-dim tensor with w=65 h=8400 // // | bbox-reg 16 x 4 |score(1)| // +-----+-----+-----+-----+--------+ // | dx0 | dy0 | dx1 | dy1 | 0.1 | // all /| | | | | | // boxes | .. | .. | .. | .. | 0.0 | // (8400)| | | | | . | // \| | | | | . | // +-----+-----+-----+-----+--------+ // // // | pose (51) | // +-----------+ // |0.1........| // all /| | // boxes |0.0........| // (8400)| . | // \| . | // +-----------+ // #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct KeyPoint { cv::Point2f p; float prob; }; struct Object { cv::Rect_ rect; int label; float prob; std::vector keypoints; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } // #pragma omp parallel sections { // #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } // #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = objects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = objects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = objects[i]; int keep = 1; for 
(int j = 0; j < (int)picked.size(); j++) { const Object& b = objects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int reg_max_1 = 16; const int num_points = pred_points.w / 3; for (int y = 0; y < num_grid_y; y++) { for (int x = 0; x < num_grid_x; x++) { const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); const ncnn::Mat pred_points_grid = pred_points.row_range(y * num_grid_x + x, 1).reshape(3, num_points); // find label with max score int label = 0; float score = sigmoid(pred_grid[reg_max_1 * 4]); if (score >= prob_threshold) { ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone(); { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; pd.set(0, 1); // axis pd.set(1, 1); softmax->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; softmax->create_pipeline(opt); softmax->forward_inplace(pred_bbox, opt); softmax->destroy_pipeline(opt); delete softmax; } float pred_ltrb[4]; for (int k = 0; k < 4; k++) { float dis = 0.f; const float* dis_after_sm = pred_bbox.row(k); for (int l = 0; l < reg_max_1; l++) { dis += l * dis_after_sm[l]; } pred_ltrb[k] = dis * stride; } float pb_cx = (x + 0.5f) * stride; float pb_cy = (y + 0.5f) * stride; float x0 = pb_cx - pred_ltrb[0]; float y0 = pb_cy - pred_ltrb[1]; float x1 = pb_cx + pred_ltrb[2]; float y1 
= pb_cy + pred_ltrb[3]; std::vector keypoints; for (int k = 0; k < num_points; k++) { KeyPoint keypoint; keypoint.p.x = (x + pred_points_grid.row(k)[0] * 2) * stride; keypoint.p.y = (y + pred_points_grid.row(k)[1] * 2) * stride; keypoint.prob = sigmoid(pred_points_grid.row(k)[2]); keypoints.push_back(keypoint); } Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = label; obj.prob = score; obj.keypoints = keypoints; objects.push_back(obj); } } } } static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; int pred_row_offset = 0; for (size_t i = 0; i < strides.size(); i++) { const int stride = strides[i]; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int num_grid = num_grid_x * num_grid_y; generate_proposals(pred.row_range(pred_row_offset, num_grid), pred_points.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects); pred_row_offset += num_grid; } } static int detect_yolov8_pose(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov8; yolov8.opt.use_vulkan_compute = true; // yolov8.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets yolov8.load_param("yolov8n_pose.ncnn.param"); yolov8.load_model("yolov8n_pose.ncnn.bin"); // yolov8.load_param("yolov8s_pose.ncnn.param"); // yolov8.load_model("yolov8s_pose.ncnn.bin"); // yolov8.load_param("yolov8m_pose.ncnn.param"); // yolov8.load_model("yolov8m_pose.ncnn.bin"); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; const float mask_threshold = 0.5f; int img_w = bgr.cols; int img_h = bgr.rows; // ultralytics/cfg/models/v8/yolov8.yaml std::vector strides(3); strides[0] = 8; strides[1] = 16; strides[2] = 32; const int max_stride = 32; 
// letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = (w + max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolov8.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); ncnn::Mat out_points; ex.extract("out1", out_points); std::vector proposals; generate_proposals(out, out_points, strides, in_pad, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); if (count == 0) return 0; const int num_points = out_points.w / 3; objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; for (int j = 0; j < num_points; j++) { objects[i].keypoints[j].p.x = (objects[i].keypoints[j].p.x - (wpad / 2)) / scale; objects[i].keypoints[j].p.y = (objects[i].keypoints[j].p.y - (hpad / 2)) / scale; } // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = 
std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = {"person"}; static const cv::Scalar colors[] = { cv::Scalar(244, 67, 54), cv::Scalar(233, 30, 99), cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 19]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); // draw bone static const int joint_pairs[16][2] = { {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16} }; static const cv::Scalar bone_colors[] = { cv::Scalar(0, 255, 0), cv::Scalar(0, 255, 0), cv::Scalar(0, 255, 0), cv::Scalar(0, 255, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 128, 0), cv::Scalar(255, 51, 255), cv::Scalar(255, 51, 255), cv::Scalar(255, 51, 255), cv::Scalar(51, 153, 255), cv::Scalar(51, 153, 255), cv::Scalar(51, 153, 255), cv::Scalar(51, 153, 255), }; for (int j = 0; j < 16; j++) { const KeyPoint& p1 = obj.keypoints[joint_pairs[j][0]]; const KeyPoint& p2 = obj.keypoints[joint_pairs[j][1]]; if 
(p1.prob < 0.2f || p2.prob < 0.2f) continue; cv::line(image, p1.p, p2.p, bone_colors[j], 2); } // draw joint for (size_t j = 0; j < obj.keypoints.size(); j++) { const KeyPoint& keypoint = obj.keypoints[j]; fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob); if (keypoint.prob < 0.2f) continue; cv::circle(image, keypoint.p, 3, color, -1); } cv::rectangle(image, obj.rect, color); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov8_pose(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolov8_seg.cpp ================================================ // Copyright 2024 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. export yolov8-seg torchscript // yolo export model=yolov8n-seg.pt format=torchscript // 3. convert torchscript with static shape // pnnx yolov8n-seg.torchscript // 4. modify yolov8n_seg_pnnx.py for dynamic shape inference // A. modify reshape to support dynamic image sizes // B. 
permute tensor before concat and adjust concat axis // C. drop post-process part // before: // v_144 = v_143.view(1, 32, 6400) // v_150 = v_149.view(1, 32, 1600) // v_156 = v_155.view(1, 32, 400) // v_157 = torch.cat((v_144, v_150, v_156), dim=2) // ... // v_191 = v_168.view(1, 144, 6400) // v_192 = v_179.view(1, 144, 1600) // v_193 = v_190.view(1, 144, 400) // v_194 = torch.cat((v_191, v_192, v_193), dim=2) // ... // v_215 = (v_214, v_138, ) // return v_215 // after: // v_144 = v_143.view(1, 32, -1).transpose(1, 2) // v_150 = v_149.view(1, 32, -1).transpose(1, 2) // v_156 = v_155.view(1, 32, -1).transpose(1, 2) // v_157 = torch.cat((v_144, v_150, v_156), dim=1) // ... // v_191 = v_168.view(1, 144, -1).transpose(1, 2) // v_192 = v_179.view(1, 144, -1).transpose(1, 2) // v_193 = v_190.view(1, 144, -1).transpose(1, 2) // v_194 = torch.cat((v_191, v_192, v_193), dim=1) // return v_194, v_157, v_138 // 5. re-export yolov8-seg torchscript // python3 -c 'import yolov8n_seg_pnnx; yolov8n_seg_pnnx.export_torchscript()' // 6. convert new torchscript with dynamic shape // pnnx yolov8n_seg_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320] // 7. now you get ncnn model files // mv yolov8n_seg_pnnx.py.ncnn.param yolov8n_seg.ncnn.param // mv yolov8n_seg_pnnx.py.ncnn.bin yolov8n_seg.ncnn.bin // the out blob would be a 2-dim tensor with w=144 h=8400 // // | bbox-reg 16 x 4 | per-class scores(80) | // +-----+-----+-----+-----+----------------------+ // | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......| // all /| | | | | . | // boxes | .. | .. | .. | .. |0.0 0.9 0.0 0.0 ......| // (8400)| | | | | . | // \| | | | | . | // +-----+-----+-----+-----+----------------------+ // // // | mask (32) | // +-----------+ // |0.1........| // all /| | // boxes |0.0........| // (8400)| . | // \| . 
| // +-----------+ // #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct Object { cv::Rect_ rect; int label; float prob; int gindex; cv::Mat mask; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } // #pragma omp parallel sections { // #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } // #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = objects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = objects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = objects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = objects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const 
int w = in_pad.w; const int h = in_pad.h; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int reg_max_1 = 16; const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO for (int y = 0; y < num_grid_y; y++) { for (int x = 0; x < num_grid_x; x++) { const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1); // find label with max score int label = -1; float score = -FLT_MAX; { const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class); for (int k = 0; k < num_class; k++) { float s = pred_score[k]; if (s > score) { label = k; score = s; } } score = sigmoid(score); } if (score >= prob_threshold) { ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone(); { ncnn::Layer* softmax = ncnn::create_layer("Softmax"); ncnn::ParamDict pd; pd.set(0, 1); // axis pd.set(1, 1); softmax->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; softmax->create_pipeline(opt); softmax->forward_inplace(pred_bbox, opt); softmax->destroy_pipeline(opt); delete softmax; } float pred_ltrb[4]; for (int k = 0; k < 4; k++) { float dis = 0.f; const float* dis_after_sm = pred_bbox.row(k); for (int l = 0; l < reg_max_1; l++) { dis += l * dis_after_sm[l]; } pred_ltrb[k] = dis * stride; } float pb_cx = (x + 0.5f) * stride; float pb_cy = (y + 0.5f) * stride; float x0 = pb_cx - pred_ltrb[0]; float y0 = pb_cy - pred_ltrb[1]; float x1 = pb_cx + pred_ltrb[2]; float y1 = pb_cy + pred_ltrb[3]; Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = x1 - x0; obj.rect.height = y1 - y0; obj.label = label; obj.prob = score; obj.gindex = y * num_grid_x + x; objects.push_back(obj); } } } } static void generate_proposals(const ncnn::Mat& pred, const std::vector& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector& objects) { const int w = in_pad.w; const int h = in_pad.h; int pred_row_offset = 0; for (size_t i = 0; i < strides.size(); i++) { const int stride = 
strides[i]; const int num_grid_x = w / stride; const int num_grid_y = h / stride; const int num_grid = num_grid_x * num_grid_y; std::vector objects_stride; generate_proposals(pred.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects_stride); for (size_t j = 0; j < objects_stride.size(); j++) { Object obj = objects_stride[j]; obj.gindex += pred_row_offset; objects.push_back(obj); } pred_row_offset += num_grid; } } static int detect_yolov8_seg(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolov8; yolov8.opt.use_vulkan_compute = true; // yolov8.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets yolov8.load_param("yolov8n_seg.ncnn.param"); yolov8.load_model("yolov8n_seg.ncnn.bin"); // yolov8.load_param("yolov8s_seg.ncnn.param"); // yolov8.load_model("yolov8s_seg.ncnn.bin"); // yolov8.load_param("yolov8m_seg.ncnn.param"); // yolov8.load_model("yolov8m_seg.ncnn.bin"); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; const float mask_threshold = 0.5f; int img_w = bgr.cols; int img_h = bgr.rows; // ultralytics/cfg/models/v8/yolov8.yaml std::vector strides(3); strides[0] = 8; strides[1] = 16; strides[2] = 32; const int max_stride = 32; // letterbox pad to multiple of max_stride int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = (w + max_stride - 1) / max_stride * max_stride - w; int hpad = (h + max_stride - 1) / max_stride * max_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 
255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yolov8.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); std::vector proposals; generate_proposals(out, strides, in_pad, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); if (count == 0) return 0; ncnn::Mat mask_feat; ex.extract("out1", mask_feat); ncnn::Mat mask_protos; ex.extract("out2", mask_protos); ncnn::Mat objects_mask_feat(mask_feat.w, 1, count); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; // pick mask feat memcpy(objects_mask_feat.channel(i), mask_feat.row(objects[i].gindex), mask_feat.w * sizeof(float)); } // process mask ncnn::Mat objects_mask; { ncnn::Layer* gemm = ncnn::create_layer("Gemm"); ncnn::ParamDict pd; pd.set(6, 1); // constantC pd.set(7, count); // constantM pd.set(8, mask_protos.w * mask_protos.h); // constantN pd.set(9, mask_feat.w); // constantK pd.set(10, -1); // constant_broadcast_type_C pd.set(11, 1); // output_N1M gemm->load_param(pd); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; gemm->create_pipeline(opt); std::vector gemm_inputs(2); 
gemm_inputs[0] = objects_mask_feat; gemm_inputs[1] = mask_protos.reshape(mask_protos.w * mask_protos.h, 1, mask_protos.c); std::vector gemm_outputs(1); gemm->forward(gemm_inputs, gemm_outputs, opt); objects_mask = gemm_outputs[0].reshape(mask_protos.w, mask_protos.h, count); gemm->destroy_pipeline(opt); delete gemm; } { ncnn::Layer* sigmoid = ncnn::create_layer("Sigmoid"); ncnn::Option opt; opt.num_threads = 1; opt.use_packing_layout = false; sigmoid->create_pipeline(opt); sigmoid->forward_inplace(objects_mask, opt); sigmoid->destroy_pipeline(opt); delete sigmoid; } // resize mask map { ncnn::Mat objects_mask_resized; ncnn::resize_bilinear(objects_mask, objects_mask_resized, in_pad.w / scale, in_pad.h / scale); objects_mask = objects_mask_resized; } // create per-object mask for (int i = 0; i < count; i++) { Object& obj = objects[i]; const ncnn::Mat mm = objects_mask.channel(i); obj.mask = cv::Mat((int)obj.rect.height, (int)obj.rect.width, CV_8UC1); // adjust offset to original unpadded and clip inside object box for (int y = 0; y < (int)obj.rect.height; y++) { const float* pmm = mm.row((int)(hpad / 2 / scale + obj.rect.y + y)) + (int)(wpad / 2 / scale + obj.rect.x); uchar* pmask = obj.mask.ptr(y); for (int x = 0; x < (int)obj.rect.width; x++) { pmask[x] = pmm[x] > mask_threshold ? 
1 : 0; } } } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; static cv::Scalar colors[] = { cv::Scalar(244, 67, 54), cv::Scalar(233, 30, 99), cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 19]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); for (int y = 0; y < (int)obj.rect.height; y++) { const uchar* maskptr = obj.mask.ptr(y); uchar* bgrptr = image.ptr((int)obj.rect.y + y) + (int)obj.rect.x * 3; for (int x = 0; x < 
(int)obj.rect.width; x++) { if (maskptr[x]) { bgrptr[0] = bgrptr[0] * 0.5 + color[0] * 0.5; bgrptr[1] = bgrptr[1] * 0.5 + color[1] * 0.5; bgrptr[2] = bgrptr[2] * 0.5 + color[2] * 0.5; } bgrptr += 3; } } cv::rectangle(image, obj.rect, color); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yolov8_seg(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yoloworld.cpp ================================================ // Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // 1. install // pip3 install -U ultralytics pnnx ncnn // 2. 
export yoloworld torchscript // yolo export model=yolov8s-world.pt format=torchscript // yolo export model=yolov8m-world.pt format=torchscript // yolo export model=yolov8l-world.pt format=torchscript // yolo export model=yolov8x-world.pt format=torchscript // yolo export model=yolov8s-worldv2.pt format=torchscript // yolo export model=yolov8m-worldv2.pt format=torchscript // yolo export model=yolov8l-worldv2.pt format=torchscript // yolo export model=yolov8x-worldv2.pt format=torchscript // 3. convert torchscript with static shape // pnnx yolov8s-world.torchscript // pnnx yolov8m-world.torchscript // pnnx yolov8l-world.torchscript // pnnx yolov8x-world.torchscript // pnnx yolov8s-worldv2.torchscript // pnnx yolov8m-worldv2.torchscript // pnnx yolov8l-worldv2.torchscript // pnnx yolov8x-worldv2.torchscript // the out blob would be a 2-dim tensor with w=8400 h=84 // // | all boxes (8400) | // +-------------------------+ // | center-x . | // bbox | center-y . | // | w . | // | h . | // +-------------------------+ // | 0.1 . | // per | 0.0 . | // class | 0.5 . | // scores | . . | // (80) | . . 
| // +-------------------------+ #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include struct Object { cv::Rect_ rect; int label; float prob; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& objects, int left, int right) { int i = left; int j = right; float p = objects[(left + right) / 2].prob; while (i <= j) { while (objects[i].prob > p) i++; while (objects[j].prob < p) j--; if (i <= j) { // swap std::swap(objects[i], objects[j]); i++; j--; } } // #pragma omp parallel sections { // #pragma omp section { if (left < j) qsort_descent_inplace(objects, left, j); } // #pragma omp section { if (i < right) qsort_descent_inplace(objects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& objects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = objects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = objects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = objects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = objects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static void generate_proposals(const ncnn::Mat& pred, float prob_threshold, std::vector& objects) { const int num_boxes = pred.w; const int num_class = pred.h - 4; const ncnn::Mat pred_bbox = pred.row_range(0, 4); const ncnn::Mat 
pred_score = pred.row_range(4, num_class); for (int i = 0; i < num_boxes; i++) { int label = 0; float score = -9999.f; for (int j = 0; j < num_class; j++) { const float prob = pred_score.row(j)[i]; if (prob > score) { score = prob; label = j; } } if (score >= prob_threshold) { const float cx = pred_bbox.row(0)[i]; const float cy = pred_bbox.row(1)[i]; const float w = pred_bbox.row(2)[i]; const float h = pred_bbox.row(3)[i]; Object obj; obj.rect.x = cx - w / 2; obj.rect.y = cy - h / 2; obj.rect.width = w; obj.rect.height = h; obj.label = label; obj.prob = score; objects.push_back(obj); } } } static int detect_yoloworld(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yoloworld; yoloworld.opt.use_vulkan_compute = true; // yoloworld.opt.use_bf16_storage = true; // https://github.com/nihui/ncnn-assets/tree/master/models // yoloworld.load_param("yolov8s_world.ncnn.param"); // yoloworld.load_model("yolov8s_world.ncnn.bin"); // yoloworld.load_param("yolov8m_world.ncnn.param"); // yoloworld.load_model("yolov8m_world.ncnn.bin"); // yoloworld.load_param("yolov8l_world.ncnn.param"); // yoloworld.load_model("yolov8l_world.ncnn.bin"); // yoloworld.load_param("yolov8x_world.ncnn.param"); // yoloworld.load_model("yolov8x_world.ncnn.bin"); yoloworld.load_param("yolov8s_worldv2.ncnn.param"); yoloworld.load_model("yolov8s_worldv2.ncnn.bin"); // yoloworld.load_param("yolov8m_worldv2.ncnn.param"); // yoloworld.load_model("yolov8m_worldv2.ncnn.bin"); // yoloworld.load_param("yolov8l_worldv2.ncnn.param"); // yoloworld.load_model("yolov8l_worldv2.ncnn.bin"); // yoloworld.load_param("yolov8x_worldv2.ncnn.param"); // yoloworld.load_model("yolov8x_worldv2.ncnn.bin"); const int target_size = 640; const float prob_threshold = 0.25f; const float nms_threshold = 0.45f; int img_w = bgr.cols; int img_h = bgr.rows; // letterbox pad int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / 
h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); // letterbox pad to target_size rectangle int wpad = target_size - w; int hpad = target_size - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; in_pad.substract_mean_normalize(0, norm_vals); ncnn::Extractor ex = yoloworld.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); std::vector proposals; generate_proposals(out, prob_threshold, proposals); // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x - (wpad / 2)) / scale; float y0 = (objects[i].rect.y - (hpad / 2)) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", 
"backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; static cv::Scalar colors[] = { cv::Scalar(244, 67, 54), cv::Scalar(233, 30, 99), cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 19]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, color); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); cv::putText(image, text, cv::Point(x, y + 
label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } cv::imshow("image", image); cv::waitKey(0); } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_yoloworld(m, objects); draw_objects(m, objects); return 0; } ================================================ FILE: examples/yolox.cpp ================================================ // Copyright 2020 Tencent // Copyright 2020-2021 Megvii Inc. // SPDX-License-Identifier: BSD-3-Clause #include "layer.h" #include "net.h" #if defined(USE_NCNN_SIMPLEOCV) #include "simpleocv.h" #else #include #include #include #endif #include #include #include #define YOLOX_NMS_THRESH 0.45 // nms threshold #define YOLOX_CONF_THRESH 0.25 // threshold of bounding box prob #define YOLOX_TARGET_SIZE 640 // target image size after resize, might use 416 for small model // YOLOX use the same focus in yolov5 class YoloV5Focus : public ncnn::Layer { public: YoloV5Focus() { one_blob_only = true; } virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const { int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; int outw = w / 2; int outh = h / 2; int outc = channels * 4; top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator); if (top_blob.empty()) return -100; #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outc; p++) { const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2); float* outptr = top_blob.channel(p); for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) { *outptr = *ptr; outptr += 1; ptr += 2; } ptr += w; } } return 0; } }; DEFINE_LAYER_CREATOR(YoloV5Focus) struct Object { cv::Rect_ rect; int label; float prob; }; struct 
GridAndStride { int grid0; int grid1; int stride; }; static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; if (!agnostic && a.label != b.label) continue; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static void generate_grids_and_stride(const int target_w, const int target_h, std::vector& strides, std::vector& grid_strides) { for (int i = 0; i < (int)strides.size(); i++) { int stride = strides[i]; int num_grid_w = target_w / stride; int num_grid_h = target_h / stride; for (int g1 = 0; g1 < num_grid_h; g1++) { for (int g0 = 0; g0 < num_grid_w; g0++) { GridAndStride gs; gs.grid0 = 
g0; gs.grid1 = g1; gs.stride = stride; grid_strides.push_back(gs); } } } } static void generate_yolox_proposals(std::vector grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector& objects) { const int num_grid = feat_blob.h; const int num_class = feat_blob.w - 5; const int num_anchors = grid_strides.size(); const float* feat_ptr = feat_blob.channel(0); for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) { const int grid0 = grid_strides[anchor_idx].grid0; const int grid1 = grid_strides[anchor_idx].grid1; const int stride = grid_strides[anchor_idx].stride; // yolox/models/yolo_head.py decode logic // outputs[..., :2] = (outputs[..., :2] + grids) * strides // outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides float x_center = (feat_ptr[0] + grid0) * stride; float y_center = (feat_ptr[1] + grid1) * stride; float w = exp(feat_ptr[2]) * stride; float h = exp(feat_ptr[3]) * stride; float x0 = x_center - w * 0.5f; float y0 = y_center - h * 0.5f; float box_objectness = feat_ptr[4]; for (int class_idx = 0; class_idx < num_class; class_idx++) { float box_cls_score = feat_ptr[5 + class_idx]; float box_prob = box_objectness * box_cls_score; if (box_prob > prob_threshold) { Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = w; obj.rect.height = h; obj.label = class_idx; obj.prob = box_prob; objects.push_back(obj); } } // class loop feat_ptr += feat_blob.w; } // point anchor loop } static int detect_yolox(const cv::Mat& bgr, std::vector& objects) { ncnn::Net yolox; yolox.opt.use_vulkan_compute = true; // yolox.opt.use_bf16_storage = true; // Focus in yolov5 yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator); // original pretrained model from https://github.com/Megvii-BaseDetection/YOLOX // ncnn model param: https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s_ncnn.tar.gz // NOTE that newest version YOLOX remove normalization of model (minus mean and then div by std), // which 
might cause your model outputs becoming a total mess, plz check carefully. if (yolox.load_param("yolox.param")) exit(-1); if (yolox.load_model("yolox.bin")) exit(-1); int img_w = bgr.cols; int img_h = bgr.rows; int w = img_w; int h = img_h; float scale = 1.f; if (w > h) { scale = (float)YOLOX_TARGET_SIZE / w; w = YOLOX_TARGET_SIZE; h = h * scale; } else { scale = (float)YOLOX_TARGET_SIZE / h; h = YOLOX_TARGET_SIZE; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h); // pad to YOLOX_TARGET_SIZE rectangle int wpad = (w + 31) / 32 * 32 - w; int hpad = (h + 31) / 32 * 32 - h; ncnn::Mat in_pad; // different from yolov5, yolox only pad on bottom and right side, // which means users don't need to extra padding info to decode boxes coordinate. ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f); ncnn::Extractor ex = yolox.create_extractor(); ex.input("images", in_pad); std::vector proposals; { ncnn::Mat out; ex.extract("output", out); static const int stride_arr[] = {8, 16, 32}; // might have stride=64 in YOLOX std::vector strides(stride_arr, stride_arr + sizeof(stride_arr) / sizeof(stride_arr[0])); std::vector grid_strides; generate_grids_and_stride(in_pad.w, in_pad.h, strides, grid_strides); generate_yolox_proposals(grid_strides, out, YOLOX_CONF_THRESH, proposals); } // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, YOLOX_NMS_THRESH); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x) / scale; float y0 = (objects[i].rect.y) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); 
y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } return 0; } static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rect.x; int y = obj.rect.y - label_size.height - baseLine; if (y < 0) y = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, 
#!/usr/bin/env bash
#
# Package prebuilt ncnn artifacts into distributable zip archives.
#
# Expects the build-android-*/install and build-ios*/install trees to exist in
# the current directory and produces, in order:
#   ncnn-android-lib.zip, ncnn.framework.zip, ncnn.framework-bitcode.zip,
#   ncnn-android-vulkan-lib.zip, ncnn.framework-vulkan.zip,
#   ncnn.framework-vulkan-bitcode.zip, glslang.framework.zip,
#   glslang.framework-bitcode.zip
#
# The interpreter is located through env because /usr/bin/bash does not exist
# on macOS, where the lipo / libtool steps below have to run.
#
# NOTE: errors are deliberately not fatal (no `set -e`); every package is
# attempted even when an earlier build tree is missing.

NAME=ncnn

# package_android_lib PKGNAME VARIANT
#   Collects the static libraries of the four Android ABIs plus the headers of
#   the aarch64 build into PKGNAME/ and zips it as PKGNAME.zip.
#   VARIANT is the build directory suffix ("" or "-vulkan").
package_android_lib() {
    local pkgname=$1
    local variant=$2

    rm -rf "$pkgname"
    mkdir -p "$pkgname"
    mkdir -p "$pkgname/armeabi-v7a"
    mkdir -p "$pkgname/arm64-v8a"
    mkdir -p "$pkgname/x86"
    mkdir -p "$pkgname/x86_64"
    mkdir -p "$pkgname/include"
    # the globs stay outside the quotes so the shell still expands them
    cp build-android-armv7"$variant"/install/lib/lib*.a "$pkgname/armeabi-v7a/"
    cp build-android-aarch64"$variant"/install/lib/lib*.a "$pkgname/arm64-v8a/"
    cp build-android-x86"$variant"/install/lib/lib*.a "$pkgname/x86/"
    cp build-android-x86_64"$variant"/install/lib/lib*.a "$pkgname/x86_64/"
    cp -r build-android-aarch64"$variant"/install/include/* "$pkgname/include/"
    rm -f "$pkgname.zip"
    zip -9 -r "$pkgname.zip" "$pkgname"
}

# create_framework_skeleton FRAMEWORK BINARY
#   Recreates the versioned framework directory layout and its symlinks.
create_framework_skeleton() {
    local framework=$1
    local binary=$2

    rm -rf "$framework"
    mkdir -p "$framework/Versions/A/Headers"
    mkdir -p "$framework/Versions/A/Resources"
    ln -s A "$framework/Versions/Current"
    ln -s Versions/Current/Headers "$framework/Headers"
    ln -s Versions/Current/Resources "$framework/Resources"
    ln -s "Versions/Current/$binary" "$framework/$binary"
}

# finish_framework FRAMEWORK ZIPFILE
#   Adds Info.plist and zips the framework (-y stores symlinks as symlinks).
finish_framework() {
    local framework=$1
    local zipfile=$2

    cp Info.plist "$framework/Versions/A/Resources/"
    rm -f "$zipfile"
    zip -9 -y -r "$zipfile" "$framework"
}

# package_ncnn_ios_framework VARIANT HEADERS
#   Merges the device and simulator static libraries of build-ios<VARIANT> and
#   build-ios-sim<VARIANT> into ncnn.framework and zips it as
#   ncnn.framework<VARIANT>.zip.
#   HEADERS is the entry below install/include to copy ('*' or 'ncnn').
package_ncnn_ios_framework() {
    local variant=$1
    local headers=$2
    local framework=${NAME}.framework

    create_framework_skeleton "$framework" "$NAME"
    lipo -create \
        "build-ios${variant}/install/lib/lib${NAME}.a" \
        "build-ios-sim${variant}/install/lib/lib${NAME}.a" \
        -o "$framework/Versions/A/${NAME}"
    # $headers is intentionally unquoted so that a '*' pattern is expanded
    cp -r build-ios"${variant}"/install/include/$headers "$framework/Versions/A/Headers/"
    finish_framework "$framework" "${framework}${variant}.zip"
}

# package_glslang_ios_framework VARIANT ZIPSUFFIX
#   Combines the glslang static libraries of each build tree with libtool,
#   merges device and simulator with lipo into glslang.framework and zips it
#   as glslang.framework<ZIPSUFFIX>.zip.
package_glslang_ios_framework() {
    local variant=$1
    local zipsuffix=$2
    local framework=glslang.framework
    local dir

    create_framework_skeleton "$framework" glslang
    for dir in "build-ios${variant}" "build-ios-sim${variant}"; do
        libtool -static \
            "$dir/install/lib/libglslang.a" \
            "$dir/install/lib/libSPIRV.a" \
            "$dir/install/lib/libOGLCompiler.a" \
            "$dir/install/lib/libOSDependent.a" \
            -o "$dir/install/lib/libglslang_combined.a"
    done
    lipo -create \
        "build-ios${variant}/install/lib/libglslang_combined.a" \
        "build-ios-sim${variant}/install/lib/libglslang_combined.a" \
        -o "$framework/Versions/A/glslang"
    cp -r "build-ios${variant}/install/include/glslang" "$framework/Versions/A/Headers/"
    finish_framework "$framework" "${framework}${zipsuffix}.zip"
}

##### package android lib
package_android_lib "${NAME}-android-lib" ""

##### package ios framework
package_ncnn_ios_framework "" '*'

##### package ios framework bitcode
package_ncnn_ios_framework "-bitcode" ncnn

##### package android lib vulkan
package_android_lib "${NAME}-android-vulkan-lib" "-vulkan"

##### package ios framework vulkan
package_ncnn_ios_framework "-vulkan" ncnn

##### package ios framework vulkan bitcode
package_ncnn_ios_framework "-vulkan-bitcode" ncnn

##### package ios framework glslang
package_glslang_ios_framework "-vulkan" ""

##### package ios framework glslang bitcode
package_glslang_ios_framework "-vulkan-bitcode" "-bitcode"
# Build script for the pyncnn Python extension module (pybind11 wrapper of ncnn).
cmake_minimum_required(VERSION 3.4...3.10)
project(pyncnn)

set(PACKAGE_VERSION ${NCNN_VERSION_STRING})
add_definitions(-DVERSION_INFO="${PACKAGE_VERSION}")

set( CMAKE_CXX_STANDARD 11 )
set( CMAKE_CXX_STANDARD_REQUIRED ON )

option(NCNN_SYSTEM_PYBIND11 "use system pybind11" OFF)

if(CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "ARM64")
    option(PYBIND11_PYTHONLIBS_OVERWRITE "" OFF)

    set(PYTHON_PREFIX "$ENV{LOCALAPPDATA}/pypa/cibuildwheel/Cache/nuget-cpython/pythonarm64.$ENV{PYTHON_VERSION}/tools")
    # Test the environment variable itself. The previous `DEFINED $ENV{...}`
    # expanded the value first and then looked for a CMake variable of that
    # name, so the warning was printed even when running under cibuildwheel.
    if(NOT DEFINED ENV{CIBUILDWHEEL})
        message(WARNING
            " This is hack for cibuildwheel on github action\n"
            " Use the right way to cross-compile python module for windows arm64 like follows\n"
            " set(PYTHON_PREFIX \"\")\n"
        )
    endif()
endif()

# Prefer the system pybind11 when requested, falling back to the bundled
# submodule if it cannot be found.
if(NCNN_SYSTEM_PYBIND11)
    find_package(pybind11)
    if(NOT pybind11_FOUND)
        message(WARNING "pybind11 package not found! NCNN_SYSTEM_PYBIND11 will be turned off.")
        set(NCNN_SYSTEM_PYBIND11 OFF)
    endif()
endif()

if(NOT NCNN_SYSTEM_PYBIND11)
    if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/pybind11/CMakeLists.txt")
        message(FATAL_ERROR "The submodules were not downloaded! Please update submodules with \"git submodule update --init\" and try again.")
    else()
        add_subdirectory(pybind11)
    endif()
endif()

# Multi-config generators place the module under <build>/ncnn for both
# Debug and Release when no output directory was chosen by the caller.
if("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "")
    if(MSVC OR CMAKE_GENERATOR STREQUAL "Xcode")
        set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${CMAKE_CURRENT_BINARY_DIR}/ncnn/)
        set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${CMAKE_CURRENT_BINARY_DIR}/ncnn/)
    endif(MSVC OR CMAKE_GENERATOR STREQUAL "Xcode")
endif("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "")

# enable global link time optimization
cmake_policy(SET CMP0069 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
include(CheckIPOSupported)
check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output)
if(ipo_supported)
    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()

include_directories(${pybind11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS})
pybind11_add_module(pyncnn src/main.cpp)
set_target_properties(pyncnn PROPERTIES OUTPUT_NAME "ncnn")
target_link_libraries(pyncnn PUBLIC ncnn)
set_target_properties(pyncnn PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/ncnn")
set_property(TARGET pyncnn PROPERTY FOLDER "python")

# Copy the built module next to the Python package sources after each build.
if("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "")
    add_custom_command(TARGET pyncnn POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
        ${CMAKE_CURRENT_BINARY_DIR}/ncnn/ncnn${PYTHON_MODULE_PREFIX}${PYTHON_MODULE_EXTENSION}
        ${PROJECT_SOURCE_DIR}/ncnn/ncnn${PYTHON_MODULE_PREFIX}${PYTHON_MODULE_EXTENSION})
endif("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "")

configure_file(setup.py.i ${PROJECT_SOURCE_DIR}/setup.py)
Install from pip ================== ncnn is available as wheel packages for macOS, Windows and Linux distributions; you can install it with pip: ``` python -m pip install -U pip python -m pip install -U ncnn ``` # Build from source If you want to build ncnn with non-default options, or simply prefer to build everything yourself, it is not difficult to build ncnn from source. ## Prerequisites **On Unix (Linux, OS X)** * A compiler with C++11 support * CMake >= 3.4 **On Mac** * A compiler with C++11 support * CMake >= 3.4 **On Windows** * Visual Studio 2015 or higher * CMake >= 3.4 ## Build & Install 1. clone ncnn and init submodule. ```bash cd /pathto/ncnn git submodule init && git submodule update ``` 2. build and install. ``` python setup.py install ``` If you want to use a custom toolchain, you can install with the `CMAKE_TOOLCHAIN_FILE` environment variable, like this: ``` CMAKE_TOOLCHAIN_FILE="../../toolchains/power9le-linux-gnu-vsx.clang.toolchain.cmake" python setup.py install ``` If you want to enable Vulkan support, you can install as follows: ``` python setup.py install --vulkan=on ``` > **Attention:** > > To enable Vulkan support, you must first install the Vulkan SDK. > > **For Windows or Linux Users:** > > Ensure that the `VULKAN_SDK` environment variable is set to the path of the Vulkan SDK. > > **For MacOS Users:** > > On MacOS, you will need to specify additional environment variables. For guidance on setting these variables, please refer to lines 279-286 in the following file: [ncnn/.github/workflows/release-python.yml at master · Tencent/ncnn](https://github.com/Tencent/ncnn/blob/master/.github/workflows/release-python.yml). ## Custom-build & Install 1. clone ncnn and init submodule. ```bash cd /pathto/ncnn git submodule init && git submodule update ``` 2. build. ```bash mkdir build cd build cmake -DNCNN_PYTHON=ON ..
make ``` To use the pybind11 package provided by your system, set the CMake variable `NCNN_SYSTEM_PYBIND11` to `ON` during the build process, like this: ```bash mkdir build cd build cmake -DNCNN_PYTHON=ON -DNCNN_SYSTEM_PYBIND11=ON .. make ``` 3. install ```bash cd /pathto/ncnn pip install . ``` If you use conda or miniconda, you can also install as follows: ```bash cd /pathto/ncnn python3 setup.py install ``` ## Tests **test** ```bash cd /pathto/ncnn/python python3 tests/test.py ``` **benchmark** ```bash cd /pathto/ncnn/python python3 tests/benchmark.py ``` ## Numpy **ncnn.Mat->numpy.array, with no memory copy** ```python mat = ncnn.Mat(...) mat_np = np.array(mat) ``` **numpy.array->ncnn.Mat, with no memory copy** ```python mat_np = np.array(...) mat = ncnn.Mat(mat_np) ``` # Model Zoo Install the requirements: ```bash pip install -r requirements.txt ``` Then you can import ncnn.model_zoo and get the model list as follows: ```python import ncnn import ncnn.model_zoo as model_zoo print(model_zoo.get_model_list()) ``` The models currently in the model zoo are listed below: ```bash mobilenet_yolov2 mobilenetv2_yolov3 yolov4_tiny yolov4 yolov5s yolact mobilenet_ssd squeezenet_ssd mobilenetv2_ssdlite mobilenetv3_ssdlite squeezenet faster_rcnn peleenet_ssd retinaface rfcn shufflenetv2 simplepose nanodet ``` Every model in the model zoo has an example in the ncnn/python/examples folder. # Custom Layer A custom layer demo is in ncnn/python/ncnn/model_zoo/yolov5.py:23 ================================================ FILE: python/examples/fasterrcnn.py ================================================ # Copyright 2020 Tencent # SPDX-License-Identifier: BSD-3-Clause import sys import cv2 from ncnn.model_zoo import get_model from ncnn.utils import draw_detection_objects if __name__ == "__main__": if len(sys.argv) != 2: print("Usage: %s [imagepath]\n" % (sys.argv[0])) sys.exit(0) imagepath = sys.argv[1] m = cv2.imread(imagepath) if m is None: print("cv2.imread %s failed\n" % (imagepath)) sys.exit(0) net =
# Command-line entry point of python/examples/mobilenetssd.py:
# load the image named on the command line, run the "mobilenet_ssd" model
# from the model zoo on it and draw the detections.
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) != 2:
        # exactly one argument (the image path) is expected
        print("Usage: %s [imagepath]\n" % (argv[0]))
        sys.exit(0)

    image_path = argv[1]
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    detector = get_model("mobilenet_ssd", num_threads=4, use_gpu=True)
    detections = detector(image)
    draw_detection_objects(image, detector.class_names, detections)
def draw_detection_objects_seg(image, class_names, objects, mat_map):
    """Draw detection boxes and a blended segmentation overlay, then show the image.

    Args:
        image: image array that is drawn on in place. The overlay code treats
            it as rows x columns x 3 channels (b, g, r order, matching the
            colour table below).
        class_names: sequence indexed by ``int(obj.label)``.
        objects: iterable of detections exposing ``label``, ``prob`` and
            ``rect`` (``x``, ``y``, ``w``, ``h``).
        mat_map: per-class score map exposing ``w``, ``h`` and ``c`` and
            convertible with ``np.array``; it is read as ``c`` planes of
            ``h * w`` scores.
            NOTE(review): assumes the map has the same width/height as
            ``image`` -- confirm against the model wrapper.

    The function blocks in ``cv2.waitKey(0)`` until a key is pressed.
    """
    # flat b, g, r triples; only classes 0 and 1 have a colour
    color = [128, 255, 128, 244, 35, 232]
    color_count = len(color)

    for obj in objects:
        print(
            "%d = %.5f at %.2f %.2f %.2f x %.2f\n"
            % (obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.w, obj.rect.h)
        )

        cv2.rectangle(
            image,
            (int(obj.rect.x), int(obj.rect.y)),
            (int(obj.rect.x + obj.rect.w), int(obj.rect.y + obj.rect.h)),
            (255, 0, 0),
        )

        text = "%s %.1f%%" % (class_names[int(obj.label)], obj.prob * 100)

        label_size, baseLine = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)

        # keep the label inside the image: clamp at the top and right edges
        x = obj.rect.x
        y = obj.rect.y - label_size[1] - baseLine
        if y < 0:
            y = 0
        if x + label_size[0] > image.shape[1]:
            x = image.shape[1] - label_size[0]

        cv2.rectangle(
            image,
            (int(x), int(y)),
            (int(x + label_size[0]), int(y + label_size[1] + baseLine)),
            (255, 255, 255),
            -1,
        )

        cv2.putText(
            image,
            text,
            (int(x), int(y + label_size[1])),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
        )

    width = mat_map.w
    height = mat_map.h
    size = mat_map.c
    threshold = 0.45

    if size > 0 and height > 0 and width > 0:
        # One conversion for the whole map. The previous per-pixel loop called
        # flatten() (a full channel copy) for every pixel and every class.
        scores = np.array(mat_map).reshape(size, height, width)

        # argmax returns the first maximum, which matches a scan that only
        # replaces the current best on a strictly greater score
        best_class = np.argmax(scores, axis=0)
        best_score = np.max(scores, axis=0)

        # a pixel is painted when its best score beats the threshold and the
        # winning class has an entry in the colour table
        paintable = (best_score > threshold) & (best_class * 3 < color_count)

        palette = np.array(color, dtype=np.float64).reshape(-1, 3)
        region = image[:height, :width]  # view: writes go through to image
        # 50/50 blend; storing into the integer image truncates as before
        region[paintable] = palette[best_class[paintable]] / 2 + region[paintable] / 2

    cv2.imshow("image", image)
    cv2.waitKey(0)
# Command-line entry point of python/examples/shufflenetv2.py:
# classify the image named on the command line with the "shufflenetv2"
# model from the model zoo and print the three best scores.
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) != 2:
        # exactly one argument (the image path) is expected
        print("Usage: %s [imagepath]\n" % (argv[0]))
        sys.exit(0)

    image_path = argv[1]
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    classifier = get_model("shufflenetv2", num_threads=4, use_gpu=True)
    scores = classifier(image)
    print_topk(scores, 3)
def draw_result(image, class_names, boxes, masks, classes, scores):
    """Draw YOLACT boxes, labels and blended instance masks, then show the image.

    Args:
        image: image array, modified in place.
        class_names: sequence indexed by ``int(label)``.
        boxes: per-instance boxes read as ``(x, y, width, height)``.
        masks: per-instance index/boolean masks usable as ``image[mask]``.
        classes: per-instance class labels.
        scores: per-instance confidences; instances below 0.15 are skipped.

    The function blocks in ``cv2.waitKey(0)`` until a key is pressed.
    """
    colors = [
        [56, 0, 255],
        [226, 255, 0],
        [0, 94, 255],
        [0, 37, 255],
        [0, 255, 94],
        [255, 226, 0],
        [0, 18, 255],
        [255, 151, 0],
        [170, 0, 255],
        [0, 255, 56],
        [255, 0, 75],
        [0, 75, 255],
        [0, 255, 169],
        [255, 0, 207],
        [75, 255, 0],
        [207, 0, 255],
        [37, 0, 255],
        [0, 207, 255],
        [94, 0, 255],
        [0, 255, 113],
        [255, 18, 0],
        [255, 0, 56],
        [18, 0, 255],
        [0, 255, 226],
        [170, 255, 0],
        [255, 0, 245],
        [151, 255, 0],
        [132, 255, 0],
        [75, 0, 255],
        [151, 0, 255],
        [0, 151, 255],
        [132, 0, 255],
        [0, 255, 245],
        [255, 132, 0],
        [226, 0, 255],
        [255, 37, 0],
        [207, 255, 0],
        [0, 255, 207],
        [94, 255, 0],
        [0, 226, 255],
        [56, 255, 0],
        [255, 94, 0],
        [255, 113, 0],
        [0, 132, 255],
        [255, 0, 132],
        [255, 170, 0],
        [255, 0, 188],
        [113, 255, 0],
        [245, 0, 255],
        [113, 0, 255],
        [255, 188, 0],
        [0, 113, 255],
        [255, 0, 0],
        [0, 56, 255],
        [255, 0, 113],
        [0, 255, 188],
        [255, 0, 94],
        [255, 0, 18],
        [18, 255, 0],
        [0, 255, 132],
        [0, 188, 255],
        [0, 245, 255],
        [0, 169, 255],
        [37, 255, 0],
        [255, 0, 151],
        [188, 0, 255],
        [0, 255, 37],
        [0, 255, 0],
        [255, 0, 170],
        [255, 0, 37],
        [255, 75, 0],
        [0, 0, 255],
        [255, 207, 0],
        [255, 0, 226],
        [255, 245, 0],
        [188, 255, 0],
        [0, 255, 18],
        [0, 255, 75],
        [0, 255, 151],
        [255, 56, 0],
        [245, 255, 0],
    ]

    color_index = 0
    for box, mask, label, score in zip(boxes, masks, classes, scores):
        if score < 0.15:
            continue

        print(
            "%s = %.5f at %.2f %.2f %.2f x %.2f\n"
            % (label, score, box[0], box[1], box[2], box[3])
        )

        cv2.rectangle(
            image,
            (int(box[0]), int(box[1])),
            (int(box[0] + box[2]), int(box[1] + box[3])),
            (255, 0, 0),
        )

        text = "%s %.1f%%" % (class_names[int(label)], score * 100)

        label_size, baseLine = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)

        # keep the label inside the image: clamp at the top and right edges
        x = box[0]
        y = box[1] - label_size[1] - baseLine
        if y < 0:
            y = 0
        if x + label_size[0] > image.shape[1]:
            x = image.shape[1] - label_size[0]

        cv2.rectangle(
            image,
            (int(x), int(y)),
            (int(x + label_size[0]), int(y + label_size[1] + baseLine)),
            (255, 255, 255),
            -1,
        )

        cv2.putText(
            image,
            text,
            (int(x), int(y + label_size[1])),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
        )

        # Cycle through the palette: more instances can be drawn than there
        # are colours, and a plain index would raise IndexError past the end.
        mask_color = colors[color_index % len(colors)]
        image[mask] = image[mask] * 0.5 + np.array(mask_color) * 0.5
        color_index += 1

    cv2.imshow("image", image)
    cv2.waitKey(0)
# Command-line entry point of python/examples/yolov3.py:
# load the image named on the command line, run the "mobilenetv2_yolov3"
# model from the model zoo on it and draw the detections.
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) != 2:
        # exactly one argument (the image path) is expected
        print("Usage: %s [imagepath]\n" % (argv[0]))
        sys.exit(0)

    image_path = argv[1]
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    detector = get_model("mobilenetv2_yolov3", num_threads=4, use_gpu=True)
    detections = detector(image)
    draw_detection_objects(image, detector.class_names, detections)
# Command-line entry point of python/examples/yolov4.py.
# The single argument is either an image file or a capture device: any path
# containing "/dev/video" is opened with cv2.VideoCapture and processed frame
# by frame, everything else is read as a still image.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: %s [v4l input device or image]\n" % (sys.argv[0]))
        sys.exit(0)

    devicepath = sys.argv[1]

    net = get_model("yolov4_tiny", num_threads=4, use_gpu=True)
    # net = get_model("yolov4", num_threads=4, use_gpu=True)

    if devicepath.find("/dev/video") == -1:
        m = cv2.imread(devicepath)
        if m is None:
            print("cv2.imread %s failed\n" % (devicepath))
            sys.exit(0)

        objects = net(m)

        draw_detection_objects(m, net.class_names, objects)
    else:
        cap = cv2.VideoCapture(devicepath)

        if not cap.isOpened():
            print("Failed to open %s" % (devicepath))
            sys.exit(0)

        while True:
            ret, frame = cap.read()
            # A failed grab returns ret == False and no frame; stop instead
            # of passing None into the network.
            if not ret or frame is None:
                break

            objects = net(frame)

            draw_detection_objects(frame, net.class_names, objects)

        cap.release()
class Faster_RCNN:
    """Faster R-CNN object detector (ZF_faster_rcnn_final model) running on ncnn.

    Calling an instance with an image returns a list of ``Detect_Object``
    sorted by descending probability and truncated to ``max_per_image``.
    """

    def __init__(
        self,
        img_width=600,
        img_height=600,
        num_threads=1,
        use_gpu=False,
        max_per_image=100,
        confidence_thresh=0.05,
        nms_threshold=0.3,
    ):
        """Create the network and load its parameters and weights.

        Args:
            img_width: target size applied to the width when the image is
                narrower than it is tall.
            img_height: target size applied to the height otherwise.
            num_threads: value stored in ``net.opt.num_threads``.
            use_gpu: value stored in ``net.opt.use_vulkan_compute``.
            max_per_image: maximum number of detections returned per call.
            confidence_thresh: detections scoring at or below this are dropped.
            nms_threshold: IoU above which a lower-scored box is suppressed.
        """
        self.img_width = img_width
        self.img_height = img_height
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [102.9801, 115.9465, 122.7717]
        self.norm_vals = []

        self.net = ncnn.Net()
        self.net.opt.num_threads = self.num_threads
        self.net.opt.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/rbgirshick/py-faster-rcnn
        # py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt/faster_rcnn_test.pt
        # https://dl.dropboxusercontent.com/s/o6ii098bu51d139/faster_rcnn_models.tgz?dl=0
        # ZF_faster_rcnn_final.caffemodel
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("ZF_faster_rcnn_final.param"))
        self.net.load_model(get_model_file("ZF_faster_rcnn_final.bin"))

        self.max_per_image = max_per_image
        self.confidence_thresh = confidence_thresh
        self.nms_threshold = nms_threshold

        self.class_names = [
            "background",
            "aeroplane",
            "bicycle",
            "bird",
            "boat",
            "bottle",
            "bus",
            "car",
            "cat",
            "chair",
            "cow",
            "diningtable",
            "dog",
            "horse",
            "motorbike",
            "person",
            "pottedplant",
            "sheep",
            "sofa",
            "train",
            "tvmonitor",
        ]

    def __del__(self):
        # drop the reference to the network
        self.net = None

    def __call__(self, img):
        """Detect objects in ``img``.

        Args:
            img: image array passed to ncnn as ``PIXEL_BGR``; ``img.shape[0]``
                is used as the height and ``img.shape[1]`` as the width.

        Returns:
            List of ``Detect_Object`` in original image coordinates, sorted by
            descending ``prob`` and limited to ``self.max_per_image`` entries.
        """
        # scale to target detect size
        h = img.shape[0]
        w = img.shape[1]
        scale = 1.0
        if w < h:
            scale = float(self.img_width) / w
            w = self.img_width
            h = int(h * scale)
        else:
            scale = float(self.img_height) / h
            h = self.img_height
            w = int(w * scale)

        mat_in = ncnn.Mat.from_pixels_resize(
            img, ncnn.Mat.PixelType.PIXEL_BGR, img.shape[1], img.shape[0], w, h
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        # method 1 use numpy to Mat interface
        # im_info = ncnn.Mat(np.array([h, w, scale], dtype=np.float32))

        # method 2 use ncnn.Mat interface
        im_info = ncnn.Mat(3)
        im_info[0] = h
        im_info[1] = w
        im_info[2] = scale

        # step 1: extract the shared feature map and the region proposals
        ex1 = self.net.create_extractor()
        ex1.input("data", mat_in)
        ex1.input("im_info", im_info)

        _, conv5_relu5 = ex1.extract("conv5_relu5")
        _, rois = ex1.extract("rois")

        # step 2: classify and regress every proposal on its own;
        # class_candidates[label] collects the surviving boxes per class
        class_candidates = []
        for i in range(rois.c):
            ex2 = self.net.create_extractor()

            roi = rois.channel(i)  # get single roi
            ex2.input("conv5_relu5", conv5_relu5)
            ex2.input("rois", roi)

            _, bbox_pred = ex2.extract("bbox_pred")
            _, cls_prob = ex2.extract("cls_prob")

            num_class = cls_prob.w
            while len(class_candidates) < num_class:
                class_candidates.append([])

            # find class id with highest score
            label = 0
            score = 0.0
            for j in range(num_class):
                class_score = cls_prob[j]
                if class_score > score:
                    label = j
                    score = class_score

            # ignore background or low score
            if label == 0 or score <= self.confidence_thresh:
                continue

            # unscale to image size
            x1 = roi[0] / scale
            y1 = roi[1] / scale
            x2 = roi[2] / scale
            y2 = roi[3] / scale

            pb_w = x2 - x1 + 1
            pb_h = y2 - y1 + 1

            # apply bbox regression
            dx = bbox_pred[label * 4]
            dy = bbox_pred[label * 4 + 1]
            dw = bbox_pred[label * 4 + 2]
            dh = bbox_pred[label * 4 + 3]

            cx = x1 + pb_w * 0.5
            cy = y1 + pb_h * 0.5

            obj_cx = cx + pb_w * dx
            obj_cy = cy + pb_h * dy

            obj_w = pb_w * np.exp(dw)
            obj_h = pb_h * np.exp(dh)

            obj_x1 = obj_cx - obj_w * 0.5
            obj_y1 = obj_cy - obj_h * 0.5
            obj_x2 = obj_cx + obj_w * 0.5
            obj_y2 = obj_cy + obj_h * 0.5

            # clip
            obj_x1 = np.maximum(np.minimum(obj_x1, float(img.shape[1] - 1)), 0.0)
            obj_y1 = np.maximum(np.minimum(obj_y1, float(img.shape[0] - 1)), 0.0)
            obj_x2 = np.maximum(np.minimum(obj_x2, float(img.shape[1] - 1)), 0.0)
            obj_y2 = np.maximum(np.minimum(obj_y2, float(img.shape[0] - 1)), 0.0)

            # append object
            obj = Detect_Object()
            obj.rect.x = obj_x1
            obj.rect.y = obj_y1
            obj.rect.w = obj_x2 - obj_x1 + 1
            obj.rect.h = obj_y2 - obj_y1 + 1
            obj.label = label
            obj.prob = score

            class_candidates[label].append(obj)

        # post process: per-class NMS, then a global sort and cut-off
        objects = []
        for candidates in class_candidates:
            if len(candidates) == 0:
                continue

            candidates.sort(key=lambda obj: obj.prob, reverse=True)

            picked = self.nms_sorted_bboxes(candidates, self.nms_threshold)

            for z in picked:
                objects.append(candidates[z])

        objects.sort(key=lambda obj: obj.prob, reverse=True)

        objects = objects[: self.max_per_image]

        return objects

    def nms_sorted_bboxes(self, objects, nms_threshold):
        """Greedy non-maximum suppression over boxes already sorted by score.

        Args:
            objects: sequence whose items expose ``rect.area()`` and
                ``rect.intersection_area(other_rect)``, best score first.
            nms_threshold: a box is dropped when its IoU with an already
                picked box is greater than this value.

        Returns:
            List of indices into ``objects`` that are kept, in input order.
        """
        picked = []

        n = len(objects)

        areas = np.zeros((n,), dtype=np.float32)
        for i in range(n):
            areas[i] = objects[i].rect.area()

        for i in range(n):
            a = objects[i]

            keep = True
            for j in picked:
                b = objects[j]

                # intersection over union
                inter_area = a.rect.intersection_area(b.rect)
                union_area = areas[i] + areas[j] - inter_area
                # float IoU = inter_area / union_area
                if inter_area / union_area > nms_threshold:
                    keep = False
                    # one overlapping winner is enough; skip the remaining
                    # comparisons for this box
                    break

            if keep:
                picked.append(i)

        return picked
class MobileNet_SSD:
    """MobileNet-SSD detector (VOC label set) executed through ncnn.

    Calling an instance with an image array returns a list of
    ``Detect_Object`` whose rectangles are expressed in pixels of that image.
    """

    def __init__(self, target_size=300, num_threads=1, use_gpu=False):
        """Create the network and load the pretrained weights.

        Parameters
        ----------
        target_size : int
            Side length of the square the input image is resized to.
        num_threads : int
            Value written to ``net.opt.num_threads``.
        use_gpu : bool
            Value written to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # Per-channel mean and scale applied to the resized input.
        self.mean_vals = [127.5, 127.5, 127.5]
        self.norm_vals = [0.007843, 0.007843, 0.007843]

        self.net = ncnn.Net()
        self.net.opt.num_threads = self.num_threads
        self.net.opt.use_vulkan_compute = self.use_gpu

        # Model converted from https://github.com/chuanqi305/MobileNet-SSD
        # (original download: https://drive.google.com/open?id=0ByaKLD9QaPtucWk0Y0dha1VVY0U);
        # ncnn files are hosted at https://github.com/nihui/ncnn-assets/tree/master/models
        param_path = get_model_file("mobilenet_ssd_voc_ncnn.param")
        self.net.load_param(param_path)
        model_path = get_model_file("mobilenet_ssd_voc_ncnn.bin")
        self.net.load_model(model_path)

        self.class_names = [
            "background",
            "aeroplane",
            "bicycle",
            "bird",
            "boat",
            "bottle",
            "bus",
            "car",
            "cat",
            "chair",
            "cow",
            "diningtable",
            "dog",
            "horse",
            "motorbike",
            "person",
            "pottedplant",
            "sheep",
            "sofa",
            "train",
            "tvmonitor",
        ]

    def __del__(self):
        # Drop the reference to the network when the wrapper is collected.
        self.net = None

    def __call__(self, img):
        """Run detection on ``img`` and return a list of ``Detect_Object``.

        ``img.shape[0]`` / ``img.shape[1]`` are used as height / width and the
        pixels are handed to ncnn as ``PIXEL_BGR``.
        """
        img_h, img_w = img.shape[0], img.shape[1]

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            img_w,
            img_h,
            self.target_size,
            self.target_size,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", mat_in)
        _, mat_out = extractor.extract("detection_out")

        # Each output row is read as [label, prob, x1, y1, x2, y2]; the
        # coordinates are multiplied by the image size to obtain pixels.
        # (np.array(mat_out) could be iterated the same way instead of
        # mat_out.row(i).)
        detections = []
        for row_index in range(mat_out.h):
            row = mat_out.row(row_index)

            det = Detect_Object()
            det.label = row[0]
            det.prob = row[1]
            det.rect.x = row[2] * img_w
            det.rect.y = row[3] * img_h
            det.rect.w = row[4] * img_w - det.rect.x
            det.rect.h = row[5] * img_h - det.rect.y

            detections.append(det)

        return detections
def clamp(v, lo, hi):
    """Limit ``v`` to the closed interval ``[lo, hi]``.

    Returns ``lo`` when ``v < lo``, ``hi`` when ``hi < v`` and ``v`` itself
    otherwise. A value for which both comparisons are false (e.g. NaN) is
    returned unchanged.
    """
    if v < lo:
        return lo
    return hi if hi < v else v
def __call__(self, img):
    """Detect objects in ``img`` and return a list of ``Detect_Object``.

    The image is resized to ``target_size`` x ``target_size`` with a BGR->RGB
    conversion. Box corners are clamped to the network input and then rescaled
    to the original image size; rows that yield a NaN coordinate are skipped.
    """
    img_h = img.shape[0]
    img_w = img.shape[1]
    size = self.target_size
    upper = float(size - 1)

    mat_in = ncnn.Mat.from_pixels_resize(
        img,
        ncnn.Mat.PixelType.PIXEL_BGR2RGB,
        img_w,
        img_h,
        size,
        size,
    )
    # Two passes, in this order: scale by norm_vals first, then subtract mean_vals.
    mat_in.substract_mean_normalize([], self.norm_vals)
    mat_in.substract_mean_normalize(self.mean_vals, [])

    extractor = self.net.create_extractor()
    extractor.input("input", mat_in)
    _, mat_out = extractor.extract("detection_out")

    def to_image(normalized, extent):
        # normalized -> network pixels, clamp to [0, size - 1], -> image pixels
        return clamp(normalized * size, 0.0, upper) / size * extent

    detections = []
    # mat_out.row(i) is read as [label, prob, x1, y1, x2, y2].
    for row_index in range(mat_out.h):
        row = mat_out.row(row_index)

        x1 = to_image(row[2], img_w)
        y1 = to_image(row[3], img_h)
        x2 = to_image(row[4], img_w)
        y2 = to_image(row[5], img_h)
        if any(np.isnan(coord) for coord in (x1, y1, x2, y2)):
            continue

        det = Detect_Object()
        det.label = row[0]
        det.prob = row[1]
        det.rect.x = x1
        det.rect.y = y1
        det.rect.w = x2 - x1
        det.rect.h = y2 - y1
        detections.append(det)

    return detections
def merge_file(root, files_in, file_out, remove=True):
    """Concatenate several files into one, in the order given.

    Parameters
    ----------
    root : str
        Directory that contains every entry of ``files_in``.
    files_in : iterable of str
        File names (relative to ``root``) to append to the output, in order.
    file_out : str
        Path of the merged file; it is created or truncated.
    remove : bool, default True
        When true, each input file is deleted once it has been appended.
    """
    import shutil  # local import: only this helper needs it

    with open(file_out, "wb") as fd_out:
        for file_in in files_in:
            part_path = os.path.join(root, file_in)
            with open(part_path, "rb") as fd_in:
                # Stream in chunks instead of loading the whole part in memory.
                shutil.copyfileobj(fd_in, fd_out)
            # Delete only after the handle is closed.
            if remove:
                os.remove(part_path)
def purge(root=os.path.join("~", ".ncnn", "models")):
    r"""Purge all pretrained model files in local file store.

    Files whose names end with ``.param`` or ``.bin`` (the suffixes of the
    entries registered in ``_model_sha1``) are deleted; the historical
    ``.params`` suffix is still honoured. Other files are left untouched, and
    a missing ``root`` directory is treated as already empty.

    Parameters
    ----------
    root : str, default '~/.ncnn/models'
        Location for keeping the model parameters.
    """
    root = os.path.expanduser(root)
    if not os.path.isdir(root):
        # Nothing has been downloaded yet.
        return

    model_suffixes = (".param", ".bin", ".params")
    for f in os.listdir(root):
        path = os.path.join(root, f)
        if f.endswith(model_suffixes) and os.path.isfile(path):
            os.remove(path)
class NanoDet:
    """NanoDet-m object detector executed through ncnn.

    Calling an instance with an image array returns a list of
    ``Detect_Object`` (label index into ``class_names``, score, x, y, w, h).
    """

    def __init__(
        self,
        target_size=320,
        prob_threshold=0.4,
        nms_threshold=0.3,
        num_threads=1,
        use_gpu=False,
    ):
        """Create the network and load the pretrained weights.

        Parameters
        ----------
        target_size : int
            Length the longer image side is resized to.
        prob_threshold : float
            Per-class scores must be greater than this to be considered.
        nms_threshold : float
            IoU threshold forwarded to ``nms``.
        num_threads : int
            Value written to ``net.opt.num_threads``.
        use_gpu : bool
            Value written to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.prob_threshold = prob_threshold
        self.nms_threshold = nms_threshold
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [103.53, 116.28, 123.675]
        self.norm_vals = [0.017429, 0.017507, 0.017125]

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        # original pretrained model from https://github.com/RangiLyu/nanodet
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("nanodet_m.param"))
        self.net.load_model(get_model_file("nanodet_m.bin"))

        # Each box side is predicted as a distribution over reg_max + 1 bins.
        self.reg_max = 7
        self.strides = [8, 16, 32]
        # Candidates kept per stride before NMS.
        self.num_candidate = 1000
        self.top_k = -1

        self.class_names = [
            "person", "bicycle", "car", "motorcycle", "airplane",
            "bus", "train", "truck", "boat", "traffic light",
            "fire hydrant", "stop sign", "parking meter", "bench", "bird",
            "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "backpack",
            "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat",
            "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
            "wine glass", "cup", "fork", "knife", "spoon",
            "bowl", "banana", "apple", "sandwich", "orange",
            "broccoli", "carrot", "hot dog", "pizza", "donut",
            "cake", "chair", "couch", "potted plant", "bed",
            "dining table", "toilet", "tv", "laptop", "mouse",
            "remote", "keyboard", "cell phone", "microwave", "oven",
            "toaster", "sink", "refrigerator", "book", "clock",
            "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
        ]

    def __del__(self):
        # Drop the reference to the network when the wrapper is collected.
        self.net = None

    def __call__(self, img):
        """Detect objects in ``img`` and return a list of ``Detect_Object``.

        ``img.shape[0]`` / ``img.shape[1]`` are used as height / width and the
        pixels are handed to ncnn as ``PIXEL_BGR``. An empty list is returned
        when no score exceeds ``prob_threshold``.
        """
        img_w = img.shape[1]
        img_h = img.shape[0]

        # Resize so that the longer side equals target_size.
        w = img_w
        h = img_h
        scale = 1.0
        if w > h:
            scale = float(self.target_size) / w
            w = self.target_size
            h = int(h * scale)
        else:
            scale = float(self.target_size) / h
            h = self.target_size
            w = int(w * scale)

        mat_in = ncnn.Mat.from_pixels_resize(
            img, ncnn.Mat.PixelType.PIXEL_BGR, img_w, img_h, w, h
        )

        # Pad both sides up to the next multiple of 32.
        wpad = (w + 31) // 32 * 32 - w
        hpad = (h + 31) // 32 * 32 - h
        mat_in_pad = ncnn.copy_make_border(
            mat_in,
            hpad // 2,
            hpad - hpad // 2,
            wpad // 2,
            wpad - wpad // 2,
            ncnn.BorderType.BORDER_CONSTANT,
            0,
        )

        mat_in_pad.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("input.1", mat_in_pad)

        # One score blob and one box blob per stride, flattened to
        # (locations, 80) and (locations, 32) respectively.
        score_out_name = ["792", "814", "836"]
        scores = [ex.extract(x)[1] for x in score_out_name]
        scores = [np.reshape(x, (-1, 80)) for x in scores]

        boxes_out_name = ["795", "817", "839"]
        raw_boxes = [ex.extract(x)[1] for x in boxes_out_name]
        raw_boxes = [np.reshape(x, (-1, 32)) for x in raw_boxes]

        # generate centers
        decode_boxes = []
        select_scores = []
        for stride, box_distribute, score in zip(self.strides, raw_boxes, scores):
            # Feature-map size: one side from the padded input, the other from
            # the number of locations, which is score.shape[0] after the
            # reshape above (score.shape[1] is the class count).
            if mat_in_pad.w > mat_in_pad.h:
                fm_w = mat_in_pad.w // stride
                fm_h = score.shape[0] // fm_w
            else:
                fm_h = mat_in_pad.h // stride
                fm_w = score.shape[0] // fm_h

            h_range = np.arange(fm_h)
            w_range = np.arange(fm_w)
            ww, hh = np.meshgrid(w_range, h_range)
            ct_row = (hh.flatten() + 0.5) * stride
            ct_col = (ww.flatten() + 0.5) * stride
            center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1)

            # box distribution to distance: expectation over the bins
            reg_range = np.arange(self.reg_max + 1)
            box_distance = box_distribute.reshape((-1, self.reg_max + 1))
            box_distance = softmax(box_distance)
            box_distance = box_distance * np.expand_dims(reg_range, axis=0)
            box_distance = np.sum(box_distance, axis=1).reshape((-1, 4))
            box_distance = box_distance * stride

            # top K candidate, ranked by best class score
            topk_idx = np.argsort(score.max(axis=1))[::-1]
            topk_idx = topk_idx[: self.num_candidate]
            center = center[topk_idx]
            score = score[topk_idx]
            box_distance = box_distance[topk_idx]

            # decode box: (x1, y1, x2, y2) = center -/+ distances
            decode_box = center + [-1, -1, 1, 1] * box_distance

            select_scores.append(score)
            decode_boxes.append(decode_box)

        # nms, run independently for every class
        bboxes = np.concatenate(decode_boxes, axis=0)
        confidences = np.concatenate(select_scores, axis=0)

        picked_box = []
        picked_probs = []
        picked_labels = []

        for class_index in range(0, confidences.shape[1]):
            probs = confidences[:, class_index]
            mask = probs > self.prob_threshold
            probs = probs[mask]
            if probs.shape[0] == 0:
                continue
            subset_boxes = bboxes[mask, :]
            picked = nms(
                subset_boxes,
                probs,
                iou_threshold=self.nms_threshold,
                top_k=self.top_k,
            )
            picked_box.append(subset_boxes[picked])
            picked_probs.append(probs[picked])
            picked_labels.extend([class_index] * len(picked))

        if not picked_box:
            return []

        picked_box = np.concatenate(picked_box)
        picked_probs = np.concatenate(picked_probs)

        # result with clip: undo padding and scaling, limit to the padded input
        objects = [
            Detect_Object(
                label,
                score,
                (bbox[0] - wpad / 2) / scale if bbox[0] > 0 else 0,
                (bbox[1] - hpad / 2) / scale if bbox[1] > 0 else 0,
                (bbox[2] - bbox[0]) / scale
                if bbox[2] < mat_in_pad.w
                else (mat_in_pad.w - bbox[0]) / scale,
                (bbox[3] - bbox[1]) / scale
                if bbox[3] < mat_in_pad.h
                else (mat_in_pad.h - bbox[1]) / scale,
            )
            for label, score, bbox in zip(picked_labels, picked_probs, picked_box)
        ]

        return objects
def __call__(self, img):
    """Run detection and segmentation on ``img``.

    Returns a tuple ``(objects, resized)``: ``objects`` is a list of
    ``Detect_Object`` in image pixels and ``resized`` is the "sigmoid" output
    blob resized bilinearly to the image width and height.
    """
    img_h, img_w = img.shape[0], img.shape[1]

    mat_in = ncnn.Mat.from_pixels_resize(
        img,
        ncnn.Mat.PixelType.PIXEL_BGR,
        img_w,
        img_h,
        self.target_size,
        self.target_size,
    )
    mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

    extractor = self.net.create_extractor()
    extractor.input("data", mat_in)
    _, mat_out = extractor.extract("detection_out")

    # Each output row is read as [label, prob, x1, y1, x2, y2]; the
    # coordinates are multiplied by the image size to obtain pixels.
    # (np.array(mat_out) could be iterated the same way instead of
    # mat_out.row(i).)
    detections = []
    for row_index in range(mat_out.h):
        row = mat_out.row(row_index)

        det = Detect_Object()
        det.label = row[0]
        det.prob = row[1]
        det.rect.x = row[2] * img_w
        det.rect.y = row[3] * img_h
        det.rect.w = row[4] * img_w - det.rect.x
        det.rect.h = row[5] * img_h - det.rect.y

        detections.append(det)

    # Second output of the same extractor, brought back to image resolution.
    _, seg_out = extractor.extract("sigmoid")
    resized = ncnn.Mat()
    ncnn.resize_bilinear(seg_out, resized, img_w, img_h)

    return detections, resized
class RetinaFace:
    """RetinaFace (mnet.25) face detector executed through ncnn.

    Calling an instance with an image array returns a list of ``Face_Object``
    with a bounding rectangle, five landmark points and a score.
    """

    def __init__(
        self, prob_threshold=0.8, nms_threshold=0.4, num_threads=1, use_gpu=False
    ):
        """Create the network and load the pretrained weights.

        Parameters
        ----------
        prob_threshold : float
            Minimum score (inclusive) for a location to become a proposal.
        nms_threshold : float
            IoU above which a lower-scored proposal is suppressed.
        num_threads : int
            Value written to ``net.opt.num_threads``.
        use_gpu : bool
            Value written to ``net.opt.use_vulkan_compute``.
        """
        self.prob_threshold = prob_threshold
        self.nms_threshold = nms_threshold
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.net = ncnn.Net()
        self.net.opt.num_threads = self.num_threads
        self.net.opt.use_vulkan_compute = self.use_gpu

        # model is converted from
        # https://github.com/deepinsight/insightface/tree/master/RetinaFace#retinaface-pretrained-models
        # https://github.com/deepinsight/insightface/issues/669
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("mnet.25-opt.param"))
        self.net.load_model(get_model_file("mnet.25-opt.bin"))

    def __del__(self):
        # Drop the reference to the network when the wrapper is collected.
        self.net = None

    def __call__(self, img):
        """Detect faces in ``img`` and return a list of ``Face_Object``.

        The image is fed at its own resolution (BGR->RGB conversion only).
        Proposals of the three strides are merged, sorted by score, filtered
        with NMS and finally clipped to the image.
        """
        img_h = img.shape[0]
        img_w = img.shape[1]

        mat_in = ncnn.Mat.from_pixels(
            img, ncnn.Mat.PixelType.PIXEL_BGR2RGB, img_w, img_h
        )

        ex = self.net.create_extractor()
        ex.input("data", mat_in)

        faceobjects32 = self.detect_stride32(ex)
        faceobjects16 = self.detect_stride16(ex)
        faceobjects8 = self.detect_stride8(ex)

        faceproposals = [*faceobjects32, *faceobjects16, *faceobjects8]

        # sort all proposals by score from highest to lowest
        faceproposals.sort(key=lambda obj: obj.prob, reverse=True)

        # apply nms with nms_threshold
        picked = self.nms_sorted_bboxes(faceproposals, self.nms_threshold)

        faceobjects = []
        for index in picked:
            face = faceproposals[index]

            # clip to image size
            x0 = face.rect.x
            y0 = face.rect.y
            x1 = x0 + face.rect.w
            y1 = y0 + face.rect.h

            x0 = np.maximum(np.minimum(x0, float(img_w) - 1), 0.0)
            y0 = np.maximum(np.minimum(y0, float(img_h) - 1), 0.0)
            x1 = np.maximum(np.minimum(x1, float(img_w) - 1), 0.0)
            y1 = np.maximum(np.minimum(y1, float(img_h) - 1), 0.0)

            face.rect.x = x0
            face.rect.y = y0
            face.rect.w = x1 - x0
            face.rect.h = y1 - y0

            faceobjects.append(face)

        return faceobjects

    def _detect_stride(self, ex, feat_stride, scale_values):
        """Extract the three blobs of one stride and turn them into proposals."""
        suffix = "stride%d" % feat_stride
        ret1, score_blob = ex.extract("face_rpn_cls_prob_reshape_" + suffix)
        ret2, bbox_blob = ex.extract("face_rpn_bbox_pred_" + suffix)
        ret3, landmark_blob = ex.extract("face_rpn_landmark_pred_" + suffix)

        base_size = 16
        ratios = ncnn.Mat(1)
        ratios[0] = 1.0
        scales = ncnn.Mat(len(scale_values))
        for k, value in enumerate(scale_values):
            scales[k] = value
        anchors = self.generate_anchors(base_size, ratios, scales)

        return self.generate_proposals(
            anchors,
            feat_stride,
            score_blob,
            bbox_blob,
            landmark_blob,
            self.prob_threshold,
        )

    def detect_stride32(self, ex):
        """Proposals of the stride-32 head (anchor scales 32 and 16)."""
        return self._detect_stride(ex, 32, (32.0, 16.0))

    def detect_stride16(self, ex):
        """Proposals of the stride-16 head (anchor scales 8 and 4)."""
        return self._detect_stride(ex, 16, (8.0, 4.0))

    def detect_stride8(self, ex):
        """Proposals of the stride-8 head (anchor scales 2 and 1)."""
        return self._detect_stride(ex, 8, (2.0, 1.0))

    def generate_anchors(self, base_size, ratios, scales):
        """Build the base anchors for every (ratio, scale) combination.

        Parameters
        ----------
        base_size : number
            Side of the reference square; anchors are centred on its middle.
        ratios, scales : ncnn.Mat
            1-D mats; ``.w`` gives the number of entries.

        Returns
        -------
        ncnn.Mat
            One row ``[x0, y0, x1, y1]`` per combination, ratio-major.
        """
        num_ratio = ratios.w
        num_scale = scales.w

        # One row per combination (this used to be a fixed 2-row buffer).
        anchors_np = np.zeros((num_ratio * num_scale, 4), dtype=np.float32)

        cx = base_size * 0.5
        cy = base_size * 0.5

        for i in range(num_ratio):
            ar = ratios[i]

            r_w = np.round(base_size / np.sqrt(ar))
            r_h = np.round(r_w * ar)  # round(base_size * np.sqrt(ar))

            for j in range(num_scale):
                scale = scales[j]

                rs_w = r_w * scale
                rs_h = r_h * scale

                anchor = anchors_np[i * num_scale + j]

                anchor[0] = cx - rs_w * 0.5
                anchor[1] = cy - rs_h * 0.5
                anchor[2] = cx + rs_w * 0.5
                anchor[3] = cy + rs_h * 0.5

        anchors = ncnn.Mat(anchors_np)
        return anchors

    def generate_proposals(
        self, anchors, feat_stride, score_blob, bbox_blob, landmark_blob, prob_threshold
    ):
        """Decode one stride's blobs into ``Face_Object`` proposals.

        For every anchor row and every feature-map cell whose score is at
        least ``prob_threshold``, the anchor (shifted by ``feat_stride`` per
        cell) is adjusted with the bbox deltas and the five landmarks are
        placed relative to the anchor centre.
        """
        faceobjects = []

        w = score_blob.w
        h = score_blob.h

        # generate face proposal from bbox deltas and shifted anchors
        num_anchors = anchors.h

        for q in range(num_anchors):
            anchor = anchors.row(q)

            # Scores of anchor q are read from channel q + num_anchors.
            score = score_blob.channel(q + num_anchors)
            bbox = bbox_blob.channel_range(q * 4, 4)
            landmark = landmark_blob.channel_range(q * 10, 10)

            # Channel views do not depend on the cell, so take them once.
            dx_map = bbox.channel(0)
            dy_map = bbox.channel(1)
            dw_map = bbox.channel(2)
            dh_map = bbox.channel(3)
            landmark_maps = [landmark.channel(k) for k in range(10)]

            # shifted anchor
            anchor_y = anchor[1]

            anchor_w = anchor[2] - anchor[0]
            anchor_h = anchor[3] - anchor[1]

            for i in range(h):
                anchor_x = anchor[0]

                for j in range(w):
                    index = i * w + j

                    prob = score[index]

                    if prob >= prob_threshold:
                        # apply center size
                        dx = dx_map[index]
                        dy = dy_map[index]
                        dw = dw_map[index]
                        dh = dh_map[index]

                        cx = anchor_x + anchor_w * 0.5
                        cy = anchor_y + anchor_h * 0.5

                        pb_cx = cx + anchor_w * dx
                        pb_cy = cy + anchor_h * dy

                        pb_w = anchor_w * np.exp(dw)
                        pb_h = anchor_h * np.exp(dh)

                        x0 = pb_cx - pb_w * 0.5
                        y0 = pb_cy - pb_h * 0.5
                        x1 = pb_cx + pb_w * 0.5
                        y1 = pb_cy + pb_h * 0.5

                        obj = Face_Object()
                        obj.rect.x = x0
                        obj.rect.y = y0
                        obj.rect.w = x1 - x0 + 1
                        obj.rect.h = y1 - y0 + 1

                        # Landmark k uses channels 2k (x) and 2k + 1 (y).
                        obj.landmark = [Point() for _ in range(5)]
                        for k, point in enumerate(obj.landmark):
                            point.x = cx + (anchor_w + 1) * landmark_maps[2 * k][index]
                            point.y = (
                                cy + (anchor_h + 1) * landmark_maps[2 * k + 1][index]
                            )

                        obj.prob = prob

                        faceobjects.append(obj)

                    anchor_x += feat_stride

                anchor_y += feat_stride

        return faceobjects

    def nms_sorted_bboxes(self, faceobjects, nms_threshold):
        """Greedy NMS over proposals sorted by descending score.

        Returns the indices of the proposals that are kept. A proposal is
        dropped when its IoU with an already kept one is strictly greater
        than ``nms_threshold``.
        """
        areas = [face.rect.area() for face in faceobjects]

        picked = []
        for i, candidate in enumerate(faceobjects):
            keep = True
            for j in picked:
                # intersection over union
                inter_area = candidate.rect.intersection_area(faceobjects[j].rect)
                union_area = areas[i] + areas[j] - inter_area
                # Two zero-area boxes give a zero union; they cannot overlap
                # and must not trigger a division by zero.
                if union_area > 0 and inter_area / union_area > nms_threshold:
                    keep = False
                    break  # already rejected, no need to test further

            if keep:
                picked.append(i)

        return picked
class RFCN:
    """R-FCN object detector (PASCAL VOC classes) running on ncnn.

    Calling an instance with a BGR image returns a list of ``Detect_Object``
    sorted by descending score and truncated to ``max_per_image`` entries.
    """

    def __init__(
        self,
        target_size=224,
        max_per_image=100,
        confidence_thresh=0.6,
        nms_threshold=0.3,
        num_threads=1,
        use_gpu=False,
    ):
        self.target_size = target_size
        self.max_per_image = max_per_image
        self.confidence_thresh = confidence_thresh
        self.nms_threshold = nms_threshold
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # per-channel mean only; no scaling is applied
        self.mean_vals = [102.9801, 115.9465, 122.7717]
        self.norm_vals = []

        self.net = ncnn.Net()
        options = self.net.opt
        options.num_threads = self.num_threads
        options.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/YuwenXiong/py-R-FCN
        # https://github.com/YuwenXiong/py-R-FCN/blob/master/models/pascal_voc/ResNet-50/rfcn_end2end/test_agnostic.prototxt
        # https://1drv.ms/u/s!AoN7vygOjLIQqUWHpY67oaC7mopf
        # resnet50_rfcn_final.caffemodel
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("rfcn_end2end.param"))
        self.net.load_model(get_model_file("rfcn_end2end.bin"))

        self.class_names = [
            "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
            "bus", "car", "cat", "chair", "cow", "diningtable", "dog",
            "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
            "train", "tvmonitor",
        ]

    def __del__(self):
        self.net = None

    def __call__(self, img):
        """Detect objects in ``img`` (indexed as ``img.shape[0]`` rows by
        ``img.shape[1]`` columns, fed to ncnn as BGR pixels)."""
        src_h = img.shape[0]
        src_w = img.shape[1]

        # resize so that the shorter side becomes target_size
        if src_w < src_h:
            scale = float(self.target_size) / src_w
            net_w = self.target_size
            net_h = src_h * scale
        else:
            scale = float(self.target_size) / src_h
            net_h = self.target_size
            net_w = src_w * scale

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            src_w,
            src_h,
            int(net_w),
            int(net_h),
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        im_info = ncnn.Mat(3)
        im_info[0] = net_h
        im_info[1] = net_w
        im_info[2] = scale

        # step 1: shared feature maps and the full set of rois
        trunk = self.net.create_extractor()
        trunk.input("data", mat_in)
        trunk.input("im_info", im_info)
        _, rfcn_cls = trunk.extract("rfcn_cls")
        _, rfcn_bbox = trunk.extract("rfcn_bbox")
        _, rois = trunk.extract("rois")

        max_x = float(src_w - 1)
        max_y = float(src_h - 1)

        # step 2: score and regress every roi on its own extractor
        per_class = []
        for roi_idx in range(rois.c):
            head = self.net.create_extractor()
            roi = rois.channel(roi_idx)
            head.input("rfcn_cls", rfcn_cls)
            head.input("rfcn_bbox", rfcn_bbox)
            head.input("rois", roi)
            _, bbox_pred = head.extract("bbox_pred")
            _, cls_prob = head.extract("cls_prob")

            num_class = cls_prob.w
            per_class.extend([] for _ in range(num_class - len(per_class)))

            # best-scoring class; ties and non-positive scores keep label 0
            label = 0
            score = 0.0
            for cls_idx in range(num_class):
                candidate = cls_prob[cls_idx]
                if candidate > score:
                    label, score = cls_idx, candidate

            # drop background and low-confidence rois
            if label == 0 or score <= self.confidence_thresh:
                continue

            # roi back in source-image coordinates
            x1 = roi[0] / scale
            y1 = roi[1] / scale
            x2 = roi[2] / scale
            y2 = roi[3] / scale
            roi_w = x2 - x1 + 1
            roi_h = y2 - y1 + 1

            # regression deltas are read from entries 4..7 of bbox_pred
            dx, dy = bbox_pred[4], bbox_pred[4 + 1]
            dw, dh = bbox_pred[4 + 2], bbox_pred[4 + 3]

            ctr_x = x1 + roi_w * 0.5
            ctr_y = y1 + roi_h * 0.5
            out_cx = ctr_x + roi_w * dx
            out_cy = ctr_y + roi_h * dy
            out_w = roi_w * np.exp(dw)
            out_h = roi_h * np.exp(dh)

            # clip all four corners to the source image
            out_x1 = np.maximum(np.minimum(out_cx - out_w * 0.5, max_x), 0.0)
            out_y1 = np.maximum(np.minimum(out_cy - out_h * 0.5, max_y), 0.0)
            out_x2 = np.maximum(np.minimum(out_cx + out_w * 0.5, max_x), 0.0)
            out_y2 = np.maximum(np.minimum(out_cy + out_h * 0.5, max_y), 0.0)

            det = Detect_Object()
            det.rect.x = out_x1
            det.rect.y = out_y1
            det.rect.w = out_x2 - out_x1 + 1
            det.rect.h = out_y2 - out_y1 + 1
            det.label = label
            det.prob = score
            per_class[label].append(det)

        # per-class NMS, then a global sort by score
        survivors = []
        for candidates in per_class:
            if len(candidates) == 0:
                continue
            candidates.sort(key=lambda d: d.prob, reverse=True)
            picked = self.nms_sorted_bboxes(candidates, self.nms_threshold)
            survivors.extend(candidates[k] for k in picked)

        ranked = sorted(survivors, key=lambda d: d.prob, reverse=True)
        return ranked[: self.max_per_image]

    def nms_sorted_bboxes(self, objects, nms_threshold):
        """Greedy NMS over ``objects`` in the given order; returns kept indices."""
        areas = np.array([o.rect.area() for o in objects], dtype=np.float32)

        kept = []
        for cand_idx, cand in enumerate(objects):
            suppressed = False
            for kept_idx in kept:
                overlap = cand.rect.intersection_area(objects[kept_idx].rect)
                combined = areas[cand_idx] + areas[kept_idx] - overlap
                if overlap / combined > nms_threshold:
                    suppressed = True
            if not suppressed:
                kept.append(cand_idx)

        return kept
class ShuffleNetV2:
    """ShuffleNet v2 (x0.5) image classifier running on ncnn.

    Calling an instance with a BGR image returns a 1-D numpy array of class
    scores obtained by applying a Softmax layer to the network's "fc" blob.
    """

    def __init__(self, target_size=224, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # no mean subtraction; pixels are only scaled by 1/255
        self.mean_vals = []
        self.norm_vals = [1 / 255.0, 1 / 255.0, 1 / 255.0]

        self.net = ncnn.Net()
        options = self.net.opt
        options.num_threads = self.num_threads
        options.use_vulkan_compute = self.use_gpu

        # https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe
        # models can be downloaded from https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe/releases
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("shufflenet_v2_x0.5.param"))
        self.net.load_model(get_model_file("shufflenet_v2_x0.5.bin"))

    def __del__(self):
        self.net = None

    def __call__(self, img):
        src_h, src_w = img.shape[0], img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            src_w,
            src_h,
            self.target_size,
            self.target_size,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _, logits = extractor.extract("fc")

        # The "fc" output is turned into probabilities with a standalone
        # Softmax layer; skip this if the model already ends in softmax.
        softmax = ncnn.create_layer("Softmax")
        params = ncnn.ParamDict()
        softmax.load_param(params)
        softmax.forward_inplace(logits, self.net.opt)

        flat = logits.reshape(logits.w * logits.h * logits.c)
        return np.array(flat)
class SimplePose:
    """Single-person pose estimator (gluon-cv "simple baseline") on ncnn.

    Calling an instance with a BGR image returns one ``KeyPoint`` per output
    channel: the location of the strongest heat-map response, rescaled to the
    input image, together with that response as ``prob``.
    """

    def __init__(
        self, target_width=192, target_height=256, num_threads=1, use_gpu=False
    ):
        self.target_width = target_width
        self.target_height = target_height
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [0.485 * 255.0, 0.456 * 255.0, 0.406 * 255.0]
        self.norm_vals = [1 / 0.229 / 255.0, 1 / 0.224 / 255.0, 1 / 0.225 / 255.0]

        self.net = ncnn.Net()
        options = self.net.opt
        options.num_threads = self.num_threads
        options.use_vulkan_compute = self.use_gpu

        # the simple baseline human pose estimation from gluon-cv
        # https://gluon-cv.mxnet.io/build/examples_pose/demo_simple_pose.html
        # mxnet model exported via
        #      pose_net.hybridize()
        #      pose_net.export('pose')
        # then mxnet2ncnn
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("pose.param"))
        self.net.load_model(get_model_file("pose.bin"))

    def __del__(self):
        self.net = None

    def __call__(self, img):
        src_h = img.shape[0]
        src_w = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            src_w,
            src_h,
            self.target_width,
            self.target_height,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _, heatmaps = extractor.extract("conv3_fwd")

        map_w = heatmaps.w
        map_h = heatmaps.h

        keypoints = []
        for joint in range(heatmaps.c):
            heat = heatmaps.channel(joint)

            # first strict maximum above 0.0; stays at (0, 0) with prob 0.0
            # when no response is positive
            best_prob, best_x, best_y = 0.0, 0, 0
            for y in range(map_h):
                line = heat.row(y)
                for x in range(map_w):
                    value = line[x]
                    if value > best_prob:
                        best_prob, best_x, best_y = value, x, y

            kp = KeyPoint()
            kp.p.x = best_x * src_w / float(map_w)
            kp.p.y = best_y * src_h / float(map_h)
            kp.prob = best_prob
            keypoints.append(kp)

        return keypoints
class SqueezeNet:
    """SqueezeNet v1.1 image classifier running on ncnn.

    Calling an instance with a BGR image returns the network's "prob" blob
    converted to a numpy array.
    """

    def __init__(self, target_size=227, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # mean subtraction only; no scaling
        self.mean_vals = [104.0, 117.0, 123.0]
        self.norm_vals = []

        self.net = ncnn.Net()
        options = self.net.opt
        options.num_threads = self.num_threads
        options.use_vulkan_compute = self.use_gpu

        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("squeezenet_v1.1.param"))
        self.net.load_model(get_model_file("squeezenet_v1.1.bin"))

    def __del__(self):
        self.net = None

    def __call__(self, img):
        src_h, src_w = img.shape[0], img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            src_w,
            src_h,
            self.target_size,
            self.target_size,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _, probabilities = extractor.extract("prob")

        return np.array(probabilities)
class SqueezeNet_SSD:
    """SqueezeNet-SSD object detector (PASCAL VOC classes) running on ncnn.

    Calling an instance with a BGR image returns a list of ``Detect_Object``
    built from the rows of the "detection_out" blob.
    """

    def __init__(self, target_size=300, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # mean subtraction only; no scaling
        self.mean_vals = [104.0, 117.0, 123.0]
        self.norm_vals = []

        self.net = ncnn.Net()
        options = self.net.opt
        options.num_threads = self.num_threads
        options.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/chuanqi305/SqueezeNet-SSD
        # squeezenet_ssd_voc_deploy.prototxt
        # https://drive.google.com/open?id=0B3gersZ2cHIxdGpyZlZnbEQ5Snc
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("squeezenet_ssd_voc.param"))
        self.net.load_model(get_model_file("squeezenet_ssd_voc.bin"))

        self.class_names = [
            "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
            "bus", "car", "cat", "chair", "cow", "diningtable", "dog",
            "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
            "train", "tvmonitor",
        ]

    def __del__(self):
        self.net = None

    def __call__(self, img):
        src_h = img.shape[0]
        src_w = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            src_w,
            src_h,
            self.target_size,
            self.target_size,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _, detections = extractor.extract("detection_out")

        # Each output row is read as [label, prob, x0, y0, x1, y1]; the
        # coordinates are multiplied by the source image size here.
        # Rows are read through ncnn.Mat.row(); iterating np.array(detections)
        # row by row is an equivalent alternative.
        results = []
        for row_idx in range(detections.h):
            row = detections.row(row_idx)

            det = Detect_Object()
            det.label = row[0]
            det.prob = row[1]
            det.rect.x = row[2] * src_w
            det.rect.y = row[3] * src_h
            det.rect.w = row[4] * src_w - det.rect.x
            det.rect.h = row[5] * src_h - det.rect.y
            results.append(det)

        return results
class Yolact:
    """YOLACT instance segmentation (COCO classes) running on ncnn.

    Calling an instance with a BGR image returns
    ``(boxes, masks, classes, scores)``:

    - ``boxes``:   ``[num_dets, 4]`` as ``[x, y, w, h]`` in image pixels
    - ``masks``:   ``[num_dets, img_h, img_w]`` boolean masks
    - ``classes``: ``[num_dets]`` indices into ``class_names`` (never 0)
    - ``scores``:  ``[num_dets]`` confidences, highest first when the
      ``keep_top_k`` cut is applied
    """

    def __init__(
        self,
        target_size=550,
        confidence_threshold=0.05,
        nms_threshold=0.5,
        keep_top_k=200,
        num_threads=1,
        use_gpu=False,
    ):
        self.target_size = target_size
        self.confidence_threshold = confidence_threshold
        self.nms_threshold = nms_threshold
        self.keep_top_k = keep_top_k
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [123.68, 116.78, 103.94]
        self.norm_vals = [1.0 / 58.40, 1.0 / 57.12, 1.0 / 57.38]

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        # original model converted from https://github.com/dbolya/yolact
        # yolact_resnet50_54_800000.pth
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("yolact.param"))
        self.net.load_model(get_model_file("yolact.bin"))

        # feature-map sizes and anchor scales of the five prediction levels
        self.conv_ws = [69, 35, 18, 9, 5]
        self.conv_hs = [69, 35, 18, 9, 5]
        self.aspect_ratios = [1, 0.5, 2]
        self.scales = [24, 48, 96, 192, 384]

        # priors are cached and only rebuilt when target_size changes
        self.priors = None
        self.last_img_size = None
        self.make_priors()

        self.class_names = [
            "background", "person", "bicycle", "car", "motorcycle",
            "airplane", "bus", "train", "truck", "boat", "traffic light",
            "fire hydrant", "stop sign", "parking meter", "bench", "bird",
            "cat", "dog", "horse", "sheep", "cow", "elephant", "bear",
            "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie",
            "suitcase", "frisbee", "skis", "snowboard", "sports ball",
            "kite", "baseball bat", "baseball glove", "skateboard",
            "surfboard", "tennis racket", "bottle", "wine glass", "cup",
            "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
            "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
            "cake", "chair", "couch", "potted plant", "bed", "dining table",
            "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
            "cell phone", "microwave", "oven", "toaster", "sink",
            "refrigerator", "book", "clock", "vase", "scissors",
            "teddy bear", "hair drier", "toothbrush",
        ]

    def __del__(self):
        self.net = None

    def __call__(self, img):
        img_h = img.shape[0]
        img_w = img.shape[1]

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            img_w,
            img_h,
            self.target_size,
            self.target_size,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("input.1", mat_in)

        ret1, proto_data = ex.extract("619")  # 138x138 x 32
        ret2, loc_data = ex.extract("816")  # 4 x 19248
        ret3, mask_data = ex.extract("818")  # maskdim 32 x 19248
        ret4, conf_data = ex.extract("820")  # 81 x 19248

        proto_data = np.array(proto_data)
        loc_data = np.array(loc_data)
        mask_data = np.array(mask_data)
        conf_data = np.array(conf_data)

        prior_data = self.make_priors()

        boxes, mask_coeffs, classes, scores = self.detect(
            conf_data, loc_data, prior_data, mask_data, img_w, img_h
        )

        # Nothing detected: return consistently shaped empty results instead
        # of pushing a zero-channel image through cv2.resize.
        if len(scores) == 0:
            empty_masks = np.zeros((0, img_h, img_w), dtype=bool)
            return boxes, empty_masks, classes, scores

        # combine prototypes with the per-detection coefficients
        masks = proto_data.transpose(1, 2, 0) @ mask_coeffs.T
        masks = sigmoid(masks)

        # scale masks up to the full image
        masks = cv2.resize(masks, (img_w, img_h), interpolation=cv2.INTER_LINEAR)
        # cv2.resize drops the channel axis when there is exactly one
        # detection; restore it so the transpose below stays valid.
        if masks.ndim == 2:
            masks = masks[:, :, np.newaxis]

        # transpose into the output shape [num_dets, img_h, img_w]
        masks = masks.transpose(2, 0, 1)
        masks = masks > 0.5

        return boxes, masks, classes, scores

    def make_priors(self):
        """Return the prior boxes as an ``[N, 4]`` array.

        Priors are ``[x, y, width, height]`` where ``(x, y)`` is the centre of
        the box, all relative to ``target_size``. The result is cached in
        ``self.priors`` and only recomputed when ``target_size`` differs from
        the size used for the cached value.
        """
        current_size = (self.target_size, self.target_size)
        if self.last_img_size != current_size:
            prior_data = []
            for conv_w, conv_h, scale in zip(self.conv_ws, self.conv_hs, self.scales):
                for i in range(conv_h):
                    for j in range(conv_w):
                        # +0.5 because priors are in center-size notation
                        cx = (j + 0.5) / conv_w
                        cy = (i + 0.5) / conv_h

                        for ar in self.aspect_ratios:
                            w = scale * sqrt(ar) / self.target_size
                            # Priors are deliberately square (h == w): this
                            # keeps backward compatibility with a quirk the
                            # original code documents.
                            h = w
                            prior_data += [cx, cy, w, h]

            self.priors = np.array(prior_data).reshape(-1, 4)
            self.last_img_size = current_size
        return self.priors

    def decode(self, loc, priors, img_w, img_h):
        """Decode regression output into pixel boxes.

        The centre is ``prior_xy + loc_xy * 0.1 * prior_wh`` and the size is
        ``prior_wh * exp(loc_wh * 0.2)``. The resulting box is cropped to the
        unit square and then scaled to the image.

        Args:
            loc: predicted offsets, ``[num_priors, 4]``.
            priors: prior boxes ``[cx, cy, w, h]`` relative to the image,
                ``[num_priors, 4]``.
            img_w, img_h: size of the image the boxes are scaled to.

        Returns:
            ``[num_priors, 4]`` array of ``[x, y, w, h]`` where ``(x, y)`` is
            the top-left corner in pixels and ``w``/``h`` are the pixel extent
            plus one.
        """
        variances = [0.1, 0.2]

        centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
        sizes = priors[:, 2:] * np.exp(loc[:, 2:] * variances[1])

        # Crop in corner form. The original code called np.where() here and
        # discarded every result, so no cropping took place at all.
        top_left = np.clip(centers - sizes / 2, 0.0, 1.0)
        bottom_right = np.clip(centers + sizes / 2, 0.0, 1.0)

        boxes = np.concatenate((top_left, bottom_right - top_left), 1)

        # decode to image size
        boxes[:, 0] *= img_w
        boxes[:, 1] *= img_h
        boxes[:, 2] = boxes[:, 2] * img_w + 1
        boxes[:, 3] = boxes[:, 3] * img_h + 1
        return boxes

    def detect(self, conf_preds, loc_data, prior_data, mask_data, img_w, img_h):
        """Select detections: per-class NMS on each prior's best non-background class.

        Args:
            conf_preds: ``[num_priors, 1 + num_classes]`` scores; column 0 is
                background and is ignored.
            loc_data, prior_data: inputs for :meth:`decode`.
            mask_data: ``[num_priors, mask_dim]`` mask coefficients.
            img_w, img_h: image size passed on to :meth:`decode`.

        Returns:
            ``(boxes, mask_coefficients, classes, scores)`` as numpy arrays
            whose first dimension is the number of detections (possibly 0).
            ``classes`` is offset by one so that it indexes ``class_names``.
        """
        cur_scores = conf_preds[:, 1:]
        num_class = cur_scores.shape[1]
        classes = np.argmax(cur_scores, axis=1)
        conf_scores = cur_scores[np.arange(cur_scores.shape[0]), classes]

        # filter by confidence_threshold
        keep = conf_scores > self.confidence_threshold
        conf_scores = conf_scores[keep]
        classes = classes[keep]
        masks = mask_data[keep, :]

        # decode x, y, w, h
        boxes = self.decode(loc_data[keep, :], prior_data[keep, :], img_w, img_h)

        # NMS for every class; collect indices into the filtered arrays
        selected = []
        for cls in range(num_class):
            members = np.flatnonzero(classes == cls)
            if members.size == 0:
                continue
            picked = nms(
                boxes[members],
                conf_scores[members],
                iou_threshold=self.nms_threshold,
                top_k=self.keep_top_k,
            )
            selected.extend(int(members[p]) for p in picked)

        selected = np.asarray(selected, dtype=np.intp)

        # keep the top k by score, highest first
        if selected.size > self.keep_top_k:
            order = np.argsort(-conf_scores[selected], kind="stable")
            selected = selected[order[: self.keep_top_k]]

        return (
            boxes[selected],
            masks[selected],
            classes[selected] + 1,
            conf_scores[selected],
        )
class MobileNet_YoloV2:
    """MobileNet-YOLOv2 object detector (PASCAL VOC classes) running on ncnn.

    Calling an instance with a BGR image returns a list of ``Detect_Object``
    built from the rows of the "detection_out" blob.
    """

    def __init__(self, target_size=416, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [1.0, 1.0, 1.0]
        self.norm_vals = [0.007843, 0.007843, 0.007843]

        self.net = ncnn.Net()
        options = self.net.opt
        options.num_threads = self.num_threads
        options.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/eric612/MobileNet-YOLO
        # https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy.prototxt
        # https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy_iter_80000.caffemodel
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("mobilenet_yolo.param"))
        self.net.load_model(get_model_file("mobilenet_yolo.bin"))

        self.class_names = [
            "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
            "bus", "car", "cat", "chair", "cow", "diningtable", "dog",
            "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
            "train", "tvmonitor",
        ]

    def __del__(self):
        self.net = None

    def __call__(self, img):
        src_h = img.shape[0]
        src_w = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            src_w,
            src_h,
            self.target_size,
            self.target_size,
        )
        # Two passes on purpose: scale first, subtract the mean afterwards.
        blob.substract_mean_normalize([], self.norm_vals)
        blob.substract_mean_normalize(self.mean_vals, [])

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _, detections = extractor.extract("detection_out")

        # Each output row is read as [label, prob, x0, y0, x1, y1]; the
        # coordinates are multiplied by the source image size here.
        # Rows are read through ncnn.Mat.row(); iterating np.array(detections)
        # row by row is an equivalent alternative.
        results = []
        for row_idx in range(detections.h):
            row = detections.row(row_idx)

            det = Detect_Object()
            det.label = row[0]
            det.prob = row[1]
            det.rect.x = row[2] * src_w
            det.rect.y = row[3] * src_h
            det.rect.w = row[4] * src_w - det.rect.x
            det.rect.h = row[5] * src_h - det.rect.y
            results.append(det)

        return results
class MobileNetV2_YoloV3:
    """MobileNetV2-YOLOv3 object detector (PASCAL VOC classes) running on ncnn.

    Calling an instance with a BGR image returns a list of ``Detect_Object``
    built from the rows of the "detection_out" blob.
    """

    def __init__(self, target_size=352, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [127.5, 127.5, 127.5]
        self.norm_vals = [0.007843, 0.007843, 0.007843]

        self.net = ncnn.Net()
        options = self.net.opt
        options.num_threads = self.num_threads
        options.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/eric612/MobileNet-YOLO
        # param : https://drive.google.com/open?id=1V9oKHP6G6XvXZqhZbzNKL6FI_clRWdC-
        # bin : https://drive.google.com/open?id=1DBcuFCr-856z3FRQznWL_S5h-Aj3RawA
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("mobilenetv2_yolov3.param"))
        self.net.load_model(get_model_file("mobilenetv2_yolov3.bin"))

        self.class_names = [
            "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
            "bus", "car", "cat", "chair", "cow", "diningtable", "dog",
            "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
            "train", "tvmonitor",
        ]

    def __del__(self):
        self.net = None

    def __call__(self, img):
        src_h = img.shape[0]
        src_w = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            src_w,
            src_h,
            self.target_size,
            self.target_size,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _, detections = extractor.extract("detection_out")

        # Each output row is read as [label, prob, x0, y0, x1, y1]; the
        # coordinates are multiplied by the source image size here.
        # Rows are read through ncnn.Mat.row(); iterating np.array(detections)
        # row by row is an equivalent alternative.
        results = []
        for row_idx in range(detections.h):
            row = detections.row(row_idx)

            det = Detect_Object()
            det.label = row[0]
            det.prob = row[1]
            det.rect.x = row[2] * src_w
            det.rect.y = row[3] * src_h
            det.rect.w = row[4] * src_w - det.rect.x
            det.rect.h = row[5] * src_h - det.rect.y
            results.append(det)

        return results
class YoloV4_Base:
    """Shared implementation of the YOLOv4 / YOLOv4-tiny detectors on ncnn.

    ``tiny`` selects which model files are loaded; ``target_size`` is the
    square network input size. Calling an instance with a BGR image returns a
    list of ``Detect_Object`` built from the rows of the "output" blob.
    """

    def __init__(self, tiny, target_size, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # no mean subtraction; pixels are only scaled by 1/255
        self.mean_vals = []
        self.norm_vals = [1 / 255.0, 1 / 255.0, 1 / 255.0]

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        # original pretrained model from https://github.com/AlexeyAB/darknet
        # the ncnn model https://drive.google.com/drive/folders/1YzILvh0SKQPS_lrb33dmGNq7aVTKPWS0?usp=sharing
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        if tiny == True:
            model_files = ("yolov4-tiny-opt.param", "yolov4-tiny-opt.bin")
        else:
            model_files = ("yolov4-opt.param", "yolov4-opt.bin")
        self.net.load_param(get_model_file(model_files[0]))
        self.net.load_model(get_model_file(model_files[1]))

        self.class_names = [
            "background", "person", "bicycle", "car", "motorbike",
            "aeroplane", "bus", "train", "truck", "boat", "traffic light",
            "fire hydrant", "stop sign", "parking meter", "bench", "bird",
            "cat", "dog", "horse", "sheep", "cow", "elephant", "bear",
            "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie",
            "suitcase", "frisbee", "skis", "snowboard", "sports ball",
            "kite", "baseball bat", "baseball glove", "skateboard",
            "surfboard", "tennis racket", "bottle", "wine glass", "cup",
            "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
            "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
            "cake", "chair", "sofa", "pottedplant", "bed", "diningtable",
            "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard",
            "cell phone", "microwave", "oven", "toaster", "sink",
            "refrigerator", "book", "clock", "vase", "scissors",
            "teddy bear", "hair drier", "toothbrush",
        ]

    def __del__(self):
        self.net = None

    def __call__(self, img):
        src_h = img.shape[0]
        src_w = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            src_w,
            src_h,
            self.target_size,
            self.target_size,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _, detections = extractor.extract("output")

        # Each output row is read as [label, prob, x0, y0, x1, y1]; the
        # coordinates are multiplied by the source image size here.
        # Rows are read through ncnn.Mat.row(); iterating np.array(detections)
        # row by row is an equivalent alternative.
        results = []
        for row_idx in range(detections.h):
            row = detections.row(row_idx)

            det = Detect_Object()
            det.label = row[0]
            det.prob = row[1]
            det.rect.x = row[2] * src_w
            det.rect.y = row[3] * src_h
            det.rect.w = row[4] * src_w - det.rect.x
            det.rect.h = row[5] * src_h - det.rect.y
            results.append(det)

        return results


class YoloV4_Tiny(YoloV4_Base):
    """YOLOv4-tiny: the tiny model files with a 416 input size."""

    def __init__(self, **kwargs):
        super(YoloV4_Tiny, self).__init__(True, 416, **kwargs)


class YoloV4(YoloV4_Base):
    """YOLOv4: the full model files with a 608 input size."""

    def __init__(self, **kwargs):
        super(YoloV4, self).__init__(False, 608, **kwargs)
class YoloV5Focus(ncnn.Layer):
    """Custom ncnn layer implementing the YOLOv5 "Focus" slicing in numpy.

    The four 2x-strided sub-grids taken over the last two axes of the input
    are concatenated along the first axis.
    """

    # Every created layer is stored here so that a Python-side reference is
    # held until the matching destroyer call removes it.
    yolov5FocusLayers = []

    def __init__(self):
        ncnn.Layer.__init__(self)
        self.one_blob_only = True

        self.yolov5FocusLayers.append(self)

    def forward(self, bottom_blob, top_blob, opt):
        """Write the sliced input into ``top_blob``.

        Returns 0 on success and -100 when ``top_blob`` is empty after the
        copy.
        """
        src = np.array(bottom_blob)
        even = slice(None, None, 2)
        odd = slice(1, None, 2)
        patches = [
            src[..., even, even],
            src[..., odd, even],
            src[..., even, odd],
            src[..., odd, odd],
        ]
        stacked = np.concatenate(patches)

        top_blob.clone_from(ncnn.Mat(stacked), opt.blob_allocator)
        return -100 if top_blob.empty() else 0


def YoloV5Focus_layer_creator():
    """Factory registered with ncnn for the "YoloV5Focus" layer type."""
    return YoloV5Focus()


def YoloV5Focus_layer_destroyer(layer):
    """Drop the first stored reference that compares equal to ``layer``."""
    registry = YoloV5Focus.yolov5FocusLayers
    for position, candidate in enumerate(registry):
        if candidate == layer:
            del registry[position]
            break
"couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", ] def __del__(self): self.net = None def __call__(self, img): img_w = img.shape[1] img_h = img.shape[0] w = img_w h = img_h scale = 1.0 if w > h: scale = float(self.target_size) / w w = self.target_size h = int(h * scale) else: scale = float(self.target_size) / h h = self.target_size w = int(w * scale) mat_in = ncnn.Mat.from_pixels_resize( img, ncnn.Mat.PixelType.PIXEL_BGR2RGB, img_w, img_h, w, h ) # pad to target_size rectangle # yolov5/utils/datasets.py letterbox wpad = (w + 31) // 32 * 32 - w hpad = (h + 31) // 32 * 32 - h mat_in_pad = ncnn.copy_make_border( mat_in, hpad // 2, hpad - hpad // 2, wpad // 2, wpad - wpad // 2, ncnn.BorderType.BORDER_CONSTANT, 114.0, ) mat_in_pad.substract_mean_normalize(self.mean_vals, self.norm_vals) ex = self.net.create_extractor() ex.input("images", mat_in_pad) # anchor setting from yolov5/models/yolov5s.yaml ret1, mat_out1 = ex.extract("output") # stride 8 ret2, mat_out2 = ex.extract("781") # stride 16 ret3, mat_out3 = ex.extract("801") # stride 32 pred = [np.array(mat_out3), np.array(mat_out2), np.array(mat_out1)] z = [] for i in range(len(pred)): num_grid = pred[i].shape[1] if mat_in_pad.w > mat_in_pad.h: num_grid_x = mat_in_pad.w // self.stride[i] num_grid_y = num_grid // num_grid_x else: num_grid_y = mat_in_pad.h // self.stride[i] num_grid_x = num_grid // num_grid_y if ( self.grid[i].shape[0] != num_grid_x or self.grid[i].shape[1] != num_grid_y ): self.grid[i] = make_grid(num_grid_x, num_grid_y) y = sigmoid(pred[i]) y = y.reshape(pred[i].shape[0], num_grid_y, num_grid_x, pred[i].shape[2]) y[..., 0:2] = (y[..., 0:2] * 2.0 - 0.5 + self.grid[i]) * self.stride[ i ] # xy y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh z.append(y.reshape(1, -1, y.shape[-1])) pred = 
np.concatenate(z, 1) result = self.non_max_suppression( pred, self.prob_threshold, self.nms_threshold )[0] objects = [ Detect_Object( obj[5], obj[4], (obj[0] - (wpad / 2)) / scale, (obj[1] - (hpad / 2)) / scale, (obj[2] - obj[0]) / scale, (obj[3] - obj[1]) / scale, ) for obj in result ] return objects def non_max_suppression( self, prediction, conf_thres=0.1, iou_thres=0.6, merge=False, classes=None, agnostic=False, ): """Performs Non-Maximum Suppression (NMS) on inference results Returns: detections with shape: nx6 (x1, y1, x2, y2, conf, cls) """ nc = prediction[0].shape[1] - 5 # number of classes xc = prediction[..., 4] > conf_thres # candidates # Settings min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height max_det = 300 # maximum number of detections per image time_limit = 10.0 # seconds to quit after redundant = True # require redundant detections multi_label = nc > 1 # multiple labels per box (adds 0.5ms/img) t = time.time() output = [None] * prediction.shape[0] for xi, x in enumerate(prediction): # image index, image inference # Apply constraints # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height x = x[xc[xi]] # confidence # If none remain process next image if not x.shape[0]: continue # Compute conf x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf # Box (center x, center y, width, height) to (x1, y1, x2, y2) box = xywh2xyxy(x[:, :4]) # Detections matrix nx6 (xyxy, conf, cls) if multi_label: i, j = (x[:, 5:] > conf_thres).nonzero() x = np.concatenate( (box[i], x[i, j + 5, None], j[:, None].astype(np.float32)), axis=1 ) else: # best class only conf, j = x[:, 5:].max(1, keepdim=True) x = np.concatenate((box, conf, j.float()), axis=1)[ conf.view(-1) > conf_thres ] # Filter by class if classes: x = x[(x[:, 5:6] == np.array(classes)).any(1)] # Apply finite constraint # if not torch.isfinite(x).all(): # x = x[torch.isfinite(x).all(1)] # If none remain process next image n = x.shape[0] # number of boxes if 
not n: continue # Sort by confidence # x = x[x[:, 4].argsort(descending=True)] # Batched NMS c = x[:, 5:6] * (0 if agnostic else max_wh) # classes boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores i = nms(boxes, scores, iou_threshold=iou_thres) if len(i) > max_det: # limit detections i = i[:max_det] if merge and (1 < n < 3e3): # Merge NMS (boxes merged using weighted mean) try: # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4) iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix weights = iou * scores[None] # box weights x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum( 1, keepdim=True ) # merged boxes if redundant: i = i[iou.sum(1) > 1] # require redundancy except: # possible CUDA error https://github.com/ultralytics/yolov3/issues/1139 print(x, i, x.shape, i.shape) pass output[xi] = x[i] if (time.time() - t) > time_limit: break # time limit exceeded return output ================================================ FILE: python/ncnn/model_zoo/yolov7.py ================================================ # Copyright 2020 Tencent # Copyright 2023 Kenny Bradley # SPDX-License-Identifier: BSD-3-Clause # Ported yolov7-tiny to python based on: # - https://github.com/Qengineering/YoloV7-ncnn-Raspberry-Pi-4/blob/main/yolo.cpp # Format based on the ncnn yolov4 implementation import ncnn from .model_store import get_model_file from ..utils.objects import Detect_Object import numpy as np #def sigmoid_binned(val) # this could use a much faster binned lookup table instead of np.exp and floating division def sigmoid(val): return 1.0 / (1.0 + np.exp(-val)) #IOU functions: #find the overlap width given ([x1,x2], [x3,x4]) or ([y1,y2], [y3,y4]) def calcOverlap(r1, r2): #r1 contains r2 if r1[0] <= r2[0] and r1[1] >= r2[1]: return r2[1] - r2[0] #r2 contains r1 elif r1[0] >= r2[0] and r1[1] <= r2[1]: return r1[1] - r1[0] #r1.1 is between r2.0 and r2.1 elif r1[0] <= r2[0] and r1[1] >= r2[0]: # r1[1] <= r2[1] is true since the first if failed return 
def NMS(detections, iou_thresh=0.45):
    """Class-wise non-maximum suppression over Detect_Object values.

    detections must already be sorted by ascending confidence. A detection is
    dropped as soon as any later (more confident) detection with the same
    label overlaps it with IOU above iou_thresh.

    Returns the surviving detections, grouped by label in first-seen order.
    """
    # bucket the detections per class label, keeping their relative order
    by_label = {}
    for candidate in detections:
        by_label.setdefault(candidate.label, []).append(candidate)

    survivors = []
    for group in by_label.values():
        for pos, candidate in enumerate(group):
            # only detections further down the list have higher priority
            suppressed = any(
                IOU(candidate, rival) > iou_thresh for rival in group[pos + 1:]
            )
            if not suppressed:
                survivors.append(candidate)

    return survivors
    def __call__(self, img):
        """Run the network on one image and return the NMS-filtered detections.

        img is indexed as shape[0] = height, shape[1] = width and is handed to
        ncnn as PIXEL_BGR2RGB, so it is presumably a BGR uint8 image as
        produced by OpenCV (TODO confirm).

        Returns a list of Detect_Object whose label is the class NAME (a
        string taken from self.class_names), whose prob is
        objectness * best class score, and whose rect is x, y, w, h rescaled
        to the source image.
        """
        img_h = img.shape[0]
        img_w = img.shape[1]

        # The image is stretched to target_size x target_size (no letterbox
        # padding); the aspect ratio is restored by the per-axis rescale below.
        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            img.shape[1],
            img.shape[0],
            self.target_size,
            self.target_size,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("images", mat_in)

        # One slot per stride, in the same order as `strides` below; a stride
        # that was not requested keeps a None placeholder so indexes line up.
        outValues = []
        if 8 in self.use_strides:
            ret8, out8 = ex.extract("output"); outValues.append(out8)
        else:
            outValues.append(None)
        if 16 in self.use_strides:
            ret16, out16 = ex.extract("288"); outValues.append(out16)
        else:
            outValues.append(None)
        if 32 in self.use_strides:
            ret32, out32 = ex.extract("302"); outValues.append(out32)
        else:
            outValues.append(None)

        # P3/8, P4/16, P5/32
        # three (width, height) anchor pairs per stride, flattened
        anchors = [[12,16, 19,36, 40,28], [36,75, 76,55, 72,146], [142,110, 192,243, 459,401]]
        strides = [8,16,32]

        objects = []
        # pre-sigmoid value at which sigmoid() equals the 0.25 score threshold
        threshNonSigmoid = -1.098612
        for strideCount, mat_out in enumerate(outValues):
            if mat_out is None:
                continue
            stride = strides[strideCount]
            # one output channel per anchor
            for c in range(3):
                mat = mat_out.channel(c)
                # yolo should always be square, it is expected to be 52x52
                # but sqrt() guarantees the correct size for side
                # NOTE(review): assumes mat.h == side * side, one row per grid cell — confirm
                side = int(np.sqrt(mat.h))
                anchorW = anchors[strideCount][c*2]
                anchorH = anchors[strideCount][c*2+1]
                # `index` walks the rows in step with the (i, j) grid position,
                # so it must advance on every path through the inner loop
                index = 0
                for i in range(side):
                    for j in range(side):
                        # values 5-84 are class data
                        classData=mat.row(index)[5:]
                        maxLabel = max(classData)

                        # optimization
                        # if either the objectness or max class score resolve to < 0.25 we can skip this
                        # but the values are pre-sigmoid so compare to threshNonSigmoid.
                        # 1 / (1+e^(-1.098612)) = 0.25 so just compare to the -1.098612 threshold
                        # (both factors are at most 1, so their product cannot pass 0.25 otherwise)
                        if mat.row(index)[4] < threshNonSigmoid or maxLabel < threshNonSigmoid:
                            index += 1
                            continue

                        # values 0-3 are coordinate data
                        locData = mat.row(index)[0:4]
                        # value 4 is the box confidence score
                        box_score = sigmoid(mat.row(index)[4])
                        # get the highest scoring class for this detection to multiply by the box_score
                        label = np.argmax(classData)
                        class_score = sigmoid(mat.row(index)[label+5])
                        conf = box_score * class_score
                        if conf > 0.25:
                            obj = Detect_Object()
                            # label is stored as the class name, not the class index
                            obj.label = self.class_names[label]
                            obj.prob = conf
                            # convert from raw yolo output to W,H and X,Y
                            obj.rect.w = ((sigmoid(locData[2]) *2) ** 2) * anchorW
                            obj.rect.h = ((sigmoid(locData[3]) *2) ** 2) * anchorH
                            # decoded centre minus half the size: x, y are the box's minimum corner
                            obj.rect.x = ((sigmoid(locData[0]) * 2) - 0.5 + j) * stride - (obj.rect.w/2)
                            obj.rect.y = ((sigmoid(locData[1]) * 2) - 0.5 + i) * stride - (obj.rect.h/2)
                            objects.append(obj)
                        index +=1

        # sort based on probability in ascending order (the order NMS expects)
        objects.sort(key = lambda x: x.prob)
        filtered_objects = NMS(objects)

        # rescale from network input size to input image size, per axis
        XscaleAdj = img_w / self.target_size
        YscaleAdj = img_h / self.target_size
        for count in range(len(filtered_objects)):
            filtered_objects[count].rect.x *= XscaleAdj
            filtered_objects[count].rect.w *= XscaleAdj
            filtered_objects[count].rect.y *= YscaleAdj
            filtered_objects[count].rect.h *= YscaleAdj

        return filtered_objects
"surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", ] def __del__(self): self.net = None def __call__(self, img): img_w = img.shape[1] img_h = img.shape[0] w = img_w h = img_h scale = 1.0 if w > h: scale = float(self.target_size) / w w = self.target_size h = int(h * scale) else: scale = float(self.target_size) / h h = self.target_size w = int(w * scale) mat_in = ncnn.Mat.from_pixels_resize( img, ncnn.Mat.PixelType.PIXEL_BGR2RGB, img_w, img_h, w, h ) # pad to target_size rectangle # yolov5/utils/datasets.py letterbox wpad = (w + 31) // 32 * 32 - w hpad = (h + 31) // 32 * 32 - h mat_in_pad = ncnn.copy_make_border( mat_in, hpad // 2, hpad - hpad // 2, wpad // 2, wpad - wpad // 2, ncnn.BorderType.BORDER_CONSTANT, 114.0, ) mat_in_pad.substract_mean_normalize(self.mean_vals, self.norm_vals) ex = self.net.create_extractor() ex.input("in0", mat_in_pad) ret1, mat_out1 = ex.extract("out0") # stride 8 ret2, mat_out2 = ex.extract("out1") # stride 16 ret3, mat_out3 = ex.extract("out2") # stride 32 pred = [np.array(mat_out3), np.array(mat_out2), np.array(mat_out1)] z = [] for i in range(len(pred)): num_grid_x = mat_in_pad.w // self.stride[i] num_grid_y = mat_in_pad.h // self.stride[i] if ( self.grid[i].shape[1] != num_grid_y or self.grid[i].shape[2] != num_grid_x ): self.grid[i] = make_grid(num_grid_x, num_grid_y) cls, box = np.split(pred[i].transpose((1, 2, 0)), [len(self.class_names), ], -1) box = softmax(box.reshape(-1, self.reg_max)) box = box.reshape(num_grid_y, num_grid_x, 4, self.reg_max) box = box @ np.arange(0, self.reg_max, dtype=np.float32) cls = sigmoid(cls) conf = 
cls.max(-1, keepdims=True) x1y1 = (self.grid[i][0] + 0.5 - box[..., :2]) * self.stride[i] x2y2 = (self.grid[i][0] + 0.5 + box[..., 2:]) * self.stride[i] res = np.concatenate([x1y1, x2y2, conf, cls], -1) z.append(res.reshape((1, -1, len(self.class_names) + 5))) pred = np.concatenate(z, 1) result = self.non_max_suppression( pred, self.prob_threshold, self.nms_threshold )[0] if isinstance(result, Iterable): objects = [ Detect_Object( obj[5], obj[4], (obj[0] - (wpad / 2)) / scale, (obj[1] - (hpad / 2)) / scale, (obj[2] - obj[0]) / scale, (obj[3] - obj[1]) / scale, ) for obj in result ] else: objects = [] return objects def non_max_suppression( self, prediction, conf_thres=0.1, iou_thres=0.6, merge=False, classes=None, agnostic=False, ): """Performs Non-Maximum Suppression (NMS) on inference results Returns: detections with shape: nx6 (x1, y1, x2, y2, conf, cls) """ nc = prediction[0].shape[1] - 5 # number of classes xc = prediction[..., 4] > conf_thres # candidates # Settings min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height max_det = 300 # maximum number of detections per image time_limit = 10.0 # seconds to quit after redundant = True # require redundant detections multi_label = nc > 1 # multiple labels per box (adds 0.5ms/img) t = time.time() output = [None] * prediction.shape[0] for xi, x in enumerate(prediction): # image index, image inference # Apply constraints # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height x = x[xc[xi]] # confidence # If none remain process next image if not x.shape[0]: continue box = x[:, :4] # Detections matrix nx6 (xyxy, conf, cls) if multi_label: i, j = (x[:, 5:] > conf_thres).nonzero() x = np.concatenate( (box[i], x[i, j + 5, None], j[:, None].astype(np.float32)), axis=1 ) else: # best class only conf, j = x[:, 5:].max(1, keepdim=True) x = np.concatenate((box, conf, j.float()), axis=1)[ conf.view(-1) > conf_thres ] # Filter by class if classes: x = x[(x[:, 5:6] == 
def check_sha1(filename, sha1_hash):
    """Compare a file's SHA-1 digest with an expected hexadecimal hash.

    Only the leading characters that both strings have in common are
    compared, so a shortened expected hash matches when it is a prefix of
    the file's digest.

    Parameters
    ----------
    filename : str
        Path to the file.
    sha1_hash : str
        Expected sha1 hash in hexadecimal digits.

    Returns
    -------
    bool
        Whether the file content matches the expected hash.
    """
    digest = hashlib.sha1()
    with open(filename, "rb") as stream:
        # hash in 1 MiB pieces so the whole file is never held in memory
        for block in iter(lambda: stream.read(1048576), b""):
            digest.update(block)

    actual = digest.hexdigest()
    shared = min(len(actual), len(sha1_hash))
    return actual[:shared] == sha1_hash[:shared]
def softmax(x):
    """Softmax over the last axis.

    The maximum of every slice is subtracted before exponentiation so large
    inputs do not overflow. The input is never modified, and arrays of any
    rank (>= 1) are accepted.

    Args:
        x (..., C): input scores.

    Returns:
        array with the same shape as x whose last axis sums to 1.
    """
    x = np.asarray(x)
    # keepdims keeps the reduced axis so the result broadcasts for any rank
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)
class Rect(object):
    """Axis-aligned rectangle given by its origin corner and its size.

    The rectangle covers [x, x + w] horizontally and [y, y + h] vertically.
    """

    def __init__(self, x=0, y=0, w=0, h=0):
        self.x = x
        self.y = y
        self.w = w
        self.h = h

    def area(self):
        """Return the rectangle's area (w * h)."""
        return self.w * self.h

    def intersection_area(self, b):
        """Return the area shared with rectangle ``b``.

        Rectangles that do not overlap, or that only touch along an edge,
        share an area of 0.
        """
        x1 = np.maximum(self.x, b.x)
        y1 = np.maximum(self.y, b.y)
        x2 = np.minimum(self.x + self.w, b.x + b.w)
        y2 = np.minimum(self.y + self.h, b.y + b.h)
        # A negative extent means the rectangles are disjoint on that axis;
        # clamp it to zero instead of taking its absolute value, which would
        # report a positive intersection for rectangles that never meet.
        return np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
def draw_pose(image, keypoints):
    """Draw a skeleton onto ``image`` and show it in a window.

    Bones are drawn between fixed pairs of keypoint indexes and joints as
    filled circles; any keypoint with prob below 0.2 is left out. Every
    keypoint is also printed. The call blocks in cv2.waitKey(0).
    """

    def too_weak(kp):
        return kp.prob < 0.2

    # draw bone
    bones = (
        (0, 1),
        (1, 3),
        (0, 2),
        (2, 4),
        (5, 6),
        (5, 7),
        (7, 9),
        (6, 8),
        (8, 10),
        (5, 11),
        (6, 12),
        (11, 12),
        (11, 13),
        (12, 14),
        (13, 15),
        (14, 16),
    )
    for first, second in bones:
        start = keypoints[first]
        end = keypoints[second]
        if too_weak(start) or too_weak(end):
            continue
        start_pt = (int(start.p.x), int(start.p.y))
        end_pt = (int(end.p.x), int(end.p.y))
        cv2.line(image, start_pt, end_pt, (255, 0, 0), 2)

    # draw joint
    for joint in keypoints:
        print("%.2f %.2f = %.5f" % (joint.p.x, joint.p.y, joint.prob))
        if too_weak(joint):
            continue
        center = (int(joint.p.x), int(joint.p.y))
        cv2.circle(image, center, 3, (0, 255, 0), -1)

    cv2.imshow("image", image)
    cv2.waitKey(0)
================================================ FILE: python/setup.py.i ================================================ import sys from setuptools import setup, find_packages try: from wheel.bdist_wheel import bdist_wheel as _bdist_wheel class bdist_wheel(_bdist_wheel): def finalize_options(self): _bdist_wheel.finalize_options(self) self.root_is_pure = False except ImportError: bdist_wheel = None if sys.version_info < (3, 0): sys.exit("Sorry, Python < 3.0 is not supported") requirements = ["numpy", "tqdm", "requests", "portalocker", "opencv-python"] setup( name="ncnn", version="${PACKAGE_VERSION}", author="nihui", author_email="nihuini@tencent.com", maintainer="caishanli", maintainer_email="caishanli25@gmail.com", description="ncnn is a high-performance neural network inference framework optimized for the mobile platform", url="https://github.com/Tencent/ncnn", classifiers=[ "Programming Language :: C++", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], license="BSD-3", python_requires=">=3.5", packages=find_packages(), package_dir={"": "."}, package_data={"ncnn": ["ncnn${PYTHON_MODULE_PREFIX}${PYTHON_MODULE_EXTENSION}"]}, install_requires=requirements, cmdclass={"bdist_wheel": bdist_wheel}, ) ================================================ FILE: python/src/main.cpp ================================================ // Copyright 2020 Tencent // SPDX-License-Identifier: BSD-3-Clause #include #include #include #include #include #include #include #include #include 
#include
#include "pybind11_mat.h"
#include "pybind11_datareader.h"
#include "pybind11_allocator.h"
#include "pybind11_modelbin.h"
#include "pybind11_layer.h"

using namespace ncnn;

namespace py = pybind11;

// DataReaderFromMemory variant whose reference() override always returns 0.
// NOTE(review): presumably a 0 return tells the caller that no in-place buffer
// reference is available, so it has to copy through read() instead - confirm
// against the DataReader::reference contract.
class DataReaderFromMemoryCopy : public DataReaderFromMemory
{
public:
    // Passes `mem` (a reference to the caller's byte pointer) straight to the
    // base-class constructor.
    explicit DataReaderFromMemoryCopy(const unsigned char*& mem)
        : DataReaderFromMemory(mem)
    {
    }

    // Ignores both arguments and reports 0.
    virtual size_t reference(size_t size, const void** buf) const
    {
        return 0;
    }
};

// One slot of the g_layer_factroys table defined below.
struct LayerFactory
{
    std::string name; // "" in every initial table entry
    int index;        // -1 in every initial table entry
    // Callables invoked by the functions that LayerFactoryDefine(n) generates:
    // creator() supplies the ncnn::Layer* returned by LayerCreator##n, and
    // destroyer(layer) receives the layer handed to LayerDestroyer##n.
    std::function creator;
    std::function destroyer;
    // Initialised in the table with the slot's own LayerCreator##n and
    // LayerDestroyer##n functions.
    layer_creator_func creator_func;
    layer_destroyer_func destroyer_func;
};

// Forward-declares the static pair LayerCreator##n / LayerDestroyer##n so the
// table below can name them before LayerFactoryDefine(n) supplies the bodies.
#define LayerFactoryDeclear(n)                  \
    static ncnn::Layer* LayerCreator##n(void*); \
    static void LayerDestroyer##n(ncnn::Layer*, void*);

LayerFactoryDeclear(0);
LayerFactoryDeclear(1);
LayerFactoryDeclear(2);
LayerFactoryDeclear(3);
LayerFactoryDeclear(4);
LayerFactoryDeclear(5);
LayerFactoryDeclear(6);
LayerFactoryDeclear(7);
LayerFactoryDeclear(8);
LayerFactoryDeclear(9);

// Ten slots, one per declared function pair. Each starts with an empty name,
// index -1 and null callables, and is tied to the pair carrying its own number
// because the generated functions look themselves up as g_layer_factroys[n].
std::vector g_layer_factroys = {
    {"", -1, nullptr, nullptr, LayerCreator0, LayerDestroyer0},
    {"", -1, nullptr, nullptr, LayerCreator1, LayerDestroyer1},
    {"", -1, nullptr, nullptr, LayerCreator2, LayerDestroyer2},
    {"", -1, nullptr, nullptr, LayerCreator3, LayerDestroyer3},
    {"", -1, nullptr, nullptr, LayerCreator4, LayerDestroyer4},
    {"", -1, nullptr, nullptr, LayerCreator5, LayerDestroyer5},
    {"", -1, nullptr, nullptr, LayerCreator6, LayerDestroyer6},
    {"", -1, nullptr, nullptr, LayerCreator7, LayerDestroyer7},
    {"", -1, nullptr, nullptr, LayerCreator8, LayerDestroyer8},
    {"", -1, nullptr, nullptr, LayerCreator9, LayerDestroyer9},
};

// Starts at 0.
// NOTE(review): presumably the next unused slot of g_layer_factroys; the code
// that reads or advances it is outside this section - confirm.
int g_layer_factroy_index = 0;

#define LayerFactoryDefine(n) \ static ncnn::Layer* LayerCreator##n(void* p) \ { \ if (g_layer_factroys[n].creator != nullptr) \ { \ return g_layer_factroys[n].creator(); \ } \ return nullptr; \ } \ static void LayerDestroyer##n(ncnn::Layer* layer, void* p) \ { \ if (g_layer_factroys[n].destroyer) \ { \
g_layer_factroys[n].destroyer(layer); \ } \ } LayerFactoryDefine(0); LayerFactoryDefine(1); LayerFactoryDefine(2); LayerFactoryDefine(3); LayerFactoryDefine(4); LayerFactoryDefine(5); LayerFactoryDefine(6); LayerFactoryDefine(7); LayerFactoryDefine(8); LayerFactoryDefine(9); PYBIND11_MODULE(ncnn, m) { auto atexit = py::module_::import("atexit"); atexit.attr("register")(py::cpp_function([]() { for (int i = 0; i < g_layer_factroys.size(); i++) { g_layer_factroys[i].creator = nullptr; g_layer_factroys[i].destroyer = nullptr; } })); py::class_ >(m, "Allocator"); py::class_ >(m, "PoolAllocator") .def(py::init<>()) .def("set_size_compare_ratio", &PoolAllocator::set_size_compare_ratio, py::arg("src")) .def("clear", &PoolAllocator::clear) .def("fastMalloc", &PoolAllocator::fastMalloc, py::arg("size")) .def("fastFree", &PoolAllocator::fastFree, py::arg("ptr")); py::class_ >(m, "UnlockedPoolAllocator") .def(py::init<>()) .def("set_size_compare_ratio", &UnlockedPoolAllocator::set_size_compare_ratio, py::arg("src")) .def("clear", &UnlockedPoolAllocator::clear) .def("fastMalloc", &UnlockedPoolAllocator::fastMalloc, py::arg("size")) .def("fastFree", &UnlockedPoolAllocator::fastFree, py::arg("ptr")); py::class_ >(m, "DataReader") .def(py::init<>()) #if NCNN_STRING .def("scan", &DataReader::scan, py::arg("format"), py::arg("p")) #endif // NCNN_STRING .def("read", &DataReader::read, py::arg("buf"), py::arg("size")); py::class_ >(m, "DataReaderFromEmpty") .def(py::init<>()) #if NCNN_STRING .def("scan", &DataReaderFromEmpty::scan, py::arg("format"), py::arg("p")) #endif // NCNN_STRING .def("read", &DataReaderFromEmpty::read, py::arg("buf"), py::arg("size")); py::class_(m, "Blob") .def(py::init<>()) #if NCNN_STRING .def_readwrite("name", &Blob::name) #endif // NCNN_STRING .def_readwrite("producer", &Blob::producer) .def_readwrite("consumer", &Blob::consumer) .def_readwrite("shape", &Blob::shape); py::class_ >(m, "ModelBin") .def(py::init<>()) .def("load", (Mat(ModelBin::*)(int, int) 
const) & ModelBin::load, py::arg("w"), py::arg("type")) .def("load", (Mat(ModelBin::*)(int, int, int) const) & ModelBin::load, py::arg("w"), py::arg("h"), py::arg("type")) .def("load", (Mat(ModelBin::*)(int, int, int, int) const) & ModelBin::load, py::arg("w"), py::arg("h"), py::arg("c"), py::arg("type")) .def("load", (Mat(ModelBin::*)(int, int, int, int, int) const) & ModelBin::load, py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::arg("type")); py::class_ >(m, "ModelBinFromDataReader") .def(py::init(), py::arg("dr")) .def("load", &ModelBinFromDataReader::load, py::arg("w"), py::arg("type")); py::class_ >(m, "ModelBinFromMatArray") .def(py::init(), py::arg("weights")) .def("load", &ModelBinFromMatArray::load, py::arg("w"), py::arg("type")); py::class_(m, "ParamDict") .def(py::init<>()) .def("type", &ParamDict::type, py::arg("id")) .def("get", (int (ParamDict::*)(int, int) const) & ParamDict::get, py::arg("id"), py::arg("def")) .def("get", (float (ParamDict::*)(int, float) const) & ParamDict::get, py::arg("id"), py::arg("def")) .def("get", (Mat(ParamDict::*)(int, const Mat&) const) & ParamDict::get, py::arg("id"), py::arg("def")) .def("set", (void (ParamDict::*)(int, int)) & ParamDict::set, py::arg("id"), py::arg("i")) .def("set", (void (ParamDict::*)(int, float)) & ParamDict::set, py::arg("id"), py::arg("f")) .def("set", (void (ParamDict::*)(int, const Mat&)) & ParamDict::set, py::arg("id"), py::arg("v")); py::class_